import argparse
import os
import re
from collections import deque
from urllib.parse import urljoin, urlparse

import requests

# Browser-like User-Agent: some servers reject requests with no/odd UA.
headers = {'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'}

# Per-request timeout (seconds) so one unresponsive host cannot hang the crawl.
REQUEST_TIMEOUT = 10


class Colors:
    """ANSI escape codes for colored terminal output."""
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    RED = '\033[91m'
    RESET = '\033[0m'


def get_all_links(url):
    """Fetch all hyperlinks from the given URL.

    Args:
        url (str): The URL to fetch links from.

    Returns:
        set: Unique absolute links found on the page (empty on fetch error).
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # BUGFIX: the pattern previously lacked the '<a' tag prefix
        # (']+href="..."') and matched essentially nothing in real HTML.
        links = set(re.findall(r'<a\s[^>]*href="([^"]+)"',
                               response.text, re.IGNORECASE))
        # Drop fragments and non-navigable schemes, resolve relative URLs.
        links = {urljoin(url, link) for link in links
                 if not link.startswith(('#', 'mailto:', 'javascript:', 'tel:'))}
        print(f"{Colors.GREEN}Found {len(links)} links on {url}{Colors.RESET}")
        return links
    except requests.RequestException as e:
        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
        return set()


def same_domain(url1, url2):
    """Check if two URLs belong to the same domain.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True if both URLs share the same network location.
    """
    return urlparse(url1).netloc == urlparse(url2).netloc


def crawl(url, depth):
    """Breadth-first crawl from *url*, staying on the starting domain.

    Pages at exactly *depth* are recorded but not expanded further.

    Args:
        url (str): The URL to start crawling from.
        depth (int): The maximum depth of crawling.

    Returns:
        set: Unique same-domain URLs visited during the crawl.
    """
    visited = set()
    queue = deque([(url, 0)])  # (current_url, current_depth)
    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth or current_url in visited:
            continue
        visited.add(current_url)
        # Only expand pages strictly below the depth limit.
        if current_depth < depth:
            for link in get_all_links(current_url):
                if link not in visited and same_domain(url, link):
                    queue.append((link, current_depth + 1))
    return visited


def extract_image_sources(url):
    """Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page (empty on fetch error).
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # BUGFIX: pattern previously lacked the '<img' tag prefix; also
        # require a literal '.' before the extension and tolerate a
        # trailing query string after it.
        img_sources = re.findall(
            r'<img\s[^>]*src="([^"]+\.(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"',
            response.text, re.IGNORECASE)
        # Use the urljoin imported at module level (was requests.compat.urljoin).
        return [urljoin(url, src) for src in img_sources]
    except requests.RequestException as e:
        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
        return []


def get_filename_from_url(url):
    """Build a filesystem-safe filename from *url*.

    Combines the host and (possibly shortened) path, replacing '/' with '_'
    and truncating to 225 characters to stay under filesystem name limits.
    """
    parsed_url = urlparse(url)
    # Long paths are collapsed to their basename to keep names short.
    path = os.path.basename(parsed_url.path) if len(parsed_url.path) > 20 else parsed_url.path
    filename = (parsed_url.netloc + '__' + path).replace('/', '_')
    if len(filename) > 225:
        filename = filename[:225]
    return filename if filename else url


def download_images(imgs, directory):
    """Download images to the specified directory.

    Args:
        imgs (set): Image URLs to download.
        directory (str): Directory where images will be saved.

    Returns:
        set: File paths of successfully downloaded images.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # BUGFIX: the derived filename was computed but never used —
            # every image was written to the same hard-coded path.
            filepath = os.path.join(directory, get_filename_from_url(img))
            base, ext = os.path.splitext(filepath)
            counter = 1
            # Avoid clobbering existing files: name(1).ext, name(2).ext, ...
            while os.path.exists(filepath):
                filepath = f"{base}({counter}){ext}"
                counter += 1
            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)
        except requests.RequestException as e:
            print(f"{Colors.RED}Error downloading {img}: {e}{Colors.RESET}")
    return downloaded


def main():
    """CLI entry point: crawl, collect image URLs, and download them."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    parser.add_argument("-r", action="store_true", default=False,
                        help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data",
                        help="Output file to save")
    args = parser.parse_args()

    if not args.r:
        print(f"{Colors.RED}Please use -r to spider the URL. Setting depth to 0{Colors.RESET}")
        args.l = 0
    if not args.url.startswith(('http://', 'https://')):
        print(f"{Colors.RED}Invalid URL format. trying http://{args.url}{Colors.RESET}")
        args.url = f"http://{args.url}"

    visited_links = crawl(args.url, args.l)
    print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")

    imgs = set()
    for link in visited_links:
        print(f"{Colors.BLUE}Extracting images from {link}{Colors.RESET}")
        imgs.update(extract_image_sources(link))
    print(f"{Colors.BLUE}Found {len(imgs)} images:{Colors.RESET}")

    os.makedirs(args.p, exist_ok=True)
    downloaded = download_images(imgs, args.p)
    print(f"{Colors.GREEN}Downloaded {len(downloaded)} images to {args.p}{Colors.RESET}")


if __name__ == "__main__":
    main()