"""Spider: a small same-domain web crawler that finds and downloads images.

Starting from a URL, follow links up to a given depth (staying on the
same domain), collect image URLs (jpg/jpeg/gif/png/bmp), and optionally
download them into a local directory.
"""

import argparse
import os
import re
from urllib.parse import urlparse

import requests
from PIL import Image            # noqa: F401 — kept for the (commented-out) EXIF feature
from PIL.ExifTags import TAGS    # noqa: F401


def same_domain(url1, url2):
    """
    Return True if url1 and url2 have the same domain name.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True if both URLs have the same network location, False otherwise.
    """
    return urlparse(url1).netloc == urlparse(url2).netloc


def crawl(url, depth, imgs=None):
    """
    Crawl the given URL and extract links and images.

    Links found on the page are followed recursively while depth > 0;
    images found on every visited page are accumulated into ``imgs``.

    Args:
        url (str): The URL to crawl.
        depth (int): Remaining crawl depth; links are not followed at 0.
        imgs (set | None): Accumulator for found image URLs. A fresh set
            is created when omitted (avoids the mutable-default pitfall).

    Returns:
        set: The set of image URLs found so far (also returned on fetch
        errors, so callers always get a set).
    """
    if imgs is None:
        imgs = set()

    print(f"\033[92mFetching links from {url} at depth {depth}\033[0m")
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return imgs  # keep the return type consistent (was: list on error)

    # BUGFIX: the original pattern r']+href="([^"]+)"' required literal ']'
    # characters before href= and never matched real anchor tags.
    links = re.findall(r'<a[^>]+href="([^"]+)"', response.text)
    for link in links:
        # Skip fragments and non-HTTP schemes that cannot be crawled.
        if link.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
            continue
        # Resolve relative links against the current page.
        if not link.startswith('http'):
            link = requests.compat.urljoin(url, link)
        # Stay on the starting domain.
        if not same_domain(url, link):
            continue
        if depth > 0:
            crawl(link, depth - 1, imgs)

    # Fetch the page's images once (the original fetched the page twice).
    for img in extract_image_sources(url):
        if img not in imgs:
            print(f"\033[94mFound image: {img}\033[0m")
            imgs.add(img)
    return imgs


def extract_image_sources(url):
    """
    Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page (empty on fetch error).
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        img_sources = re.findall(
            r'src="([^"]+(?:jpg|jpeg|gif|png|bmp))"', response.text, re.IGNORECASE
        )
        # Make every src absolute so it can be downloaded directly.
        return [requests.compat.urljoin(url, src) for src in img_sources]
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []


def download_images(imgs, directory):
    """
    Download images to the specified directory.

    Args:
        imgs (set): A set of image URLs to download.
        directory (str): The directory where images will be saved.

    Returns:
        set: File paths of successfully downloaded images.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img)
            response.raise_for_status()
            # BUGFIX: the original ignored the computed filename and wrote
            # every image to the same literal path, clobbering downloads.
            filepath = os.path.join(directory, get_filename_from_url(img))
            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)
        except requests.RequestException as e:
            print(f"Error downloading {img}: {e}")
    return downloaded


def get_filename_from_url(url):
    """
    Build a filesystem-safe filename from a URL.

    The scheme is dropped and '/' separators are replaced with '_' so the
    netloc + path collapse into a single flat name.

    Args:
        url (str): The image URL.

    Returns:
        str: A filename derived from the URL (the URL itself if empty).
    """
    parsed_url = urlparse(url)
    filename = (parsed_url.netloc + parsed_url.path).replace('/', '_')
    return filename if filename else url


def main():
    """Parse CLI arguments, crawl the start URL, and download found images."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    parser.add_argument("-r", help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", default="./data", help="Output file to save")
    args = parser.parse_args()
    print(args)

    imgs = crawl(args.url, args.l)
    print(f"\033[92mFound {len(imgs)} images\033[0m")

    if args.p:
        # Create the output directory on demand.
        if not os.path.exists(args.p):
            os.makedirs(args.p)
        downloaded = download_images(imgs, args.p)
        print(f"\033[92mDownloaded {len(downloaded)} images to {args.p}\033[0m")

    # for img in downloaded:
    #     exif_data = get_exif_data(img)
    #     if exif_data:
    #         print(f"\033[93mExif data for {img}:\033[0m")
    #         for tag, value in exif_data.items():
    #             print(f"  {tag}: {value}")
    return 0


if __name__ == "__main__":
    main()