From 32a43816f43c4a80d6afe2cb27c2b278c9ce9544 Mon Sep 17 00:00:00 2001
From: whaffman
Date: Wed, 2 Jul 2025 12:30:01 +0200
Subject: [PATCH] Add initial implementation of image metadata extraction and web crawling functionality

---
 .gitignore  |   1 +
 Anaconda.py |  52 +++++++++++++++
 Spider.py   | 148 +++++++++++++++++++++++++++++++++++++++++
 Spider2.py  | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 386 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Anaconda.py
 create mode 100644 Spider.py
 create mode 100644 Spider2.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0cafc1c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.venv/
\ No newline at end of file
diff --git a/Anaconda.py b/Anaconda.py
new file mode 100644
index 0000000..562408b
--- /dev/null
+++ b/Anaconda.py
@@ -0,0 +1,52 @@
+import os
+import argparse
+
+from PIL import Image
+from PIL.ExifTags import TAGS
+
+
+def get_exif_data(filepath):
+    """
+    Extract Exif data from an image file.
+    Args:
+        filepath (str): Path to the image file.
+    Returns:
+        dict: Exif values keyed by tag name (or by numeric tag id when the
+            tag is unknown). Empty when there is no Exif data or the file
+            cannot be read.
+    """
+    try:
+        # Close the file handle as soon as the Exif data has been read.
+        with Image.open(filepath) as image:
+            exif_data = image._getexif()
+    except Exception as e:
+        print(f"Error reading Exif data from {filepath}: {e}")
+        return {}
+
+    if not exif_data:
+        return {}
+    return {TAGS.get(tag_id, tag_id): value for tag_id, value in exif_data.items()}
+
+
+def main():
+    """Print the Exif metadata of every image given on the command line."""
+    parser = argparse.ArgumentParser(description="Program to display Metadata of Images")
+    parser.add_argument('images', nargs='+', help='Paths to one or more image files')
+    args = parser.parse_args()
+
+    for image_path in args.images:
+        if not os.path.isfile(image_path):
+            print(f"File {image_path} does not exist or is not a file.")
+            continue
+
+        exif_data = get_exif_data(image_path)
+        if exif_data:
+            print(f"\033[93mExif data for {image_path}:\033[0m")
+            for tag, value in exif_data.items():
+                print(f" {tag}: {value}")
+        else:
+            print(f"No Exif data found for {image_path}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/Spider.py b/Spider.py
new file mode 100644
index 0000000..f84ea39
--- /dev/null
+++ b/Spider.py
@@ -0,0 +1,148 @@
+import os
+
+import argparse
+import requests
+import re
+from urllib.parse import urlparse
+
+from PIL import Image
+from PIL.ExifTags import TAGS
+
+
+def same_domain(url1, url2):
+    """
+    Return True if url1 and url2 have the same domain name.
+    Args:
+        url1 (str): The first URL.
+        url2 (str): The second URL.
+    Returns:
+        bool: True if both URLs have the same domain, False otherwise.
+    """
+    return urlparse(url1).netloc == urlparse(url2).netloc
+
+
+def crawl(url, depth, imgs=None, visited=None):
+    """
+    Crawl the given URL and collect the image URLs found on it.
+    If depth is greater than 0, also crawl same-domain links found on the page.
+    If depth is 0, only extract images from the page itself.
+    Args:
+        url (str): The URL to crawl.
+        depth (int): How many levels of links to follow.
+        imgs (set): A set to store found image URLs (created when omitted).
+        visited (set): Page URLs already crawled (created when omitted).
+    Returns:
+        set: A set of found image URLs.
+    """
+    # Fresh containers per top-level call: a mutable default argument would
+    # be shared between unrelated calls.
+    if imgs is None:
+        imgs = set()
+    if visited is None:
+        visited = set()
+    if url in visited:
+        # Pages that link to each other would otherwise be fetched repeatedly.
+        return imgs
+    visited.add(url)
+
+    print(f"\033[92mFetching links from {url} at depth {depth}\033[0m")
+
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        # Keep whatever was collected so far, and always return a set.
+        return imgs
+
+    if depth > 0:
+        for link in re.findall(r'<a\s[^>]*href="([^"]+)"', response.text, re.IGNORECASE):
+            if link.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
+                continue
+            # urljoin resolves relative links and leaves absolute ones alone.
+            link = requests.compat.urljoin(url, link)
+            if same_domain(url, link):
+                crawl(link, depth - 1, imgs, visited)
+
+    for img in extract_image_sources(url):
+        if img not in imgs:
+            print(f"\033[94mFound image: {img}\033[0m")
+            imgs.add(img)
+    return imgs
+
+
+def extract_image_sources(url):
+    """
+    Fetch image links from the given URL.
+    Args:
+        url (str): The URL to fetch images from.
+    Returns:
+        list: A list of absolute image URLs found on the page.
+    """
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()  # Raise an error for bad responses
+    except requests.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return []
+
+    img_sources = re.findall(r'src="([^"]+(?:jpg|jpeg|gif|png|bmp))"', response.text, re.IGNORECASE)
+    return [requests.compat.urljoin(url, src) for src in img_sources]
+
+
+def download_images(imgs, directory):
+    """
+    Download images to the specified directory.
+    Args:
+        imgs (set): A set of image URLs to download.
+        directory (str): The directory where images will be saved.
+    Returns:
+        set: A set of file paths of downloaded images.
+    """
+    downloaded = set()
+    for img in imgs:
+        try:
+            response = requests.get(img, timeout=10)
+            response.raise_for_status()
+
+            filepath = os.path.join(directory, get_filename_from_url(img))
+            with open(filepath, 'wb') as file:
+                file.write(response.content)
+            downloaded.add(filepath)
+        except (requests.RequestException, OSError) as e:
+            # A failed download or write must not abort the remaining images.
+            print(f"Error downloading {img}: {e}")
+
+    return downloaded
+
+
+def get_filename_from_url(url):
+    """
+    Build a filename from the URL's host and path, replacing / with _.
+    """
+    parsed_url = urlparse(url)
+    filename = parsed_url.netloc + parsed_url.path
+    filename = filename.replace('/', '_')
+    return filename if filename else url
+
+
+def main():
+    """Crawl the given URL and download the images that were found."""
+    parser = argparse.ArgumentParser(description="Spider Web Crawler")
+    parser.add_argument("url", type=str, help="The URL to start crawling from")
+    # NOTE: -r is parsed but not consulted; crawling always follows -l levels.
+    parser.add_argument("-r", action="store_true", default=False, help="Spider the URL")
+    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
+    parser.add_argument("-p", default="./data", help="Output file to save")
+    args = parser.parse_args()
+
+    imgs = crawl(args.url, args.l)
+    print(f"\033[92mFound {len(imgs)} images\033[0m")
+    if args.p:
+        os.makedirs(args.p, exist_ok=True)
+        downloaded = download_images(imgs, args.p)
+        print(f"\033[92mDownloaded {len(downloaded)} images to {args.p}\033[0m")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/Spider2.py b/Spider2.py
new file mode 100644
index 0000000..1b1758c
--- /dev/null
+++ b/Spider2.py
@@ -0,0 +1,185 @@
+import requests
+import re
+from urllib.parse import urlparse, urljoin
+import argparse
+from collections import deque
+import os
+
+
+class Colors:
+    """
+    A class to hold color codes for terminal output.
+    """
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    BLUE = '\033[94m'
+    RED = '\033[91m'
+    RESET = '\033[0m'
+
+
+# Sent with every request; REQUEST_TIMEOUT keeps a stalled server from
+# hanging the crawl.
+HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'}
+REQUEST_TIMEOUT = 10
+
+
+def get_all_links(url):
+    """
+    Fetch all links from the given URL.
+    Args:
+        url (str): The URL to fetch links from.
+    Returns:
+        set: A set of unique absolute links found on the page.
+    """
+    try:
+        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
+        return set()
+
+    hrefs = re.findall(r'<a\s[^>]*href="([^"]+)"', response.text, re.IGNORECASE)
+    links = {
+        urljoin(url, href)
+        for href in hrefs
+        if not href.startswith(('#', 'mailto:', 'javascript:', 'tel:'))
+    }
+
+    print(f"{Colors.GREEN}Found {len(links)} links on {url}{Colors.RESET}")
+    return links
+
+
+def same_domain(url1, url2):
+    """
+    Check if two URLs belong to the same domain.
+    Args:
+        url1 (str): The first URL.
+        url2 (str): The second URL.
+    Returns:
+        bool: True if both URLs belong to the same domain, False otherwise.
+    """
+    return urlparse(url1).netloc == urlparse(url2).netloc
+
+
+def crawl(url, depth):
+    """
+    Breadth-first crawl from the given URL up to a specified depth.
+    Args:
+        url (str): The URL to start crawling from.
+        depth (int): The maximum depth of crawling.
+    Returns:
+        set: A set of unique same-domain links visited during the crawl.
+    """
+    visited = set()
+    queue = deque([(url, 0)])  # (current_url, current_depth)
+
+    while queue:
+        current_url, current_depth = queue.popleft()
+        if current_depth > depth or current_url in visited:
+            continue
+        visited.add(current_url)
+        if current_depth == depth:
+            # Deepest level: record the page but do not expand its links.
+            continue
+
+        for link in get_all_links(current_url):
+            if link not in visited and same_domain(url, link):
+                queue.append((link, current_depth + 1))
+    return visited
+
+
+def extract_image_sources(url):
+    """
+    Fetch image links from the given URL.
+    Args:
+        url (str): The URL to fetch images from.
+    Returns:
+        list: A list of absolute image URLs found on the page.
+    """
+    try:
+        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()  # Raise an error for bad responses
+    except requests.RequestException as e:
+        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
+        return []
+
+    img_sources = re.findall(
+        r'<img[^>]+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"',
+        response.text,
+        re.IGNORECASE,
+    )
+    return [urljoin(url, src) for src in img_sources]
+
+
+def get_filename_from_url(url):
+    """
+    Build a filename from the URL's host and the last 20 characters of its
+    path, replacing / with _.
+    """
+    parsed_url = urlparse(url)
+    path = parsed_url.path[-20:]
+    filename = parsed_url.netloc + path
+    filename = filename.replace('/', '_')
+    return filename if filename else url
+
+
+def download_images(imgs, directory):
+    """
+    Download images to the specified directory.
+    Args:
+        imgs (set): A set of image URLs to download.
+        directory (str): The directory where images will be saved.
+    Returns:
+        set: A set of file paths of downloaded images.
+    """
+    downloaded = set()
+    for img in imgs:
+        try:
+            response = requests.get(img, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+            response.raise_for_status()
+
+            filepath = os.path.join(directory, get_filename_from_url(img))
+            # Truncated names can collide; number the later files instead of
+            # overwriting the earlier ones.
+            base, ext = os.path.splitext(filepath)
+            counter = 1
+            while os.path.exists(filepath):
+                filepath = f"{base}({counter}){ext}"
+                counter += 1
+
+            with open(filepath, 'wb') as file:
+                file.write(response.content)
+            downloaded.add(filepath)
+        except (requests.RequestException, OSError) as e:
+            # A failed download or write must not abort the remaining images.
+            print(f"Error downloading {img}: {e}")
+
+    return downloaded
+
+
+def main():
+    """Crawl the given URL and download the images found on the visited pages."""
+    parser = argparse.ArgumentParser(description="Spider Web Crawler")
+    parser.add_argument("url", type=str, help="The URL to start crawling from")
+    parser.add_argument("-r", action="store_true", default=False, help="Spider the URL")
+    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
+    parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
+    args = parser.parse_args()
+
+    visited_links = crawl(args.url, args.l)
+    print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")
+
+    imgs = set()
+    for link in visited_links:
+        print(f"{Colors.BLUE}Extracting images from {link}{Colors.RESET}")
+        imgs.update(extract_image_sources(link))
+
+    print(f"{Colors.BLUE}Found {len(imgs)} images:{Colors.RESET}")
+
+    os.makedirs(args.p, exist_ok=True)
+    downloaded = download_images(imgs, args.p)
+    print(f"{Colors.GREEN}Downloaded {len(downloaded)} images to {args.p}{Colors.RESET}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file