Add initial implementation of image metadata extraction and web crawling functionality

This commit is contained in:
whaffman 2025-07-02 12:30:01 +02:00
parent d601559ba7
commit 32a43816f4
4 changed files with 379 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.venv/

45
Anaconda.py Normal file
View File

@@ -0,0 +1,45 @@
import os
import argparse
from PIL import Image
from PIL.ExifTags import TAGS
def get_exif_data(filepath):
    """
    Extract Exif metadata from an image file.

    Args:
        filepath (str): Path to the image file.

    Returns:
        dict: Mapping of Exif tag names (raw numeric tag ids when the tag is
        unknown) to their values; empty dict when the file has no Exif data
        or cannot be read.
    """
    try:
        # Context manager closes the underlying file handle; the original
        # left it open because Image.open() keeps the file lazily attached.
        with Image.open(filepath) as image:
            # NOTE(review): _getexif() is a private Pillow API; the public
            # successor is Image.getexif(), which returns a different mapping.
            exif_data = image._getexif()
        if not exif_data:
            print(f"No Exif data found in {filepath}")
            return {}
        # Translate numeric tag ids into human-readable names where known.
        return {TAGS.get(tag_id, tag_id): value for tag_id, value in exif_data.items()}
    except Exception as e:
        # Best-effort: report and return an empty result rather than crash.
        print(f"Error reading Exif data from {filepath}: {e}")
        return {}
def main():
    """CLI entry point: print the Exif metadata of each image path given."""
    parser = argparse.ArgumentParser(description="Program to display Metadata of Images")
    parser.add_argument('images', nargs='+', help='Paths to one or more image files')
    args = parser.parse_args()

    for image_path in args.images:
        # Skip anything that is not an existing regular file.
        if not os.path.isfile(image_path):
            print(f"File {image_path} does not exist or is not a file.")
            continue
        exif_data = get_exif_data(image_path)
        if not exif_data:
            print(f"No Exif data found for {image_path}")
            continue
        print(f"\033[93mExif data for {image_path}:\033[0m")
        for tag, value in exif_data.items():
            print(f" {tag}: {value}")


if __name__ == "__main__":
    main()

156
Spider.py Normal file
View File

@@ -0,0 +1,156 @@
import os
import argparse
import requests
import re
from urllib.parse import urlparse
from PIL import Image
from PIL.ExifTags import TAGS
def same_domain(url1, url2):
    """
    Check whether two URLs share the same network location.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True when both URLs have an identical netloc
        (host plus optional port; the scheme is ignored).
    """
    return urlparse(url1).netloc == urlparse(url2).netloc
def crawl(url, depth, imgs=None):
    """
    Recursively crawl *url*, collecting image URLs into *imgs*.

    Links are only followed within the same domain, and only while depth > 0.

    Args:
        url (str): The URL to crawl.
        depth (int): Remaining recursion depth; at 0 only images are extracted.
        imgs (set | None): Accumulator for found image URLs. A fresh set is
            created when omitted — the original used a mutable default
            argument, which is shared across calls.

    Returns:
        set: All image URLs found so far (also returned on fetch errors, so
        the return type is consistent; the original returned [] on error).
    """
    if imgs is None:
        imgs = set()
    print(f"\033[92mFetching links from {url} at depth {depth}\033[0m")
    try:
        response = requests.get(url)
        response.raise_for_status()
        links = re.findall(r'<a[^>]+href="([^"]+)"', response.text)
        for link in links:
            # Skip fragments and non-HTTP schemes.
            if link.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
                continue
            if not link.startswith('http'):
                link = requests.compat.urljoin(url, link)
            if not same_domain(url, link):
                continue
            if depth > 0:
                crawl(link, depth - 1, imgs)
        # Single fetch of the page's images — the original called
        # extract_image_sources(url) twice, doubling the HTTP traffic.
        for img in extract_image_sources(url):
            if img not in imgs:
                print(f"\033[94mFound image: {img}\033[0m")
                imgs.add(img)
        return imgs
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return imgs
def extract_image_sources(url):
    """
    Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page; empty list on fetch error.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        # Require a '.' before the extension — the original pattern also
        # matched names like "photo_jpg" that merely end in the letters.
        img_sources = re.findall(r'src="([^"]+\.(?:jpg|jpeg|gif|png|bmp))"', response.text, re.IGNORECASE)
        # Resolve relative sources against the page URL.
        return [requests.compat.urljoin(url, src) for src in img_sources]
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
def download_images(imgs, directory):
    """
    Download images to the specified directory.

    Args:
        imgs (set): A set of image URLs to download.
        directory (str): The directory where images will be saved.

    Returns:
        set: File paths of successfully downloaded images.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img)
            response.raise_for_status()
            filename = get_filename_from_url(img)
            # Bug fix: write to the derived filename — previously the
            # computed name was ignored and every image was written to the
            # same hard-coded placeholder path.
            filepath = f"{directory}/{filename}"
            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)
        except requests.RequestException as e:
            print(f"Error downloading {img}: {e}")
    return downloaded
def get_filename_from_url(url):
    """
    Turn a URL into a flat filename: drop the scheme, keep netloc + path,
    and replace '/' with '_'. Falls back to the raw URL when the result
    would be empty.
    """
    parts = urlparse(url)
    flattened = (parts.netloc + parts.path).replace('/', '_')
    return flattened or url
def main():
    """CLI entry point: crawl a URL, then download every image found."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    # Bug fix: "-r" is a flag, not a valued option — without store_true it
    # consumed the next token. This matches the corrected Spider2.py parser.
    parser.add_argument("-r", action="store_true", default=False, help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", default="./data", help="Output file to save")
    args = parser.parse_args()

    imgs = crawl(args.url, args.l)
    print(f"\033[92mFound {len(imgs)} images\033[0m")
    if args.p:
        # Create the output directory on demand before downloading.
        if not os.path.exists(args.p):
            os.makedirs(args.p)
        downloaded = download_images(imgs, args.p)
        print(f"\033[92mDownloaded {len(downloaded)} images to {args.p}\033[0m")


if __name__ == "__main__":
    main()

177
Spider2.py Normal file
View File

@@ -0,0 +1,177 @@
import requests
import re
from urllib.parse import urlparse, urljoin
import argparse
from collections import deque
import os
class Colors:
    """ANSI escape codes for colored terminal output."""

    GREEN = '\033[92m'   # progress / success messages
    YELLOW = '\033[93m'  # summary headings
    BLUE = '\033[94m'    # per-item details
    RED = '\033[91m'     # errors
    RESET = '\033[0m'    # restore the default terminal color
"""
get all links from a given url
"""
def get_all_links(url):
"""
Fetch all links from the given URL.
Args:
url (str): The URL to fetch links from.
Returns:
set: A set of unique links found on the page.
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
}
response = requests.get(url, headers=headers)
response.raise_for_status()
links = set(re.findall(r'<a[^>]+href="([^"]+)"', response.text, re.IGNORECASE))
links = {urljoin(url, link) for link in links if not link.startswith(('#', 'mailto:', 'javascript:', 'tel:'))}
print(f"{Colors.GREEN}Found {len(links)} links on {url}{Colors.RESET}")
return links
except requests.RequestException as e:
print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
return set()
def same_domain(url1, url2):
    """
    Check if two URLs belong to the same domain.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True if both URLs have the same netloc, False otherwise.
    """
    first = urlparse(url1).netloc
    second = urlparse(url2).netloc
    return first == second
"""breadth first crawl using get_all_links
"""
def crawl(url, depth):
"""
Crawl the given URL and extract links up to a specified depth.
Args:
url (str): The URL to start crawling from.
depth (int): The maximum depth of crawling.
Returns:
set: A set of unique links found during the crawl.
"""
visited = set()
queue = deque([(url, 0)]) # (current_url, current_depth)
while queue:
current_url, current_depth = queue.popleft()
if current_depth > depth or current_url in visited:
continue
if current_depth <= depth:
visited.add(current_url)
if current_depth < depth:
links = get_all_links(current_url)
for link in links:
if link not in visited and same_domain(url, link):
queue.append((link, current_depth + 1))
return visited
def extract_image_sources(url):
    """
    Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page; empty list on fetch error.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        # Require a '.' before the extension (the original also matched names
        # merely ending in the letters, e.g. "photo_jpg"); the trailing group
        # still tolerates a query string after the extension.
        img_sources = re.findall(r'<img[^>]+src="([^"]+\.(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE)
        # Resolve relative sources against the page URL.
        return [requests.compat.urljoin(url, src) for src in img_sources]
    except requests.RequestException as e:
        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
        return []
def get_filename_from_url(url):
    """
    Derive a filesystem-safe filename from a URL.

    Strips the scheme, keeps at most the last 20 characters of the path,
    and replaces '/' with '_'. Falls back to the raw URL when the result
    would be empty.
    """
    parts = urlparse(url)
    tail = parts.path if len(parts.path) <= 20 else parts.path[-20:]
    name = (parts.netloc + tail).replace('/', '_')
    return name or url
def download_images(imgs, directory):
    """
    Download images to the specified directory, de-duplicating filenames.

    Args:
        imgs (set): A set of image URLs to download.
        directory (str): The directory where images will be saved.

    Returns:
        set: File paths of successfully downloaded images.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img)
            response.raise_for_status()
            filename = get_filename_from_url(img)
            # Bug fix: use the derived filename — previously a hard-coded
            # placeholder path was used, ignoring the computed name and
            # defeating the collision counter below.
            filepath = f"{directory}/{filename}"
            base, ext = os.path.splitext(filepath)
            counter = 1
            # Append "(n)" before the extension until the path is unused.
            while os.path.exists(filepath):
                filepath = f"{base}({counter}){ext}"
                counter += 1
            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)
        except requests.RequestException as e:
            print(f"Error downloading {img}: {e}")
    return downloaded
def main():
    """CLI entry point: crawl, report links, then extract and download images."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    parser.add_argument("-r", action="store_true", default=False, help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
    args = parser.parse_args()

    # Phase 1: breadth-first link discovery.
    visited_links = crawl(args.url, args.l)
    print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")

    # Phase 2: collect image URLs from every visited page.
    imgs = set()
    for link in visited_links:
        print(f"{Colors.BLUE}Extracting images from {link}{Colors.RESET}")
        imgs.update(extract_image_sources(link))
    print(f"{Colors.BLUE}Found {len(imgs)} images:{Colors.RESET}")

    # Phase 3: download into the output directory, creating it on demand.
    if not os.path.exists(args.p):
        os.makedirs(args.p)
    downloaded = download_images(imgs, args.p)
    print(f"{Colors.GREEN}Downloaded {len(downloaded)} images to {args.p}{Colors.RESET}")


if __name__ == "__main__":
    main()