Update .gitignore, remove Anaconda.py, add Scorpion.py for image metadata extraction, and fix setup script for virtual environment
This commit is contained in:
parent
e5700d4fbd
commit
f9c281b454
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
||||
.venv/
|
||||
data/
|
||||
@ -4,6 +4,25 @@ import argparse
|
||||
from PIL import Image
|
||||
from PIL.ExifTags import TAGS
|
||||
|
||||
def get_file_metadata(filepath):
    """
    Extract basic metadata from an image file.

    Args:
        filepath: Path to the image file to inspect.

    Returns:
        dict: Mapping with 'Filename', 'Format', 'Mode', 'Size' and 'Info'
        entries for the image, or an empty dict if the file cannot be read.
    """
    try:
        # Use a context manager so the underlying file handle is closed:
        # Image.open() is lazy and would otherwise keep the file open,
        # leaking a descriptor per processed image.
        with Image.open(filepath) as image:
            return {
                'Filename': os.path.basename(filepath),
                'Format': image.format,
                'Mode': image.mode,
                'Size': image.size,
                'Info': image.info,
            }
    except Exception as e:
        # Best-effort tool behavior: report the failure and return an empty
        # mapping so the caller can continue with the remaining files.
        print(f"Error reading metadata from {filepath}: {e}")
        return {}
|
||||
|
||||
|
||||
def get_exif_data(filepath):
|
||||
"""
|
||||
Extract and print Exif data from an image file.
|
||||
@ -38,6 +57,13 @@ def main():
|
||||
print(f"\033[93mExif data for {image_path}:\033[0m")
|
||||
for tag, value in exif_data.items():
|
||||
print(f" {tag}: {value}")
|
||||
|
||||
metadata = get_file_metadata(image_path)
|
||||
if metadata:
|
||||
print(f"\033[92mMetadata for {image_path}:\033[0m")
|
||||
for key, value in metadata.items():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
else:
|
||||
print(f"No Exif data found for {image_path}")
|
||||
|
||||
24
Spider.py
24
Spider.py
@ -5,6 +5,9 @@ import argparse
|
||||
from collections import deque
|
||||
import os
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
|
||||
}
|
||||
|
||||
class Colors:
|
||||
"""
|
||||
@ -29,9 +32,6 @@ def get_all_links(url):
|
||||
set: A set of unique links found on the page.
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
|
||||
}
|
||||
response = requests.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
@ -94,7 +94,7 @@ def extract_image_sources(url):
|
||||
list: A list of image URLs found on the page.
|
||||
"""
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response = requests.get(url, headers=headers)
|
||||
response.raise_for_status() # Raise an error for bad responses
|
||||
|
||||
img_sources = re.findall(r'<img[^>]+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE)
|
||||
@ -111,9 +111,12 @@ def get_filename_from_url(url):
|
||||
Remove http:// or https:// from the URL and replace / with _ to create a valid filename.
|
||||
"""
|
||||
parsed_url = urlparse(url)
|
||||
path = parsed_url.path[-20:] if len(parsed_url.path) > 20 else parsed_url.path
|
||||
filename = parsed_url.netloc + path
|
||||
path = os.path.basename(parsed_url.path) if len(parsed_url.path) > 20 else parsed_url.path
|
||||
|
||||
filename = parsed_url.netloc + '__' + path
|
||||
filename = filename.replace('/', '_')
|
||||
if len(filename) > 225:
|
||||
filename = filename[:225]
|
||||
return filename if filename else url
|
||||
|
||||
def download_images(imgs, directory):
|
||||
@ -128,7 +131,7 @@ def download_images(imgs, directory):
|
||||
downloaded = set()
|
||||
for img in imgs:
|
||||
try:
|
||||
response = requests.get(img)
|
||||
response = requests.get(img, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
filename = get_filename_from_url(img)
|
||||
@ -156,6 +159,13 @@ def main():
|
||||
parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.r:
|
||||
print(f"{Colors.RED}Please use -r to spider the URL. Setting depth to 0{Colors.RESET}")
|
||||
args.l = 0
|
||||
|
||||
if not args.url.startswith(('http://', 'https://')):
|
||||
print(f"{Colors.RED}Invalid URL format. trying http://{args.url}{Colors.RESET}")
|
||||
args.url = f"http://{args.url}"
|
||||
|
||||
visited_links = crawl(args.url, args.l)
|
||||
print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user