diff --git a/.gitignore b/.gitignore index 0cafc1c..252f497 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.venv/ \ No newline at end of file +.venv/ +data/ \ No newline at end of file diff --git a/Anaconda.py b/Scorpion.py similarity index 64% rename from Anaconda.py rename to Scorpion.py index 562408b..ad96202 100644 --- a/Anaconda.py +++ b/Scorpion.py @@ -4,6 +4,25 @@ import argparse from PIL import Image from PIL.ExifTags import TAGS +def get_file_metadata(filepath): + """ + Extract and print metadata from an image file. + """ + try: + image = Image.open(filepath) + metadata = { + 'Filename': os.path.basename(filepath), + 'Format': image.format, + 'Mode': image.mode, + 'Size': image.size, + 'Info': image.info + } + return metadata + except Exception as e: + print(f"Error reading metadata from {filepath}: {e}") + return {} + + def get_exif_data(filepath): """ Extract and print Exif data from an image file. @@ -38,6 +57,13 @@ def main(): print(f"\033[93mExif data for {image_path}:\033[0m") for tag, value in exif_data.items(): print(f" {tag}: {value}") + + metadata = get_file_metadata(image_path) + if metadata: + print(f"\033[92mMetadata for {image_path}:\033[0m") + for key, value in metadata.items(): + print(f" {key}: {value}") + else: print(f"No Exif data found for {image_path}") diff --git a/Spider.py b/Spider.py index 1b1758c..3e181aa 100644 --- a/Spider.py +++ b/Spider.py @@ -4,7 +4,10 @@ from urllib.parse import urlparse, urljoin import argparse from collections import deque import os - + +headers = { + 'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)' + } class Colors: """ @@ -29,9 +32,6 @@ def get_all_links(url): set: A set of unique links found on the page. """ try: - headers = { - 'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)' - } response = requests.get(url, headers=headers) response.raise_for_status() @@ -94,7 +94,7 @@ def extract_image_sources(url): list: A list of image URLs found on the page. """ try: - response = requests.get(url) + response = requests.get(url, headers=headers) response.raise_for_status() # Raise an error for bad responses img_sources = re.findall(r']+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE) @@ -111,9 +111,12 @@ def get_filename_from_url(url): Remove http:// or https:// from the URL and replace / with _ to create a valid filename. """ parsed_url = urlparse(url) - path = parsed_url.path[-20:] if len(parsed_url.path) > 20 else parsed_url.path - filename = parsed_url.netloc + path + path = os.path.basename(parsed_url.path) if len(parsed_url.path) > 20 else parsed_url.path + + filename = parsed_url.netloc + '__' + path filename = filename.replace('/', '_') + if len(filename) > 225: + filename = filename[:225] return filename if filename else url def download_images(imgs, directory): @@ -128,7 +131,7 @@ def download_images(imgs, directory): downloaded = set() for img in imgs: try: - response = requests.get(img) + response = requests.get(img, headers=headers) response.raise_for_status() filename = get_filename_from_url(img) @@ -156,6 +159,13 @@ def main(): parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save") args = parser.parse_args() + if not args.r: + print(f"{Colors.RED}Please use -r to spider the URL. Setting depth to 0{Colors.RESET}") + args.l = 0 + + if not args.url.startswith(('http://', 'https://')): + print(f"{Colors.RED}Invalid URL format. trying http://{args.url}{Colors.RESET}") + args.url = f"http://{args.url}" visited_links = crawl(args.url, args.l) print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}") diff --git a/setup.sh b/setup.sh index 64795d3..c0b603b 100644 --- a/setup.sh +++ b/setup.sh @@ -1,4 +1,4 @@ #!/bin/bash -python3 -m venv venv +python3 -m venv .venv source venv/bin/activate pip install -r requirements.txt