Update .gitignore, remove Anaconda.py, add Scorpion.py for image metadata extraction, and fix setup script for virtual environment

This commit is contained in:
whaffman 2025-07-03 15:05:57 +02:00
parent e5700d4fbd
commit f9c281b454
4 changed files with 47 additions and 10 deletions

3
.gitignore vendored
View File

@@ -1 +1,2 @@
 .venv/
+data/

View File

@ -4,6 +4,25 @@ import argparse
from PIL import Image from PIL import Image
from PIL.ExifTags import TAGS from PIL.ExifTags import TAGS
def get_file_metadata(filepath):
    """
    Extract basic metadata from an image file.

    Args:
        filepath: Path to the image file.

    Returns:
        dict: Filename, format, mode, size and the raw PIL info dict,
        or an empty dict if the file cannot be opened as an image.
    """
    try:
        # Context manager guarantees the underlying file handle is closed
        # even if an attribute access raises (the original leaked the handle).
        with Image.open(filepath) as image:
            return {
                'Filename': os.path.basename(filepath),
                'Format': image.format,
                'Mode': image.mode,
                'Size': image.size,
                'Info': image.info,
            }
    except Exception as e:
        # Best-effort contract: report the problem and let the caller
        # continue with the remaining files instead of crashing.
        print(f"Error reading metadata from {filepath}: {e}")
        return {}
def get_exif_data(filepath): def get_exif_data(filepath):
""" """
Extract and print Exif data from an image file. Extract and print Exif data from an image file.
@ -38,6 +57,13 @@ def main():
print(f"\033[93mExif data for {image_path}:\033[0m") print(f"\033[93mExif data for {image_path}:\033[0m")
for tag, value in exif_data.items(): for tag, value in exif_data.items():
print(f" {tag}: {value}") print(f" {tag}: {value}")
metadata = get_file_metadata(image_path)
if metadata:
print(f"\033[92mMetadata for {image_path}:\033[0m")
for key, value in metadata.items():
print(f" {key}: {value}")
else: else:
print(f"No Exif data found for {image_path}") print(f"No Exif data found for {image_path}")

View File

@ -4,7 +4,10 @@ from urllib.parse import urlparse, urljoin
import argparse import argparse
from collections import deque from collections import deque
import os import os
# Shared headers for every outgoing HTTP request: some servers reject
# requests that carry no User-Agent, so identify this crawler explicitly.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
}
class Colors: class Colors:
""" """
@ -29,9 +32,6 @@ def get_all_links(url):
set: A set of unique links found on the page. set: A set of unique links found on the page.
""" """
try: try:
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
}
response = requests.get(url, headers=headers) response = requests.get(url, headers=headers)
response.raise_for_status() response.raise_for_status()
@ -94,7 +94,7 @@ def extract_image_sources(url):
list: A list of image URLs found on the page. list: A list of image URLs found on the page.
""" """
try: try:
response = requests.get(url) response = requests.get(url, headers=headers)
response.raise_for_status() # Raise an error for bad responses response.raise_for_status() # Raise an error for bad responses
img_sources = re.findall(r'<img[^>]+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE) img_sources = re.findall(r'<img[^>]+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE)
@ -111,9 +111,12 @@ def get_filename_from_url(url):
Remove http:// or https:// from the URL and replace / with _ to create a valid filename. Remove http:// or https:// from the URL and replace / with _ to create a valid filename.
""" """
parsed_url = urlparse(url) parsed_url = urlparse(url)
path = parsed_url.path[-20:] if len(parsed_url.path) > 20 else parsed_url.path path = os.path.basename(parsed_url.path) if len(parsed_url.path) > 20 else parsed_url.path
filename = parsed_url.netloc + path
filename = parsed_url.netloc + '__' + path
filename = filename.replace('/', '_') filename = filename.replace('/', '_')
if len(filename) > 225:
filename = filename[:225]
return filename if filename else url return filename if filename else url
def download_images(imgs, directory): def download_images(imgs, directory):
@ -128,7 +131,7 @@ def download_images(imgs, directory):
downloaded = set() downloaded = set()
for img in imgs: for img in imgs:
try: try:
response = requests.get(img) response = requests.get(img, headers=headers)
response.raise_for_status() response.raise_for_status()
filename = get_filename_from_url(img) filename = get_filename_from_url(img)
@ -156,6 +159,13 @@ def main():
parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save") parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
args = parser.parse_args() args = parser.parse_args()
if not args.r:
print(f"{Colors.RED}Please use -r to spider the URL. Setting depth to 0{Colors.RESET}")
args.l = 0
if not args.url.startswith(('http://', 'https://')):
print(f"{Colors.RED}Invalid URL format. trying http://{args.url}{Colors.RESET}")
args.url = f"http://{args.url}"
visited_links = crawl(args.url, args.l) visited_links = crawl(args.url, args.l)
print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}") print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")

View File

@@ -1,4 +1,4 @@
 #!/bin/bash
-python3 -m venv venv
+python3 -m venv .venv
 source venv/bin/activate
 pip install -r requirements.txt