Update .gitignore, remove Anaconda.py, add Scorpion.py for image metadata extraction, and fix setup script for virtual environment
This commit is contained in:
parent
e5700d4fbd
commit
f9c281b454
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
.venv/
|
.venv/
|
||||||
|
data/
|
||||||
@ -4,6 +4,25 @@ import argparse
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from PIL.ExifTags import TAGS
|
from PIL.ExifTags import TAGS
|
||||||
|
|
||||||
|
def get_file_metadata(filepath):
    """
    Collect basic metadata from an image file.

    Args:
        filepath: Path of the image file to inspect.

    Returns:
        dict: 'Filename', 'Format', 'Mode', 'Size' and 'Info' entries
        describing the image, or an empty dict if the file could not be
        read (an error message is printed in that case).
    """
    try:
        # Use the image as a context manager so the underlying file handle
        # is released as soon as the attributes have been read; a bare
        # Image.open() keeps the file open until garbage collection.
        with Image.open(filepath) as image:
            return {
                'Filename': os.path.basename(filepath),
                'Format': image.format,
                'Mode': image.mode,
                'Size': image.size,
                'Info': image.info,
            }
    except Exception as e:
        # Best-effort helper: report the problem and let the caller
        # continue with the next file instead of aborting.
        print(f"Error reading metadata from {filepath}: {e}")
        return {}
|
||||||
|
|
||||||
|
|
||||||
def get_exif_data(filepath):
|
def get_exif_data(filepath):
|
||||||
"""
|
"""
|
||||||
Extract and print Exif data from an image file.
|
Extract and print Exif data from an image file.
|
||||||
@ -38,6 +57,13 @@ def main():
|
|||||||
print(f"\033[93mExif data for {image_path}:\033[0m")
|
print(f"\033[93mExif data for {image_path}:\033[0m")
|
||||||
for tag, value in exif_data.items():
|
for tag, value in exif_data.items():
|
||||||
print(f" {tag}: {value}")
|
print(f" {tag}: {value}")
|
||||||
|
|
||||||
|
metadata = get_file_metadata(image_path)
|
||||||
|
if metadata:
|
||||||
|
print(f"\033[92mMetadata for {image_path}:\033[0m")
|
||||||
|
for key, value in metadata.items():
|
||||||
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print(f"No Exif data found for {image_path}")
|
print(f"No Exif data found for {image_path}")
|
||||||
|
|
||||||
26
Spider.py
26
Spider.py
@ -4,7 +4,10 @@ from urllib.parse import urlparse, urljoin
|
|||||||
import argparse
|
import argparse
|
||||||
from collections import deque
|
from collections import deque
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
|
||||||
|
}
|
||||||
|
|
||||||
class Colors:
|
class Colors:
|
||||||
"""
|
"""
|
||||||
@ -29,9 +32,6 @@ def get_all_links(url):
|
|||||||
set: A set of unique links found on the page.
|
set: A set of unique links found on the page.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
|
|
||||||
}
|
|
||||||
response = requests.get(url, headers=headers)
|
response = requests.get(url, headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
@ -94,7 +94,7 @@ def extract_image_sources(url):
|
|||||||
list: A list of image URLs found on the page.
|
list: A list of image URLs found on the page.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = requests.get(url)
|
response = requests.get(url, headers=headers)
|
||||||
response.raise_for_status() # Raise an error for bad responses
|
response.raise_for_status() # Raise an error for bad responses
|
||||||
|
|
||||||
img_sources = re.findall(r'<img[^>]+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE)
|
img_sources = re.findall(r'<img[^>]+src="([^"]+(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE)
|
||||||
@ -111,9 +111,12 @@ def get_filename_from_url(url):
|
|||||||
Remove http:// or https:// from the URL and replace / with _ to create a valid filename.
|
Remove http:// or https:// from the URL and replace / with _ to create a valid filename.
|
||||||
"""
|
"""
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
path = parsed_url.path[-20:] if len(parsed_url.path) > 20 else parsed_url.path
|
path = os.path.basename(parsed_url.path) if len(parsed_url.path) > 20 else parsed_url.path
|
||||||
filename = parsed_url.netloc + path
|
|
||||||
|
filename = parsed_url.netloc + '__' + path
|
||||||
filename = filename.replace('/', '_')
|
filename = filename.replace('/', '_')
|
||||||
|
if len(filename) > 225:
|
||||||
|
filename = filename[:225]
|
||||||
return filename if filename else url
|
return filename if filename else url
|
||||||
|
|
||||||
def download_images(imgs, directory):
|
def download_images(imgs, directory):
|
||||||
@ -128,7 +131,7 @@ def download_images(imgs, directory):
|
|||||||
downloaded = set()
|
downloaded = set()
|
||||||
for img in imgs:
|
for img in imgs:
|
||||||
try:
|
try:
|
||||||
response = requests.get(img)
|
response = requests.get(img, headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
filename = get_filename_from_url(img)
|
filename = get_filename_from_url(img)
|
||||||
@ -156,6 +159,13 @@ def main():
|
|||||||
parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
|
parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if not args.r:
|
||||||
|
print(f"{Colors.RED}Please use -r to spider the URL. Setting depth to 0{Colors.RESET}")
|
||||||
|
args.l = 0
|
||||||
|
|
||||||
|
if not args.url.startswith(('http://', 'https://')):
|
||||||
|
print(f"{Colors.RED}Invalid URL format. trying http://{args.url}{Colors.RESET}")
|
||||||
|
args.url = f"http://{args.url}"
|
||||||
|
|
||||||
visited_links = crawl(args.url, args.l)
|
visited_links = crawl(args.url, args.l)
|
||||||
print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")
|
print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user