Add initial implementation of image metadata extraction and web crawling functionality

This commit is contained in:
whaffman 2025-07-02 12:30:01 +02:00
parent d601559ba7
commit 32a43816f4
4 changed files with 379 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.venv/

45
Anaconda.py Normal file
View File

@@ -0,0 +1,45 @@
import os
import argparse
from PIL import Image
from PIL.ExifTags import TAGS
def get_exif_data(filepath):
    """
    Extract Exif metadata from an image file.

    Args:
        filepath (str): Path to the image file.

    Returns:
        dict: Mapping of Exif tag names (raw numeric tag ids when the tag is
        unknown) to their values; empty dict when the file has no Exif data
        or cannot be read.
    """
    try:
        # Context manager closes the underlying file handle; the original
        # left it open because Image.open() keeps the file lazily attached.
        with Image.open(filepath) as image:
            # NOTE(review): _getexif() is a private Pillow API; the public
            # successor is Image.getexif(), which returns a different mapping.
            exif_data = image._getexif()
        if not exif_data:
            print(f"No Exif data found in {filepath}")
            return {}
        # Translate numeric tag ids into human-readable names where known.
        return {TAGS.get(tag_id, tag_id): value for tag_id, value in exif_data.items()}
    except Exception as e:
        # Best-effort: report and return an empty result rather than crash.
        print(f"Error reading Exif data from {filepath}: {e}")
        return {}
def main():
    """CLI entry point: print the Exif metadata of each image path given."""
    parser = argparse.ArgumentParser(description="Program to display Metadata of Images")
    parser.add_argument('images', nargs='+', help='Paths to one or more image files')
    args = parser.parse_args()

    for image_path in args.images:
        # Skip anything that is not an existing regular file.
        if not os.path.isfile(image_path):
            print(f"File {image_path} does not exist or is not a file.")
            continue
        exif_data = get_exif_data(image_path)
        if not exif_data:
            print(f"No Exif data found for {image_path}")
            continue
        print(f"\033[93mExif data for {image_path}:\033[0m")
        for tag, value in exif_data.items():
            print(f" {tag}: {value}")


if __name__ == "__main__":
    main()

156
Spider.py Normal file
View File

@@ -0,0 +1,156 @@
import os
import argparse
import requests
import re
from urllib.parse import urlparse
from PIL import Image
from PIL.ExifTags import TAGS
def same_domain(url1, url2):
    """
    Check whether two URLs share the same network location.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True when both URLs have an identical netloc
        (host plus optional port; the scheme is ignored).
    """
    return urlparse(url1).netloc == urlparse(url2).netloc
def crawl(url, depth, imgs=None):
    """
    Recursively crawl *url*, collecting image URLs into *imgs*.

    Links are only followed within the same domain, and only while depth > 0.

    Args:
        url (str): The URL to crawl.
        depth (int): Remaining recursion depth; at 0 only images are extracted.
        imgs (set | None): Accumulator for found image URLs. A fresh set is
            created when omitted — the original used a mutable default
            argument, which is shared across calls.

    Returns:
        set: All image URLs found so far (also returned on fetch errors, so
        the return type is consistent; the original returned [] on error).
    """
    if imgs is None:
        imgs = set()
    print(f"\033[92mFetching links from {url} at depth {depth}\033[0m")
    try:
        response = requests.get(url)
        response.raise_for_status()
        links = re.findall(r'<a[^>]+href="([^"]+)"', response.text)
        for link in links:
            # Skip fragments and non-HTTP schemes.
            if link.startswith(('#', 'mailto:', 'javascript:', 'tel:')):
                continue
            if not link.startswith('http'):
                link = requests.compat.urljoin(url, link)
            if not same_domain(url, link):
                continue
            if depth > 0:
                crawl(link, depth - 1, imgs)
        # Single fetch of the page's images — the original called
        # extract_image_sources(url) twice, doubling the HTTP traffic.
        for img in extract_image_sources(url):
            if img not in imgs:
                print(f"\033[94mFound image: {img}\033[0m")
                imgs.add(img)
        return imgs
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return imgs
def extract_image_sources(url):
    """
    Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page; empty list on fetch error.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        # Require a '.' before the extension — the original pattern also
        # matched names like "photo_jpg" that merely end in the letters.
        img_sources = re.findall(r'src="([^"]+\.(?:jpg|jpeg|gif|png|bmp))"', response.text, re.IGNORECASE)
        # Resolve relative sources against the page URL.
        return [requests.compat.urljoin(url, src) for src in img_sources]
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
def download_images(imgs, directory):
    """
    Download images to the specified directory.

    Args:
        imgs (set): A set of image URLs to download.
        directory (str): The directory where images will be saved.

    Returns:
        set: File paths of successfully downloaded images.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img)
            response.raise_for_status()
            filename = get_filename_from_url(img)
            # Bug fix: write to the derived filename — previously the
            # computed name was ignored and every image was written to the
            # same hard-coded placeholder path.
            filepath = f"{directory}/{filename}"
            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)
        except requests.RequestException as e:
            print(f"Error downloading {img}: {e}")
    return downloaded
def get_filename_from_url(url):
    """
    Turn a URL into a flat filename: drop the scheme, keep netloc + path,
    and replace '/' with '_'. Falls back to the raw URL when the result
    would be empty.
    """
    parts = urlparse(url)
    flattened = (parts.netloc + parts.path).replace('/', '_')
    return flattened or url
def main():
    """CLI entry point: crawl a URL, then download every image found."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    # Bug fix: "-r" is a flag, not a valued option — without store_true it
    # consumed the next token. This matches the corrected Spider2.py parser.
    parser.add_argument("-r", action="store_true", default=False, help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", default="./data", help="Output file to save")
    args = parser.parse_args()

    imgs = crawl(args.url, args.l)
    print(f"\033[92mFound {len(imgs)} images\033[0m")
    if args.p:
        # Create the output directory on demand before downloading.
        if not os.path.exists(args.p):
            os.makedirs(args.p)
        downloaded = download_images(imgs, args.p)
        print(f"\033[92mDownloaded {len(downloaded)} images to {args.p}\033[0m")


if __name__ == "__main__":
    main()

177
Spider2.py Normal file
View File

@@ -0,0 +1,177 @@
import requests
import re
from urllib.parse import urlparse, urljoin
import argparse
from collections import deque
import os
class Colors:
    """ANSI escape codes for colored terminal output."""

    GREEN = '\033[92m'   # progress / success messages
    YELLOW = '\033[93m'  # summary headings
    BLUE = '\033[94m'    # per-item details
    RED = '\033[91m'     # errors
    RESET = '\033[0m'    # restore the default terminal color
"""
get all links from a given url
"""
def get_all_links(url):
"""
Fetch all links from the given URL.
Args:
url (str): The URL to fetch links from.
Returns:
set: A set of unique links found on the page.
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
}
response = requests.get(url, headers=headers)
response.raise_for_status()
links = set(re.findall(r'<a[^>]+href="([^"]+)"', response.text, re.IGNORECASE))
links = {urljoin(url, link) for link in links if not link.startswith(('#', 'mailto:', 'javascript:', 'tel:'))}
print(f"{Colors.GREEN}Found {len(links)} links on {url}{Colors.RESET}")
return links
except requests.RequestException as e:
print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
return set()
def same_domain(url1, url2):
    """
    Check if two URLs belong to the same domain.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True if both URLs have the same netloc, False otherwise.
    """
    first = urlparse(url1).netloc
    second = urlparse(url2).netloc
    return first == second
"""breadth first crawl using get_all_links
"""
def crawl(url, depth):
"""
Crawl the given URL and extract links up to a specified depth.
Args:
url (str): The URL to start crawling from.
depth (int): The maximum depth of crawling.
Returns:
set: A set of unique links found during the crawl.
"""
visited = set()
queue = deque([(url, 0)]) # (current_url, current_depth)
while queue:
current_url, current_depth = queue.popleft()
if current_depth > depth or current_url in visited:
continue
if current_depth <= depth:
visited.add(current_url)
if current_depth < depth:
links = get_all_links(current_url)
for link in links:
if link not in visited and same_domain(url, link):
queue.append((link, current_depth + 1))
return visited
def extract_image_sources(url):
    """
    Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page; empty list on fetch error.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        # Require a '.' before the extension (the original also matched names
        # merely ending in the letters, e.g. "photo_jpg"); the trailing group
        # still tolerates a query string after the extension.
        img_sources = re.findall(r'<img[^>]+src="([^"]+\.(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"', response.text, re.IGNORECASE)
        # Resolve relative sources against the page URL.
        return [requests.compat.urljoin(url, src) for src in img_sources]
    except requests.RequestException as e:
        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
        return []
def get_filename_from_url(url):
    """
    Derive a filesystem-safe filename from a URL.

    Strips the scheme, keeps at most the last 20 characters of the path,
    and replaces '/' with '_'. Falls back to the raw URL when the result
    would be empty.
    """
    parts = urlparse(url)
    tail = parts.path if len(parts.path) <= 20 else parts.path[-20:]
    name = (parts.netloc + tail).replace('/', '_')
    return name or url
def download_images(imgs, directory):
    """
    Download images to the specified directory, de-duplicating filenames.

    Args:
        imgs (set): A set of image URLs to download.
        directory (str): The directory where images will be saved.

    Returns:
        set: File paths of successfully downloaded images.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img)
            response.raise_for_status()
            filename = get_filename_from_url(img)
            # Bug fix: use the derived filename — previously a hard-coded
            # placeholder path was used, ignoring the computed name and
            # defeating the collision counter below.
            filepath = f"{directory}/{filename}"
            base, ext = os.path.splitext(filepath)
            counter = 1
            # Append "(n)" before the extension until the path is unused.
            while os.path.exists(filepath):
                filepath = f"{base}({counter}){ext}"
                counter += 1
            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)
        except requests.RequestException as e:
            print(f"Error downloading {img}: {e}")
    return downloaded
def main():
    """CLI entry point: crawl, report links, then extract and download images."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    parser.add_argument("-r", action="store_true", default=False, help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data", help="Output file to save")
    args = parser.parse_args()

    # Phase 1: breadth-first link discovery.
    visited_links = crawl(args.url, args.l)
    print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")

    # Phase 2: collect image URLs from every visited page.
    imgs = set()
    for link in visited_links:
        print(f"{Colors.BLUE}Extracting images from {link}{Colors.RESET}")
        imgs.update(extract_image_sources(link))
    print(f"{Colors.BLUE}Found {len(imgs)} images:{Colors.RESET}")

    # Phase 3: download into the output directory, creating it on demand.
    if not os.path.exists(args.p):
        os.makedirs(args.p)
    downloaded = download_images(imgs, args.p)
    print(f"{Colors.GREEN}Downloaded {len(downloaded)} images to {args.p}{Colors.RESET}")


if __name__ == "__main__":
    main()