import requests
import re
from urllib.parse import urlparse, urljoin
import argparse
from collections import deque
import os


class Colors:
    """ANSI escape codes for colored terminal output."""
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    RED = '\033[91m'
    RESET = '\033[0m'


def get_all_links(url):
    """
    Fetch all anchor links from the given URL.

    Args:
        url (str): The URL to fetch links from.

    Returns:
        set: Absolute URLs of every usable link found on the page;
             an empty set on any request error.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; Spider2/1.0)'
        }
        # Timeout keeps one unresponsive host from stalling the whole crawl;
        # requests.Timeout is a RequestException, so the handler below covers it.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # The '<a[^>' prefix was stripped from this pattern by HTML-unsafe
        # extraction in the source; restored so it anchors on <a> tags.
        links = set(re.findall(r'<a[^>]+href="([^"]+)"', response.text, re.IGNORECASE))
        # Drop fragments and non-HTTP schemes, then resolve relative links
        # against the page URL.
        links = {urljoin(url, link) for link in links
                 if not link.startswith(('#', 'mailto:', 'javascript:', 'tel:'))}

        print(f"{Colors.GREEN}Found {len(links)} links on {url}{Colors.RESET}")
        return links
    except requests.RequestException as e:
        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
        return set()


def same_domain(url1, url2):
    """
    Check if two URLs belong to the same domain.

    Args:
        url1 (str): The first URL.
        url2 (str): The second URL.

    Returns:
        bool: True if both URLs share the same network location, else False.
    """
    return urlparse(url1).netloc == urlparse(url2).netloc


def crawl(url, depth):
    """
    Breadth-first crawl starting at *url*, following same-domain links only.

    Args:
        url (str): The URL to start crawling from.
        depth (int): The maximum link depth to follow (0 = start page only).

    Returns:
        set: Every unique same-domain URL visited during the crawl.
    """
    visited = set()
    queue = deque([(url, 0)])  # (current_url, current_depth)

    while queue:
        current_url, current_depth = queue.popleft()
        if current_depth > depth or current_url in visited:
            continue
        # The guard above already ensures current_depth <= depth, so the
        # original redundant re-check was dropped.
        visited.add(current_url)
        if current_depth < depth:
            # Only enqueue links that stay on the starting domain.
            for link in get_all_links(current_url):
                if link not in visited and same_domain(url, link):
                    queue.append((link, current_depth + 1))
    return visited


def extract_image_sources(url):
    """
    Fetch image links from the given URL.

    Args:
        url (str): The URL to fetch images from.

    Returns:
        list: Absolute image URLs found on the page (jpg/jpeg/gif/png/bmp);
              an empty list on any request error.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses

        # The '<img[^>' prefix was stripped by HTML-unsafe extraction in the
        # source; restored. Also added the literal '.' before the extension so
        # names like "foojpg" no longer match, and an optional query string
        # (?...) is tolerated after the captured path.
        img_sources = re.findall(
            r'<img[^>]+src="([^"]+\.(?:jpg|jpeg|gif|png|bmp))(?:\?[^"]*)?"',
            response.text, re.IGNORECASE)
        # urljoin is already imported at the top; no need for requests.compat.
        return [urljoin(url, src) for src in img_sources]

    except requests.RequestException as e:
        print(f"{Colors.RED}Error fetching {url}: {e}{Colors.RESET}")
        return []


def get_filename_from_url(url):
    """
    Build a filesystem-safe filename from a URL.

    Drops the scheme, keeps the host plus at most the last 20 characters of
    the path, and replaces '/' with '_'. Collisions caused by the truncation
    are resolved by download_images via a numeric suffix.
    """
    parsed_url = urlparse(url)
    path = parsed_url.path[-20:] if len(parsed_url.path) > 20 else parsed_url.path
    filename = (parsed_url.netloc + path).replace('/', '_')
    return filename if filename else url


def download_images(imgs, directory):
    """
    Download images to the specified directory.

    Args:
        imgs (set): Image URLs to download.
        directory (str): Existing directory where images will be saved.

    Returns:
        set: File paths of the images that were successfully downloaded.
    """
    downloaded = set()
    for img in imgs:
        try:
            response = requests.get(img, timeout=10)
            response.raise_for_status()

            filename = get_filename_from_url(img)
            # BUG FIX: the source wrote every image to a placeholder path and
            # never used `filename`; join the computed name onto the directory.
            filepath = os.path.join(directory, filename)
            # Avoid overwriting an existing file: append (1), (2), ... until
            # the name is free.
            base, ext = os.path.splitext(filepath)
            counter = 1
            while os.path.exists(filepath):
                filepath = f"{base}({counter}){ext}"
                counter += 1

            with open(filepath, 'wb') as file:
                file.write(response.content)
            downloaded.add(filepath)

        except requests.RequestException as e:
            # Use the same colored-error convention as the other fetch paths.
            print(f"{Colors.RED}Error downloading {img}: {e}{Colors.RESET}")

    return downloaded


def main():
    """CLI entry point: crawl, collect image URLs, download them."""
    parser = argparse.ArgumentParser(description="Spider Web Crawler")
    parser.add_argument("url", type=str, help="The URL to start crawling from")
    parser.add_argument("-r", action="store_true", default=False,
                        help="Spider the URL")
    parser.add_argument("-l", type=int, default=5, help="Depth of crawling")
    parser.add_argument("-p", metavar="OUTPUT_PATH", default="./data",
                        help="Directory to save downloaded images into")
    args = parser.parse_args()

    # NOTE(review): -r is parsed but never consulted; crawl depth is governed
    # solely by -l. Kept for CLI compatibility — TODO wire it up or remove.
    visited_links = crawl(args.url, args.l)
    print(f"{Colors.YELLOW}Found {len(visited_links)} unique links on same domain:{Colors.RESET}")

    imgs = set()
    for link in visited_links:
        print(f"{Colors.BLUE}Extracting images from {link}{Colors.RESET}")
        imgs.update(extract_image_sources(link))

    print(f"{Colors.BLUE}Found {len(imgs)} images:{Colors.RESET}")

    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(args.p, exist_ok=True)
    downloaded = download_images(imgs, args.p)
    print(f"{Colors.GREEN}Downloaded {len(downloaded)} images to {args.p}{Colors.RESET}")


if __name__ == "__main__":
    main()
#!/bin/bash
# Create an isolated virtualenv and install the pinned dependencies.
# Fail fast on any error, unset variable, or broken pipe so a half-built
# environment is never mistaken for a working one.
set -euo pipefail

python3 -m venv venv
# Activation only affects this script's shell; it makes the `pip` below
# resolve to venv/bin/pip so packages land inside the venv.
source venv/bin/activate
pip install -r requirements.txt