diff --git a/script/Scraper/gutenberg/gutenberg_scrapper.py b/script/Scraper/gutenberg/gutenberg_scrapper.py
index b3911fc..33b3f36 100644
--- a/script/Scraper/gutenberg/gutenberg_scrapper.py
+++ b/script/Scraper/gutenberg/gutenberg_scrapper.py
@@ -2,6 +2,7 @@
 import json
 import os
 from bs4 import BeautifulSoup
+from collections import deque
 import time
 
 PROXY_CONNECTION_TIMEOUT = 5 # Timeout value in seconds
@@ -10,10 +11,10 @@ class GutenbergScraper:
     def __init__(self):
         """
         Initializes the GutenbergScraper class.
-        Loads the proxy list from the local file.
+        Initializes the proxy list to be handled with a deque.
         Loads the last processed book number from the progress file.
         """
-        self.proxy_list = self.load_proxy_list()
+        self.proxy_list = deque(self.load_proxy_list())
         self.progress_file = r"script\Scraper\gutenberg\progress.txt"
         self.json_file_path = r"StoreHouse\Literature\gutenberg_bibliographic_records.json"
         self.last_book_number = self.load_progress()
@@ -21,13 +22,24 @@ def __init__(self):
 
     def load_proxy_list(self):
         """
-        Loads the proxy list from the local file.
-        Returns a list of proxies.
+        Prompts the user for a path to a proxy list.
+        Returns a list of proxies (wrapped in a deque by the caller).
         """
-        with open(r"script\proxy\validProxyList.txt", "r") as file:
+        user_input = input("Enter the path for a proxy list (or press Enter to skip without using a proxy list):\n")
+
+        # Check if the user provided a path
+        if not user_input:
+            print("Skipping proxy list, loading...")
+            return []
+
+        file_path = os.path.join(user_input)
+
+        with open(file_path, "r") as file:
             proxy_list = file.read().splitlines()
+        print("Confirming proxy list was loaded:", proxy_list[0])
         return proxy_list
 
+
     def load_progress(self):
         """
         Loads the last processed book number from the progress file.
@@ -67,11 +79,14 @@ def save_progress(self, book_number, book_data):
 
     def rotate_proxy(self):
         """
-        Rotates the proxy list by moving the first proxy to the end.
+        Rotates the deque by moving the first proxy to the end.
         """
-        if self.proxy_list:
-            proxy = self.proxy_list.pop(0) # Get the first proxy from the list
-            self.proxy_list.append(proxy)
+        try:
+            if self.proxy_list:
+                self.proxy_list.rotate(-1) # Rotate the deque for better efficiency
+        except IndexError:
+            pass
+
 
     def get_html_content(self, url, use_proxy=False):
         """
@@ -107,6 +122,7 @@ def get_html_content(self, url, use_proxy=False):
             print(f"\nFailed to fetch URL: {url}")
             return None
 
+
     def scrape_gutenberg(self):
         """
         Scrapes the Gutenberg website for bibliographic records.
@@ -159,7 +175,6 @@ def scrape_gutenberg(self):
         with open(self.json_file_path, "a", encoding="utf-8") as file:
             file.write("}")
 
-        # Remove progress file after scraping is complete
         if os.path.exists(self.progress_file):
             os.remove(self.progress_file)
 