From 72c1af855a144ab3fb6828cc4bf3e1100fb896e5 Mon Sep 17 00:00:00 2001 From: chasserb Date: Mon, 2 Oct 2023 21:04:22 -0400 Subject: [PATCH 1/2] Bug fix and list replaced with deque --- .../Scraper/gutenberg/gutenberg_scrapper.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/script/Scraper/gutenberg/gutenberg_scrapper.py b/script/Scraper/gutenberg/gutenberg_scrapper.py index b3911fc..2574582 100644 --- a/script/Scraper/gutenberg/gutenberg_scrapper.py +++ b/script/Scraper/gutenberg/gutenberg_scrapper.py @@ -2,6 +2,7 @@ import json import os from bs4 import BeautifulSoup +from collections import deque import time PROXY_CONNECTION_TIMEOUT = 5 # Timeout value in seconds @@ -13,7 +14,7 @@ def __init__(self): Loads the proxy list from the local file. Loads the last processed book number from the progress file. """ - self.proxy_list = self.load_proxy_list() + self.proxy_list = deque(self.load_proxy_list()) self.progress_file = r"script\Scraper\gutenberg\progress.txt" self.json_file_path = r"StoreHouse\Literature\gutenberg_bibliographic_records.json" self.last_book_number = self.load_progress() @@ -24,9 +25,17 @@ def load_proxy_list(self): Loads the proxy list from the local file. Returns a list of proxies. """ - with open(r"script\proxy\validProxyList.txt", "r") as file: - proxy_list = file.read().splitlines() - return proxy_list + # Provide path and file name to be read in + directory_path = "/script/proxy/validProxyList.txt" + file_name = "validProxyList.txt" + file_path = os.path.join(directory_path, file_name) + + if os.path.exists(file_path): + with open(file_path, "r") as file: + proxy_list = file.read().splitlines() + return proxy_list + else: + return [] # Return an empty list if the file doesn't exist def load_progress(self): """ @@ -69,9 +78,11 @@ def rotate_proxy(self): """ Rotates the proxy list by moving the first proxy to the end. """ - if self.proxy_list: - proxy = self.proxy_list.pop(0) # Get the first proxy from the list - self.proxy_list.append(proxy) + try: + if self.proxy_list: + self.proxy_list.rotate(-1) # Rotate the deque for better efficiency + except IndexError: + pass def get_html_content(self, url, use_proxy=False): """ From 9c2af485a2c4cc0373ee193baabb2d21f2bdd781 Mon Sep 17 00:00:00 2001 From: chasserb Date: Tue, 3 Oct 2023 20:32:16 -0400 Subject: [PATCH 2/2] Prompt the user for a proxy list or skip --- .../Scraper/gutenberg/gutenberg_scrapper.py | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/script/Scraper/gutenberg/gutenberg_scrapper.py b/script/Scraper/gutenberg/gutenberg_scrapper.py index 2574582..33b3f36 100644 --- a/script/Scraper/gutenberg/gutenberg_scrapper.py +++ b/script/Scraper/gutenberg/gutenberg_scrapper.py @@ -11,7 +11,7 @@ class GutenbergScraper: def __init__(self): """ Initializes the GutenbergScraper class. - Loads the proxy list from the local file. + Initializes the proxy list to be handled with a deque Loads the last processed book number from the progress file. """ self.proxy_list = deque(self.load_proxy_list()) @@ -22,20 +22,23 @@ def __init__(self): def load_proxy_list(self): """ - Loads the proxy list from the local file. - Returns a list of proxies. + Prompts the user for a path to a proxy list. + Returns a deque of proxies. """ - # Provide path and file name to be read in - directory_path = "/script/proxy/validProxyList.txt" - file_name = "validProxyList.txt" - file_path = os.path.join(directory_path, file_name) - - if os.path.exists(file_path): - with open(file_path, "r") as file: - proxy_list = file.read().splitlines() - return proxy_list - else: - return [] # Return an empty list if the file doesn't exist + user_input = input("Enter the path for a proxy list (or press Enter to skip without using a proxy list):\n") + + # Check if the user provided a path + if not user_input: + print("Skipping proxy list, loading...") + return [] + + file_path = os.path.join(user_input) + + with open(file_path, "r") as file: + proxy_list = file.read().splitlines() + print("Confirming proxy list was loaded:", proxy_list[1]) + return proxy_list + def load_progress(self): """ @@ -76,7 +79,7 @@ def save_progress(self, book_number, book_data): def rotate_proxy(self): """ - Rotates the proxy list by moving the first proxy to the end. + Rotates the deque by moving the first proxy to the end. """ try: if self.proxy_list: @@ -84,6 +87,7 @@ def rotate_proxy(self): except IndexError: pass + def get_html_content(self, url, use_proxy=False): """ Fetches the HTML content of the given URL using a rotating proxy if specified. @@ -118,6 +122,7 @@ def get_html_content(self, url, use_proxy=False): print(f"\nFailed to fetch URL: {url}") return None + def scrape_gutenberg(self): """ Scrapes the Gutenberg website for bibliographic records. @@ -170,7 +175,6 @@ def scrape_gutenberg(self): with open(self.json_file_path, "a", encoding="utf-8") as file: file.write("}") - # Remove progress file after scraping is complete if os.path.exists(self.progress_file): os.remove(self.progress_file)