From 72c1af855a144ab3fb6828cc4bf3e1100fb896e5 Mon Sep 17 00:00:00 2001
From: chasserb <chasserb@childandfamilyagency.org>
Date: Mon, 2 Oct 2023 21:04:22 -0400
Subject: [PATCH 1/2] Bug fix and list replaced with deque

---
 .../Scraper/gutenberg/gutenberg_scrapper.py   | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/script/Scraper/gutenberg/gutenberg_scrapper.py b/script/Scraper/gutenberg/gutenberg_scrapper.py
index b3911fc..2574582 100644
--- a/script/Scraper/gutenberg/gutenberg_scrapper.py
+++ b/script/Scraper/gutenberg/gutenberg_scrapper.py
@@ -2,6 +2,7 @@
 import json
 import os
 from bs4 import BeautifulSoup
+from collections import deque
 import time
 
 PROXY_CONNECTION_TIMEOUT = 5  # Timeout value in seconds
@@ -13,7 +14,7 @@ def __init__(self):
         Loads the proxy list from the local file.
         Loads the last processed book number from the progress file.
         """
-        self.proxy_list = self.load_proxy_list()
+        self.proxy_list = deque(self.load_proxy_list())
         self.progress_file = r"script\Scraper\gutenberg\progress.txt"
         self.json_file_path = r"StoreHouse\Literature\gutenberg_bibliographic_records.json"
         self.last_book_number = self.load_progress()
@@ -24,9 +25,17 @@ def load_proxy_list(self):
         Loads the proxy list from the local file.
         Returns a list of proxies.
         """
-        with open(r"script\proxy\validProxyList.txt", "r") as file:
-            proxy_list = file.read().splitlines()
-        return proxy_list
+        # Provide path and file name to be read in
+        directory_path = "/script/proxy/validProxyList.txt"
+        file_name = "validProxyList.txt"
+        file_path = os.path.join(directory_path, file_name)
+
+        if os.path.exists(file_path):
+            with open(file_path, "r") as file:
+                proxy_list = file.read().splitlines()
+            return proxy_list
+        else:
+            return [] # Return an empty list if the file doesn't exist
 
     def load_progress(self):
         """
@@ -69,9 +78,11 @@ def rotate_proxy(self):
         """
         Rotates the proxy list by moving the first proxy to the end.
         """
-        if self.proxy_list:
-            proxy = self.proxy_list.pop(0)  # Get the first proxy from the list
-            self.proxy_list.append(proxy)
+        try:
+            if self.proxy_list:
+                self.proxy_list.rotate(-1)  # Rotate the deque for better efficiency
+        except IndexError:
+            pass
 
     def get_html_content(self, url, use_proxy=False):
         """

From 9c2af485a2c4cc0373ee193baabb2d21f2bdd781 Mon Sep 17 00:00:00 2001
From: chasserb <chasserb@childandfamilyagency.org>
Date: Tue, 3 Oct 2023 20:32:16 -0400
Subject: [PATCH 2/2] Prompt the user for a proxy list or skip

---
 .../Scraper/gutenberg/gutenberg_scrapper.py   | 36 ++++++++++---------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/script/Scraper/gutenberg/gutenberg_scrapper.py b/script/Scraper/gutenberg/gutenberg_scrapper.py
index 2574582..33b3f36 100644
--- a/script/Scraper/gutenberg/gutenberg_scrapper.py
+++ b/script/Scraper/gutenberg/gutenberg_scrapper.py
@@ -11,7 +11,7 @@ class GutenbergScraper:
     def __init__(self):
         """
         Initializes the GutenbergScraper class.
-        Loads the proxy list from the local file.
+        Wraps the loaded proxy list in a deque.
         Loads the last processed book number from the progress file.
         """
         self.proxy_list = deque(self.load_proxy_list())
@@ -22,20 +22,23 @@ def __init__(self):
 
     def load_proxy_list(self):
         """
-        Loads the proxy list from the local file.
-        Returns a list of proxies.
+        Prompts the user for a path to a proxy list file.
+        Returns a list of proxies, one per line of the file (empty if skipped).
+        Raises OSError if the given path cannot be opened.
         """
-        # Provide path and file name to be read in
-        directory_path = "/script/proxy/validProxyList.txt"
-        file_name = "validProxyList.txt"
-        file_path = os.path.join(directory_path, file_name)
-
-        if os.path.exists(file_path):
-            with open(file_path, "r") as file:
-                proxy_list = file.read().splitlines()
-            return proxy_list
-        else:
-            return [] # Return an empty list if the file doesn't exist
+        user_input = input("Enter the path for a proxy list (or press Enter to skip without using a proxy list):\n")
+
+        # An empty answer means the user chose to run without proxies
+        if not user_input:
+            print("Skipping proxy list, loading...")
+            return []
+
+        with open(user_input, "r") as file:
+            proxy_list = file.read().splitlines()
+        # Show the first entry; an empty file has no entry to show, so guard the index
+        print("Confirming proxy list was loaded:", proxy_list[0] if proxy_list else "(file is empty)")
+        return proxy_list
+
 
     def load_progress(self):
         """
@@ -76,7 +79,7 @@ def save_progress(self, book_number, book_data):
 
     def rotate_proxy(self):
         """
-        Rotates the proxy list by moving the first proxy to the end.
+        Rotates the deque by moving the first proxy to the end.
         """
         try:
             if self.proxy_list:
@@ -84,6 +87,7 @@ def rotate_proxy(self):
         except IndexError:
             pass
 
+
     def get_html_content(self, url, use_proxy=False):
         """
         Fetches the HTML content of the given URL using a rotating proxy if specified.
@@ -118,6 +122,7 @@ def get_html_content(self, url, use_proxy=False):
             print(f"\nFailed to fetch URL: {url}")
         return None
 
+
     def scrape_gutenberg(self):
         """
         Scrapes the Gutenberg website for bibliographic records.
@@ -170,7 +175,6 @@ def scrape_gutenberg(self):
         with open(self.json_file_path, "a", encoding="utf-8") as file:
             file.write("}")
 
-
         # Remove progress file after scraping is complete
         if os.path.exists(self.progress_file):
             os.remove(self.progress_file)