From 3a62f5edb8f03a87ef858e0cc27f8757097d1979 Mon Sep 17 00:00:00 2001 From: Will Bennett Date: Mon, 1 Nov 2021 17:01:09 -0400 Subject: [PATCH 01/25] Use f-strings instead of .format() --- spidy/crawler.py | 77 +++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 19547c8..9c7a0a9 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -50,13 +50,13 @@ def get_full_time(): except OSError: pass # Assumes only OSError will complain if /logs already exists -LOG_FILE = open(path.join(WORKING_DIR, 'logs', 'spidy_log_{0}.txt'.format(START_TIME)), +LOG_FILE = open(path.join(WORKING_DIR, 'logs', f'spidy_log_{START_TIME}.txt'), 'w+', encoding='utf-8', errors='ignore') -LOG_FILE_NAME = path.join('logs', 'spidy_log_{0}'.format(START_TIME)) +LOG_FILE_NAME = path.join('logs', f'spidy_log_{START_TIME}') # Error log location -ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', 'spidy_error_log_{0}.txt'.format(START_TIME)) -ERR_LOG_FILE_NAME = path.join('logs', 'spidy_error_log_{0}.txt'.format(START_TIME)) +ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', f'spidy_error_log_{START_TIME}.txt') +ERR_LOG_FILE_NAME = path.join('logs', f'spidy_error_log_{START_TIME}.txt') LOGGER = logging.getLogger('SPIDY') LOGGER.setLevel(logging.DEBUG) @@ -101,15 +101,14 @@ def write_log(operation, message, package='spidy', status='INFO', worker=0): """ global LOG_FILE, log_mutex with log_mutex: - message = '[{0}] [{1}] [WORKER #{2}] [{3}] [{4}]: {5}'\ - .format(get_time(), package, str(worker), operation, status, message) + message = f'[{get_time()}] [{package}] [WORKER #{str(worker)}] [{operation}] [{status}]: {message}' print(message) if not LOG_FILE.closed: LOG_FILE.write('\n' + message) -write_log('INIT', 'Starting spidy Web Crawler version {0}'.format(VERSION)) -write_log('INIT', 'Report any problems to GitHub at https://github.com/rivermont/spidy') +write_log('INIT', f'Starting spidy Web Crawler version {VERSION}') +write_log('INIT', 'Report any problems on GitHub at https://github.com/rivermont/spidy/issues') ########### @@ -214,8 +213,7 @@ def _lookup(self, url): def _remember(self, url): urlparsed = urllib.parse.urlparse(url) robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt' - write_log('ROBOTS', - 'Reading robots.txt file at: {0}'.format(robots_url), + write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}'), package='reppy') robots = Robots.fetch(robots_url) checker = robots.agent(self.user_agent) @@ -262,12 +260,11 @@ def crawl(url, thread_id=0): save_page(url, page) if SAVE_WORDS: # Announce which link was crawled - write_log('CRAWL', 'Found {0} links and {1} words on {2}'.format(len(links), len(word_list), url), + write_log('CRAWL', f'Found {len(links)} links and {len(word_list)} words on {url}', worker=thread_id) else: # Announce which link was crawled - write_log('CRAWL', 'Found {0} links on {1}'.format(len(links), url), - worker=thread_id) + write_log('CRAWL', f'Found {len(links)} links on {url}', worker=thread_id) return links @@ -319,7 +316,7 @@ def crawl_worker(thread_id, robots_index): with save_mutex: if COUNTER.val > 0: try: - write_log('CRAWL', 'Queried {0} links.'.format(str(COUNTER.val)), worker=thread_id) + write_log('CRAWL', f'Queried {str(COUNTER.val)} links.', worker=thread_id) info_log() write_log('SAVE', 'Saving files...') save_files() @@ -356,8 +353,8 @@ def crawl_worker(thread_id, robots_index): except Exception as e: link = url - write_log('CRAWL', 'An error was raised 
trying to process {0}' - .format(link), status='ERROR', worker=thread_id) + write_log('CRAWL', f'An error was raised trying to process {link}', + status='ERROR', worker=thread_id) err_mro = type(e).mro() if SizeError in err_mro: @@ -406,7 +403,7 @@ def crawl_worker(thread_id, robots_index): elif 'Unknown MIME type' in str(e): NEW_MIME_COUNT.increment() - write_log('ERROR', 'Unknown MIME type: {0}'.format(str(e)[18:]), worker=thread_id) + write_log('ERROR', f'Unknown MIME type: {str(e)[18:]}', worker=thread_id) err_log(link, 'Unknown MIME', e) else: # Any other error @@ -498,7 +495,7 @@ def save_files(): todoList.write(site + '\n') # Save TODO list except UnicodeError: continue - write_log('SAVE', 'Saved TODO list to {0}'.format(TODO_FILE)) + write_log('SAVE', f'Saved TODO list to {TODO_FILE}') with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list: for site in copy(DONE.queue): @@ -506,7 +503,7 @@ def save_files(): done_list.write(site + '\n') # Save done list except UnicodeError: continue - write_log('SAVE', 'Saved DONE list to {0}'.format(TODO_FILE)) + write_log('SAVE', f'Saved DONE list to {TODO_FILE}') if SAVE_WORDS: update_file(WORD_FILE, WORDS.get_all(), 'words') @@ -549,7 +546,7 @@ def mime_lookup(value): elif value == '': return '.html' else: - raise HeaderError('Unknown MIME type: {0}'.format(value)) + raise HeaderError(f'Unknown MIME type: {value}') def save_page(url, page): @@ -559,15 +556,15 @@ def save_page(url, page): # Make file path ext = mime_lookup(get_mime_type(page)) cropped_url = make_file_path(url, ext) - file_path = path.join(WORKING_DIR, 'saved', '{0}'.format(cropped_url)) + file_path = path.join(WORKING_DIR, 'saved', cropped_url) # Save file with open(file_path, 'w', encoding='utf-8', errors='ignore') as file: if ext == '.html': - file.write(''' + file.write(f''' -'''.format(url)) +''') file.write(page.text) @@ -583,7 +580,7 @@ def update_file(file, content, file_type): for item in content: open_file.write('\n' + str(item)) # Write all words to file open_file.truncate() # Delete everything in file beyond what has been written (old stuff) - write_log('SAVE', 'Saved {0} {1} to {2}'.format(len(content), file_type, file)) + write_log('SAVE', f'Saved {len(content)} {file_type} to {file}') def info_log(): @@ -591,16 +588,16 @@ def info_log(): Logs important information to the console and log file. 
""" # Print to console - write_log('LOG', 'Started at {0}'.format(START_TIME_LONG)) - write_log('LOG', 'Log location: {0}'.format(LOG_FILE_NAME)) - write_log('LOG', 'Error log location: {0}'.format(ERR_LOG_FILE_NAME)) - write_log('LOG', '{0} links in TODO'.format(TODO.qsize())) - write_log('LOG', '{0} links in DONE'.format(DONE.qsize())) - write_log('LOG', 'TODO/DONE: {0}'.format(TODO.qsize() / DONE.qsize())) - write_log('LOG', '{0}/{1} new errors caught.'.format(NEW_ERROR_COUNT.val, MAX_NEW_ERRORS)) - write_log('LOG', '{0}/{1} HTTP errors encountered.'.format(HTTP_ERROR_COUNT.val, MAX_HTTP_ERRORS)) - write_log('LOG', '{0}/{1} new MIMEs found.'.format(NEW_MIME_COUNT.val, MAX_NEW_MIMES)) - write_log('LOG', '{0}/{1} known errors caught.'.format(KNOWN_ERROR_COUNT.val, MAX_KNOWN_ERRORS)) + write_log('LOG', f'Started at {START_TIME_LONG}') + write_log('LOG', f'Log location: {LOG_FILE_NAME}') + write_log('LOG', f'Error log location: {ERR_LOG_FILE_NAME}') + write_log('LOG', f'{TODO.qsize()} links in TODO') + write_log('LOG', f'{DONE.qsize()} links in DONE') + write_log('LOG', f'TODO/DONE: {TODO.qsize() / DONE.qsize()}') + write_log('LOG', f'{NEW_ERROR_COUNT.val}/{MAX_NEW_ERRORS} new errors caught.') + write_log('LOG', f'{HTTP_ERROR_COUNT.val}/{MAX_HTTP_ERRORS} HTTP errors encountered.') + write_log('LOG', f'{NEW_MIME_COUNT.val}/{MAX_NEW_MIMES} new MIMEs found.') + write_log('LOG', f'{KNOWN_ERROR_COUNT.val}/{MAX_KNOWN_ERRORS} known errors caught.') def log(message, level=logging.DEBUG): @@ -622,7 +619,7 @@ def handle_invalid_input(type_='input. (yes/no)'): """ Handles an invalid user input, usually from the input() function. """ - write_log('INIT', 'Please enter a valid {0}'.format(type_), status='ERROR') + write_log('INIT', f'Please enter a valid {type_}', status='ERROR') # could raise InputError but this means the user must go through the whole init process again @@ -632,7 +629,7 @@ def err_log(url, error1, error2): error1 is the trimmed error source. error2 is the extended text of the error. 
""" - LOGGER.error("\nURL: {0}\nERROR: {1}\nEXT: {2}\n\n".format(url, error1, str(error2))) + LOGGER.error(f"\nURL: {url}\nERROR: {error1}\nEXT: {str(error2)}\n\n") def zip_saved_files(out_file_name, directory): @@ -642,7 +639,7 @@ def zip_saved_files(out_file_name, directory): shutil.make_archive(str(out_file_name), 'zip', directory) # Zips files shutil.rmtree(directory) # Deletes folder makedirs(directory) # Creates empty folder of same name - write_log('SAVE', 'Zipped documents to {0}.zip'.format(out_file_name)) + write_log('SAVE', f'Zipped documents to {out_file_name}.zip') ######## @@ -1260,10 +1257,10 @@ def main(): with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'): pass - write_log('INIT', 'Successfully started spidy Web Crawler version {0}...'.format(VERSION)) + write_log('INIT', f'Successfully started spidy Web Crawler version {VERSION}...') LOGGER.log(logging.INFO, 'Successfully started crawler.') - write_log('INIT', 'Using headers: {0}'.format(HEADER)) + write_log('INIT', f'Using headers: {HEADER}') robots_index = RobotsIndex(RESPECT_ROBOTS, HEADER['User-Agent']) @@ -1274,6 +1271,6 @@ def main(): if __name__ == '__main__': main() else: - write_log('INIT', 'Successfully imported spidy Web Crawler version {0}.'.format(VERSION)) + write_log('INIT', f'Successfully imported spidy Web Crawler version {VERSION}.') write_log('INIT', 'Call `crawler.main()` to start crawling, or refer to DOCS.md to see use of specific functions.') From 59e124d0ca34b609ace0a911a72ca9151dc24b5c Mon Sep 17 00:00:00 2001 From: Will Bennett Date: Wed, 27 Apr 2022 20:12:36 -0400 Subject: [PATCH 02/25] Remove stray parenthesis. --- spidy/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 9c7a0a9..99b0b2c 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -213,7 +213,7 @@ def _lookup(self, url): def _remember(self, url): urlparsed = urllib.parse.urlparse(url) robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt' - write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}'), + write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}', package='reppy') robots = Robots.fetch(robots_url) checker = robots.agent(self.user_agent) From 15d4e8c58db0061d78fb8066b4de00a463017be1 Mon Sep 17 00:00:00 2001 From: rivermont Date: Wed, 27 Apr 2022 20:23:05 -0400 Subject: [PATCH 03/25] Remove obselete configs. 
--- spidy/config/rivermont-infinite.cfg | 21 --------------------- spidy/config/rivermont.cfg | 20 -------------------- 2 files changed, 41 deletions(-) delete mode 100644 spidy/config/rivermont-infinite.cfg delete mode 100644 spidy/config/rivermont.cfg diff --git a/spidy/config/rivermont-infinite.cfg b/spidy/config/rivermont-infinite.cfg deleted file mode 100644 index 7682ae0..0000000 --- a/spidy/config/rivermont-infinite.cfg +++ /dev/null @@ -1,21 +0,0 @@ -THREAD_COUNT = 8 -OVERWRITE = False -THREAD_COUNT = 8 -RAISE_ERRORS = False -SAVE_PAGES = True -SAVE_WORDS = False -ZIP_FILES = False -OVERRIDE_SIZE = False -RESTRICT = False -DOMAIN = '' -RESPECT_ROBOTS = False -TODO_FILE = 'crawler_todo.txt' -DONE_FILE = 'crawler_done.txt' -WORD_FILE = 'crawler_words.txt' -SAVE_COUNT = 100 -HEADER = HEADERS['spidy'] -MAX_NEW_ERRORS = 1000000 -MAX_KNOWN_ERRORS = 1000000 -MAX_HTTP_ERRORS = 1000000 -MAX_NEW_MIMES = 1000000 -START = ['http://24.40.136.85/'] \ No newline at end of file diff --git a/spidy/config/rivermont.cfg b/spidy/config/rivermont.cfg deleted file mode 100644 index b942436..0000000 --- a/spidy/config/rivermont.cfg +++ /dev/null @@ -1,20 +0,0 @@ -THREAD_COUNT = 8 -OVERWRITE = False -RAISE_ERRORS = False -SAVE_PAGES = True -ZIP_FILES = False -OVERRIDE_SIZE = False -SAVE_WORDS = False -RESTRICT = False -DOMAIN = '' -RESPECT_ROBOTS = False -TODO_FILE = 'crawler_todo.txt' -DONE_FILE = 'crawler_done.txt' -WORD_FILE = 'crawler_words.txt' -SAVE_COUNT = 100 -HEADER = HEADERS['spidy'] -MAX_NEW_ERRORS = 5 -MAX_KNOWN_ERRORS = 20 -MAX_HTTP_ERRORS = 20 -MAX_NEW_MIMES = 10 -START = ['http://24.40.136.85/'] From a242c7ca721fd7cfb45101253b0e8431c3a3f365 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 17:57:20 -0400 Subject: [PATCH 04/25] Adding argparse stuff --- spidy/crawler.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 99b0b2c..bbe30f0 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -3,6 +3,7 @@ spidy Web Crawler Built by rivermont and FalconWarriorr """ +import argparse import time import shutil import requests @@ -829,7 +830,7 @@ def zip_saved_files(out_file_name, directory): THREAD_RUNNING = True -def init(): +def init(args=None): """ Sets all of the variables for spidy, and as a result can be used for effectively resetting the crawler. 
@@ -847,6 +848,26 @@ def init(): # Getting Arguments + if (args): + write_log('INIT', 'Config file name:', status='INPUT') + while True: + input_ = input() + try: + if input_[-4:] == '.cfg': + file_path = path.join(PACKAGE_DIR, 'config', input_) + else: + file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) + write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + for line in file.readlines(): + exec(line, globals()) + break + except FileNotFoundError: + write_log('INIT', 'Config file not found.', status='ERROR') + # raise FileNotFoundError() + + write_log('INIT', 'Please name a valid .cfg file.') + if not path.exists(path.join(PACKAGE_DIR, 'config')): write_log('INIT', 'No config folder available.') USE_CONFIG = False @@ -1243,7 +1264,14 @@ def main(): global WORDS, TODO, DONE try: - init() + parser = argparse.ArgumentParser(prog="net.py", description="Builds Containernet Topology") + parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) + args = parser.parse_args() + + if (args["f"]): + init(args["f"]) + else: + init() except KeyboardInterrupt: handle_keyboard_interrupt() From 584a6ac154a01254ad236f9a5c082de890012652 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:01:22 -0400 Subject: [PATCH 05/25] Basic outline of out of scope options --- spidy/config/blank.cfg | 3 +++ spidy/config/default.cfg | 1 + spidy/crawler.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index 91c8f37..f10aa95 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -28,6 +28,9 @@ RESTRICT = # The domain within which to restrict crawling. 
DOMAIN = '' +# Domains or subdomains that are out of scope for the crawl +OUT_OF_SCOPE = ['', ''] + # Whether to respect sites' robots.txt or not RESPECT_ROBOTS = diff --git a/spidy/config/default.cfg b/spidy/config/default.cfg index c02de63..4faafed 100644 --- a/spidy/config/default.cfg +++ b/spidy/config/default.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/crawler.py b/spidy/crawler.py index bbe30f0..87ba236 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -1260,7 +1260,7 @@ def main(): global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE - global RESPECT_ROBOTS, RESTRICT, DOMAIN + global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE global WORDS, TODO, DONE try: From 043a834540ce8f614f6a7034ef0fe4df28fb4db4 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:13:53 -0400 Subject: [PATCH 06/25] Add out of scope functionality and adjust the restricted domain logic --- spidy/crawler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 87ba236..abbfde4 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -432,7 +432,8 @@ def check_link(item, robots_index=None): if robots_index and not robots_index.is_allowed(item): return True if RESTRICT: - if DOMAIN not in item: + if DOMAIN not in item.split('/')[2][]: + # Splitting a url on '/' results in ['http(s)', '', '[sub]DOMAIN', 'dir', 'dir', ...] return True if len(item) < 10 or len(item) > 255: return True @@ -441,6 +442,12 @@ def check_link(item, robots_index=None): return True elif item in copy(DONE.queue): return True + + # Check each domain in the out of scope blacklist + for domain in OUT_OF_SCOPE: + if domain in item: + return True + return False @@ -848,7 +855,7 @@ def init(args=None): # Getting Arguments - if (args): + if args: write_log('INIT', 'Config file name:', status='INPUT') while True: input_ = input() @@ -1268,7 +1275,7 @@ def main(): parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) args = parser.parse_args() - if (args["f"]): + if args["f"]: init(args["f"]) else: init() From cb0e33e1d6b3289cf217e5373d356ccf5a08cdbb Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:16:17 -0400 Subject: [PATCH 07/25] Fix my wording on out of scope stuff --- spidy/config/blank.cfg | 2 +- spidy/crawler.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index f10aa95..8c933c1 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -28,7 +28,7 @@ RESTRICT = # The domain within which to restrict crawling. 
DOMAIN = '' -# Domains or subdomains that are out of scope for the crawl +# Domains, subdomains, and paths that are out of scope for the crawl OUT_OF_SCOPE = ['', ''] # Whether to respect sites' robots.txt or not diff --git a/spidy/crawler.py b/spidy/crawler.py index abbfde4..78aa0f9 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -443,9 +443,9 @@ def check_link(item, robots_index=None): elif item in copy(DONE.queue): return True - # Check each domain in the out of scope blacklist - for domain in OUT_OF_SCOPE: - if domain in item: + # Check each domain, subdomain, or path in the out of scope blacklist + for scope in OUT_OF_SCOPE: + if scope in item: return True return False From 69e42556041784f58b7433a4473d01e59b451980 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:31:25 -0400 Subject: [PATCH 08/25] Fix syntax error (I am a programming genius) --- spidy/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 78aa0f9..9b866ef 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -432,7 +432,7 @@ def check_link(item, robots_index=None): if robots_index and not robots_index.is_allowed(item): return True if RESTRICT: - if DOMAIN not in item.split('/')[2][]: + if DOMAIN not in item.split('/')[2]: # Splitting a url on '/' results in ['http(s)', '', '[sub]DOMAIN', 'dir', 'dir', ...] return True if len(item) < 10 or len(item) > 255: From 6155f7bc7ffec1aa2eaa9f5ef138fe9db3e26c30 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:41:14 -0400 Subject: [PATCH 09/25] Fix some of my logic --- spidy/crawler.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 9b866ef..db18c46 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -837,7 +837,7 @@ def zip_saved_files(out_file_name, directory): THREAD_RUNNING = True -def init(args=None): +def init(arg_file=None): """ Sets all of the variables for spidy, and as a result can be used for effectively resetting the crawler. @@ -855,15 +855,17 @@ def init(args=None): # Getting Arguments - if args: + if not path.exists(path.join(PACKAGE_DIR, 'config')): + write_log('INIT', 'No config folder available.') + USE_CONFIG = False + elif arg_file: write_log('INIT', 'Config file name:', status='INPUT') while True: - input_ = input() try: - if input_[-4:] == '.cfg': - file_path = path.join(PACKAGE_DIR, 'config', input_) + if arg_file[-4:] == '.cfg': + file_path = path.join(PACKAGE_DIR, 'config', arg_file) else: - file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) + file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(arg_file)) write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: for line in file.readlines(): @@ -874,10 +876,6 @@ def init(args=None): # raise FileNotFoundError() write_log('INIT', 'Please name a valid .cfg file.') - - if not path.exists(path.join(PACKAGE_DIR, 'config')): - write_log('INIT', 'No config folder available.') - USE_CONFIG = False else: write_log('INIT', 'Should spidy load settings from an available config file? 
(y/n):', status='INPUT') while True: @@ -1275,8 +1273,8 @@ def main(): parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) args = parser.parse_args() - if args["f"]: - init(args["f"]) + if args.config_file is not None: + init(args.config_file) else: init() except KeyboardInterrupt: From 251230d8dbd2e3da6aba99fd5ca99c64da495517 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:51:44 -0400 Subject: [PATCH 10/25] If the argument is used, don't go looking for user input --- spidy/crawler.py | 463 ++++++++++++++++++++++++----------------------- 1 file changed, 232 insertions(+), 231 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index db18c46..da18af5 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -892,285 +892,286 @@ def init(arg_file=None): else: handle_invalid_input() - if USE_CONFIG: - write_log('INIT', 'Config file name:', status='INPUT') - while True: - input_ = input() - try: - if input_[-4:] == '.cfg': - file_path = path.join(PACKAGE_DIR, 'config', input_) - else: - file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) - write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) - with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: - for line in file.readlines(): - exec(line, globals()) - break - except FileNotFoundError: - write_log('INIT', 'Config file not found.', status='ERROR') - # raise FileNotFoundError() - - write_log('INIT', 'Please name a valid .cfg file.') - - else: - write_log('INIT', 'Please enter the following arguments. Leave blank to use the default values.') - - write_log('INIT', 'How many parallel threads should be used for crawler? (Default: 1):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - THREAD_COUNT = 1 - break - elif input_.isdigit(): - THREAD_COUNT = int(input_) - break - else: - handle_invalid_input('integer.') + if arg_file is None: + if USE_CONFIG: + write_log('INIT', 'Config file name:', status='INPUT') + while True: + input_ = input() + try: + if input_[-4:] == '.cfg': + file_path = path.join(PACKAGE_DIR, 'config', input_) + else: + file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) + write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + for line in file.readlines(): + exec(line, globals()) + break + except FileNotFoundError: + write_log('INIT', 'Config file not found.', status='ERROR') + # raise FileNotFoundError() + + write_log('INIT', 'Please name a valid .cfg file.') - write_log('INIT', 'Should spidy load from existing save files? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - OVERWRITE = False - break - elif input_ in yes: - OVERWRITE = False - break - elif input_ in no: - OVERWRITE = True - break - else: - handle_invalid_input() + else: + write_log('INIT', 'Please enter the following arguments. Leave blank to use the default values.') - write_log('INIT', 'Should spidy raise NEW errors and stop crawling? (y/n) (Default: No):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - RAISE_ERRORS = False - break - elif input_ in yes: - RAISE_ERRORS = True - break - elif input_ in no: - RAISE_ERRORS = False - break - else: - handle_invalid_input() + write_log('INIT', 'How many parallel threads should be used for crawler? 
(Default: 1):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + THREAD_COUNT = 1 + break + elif input_.isdigit(): + THREAD_COUNT = int(input_) + break + else: + handle_invalid_input('integer.') - write_log('INIT', 'Should spidy save the pages it scrapes to the saved folder? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - SAVE_PAGES = True - break - elif input_ in yes: - SAVE_PAGES = True - break - elif input_ in no: - SAVE_PAGES = False - break - else: - handle_invalid_input() + write_log('INIT', 'Should spidy load from existing save files? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + OVERWRITE = False + break + elif input_ in yes: + OVERWRITE = False + break + elif input_ in no: + OVERWRITE = True + break + else: + handle_invalid_input() - if SAVE_PAGES: - write_log('INIT', 'Should spidy zip saved documents when autosaving? (y/n) (Default: No):', status='INPUT') + write_log('INIT', 'Should spidy raise NEW errors and stop crawling? (y/n) (Default: No):', status='INPUT') while True: input_ = input() if not bool(input_): - ZIP_FILES = False + RAISE_ERRORS = False break elif input_ in yes: - ZIP_FILES = True + RAISE_ERRORS = True break elif input_ in no: - ZIP_FILES = False + RAISE_ERRORS = False break else: handle_invalid_input() - else: - ZIP_FILES = False - write_log('INIT', 'Should spidy download documents larger than 500 MB? (y/n) (Default: No):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - OVERRIDE_SIZE = False - break - elif input_ in yes: - OVERRIDE_SIZE = True - break - elif input_ in no: - OVERRIDE_SIZE = False - break - else: - handle_invalid_input() + write_log('INIT', 'Should spidy save the pages it scrapes to the saved folder? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + SAVE_PAGES = True + break + elif input_ in yes: + SAVE_PAGES = True + break + elif input_ in no: + SAVE_PAGES = False + break + else: + handle_invalid_input() - write_log('INIT', 'Should spidy scrape words and save them? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - SAVE_WORDS = True - break - elif input_ in yes: - SAVE_WORDS = True - break - elif input_ in no: - SAVE_WORDS = False - break + if SAVE_PAGES: + write_log('INIT', 'Should spidy zip saved documents when autosaving? (y/n) (Default: No):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + ZIP_FILES = False + break + elif input_ in yes: + ZIP_FILES = True + break + elif input_ in no: + ZIP_FILES = False + break + else: + handle_invalid_input() else: - handle_invalid_input() + ZIP_FILES = False - write_log('INIT', 'Should spidy restrict crawling to a specific domain only? (y/n) (Default: No):', - status='INPUT') - while True: - input_ = input() - if not bool(input_): - RESTRICT = False - break - elif input_ in yes: - RESTRICT = True - break - elif input_ in no: - RESTRICT = False - break - else: - handle_invalid_input() - - if RESTRICT: - write_log('INIT', 'What domain should crawling be limited to? Can be subdomains, http/https, etc.', - status='INPUT') + write_log('INIT', 'Should spidy download documents larger than 500 MB? 
(y/n) (Default: No):', status='INPUT') while True: input_ = input() - try: - DOMAIN = input_ + if not bool(input_): + OVERRIDE_SIZE = False + break + elif input_ in yes: + OVERRIDE_SIZE = True + break + elif input_ in no: + OVERRIDE_SIZE = False break - except KeyError: - handle_invalid_input('string.') + else: + handle_invalid_input() - write_log('INIT', 'Should spidy respect sites\' robots.txt? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - RESPECT_ROBOTS = True - break - elif input_ in yes: - RESPECT_ROBOTS = True - break - elif input_ in no: - RESPECT_ROBOTS = False - break - else: - handle_invalid_input() + write_log('INIT', 'Should spidy scrape words and save them? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + SAVE_WORDS = True + break + elif input_ in yes: + SAVE_WORDS = True + break + elif input_ in no: + SAVE_WORDS = False + break + else: + handle_invalid_input() - write_log('INIT', 'What HTTP browser headers should spidy imitate?', status='INPUT') - write_log('INIT', 'Choices: spidy (default), Chrome, Firefox, IE, Edge, Custom:', status='INPUT') - while True: - input_ = input() - if not bool(input_): - HEADER = HEADERS['spidy'] - break - elif input_.lower() == 'custom': - # Here we just trust that the user is inputting valid headers... - write_log('INIT', 'Valid HTTP headers:', status='INPUT') - HEADER = input() - break - else: - try: - HEADER = HEADERS[input_] + write_log('INIT', 'Should spidy restrict crawling to a specific domain only? (y/n) (Default: No):', + status='INPUT') + while True: + input_ = input() + if not bool(input_): + RESTRICT = False + break + elif input_ in yes: + RESTRICT = True + break + elif input_ in no: + RESTRICT = False break - except KeyError: - handle_invalid_input('browser name.') + else: + handle_invalid_input() - write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT') - input_ = input() - if not bool(input_): - TODO_FILE = 'crawler_todo.txt' - else: - TODO_FILE = input_ + if RESTRICT: + write_log('INIT', 'What domain should crawling be limited to? Can be subdomains, http/https, etc.', + status='INPUT') + while True: + input_ = input() + try: + DOMAIN = input_ + break + except KeyError: + handle_invalid_input('string.') + + write_log('INIT', 'Should spidy respect sites\' robots.txt? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + RESPECT_ROBOTS = True + break + elif input_ in yes: + RESPECT_ROBOTS = True + break + elif input_ in no: + RESPECT_ROBOTS = False + break + else: + handle_invalid_input() - write_log('INIT', 'Location of the DONE save file (Default: crawler_done.txt):', status='INPUT') - input_ = input() - if not bool(input_): - DONE_FILE = 'crawler_done.txt' - else: - DONE_FILE = input_ + write_log('INIT', 'What HTTP browser headers should spidy imitate?', status='INPUT') + write_log('INIT', 'Choices: spidy (default), Chrome, Firefox, IE, Edge, Custom:', status='INPUT') + while True: + input_ = input() + if not bool(input_): + HEADER = HEADERS['spidy'] + break + elif input_.lower() == 'custom': + # Here we just trust that the user is inputting valid headers... 
+ write_log('INIT', 'Valid HTTP headers:', status='INPUT') + HEADER = input() + break + else: + try: + HEADER = HEADERS[input_] + break + except KeyError: + handle_invalid_input('browser name.') - if SAVE_WORDS: - write_log('INIT', 'Location of the words save file (Default: crawler_words.txt):', status='INPUT') + write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT') input_ = input() if not bool(input_): - WORD_FILE = 'crawler_words.txt' + TODO_FILE = 'crawler_todo.txt' else: - WORD_FILE = input_ - else: - WORD_FILE = 'None' + TODO_FILE = input_ - write_log('INIT', 'After how many queried links should the crawler autosave? (Default: 100):', status='INPUT') - while True: + write_log('INIT', 'Location of the DONE save file (Default: crawler_done.txt):', status='INPUT') input_ = input() if not bool(input_): - SAVE_COUNT = 100 - break - elif input_.isdigit(): - SAVE_COUNT = int(input_) - break + DONE_FILE = 'crawler_done.txt' else: - handle_invalid_input('integer.') + DONE_FILE = input_ - if not RAISE_ERRORS: - write_log('INIT', 'After how many new errors should spidy stop? (Default: 5):', status='INPUT') + if SAVE_WORDS: + write_log('INIT', 'Location of the words save file (Default: crawler_words.txt):', status='INPUT') + input_ = input() + if not bool(input_): + WORD_FILE = 'crawler_words.txt' + else: + WORD_FILE = input_ + else: + WORD_FILE = 'None' + + write_log('INIT', 'After how many queried links should the crawler autosave? (Default: 100):', status='INPUT') while True: input_ = input() if not bool(input_): - MAX_NEW_ERRORS = 5 + SAVE_COUNT = 100 break elif input_.isdigit(): - MAX_NEW_ERRORS = int(input_) + SAVE_COUNT = int(input_) break else: handle_invalid_input('integer.') - else: - MAX_NEW_ERRORS = 1 - write_log('INIT', 'After how many known errors should spidy stop? (Default: 10):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - MAX_KNOWN_ERRORS = 20 - break - elif input_.isdigit(): - MAX_KNOWN_ERRORS = int(input_) - break + if not RAISE_ERRORS: + write_log('INIT', 'After how many new errors should spidy stop? (Default: 5):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_NEW_ERRORS = 5 + break + elif input_.isdigit(): + MAX_NEW_ERRORS = int(input_) + break + else: + handle_invalid_input('integer.') else: - handle_invalid_input('integer.') + MAX_NEW_ERRORS = 1 - write_log('INIT', 'After how many HTTP errors should spidy stop? (Default: 20):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - MAX_HTTP_ERRORS = 50 - break - elif not input_.isdigit(): - MAX_HTTP_ERRORS = int(input_) - break - else: - handle_invalid_input('integer.') + write_log('INIT', 'After how many known errors should spidy stop? (Default: 10):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_KNOWN_ERRORS = 20 + break + elif input_.isdigit(): + MAX_KNOWN_ERRORS = int(input_) + break + else: + handle_invalid_input('integer.') - write_log('INIT', 'After encountering how many new MIME types should spidy stop? (Default: 20):', - status='INPUT') - while True: - input_ = input() - if not bool(input_): - MAX_NEW_MIMES = 10 - break - elif input_.isdigit(): - MAX_NEW_MIMES = int(input_) - break - else: - handle_invalid_input('integer') + write_log('INIT', 'After how many HTTP errors should spidy stop? 
(Default: 20):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_HTTP_ERRORS = 50 + break + elif not input_.isdigit(): + MAX_HTTP_ERRORS = int(input_) + break + else: + handle_invalid_input('integer.') + + write_log('INIT', 'After encountering how many new MIME types should spidy stop? (Default: 20):', + status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_NEW_MIMES = 10 + break + elif input_.isdigit(): + MAX_NEW_MIMES = int(input_) + break + else: + handle_invalid_input('integer') - # Remove INPUT variable from memory - del input_ + # Remove INPUT variable from memory + del input_ if OVERWRITE: write_log('INIT', 'Creating save files...') From 0b109c77ad661da241d3bb05fb9784da5e8192f5 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 19:17:40 -0400 Subject: [PATCH 11/25] Check if OUT_OF_SCOPE was set --- spidy/crawler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index da18af5..537c872 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -444,9 +444,10 @@ def check_link(item, robots_index=None): return True # Check each domain, subdomain, or path in the out of scope blacklist - for scope in OUT_OF_SCOPE: - if scope in item: - return True + if OUT_OF_SCOPE is not None: + for scope in OUT_OF_SCOPE: + if scope in item: + return True return False From da4d2c721b366892e0d324bddd064448ce315114 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 19:25:37 -0400 Subject: [PATCH 12/25] Scratch that previous commit... --- spidy/crawler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 537c872..da18af5 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -444,10 +444,9 @@ def check_link(item, robots_index=None): return True # Check each domain, subdomain, or path in the out of scope blacklist - if OUT_OF_SCOPE is not None: - for scope in OUT_OF_SCOPE: - if scope in item: - return True + for scope in OUT_OF_SCOPE: + if scope in item: + return True return False From 91080bce2dfbcc8153ca78a4720aa9f7693a4c02 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 19:35:06 -0400 Subject: [PATCH 13/25] Optimize by preventing multiple checks of the same URL --- spidy/crawler.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spidy/crawler.py b/spidy/crawler.py index da18af5..dbadc36 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -448,6 +448,11 @@ def check_link(item, robots_index=None): if scope in item: return True + # Check if the URL has already been processed + if item in FOUND_URLS: + return True + + FOUND_URLS.add(item) return False @@ -1268,6 +1273,9 @@ def main(): global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE global WORDS, TODO, DONE + global FOUND_URLS + + FOUND_URLS = set() try: parser = argparse.ArgumentParser(prog="net.py", description="Builds Containernet Topology") From 2ebe062867786efdb8f407c4f62510aa012d1f22 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 09:24:44 -0400 Subject: [PATCH 14/25] Fix some globals and whatnot --- spidy/crawler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index dbadc36..5659eab 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -830,7 +830,7 @@ def zip_saved_files(out_file_name, directory): HEADER = {} SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0 MAX_NEW_MIMES 
= 0 -RESPECT_ROBOTS, RESTRICT, DOMAIN = False, False, '' +RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE = False, False, '', [] USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE = False, False, False, False, False SAVE_PAGES, SAVE_WORDS = False, False TODO_FILE, DONE_FILE, WORD_FILE = '', '', '' @@ -840,6 +840,7 @@ def zip_saved_files(out_file_name, directory): save_mutex = threading.Lock() FINISHED = False THREAD_RUNNING = True +FOUND_URLS = set() def init(arg_file=None): @@ -1275,8 +1276,6 @@ def main(): global WORDS, TODO, DONE global FOUND_URLS - FOUND_URLS = set() - try: parser = argparse.ArgumentParser(prog="net.py", description="Builds Containernet Topology") parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) From 305ee307d11a1102c6780e8f2714b926ecbee739 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 10:14:29 -0400 Subject: [PATCH 15/25] Update config files, add selenium (import only, no code yet) to crawler and requirements --- requirements.txt | 1 + spidy/config/docker.cfg | 1 + spidy/config/heavy.cfg | 1 + spidy/config/infinite.cfg | 1 + spidy/config/light.cfg | 1 + spidy/config/multithreaded.cfg | 1 + spidy/config/wsj.cfg | 3 +++ spidy/crawler.py | 1 + 8 files changed, 10 insertions(+) diff --git a/requirements.txt b/requirements.txt index 8028352..4b48956 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ requests lxml flake8 reppy +selenium \ No newline at end of file diff --git a/spidy/config/docker.cfg b/spidy/config/docker.cfg index 9e52af7..8d90285 100644 --- a/spidy/config/docker.cfg +++ b/spidy/config/docker.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = '/data/crawler_todo.txt' DONE_FILE = '/data/crawler_done.txt' diff --git a/spidy/config/heavy.cfg b/spidy/config/heavy.cfg index 1c41c91..7641797 100644 --- a/spidy/config/heavy.cfg +++ b/spidy/config/heavy.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = True RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = False TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/infinite.cfg b/spidy/config/infinite.cfg index bcf11bc..840c05f 100644 --- a/spidy/config/infinite.cfg +++ b/spidy/config/infinite.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/light.cfg b/spidy/config/light.cfg index 9a916c9..5d06f4d 100644 --- a/spidy/config/light.cfg +++ b/spidy/config/light.cfg @@ -7,6 +7,7 @@ OVERRIDE_SIZE = False SAVE_WORDS = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/multithreaded.cfg b/spidy/config/multithreaded.cfg index 1af0311..5d212e3 100644 --- a/spidy/config/multithreaded.cfg +++ b/spidy/config/multithreaded.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = False TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/wsj.cfg b/spidy/config/wsj.cfg index 5a5ed40..e412caf 100644 --- a/spidy/config/wsj.cfg +++ b/spidy/config/wsj.cfg @@ -12,6 +12,9 @@ RESTRICT = True # The domain within which to restrict crawling. 
DOMAIN = 'wsj.com/' +# Do not allow crawling involving specific pages and subdomains +OUT_OF_SCOPE = ['wsj.com/business/airlines', 'africa.wsj.com'] + RESPECT_ROBOTS = True TODO_FILE = 'wsj_todo.txt' DONE_FILE = 'wsj_done.txt' diff --git a/spidy/crawler.py b/spidy/crawler.py index 5659eab..4055446 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -11,6 +11,7 @@ import threading import queue import logging +import selenium from os import path, makedirs from copy import copy From 2ccf5b103baf9a9c0ebf86dbe1c6b452cd0325bb Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 10:50:25 -0400 Subject: [PATCH 16/25] Fix imports --- spidy/crawler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 4055446..7efbcbf 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -11,13 +11,14 @@ import threading import queue import logging -import selenium from os import path, makedirs from copy import copy from lxml import etree from lxml.html import iterlinks, resolve_base_href, make_links_absolute from reppy.robots import Robots +from selenium import webdriver +from selenium.webdriver.firefox.options import Options try: from spidy import __version__ @@ -228,7 +229,7 @@ def _remember(self, url): write_log('INIT', 'Creating functions...') - +# TODO: Integrate selenium for fully rendered pages def crawl(url, thread_id=0): global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS if not OVERRIDE_SIZE: From 6b9f1b8e6d89d61269f218ef392ba18bc5d9b019 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 14:15:32 -0400 Subject: [PATCH 17/25] This should work --- requirements.txt | 3 +- spidy/config/blank.cfg | 3 ++ spidy/config/default.cfg | 1 + spidy/config/docker.cfg | 1 + spidy/config/heavy.cfg | 1 + spidy/config/infinite.cfg | 1 + spidy/config/light.cfg | 1 + spidy/config/multithreaded.cfg | 1 + spidy/config/wsj.cfg | 1 + spidy/crawler.py | 64 +++++++++++++++++++++++++++------- 10 files changed, 64 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4b48956..a7ffbbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ requests lxml flake8 reppy -selenium \ No newline at end of file +selenium-wire +blinker==1.7.0 \ No newline at end of file diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index 8c933c1..1e8c5eb 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -51,6 +51,9 @@ HEADER = HEADERS['
'] # Or if you want to use custom headers: HEADER = {'
': '', '': ''} +# Select if you would like to have pages rendered with a headless browser (more thorough, but slower) +USE_BROWSER = + # Amount of errors allowed to happen before automatic shutdown. MAX_NEW_ERRORS = MAX_KNOWN_ERRORS = diff --git a/spidy/config/default.cfg b/spidy/config/default.cfg index 4faafed..6e89984 100644 --- a/spidy/config/default.cfg +++ b/spidy/config/default.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/docker.cfg b/spidy/config/docker.cfg index 8d90285..9a4b5d7 100644 --- a/spidy/config/docker.cfg +++ b/spidy/config/docker.cfg @@ -14,6 +14,7 @@ DONE_FILE = '/data/crawler_done.txt' WORD_FILE = '/data/crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/heavy.cfg b/spidy/config/heavy.cfg index 7641797..8f3be0d 100644 --- a/spidy/config/heavy.cfg +++ b/spidy/config/heavy.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = True MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/infinite.cfg b/spidy/config/infinite.cfg index 840c05f..71d616a 100644 --- a/spidy/config/infinite.cfg +++ b/spidy/config/infinite.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 250 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 1000000 MAX_KNOWN_ERRORS = 1000000 MAX_HTTP_ERRORS = 1000000 diff --git a/spidy/config/light.cfg b/spidy/config/light.cfg index 5d06f4d..991dfdf 100644 --- a/spidy/config/light.cfg +++ b/spidy/config/light.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 150 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/multithreaded.cfg b/spidy/config/multithreaded.cfg index 5d212e3..eec2eff 100644 --- a/spidy/config/multithreaded.cfg +++ b/spidy/config/multithreaded.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/wsj.cfg b/spidy/config/wsj.cfg index e412caf..3997015 100644 --- a/spidy/config/wsj.cfg +++ b/spidy/config/wsj.cfg @@ -21,6 +21,7 @@ DONE_FILE = 'wsj_done.txt' WORD_FILE = 'wsj_words.txt' SAVE_COUNT = 60 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 100 MAX_KNOWN_ERRORS = 100 MAX_HTTP_ERRORS = 100 diff --git a/spidy/crawler.py b/spidy/crawler.py index 7efbcbf..cd8d55e 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -17,8 +17,8 @@ from lxml import etree from lxml.html import iterlinks, resolve_base_href, make_links_absolute from reppy.robots import Robots -from selenium import webdriver -from selenium.webdriver.firefox.options import Options +from seleniumwire import webdriver +from types import SimpleNamespace try: from spidy import __version__ @@ -230,7 +230,7 @@ def _remember(self, url): write_log('INIT', 'Creating functions...') # TODO: Integrate selenium for fully rendered pages -def crawl(url, thread_id=0): +def crawl(url, browser, thread_id=0): global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS if not OVERRIDE_SIZE: 
try: @@ -242,7 +242,14 @@ def crawl(url, thread_id=0): raise SizeError # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. - page = requests.get(url, headers=HEADER) # Get page + r = requests.get(url, headers=HEADER) + + if (browser is None): + page = r # Get page + else: + browser.get(url) + page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + word_list = [] doctype = get_mime_type(page) if doctype.find('image') < 0 and doctype.find('video') < 0: @@ -279,13 +286,23 @@ def crawl_worker(thread_id, robots_index): # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME - global HEADER, WORKING_DIR, KILL_LIST + global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE - global RESPECT_ROBOTS, RESTRICT, DOMAIN - global WORDS, TODO, DONE, THREAD_RUNNING + global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE + global WORDS, TODO, DONE + global FOUND_URLS + + browser = None + if (USE_BROWSER): + browser_options = webdriver.FirefoxOptions() + browser_options.add_argument('--headless') + + browser = webdriver.Firefox(options=browser_options) + + browser.request_interceptor = interceptor while THREAD_RUNNING: # Check if there are more urls to crawl @@ -338,7 +355,7 @@ def crawl_worker(thread_id, robots_index): else: if check_link(url, robots_index): # If the link is invalid continue - links = crawl(url, thread_id) + links = crawl(url, browser, thread_id) for link in links: # Skip empty links if len(link) <= 0 or link == "/": @@ -830,6 +847,7 @@ def zip_saved_files(out_file_name, directory): # Initialize variables as empty that will be needed in the global scope HEADER = {} +USE_BROWSER = False SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0 MAX_NEW_MIMES = 0 RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE = False, False, '', [] @@ -853,13 +871,14 @@ def init(arg_file=None): # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME - global HEADER, PACKAGE_DIR, WORKING_DIR, KILL_LIST + global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE - global RESPECT_ROBOTS, RESTRICT, DOMAIN - global WORDS, TODO, DONE, THREAD_COUNT + global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE + global WORDS, TODO, DONE + global FOUND_URLS # Getting Arguments @@ -1090,6 +1109,21 @@ def init(arg_file=None): except KeyError: handle_invalid_input('browser name.') + write_log('INIT', 'Should spidy use a headless browser? 
(y/n) (Default: No)', status='INPUT') + while True: + input_ = input() + if not bool(input_): + USE_BROWSER = True + break + elif input_ in yes: + USE_BROWSER = True + break + elif input_ in no: + USE_BROWSER = False + break + else: + handle_invalid_input() + write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT') input_ = input() if not bool(input_): @@ -1262,6 +1296,12 @@ def handle_keyboard_interrupt(): done_crawling(True) +# Used by the webdriver to add custom headers +def interceptor(request): + for key in HEADER: + request[key] = HEADER[key] + + def main(): """ The main function of spidy. @@ -1269,7 +1309,7 @@ def main(): # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME - global HEADER, WORKING_DIR, KILL_LIST + global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT From 476ccc0122e96f99ce0f34ef48a20fc592541045 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 15:00:10 -0400 Subject: [PATCH 18/25] Fix interceptor function --- spidy/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index cd8d55e..22f9fd4 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -1299,7 +1299,7 @@ def handle_keyboard_interrupt(): # Used by the webdriver to add custom headers def interceptor(request): for key in HEADER: - request[key] = HEADER[key] + request.headers[key] = HEADER[key] def main(): From b05e00667bcc28a9200f120d83fd362094aa8458 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 17:36:01 -0400 Subject: [PATCH 19/25] Bug fixes and testing --- spidy/crawler.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 22f9fd4..07edd14 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -18,6 +18,8 @@ from lxml.html import iterlinks, resolve_base_href, make_links_absolute from reppy.robots import Robots from seleniumwire import webdriver +from selenium.webdriver.common.alert import Alert +from selenium.common.exceptions import TimeoutException, UnexpectedAlertPresentException, WebDriverException from types import SimpleNamespace try: @@ -243,12 +245,27 @@ def crawl(url, browser, thread_id=0): # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. 
r = requests.get(url, headers=HEADER) + + print(f"attempting url: {url}") if (browser is None): page = r # Get page else: - browser.get(url) - page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + try: + browser.get(url) + page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + except TimeoutException: + KNOWN_ERROR_COUNT += 1 + return [] + except UnexpectedAlertPresentException: + browser.get(url) + alert = Alert(browser) + alert.accept() + page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + KNOWN_ERROR_COUNT += 1 + except WebDriverException: + KNOWN_ERROR_COUNT += 1 + return [] word_list = [] doctype = get_mime_type(page) @@ -303,6 +320,9 @@ def crawl_worker(thread_id, robots_index): browser = webdriver.Firefox(options=browser_options) browser.request_interceptor = interceptor + browser.implicitly_wait(10) + browser.set_page_load_timeout(10) + webdriver.DesiredCapabilities.FIREFOX["unexpectedAlertBehaviour"] = "accept" while THREAD_RUNNING: # Check if there are more urls to crawl From cb4f856a0510175244a876f376415cb05d171c38 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 17:37:50 -0400 Subject: [PATCH 20/25] Fix requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a7ffbbd..de5777a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ requests lxml flake8 reppy +selenium selenium-wire blinker==1.7.0 \ No newline at end of file From e417d34621b5277742da0de588d0dc6183c0c83f Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 18:05:53 -0400 Subject: [PATCH 21/25] Update docs and fix comments --- README.md | 4 +- spidy/crawler.py | 1 - spidy/docs/DOCS.md | 123 +++++++++++++++++++++++---------------------- 3 files changed, 65 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 3d9ab37..42eb5b6 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Pretty simple! ![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg) ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
-![Lines of Code: 1553](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg)
-![Lines of Docs: 605](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg)
+![Lines of Code: 1810](https://img.shields.io/badge/lines%20of%20code-1810-brightgreen.svg)
+![Lines of Docs: 614](https://img.shields.io/badge/lines%20of%20docs-614-orange.svg)
[![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card)
[![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy)
[![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/)
diff --git a/spidy/crawler.py b/spidy/crawler.py
index 07edd14..d642596 100755
--- a/spidy/crawler.py
+++ b/spidy/crawler.py
@@ -231,7 +231,6 @@ def _remember(self, url):
 write_log('INIT', 'Creating functions...')

-# TODO: Integrate selenium for fully rendered pages
 def crawl(url, browser, thread_id=0):
     global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS
     if not OVERRIDE_SIZE:
diff --git a/spidy/docs/DOCS.md b/spidy/docs/DOCS.md
index 4e0c570..6fd46a6 100644
--- a/spidy/docs/DOCS.md
+++ b/spidy/docs/DOCS.md
@@ -99,17 +99,17 @@ Everything that follows is intended to be detailed information on each piece in
This section lists the custom classes in `crawler.py`.
Most are Errors or Exceptions that may be raised throughout the code. -## `HeaderError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L120)) +## `HeaderError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L126)) Raised when there is a problem deciphering HTTP headers returned from a website. -## `SizeError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L127)) +## `SizeError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L133)) Raised when a file is too large to download in an acceptable time. # Functions This section lists the functions in `crawler.py` that are used throughout the code. -## `check_link` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L399)) +## `check_link` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L464)) Determines whether links should be crawled.
Types of links that will be pruned: @@ -118,34 +118,34 @@ Types of links that will be pruned: - Links that have already been crawled. - Links in [`KILL_LIST`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#kill_list--source). -## `check_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L433)) +## `check_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L510)) Checks whether a file path will cause errors when saving.
Paths longer than 256 characters cannot be saved (Windows). -## `check_word` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L421)) +## `check_word` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L498)) Checks whether a word is valid.
The word-saving feature was originally added to be used for password cracking with hashcat, which is why `check_word` checks for length of less than 16 characters.
The average password length is around 8 characters.
 
-## `crawl` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L190))
+## `crawl` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L253))
 Does all of the crawling, scraping, and saving of a single document.
 
-## `err_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L601))
+## `err_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L678))
 Saves the triggering error to the log file.
 
-## `get_mime_type` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L500))
+## `get_mime_type` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L577))
 Extracts the Content-Type header from the headers returned by a page.
 
-## `get_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L29))
+## `get_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L36))
 Returns the current time in the format `HH:MM:SS`.
 
-## `get_full_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L33))
+## `get_full_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L40))
 Returns the current time in the format `HH:MM:SS, Day, Mon, YYYY`.
 
-## `handle_keyboard_interrupt` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L1137))
+## `handle_keyboard_interrupt` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L1314))
 Shuts down the crawler when a `KeyboardInterrupt` is raised.
 
-## `info_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L561))
+## `info_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L638))
 Logs important information to the console and log file.
Example log: @@ -164,17 +164,17 @@ Example log: [23:17:06] [spidy] [LOG]: Saved done list to crawler_done.txt [23:17:06] [spidy] [LOG]: Saved 90 bad links to crawler_bad.txt -## `log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L578)) +## `log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L655)) Logs a single message to the error log file. Prints message verbatim, so message must be formatted correctly in the function call. -## `make_file_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L487)) +## `make_file_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L564)) Makes a valid Windows file path for a given url. -## `make_words` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L166)) +## `make_words` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L522)) Returns a list of all the valid words (determined using [`check_word`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#check_word--source)) on a given page. -## `mime_lookup` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L511)) +## `mime_lookup` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L588)) This finds the correct file extension for a MIME type using the [`MIME_TYPES`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#mime_types--source) dictionary.
If the MIME type is blank it defaults to `.html`, and if the MIME type is not in the dictionary a [`HeaderError`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#headererror--source) is raised.
Usage:
 
     mime_lookup(value)
 
 Where `value` is the MIME type.
 
-## `save_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L459))
+## `save_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L536))
 Saves the TODO, DONE, word, and bad lists to their respective files.
The word and bad link lists use the same function to save space.
 
-## `save_page` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L527))
+## `save_page` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L604))
 Downloads the content of a URL and saves it to the `saved/` folder.
 
-## `update_file` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L546))
+## `update_file` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L623))
 TODO
 
-## `write_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L78))
+## `write_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L85))
 Writes a message to both the console and the log file.
NOTE: Automatically adds timestamp and `[spidy]` to message, and formats message for log appropriately. -## `zip_saved_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L610)) +## `zip_saved_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L687)) Zips the contents of `saved/` to a `.zip` file.
Each archive is unique, with names generated from the current time.
 
+## `interceptor` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L11320))
+Intercepts each request made by selenium and updates its headers to match the ones selected by the user (a minimal sketch follows the `COUNTER` entry below).
+
 # Global Variables
 This section lists the variables in [`crawler.py`](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py) that are used throughout the code.
 
-## `COUNTER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L774))
+## `COUNTER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L852))
 Incremented each time a link is crawled.
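
The `interceptor` entry above refers to the selenium-wire callback that the crawler assigns with `browser.request_interceptor = interceptor` (visible in the crawler diff earlier in this series). The snippet below is a minimal illustrative sketch of such a hook, not spidy's exact implementation; the `HEADER` dict is a stand-in for whichever header set the user selected.

```python
# Illustrative selenium-wire request interceptor; placeholder values, not spidy's code.
HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; examplebot)'}  # assumed user-selected headers

def interceptor(request):
    # selenium-wire invokes this for every outgoing request once it is
    # assigned to browser.request_interceptor. Header fields can repeat,
    # so delete the old value before writing the replacement.
    del request.headers['User-Agent']
    request.headers['User-Agent'] = HEADER['User-Agent']
```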
-## `CRAWLER_DIR` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L30)) +## `WORKING_DIR` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L48)) The directory that `crawler.py` is located in. -## `DOMAIN` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L794)) +## `DOMAIN` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L873)) The domain that crawling is restricted to if [`RESTRICT`](#restrict--source) is `True`. -## `DONE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L798)) +## `DONE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L877)) TODO -## `DONE_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L797)) +## `DONE_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L876)) TODO -## `ERR_LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L56)) +## `ERR_LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L63)) TODO -## `ERR_LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L57)) +## `ERR_LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L64)) TODO -## `HEADER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L791)) +## `HEADER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L869)) TODO -## `HEADERS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L727)) +## `HEADERS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L805)) TODO -## `HTTP_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L777)) +## `HTTP_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L895)) TODO -## `KILL_LIST` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L762)) +## `KILL_LIST` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L894)) A list of pages that are known to cause problems with the crawler. - `bhphotovideo.com/c/search` @@ -243,33 +246,33 @@ A list of pages that are known to cause problems with the crawler. - `w3.org`: I have never been able to access W3, although it never says it's down. If someone knows of this problem, please let me know. - `web.archive.org/web/`: While there is some good content, there are sometimes thousands of copies of the same exact page. Not good for web crawling. -## `KNOWN_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L776)) +## `KNOWN_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L895)) TODO -## `LOG_END` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L504)) +## `LOG_END` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L)) Line to print at the end of each `logFile` log -## `LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L51)) +## `LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L893)) The file that the command line logs are written to.
Kept open until the crawler stops for whatever reason so that it can be written to. -## `LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L53)) +## `LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L893)) The actual file name of [`LOG_FILE`](#log_file--source).
Used in [`info_log`](#info_log--source). -## `MAX_HTTP_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `MAX_HTTP_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MAX_KNOWN_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `MAX_KNOWN_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MAX_NEW_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `MAX_NEW_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MAX_NEW_MIMES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L793)) +## `MAX_NEW_MIMES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MIME_TYPES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L628)) +## `MIME_TYPES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L705)) A dictionary of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) encountered by the crawler.
While there are [thousands of other types](https://www.iana.org/assignments/media-types/media-types.xhtml) that are not listed, to list them all would be impractical: - The size of the list would be huge, using memory, space, etc. @@ -288,63 +291,63 @@ Where `value` is the MIME type.
This will return the extension associated with the MIME type if it exists, however this will throw an [`IndexError`](https://docs.python.org/2/library/exceptions.html#exceptions.IndexError) if the MIME type is not in the dictionary.
Because of this, it is recommended to use the [`mime_lookup`](#mime_lookup--source) function. -## `NEW_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L775)) +## `NEW_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L853)) TODO -## `NEW_MIME_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L778)) +## `NEW_MIME_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L856)) TODO -## `OVERRIDE_SIZE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `OVERRIDE_SIZE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `OVERWRITE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `OVERWRITE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `RAISE_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `RAISE_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `RESTRICT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L794)) +## `RESTRICT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L873)) Whether to restrict crawling to [`DOMAIN`](#domain--source) or not. -## `SAVE_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `SAVE_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L871)) TODO -## `SAVE_PAGES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L796)) +## `SAVE_PAGES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L875)) TODO -## `SAVE_WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L796)) +## `SAVE_WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L875)) TODO -## `START` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L771)) +## `START` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L849)) Links to start crawling if the TODO list is empty -## `START_TIME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L37)) +## `START_TIME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L44)) The time that `crawler.py` was started, in seconds from the epoch.
More information can be found on the page for the Python [time](https://docs.python.org/3/library/time.html) library. -## `START_TIME_LONG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L38)) +## `START_TIME_LONG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L45)) The time that `crawler.py` was started, in the format `HH:MM:SS, Date Month Year`.
Used in `info_log`. -## `TODO` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L798)) +## `TODO` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L877)) The list containing all links that are yet to be crawled. -## `TODO_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L797)) +## `TODO_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L876)) TODO -## `USE_CONFIG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `USE_CONFIG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `VERSION` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L24)) +## `VERSION` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L31)) The current version of the crawler. -## `WORD_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L797)) +## `WORD_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L876)) TODO -## `WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L782)) +## `WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L860)) TODO -## `ZIP_FILES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `ZIP_FILES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO *** From 14566941b68da6be02130888bedbe0335e5f83cb Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 18:13:51 -0400 Subject: [PATCH 22/25] Contributors --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 42eb5b6..4f396d2 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Pretty simple! ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
![Lines of Code: 1810](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) -![Lines of Docs: 614](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) +![Lines of Docs: 616](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) [![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card) [![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy) [![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/) @@ -101,6 +101,7 @@ Here are some features we figure are worth noting. - Cross-Platform compatibility: spidy will work on all three major operating systems, Windows, Mac OS/X, and Linux! - Frequent Timestamp Logging: Spidy logs almost every action it takes to both the console and one of two log files. - Browser Spoofing: Make requests using User Agents from 4 popular web browsers, use a custom spidy bot one, or create your own! + - Headless Browser Support: Render full webpages to get dynamic content. - Portability: Move spidy's folder and its contents somewhere else and it will run right where it left off. *Note*: This only works if you run it from source code. - User-Friendly Logs: Both the console and log file messages are simple and easy to interpret, but packed with information. - Webpage saving: Spidy downloads each page that it runs into, regardless of file type. The crawler uses the HTTP `Content-Type` header returned with most files to determine the file type. @@ -225,6 +226,7 @@ See the [`CONTRIBUTING.md`](https://github.com/rivermont/spidy/blob/master/spidy * [quatroka](https://github.com/quatroka) - Fixed testing bugs. * [stevelle](https://github.com/stevelle) - Respect robots.txt. * [thatguywiththatname](https://github.com/thatguywiththatname) - README link corrections. +* [lkotlus](https://github.com/lkotlus) - Optimizations, out of scope items, and headless browser support. # License We used the [Gnu General Public License](https://www.gnu.org/licenses/gpl-3.0.en.html) (see [`LICENSE`](https://github.com/rivermont/spidy/blob/master/LICENSE)) as it was the license that best suited our needs.
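
Taken together, the selenium changes in this series amount to a small fetch helper: build a headless Firefox with ten-second limits, try to render the page, count alerts and timeouts as "known" errors, and hand the rest of the crawler an object that looks like a `requests` response. The sketch below is a simplified illustration of that flow, not spidy's exact code; `make_browser`, `fetch_rendered`, and the plain `KNOWN_ERRORS` integer are placeholder names, and it assumes Firefox plus geckodriver are installed.

```python
# Simplified sketch of the rendered-page fetch path; placeholder names throughout.
from types import SimpleNamespace

import requests
from selenium import webdriver
from selenium.common.exceptions import (TimeoutException,
                                        UnexpectedAlertPresentException,
                                        WebDriverException)
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.firefox.options import Options

HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; examplebot)'}  # assumed header set
KNOWN_ERRORS = 0  # stand-in for spidy's thread-safe error counter


def make_browser():
    # Headless Firefox with the same ten-second limits the patches configure.
    options = Options()
    options.add_argument('--headless')
    browser = webdriver.Firefox(options=options)
    browser.implicitly_wait(10)
    browser.set_page_load_timeout(10)
    return browser


def fetch_rendered(url, browser):
    # Returns an object with .text/.content/.headers like a requests response,
    # or None when the page could not be rendered.
    global KNOWN_ERRORS
    r = requests.get(url, headers=HEADER)  # real HTTP response headers come from requests
    try:
        browser.get(url)
    except UnexpectedAlertPresentException:
        Alert(browser).accept()  # dismiss the alert and keep whatever rendered
        KNOWN_ERRORS += 1
    except (TimeoutException, WebDriverException):
        KNOWN_ERRORS += 1
        return None
    return SimpleNamespace(text=browser.page_source,
                           content=browser.page_source.encode('utf-8'),
                           headers=r.headers)
```

A worker would call `make_browser()` once, reuse the browser for every `fetch_rendered(url, browser)` call, and call `browser.quit()` on shutdown.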

From 62b4668c50dfdd449d90f35aeadb7cb64e22bc30 Mon Sep 17 00:00:00 2001
From: lkotlus 
Date: Mon, 5 Aug 2024 11:55:44 -0400
Subject: [PATCH 23/25] Remove unnecessary print

---
 README.md        | 2 +-
 spidy/crawler.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 4f396d2..9060ffd 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Pretty simple!
 ![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg)
 ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
-![Lines of Code: 1810](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) +![Lines of Code: 1808](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) ![Lines of Docs: 616](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) [![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card) [![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy) diff --git a/spidy/crawler.py b/spidy/crawler.py index d642596..421fa4c 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -244,8 +244,6 @@ def crawl(url, browser, thread_id=0): # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. r = requests.get(url, headers=HEADER) - - print(f"attempting url: {url}") if (browser is None): page = r # Get page From 1547563d0c65e2bc6d67d65d595f338aea803c63 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Mon, 5 Aug 2024 11:58:31 -0400 Subject: [PATCH 24/25] KNOWN_ERROR_COUNT referenced before assignment fixed. --- spidy/crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 421fa4c..532605f 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -232,7 +232,7 @@ def _remember(self, url): write_log('INIT', 'Creating functions...') def crawl(url, browser, thread_id=0): - global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS + global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS, KNOWN_ERROR_COUNT if not OVERRIDE_SIZE: try: # Attempt to get the size in bytes of the document @@ -252,16 +252,16 @@ def crawl(url, browser, thread_id=0): browser.get(url) page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) except TimeoutException: - KNOWN_ERROR_COUNT += 1 + KNOWN_ERROR_COUNT.increment() return [] except UnexpectedAlertPresentException: browser.get(url) alert = Alert(browser) alert.accept() page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) - KNOWN_ERROR_COUNT += 1 + KNOWN_ERROR_COUNT.increment() except WebDriverException: - KNOWN_ERROR_COUNT += 1 + KNOWN_ERROR_COUNT.increment() return [] word_list = [] From b37fd416eab0eadc58b95a48fc3f8a2980ce8d6c Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 8 Aug 2024 10:25:54 -0400 Subject: [PATCH 25/25] Add maximum time --- README.md | 4 ++-- spidy/config/blank.cfg | 3 +++ spidy/config/default.cfg | 1 + spidy/config/docker.cfg | 1 + spidy/config/heavy.cfg | 1 + spidy/config/infinite.cfg | 1 + spidy/config/light.cfg | 1 + spidy/config/multithreaded.cfg | 1 + spidy/config/wsj.cfg | 1 + spidy/crawler.py | 8 ++++++-- spidy/docs/DOCS.md | 3 +++ 11 files changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9060ffd..f48aa96 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Pretty simple! ![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg) ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
-![Lines of Code: 1808](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) -![Lines of Docs: 616](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) +![Lines of Code: 1811](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) +![Lines of Docs: 619](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) [![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card) [![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy) [![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/) diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index 1e8c5eb..ed55fa9 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -60,5 +60,8 @@ MAX_KNOWN_ERRORS = MAX_HTTP_ERRORS = MAX_NEW_MIMES = +# Amount of time (in seconds) the crawl is allowed to run for (set to float('inf') if you want it to run forever) +MAX_TIME = + # Pages to start crawling on in case TODO is empty at start. START = ['', ''] diff --git a/spidy/config/default.cfg b/spidy/config/default.cfg index 6e89984..fa2afc0 100644 --- a/spidy/config/default.cfg +++ b/spidy/config/default.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/docker.cfg b/spidy/config/docker.cfg index 9a4b5d7..3a546ca 100644 --- a/spidy/config/docker.cfg +++ b/spidy/config/docker.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] diff --git a/spidy/config/heavy.cfg b/spidy/config/heavy.cfg index 8f3be0d..4e2f0ea 100644 --- a/spidy/config/heavy.cfg +++ b/spidy/config/heavy.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/infinite.cfg b/spidy/config/infinite.cfg index 71d616a..1c41881 100644 --- a/spidy/config/infinite.cfg +++ b/spidy/config/infinite.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 1000000 MAX_KNOWN_ERRORS = 1000000 MAX_HTTP_ERRORS = 1000000 MAX_NEW_MIMES = 1000000 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/light.cfg b/spidy/config/light.cfg index 991dfdf..7a11da4 100644 --- a/spidy/config/light.cfg +++ b/spidy/config/light.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = 600 START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/multithreaded.cfg b/spidy/config/multithreaded.cfg index eec2eff..17daafa 100644 --- a/spidy/config/multithreaded.cfg +++ b/spidy/config/multithreaded.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/wsj.cfg b/spidy/config/wsj.cfg index 3997015..d03ad06 100644 --- a/spidy/config/wsj.cfg +++ b/spidy/config/wsj.cfg @@ -26,4 +26,5 @@ MAX_NEW_ERRORS = 100 MAX_KNOWN_ERRORS = 100 MAX_HTTP_ERRORS = 100 MAX_NEW_MIMES = 
5 +MAX_TIME = float('inf') START = ['https://www.wsj.com/'] diff --git a/spidy/crawler.py b/spidy/crawler.py index 532605f..9075848 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -298,7 +298,7 @@ def crawl_worker(thread_id, robots_index): """ # Declare global variables - global VERSION, START_TIME, START_TIME_LONG + global VERSION, START_TIME, START_TIME_LONG, MAX_TIME global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT @@ -348,6 +348,10 @@ def crawl_worker(thread_id, robots_index): write_log('CRAWL', 'Too many errors have accumulated; stopping crawler.') done_crawling() break + elif time.time() - START_TIME >= MAX_TIME: # If too much time has passed + write_log('CRAWL', 'Maximum time has been exceeded.') + done_crawling() + break elif COUNTER.val >= SAVE_COUNT: # If it's time for an autosave # Make sure only one thread saves files with save_mutex: @@ -865,7 +869,7 @@ def zip_saved_files(out_file_name, directory): # Initialize variables as empty that will be needed in the global scope HEADER = {} USE_BROWSER = False -SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0 +SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_TIME = 0, 0, 0, 0, float('inf') MAX_NEW_MIMES = 0 RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE = False, False, '', [] USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE = False, False, False, False, False diff --git a/spidy/docs/DOCS.md b/spidy/docs/DOCS.md index 6fd46a6..43c9939 100644 --- a/spidy/docs/DOCS.md +++ b/spidy/docs/DOCS.md @@ -272,6 +272,9 @@ TODO ## `MAX_NEW_MIMES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO +## `MAX_TIME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) +Maximum amount of time (in seconds) that a crawl will go for. Defaults to float('inf'), allowing it to run forever. + ## `MIME_TYPES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L705)) A dictionary of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) encountered by the crawler.
While there are [thousands of other types](https://www.iana.org/assignments/media-types/media-types.xhtml) that are not listed, to list them all would be impractical: