From 3a62f5edb8f03a87ef858e0cc27f8757097d1979 Mon Sep 17 00:00:00 2001 From: Will Bennett Date: Mon, 1 Nov 2021 17:01:09 -0400 Subject: [PATCH 01/25] Use f-strings instead of .format() --- spidy/crawler.py | 77 +++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 19547c8..9c7a0a9 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -50,13 +50,13 @@ def get_full_time(): except OSError: pass # Assumes only OSError will complain if /logs already exists -LOG_FILE = open(path.join(WORKING_DIR, 'logs', 'spidy_log_{0}.txt'.format(START_TIME)), +LOG_FILE = open(path.join(WORKING_DIR, 'logs', f'spidy_log_{START_TIME}.txt'), 'w+', encoding='utf-8', errors='ignore') -LOG_FILE_NAME = path.join('logs', 'spidy_log_{0}'.format(START_TIME)) +LOG_FILE_NAME = path.join('logs', f'spidy_log_{START_TIME}') # Error log location -ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', 'spidy_error_log_{0}.txt'.format(START_TIME)) -ERR_LOG_FILE_NAME = path.join('logs', 'spidy_error_log_{0}.txt'.format(START_TIME)) +ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', f'spidy_error_log_{START_TIME}.txt') +ERR_LOG_FILE_NAME = path.join('logs', f'spidy_error_log_{START_TIME}.txt') LOGGER = logging.getLogger('SPIDY') LOGGER.setLevel(logging.DEBUG) @@ -101,15 +101,14 @@ def write_log(operation, message, package='spidy', status='INFO', worker=0): """ global LOG_FILE, log_mutex with log_mutex: - message = '[{0}] [{1}] [WORKER #{2}] [{3}] [{4}]: {5}'\ - .format(get_time(), package, str(worker), operation, status, message) + message = f'[{get_time()}] [{package}] [WORKER #{str(worker)}] [{operation}] [{status}]: {message}' print(message) if not LOG_FILE.closed: LOG_FILE.write('\n' + message) -write_log('INIT', 'Starting spidy Web Crawler version {0}'.format(VERSION)) -write_log('INIT', 'Report any problems to GitHub at https://github.com/rivermont/spidy') +write_log('INIT', f'Starting spidy Web Crawler version {VERSION}') +write_log('INIT', 'Report any problems on GitHub at https://github.com/rivermont/spidy/issues') ########### @@ -214,8 +213,7 @@ def _lookup(self, url): def _remember(self, url): urlparsed = urllib.parse.urlparse(url) robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt' - write_log('ROBOTS', - 'Reading robots.txt file at: {0}'.format(robots_url), + write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}'), package='reppy') robots = Robots.fetch(robots_url) checker = robots.agent(self.user_agent) @@ -262,12 +260,11 @@ def crawl(url, thread_id=0): save_page(url, page) if SAVE_WORDS: # Announce which link was crawled - write_log('CRAWL', 'Found {0} links and {1} words on {2}'.format(len(links), len(word_list), url), + write_log('CRAWL', f'Found {len(links)} links and {len(word_list)} words on {url}', worker=thread_id) else: # Announce which link was crawled - write_log('CRAWL', 'Found {0} links on {1}'.format(len(links), url), - worker=thread_id) + write_log('CRAWL', f'Found {len(links)} links on {url}', worker=thread_id) return links @@ -319,7 +316,7 @@ def crawl_worker(thread_id, robots_index): with save_mutex: if COUNTER.val > 0: try: - write_log('CRAWL', 'Queried {0} links.'.format(str(COUNTER.val)), worker=thread_id) + write_log('CRAWL', f'Queried {str(COUNTER.val)} links.', worker=thread_id) info_log() write_log('SAVE', 'Saving files...') save_files() @@ -356,8 +353,8 @@ def crawl_worker(thread_id, robots_index): except Exception as e: link = url - write_log('CRAWL', 'An error was raised 
trying to process {0}' - .format(link), status='ERROR', worker=thread_id) + write_log('CRAWL', f'An error was raised trying to process {link}', + status='ERROR', worker=thread_id) err_mro = type(e).mro() if SizeError in err_mro: @@ -406,7 +403,7 @@ def crawl_worker(thread_id, robots_index): elif 'Unknown MIME type' in str(e): NEW_MIME_COUNT.increment() - write_log('ERROR', 'Unknown MIME type: {0}'.format(str(e)[18:]), worker=thread_id) + write_log('ERROR', f'Unknown MIME type: {str(e)[18:]}', worker=thread_id) err_log(link, 'Unknown MIME', e) else: # Any other error @@ -498,7 +495,7 @@ def save_files(): todoList.write(site + '\n') # Save TODO list except UnicodeError: continue - write_log('SAVE', 'Saved TODO list to {0}'.format(TODO_FILE)) + write_log('SAVE', f'Saved TODO list to {TODO_FILE}') with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list: for site in copy(DONE.queue): @@ -506,7 +503,7 @@ def save_files(): done_list.write(site + '\n') # Save done list except UnicodeError: continue - write_log('SAVE', 'Saved DONE list to {0}'.format(TODO_FILE)) + write_log('SAVE', f'Saved DONE list to {TODO_FILE}') if SAVE_WORDS: update_file(WORD_FILE, WORDS.get_all(), 'words') @@ -549,7 +546,7 @@ def mime_lookup(value): elif value == '': return '.html' else: - raise HeaderError('Unknown MIME type: {0}'.format(value)) + raise HeaderError(f'Unknown MIME type: {value}') def save_page(url, page): @@ -559,15 +556,15 @@ def save_page(url, page): # Make file path ext = mime_lookup(get_mime_type(page)) cropped_url = make_file_path(url, ext) - file_path = path.join(WORKING_DIR, 'saved', '{0}'.format(cropped_url)) + file_path = path.join(WORKING_DIR, 'saved', cropped_url) # Save file with open(file_path, 'w', encoding='utf-8', errors='ignore') as file: if ext == '.html': - file.write(''' + file.write(f''' -'''.format(url)) +''') file.write(page.text) @@ -583,7 +580,7 @@ def update_file(file, content, file_type): for item in content: open_file.write('\n' + str(item)) # Write all words to file open_file.truncate() # Delete everything in file beyond what has been written (old stuff) - write_log('SAVE', 'Saved {0} {1} to {2}'.format(len(content), file_type, file)) + write_log('SAVE', f'Saved {len(content)} {file_type} to {file}') def info_log(): @@ -591,16 +588,16 @@ def info_log(): Logs important information to the console and log file. 
""" # Print to console - write_log('LOG', 'Started at {0}'.format(START_TIME_LONG)) - write_log('LOG', 'Log location: {0}'.format(LOG_FILE_NAME)) - write_log('LOG', 'Error log location: {0}'.format(ERR_LOG_FILE_NAME)) - write_log('LOG', '{0} links in TODO'.format(TODO.qsize())) - write_log('LOG', '{0} links in DONE'.format(DONE.qsize())) - write_log('LOG', 'TODO/DONE: {0}'.format(TODO.qsize() / DONE.qsize())) - write_log('LOG', '{0}/{1} new errors caught.'.format(NEW_ERROR_COUNT.val, MAX_NEW_ERRORS)) - write_log('LOG', '{0}/{1} HTTP errors encountered.'.format(HTTP_ERROR_COUNT.val, MAX_HTTP_ERRORS)) - write_log('LOG', '{0}/{1} new MIMEs found.'.format(NEW_MIME_COUNT.val, MAX_NEW_MIMES)) - write_log('LOG', '{0}/{1} known errors caught.'.format(KNOWN_ERROR_COUNT.val, MAX_KNOWN_ERRORS)) + write_log('LOG', f'Started at {START_TIME_LONG}') + write_log('LOG', f'Log location: {LOG_FILE_NAME}') + write_log('LOG', f'Error log location: {ERR_LOG_FILE_NAME}') + write_log('LOG', f'{TODO.qsize()} links in TODO') + write_log('LOG', f'{DONE.qsize()} links in DONE') + write_log('LOG', f'TODO/DONE: {TODO.qsize() / DONE.qsize()}') + write_log('LOG', f'{NEW_ERROR_COUNT.val}/{MAX_NEW_ERRORS} new errors caught.') + write_log('LOG', f'{HTTP_ERROR_COUNT.val}/{MAX_HTTP_ERRORS} HTTP errors encountered.') + write_log('LOG', f'{NEW_MIME_COUNT.val}/{MAX_NEW_MIMES} new MIMEs found.') + write_log('LOG', f'{KNOWN_ERROR_COUNT.val}/{MAX_KNOWN_ERRORS} known errors caught.') def log(message, level=logging.DEBUG): @@ -622,7 +619,7 @@ def handle_invalid_input(type_='input. (yes/no)'): """ Handles an invalid user input, usually from the input() function. """ - write_log('INIT', 'Please enter a valid {0}'.format(type_), status='ERROR') + write_log('INIT', f'Please enter a valid {type_}', status='ERROR') # could raise InputError but this means the user must go through the whole init process again @@ -632,7 +629,7 @@ def err_log(url, error1, error2): error1 is the trimmed error source. error2 is the extended text of the error. 
""" - LOGGER.error("\nURL: {0}\nERROR: {1}\nEXT: {2}\n\n".format(url, error1, str(error2))) + LOGGER.error(f"\nURL: {url}\nERROR: {error1}\nEXT: {str(error2)}\n\n") def zip_saved_files(out_file_name, directory): @@ -642,7 +639,7 @@ def zip_saved_files(out_file_name, directory): shutil.make_archive(str(out_file_name), 'zip', directory) # Zips files shutil.rmtree(directory) # Deletes folder makedirs(directory) # Creates empty folder of same name - write_log('SAVE', 'Zipped documents to {0}.zip'.format(out_file_name)) + write_log('SAVE', f'Zipped documents to {out_file_name}.zip') ######## @@ -1260,10 +1257,10 @@ def main(): with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'): pass - write_log('INIT', 'Successfully started spidy Web Crawler version {0}...'.format(VERSION)) + write_log('INIT', f'Successfully started spidy Web Crawler version {VERSION}...') LOGGER.log(logging.INFO, 'Successfully started crawler.') - write_log('INIT', 'Using headers: {0}'.format(HEADER)) + write_log('INIT', f'Using headers: {HEADER}') robots_index = RobotsIndex(RESPECT_ROBOTS, HEADER['User-Agent']) @@ -1274,6 +1271,6 @@ def main(): if __name__ == '__main__': main() else: - write_log('INIT', 'Successfully imported spidy Web Crawler version {0}.'.format(VERSION)) + write_log('INIT', f'Successfully imported spidy Web Crawler version {VERSION}.') write_log('INIT', 'Call `crawler.main()` to start crawling, or refer to DOCS.md to see use of specific functions.') From 59e124d0ca34b609ace0a911a72ca9151dc24b5c Mon Sep 17 00:00:00 2001 From: Will Bennett Date: Wed, 27 Apr 2022 20:12:36 -0400 Subject: [PATCH 02/25] Remove stray parenthesis. --- spidy/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 9c7a0a9..99b0b2c 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -213,7 +213,7 @@ def _lookup(self, url): def _remember(self, url): urlparsed = urllib.parse.urlparse(url) robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt' - write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}'), + write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}', package='reppy') robots = Robots.fetch(robots_url) checker = robots.agent(self.user_agent) From 15d4e8c58db0061d78fb8066b4de00a463017be1 Mon Sep 17 00:00:00 2001 From: rivermont Date: Wed, 27 Apr 2022 20:23:05 -0400 Subject: [PATCH 03/25] Remove obselete configs. 
--- spidy/config/rivermont-infinite.cfg | 21 --------------------- spidy/config/rivermont.cfg | 20 -------------------- 2 files changed, 41 deletions(-) delete mode 100644 spidy/config/rivermont-infinite.cfg delete mode 100644 spidy/config/rivermont.cfg diff --git a/spidy/config/rivermont-infinite.cfg b/spidy/config/rivermont-infinite.cfg deleted file mode 100644 index 7682ae0..0000000 --- a/spidy/config/rivermont-infinite.cfg +++ /dev/null @@ -1,21 +0,0 @@ -THREAD_COUNT = 8 -OVERWRITE = False -THREAD_COUNT = 8 -RAISE_ERRORS = False -SAVE_PAGES = True -SAVE_WORDS = False -ZIP_FILES = False -OVERRIDE_SIZE = False -RESTRICT = False -DOMAIN = '' -RESPECT_ROBOTS = False -TODO_FILE = 'crawler_todo.txt' -DONE_FILE = 'crawler_done.txt' -WORD_FILE = 'crawler_words.txt' -SAVE_COUNT = 100 -HEADER = HEADERS['spidy'] -MAX_NEW_ERRORS = 1000000 -MAX_KNOWN_ERRORS = 1000000 -MAX_HTTP_ERRORS = 1000000 -MAX_NEW_MIMES = 1000000 -START = ['http://24.40.136.85/'] \ No newline at end of file diff --git a/spidy/config/rivermont.cfg b/spidy/config/rivermont.cfg deleted file mode 100644 index b942436..0000000 --- a/spidy/config/rivermont.cfg +++ /dev/null @@ -1,20 +0,0 @@ -THREAD_COUNT = 8 -OVERWRITE = False -RAISE_ERRORS = False -SAVE_PAGES = True -ZIP_FILES = False -OVERRIDE_SIZE = False -SAVE_WORDS = False -RESTRICT = False -DOMAIN = '' -RESPECT_ROBOTS = False -TODO_FILE = 'crawler_todo.txt' -DONE_FILE = 'crawler_done.txt' -WORD_FILE = 'crawler_words.txt' -SAVE_COUNT = 100 -HEADER = HEADERS['spidy'] -MAX_NEW_ERRORS = 5 -MAX_KNOWN_ERRORS = 20 -MAX_HTTP_ERRORS = 20 -MAX_NEW_MIMES = 10 -START = ['http://24.40.136.85/'] From a242c7ca721fd7cfb45101253b0e8431c3a3f365 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 17:57:20 -0400 Subject: [PATCH 04/25] Adding argparse stuff --- spidy/crawler.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 99b0b2c..bbe30f0 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -3,6 +3,7 @@ spidy Web Crawler Built by rivermont and FalconWarriorr """ +import argparse import time import shutil import requests @@ -829,7 +830,7 @@ def zip_saved_files(out_file_name, directory): THREAD_RUNNING = True -def init(): +def init(args=None): """ Sets all of the variables for spidy, and as a result can be used for effectively resetting the crawler. 
@@ -847,6 +848,26 @@ def init(): # Getting Arguments + if (args): + write_log('INIT', 'Config file name:', status='INPUT') + while True: + input_ = input() + try: + if input_[-4:] == '.cfg': + file_path = path.join(PACKAGE_DIR, 'config', input_) + else: + file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) + write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + for line in file.readlines(): + exec(line, globals()) + break + except FileNotFoundError: + write_log('INIT', 'Config file not found.', status='ERROR') + # raise FileNotFoundError() + + write_log('INIT', 'Please name a valid .cfg file.') + if not path.exists(path.join(PACKAGE_DIR, 'config')): write_log('INIT', 'No config folder available.') USE_CONFIG = False @@ -1243,7 +1264,14 @@ def main(): global WORDS, TODO, DONE try: - init() + parser = argparse.ArgumentParser(prog="net.py", description="Builds Containernet Topology") + parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) + args = parser.parse_args() + + if (args["f"]): + init(args["f"]) + else: + init() except KeyboardInterrupt: handle_keyboard_interrupt() From 584a6ac154a01254ad236f9a5c082de890012652 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:01:22 -0400 Subject: [PATCH 05/25] Basic outline of out of scope options --- spidy/config/blank.cfg | 3 +++ spidy/config/default.cfg | 1 + spidy/crawler.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index 91c8f37..f10aa95 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -28,6 +28,9 @@ RESTRICT = # The domain within which to restrict crawling. 
DOMAIN = '' +# Domains or subdomains that are out of scope for the crawl +OUT_OF_SCOPE = ['', ''] + # Whether to respect sites' robots.txt or not RESPECT_ROBOTS = diff --git a/spidy/config/default.cfg b/spidy/config/default.cfg index c02de63..4faafed 100644 --- a/spidy/config/default.cfg +++ b/spidy/config/default.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/crawler.py b/spidy/crawler.py index bbe30f0..87ba236 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -1260,7 +1260,7 @@ def main(): global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE - global RESPECT_ROBOTS, RESTRICT, DOMAIN + global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE global WORDS, TODO, DONE try: From 043a834540ce8f614f6a7034ef0fe4df28fb4db4 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:13:53 -0400 Subject: [PATCH 06/25] Add out of scope functionality and adjust the restricted domain logic --- spidy/crawler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 87ba236..abbfde4 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -432,7 +432,8 @@ def check_link(item, robots_index=None): if robots_index and not robots_index.is_allowed(item): return True if RESTRICT: - if DOMAIN not in item: + if DOMAIN not in item.split('/')[2][]: + # Splitting a url on '/' results in ['http(s)', '', '[sub]DOMAIN', 'dir', 'dir', ...] return True if len(item) < 10 or len(item) > 255: return True @@ -441,6 +442,12 @@ def check_link(item, robots_index=None): return True elif item in copy(DONE.queue): return True + + # Check each domain in the out of scope blacklist + for domain in OUT_OF_SCOPE: + if domain in item: + return True + return False @@ -848,7 +855,7 @@ def init(args=None): # Getting Arguments - if (args): + if args: write_log('INIT', 'Config file name:', status='INPUT') while True: input_ = input() @@ -1268,7 +1275,7 @@ def main(): parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) args = parser.parse_args() - if (args["f"]): + if args["f"]: init(args["f"]) else: init() From cb0e33e1d6b3289cf217e5373d356ccf5a08cdbb Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:16:17 -0400 Subject: [PATCH 07/25] Fix my wording on out of scope stuff --- spidy/config/blank.cfg | 2 +- spidy/crawler.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index f10aa95..8c933c1 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -28,7 +28,7 @@ RESTRICT = # The domain within which to restrict crawling. 
DOMAIN = '' -# Domains or subdomains that are out of scope for the crawl +# Domains, subdomains, and paths that are out of scope for the crawl OUT_OF_SCOPE = ['', ''] # Whether to respect sites' robots.txt or not diff --git a/spidy/crawler.py b/spidy/crawler.py index abbfde4..78aa0f9 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -443,9 +443,9 @@ def check_link(item, robots_index=None): elif item in copy(DONE.queue): return True - # Check each domain in the out of scope blacklist - for domain in OUT_OF_SCOPE: - if domain in item: + # Check each domain, subdomain, or path in the out of scope blacklist + for scope in OUT_OF_SCOPE: + if scope in item: return True return False From 69e42556041784f58b7433a4473d01e59b451980 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:31:25 -0400 Subject: [PATCH 08/25] Fix syntax error (I am a programming genius) --- spidy/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 78aa0f9..9b866ef 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -432,7 +432,7 @@ def check_link(item, robots_index=None): if robots_index and not robots_index.is_allowed(item): return True if RESTRICT: - if DOMAIN not in item.split('/')[2][]: + if DOMAIN not in item.split('/')[2]: # Splitting a url on '/' results in ['http(s)', '', '[sub]DOMAIN', 'dir', 'dir', ...] return True if len(item) < 10 or len(item) > 255: From 6155f7bc7ffec1aa2eaa9f5ef138fe9db3e26c30 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:41:14 -0400 Subject: [PATCH 09/25] Fix some of my logic --- spidy/crawler.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 9b866ef..db18c46 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -837,7 +837,7 @@ def zip_saved_files(out_file_name, directory): THREAD_RUNNING = True -def init(args=None): +def init(arg_file=None): """ Sets all of the variables for spidy, and as a result can be used for effectively resetting the crawler. @@ -855,15 +855,17 @@ def init(args=None): # Getting Arguments - if args: + if not path.exists(path.join(PACKAGE_DIR, 'config')): + write_log('INIT', 'No config folder available.') + USE_CONFIG = False + elif arg_file: write_log('INIT', 'Config file name:', status='INPUT') while True: - input_ = input() try: - if input_[-4:] == '.cfg': - file_path = path.join(PACKAGE_DIR, 'config', input_) + if arg_file[-4:] == '.cfg': + file_path = path.join(PACKAGE_DIR, 'config', arg_file) else: - file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) + file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(arg_file)) write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: for line in file.readlines(): @@ -874,10 +876,6 @@ def init(args=None): # raise FileNotFoundError() write_log('INIT', 'Please name a valid .cfg file.') - - if not path.exists(path.join(PACKAGE_DIR, 'config')): - write_log('INIT', 'No config folder available.') - USE_CONFIG = False else: write_log('INIT', 'Should spidy load settings from an available config file? 
(y/n):', status='INPUT') while True: @@ -1275,8 +1273,8 @@ def main(): parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) args = parser.parse_args() - if args["f"]: - init(args["f"]) + if args.config_file is not None: + init(args.config_file) else: init() except KeyboardInterrupt: From 251230d8dbd2e3da6aba99fd5ca99c64da495517 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 18:51:44 -0400 Subject: [PATCH 10/25] If the argument is used, don't go looking for user input --- spidy/crawler.py | 463 ++++++++++++++++++++++++----------------------- 1 file changed, 232 insertions(+), 231 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index db18c46..da18af5 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -892,285 +892,286 @@ def init(arg_file=None): else: handle_invalid_input() - if USE_CONFIG: - write_log('INIT', 'Config file name:', status='INPUT') - while True: - input_ = input() - try: - if input_[-4:] == '.cfg': - file_path = path.join(PACKAGE_DIR, 'config', input_) - else: - file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) - write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) - with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: - for line in file.readlines(): - exec(line, globals()) - break - except FileNotFoundError: - write_log('INIT', 'Config file not found.', status='ERROR') - # raise FileNotFoundError() - - write_log('INIT', 'Please name a valid .cfg file.') - - else: - write_log('INIT', 'Please enter the following arguments. Leave blank to use the default values.') - - write_log('INIT', 'How many parallel threads should be used for crawler? (Default: 1):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - THREAD_COUNT = 1 - break - elif input_.isdigit(): - THREAD_COUNT = int(input_) - break - else: - handle_invalid_input('integer.') + if arg_file is None: + if USE_CONFIG: + write_log('INIT', 'Config file name:', status='INPUT') + while True: + input_ = input() + try: + if input_[-4:] == '.cfg': + file_path = path.join(PACKAGE_DIR, 'config', input_) + else: + file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) + write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + for line in file.readlines(): + exec(line, globals()) + break + except FileNotFoundError: + write_log('INIT', 'Config file not found.', status='ERROR') + # raise FileNotFoundError() + + write_log('INIT', 'Please name a valid .cfg file.') - write_log('INIT', 'Should spidy load from existing save files? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - OVERWRITE = False - break - elif input_ in yes: - OVERWRITE = False - break - elif input_ in no: - OVERWRITE = True - break - else: - handle_invalid_input() + else: + write_log('INIT', 'Please enter the following arguments. Leave blank to use the default values.') - write_log('INIT', 'Should spidy raise NEW errors and stop crawling? (y/n) (Default: No):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - RAISE_ERRORS = False - break - elif input_ in yes: - RAISE_ERRORS = True - break - elif input_ in no: - RAISE_ERRORS = False - break - else: - handle_invalid_input() + write_log('INIT', 'How many parallel threads should be used for crawler? 
(Default: 1):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + THREAD_COUNT = 1 + break + elif input_.isdigit(): + THREAD_COUNT = int(input_) + break + else: + handle_invalid_input('integer.') - write_log('INIT', 'Should spidy save the pages it scrapes to the saved folder? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - SAVE_PAGES = True - break - elif input_ in yes: - SAVE_PAGES = True - break - elif input_ in no: - SAVE_PAGES = False - break - else: - handle_invalid_input() + write_log('INIT', 'Should spidy load from existing save files? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + OVERWRITE = False + break + elif input_ in yes: + OVERWRITE = False + break + elif input_ in no: + OVERWRITE = True + break + else: + handle_invalid_input() - if SAVE_PAGES: - write_log('INIT', 'Should spidy zip saved documents when autosaving? (y/n) (Default: No):', status='INPUT') + write_log('INIT', 'Should spidy raise NEW errors and stop crawling? (y/n) (Default: No):', status='INPUT') while True: input_ = input() if not bool(input_): - ZIP_FILES = False + RAISE_ERRORS = False break elif input_ in yes: - ZIP_FILES = True + RAISE_ERRORS = True break elif input_ in no: - ZIP_FILES = False + RAISE_ERRORS = False break else: handle_invalid_input() - else: - ZIP_FILES = False - write_log('INIT', 'Should spidy download documents larger than 500 MB? (y/n) (Default: No):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - OVERRIDE_SIZE = False - break - elif input_ in yes: - OVERRIDE_SIZE = True - break - elif input_ in no: - OVERRIDE_SIZE = False - break - else: - handle_invalid_input() + write_log('INIT', 'Should spidy save the pages it scrapes to the saved folder? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + SAVE_PAGES = True + break + elif input_ in yes: + SAVE_PAGES = True + break + elif input_ in no: + SAVE_PAGES = False + break + else: + handle_invalid_input() - write_log('INIT', 'Should spidy scrape words and save them? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - SAVE_WORDS = True - break - elif input_ in yes: - SAVE_WORDS = True - break - elif input_ in no: - SAVE_WORDS = False - break + if SAVE_PAGES: + write_log('INIT', 'Should spidy zip saved documents when autosaving? (y/n) (Default: No):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + ZIP_FILES = False + break + elif input_ in yes: + ZIP_FILES = True + break + elif input_ in no: + ZIP_FILES = False + break + else: + handle_invalid_input() else: - handle_invalid_input() + ZIP_FILES = False - write_log('INIT', 'Should spidy restrict crawling to a specific domain only? (y/n) (Default: No):', - status='INPUT') - while True: - input_ = input() - if not bool(input_): - RESTRICT = False - break - elif input_ in yes: - RESTRICT = True - break - elif input_ in no: - RESTRICT = False - break - else: - handle_invalid_input() - - if RESTRICT: - write_log('INIT', 'What domain should crawling be limited to? Can be subdomains, http/https, etc.', - status='INPUT') + write_log('INIT', 'Should spidy download documents larger than 500 MB? 
(y/n) (Default: No):', status='INPUT') while True: input_ = input() - try: - DOMAIN = input_ + if not bool(input_): + OVERRIDE_SIZE = False + break + elif input_ in yes: + OVERRIDE_SIZE = True + break + elif input_ in no: + OVERRIDE_SIZE = False break - except KeyError: - handle_invalid_input('string.') + else: + handle_invalid_input() - write_log('INIT', 'Should spidy respect sites\' robots.txt? (y/n) (Default: Yes):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - RESPECT_ROBOTS = True - break - elif input_ in yes: - RESPECT_ROBOTS = True - break - elif input_ in no: - RESPECT_ROBOTS = False - break - else: - handle_invalid_input() + write_log('INIT', 'Should spidy scrape words and save them? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + SAVE_WORDS = True + break + elif input_ in yes: + SAVE_WORDS = True + break + elif input_ in no: + SAVE_WORDS = False + break + else: + handle_invalid_input() - write_log('INIT', 'What HTTP browser headers should spidy imitate?', status='INPUT') - write_log('INIT', 'Choices: spidy (default), Chrome, Firefox, IE, Edge, Custom:', status='INPUT') - while True: - input_ = input() - if not bool(input_): - HEADER = HEADERS['spidy'] - break - elif input_.lower() == 'custom': - # Here we just trust that the user is inputting valid headers... - write_log('INIT', 'Valid HTTP headers:', status='INPUT') - HEADER = input() - break - else: - try: - HEADER = HEADERS[input_] + write_log('INIT', 'Should spidy restrict crawling to a specific domain only? (y/n) (Default: No):', + status='INPUT') + while True: + input_ = input() + if not bool(input_): + RESTRICT = False + break + elif input_ in yes: + RESTRICT = True + break + elif input_ in no: + RESTRICT = False break - except KeyError: - handle_invalid_input('browser name.') + else: + handle_invalid_input() - write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT') - input_ = input() - if not bool(input_): - TODO_FILE = 'crawler_todo.txt' - else: - TODO_FILE = input_ + if RESTRICT: + write_log('INIT', 'What domain should crawling be limited to? Can be subdomains, http/https, etc.', + status='INPUT') + while True: + input_ = input() + try: + DOMAIN = input_ + break + except KeyError: + handle_invalid_input('string.') + + write_log('INIT', 'Should spidy respect sites\' robots.txt? (y/n) (Default: Yes):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + RESPECT_ROBOTS = True + break + elif input_ in yes: + RESPECT_ROBOTS = True + break + elif input_ in no: + RESPECT_ROBOTS = False + break + else: + handle_invalid_input() - write_log('INIT', 'Location of the DONE save file (Default: crawler_done.txt):', status='INPUT') - input_ = input() - if not bool(input_): - DONE_FILE = 'crawler_done.txt' - else: - DONE_FILE = input_ + write_log('INIT', 'What HTTP browser headers should spidy imitate?', status='INPUT') + write_log('INIT', 'Choices: spidy (default), Chrome, Firefox, IE, Edge, Custom:', status='INPUT') + while True: + input_ = input() + if not bool(input_): + HEADER = HEADERS['spidy'] + break + elif input_.lower() == 'custom': + # Here we just trust that the user is inputting valid headers... 
+ write_log('INIT', 'Valid HTTP headers:', status='INPUT') + HEADER = input() + break + else: + try: + HEADER = HEADERS[input_] + break + except KeyError: + handle_invalid_input('browser name.') - if SAVE_WORDS: - write_log('INIT', 'Location of the words save file (Default: crawler_words.txt):', status='INPUT') + write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT') input_ = input() if not bool(input_): - WORD_FILE = 'crawler_words.txt' + TODO_FILE = 'crawler_todo.txt' else: - WORD_FILE = input_ - else: - WORD_FILE = 'None' + TODO_FILE = input_ - write_log('INIT', 'After how many queried links should the crawler autosave? (Default: 100):', status='INPUT') - while True: + write_log('INIT', 'Location of the DONE save file (Default: crawler_done.txt):', status='INPUT') input_ = input() if not bool(input_): - SAVE_COUNT = 100 - break - elif input_.isdigit(): - SAVE_COUNT = int(input_) - break + DONE_FILE = 'crawler_done.txt' else: - handle_invalid_input('integer.') + DONE_FILE = input_ - if not RAISE_ERRORS: - write_log('INIT', 'After how many new errors should spidy stop? (Default: 5):', status='INPUT') + if SAVE_WORDS: + write_log('INIT', 'Location of the words save file (Default: crawler_words.txt):', status='INPUT') + input_ = input() + if not bool(input_): + WORD_FILE = 'crawler_words.txt' + else: + WORD_FILE = input_ + else: + WORD_FILE = 'None' + + write_log('INIT', 'After how many queried links should the crawler autosave? (Default: 100):', status='INPUT') while True: input_ = input() if not bool(input_): - MAX_NEW_ERRORS = 5 + SAVE_COUNT = 100 break elif input_.isdigit(): - MAX_NEW_ERRORS = int(input_) + SAVE_COUNT = int(input_) break else: handle_invalid_input('integer.') - else: - MAX_NEW_ERRORS = 1 - write_log('INIT', 'After how many known errors should spidy stop? (Default: 10):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - MAX_KNOWN_ERRORS = 20 - break - elif input_.isdigit(): - MAX_KNOWN_ERRORS = int(input_) - break + if not RAISE_ERRORS: + write_log('INIT', 'After how many new errors should spidy stop? (Default: 5):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_NEW_ERRORS = 5 + break + elif input_.isdigit(): + MAX_NEW_ERRORS = int(input_) + break + else: + handle_invalid_input('integer.') else: - handle_invalid_input('integer.') + MAX_NEW_ERRORS = 1 - write_log('INIT', 'After how many HTTP errors should spidy stop? (Default: 20):', status='INPUT') - while True: - input_ = input() - if not bool(input_): - MAX_HTTP_ERRORS = 50 - break - elif not input_.isdigit(): - MAX_HTTP_ERRORS = int(input_) - break - else: - handle_invalid_input('integer.') + write_log('INIT', 'After how many known errors should spidy stop? (Default: 10):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_KNOWN_ERRORS = 20 + break + elif input_.isdigit(): + MAX_KNOWN_ERRORS = int(input_) + break + else: + handle_invalid_input('integer.') - write_log('INIT', 'After encountering how many new MIME types should spidy stop? (Default: 20):', - status='INPUT') - while True: - input_ = input() - if not bool(input_): - MAX_NEW_MIMES = 10 - break - elif input_.isdigit(): - MAX_NEW_MIMES = int(input_) - break - else: - handle_invalid_input('integer') + write_log('INIT', 'After how many HTTP errors should spidy stop? 
(Default: 20):', status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_HTTP_ERRORS = 50 + break + elif not input_.isdigit(): + MAX_HTTP_ERRORS = int(input_) + break + else: + handle_invalid_input('integer.') + + write_log('INIT', 'After encountering how many new MIME types should spidy stop? (Default: 20):', + status='INPUT') + while True: + input_ = input() + if not bool(input_): + MAX_NEW_MIMES = 10 + break + elif input_.isdigit(): + MAX_NEW_MIMES = int(input_) + break + else: + handle_invalid_input('integer') - # Remove INPUT variable from memory - del input_ + # Remove INPUT variable from memory + del input_ if OVERWRITE: write_log('INIT', 'Creating save files...') From 0b109c77ad661da241d3bb05fb9784da5e8192f5 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 19:17:40 -0400 Subject: [PATCH 11/25] Check if OUT_OF_SCOPE was set --- spidy/crawler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index da18af5..537c872 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -444,9 +444,10 @@ def check_link(item, robots_index=None): return True # Check each domain, subdomain, or path in the out of scope blacklist - for scope in OUT_OF_SCOPE: - if scope in item: - return True + if OUT_OF_SCOPE is not None: + for scope in OUT_OF_SCOPE: + if scope in item: + return True return False From da4d2c721b366892e0d324bddd064448ce315114 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 19:25:37 -0400 Subject: [PATCH 12/25] Scratch that previous commit... --- spidy/crawler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 537c872..da18af5 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -444,10 +444,9 @@ def check_link(item, robots_index=None): return True # Check each domain, subdomain, or path in the out of scope blacklist - if OUT_OF_SCOPE is not None: - for scope in OUT_OF_SCOPE: - if scope in item: - return True + for scope in OUT_OF_SCOPE: + if scope in item: + return True return False From 91080bce2dfbcc8153ca78a4720aa9f7693a4c02 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 1 Aug 2024 19:35:06 -0400 Subject: [PATCH 13/25] Optimize by preventing multiple checks of the same URL --- spidy/crawler.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spidy/crawler.py b/spidy/crawler.py index da18af5..dbadc36 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -448,6 +448,11 @@ def check_link(item, robots_index=None): if scope in item: return True + # Check if the URL has already been processed + if item in FOUND_URLS: + return True + + FOUND_URLS.add(item) return False @@ -1268,6 +1273,9 @@ def main(): global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE global WORDS, TODO, DONE + global FOUND_URLS + + FOUND_URLS = set() try: parser = argparse.ArgumentParser(prog="net.py", description="Builds Containernet Topology") From 2ebe062867786efdb8f407c4f62510aa012d1f22 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 09:24:44 -0400 Subject: [PATCH 14/25] Fix some globals and whatnot --- spidy/crawler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index dbadc36..5659eab 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -830,7 +830,7 @@ def zip_saved_files(out_file_name, directory): HEADER = {} SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0 MAX_NEW_MIMES 
= 0 -RESPECT_ROBOTS, RESTRICT, DOMAIN = False, False, '' +RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE = False, False, '', [] USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE = False, False, False, False, False SAVE_PAGES, SAVE_WORDS = False, False TODO_FILE, DONE_FILE, WORD_FILE = '', '', '' @@ -840,6 +840,7 @@ def zip_saved_files(out_file_name, directory): save_mutex = threading.Lock() FINISHED = False THREAD_RUNNING = True +FOUND_URLS = set() def init(arg_file=None): @@ -1275,8 +1276,6 @@ def main(): global WORDS, TODO, DONE global FOUND_URLS - FOUND_URLS = set() - try: parser = argparse.ArgumentParser(prog="net.py", description="Builds Containernet Topology") parser.add_argument("-f", "--config-file", type=str, help="Path to the desired config file.", required=False) From 305ee307d11a1102c6780e8f2714b926ecbee739 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 10:14:29 -0400 Subject: [PATCH 15/25] Update config files, add selenium (import only, no code yet) to crawler and requirements --- requirements.txt | 1 + spidy/config/docker.cfg | 1 + spidy/config/heavy.cfg | 1 + spidy/config/infinite.cfg | 1 + spidy/config/light.cfg | 1 + spidy/config/multithreaded.cfg | 1 + spidy/config/wsj.cfg | 3 +++ spidy/crawler.py | 1 + 8 files changed, 10 insertions(+) diff --git a/requirements.txt b/requirements.txt index 8028352..4b48956 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ requests lxml flake8 reppy +selenium \ No newline at end of file diff --git a/spidy/config/docker.cfg b/spidy/config/docker.cfg index 9e52af7..8d90285 100644 --- a/spidy/config/docker.cfg +++ b/spidy/config/docker.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = '/data/crawler_todo.txt' DONE_FILE = '/data/crawler_done.txt' diff --git a/spidy/config/heavy.cfg b/spidy/config/heavy.cfg index 1c41c91..7641797 100644 --- a/spidy/config/heavy.cfg +++ b/spidy/config/heavy.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = True RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = False TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/infinite.cfg b/spidy/config/infinite.cfg index bcf11bc..840c05f 100644 --- a/spidy/config/infinite.cfg +++ b/spidy/config/infinite.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/light.cfg b/spidy/config/light.cfg index 9a916c9..5d06f4d 100644 --- a/spidy/config/light.cfg +++ b/spidy/config/light.cfg @@ -7,6 +7,7 @@ OVERRIDE_SIZE = False SAVE_WORDS = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = True TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/multithreaded.cfg b/spidy/config/multithreaded.cfg index 1af0311..5d212e3 100644 --- a/spidy/config/multithreaded.cfg +++ b/spidy/config/multithreaded.cfg @@ -7,6 +7,7 @@ ZIP_FILES = True OVERRIDE_SIZE = False RESTRICT = False DOMAIN = '' +OUT_OF_SCOPE = [] RESPECT_ROBOTS = False TODO_FILE = 'crawler_todo.txt' DONE_FILE = 'crawler_done.txt' diff --git a/spidy/config/wsj.cfg b/spidy/config/wsj.cfg index 5a5ed40..e412caf 100644 --- a/spidy/config/wsj.cfg +++ b/spidy/config/wsj.cfg @@ -12,6 +12,9 @@ RESTRICT = True # The domain within which to restrict crawling. 
DOMAIN = 'wsj.com/' +# Do not allow crawling involving specific pages and subdomains +OUT_OF_SCOPE = ['wsj.com/business/airlines', 'africa.wsj.com'] + RESPECT_ROBOTS = True TODO_FILE = 'wsj_todo.txt' DONE_FILE = 'wsj_done.txt' diff --git a/spidy/crawler.py b/spidy/crawler.py index 5659eab..4055446 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -11,6 +11,7 @@ import threading import queue import logging +import selenium from os import path, makedirs from copy import copy From 2ccf5b103baf9a9c0ebf86dbe1c6b452cd0325bb Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 10:50:25 -0400 Subject: [PATCH 16/25] Fix imports --- spidy/crawler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 4055446..7efbcbf 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -11,13 +11,14 @@ import threading import queue import logging -import selenium from os import path, makedirs from copy import copy from lxml import etree from lxml.html import iterlinks, resolve_base_href, make_links_absolute from reppy.robots import Robots +from selenium import webdriver +from selenium.webdriver.firefox.options import Options try: from spidy import __version__ @@ -228,7 +229,7 @@ def _remember(self, url): write_log('INIT', 'Creating functions...') - +# TODO: Integrate selenium for fully rendered pages def crawl(url, thread_id=0): global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS if not OVERRIDE_SIZE: From 6b9f1b8e6d89d61269f218ef392ba18bc5d9b019 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 14:15:32 -0400 Subject: [PATCH 17/25] This should work --- requirements.txt | 3 +- spidy/config/blank.cfg | 3 ++ spidy/config/default.cfg | 1 + spidy/config/docker.cfg | 1 + spidy/config/heavy.cfg | 1 + spidy/config/infinite.cfg | 1 + spidy/config/light.cfg | 1 + spidy/config/multithreaded.cfg | 1 + spidy/config/wsj.cfg | 1 + spidy/crawler.py | 64 +++++++++++++++++++++++++++------- 10 files changed, 64 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4b48956..a7ffbbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ requests lxml flake8 reppy -selenium \ No newline at end of file +selenium-wire +blinker==1.7.0 \ No newline at end of file diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index 8c933c1..1e8c5eb 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -51,6 +51,9 @@ HEADER = HEADERS['
'] # Or if you want to use custom headers: HEADER = {'
': '', '': ''} +# Select if you would like to have pages rendered with a headless browser (more thorough, but slower) +USE_BROWSER = + # Amount of errors allowed to happen before automatic shutdown. MAX_NEW_ERRORS = MAX_KNOWN_ERRORS = diff --git a/spidy/config/default.cfg b/spidy/config/default.cfg index 4faafed..6e89984 100644 --- a/spidy/config/default.cfg +++ b/spidy/config/default.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/docker.cfg b/spidy/config/docker.cfg index 8d90285..9a4b5d7 100644 --- a/spidy/config/docker.cfg +++ b/spidy/config/docker.cfg @@ -14,6 +14,7 @@ DONE_FILE = '/data/crawler_done.txt' WORD_FILE = '/data/crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/heavy.cfg b/spidy/config/heavy.cfg index 7641797..8f3be0d 100644 --- a/spidy/config/heavy.cfg +++ b/spidy/config/heavy.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = True MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/infinite.cfg b/spidy/config/infinite.cfg index 840c05f..71d616a 100644 --- a/spidy/config/infinite.cfg +++ b/spidy/config/infinite.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 250 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 1000000 MAX_KNOWN_ERRORS = 1000000 MAX_HTTP_ERRORS = 1000000 diff --git a/spidy/config/light.cfg b/spidy/config/light.cfg index 5d06f4d..991dfdf 100644 --- a/spidy/config/light.cfg +++ b/spidy/config/light.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 150 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/multithreaded.cfg b/spidy/config/multithreaded.cfg index 5d212e3..eec2eff 100644 --- a/spidy/config/multithreaded.cfg +++ b/spidy/config/multithreaded.cfg @@ -14,6 +14,7 @@ DONE_FILE = 'crawler_done.txt' WORD_FILE = 'crawler_words.txt' SAVE_COUNT = 100 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 diff --git a/spidy/config/wsj.cfg b/spidy/config/wsj.cfg index e412caf..3997015 100644 --- a/spidy/config/wsj.cfg +++ b/spidy/config/wsj.cfg @@ -21,6 +21,7 @@ DONE_FILE = 'wsj_done.txt' WORD_FILE = 'wsj_words.txt' SAVE_COUNT = 60 HEADER = HEADERS['spidy'] +USE_BROWSER = False MAX_NEW_ERRORS = 100 MAX_KNOWN_ERRORS = 100 MAX_HTTP_ERRORS = 100 diff --git a/spidy/crawler.py b/spidy/crawler.py index 7efbcbf..cd8d55e 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -17,8 +17,8 @@ from lxml import etree from lxml.html import iterlinks, resolve_base_href, make_links_absolute from reppy.robots import Robots -from selenium import webdriver -from selenium.webdriver.firefox.options import Options +from seleniumwire import webdriver +from types import SimpleNamespace try: from spidy import __version__ @@ -230,7 +230,7 @@ def _remember(self, url): write_log('INIT', 'Creating functions...') # TODO: Integrate selenium for fully rendered pages -def crawl(url, thread_id=0): +def crawl(url, browser, thread_id=0): global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS if not OVERRIDE_SIZE: 
try: @@ -242,7 +242,14 @@ def crawl(url, thread_id=0): raise SizeError # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. - page = requests.get(url, headers=HEADER) # Get page + r = requests.get(url, headers=HEADER) + + if (browser is None): + page = r # Get page + else: + browser.get(url) + page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + word_list = [] doctype = get_mime_type(page) if doctype.find('image') < 0 and doctype.find('video') < 0: @@ -279,13 +286,23 @@ def crawl_worker(thread_id, robots_index): # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME - global HEADER, WORKING_DIR, KILL_LIST + global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE - global RESPECT_ROBOTS, RESTRICT, DOMAIN - global WORDS, TODO, DONE, THREAD_RUNNING + global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE + global WORDS, TODO, DONE + global FOUND_URLS + + browser = None + if (USE_BROWSER): + browser_options = webdriver.FirefoxOptions() + browser_options.add_argument('--headless') + + browser = webdriver.Firefox(options=browser_options) + + browser.request_interceptor = interceptor while THREAD_RUNNING: # Check if there are more urls to crawl @@ -338,7 +355,7 @@ def crawl_worker(thread_id, robots_index): else: if check_link(url, robots_index): # If the link is invalid continue - links = crawl(url, thread_id) + links = crawl(url, browser, thread_id) for link in links: # Skip empty links if len(link) <= 0 or link == "/": @@ -830,6 +847,7 @@ def zip_saved_files(out_file_name, directory): # Initialize variables as empty that will be needed in the global scope HEADER = {} +USE_BROWSER = False SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0 MAX_NEW_MIMES = 0 RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE = False, False, '', [] @@ -853,13 +871,14 @@ def init(arg_file=None): # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME - global HEADER, PACKAGE_DIR, WORKING_DIR, KILL_LIST + global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE - global RESPECT_ROBOTS, RESTRICT, DOMAIN - global WORDS, TODO, DONE, THREAD_COUNT + global RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE + global WORDS, TODO, DONE + global FOUND_URLS # Getting Arguments @@ -1090,6 +1109,21 @@ def init(arg_file=None): except KeyError: handle_invalid_input('browser name.') + write_log('INIT', 'Should spidy use a headless browser? 
(y/n) (Default: No)', status='INPUT') + while True: + input_ = input() + if not bool(input_): + USE_BROWSER = True + break + elif input_ in yes: + USE_BROWSER = True + break + elif input_ in no: + USE_BROWSER = False + break + else: + handle_invalid_input() + write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT') input_ = input() if not bool(input_): @@ -1262,6 +1296,12 @@ def handle_keyboard_interrupt(): done_crawling(True) +# Used by the webdriver to add custom headers +def interceptor(request): + for key in HEADER: + request[key] = HEADER[key] + + def main(): """ The main function of spidy. @@ -1269,7 +1309,7 @@ def main(): # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME - global HEADER, WORKING_DIR, KILL_LIST + global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT From 476ccc0122e96f99ce0f34ef48a20fc592541045 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 15:00:10 -0400 Subject: [PATCH 18/25] Fix interceptor function --- spidy/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index cd8d55e..22f9fd4 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -1299,7 +1299,7 @@ def handle_keyboard_interrupt(): # Used by the webdriver to add custom headers def interceptor(request): for key in HEADER: - request[key] = HEADER[key] + request.headers[key] = HEADER[key] def main(): From b05e00667bcc28a9200f120d83fd362094aa8458 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 17:36:01 -0400 Subject: [PATCH 19/25] Bug fixes and testing --- spidy/crawler.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 22f9fd4..07edd14 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -18,6 +18,8 @@ from lxml.html import iterlinks, resolve_base_href, make_links_absolute from reppy.robots import Robots from seleniumwire import webdriver +from selenium.webdriver.common.alert import Alert +from selenium.common.exceptions import TimeoutException, UnexpectedAlertPresentException, WebDriverException from types import SimpleNamespace try: @@ -243,12 +245,27 @@ def crawl(url, browser, thread_id=0): # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. 
r = requests.get(url, headers=HEADER) + + print(f"attempting url: {url}") if (browser is None): page = r # Get page else: - browser.get(url) - page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + try: + browser.get(url) + page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + except TimeoutException: + KNOWN_ERROR_COUNT += 1 + return [] + except UnexpectedAlertPresentException: + browser.get(url) + alert = Alert(browser) + alert.accept() + page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) + KNOWN_ERROR_COUNT += 1 + except WebDriverException: + KNOWN_ERROR_COUNT += 1 + return [] word_list = [] doctype = get_mime_type(page) @@ -303,6 +320,9 @@ def crawl_worker(thread_id, robots_index): browser = webdriver.Firefox(options=browser_options) browser.request_interceptor = interceptor + browser.implicitly_wait(10) + browser.set_page_load_timeout(10) + webdriver.DesiredCapabilities.FIREFOX["unexpectedAlertBehaviour"] = "accept" while THREAD_RUNNING: # Check if there are more urls to crawl From cb4f856a0510175244a876f376415cb05d171c38 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 17:37:50 -0400 Subject: [PATCH 20/25] Fix requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a7ffbbd..de5777a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ requests lxml flake8 reppy +selenium selenium-wire blinker==1.7.0 \ No newline at end of file From e417d34621b5277742da0de588d0dc6183c0c83f Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 18:05:53 -0400 Subject: [PATCH 21/25] Update docs and fix comments --- README.md | 4 +- spidy/crawler.py | 1 - spidy/docs/DOCS.md | 123 +++++++++++++++++++++++---------------------- 3 files changed, 65 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 3d9ab37..42eb5b6 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Pretty simple! ![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg) ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
-![Lines of Code: 1553](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg)
-![Lines of Docs: 605](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg)
+![Lines of Code: 1810](https://img.shields.io/badge/lines%20of%20code-1810-brightgreen.svg)
+![Lines of Docs: 614](https://img.shields.io/badge/lines%20of%20docs-614-orange.svg)
[![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card)
[![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy)
[![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/)
diff --git a/spidy/crawler.py b/spidy/crawler.py
index 07edd14..d642596 100755
--- a/spidy/crawler.py
+++ b/spidy/crawler.py
@@ -231,7 +231,6 @@ def _remember(self, url):
 write_log('INIT', 'Creating functions...')

-# TODO: Integrate selenium for fully rendered pages
 def crawl(url, browser, thread_id=0):
     global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS
     if not OVERRIDE_SIZE:
diff --git a/spidy/docs/DOCS.md b/spidy/docs/DOCS.md
index 4e0c570..6fd46a6 100644
--- a/spidy/docs/DOCS.md
+++ b/spidy/docs/DOCS.md
@@ -99,17 +99,17 @@ Everything that follows is intended to be detailed information on each piece in
This section lists the custom classes in `crawler.py`.
Most are Errors or Exceptions that may be raised throughout the code. -## `HeaderError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L120)) +## `HeaderError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L126)) Raised when there is a problem deciphering HTTP headers returned from a website. -## `SizeError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L127)) +## `SizeError` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L133)) Raised when a file is too large to download in an acceptable time. # Functions This section lists the functions in `crawler.py` that are used throughout the code. -## `check_link` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L399)) +## `check_link` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L464)) Determines whether links should be crawled.
Types of links that will be pruned: @@ -118,34 +118,34 @@ Types of links that will be pruned: - Links that have already been crawled. - Links in [`KILL_LIST`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#kill_list--source). -## `check_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L433)) +## `check_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L510)) Checks whether a file path will cause errors when saving.
Paths longer than 256 characters cannot be saved (Windows). -## `check_word` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L421)) +## `check_word` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L498)) Checks whether a word is valid.
The word-saving feature was originally added to be used for password cracking with hashcat, which is why `check_word` checks for length of less than 16 characters.
The average password length is around 8 characters.
 
-## `crawl` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L190))
+## `crawl` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L253))
 Does all of the crawling, scraping, and saving of a single document.
 
-## `err_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L601))
+## `err_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L678))
 Saves the triggering error to the log file.
 
-## `get_mime_type` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L500))
+## `get_mime_type` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L577))
 Extracts the Content-Type header from the headers returned by a page.
 
-## `get_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L29))
+## `get_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L36))
 Returns the current time in the format `HH:MM:SS`.
 
-## `get_full_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L33))
+## `get_full_time` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L40))
 Returns the current time in the format `HH:MM:SS, Day, Mon, YYYY`.
 
-## `handle_keyboard_interrupt` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L1137))
+## `handle_keyboard_interrupt` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L1314))
 Shuts down the crawler when a `KeyboardInterrupt` is raised.
 
-## `info_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L561))
+## `info_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L638))
 Logs important information to the console and log file.
Example log: @@ -164,17 +164,17 @@ Example log: [23:17:06] [spidy] [LOG]: Saved done list to crawler_done.txt [23:17:06] [spidy] [LOG]: Saved 90 bad links to crawler_bad.txt -## `log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L578)) +## `log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L655)) Logs a single message to the error log file. Prints message verbatim, so message must be formatted correctly in the function call. -## `make_file_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L487)) +## `make_file_path` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L564)) Makes a valid Windows file path for a given url. -## `make_words` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L166)) +## `make_words` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L522)) Returns a list of all the valid words (determined using [`check_word`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#check_word--source)) on a given page. -## `mime_lookup` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L511)) +## `mime_lookup` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L588)) This finds the correct file extension for a MIME type using the [`MIME_TYPES`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#mime_types--source) dictionary.
If the MIME type is blank it defaults to `.html`, and if the MIME type is not in the dictionary a [`HeaderError`](https://github.com/rivermont/spidy/blob/master/spidy/docs/DOCS.md#headererror--source) is raised.
Usage:
 
     mime_lookup(value)
 
 Where `value` is the MIME type.
 
-## `save_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L459))
+## `save_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L536))
 Saves the TODO, DONE, word, and bad lists to their respective files.
The word and bad link lists use the same function to save space.
 
-## `save_page` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L527))
+## `save_page` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L604))
 Downloads the content of a URL and saves it to the `saved/` folder.
 
-## `update_file` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L546))
+## `update_file` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L623))
 TODO
 
-## `write_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L78))
+## `write_log` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L85))
 Writes a message to both the console and the log file.
NOTE: Automatically adds timestamp and `[spidy]` to message, and formats message for log appropriately. -## `zip_saved_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L610)) +## `zip_saved_files` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L687)) Zips the contents of `saved/` to a `.zip` file.
Each archive is unique, with names generated from the current time.
 
+## `interceptor` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L11320))
+Intercepts each request made by selenium and updates its headers to match the ones selected by the user (a minimal sketch follows the `COUNTER` entry below).
+
 # Global Variables
 This section lists the variables in [`crawler.py`](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py) that are used throughout the code.
 
-## `COUNTER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L774))
+## `COUNTER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L852))
 Incremented each time a link is crawled.
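
The `interceptor` entry above refers to the selenium-wire callback that the crawler assigns with `browser.request_interceptor = interceptor` (visible in the crawler diff earlier in this series). The snippet below is a minimal illustrative sketch of such a hook, not spidy's exact implementation; the `HEADER` dict is a stand-in for whichever header set the user selected.

```python
# Illustrative selenium-wire request interceptor; placeholder values, not spidy's code.
HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; examplebot)'}  # assumed user-selected headers

def interceptor(request):
    # selenium-wire invokes this for every outgoing request once it is
    # assigned to browser.request_interceptor. Header fields can repeat,
    # so delete the old value before writing the replacement.
    del request.headers['User-Agent']
    request.headers['User-Agent'] = HEADER['User-Agent']
```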
-## `CRAWLER_DIR` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L30)) +## `WORKING_DIR` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L48)) The directory that `crawler.py` is located in. -## `DOMAIN` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L794)) +## `DOMAIN` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L873)) The domain that crawling is restricted to if [`RESTRICT`](#restrict--source) is `True`. -## `DONE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L798)) +## `DONE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L877)) TODO -## `DONE_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L797)) +## `DONE_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L876)) TODO -## `ERR_LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L56)) +## `ERR_LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L63)) TODO -## `ERR_LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L57)) +## `ERR_LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L64)) TODO -## `HEADER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L791)) +## `HEADER` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L869)) TODO -## `HEADERS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L727)) +## `HEADERS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L805)) TODO -## `HTTP_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L777)) +## `HTTP_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L895)) TODO -## `KILL_LIST` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L762)) +## `KILL_LIST` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L894)) A list of pages that are known to cause problems with the crawler. - `bhphotovideo.com/c/search` @@ -243,33 +246,33 @@ A list of pages that are known to cause problems with the crawler. - `w3.org`: I have never been able to access W3, although it never says it's down. If someone knows of this problem, please let me know. - `web.archive.org/web/`: While there is some good content, there are sometimes thousands of copies of the same exact page. Not good for web crawling. -## `KNOWN_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L776)) +## `KNOWN_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L895)) TODO -## `LOG_END` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L504)) +## `LOG_END` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L)) Line to print at the end of each `logFile` log -## `LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L51)) +## `LOG_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L893)) The file that the command line logs are written to.
Kept open until the crawler stops for whatever reason so that it can be written to. -## `LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L53)) +## `LOG_FILE_NAME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L893)) The actual file name of [`LOG_FILE`](#log_file--source).
Used in [`info_log`](#info_log--source). -## `MAX_HTTP_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `MAX_HTTP_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MAX_KNOWN_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `MAX_KNOWN_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MAX_NEW_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `MAX_NEW_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MAX_NEW_MIMES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L793)) +## `MAX_NEW_MIMES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO -## `MIME_TYPES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L628)) +## `MIME_TYPES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L705)) A dictionary of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) encountered by the crawler.
While there are [thousands of other types](https://www.iana.org/assignments/media-types/media-types.xhtml) that are not listed, to list them all would be impractical: - The size of the list would be huge, using memory, space, etc. @@ -288,63 +291,63 @@ Where `value` is the MIME type.
This will return the extension associated with the MIME type if it exists, however this will throw an [`IndexError`](https://docs.python.org/2/library/exceptions.html#exceptions.IndexError) if the MIME type is not in the dictionary.
Because of this, it is recommended to use the [`mime_lookup`](#mime_lookup--source) function. -## `NEW_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L775)) +## `NEW_ERROR_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L853)) TODO -## `NEW_MIME_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L778)) +## `NEW_MIME_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L856)) TODO -## `OVERRIDE_SIZE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `OVERRIDE_SIZE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `OVERWRITE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `OVERWRITE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `RAISE_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `RAISE_ERRORS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `RESTRICT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L794)) +## `RESTRICT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L873)) Whether to restrict crawling to [`DOMAIN`](#domain--source) or not. -## `SAVE_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L792)) +## `SAVE_COUNT` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L871)) TODO -## `SAVE_PAGES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L796)) +## `SAVE_PAGES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L875)) TODO -## `SAVE_WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L796)) +## `SAVE_WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L875)) TODO -## `START` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L771)) +## `START` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L849)) Links to start crawling if the TODO list is empty -## `START_TIME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L37)) +## `START_TIME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L44)) The time that `crawler.py` was started, in seconds from the epoch.
More information can be found on the page for the Python [time](https://docs.python.org/3/library/time.html) library. -## `START_TIME_LONG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L38)) +## `START_TIME_LONG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L45)) The time that `crawler.py` was started, in the format `HH:MM:SS, Date Month Year`.
Used in `info_log`. -## `TODO` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L798)) +## `TODO` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L877)) The list containing all links that are yet to be crawled. -## `TODO_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L797)) +## `TODO_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L876)) TODO -## `USE_CONFIG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `USE_CONFIG` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO -## `VERSION` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L24)) +## `VERSION` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L31)) The current version of the crawler. -## `WORD_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L797)) +## `WORD_FILE` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L876)) TODO -## `WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L782)) +## `WORDS` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L860)) TODO -## `ZIP_FILES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L795)) +## `ZIP_FILES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L874)) TODO *** From 14566941b68da6be02130888bedbe0335e5f83cb Mon Sep 17 00:00:00 2001 From: lkotlus Date: Fri, 2 Aug 2024 18:13:51 -0400 Subject: [PATCH 22/25] Contributors --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 42eb5b6..4f396d2 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Pretty simple! ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
![Lines of Code: 1810](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) -![Lines of Docs: 614](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) +![Lines of Docs: 616](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) [![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card) [![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy) [![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/) @@ -101,6 +101,7 @@ Here are some features we figure are worth noting. - Cross-Platform compatibility: spidy will work on all three major operating systems, Windows, Mac OS/X, and Linux! - Frequent Timestamp Logging: Spidy logs almost every action it takes to both the console and one of two log files. - Browser Spoofing: Make requests using User Agents from 4 popular web browsers, use a custom spidy bot one, or create your own! + - Headless Browser Support: Render full webpages to get dynamic content. - Portability: Move spidy's folder and its contents somewhere else and it will run right where it left off. *Note*: This only works if you run it from source code. - User-Friendly Logs: Both the console and log file messages are simple and easy to interpret, but packed with information. - Webpage saving: Spidy downloads each page that it runs into, regardless of file type. The crawler uses the HTTP `Content-Type` header returned with most files to determine the file type. @@ -225,6 +226,7 @@ See the [`CONTRIBUTING.md`](https://github.com/rivermont/spidy/blob/master/spidy * [quatroka](https://github.com/quatroka) - Fixed testing bugs. * [stevelle](https://github.com/stevelle) - Respect robots.txt. * [thatguywiththatname](https://github.com/thatguywiththatname) - README link corrections. +* [lkotlus](https://github.com/lkotlus) - Optimizations, out of scope items, and headless browser support. # License We used the [Gnu General Public License](https://www.gnu.org/licenses/gpl-3.0.en.html) (see [`LICENSE`](https://github.com/rivermont/spidy/blob/master/LICENSE)) as it was the license that best suited our needs.
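
Taken together, the selenium changes in this series amount to a small fetch helper: build a headless Firefox with ten-second limits, try to render the page, count alerts and timeouts as "known" errors, and hand the rest of the crawler an object that looks like a `requests` response. The sketch below is a simplified illustration of that flow, not spidy's exact code; `make_browser`, `fetch_rendered`, and the plain `KNOWN_ERRORS` integer are placeholder names, and it assumes Firefox plus geckodriver are installed.

```python
# Simplified sketch of the rendered-page fetch path; placeholder names throughout.
from types import SimpleNamespace

import requests
from selenium import webdriver
from selenium.common.exceptions import (TimeoutException,
                                        UnexpectedAlertPresentException,
                                        WebDriverException)
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.firefox.options import Options

HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; examplebot)'}  # assumed header set
KNOWN_ERRORS = 0  # stand-in for spidy's thread-safe error counter


def make_browser():
    # Headless Firefox with the same ten-second limits the patches configure.
    options = Options()
    options.add_argument('--headless')
    browser = webdriver.Firefox(options=options)
    browser.implicitly_wait(10)
    browser.set_page_load_timeout(10)
    return browser


def fetch_rendered(url, browser):
    # Returns an object with .text/.content/.headers like a requests response,
    # or None when the page could not be rendered.
    global KNOWN_ERRORS
    r = requests.get(url, headers=HEADER)  # real HTTP response headers come from requests
    try:
        browser.get(url)
    except UnexpectedAlertPresentException:
        Alert(browser).accept()  # dismiss the alert and keep whatever rendered
        KNOWN_ERRORS += 1
    except (TimeoutException, WebDriverException):
        KNOWN_ERRORS += 1
        return None
    return SimpleNamespace(text=browser.page_source,
                           content=browser.page_source.encode('utf-8'),
                           headers=r.headers)
```

A worker would call `make_browser()` once, reuse the browser for every `fetch_rendered(url, browser)` call, and call `browser.quit()` on shutdown.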

From 62b4668c50dfdd449d90f35aeadb7cb64e22bc30 Mon Sep 17 00:00:00 2001
From: lkotlus 
Date: Mon, 5 Aug 2024 11:55:44 -0400
Subject: [PATCH 23/25] Remove unnecessary print

---
 README.md        | 2 +-
 spidy/crawler.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 4f396d2..9060ffd 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Pretty simple!
 ![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg)
 ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
-![Lines of Code: 1810](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) +![Lines of Code: 1808](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) ![Lines of Docs: 616](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) [![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card) [![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy) diff --git a/spidy/crawler.py b/spidy/crawler.py index d642596..421fa4c 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -244,8 +244,6 @@ def crawl(url, browser, thread_id=0): # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. r = requests.get(url, headers=HEADER) - - print(f"attempting url: {url}") if (browser is None): page = r # Get page From 1547563d0c65e2bc6d67d65d595f338aea803c63 Mon Sep 17 00:00:00 2001 From: lkotlus Date: Mon, 5 Aug 2024 11:58:31 -0400 Subject: [PATCH 24/25] KNOWN_ERROR_COUNT referenced before assignment fixed. --- spidy/crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spidy/crawler.py b/spidy/crawler.py index 421fa4c..532605f 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -232,7 +232,7 @@ def _remember(self, url): write_log('INIT', 'Creating functions...') def crawl(url, browser, thread_id=0): - global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS + global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS, KNOWN_ERROR_COUNT if not OVERRIDE_SIZE: try: # Attempt to get the size in bytes of the document @@ -252,16 +252,16 @@ def crawl(url, browser, thread_id=0): browser.get(url) page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) except TimeoutException: - KNOWN_ERROR_COUNT += 1 + KNOWN_ERROR_COUNT.increment() return [] except UnexpectedAlertPresentException: browser.get(url) alert = Alert(browser) alert.accept() page = SimpleNamespace(text=browser.page_source, content=browser.page_source.encode('utf-8'), headers=r.headers) - KNOWN_ERROR_COUNT += 1 + KNOWN_ERROR_COUNT.increment() except WebDriverException: - KNOWN_ERROR_COUNT += 1 + KNOWN_ERROR_COUNT.increment() return [] word_list = [] From b37fd416eab0eadc58b95a48fc3f8a2980ce8d6c Mon Sep 17 00:00:00 2001 From: lkotlus Date: Thu, 8 Aug 2024 10:25:54 -0400 Subject: [PATCH 25/25] Add maximum time --- README.md | 4 ++-- spidy/config/blank.cfg | 3 +++ spidy/config/default.cfg | 1 + spidy/config/docker.cfg | 1 + spidy/config/heavy.cfg | 1 + spidy/config/infinite.cfg | 1 + spidy/config/light.cfg | 1 + spidy/config/multithreaded.cfg | 1 + spidy/config/wsj.cfg | 1 + spidy/crawler.py | 8 ++++++-- spidy/docs/DOCS.md | 3 +++ 11 files changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9060ffd..f48aa96 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Pretty simple! ![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg) ![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
-![Lines of Code: 1808](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) -![Lines of Docs: 616](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) +![Lines of Code: 1811](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg) +![Lines of Docs: 619](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg) [![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card) [![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy) [![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/) diff --git a/spidy/config/blank.cfg b/spidy/config/blank.cfg index 1e8c5eb..ed55fa9 100644 --- a/spidy/config/blank.cfg +++ b/spidy/config/blank.cfg @@ -60,5 +60,8 @@ MAX_KNOWN_ERRORS = MAX_HTTP_ERRORS = MAX_NEW_MIMES = +# Amount of time (in seconds) the crawl is allowed to run for (set to float('inf') if you want it to run forever) +MAX_TIME = + # Pages to start crawling on in case TODO is empty at start. START = ['', ''] diff --git a/spidy/config/default.cfg b/spidy/config/default.cfg index 6e89984..fa2afc0 100644 --- a/spidy/config/default.cfg +++ b/spidy/config/default.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/docker.cfg b/spidy/config/docker.cfg index 9a4b5d7..3a546ca 100644 --- a/spidy/config/docker.cfg +++ b/spidy/config/docker.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] diff --git a/spidy/config/heavy.cfg b/spidy/config/heavy.cfg index 8f3be0d..4e2f0ea 100644 --- a/spidy/config/heavy.cfg +++ b/spidy/config/heavy.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/infinite.cfg b/spidy/config/infinite.cfg index 71d616a..1c41881 100644 --- a/spidy/config/infinite.cfg +++ b/spidy/config/infinite.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 1000000 MAX_KNOWN_ERRORS = 1000000 MAX_HTTP_ERRORS = 1000000 MAX_NEW_MIMES = 1000000 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/light.cfg b/spidy/config/light.cfg index 991dfdf..7a11da4 100644 --- a/spidy/config/light.cfg +++ b/spidy/config/light.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = 600 START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/multithreaded.cfg b/spidy/config/multithreaded.cfg index eec2eff..17daafa 100644 --- a/spidy/config/multithreaded.cfg +++ b/spidy/config/multithreaded.cfg @@ -19,4 +19,5 @@ MAX_NEW_ERRORS = 5 MAX_KNOWN_ERRORS = 10 MAX_HTTP_ERRORS = 20 MAX_NEW_MIMES = 10 +MAX_TIME = float('inf') START = ['https://en.wikipedia.org/wiki/Main_Page'] \ No newline at end of file diff --git a/spidy/config/wsj.cfg b/spidy/config/wsj.cfg index 3997015..d03ad06 100644 --- a/spidy/config/wsj.cfg +++ b/spidy/config/wsj.cfg @@ -26,4 +26,5 @@ MAX_NEW_ERRORS = 100 MAX_KNOWN_ERRORS = 100 MAX_HTTP_ERRORS = 100 MAX_NEW_MIMES = 
5 +MAX_TIME = float('inf') START = ['https://www.wsj.com/'] diff --git a/spidy/crawler.py b/spidy/crawler.py index 532605f..9075848 100755 --- a/spidy/crawler.py +++ b/spidy/crawler.py @@ -298,7 +298,7 @@ def crawl_worker(thread_id, robots_index): """ # Declare global variables - global VERSION, START_TIME, START_TIME_LONG + global VERSION, START_TIME, START_TIME_LONG, MAX_TIME global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME global HEADER, USE_BROWSER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT @@ -348,6 +348,10 @@ def crawl_worker(thread_id, robots_index): write_log('CRAWL', 'Too many errors have accumulated; stopping crawler.') done_crawling() break + elif time.time() - START_TIME >= MAX_TIME: # If too much time has passed + write_log('CRAWL', 'Maximum time has been exceeded.') + done_crawling() + break elif COUNTER.val >= SAVE_COUNT: # If it's time for an autosave # Make sure only one thread saves files with save_mutex: @@ -865,7 +869,7 @@ def zip_saved_files(out_file_name, directory): # Initialize variables as empty that will be needed in the global scope HEADER = {} USE_BROWSER = False -SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0 +SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_TIME = 0, 0, 0, 0, float('inf') MAX_NEW_MIMES = 0 RESPECT_ROBOTS, RESTRICT, DOMAIN, OUT_OF_SCOPE = False, False, '', [] USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE = False, False, False, False, False diff --git a/spidy/docs/DOCS.md b/spidy/docs/DOCS.md index 6fd46a6..43c9939 100644 --- a/spidy/docs/DOCS.md +++ b/spidy/docs/DOCS.md @@ -272,6 +272,9 @@ TODO ## `MAX_NEW_MIMES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) TODO +## `MAX_TIME` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L896)) +Maximum amount of time (in seconds) that a crawl will go for. Defaults to float('inf'), allowing it to run forever. + ## `MIME_TYPES` - ([Source](https://github.com/rivermont/spidy/blob/master/spidy/crawler.py#L705)) A dictionary of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) encountered by the crawler.
While there are [thousands of other types](https://www.iana.org/assignments/media-types/media-types.xhtml) that are not listed, to list them all would be impractical: