From c53be05d4c08d11cf988df698d40741278ecc745 Mon Sep 17 00:00:00 2001
From: Eneias Silva
Date: Tue, 15 Nov 2022 09:20:41 -0300
Subject: [PATCH] improvements:

* multi-thread processing
* command-line parameters for the script
* documentation in the -h param
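
Example usage (illustrative only, not part of the diff; it assumes the
praw client id/secret that get_client_info() reads are already
configured, and the subreddit names below are placeholders):

    # fetch the 20 hottest image posts from two communities, in parallel
    python Reddit_image_scraper.py -r wallpapers earthporn -l 20

    # interactive mode: prompts for a subreddit and a limit
    python Reddit_image_scraper.py -i

    # no arguments: reads one community name per line from subreddit.txt
    python Reddit_image_scraper.py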
---
 Reddit_image_scraper.py | 156 ++++++++++++++++++++++++++++++++--------
 subreddit.txt           |   0
 2 files changed, 127 insertions(+), 29 deletions(-)
 create mode 100644 subreddit.txt

diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py
index e04b51b..aea3113 100644
--- a/Reddit_image_scraper.py
+++ b/Reddit_image_scraper.py
@@ -1,11 +1,22 @@
+import os
+import sys
 import praw
 import configparser
 import urllib.request
+import argparse
+from argparse import RawTextHelpFormatter
 from prawcore.exceptions import Redirect
 from prawcore.exceptions import ResponseException
 from urllib.error import HTTPError
+from threading import Thread
+
+
+# Globals
+dw_results = {}
+debug = False
+
 
 class ClientInfo:
     id = ''
@@ -22,21 +33,23 @@ def get_client_info():
     return id, secret
 
 
-def save_list(img_url_list):
+def save_list(img_url_list, subreddit):
+    link_file = 'result/{}.txt'.format(subreddit)
     for img_url in img_url_list:
-        file = open('img_links.txt', 'a')
+        file = open(link_file, 'a')
         file.write('{} \n'.format(img_url))
         file.close()
+    return link_file
+
 
 
-def delete_img_list():
-    f = open('img_links.txt', 'r+')
-    f.truncate()
+def delete_img_list(link_file_name):
+    os.remove(link_file_name)
 
 
 def is_img_link(img_link):
-    ext = img_link[-4:]
-    if ext == '.jpg' or ext == '.png':
+    ext = img_link[-4:].lower()
+    if ext == '.jpg' or ext == '.png' or ext == '.gif':
         return True
     else:
         return False
@@ -62,64 +75,149 @@ def get_img_urls(sub, li):
         return 0
 
 
-def download_img(img_url, img_title, filename):
+def download_img(subreddit, img_url, img_title, filename):
     opener = urllib.request.build_opener()
     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
     urllib.request.install_opener(opener)
 
     try:
-        print('Downloading ' + img_title + '....')
+        if debug:
+            print('[' + subreddit + '] Downloading ' + filename + '....')
         urllib.request.urlretrieve(img_url, filename)
+        dw_results[subreddit].append(1)
         return 1
     except HTTPError:
         print("Too many Requests.\nTry again later!")
         return 0
+    except Exception as e:
+        print(str(e))
+        return 0
 
 
-def read_img_links():
-    with open('img_links.txt') as f:
+def read_img_links(subreddit, link_file):
+    with open(link_file) as f:
         links = f.readlines()
 
     links = [x.strip() for x in links]
 
     download_count = 0
+    dw_threads = []
     for link in links:
         if not is_img_link(link):
             continue
 
         file_name = link.split('/')[-1]
-        file_loc = 'result/{}'.format(file_name)
         if not file_name:
             continue
 
-        download_status = download_img(link, file_name, file_loc)
-        download_count += 1
+        file_path = 'result/{}'.format(subreddit)
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        file_loc = '{}/{}'.format(file_path, file_name)
 
-        if download_status == 0:
-            return download_count, 0
+        t = Thread(target=download_img, args=(subreddit, link, file_name, file_loc))
+        dw_threads.append(t)
+        t.start()
+        download_count += 1
+
+    # wait for the threads to complete
+    for thread in dw_threads:
+        thread.join()
 
     return download_count, 1
 
-
-if __name__ == '__main__':
-
-    ClientInfo.id, ClientInfo.secret = get_client_info()
-
-    subreddit = input('Enter Subreddit: ')
-    num = int(input('Enter Limit: '))
-    print()
 
+def process_subreddit(subreddit, num=10):
+    print("Processing: {}".format(subreddit))
     url_list = get_img_urls(subreddit, num)
     file_no = 1
 
     if url_list:
-        save_list(url_list)
-        count, status = read_img_links()
+        link_file = save_list(url_list, subreddit)
+        count, status = read_img_links(subreddit, link_file)
+
+        count = len(dw_results[subreddit])
 
         if status == 1:
-            print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count))
+            print('\n[{}] Download Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(subreddit, count, num - count))
         elif status == 0:
-            print('\nDownload Incomplete\n{} - Images Downloaded'.format(count))
+            print('\n[{}] Download Incomplete\n{} - Images Downloaded'.format(subreddit, count))
+
+        delete_img_list(link_file)
 
-        delete_img_list()
+
+def read_subreddit_from_file_list(limit):
+    subreddit_file = "subreddit.txt"
+    if os.path.exists(subreddit_file):
+        with open(subreddit_file, 'r', encoding="utf8") as reader:
+            multithread_processing(reader.readlines(), limit)
+
+
+def multithread_processing(r_list, limit):
+    threads = []
+    for subreddit in r_list:
+        subreddit = subreddit.strip()
+        if not subreddit:
+            continue
+
+        dw_results[subreddit] = []
+        t = Thread(target=process_subreddit, args=(subreddit, limit))
+        threads.append(t)
+        t.start()
+
+    # wait for the threads to complete
+    for thread in threads:
+        thread.join()
+
+
+if __name__ == '__main__':
+
+    ClientInfo.id, ClientInfo.secret = get_client_info()
+
+    about = '''
+Download images from Reddit communities.
+There are 3 ways to use this script:
+
+    1. You can create a file called 'subreddit.txt' in the root dir and save your community names, one per line, to download their images.
+    2. You can pass the community names with the -r param,
+       e.g. -r XXX YYY, where XXX and YYY are community names.
+    3. You can use the interactive mode with the -i param.
+       We'll ask for the subreddit and the hot limit.
+    '''
+
+    parser = argparse.ArgumentParser(prog=sys.argv[0], description=about, formatter_class=RawTextHelpFormatter)
+    parser.add_argument('-r', '--subreddit', action='store', nargs='*', dest='subreddit', default=[], help='subreddit community name without "r/"')
+    parser.add_argument('-l', '--limit', action='store', type=int, dest='limit', default=10, help='limit value for the hot() method - number of posts to fetch')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, help='print debug messages')
+    parser.add_argument('-i', '--interactive', action='store_true', default=False, help='interactive mode: asks for subreddit and limit')
+
+    params = parser.parse_args()
+    limit = params.limit
+    debug = params.debug
+
+    print("#######################################################")
+    print("#                Reddit_Image_Scraper                 #")
+    print("#-----------------------------------------------------#")
+    print("#               improved by: eneias.com               #")
+    print("#######################################################\n")
+    print(" >>> subreddit: " + ", ".join(params.subreddit))
+    print(" >>> limit: " + str(limit))
+    print(" >>> interactive: " + str(params.interactive))
+    print(" >>> debug: " + str(debug))
+    print()
+
+    if params.interactive:
+        subreddit = input('Enter Subreddit: ')
+        num = int(input('Enter Limit: '))
+        print()
+        dw_results[subreddit] = []
+        process_subreddit(subreddit, num)
+        sys.exit()
+
+    if len(params.subreddit) == 0:
+        read_subreddit_from_file_list(limit)
+    else:
+        multithread_processing(params.subreddit, limit)
+
+    print(" >>> done. ")
\ No newline at end of file
diff --git a/subreddit.txt b/subreddit.txt
new file mode 100644
index 0000000..e69de29