From c53be05d4c08d11cf988df698d40741278ecc745 Mon Sep 17 00:00:00 2001
From: Eneias Silva
Date: Tue, 15 Nov 2022 09:20:41 -0300
Subject: [PATCH] improvements:

* multi-thread processing
* command-line parameters for the script
* documentation in the -h param
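
Example usage (illustrative only, not part of the diff; it assumes the
praw client id/secret that get_client_info() reads are already
configured, and the subreddit names below are placeholders):

    # fetch the 20 hottest image posts from two communities, in parallel
    python Reddit_image_scraper.py -r wallpapers earthporn -l 20

    # interactive mode: prompts for a subreddit and a limit
    python Reddit_image_scraper.py -i

    # no arguments: reads one community name per line from subreddit.txt
    python Reddit_image_scraper.py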
---
 Reddit_image_scraper.py | 156 ++++++++++++++++++++++++++++++++--------
 subreddit.txt           |   0
 2 files changed, 127 insertions(+), 29 deletions(-)
 create mode 100644 subreddit.txt

diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py
index e04b51b..aea3113 100644
--- a/Reddit_image_scraper.py
+++ b/Reddit_image_scraper.py
@@ -1,11 +1,22 @@
+import os
+import sys
 import praw
 import configparser
 import urllib.request
+import argparse
+from argparse import RawTextHelpFormatter
 from prawcore.exceptions import Redirect
 from prawcore.exceptions import ResponseException
 from urllib.error import HTTPError
+from threading import Thread
+
+
+# Globals
+dw_results = {}
+debug = False
+
 
 class ClientInfo:
     id = ''
@@ -22,21 +33,23 @@ def get_client_info():
     return id, secret
 
 
-def save_list(img_url_list):
+def save_list(img_url_list, subreddit):
+    link_file = 'result/{}.txt'.format(subreddit)
     for img_url in img_url_list:
-        file = open('img_links.txt', 'a')
+        file = open(link_file, 'a')
         file.write('{} \n'.format(img_url))
         file.close()
+    return link_file
+
 
 
-def delete_img_list():
-    f = open('img_links.txt', 'r+')
-    f.truncate()
+def delete_img_list(link_file_name):
+    os.remove(link_file_name)
 
 
 def is_img_link(img_link):
-    ext = img_link[-4:]
-    if ext == '.jpg' or ext == '.png':
+    ext = img_link[-4:].lower()
+    if ext == '.jpg' or ext == '.png' or ext == '.gif':
         return True
     else:
         return False
@@ -62,64 +75,149 @@ def get_img_urls(sub, li):
         return 0
 
 
-def download_img(img_url, img_title, filename):
+def download_img(subreddit, img_url, img_title, filename):
     opener = urllib.request.build_opener()
     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
     urllib.request.install_opener(opener)
 
     try:
-        print('Downloading ' + img_title + '....')
+        if debug:
+            print('[' + subreddit + '] Downloading ' + filename + '....')
         urllib.request.urlretrieve(img_url, filename)
+        dw_results[subreddit].append(1)
         return 1
     except HTTPError:
         print("Too many Requests.\nTry again later!")
         return 0
+    except Exception as e:
+        print(str(e))
+        return 0
 
 
-def read_img_links():
-    with open('img_links.txt') as f:
+def read_img_links(subreddit, link_file):
+    with open(link_file) as f:
         links = f.readlines()
 
     links = [x.strip() for x in links]
 
     download_count = 0
+    dw_threads = []
     for link in links:
         if not is_img_link(link):
             continue
 
         file_name = link.split('/')[-1]
-        file_loc = 'result/{}'.format(file_name)
         if not file_name:
             continue
 
-        download_status = download_img(link, file_name, file_loc)
-        download_count += 1
+        file_path = 'result/{}'.format(subreddit)
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        file_loc = '{}/{}'.format(file_path, file_name)
 
-        if download_status == 0:
-            return download_count, 0
+        t = Thread(target=download_img, args=(subreddit, link, file_name, file_loc))
+        dw_threads.append(t)
+        t.start()
+        download_count += 1
+
+    # wait for the threads to complete
+    for thread in dw_threads:
+        thread.join()
 
     return download_count, 1
 
-
-if __name__ == '__main__':
-
-    ClientInfo.id, ClientInfo.secret = get_client_info()
-
-    subreddit = input('Enter Subreddit: ')
-    num = int(input('Enter Limit: '))
-    print()
 
+def process_subreddit(subreddit, num=10):
+    print("Processing: {}".format(subreddit))
     url_list = get_img_urls(subreddit, num)
     file_no = 1
 
     if url_list:
-        save_list(url_list)
-        count, status = read_img_links()
+        link_file = save_list(url_list, subreddit)
+        count, status = read_img_links(subreddit, link_file)
+
+        count = len(dw_results[subreddit])
 
         if status == 1:
-            print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count))
+            print('\n[{}] Download Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(subreddit, count, num - count))
         elif status == 0:
-            print('\nDownload Incomplete\n{} - Images Downloaded'.format(count))
+            print('\n[{}] Download Incomplete\n{} - Images Downloaded'.format(subreddit, count))
+
+        delete_img_list(link_file)
 
-        delete_img_list()
+
+def read_subreddit_from_file_list(limit):
+    subreddit_file = "subreddit.txt"
+    if os.path.exists(subreddit_file):
+        with open(subreddit_file, 'r', encoding="utf8") as reader:
+            multithread_processing(reader.readlines(), limit)
+
+
+def multithread_processing(r_list, limit):
+    threads = []
+    for subreddit in r_list:
+        subreddit = subreddit.strip()
+        if not subreddit:
+            continue
+
+        dw_results[subreddit] = []
+        t = Thread(target=process_subreddit, args=(subreddit, limit))
+        threads.append(t)
+        t.start()
+
+    # wait for the threads to complete
+    for thread in threads:
+        thread.join()
+
+
+if __name__ == '__main__':
+
+    ClientInfo.id, ClientInfo.secret = get_client_info()
+
+    about = '''
+Download images from Reddit communities.
+There are 3 ways to use this script:
+
+    1. You can create a file called 'subreddit.txt' in the root dir and save your community names, one per line, to download their images.
+    2. You can pass the community names with the -r param,
+       e.g. -r XXX YYY, where XXX and YYY are community names.
+    3. You can use the interactive mode with the -i param.
+       We'll ask for the subreddit and the hot limit.
+    '''
+
+    parser = argparse.ArgumentParser(prog=sys.argv[0], description=about, formatter_class=RawTextHelpFormatter)
+    parser.add_argument('-r', '--subreddit', action='store', nargs='*', dest='subreddit', default=[], help='subreddit community name without "r/"')
+    parser.add_argument('-l', '--limit', action='store', type=int, dest='limit', default=10, help='limit value for the hot() method - number of posts to fetch')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, help='print debug messages')
+    parser.add_argument('-i', '--interactive', action='store_true', default=False, help='interactive mode: asks for subreddit and limit')
+
+    params = parser.parse_args()
+    limit = params.limit
+    debug = params.debug
+
+    print("#######################################################")
+    print("#                Reddit_Image_Scraper                 #")
+    print("#-----------------------------------------------------#")
+    print("#               improved by: eneias.com               #")
+    print("#######################################################\n")
+    print(" >>> subreddit: " + ", ".join(params.subreddit))
+    print(" >>> limit: " + str(limit))
+    print(" >>> interactive: " + str(params.interactive))
+    print(" >>> debug: " + str(debug))
+    print()
+
+    if params.interactive:
+        subreddit = input('Enter Subreddit: ')
+        num = int(input('Enter Limit: '))
+        print()
+        dw_results[subreddit] = []
+        process_subreddit(subreddit, num)
+        sys.exit()
+
+    if len(params.subreddit) == 0:
+        read_subreddit_from_file_list(limit)
+    else:
+        multithread_processing(params.subreddit, limit)
+
+    print(" >>> done. ")
\ No newline at end of file
diff --git a/subreddit.txt b/subreddit.txt
new file mode 100644
index 0000000..e69de29