From ee279413e7af043fa36881dfaedff92cece33809 Mon Sep 17 00:00:00 2001 From: Gaurav Bhandarkar <62330601+DracoCoder@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:05:26 +0530 Subject: [PATCH 1/2] Fixed Limit Parameter, Improved Extention Checking Logic, Added Gif Support --- Reddit_image_scraper.py | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index e04b51b..16577e5 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -1,3 +1,4 @@ +import os import praw import configparser import urllib.request @@ -22,8 +23,13 @@ def get_client_info(): return id, secret +def is_img_link(img_link): + return img_link.lower().endswith("jpg") or img_link.lower().endswith("png") or img_link.lower().endswith("gif") + def save_list(img_url_list): for img_url in img_url_list: + if not is_img_link(img_url): + continue file = open('img_links.txt', 'a') file.write('{} \n'.format(img_url)) file.close() @@ -34,18 +40,10 @@ def delete_img_list(): f.truncate() -def is_img_link(img_link): - ext = img_link[-4:] - if ext == '.jpg' or ext == '.png': - return True - else: - return False - - def get_img_urls(sub, li): try: r = praw.Reddit(client_id=ClientInfo.id, client_secret=ClientInfo.secret, user_agent=ClientInfo.user_agent) - submissions = r.subreddit(sub).hot(limit=li) + submissions = r.subreddit(sub).hot(limit=li*5) return [submission.url for submission in submissions] @@ -74,9 +72,13 @@ def download_img(img_url, img_title, filename): except HTTPError: print("Too many Requests. 
Try again later!") return 0 + + except OSError: + print(OSError) + return 0 - -def read_img_links(): +def read_img_links(sub, limit, tolerance=3): + failed = 0 with open('img_links.txt') as f: links = f.readlines() @@ -88,7 +90,11 @@ def read_img_links(): continue file_name = link.split('/')[-1] - file_loc = 'result/{}'.format(file_name) + file_loc = 'result/{}/{}'.format(sub, file_name) + + directory = os.path.dirname('result/{}/'.format(sub)) + if not os.path.exists(directory): + os.makedirs(directory) if not file_name: continue @@ -96,8 +102,15 @@ def read_img_links(): download_status = download_img(link, file_name, file_loc) download_count += 1 + if(download_count == limit): + return download_count, 1 + if download_status == 0: - return download_count, 0 + failed+=1 + if(failed==tolerance): + return download_count, 0 + + continue return download_count, 1 @@ -108,14 +121,12 @@ def read_img_links(): subreddit = input('Enter Subreddit: ') num = int(input('Enter Limit: ')) - print() url_list = get_img_urls(subreddit, num) - file_no = 1 if url_list: save_list(url_list) - count, status = read_img_links() + count, status = read_img_links(subreddit, num) if status == 1: print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count)) From a6c65ef806cf8f9614e6e1501363535099a05fd8 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 16 Apr 2024 07:01:25 +0000 Subject: [PATCH 2/2] Updates.. 
--- .gitignore | 3 ++- Reddit_image_scraper.py | 51 ++++++++++++++++++++++++++--------------- config.ini | 4 ++-- result/.gitignore | 2 -- 4 files changed, 37 insertions(+), 23 deletions(-) delete mode 100644 result/.gitignore diff --git a/.gitignore b/.gitignore index a09c56d..170e352 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -/.idea +config.ini +result \ No newline at end of file diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index 16577e5..4ab8e52 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -2,6 +2,8 @@ import praw import configparser import urllib.request +from tqdm import tqdm +from time import sleep from prawcore.exceptions import Redirect from prawcore.exceptions import ResponseException @@ -43,7 +45,10 @@ def delete_img_list(): def get_img_urls(sub, li): try: r = praw.Reddit(client_id=ClientInfo.id, client_secret=ClientInfo.secret, user_agent=ClientInfo.user_agent) - submissions = r.subreddit(sub).hot(limit=li*5) + if hot: + submissions = r.subreddit(sub).hot(limit=li*5) + else: + submissions = r.subreddit(sub).top(time_filter="all", limit=li*5) return [submission.url for submission in submissions] @@ -59,23 +64,31 @@ def get_img_urls(sub, li): print("Client info is wrong. Check again.") return 0 + except Exception as e: + print("Unexpected Error:", e) + return 0 + def download_img(img_url, img_title, filename): opener = urllib.request.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] urllib.request.install_opener(opener) try: - print('Downloading ' + img_title + '....') + # print('Downloading ' + img_title + '....') urllib.request.urlretrieve(img_url, filename) return 1 - except HTTPError: - print("Too many Requests. Try again later!") + except HTTPError as e: + print("Too many Requests. 
Try again later!, ", e) + return 0 + + except OSError as e: + print("OSError:", e) + return 0 + + except Exception as e: + print("Unexpected Error:", e) return 0 - - except OSError: - print(OSError) - return 0 def read_img_links(sub, limit, tolerance=3): failed = 0 @@ -85,10 +98,13 @@ def read_img_links(sub, limit, tolerance=3): links = [x.strip() for x in links] download_count = 0 - for link in links: + for link in tqdm(links, total=limit): if not is_img_link(link): continue + if(download_count == limit): + return download_count, 1 + file_name = link.split('/')[-1] file_loc = 'result/{}/{}'.format(sub, file_name) @@ -100,17 +116,15 @@ def read_img_links(sub, limit, tolerance=3): continue download_status = download_img(link, file_name, file_loc) - download_count += 1 - - if(download_count == limit): - return download_count, 1 + sleep(3) if download_status == 0: failed+=1 if(failed==tolerance): return download_count, 0 - continue + else: + download_count += 1 return download_count, 1 @@ -120,16 +134,17 @@ def read_img_links(sub, limit, tolerance=3): ClientInfo.id, ClientInfo.secret = get_client_info() subreddit = input('Enter Subreddit: ') - num = int(input('Enter Limit: ')) - url_list = get_img_urls(subreddit, num) + hot = bool(input('0 For Top, 1 For Hot: ')) + limit = int(input('Enter Limit: ')) + url_list = get_img_urls(subreddit, limit) if url_list: save_list(url_list) - count, status = read_img_links(subreddit, num) + count, status = read_img_links(subreddit, limit) if status == 1: - print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count)) + print(f'\nDownload Complete\n{count} - Images Downloaded.') elif status == 0: print('\nDownload Incomplete\n{} - Images Downloaded'.format(count)) diff --git a/config.ini b/config.ini index fb0566e..7a5ca79 100644 --- a/config.ini +++ b/config.ini @@ -1,3 +1,3 @@ [ALPHA] -client_id=YOUR CLIENT ID HERE -client_secret=YOUR CLIENT SECRET HERE \ No newline at end of file 
+client_id=REDACTED_CLIENT_ID
+client_secret=REDACTED_CLIENT_SECRET \ No newline at end of file diff --git a/result/.gitignore b/result/.gitignore deleted file mode 100644 index c96a04f..0000000 --- a/result/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file