From ee279413e7af043fa36881dfaedff92cece33809 Mon Sep 17 00:00:00 2001 From: Gaurav Bhandarkar <62330601+DracoCoder@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:05:26 +0530 Subject: [PATCH 1/2] Fixed Limit Parameter, Improved Extention Checking Logic, Added Gif Support --- Reddit_image_scraper.py | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index e04b51b..16577e5 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -1,3 +1,4 @@ +import os import praw import configparser import urllib.request @@ -22,8 +23,13 @@ def get_client_info(): return id, secret +def is_img_link(img_link): + return img_link.lower().endswith("jpg") or img_link.lower().endswith("png") or img_link.lower().endswith("gif") + def save_list(img_url_list): for img_url in img_url_list: + if not is_img_link(img_url): + continue file = open('img_links.txt', 'a') file.write('{} \n'.format(img_url)) file.close() @@ -34,18 +40,10 @@ def delete_img_list(): f.truncate() -def is_img_link(img_link): - ext = img_link[-4:] - if ext == '.jpg' or ext == '.png': - return True - else: - return False - - def get_img_urls(sub, li): try: r = praw.Reddit(client_id=ClientInfo.id, client_secret=ClientInfo.secret, user_agent=ClientInfo.user_agent) - submissions = r.subreddit(sub).hot(limit=li) + submissions = r.subreddit(sub).hot(limit=li*5) return [submission.url for submission in submissions] @@ -74,9 +72,13 @@ def download_img(img_url, img_title, filename): except HTTPError: print("Too many Requests. 
Try again later!") return 0 + + except OSError: + print(OSError) + return 0 - -def read_img_links(): +def read_img_links(sub, limit, tolerance=3): + failed = 0 with open('img_links.txt') as f: links = f.readlines() @@ -88,7 +90,11 @@ def read_img_links(): continue file_name = link.split('/')[-1] - file_loc = 'result/{}'.format(file_name) + file_loc = 'result/{}/{}'.format(sub, file_name) + + directory = os.path.dirname('result/{}/'.format(sub)) + if not os.path.exists(directory): + os.makedirs(directory) if not file_name: continue @@ -96,8 +102,15 @@ def read_img_links(): download_status = download_img(link, file_name, file_loc) download_count += 1 + if(download_count == limit): + return download_count, 1 + if download_status == 0: - return download_count, 0 + failed+=1 + if(failed==tolerance): + return download_count, 0 + + continue return download_count, 1 @@ -108,14 +121,12 @@ def read_img_links(): subreddit = input('Enter Subreddit: ') num = int(input('Enter Limit: ')) - print() url_list = get_img_urls(subreddit, num) - file_no = 1 if url_list: save_list(url_list) - count, status = read_img_links() + count, status = read_img_links(subreddit, num) if status == 1: print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count)) From a6c65ef806cf8f9614e6e1501363535099a05fd8 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 16 Apr 2024 07:01:25 +0000 Subject: [PATCH 2/2] Updates.. 
--- .gitignore | 3 ++- Reddit_image_scraper.py | 51 ++++++++++++++++++++++++++--------------- config.ini | 4 ++-- result/.gitignore | 2 -- 4 files changed, 37 insertions(+), 23 deletions(-) delete mode 100644 result/.gitignore diff --git a/.gitignore b/.gitignore index a09c56d..170e352 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -/.idea +config.ini +result \ No newline at end of file diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index 16577e5..4ab8e52 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -2,6 +2,8 @@ import praw import configparser import urllib.request +from tqdm import tqdm +from time import sleep from prawcore.exceptions import Redirect from prawcore.exceptions import ResponseException @@ -43,7 +45,10 @@ def delete_img_list(): def get_img_urls(sub, li): try: r = praw.Reddit(client_id=ClientInfo.id, client_secret=ClientInfo.secret, user_agent=ClientInfo.user_agent) - submissions = r.subreddit(sub).hot(limit=li*5) + if hot: + submissions = r.subreddit(sub).hot(limit=li*5) + else: + submissions = r.subreddit(sub).top(time_filter="all", limit=li*5) return [submission.url for submission in submissions] @@ -59,23 +64,31 @@ def get_img_urls(sub, li): print("Client info is wrong. Check again.") return 0 + except Exception as e: + print("Unexpected Error:", e) + return 0 + def download_img(img_url, img_title, filename): opener = urllib.request.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] urllib.request.install_opener(opener) try: - print('Downloading ' + img_title + '....') + # print('Downloading ' + img_title + '....') urllib.request.urlretrieve(img_url, filename) return 1 - except HTTPError: - print("Too many Requests. Try again later!") + except HTTPError as e: + print("Too many Requests. 
Try again later!, ", e) + return 0 + + except OSError as e: + print("OSError:", e) + return 0 + + except Exception as e: + print("Unexpected Error:", e) return 0 - - except OSError: - print(OSError) - return 0 def read_img_links(sub, limit, tolerance=3): failed = 0 @@ -85,10 +98,13 @@ def read_img_links(sub, limit, tolerance=3): links = [x.strip() for x in links] download_count = 0 - for link in links: + for link in tqdm(links, total=limit): if not is_img_link(link): continue + if(download_count == limit): + return download_count, 1 + file_name = link.split('/')[-1] file_loc = 'result/{}/{}'.format(sub, file_name) @@ -100,17 +116,15 @@ def read_img_links(sub, limit, tolerance=3): continue download_status = download_img(link, file_name, file_loc) - download_count += 1 - - if(download_count == limit): - return download_count, 1 + sleep(3) if download_status == 0: failed+=1 if(failed==tolerance): return download_count, 0 - continue + else: + download_count += 1 return download_count, 1 @@ -120,16 +134,17 @@ def read_img_links(sub, limit, tolerance=3): ClientInfo.id, ClientInfo.secret = get_client_info() subreddit = input('Enter Subreddit: ') - num = int(input('Enter Limit: ')) - url_list = get_img_urls(subreddit, num) + hot = bool(input('0 For Top, 1 For Hot: ')) + limit = int(input('Enter Limit: ')) + url_list = get_img_urls(subreddit, limit) if url_list: save_list(url_list) - count, status = read_img_links(subreddit, num) + count, status = read_img_links(subreddit, limit) if status == 1: - print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count)) + print(f'\nDownload Complete\n{count} - Images Downloaded.') elif status == 0: print('\nDownload Incomplete\n{} - Images Downloaded'.format(count)) diff --git a/config.ini b/config.ini index fb0566e..7a5ca79 100644 --- a/config.ini +++ b/config.ini @@ -1,3 +1,3 @@ [ALPHA] -client_id=YOUR CLIENT ID HERE -client_secret=YOUR CLIENT SECRET HERE \ No newline at end of file 
+client_id=REDACTED_CLIENT_ID
+client_secret=REDACTED_CLIENT_SECRET \ No newline at end of file diff --git a/result/.gitignore b/result/.gitignore deleted file mode 100644 index c96a04f..0000000 --- a/result/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file