Reddit image scraper: hot/top listing choice, per-subreddit result folders,
download limit with a failure tolerance, and a tqdm progress bar.

Review notes (text before the first "diff --git" header is ignored by
"git apply"):

 * The config.ini change was dropped from this patch. It replaced the
   placeholder values with what look like real API credentials; secrets
   must not be committed. Treat the exposed client secret as compromised
   and rotate it. config.ini is already tracked, so the new .gitignore
   entry alone does not keep local edits out of future commits.
 * is_img_link now requires the dot (".jpg", ".png", ".gif") and uses a
   single tuple endswith() call; the submitted version also accepted any
   URL that merely ended in the letters "jpg"/"png"/"gif".
 * "hot" is parsed as (answer == '1'). bool() of any non-empty string,
   including '0', is True, so "Top" could never be selected.
 * get_img_urls takes "hot" as a keyword parameter (default True, the
   behaviour before this patch) instead of reading a module global.
 * Blank context lines and indentation were reconstructed from the hunk
   line counts; check them against base blob e04b51b before applying.
   The post-image blob id on the index line predates these edits.

diff --git a/.gitignore b/.gitignore
index a09c56d..170e352 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-/.idea
+config.ini
+result
\ No newline at end of file
diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py
index e04b51b..4ab8e52 100644
--- a/Reddit_image_scraper.py
+++ b/Reddit_image_scraper.py
@@ -1,6 +1,9 @@
+import os
 import praw
 import configparser
 import urllib.request
+from tqdm import tqdm
+from time import sleep
 
 from prawcore.exceptions import Redirect
 from prawcore.exceptions import ResponseException
@@ -22,8 +25,13 @@ def get_client_info():
     return id, secret
 
 
+def is_img_link(img_link):
+    return img_link.lower().endswith((".jpg", ".png", ".gif"))
+
 def save_list(img_url_list):
     for img_url in img_url_list:
+        if not is_img_link(img_url):
+            continue
         file = open('img_links.txt', 'a')
         file.write('{} \n'.format(img_url))
         file.close()
@@ -34,18 +42,13 @@ def delete_img_list():
     f.truncate()
 
 
-def is_img_link(img_link):
-    ext = img_link[-4:]
-    if ext == '.jpg' or ext == '.png':
-        return True
-    else:
-        return False
-
-
-def get_img_urls(sub, li):
+def get_img_urls(sub, li, hot=True):
     try:
         r = praw.Reddit(client_id=ClientInfo.id, client_secret=ClientInfo.secret, user_agent=ClientInfo.user_agent)
-        submissions = r.subreddit(sub).hot(limit=li)
+        if hot:
+            submissions = r.subreddit(sub).hot(limit=li*5)
+        else:
+            submissions = r.subreddit(sub).top(time_filter="all", limit=li*5)
 
         return [submission.url for submission in submissions]
 
@@ -61,43 +64,67 @@ def get_img_urls(sub, li):
         print("Client info is wrong. Check again.")
         return 0
 
+    except Exception as e:
+        print("Unexpected Error:", e)
+        return 0
+
 
 def download_img(img_url, img_title, filename):
     opener = urllib.request.build_opener()
     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
     urllib.request.install_opener(opener)
     try:
-        print('Downloading ' + img_title + '....')
+        # print('Downloading ' + img_title + '....')
         urllib.request.urlretrieve(img_url, filename)
         return 1
 
-    except HTTPError:
-        print("Too many Requests. Try again later!")
+    except HTTPError as e:
+        print("Too many Requests. Try again later!, ", e)
+        return 0
+
+    except OSError as e:
+        print("OSError:", e)
         return 0
 
+    except Exception as e:
+        print("Unexpected Error:", e)
+        return 0
 
-def read_img_links():
+def read_img_links(sub, limit, tolerance=3):
+    failed = 0
     with open('img_links.txt') as f:
         links = f.readlines()
 
     links = [x.strip() for x in links]
     download_count = 0
 
-    for link in links:
+    for link in tqdm(links, total=limit):
         if not is_img_link(link):
             continue
 
+        if download_count == limit:
+            return download_count, 1
+
         file_name = link.split('/')[-1]
-        file_loc = 'result/{}'.format(file_name)
+        file_loc = 'result/{}/{}'.format(sub, file_name)
+
+        directory = os.path.dirname('result/{}/'.format(sub))
+        if not os.path.exists(directory):
+            os.makedirs(directory)
 
         if not file_name:
             continue
 
         download_status = download_img(link, file_name, file_loc)
-        download_count += 1
+        sleep(3)
 
         if download_status == 0:
-            return download_count, 0
+            failed += 1
+            if failed == tolerance:
+                return download_count, 0
+            continue
+        else:
+            download_count += 1
 
     return download_count, 1
 
@@ -107,18 +134,17 @@ def read_img_links():
     ClientInfo.id, ClientInfo.secret = get_client_info()
 
     subreddit = input('Enter Subreddit: ')
-    num = int(input('Enter Limit: '))
-    print()
-    url_list = get_img_urls(subreddit, num)
-    file_no = 1
+    hot = input('0 For Top, 1 For Hot: ').strip() == '1'
+    limit = int(input('Enter Limit: '))
+    url_list = get_img_urls(subreddit, limit, hot)
 
     if url_list:
 
         save_list(url_list)
-        count, status = read_img_links()
+        count, status = read_img_links(subreddit, limit)
 
         if status == 1:
-            print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count))
+            print(f'\nDownload Complete\n{count} - Images Downloaded.')
         elif status == 0:
             print('\nDownload Incomplete\n{} - Images Downloaded'.format(count))
 
diff --git a/result/.gitignore b/result/.gitignore
deleted file mode 100644
index c96a04f..0000000
--- a/result/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
\ No newline at end of file