Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/.idea
config.ini
result
72 changes: 49 additions & 23 deletions Reddit_image_scraper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import praw
import configparser
import urllib.request
from tqdm import tqdm
from time import sleep

from prawcore.exceptions import Redirect
from prawcore.exceptions import ResponseException
Expand All @@ -22,8 +25,13 @@ def get_client_info():
return id, secret


def is_img_link(img_link):
    """Return True when *img_link* ends in a supported image extension.

    The check is case-insensitive and requires the leading dot, so
    ``https://host/photo.JPG`` is accepted while ``https://host/notjpg``
    is rejected.

    Args:
        img_link: URL (or file name) to inspect.

    Returns:
        True for links ending in ``.jpg``, ``.png`` or ``.gif``.
    """
    # str.endswith accepts a tuple, so one call covers every extension.
    return img_link.lower().endswith((".jpg", ".png", ".gif"))

def save_list(img_url_list):
    """Append the image URLs from *img_url_list* to ``img_links.txt``.

    URLs rejected by ``is_img_link`` are skipped. Every kept URL is written
    on its own line, followed by a single space and a newline.

    Args:
        img_url_list: Iterable of URL strings.
    """
    lines = ['{} \n'.format(img_url)
             for img_url in img_url_list if is_img_link(img_url)]
    if not lines:
        # Nothing qualified: leave the file untouched (and uncreated).
        return

    # Open the file once for the whole batch; the context manager closes
    # the handle even if a write fails.
    with open('img_links.txt', 'a') as file:
        file.writelines(lines)
Expand All @@ -34,18 +42,13 @@ def delete_img_list():
f.truncate()


def is_img_link(img_link):
    """Tell whether *img_link* points at a JPG, PNG or GIF file.

    The extension is compared case-insensitively so that links such as
    ``photo.JPG`` are recognised, and ``.gif`` is accepted in addition to
    ``.jpg`` and ``.png``.

    Args:
        img_link: URL (or file name) to inspect.

    Returns:
        True when the link ends in one of the supported extensions.
    """
    supported = ('.jpg', '.png', '.gif')
    return img_link.lower().endswith(supported)


def get_img_urls(sub, li):
try:
r = praw.Reddit(client_id=ClientInfo.id, client_secret=ClientInfo.secret, user_agent=ClientInfo.user_agent)
submissions = r.subreddit(sub).hot(limit=li)
if hot:
submissions = r.subreddit(sub).hot(limit=li*5)
else:
submissions = r.subreddit(sub).top(time_filter="all", limit=li*5)

return [submission.url for submission in submissions]

Expand All @@ -61,43 +64,67 @@ def get_img_urls(sub, li):
print("Client info is wrong. Check again.")
return 0

except Exception as e:
print("Unexpected Error:", e)
return 0


def download_img(img_url, img_title, filename):
    """Download *img_url* and store it at the local path *filename*.

    Args:
        img_url: URL of the image to fetch.
        img_title: Display name of the image. Accepted for interface
            compatibility; it is not used, because no per-file message is
            printed (the calling loop shows a tqdm progress bar).
        filename: Destination path of the downloaded file.

    Returns:
        1 when the download succeeded, 0 when it failed. The error is
        printed in the failure case; no exception is propagated.
    """
    # urlretrieve() takes no headers argument, so the browser-like
    # User-agent is supplied through a globally installed opener.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)

    try:
        urllib.request.urlretrieve(img_url, filename)

    # A single HTTPError handler: it must return 0 so the caller can count
    # the failure. HTTPError is a subclass of OSError, so it comes first.
    except HTTPError as e:
        print("Too many Requests. Try again later!, ", e)
        return 0

    except OSError as e:
        print("OSError:", e)
        return 0

    # Deliberate best-effort boundary: one bad link must not abort the run.
    except Exception as e:
        print("Unexpected Error:", e)
        return 0

    return 1

def read_img_links(sub, limit, tolerance=3, delay=3):
    """Download up to *limit* images listed in ``img_links.txt``.

    Args:
        sub: Subreddit name; files are stored under ``result/<sub>/``.
        limit: Maximum number of successful downloads.
        tolerance: Number of failed downloads after which the run is
            aborted.
        delay: Seconds to pause after every download attempt.

    Returns:
        A ``(download_count, status)`` tuple. ``status`` is 1 when the run
        ended normally (limit reached or links exhausted) and 0 when it
        was aborted because *tolerance* downloads failed.
    """
    with open('img_links.txt') as f:
        links = [line.strip() for line in f]

    directory = os.path.dirname('result/{}/'.format(sub))
    failed = 0
    download_count = 0

    # The bar's total is the number of wanted downloads, so it is advanced
    # per successful download rather than per link examined.
    with tqdm(total=limit) as progress:
        for link in links:
            if not is_img_link(link):
                continue

            if download_count >= limit:
                break

            file_name = link.split('/')[-1]
            if not file_name:
                continue

            # exist_ok avoids the check-then-create race of
            # os.path.exists() followed by os.makedirs().
            os.makedirs(directory, exist_ok=True)
            file_loc = 'result/{}/{}'.format(sub, file_name)

            download_status = download_img(link, file_name, file_loc)
            sleep(delay)

            if not download_status:
                failed += 1
                # >= so that a tolerance of 0 aborts on the first failure.
                if failed >= tolerance:
                    return download_count, 0
                continue

            # Only successful downloads count towards the limit.
            download_count += 1
            progress.update(1)

    return download_count, 1

Expand All @@ -107,18 +134,17 @@ def read_img_links():
# Interactive entry point: ask for the subreddit, listing mode and limit,
# collect the submission URLs, then download the images.
ClientInfo.id, ClientInfo.secret = get_client_info()

subreddit = input('Enter Subreddit: ')
# bool() of any non-empty string is True, so bool(input()) treated "0" as
# Hot. Compare the text instead: empty or "0" selects Top, anything else Hot.
# `hot` is read as a module-level name by get_img_urls().
hot = input('0 For Top, 1 For Hot: ').strip() not in ('', '0')
limit = int(input('Enter Limit: '))
url_list = get_img_urls(subreddit, limit)

if url_list:

    save_list(url_list)
    count, status = read_img_links(subreddit, limit)

    if status == 1:
        print(f'\nDownload Complete\n{count} - Images Downloaded.')
    elif status == 0:
        print('\nDownload Incomplete\n{} - Images Downloaded'.format(count))

Expand Down
4 changes: 2 additions & 2 deletions config.ini
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[ALPHA]
client_id=YOUR CLIENT ID HERE
client_secret=YOUR CLIENT SECRET HERE
client_id=BySGe2h8CHp2o9pfQu344A
client_secret=1b9Bu5VMgTXTbtchBvl7mmwEvUGWMA
2 changes: 0 additions & 2 deletions result/.gitignore

This file was deleted.