Reddit_image_scraper.py: 156 changes (127 additions & 29 deletions)
@@ -1,11 +1,22 @@
+import os
+import sys
import praw
import configparser
import urllib.request
+import argparse
+from argparse import RawTextHelpFormatter

from prawcore.exceptions import Redirect
from prawcore.exceptions import ResponseException
from urllib.error import HTTPError

+from threading import Thread
+
+
+# Globals
+dw_results = {}
+debug = False


class ClientInfo:
id = ''
@@ -22,21 +33,23 @@ def get_client_info():
return id, secret


-def save_list(img_url_list):
+def save_list(img_url_list, subreddit):
+    link_file = 'result/{}.txt'.format(subreddit)
for img_url in img_url_list:
-        file = open('img_links.txt', 'a')
+        file = open(link_file, 'a')
file.write('{} \n'.format(img_url))
file.close()

+    return link_file


-def delete_img_list():
-    f = open('img_links.txt', 'r+')
-    f.truncate()
+def delete_img_list(link_file_name):
+    os.remove(link_file_name)


def is_img_link(img_link):
-    ext = img_link[-4:]
-    if ext == '.jpg' or ext == '.png':
+    ext = img_link[-4:].lower()
+    if ext == '.jpg' or ext == '.png' or ext == '.gif':
return True
else:
return False
@@ -62,64 +75,149 @@ def get_img_urls(sub, li):
return 0


-def download_img(img_url, img_title, filename):
+def download_img(subreddit, img_url, img_title, filename):
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
try:
-        print('Downloading ' + img_title + '....')
+        if debug:
+            print('[' + subreddit + '] Downloading ' + filename + '....')
urllib.request.urlretrieve(img_url, filename)
+        dw_results[subreddit].append(1)
return 1

except HTTPError:
print("Too many Requests. Try again later!")
return 0
+    except Exception as e:
+        print(str(e))
+        return 0


-def read_img_links():
-    with open('img_links.txt') as f:
+def read_img_links(subreddit, link_file):
+    with open(link_file) as f:
links = f.readlines()

links = [x.strip() for x in links]
download_count = 0

+    dw_threads = []
for link in links:
if not is_img_link(link):
continue

file_name = link.split('/')[-1]
-        file_loc = 'result/{}'.format(file_name)

if not file_name:
continue

-        download_status = download_img(link, file_name, file_loc)
-        download_count += 1
+        file_path = 'result/{}'.format(subreddit)
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        file_loc = '{}/{}'.format(file_path, file_name)
+
-        if download_status == 0:
-            return download_count, 0
+        t = Thread(target=download_img, args=(subreddit, link, file_name, file_loc))
+        dw_threads.append(t)
+        t.start()
+        download_count += 1

+    # wait for the threads to complete
+    for thread in dw_threads:
+        thread.join()

return download_count, 1


-if __name__ == '__main__':
-
-    ClientInfo.id, ClientInfo.secret = get_client_info()
-
-    subreddit = input('Enter Subreddit: ')
-    num = int(input('Enter Limit: '))
-    print()
+def process_subreddit(subreddit, num=10):
+    print("Processing: {}".format(subreddit))
url_list = get_img_urls(subreddit, num)
+    file_no = 1

if url_list:

-        save_list(url_list)
-        count, status = read_img_links()
+        link_file = save_list(url_list, subreddit)
+        count, status = read_img_links(subreddit, link_file)
+
+        count = len(dw_results[subreddit])

if status == 1:
-            print('\nDownload Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(count, num - count))
+            print('\n[{}] Download Complete\n{} - Images Downloaded\n{} - Posts Ignored'.format(subreddit, count, num - count))
elif status == 0:
-            print('\nDownload Incomplete\n{} - Images Downloaded'.format(count))
+            print('\n[{}] Download Incomplete\n{} - Images Downloaded'.format(subreddit, count))

+        delete_img_list(link_file)
+
-        delete_img_list()

+def read_subreddit_from_file_list(limit):
+    subreddit_file = "subreddit.txt"
+    if os.path.exists(subreddit_file):
+        with open(subreddit_file, 'r', encoding="utf8") as reader:
+            multthread_processing(reader.readlines())
+
+def multthread_processing(r_list):
+    threads = []
+    for subreddit in r_list:
+        subreddit = subreddit.strip()
+        if not subreddit:
+            continue
+
+        dw_results[subreddit] = []
+        t = Thread(target=process_subreddit, args=(subreddit, limit))
+        threads.append(t)
+        t.start()
+
+    # wait for the threads to complete
+    for thread in threads:
+        thread.join()


+if __name__ == '__main__':
+
+    ClientInfo.id, ClientInfo.secret = get_client_info()
+
+    about = '''
+Download images from Reddit communities.
+There are 3 ways to use this script:
+
+1. You can create a file in the root dir called 'subreddit.txt' and save your community names, one per line, to download their images.
+2. You can pass the community names with the -r param.
+   Like -r XXX YYY, where XXX and YYY are community names.
+3. You can use the iterative mode with the -i param.
+   We'll ask for the subreddit and the hot limit.
+'''
+
+    parser = argparse.ArgumentParser(prog=sys.argv[0], description=about, formatter_class=RawTextHelpFormatter)
+    parser.add_argument('-r', '--subreddit', action='store', nargs='*', dest='subreddit', default=[], help='subreddit community name without "r/"')
+    parser.add_argument('-l', '--limit', action='store', nargs='*', dest='limit', default=[10], help='limit value for use in hot() method - number of images to download')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Debug messages')
+    parser.add_argument('-i', '--iterative', action='store_true', default=False, help='Iterative mode. Will ask for subreddit and limit')
+
+    params = parser.parse_args()
+    limit = int(params.limit[0])
+    debug = params.debug
+
+    print("#######################################################")
+    print("#                Reddit_Image_Scraper                 #")
+    print("#-----------------------------------------------------#")
+    print("#               improved by: eneias.com               #")
+    print("#######################################################\n")
+    print(" >>> subreddit: " + ", ".join(params.subreddit))
+    print(" >>> limit: " + str(params.limit[0]))
+    print(" >>> iterative: " + str(params.iterative))
+    print(" >>> debug: " + str(debug))
+    print()
+
+
+    if params.iterative:
+        subreddit = input('Enter Subreddit: ')
+        num = int(input('Enter Limit: '))
+        print()
+        dw_results[subreddit] = []
+        process_subreddit(subreddit, num)
+        sys.exit()
+
+    if len(params.subreddit) == 0:
+        read_subreddit_from_file_list(limit)
+    else:
+        multthread_processing(params.subreddit)
+
+    print(" >>> done. ")
Empty file added subreddit.txt
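
For reference, the three modes described in the about text map to invocations like these (assuming the script is run with Python 3 from the repo root; the subreddit names are placeholders):

    python Reddit_image_scraper.py -r pics wallpapers -l 25 -d
    python Reddit_image_scraper.py -i
    python Reddit_image_scraper.py

The first downloads up to 25 hot posts from each of r/pics and r/wallpapers with debug output; the second prompts for a subreddit and a limit; the third falls back to reading one subreddit per line from subreddit.txt. Downloaded images land in result/<subreddit>/, and the intermediate link file result/<subreddit>.txt is deleted when a run completes.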