scrape.py (16 changes: 10 additions & 6 deletions)
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
+
 import re
-import passwords
-import sys
+import os
+import getpass
 from subprocess import Popen
 from mechanize import Browser
 from BeautifulSoup import BeautifulSoup
@@ -29,12 +30,12 @@ def download(work):

 # Import from a module outside of version control your SUNET id and password
 br.select_form(name="login")
-br["username"] = passwords.my_username
-br["password"] = passwords.my_password
+br["username"] = raw_input('Username: ')
+br["password"] = getpass.getpass()
 
-# Open the course page for the title you're looking for
+# Open the course page for the title you're looking for
 response = br.submit()
-response = br.follow_link(text=sys.argv[1])
+response = br.follow_link(text=raw_input('Class Description: '))
 
 # Build up a list of lectures
 links = []
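
This hunk swaps the passwords module (a file kept out of version control) for interactive prompts: raw_input echoes the username as it is typed, while getpass.getpass() reads the password with terminal echo disabled, so the credentials never touch the disk. A minimal standalone sketch of the same pattern, assuming Python 2 as in the script (the prompt strings here are illustrative):

import getpass

username = raw_input('Username: ')        # echoed normally as you type
password = getpass.getpass('Password: ')  # read with terminal echo disabled
print "Logging in as %s" % username       # never print the password itself
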
@@ -44,13 +45,16 @@
 videos = []
 # These are fetched serially purely to not look suspicious; we could probably parallelize this as well
 for link in links:
+    print "Reading contents of %s" % link
     response = br.open(link)
     soup = BeautifulSoup(response.read())
     video = soup.find('object', id='WMPlayer')['data']
     video = re.sub("http","mms",video)
     output_name = re.search(r"[a-z]+[0-9]+[a-z]?/[0-9]+",video).group(0).replace("/","_") + ".wmv"
     videos.append((video, output_name))
 
+print "Downloading %d videos..." % len(videos)
+
 # Make a process pool and download 5 files at a time
 p = Pool(processes=5)
 p.map(download, videos)
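
The download worker that Pool.map fans these tuples out to sits outside this diff; given the from subprocess import Popen import at the top of the file, it presumably shells out to an external MMS-capable downloader. By this point each tuple pairs a rewritten stream URL with a flat filename: a hypothetical http://media.example.edu/cs106a/15 becomes mms://media.example.edu/cs106a/15, saved as cs106a_15.wmv. A minimal sketch of what the worker might look like (the real function body is not shown here, and mimms is just one example of a command-line tool that handles mms:// URLs):

def download(work):
    # work is one (video_url, output_name) tuple from the list built above
    video, output_name = work
    # Hypothetical invocation: hand the mms:// URL to an external downloader
    p = Popen(["mimms", video, output_name])
    p.wait()

With processes=5, at most five downloads run at once, and p.map blocks until the whole list has been processed.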