Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
7c92c40
Collect 30k URLs using web API
mtanghu Aug 14, 2023
79dcfdb
Save urls to txt file
mtanghu Aug 14, 2023
1a90fd0
Crawl 1M links using recommendations
mtanghu Aug 23, 2023
5a6b581
Use asynchronous workers to do crawling
mtanghu Aug 27, 2023
6678d1a
Clarify comment about workers restarting
mtanghu Aug 27, 2023
487719e
Remove search prototyping code
mtanghu Aug 27, 2023
9c7f474
Clean up original crawling pilot for posterity
mtanghu Aug 27, 2023
033af03
Encode playlists using utf-8
mtanghu Aug 28, 2023
3e3a608
Load more videos when restarting crawler
mtanghu Aug 28, 2023
674c2a7
Use normal data directory instead of new one
mtanghu Sep 5, 2023
981eca2
Prototype a channel video collector
mtanghu Sep 12, 2023
dad16aa
Unindent a double indent
mtanghu Sep 12, 2023
3c1b34a
Build out async functionality and error handling
mtanghu Sep 12, 2023
7b310df
Ignore 404s and start more workers
mtanghu Sep 12, 2023
33a49dd
Correctly ignore 404 and allow no subscribers
mtanghu Sep 12, 2023
00e4029
Improve none handling
mtanghu Sep 12, 2023
04d0284
Use less locking and increase workers
mtanghu Sep 12, 2023
555d767
Allow all video data to be None
mtanghu Sep 12, 2023
0dd30fc
Ensure views can also be None
mtanghu Sep 12, 2023
5950a4b
Ignore title-less channels
mtanghu Sep 12, 2023
968e27e
Use 200 workers on shuffled channels list
mtanghu Sep 12, 2023
1339899
Fix utf-8 file reading
mtanghu Sep 12, 2023
3ea1d44
Use a singular session and adjust parallelism
mtanghu Sep 13, 2023
f077e56
Correctly count collected videos (no repeats)
mtanghu Sep 13, 2023
98f3c3e
Add channel link to video information
mtanghu Sep 13, 2023
2c22d9e
Separate lines with agnostic separator
mtanghu Sep 13, 2023
c39e178
Improve commenting and write to error file
mtanghu Sep 14, 2023
22ae56f
Print out a message alongside logging error
mtanghu Sep 14, 2023
cd0d3bd
Flush the error file immediately
mtanghu Sep 14, 2023
42ed45e
Allow for empty continuations
mtanghu Sep 14, 2023
624874c
Assert when videos aren't found with message
mtanghu Sep 14, 2023
6184b49
Improve commenting
mtanghu Sep 14, 2023
4167754
Collect more channel data with better formatting
mtanghu Sep 14, 2023
ec65b78
Try keeping newlines in text
mtanghu Sep 14, 2023
adaa126
Use a strip just to ensure tsv format
mtanghu Sep 14, 2023
7a86b65
Try description snippets with \n replaced
mtanghu Sep 14, 2023
7ef3bb9
Remove all whitespace with split
mtanghu Sep 15, 2023
5ab07d1
Use basic csvs and csv writers
mtanghu Sep 15, 2023
520105f
Only space functions with 2 lines
mtanghu Sep 15, 2023
807a373
Comment the initial loading
mtanghu Sep 15, 2023
e9f899b
Catch more channels with no videos
mtanghu Sep 15, 2023
e1ea1e4
gitignore
tomohiro-sawada Sep 16, 2023
397b59d
Updated sample data
tomohiro-sawada Sep 16, 2023
c80296c
Implemented linear model and utils
tomohiro-sawada Sep 19, 2023
314b811
.
tomohiro-sawada Sep 20, 2023
9ba2cf5
.
tomohiro-sawada Sep 20, 2023
aeacae9
.:
tomohiro-sawada Sep 20, 2023
3ff343f
.
tomohiro-sawada Sep 20, 2023
8089a97
.
tomohiro-sawada Sep 20, 2023
49a3395
.
tomohiro-sawada Sep 20, 2023
53da3b5
.
tomohiro-sawada Sep 20, 2023
5f9ef34
.
tomohiro-sawada Sep 20, 2023
9bea308
.
tomohiro-sawada Sep 20, 2023
2a85786
Parse more channel information with /browse
mtanghu Sep 21, 2023
12394e1
Update payload for newer firefox update
mtanghu Sep 21, 2023
84c254a
Collect thumbnails and use faster date parser
mtanghu Sep 22, 2023
0b6a7e0
Fully parse all 6 tabs with extractor refactor
mtanghu Sep 23, 2023
73f9d5f
Fix function spacing
mtanghu Sep 23, 2023
e3490a1
Randomize channels again from about file
mtanghu Sep 23, 2023
0c71093
Actively name files for easier understanding
mtanghu Sep 23, 2023
cf681e5
Parse featured channels correctly
mtanghu Sep 23, 2023
4e01e95
Parse subscriber count for featured channels
mtanghu Sep 23, 2023
ec7e994
Reprocess last n many channels before shuffle
mtanghu Sep 23, 2023
90c22b4
Improve channel id finding and error out otherwise
mtanghu Sep 24, 2023
d94a9bf
Improve 404 handling and restarting
mtanghu Sep 24, 2023
a9aed07
Fix restarting given no reprocessing
mtanghu Sep 24, 2023
aead859
Disallow redirects and keep connection alive
mtanghu Sep 24, 2023
af5b563
Clean up before lightsail deployment
mtanghu Sep 24, 2023
da6d755
Fix parquet compression code
mtanghu Sep 24, 2023
b3b374c
Organize new parquets into folders
mtanghu Sep 24, 2023
89b7553
Close files and flush in right order
mtanghu Sep 24, 2023
9244d12
Adjust parameters for full runs
mtanghu Sep 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample/**
**.csv
16 changes: 0 additions & 16 deletions code/conda_env.yml

This file was deleted.

144 changes: 144 additions & 0 deletions code/data/big_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import asyncio
import copy
import json
import os
import socket
from collections import deque

from aiohttp import ClientSession, TCPConnector, DummyCookieJar

from youtube_helpers import BASE, ENDPOINT, PAYLOAD, USER_AGENT, parse_response



# windows specific fix
if os.name == 'nt':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# locks for concurrency
processing_lock = asyncio.Lock()
print_lock = asyncio.Lock()

# data files
video_file = open('videos.txt', 'a')
channel_file = open('channels.txt', 'a')
playlist_file = open('playlists.jsonl', 'a', encoding = 'utf-8')

# channels seen so far (continue from previous runs)
channels = set()
with open('channels.txt', 'r') as f:
for line in f:
channels.add(line.strip())


async def get_recommendations(
    video_id, session, unexplored_videos,
    channel_set = channels, lock = processing_lock,
    video_file = video_file, channel_file = channel_file,
    playlist_file = playlist_file
):
    """Fetch recommendations for one video and record everything found.

    Posts the PAYLOAD template (with ``videoId`` filled in) to ENDPOINT --
    presumably youtube's internal recommendations API, see youtube_helpers --
    then appends each recommendation to the shared data files and queues
    regular videos onto ``unexplored_videos`` for further crawling.

    Returns the falsy ``response.ok`` on a bad HTTP status and implicitly
    returns None on success; callers distinguish the two with ``is False``.
    The module-level files/set/lock are bound as keyword defaults so all
    workers share them without referencing globals explicitly.
    """
    # deepcopy so the shared nested PAYLOAD template is never mutated
    data = copy.deepcopy(PAYLOAD)
    data['videoId'] = video_id
    async with session.post(ENDPOINT, headers = {'User-Agent': USER_AGENT},
                            json = data, timeout = 5) as response:
        # report bad statuses (blocks, 404s, ...) to the caller instead of raising
        if response.ok is False:
            return response.ok

        recommendations = parse_response(await response.json())
        for recommendation in recommendations:
            # hold the lock so concurrent workers don't interleave file
            # writes or race on the shared channel set
            async with lock:
                video_file.write(recommendation['id'] + '\n')

                if recommendation['isPlaylist']:
                    playlist_file.write(json.dumps(recommendation) + '\n')

                # record each channel link exactly once across the whole run
                if (recommendation['channel']['link'] is not None and
                        recommendation['channel']['link'] not in channel_set):

                    channel_set.add(recommendation['channel']['link'])
                    channel_file.write(recommendation['channel']['link'] + '\n')

                # only crawl onward from regular videos (skip shorts/playlists)
                if ('shorts' not in recommendation['link'] and
                        recommendation['isPlaylist'] is not True):
                    unexplored_videos.append(recommendation['id'])


async def worker(unexplored_videos, num_reqs, channels_set = channels):
    """Crawl recommendations from this worker's private queue of video ids.

    Loops forever, opening a fresh ClientSession each cycle and making
    ``num_reqs`` requests on it before tearing the connection down and
    starting over. Stops when the queue empties or a request returns a bad
    HTTP status (likely a block -- restarting the script is suggested).
    """
    async with print_lock:
        print('worker started')

    while True:
        # use ipv6 (helps with blocks) and leave concurrency to parallel connections
        conn = TCPConnector(limit = 1, family = socket.AF_INET6, force_close = True)
        async with ClientSession(
            base_url = BASE, connector = conn, cookie_jar = DummyCookieJar()
        ) as session:
            for _ in range(num_reqs):
                if len(unexplored_videos) == 0:
                    async with print_lock:
                        print('no more videos to explore, stopping worker')
                    return

                video_id = unexplored_videos.pop()
                try:
                    ok_response = await get_recommendations(video_id, session, unexplored_videos)
                    # a False return means a bad http status (not an exception)
                    if ok_response is False:
                        async with print_lock:
                            print("bad response, stopping worker (try restarting)")
                        return
                except Exception as e:
                    # best-effort crawl: log the failing video id and move on
                    async with print_lock:
                        print(e, video_id)
        # progress line after each connection cycle ('\r' keeps it on one line)
        async with print_lock:
            print(
                'finished connection, number of channels:',
                len(channels_set), end = "\t\t\t\r"
            )


async def main(num_workers, num_reqs):
    """Seed the crawl queue, then fan it out to ``num_workers`` workers.

    Restarts from the tail of videos.txt when previous runs left data
    behind; otherwise performs a cold start by walking recommendations
    from a single seed video until enough ids are queued.
    """
    # read last num_workers * 1000 in to start the crawler back up.
    # a deque with maxlen is a portable, pure-python `tail -n`: the previous
    # os.popen('tail ...') shelled out to a POSIX-only binary (silently
    # yielding nothing on windows, despite the windows fix at the top of
    # this file) and dropped the final line when the file had no trailing
    # newline
    try:
        with open('videos.txt', 'r') as f:
            initial_videos = [
                line.rstrip('\n')
                for line in deque(f, maxlen = num_workers * 1000)
            ]
    except FileNotFoundError:
        initial_videos = []

    # if videos.txt doesn't have enough videos (cold start), fill it with some recommendations
    if len(initial_videos) == 0:
        assert len(channels) == 0, \
            'channels.txt should be empty for cold start, delete channels.txt and try again'

        # start with an old and popular video
        initial_videos = ['dQw4w9WgXcQ']
        async with ClientSession(base_url = BASE) as session:
            # collect num_workers * num_reqs videos (just a heuristic)
            while len(initial_videos) < num_workers * num_reqs:
                video_id = initial_videos.pop()
                await get_recommendations(video_id, session, initial_videos)
                print(
                    f'collecting initial videos: {len(initial_videos)}/{num_workers * num_reqs}',
                    end = '\t\t\t\r'
                )
        print('\nfinished collecting initial videos, starting asyncronous workers')
    else:
        print('loaded previous videos.txt and channels.txt, starting asyncronous workers')

    # split unexplored videos equally among workers; the strided slice already
    # yields a fresh list per worker (of immutable strings), so the previous
    # copy.deepcopy here was redundant
    await asyncio.gather(*[
        worker(initial_videos[i::num_workers], num_reqs)
        for i in range(num_workers)
    ])


try:
    # launch 20 concurrent workers that each make 20 requests before restarting connection
    # this would be the high end of normal individual youtube traffic
    asyncio.run(main(num_workers = 20, num_reqs = 20))
except (KeyboardInterrupt, Exception) as e:
    # KeyboardInterrupt is a BaseException, so it must be listed explicitly
    print("\nfinal exception:", e)

# exit cleanly: flush every data file first, then close them all
_data_files = (video_file, channel_file, playlist_file)
for _data_file in _data_files:
    _data_file.flush()
for _data_file in _data_files:
    _data_file.close()
Loading