Skip to content

Attempted refactor to download class #149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 106 additions & 42 deletions tests/TESTING.md
Original file line number Diff line number Diff line change
@@ -1,45 +1,109 @@
## Testing channels
- stacksmashing
- https://www.youtube.com/@stacksmashing/videos
- PwnFunction
- https://www.youtube.com/@PwnFunction/videos


## search key words
- stacksmashing
- "firmware decrypt"
- "unknown bitcoin array"
- "input shift register"
- "extensible and linkable format"
- PwnFunction
- "templating engine"
- "parameter pollution"
- "session objects"
- "same origin policy"

## download
- test channel not found error
- test language not found error
- test multi threading error
- test channel id not found error

## search
- test search too long error
- test channel name conflict error
- test --all flag
- test search channel by id
- test search channel by name

## export
- test search string too long error
- test channel name conflict error
- test --all flag
- test export by channel by id
- test export by channel by name

## delete
- test name conflict error
- test delete channel by id
- test delete channel by name
- JCS - Criminal Psychology
- https://www.youtube.com/@JCS
- https://www.youtube.com/channel/UCYwVxWpjeKFWwu8TML-Te9A

## Testing Playlists
- How to start a startup
- https://www.youtube.com/playlist?list=PL5q_lef6zVkaTY_cT1k7qFNF2TidHCe-1

## Test Download Commands
### Channel Download
Commands:
```sh
# custom channel name
yt-fts download "https://www.youtube.com/@JCS"
# legacy channel name
yt-fts download "https://www.youtube.com/channel/UCYwVxWpjeKFWwu8TML-Te9A"
# multi threading
yt-fts download -j 5 "https://www.youtube.com/@JCS"
```
Expected sql output:
```sql
select * from channels;
-- UCYwVxWpjeKFWwu8TML-Te9A|JCS - Criminal Psychology|https://www.youtube.com/channel/UCYwVxWpjeKFWwu8TML-Te9A/videos

select count(*) from Videos where channel_id = 'UCYwVxWpjeKFWwu8TML-Te9A';
-- 17

select count(*) from Subtitles ;
-- 21153
```

### Playlist Download

```shell
# default
yt-fts download --playlist "https://www.youtube.com/playlist?list=PL5q_lef6zVkaTY_cT1k7qFNF2TidHCe-1"
# multi threaded
yt-fts download --playlist -j 5 "https://www.youtube.com/playlist?list=PL5q_lef6zVkaTY_cT1k7qFNF2TidHCe-1"
```

Expected sql output:
```sql
select * from Channels where channel_id = 'UCxIJaCMEptJjxmmQgGFsnCg';
-- UCxIJaCMEptJjxmmQgGFsnCg|Y Combinator: The Vault|https://www.youtube.com/channel/UCxIJaCMEptJjxmmQgGFsnCg/videos
select count(*) from videos where channel_id = 'UCxIJaCMEptJjxmmQgGFsnCg';
-- 16

SELECT COUNT(*) as subtitle_count
FROM Subtitles s
JOIN Videos v ON s.video_id = v.video_id
JOIN Channels c ON v.channel_id = c.channel_id
WHERE c.channel_id = 'UCxIJaCMEptJjxmmQgGFsnCg';
-- 20970
```


## Test `search` commands
Assuming you have both the playlists saved to local db
### Global Search
Command:
```sh
yt-fts search "growth hacking"
```

Expected output:
```text
Found 3 matches in 2 videos from 1 channel
Query 'growth hacking'
Scope: all
```

Command:
```sh
yt-fts search "knife attack"
```

Expected output:
```text
Found 1 matches in 1 videos from 1 channel
Query 'knife attack'
Scope: all
```


### Search JCS by name
Command:
```sh
yt-fts search --channel "JCS - Criminal Psychology" "criminal"
```
Expected output:
```txt
Found 11 matches in 7 videos from 1 channel
Query 'criminal'
Scope: channel
```

### Search JCS by channel id
Command:
```sh
yt-fts search -c 4 "criminal"
```

Expected output:
```txt
Found 11 matches in 7 videos from 1 channel
Query 'criminal'
Scope: channel
```
24 changes: 23 additions & 1 deletion yt_fts/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ def search_all(text, limit=None):

db = Database(get_db_path())

# TODO: handle quotes
# this thing breaks if the user puts quotes in their
# search query
return list(db["Subtitles"].search(text, limit=limit))


Expand Down Expand Up @@ -315,4 +318,23 @@ def get_subs_by_video_id(video_id):

return db.execute(f"SELECT start_time, stop_time, text FROM Subtitles WHERE video_id = ?",
[video_id]).fetchall()



def get_channel_id_from_input(channel_input):  # yt_fts, export, search, vector_search ... broken
    """
    Resolve user input to a channel id.

    The input is looked up both as a channel name and as a rowid. When
    both lookups produce a result, the rowid result takes precedence.

    Returns the channel id from whichever lookup produced a non-None
    result. If both lookups give None, the "channel_not_found" message
    is shown and the program exits.
    """

    # Both lookups always run, name first, matching the original call order.
    name_res = get_channel_id_from_name(channel_input)
    id_res = get_channel_id_from_rowid(channel_input)

    # None is a singleton, so test identity rather than equality.
    if id_res is not None:
        return id_res
    if name_res is not None:
        return name_res

    show_message("channel_not_found")
    exit()

70 changes: 12 additions & 58 deletions yt_fts/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,11 @@
from rich.console import Console
console = Console()

def handle_reject_consent_cookie(channel_url, s):
    """
    Reject the consent cookie automatically when the request for
    `channel_url` lands on the consent page.

    `s` is used for one GET of `channel_url` and, when needed, one POST
    to the consent "save" endpoint (presumably a requests session --
    confirm against the caller). Nothing is returned.
    """
    response = s.get(channel_url)

    # Not redirected to the consent page: nothing to reject.
    if "https://consent.youtube.com" not in response.url:
        return

    # The hidden "bl" form field has to be echoed back in the POST.
    bl_field = re.search(r"<input type=\"hidden\" name=\"bl\" value=\"([^\"]*)\"", response.text)
    if bl_field is None:
        return

    form = {
        "gl": "DE",
        "pc": "yt",
        "continue": channel_url,
        "x": "6",
        "bl": bl_field.group(1),
        "hl": "de",
        "set_eom": "true",
    }
    s.post("https://consent.youtube.com/save", data=form)


def get_channel_id(url, s):
def get_channel_id(url, s): # yt_fts
"""
Scrapes channel id from the channel page
"""
# TODO: wrap in try except
res = s.get(url)
if res.status_code == 200:
html = res.text
Expand All @@ -52,7 +33,7 @@ def get_channel_id(url, s):
return None


def get_channel_name(channel_id, s):
def get_channel_name(channel_id, s): # yt_fts, update
"""
Scrapes channel name from the channel page
"""
Expand Down Expand Up @@ -81,7 +62,7 @@ def get_channel_name(channel_id, s):
return None


def get_videos_list(channel_url):
def get_videos_list(channel_url): # download, update
"""
Scrapes list of all video urls from the channel
"""
Expand Down Expand Up @@ -110,7 +91,7 @@ def get_videos_list(channel_url):
return list_of_videos_urls


def get_playlist_data(playlist_url):
def get_playlist_data(playlist_url): # download
"""
Returns a list of channel ids and video ids from a playlist
"""
Expand Down Expand Up @@ -138,7 +119,7 @@ def get_playlist_data(playlist_url):
return playlist_data


def download_vtts(number_of_jobs, video_ids, language, tmp_dir):
def download_vtts(number_of_jobs, video_ids, language, tmp_dir): # download, update
"""
Multi-threaded download of vtt files
"""
Expand All @@ -154,13 +135,13 @@ def download_vtts(number_of_jobs, video_ids, language, tmp_dir):
futures[i].result()


def quiet_progress_hook(d):
def quiet_progress_hook(d):  # download
    """
    Print the bare file name of a finished download.

    `d` is a mapping with a 'status' key; when the status is 'finished'
    its 'filename' entry is reduced to the final path component and
    printed. Any other status prints nothing.
    """
    if d['status'] != 'finished':
        return
    print(f" -> {Path(d['filename']).name}")


def get_vtt(tmp_dir, video_url, language):
def get_vtt(tmp_dir, video_url, language): # download
ydl_opts = {
'outtmpl': f'{tmp_dir}/%(id)s',
'writeinfojson': True,
Expand All @@ -176,7 +157,7 @@ def get_vtt(tmp_dir, video_url, language):
ydl.download([video_url])


def vtt_to_db(dir_path):
def vtt_to_db(dir_path): # download, update
"""
Iterates through all vtt files in the temp_dir, passes them to
the vtt parsing function, then inserts the data into the database.
Expand Down Expand Up @@ -218,7 +199,7 @@ def vtt_to_db(dir_path):
con.close()


def validate_channel_url(channel_url):
def validate_channel_url(channel_url): # yt_fts
"""
valid patterns
https://www.youtube.com/channel/channelID
Expand Down Expand Up @@ -264,7 +245,7 @@ def validate_channel_url(channel_url):
exit()


def download_channel(channel_id, channel_name, language, number_of_jobs, s):
def download_channel(channel_id, channel_name, language, number_of_jobs, s): # yt_fts
"""
Downloads all the videos from a channel to a tmp directory
"""
Expand All @@ -287,7 +268,7 @@ def download_channel(channel_id, channel_name, language, number_of_jobs, s):
return True


def download_playlist(playlist_url, s, language=None, number_of_jobs=None):
def download_playlist(playlist_url, s, language=None, number_of_jobs=None): # yt-fts
"""
Downloads all subtitles from playlist, making new channels where needed
"""
Expand All @@ -314,30 +295,3 @@ def download_playlist(playlist_url, s, language=None, number_of_jobs=None):
console.print(f"[green][bold]Downloading [red]{len(playlist_data)}[/red] vtt files[/bold][/green]\n")
download_vtts(number_of_jobs, video_ids, language, tmp_dir)
vtt_to_db(tmp_dir)


def get_channel_id_from_input(channel_input):
    """
    Resolve user input to a channel id.

    The input is looked up both as a channel name and as a rowid. When
    both lookups produce a result, the rowid result takes precedence.

    Returns the channel id from whichever lookup produced a non-None
    result. If both lookups give None, the "channel_not_found" message
    is shown and the program exits.
    """

    # Imported at call time, as in the original (presumably to avoid a
    # circular import between modules -- confirm before hoisting).
    from yt_fts.db_utils import (
        get_channel_id_from_rowid,
        get_channel_id_from_name
    )

    from yt_fts.utils import show_message

    # Both lookups always run, name first, matching the original call order.
    name_res = get_channel_id_from_name(channel_input)
    id_res = get_channel_id_from_rowid(channel_input)

    # None is a singleton, so test identity rather than equality.
    if id_res is not None:
        return id_res
    if name_res is not None:
        return name_res

    show_message("channel_not_found")
    exit()

4 changes: 1 addition & 3 deletions yt_fts/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .db_utils import (
search_channel, search_video, search_all,
get_channel_name_from_video_id, get_metadata_from_db,
get_channel_id_from_input
)

from .utils import time_to_secs, show_message
Expand All @@ -23,12 +24,10 @@ def export_fts(text, scope, channel_id=None, video_id=None):
file_name = f"video_{video_id}_{timestamp}.csv"
res = search_video(video_id, text)
if scope == "channel":
from .download import get_channel_id_from_input
channel_id = get_channel_id_from_input(channel_id)
file_name = f"channel_{channel_id}_{timestamp}.csv"
res = search_channel(channel_id, text)


if len(res) == 0:
show_message("no_matches_found")
return None
Expand Down Expand Up @@ -100,7 +99,6 @@ def export_transcripts(channel_id):

console = Console()

from .download import get_channel_id_from_input
channel_id = get_channel_id_from_input(channel_id)

from .db_utils import get_vid_ids_by_channel_id, get_transcript_by_video_id
Expand Down
3 changes: 1 addition & 2 deletions yt_fts/search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from pprint import pprint
from .download import get_channel_id_from_input
from .db_utils import *
from .db_utils import *
from .utils import *
from rich.console import Console
from rich.text import Text
Expand Down
Loading
Loading