From 3d07a3ebc3247d88f1efccc8f32e1b47070c1dcb Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 22 Apr 2022 20:06:49 +0200 Subject: [PATCH 01/23] There's something i guess Co-authored-by: Artrix --- ScraperEngine/resources/animepisode.py | 79 ++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 ScraperEngine/resources/animepisode.py diff --git a/ScraperEngine/resources/animepisode.py b/ScraperEngine/resources/animepisode.py new file mode 100644 index 0000000..509a291 --- /dev/null +++ b/ScraperEngine/resources/animepisode.py @@ -0,0 +1,79 @@ +import falcon +import aiohttp +from utils.session import execute_proxied_request +import re +from falcon import uri +from typing import List +from bs4 import BeautifulSoup +from interfaces.resource import ScraperResource +from models.episode import Episode +from models.matching import Matching + +class AnimepisodeResource(ScraperResource): + + def __init__(self, app: falcon.App) -> None: + # On this line, use the name you used inside MongoDB's websites collection + super().__init__(app, "animepisode") + + def fix_title(self, title: str) -> str: + # set the variable "is_dubbed" to true if the title contains "Dubbed" in it, otherwise set it to false, do this using regexp + is_dubbed = title in "Dubbed" + title = title.replace("Dubbed", "").replace("Subbed", "").replace("English", "").strip() + title = title.replace("Episode","").strip() + # Use Regexp to find the last number found in the title, pop it from the title and return it to the variable "episode_number" + episode_number = re.findall(r'\d+', title)[-1] + title = title.replace(episode_number, "").strip() + + + + return { + "title": title, + "is_dubbed": is_dubbed, + "episode_number": episode_number + } + + + + + + + + + + + + async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: + matchings = [] + url = f"{self.base_url}/search?q={title}" + try: + page = await execute_proxied_request(self, url) + articles = page.select_one("#main").find_all("article") + for article in articles: + content = article.find({ "class": "blog-entry-inner" }).find({"class": "blog-entry-content"}) + link = content.find("header").find().find("a") + title = link.text + url = link.get("href") + matchings.append(Matching(title, url)) + + except Exception as e: + print(str(e)) + raise + + return matchings + + async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: + episodes = [] + url = f"{self.base_url}{path}" + try: + # Btw im going on a trip, so ill be in car with shitty internet in like 20mins, so you should copy your code + # Yeah, i can commit + # could you commit your code to your github? and add me as a collaborator? 
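+            # Fetch the episode page through the proxied session and pull the post
+            # <article> out of the page's #main column (execute_proxied_request hands
+            # back the parsed document, so BeautifulSoup lookups run on it directly).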
+            page = await execute_proxied_request(self, url)
+            article = page.select_one("#main").find("article")
+
+
+        except Exception as e:
+            print(str(e))
+            raise
+
+        return episodes
\ No newline at end of file

From 42b3cfbae3992dee14a82c377db30bf64dcf2d62 Mon Sep 17 00:00:00 2001
From: Kapilarny <50770669+Kapilarny@users.noreply.github.com>
Date: Fri, 22 Apr 2022 20:07:07 +0200
Subject: [PATCH 02/23] commit

---
 ScraperEngine/resources/desuonline.py         | 97 +++++++++++++++++++
 .../Models/Websites/DesuonlineWebsite.cs      | 28 ++++++
 SyncService/Services/WebsiteScraperService.cs |  3 +
 3 files changed, 128 insertions(+)
 create mode 100644 ScraperEngine/resources/desuonline.py
 create mode 100644 SyncService/Models/Websites/DesuonlineWebsite.cs

diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py
new file mode 100644
index 0000000..accd3b6
--- /dev/null
+++ b/ScraperEngine/resources/desuonline.py
@@ -0,0 +1,97 @@
+import base64
+import falcon
+import aiohttp
+import re
+
+from falcon import uri
+from typing import List
+from bs4 import BeautifulSoup
+from interfaces.resource import ScraperResource
+from models.episode import Episode
+from models.matching import Matching
+from utils.session import execute_proxied_request, get_proxied_response_json_get
+
+
+class DesuonlineResource(ScraperResource):
+
+    def __init__(self, app: falcon.App) -> None:
+        print("DesuOnline initialized!")
+        super().__init__(app, "desuonline")
+
+    async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]:
+        matchings = []
+
+        url = f"{self.base_url}/?s={uri.encode(title)}"
+        print(url)
+
+        try:
+            has_ended = False
+            page_number = 4
+
+            while not has_ended:
+                page = await execute_proxied_request(self, url)
+
+                try:
+                    show_elements = page.find(class_="bixbox").find("div", class_="listupd").find_all("article")
+
+                    if len(show_elements) == 0:
+                        raise Exception
+
+                    for show_element in show_elements:
+                        element = show_element.find(class_="bsx").find("a")
+                        path = str(element["href"]).replace(self.base_url, "")
+
+                        matchings.append(Matching(element["oldtitle"], path))
+
+                    url = f"{self.base_url}/page/{page_number}/?s={uri.encode(title)}"
+                    page_number = page_number + 1
+                except:
+                    has_ended = True
+
+        except Exception as e:
+            print(str(e))
+            raise
+
+        return matchings
+
+    async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]:
+        episodes = []
+
+        url = f"{self.base_url}{path}"
+        print(url)
+
+        try:
+            page = await execute_proxied_request(self, url)
+            epList = page.find("div", class_="eplister").find("ul").find_all("li").reverse()
+
+            episodeLink = str(epList[number].find("a")["href"])
+
+            episodePage = await execute_proxied_request(self, episodeLink)
+
+            sourcesList = episodePage.find("select", class_="mirror").find_all("option")
+
+            cdaEmbedLink = ''
+
+            for option in sourcesList:
+                decodedString = base64.b64decode(str(option["value"]))
+                if "https://ebd.cda.pl/" in decodedString:
+                    for x in range(13, decodedString.__len__()):
+                        if decodedString[x] == '"':
+                            break
+                        else:
+                            cdaEmbedLink += decodedString[x]
+
+            if cdaEmbedLink == '':
+                raise Exception("Failed to get CDA Link!")
+
+            embedPage = await execute_proxied_request(self, cdaEmbedLink)
+            cdaVidLink = str(embedPage.find("h1", class_="title").find("a")["href"])
+
+            # TODO: GET VIDEO LINK DATA FROM CDA LINK
+
+
+        except Exception as e:
+            print(str(e))
+            raise
+
+        return episodes
diff --git a/SyncService/Models/Websites/DesuonlineWebsite.cs b/SyncService/Models/Websites/DesuonlineWebsite.cs
new file mode 100644
index 0000000..cc02df1
--- /dev/null
+++ b/SyncService/Models/Websites/DesuonlineWebsite.cs
@@ -0,0 +1,28 @@
+using Commons;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace SyncService.Models.Websites
+{
+    public class DesuonlineWebsite : IWebsite
+    {
+        public DesuonlineWebsite(Website website) : base(website)
+        {
+        }
+
+        public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle)
+        {
+            Console.WriteLine(anime.Titles["en"] + ": " + matching.Title);
+
+            return base.AnalyzeMatching(anime, matching, sourceTitle);
+        }
+
+        public override Dictionary<string, string> GetVideoProxyHeaders(AnimeMatching matching, Dictionary<string, string> values = null)
+        {
+            return null;
+        }
+    }
+}
diff --git a/SyncService/Services/WebsiteScraperService.cs b/SyncService/Services/WebsiteScraperService.cs
index 4331390..2add091 100644
--- a/SyncService/Services/WebsiteScraperService.cs
+++ b/SyncService/Services/WebsiteScraperService.cs
@@ -84,6 +84,9 @@ public override async Task Start(CancellationToken cancellationToken)
                 case "gogoanime":
                     iWeb = new GogoanimeWebsite(website);
                     break;
+                case "desuonline":
+                    iWeb = new DesuonlineWebsite(website);
+                    break;
 
                 default:
                     throw new Exception($"Website {website.Name} not handled!");

From 54a58dffb4284f858867e9a00308bd5a3ab8c3f5 Mon Sep 17 00:00:00 2001
From: Kapilarny <50770669+Kapilarny@users.noreply.github.com>
Date: Fri, 22 Apr 2022 20:24:24 +0200
Subject: [PATCH 03/23] Preparing for adding animepisode and desu-online

---
 ScraperEngine/resources/animepisode.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/ScraperEngine/resources/animepisode.py b/ScraperEngine/resources/animepisode.py
index 509a291..bbf393d 100644
--- a/ScraperEngine/resources/animepisode.py
+++ b/ScraperEngine/resources/animepisode.py
@@ -65,11 +65,10 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]:
         episodes = []
         url = f"{self.base_url}{path}"
         try:
-            # Btw im going on a trip, so ill be in car with shitty internet in like 20mins, so you should copy your code
-            # Yeah, i can commit
-            # could you commit your code to your github? and add me as a collaborator? 
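+            # Scrape flow for a single episode: fetch the page, then grab the
+            # #anime-embed-container node that wraps the player (selected just below).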
page = await execute_proxied_request(self, url) - article = page.select_one("#main").find("article") + video_container = page.select_one("#anime-embed-container") + + except Exception as e: From bef79358bd61a42d9566ef057ec8096392120775 Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Fri, 22 Apr 2022 11:44:34 -0700 Subject: [PATCH 04/23] Add requirements.txt --- ScraperEngine/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 ScraperEngine/requirements.txt diff --git a/ScraperEngine/requirements.txt b/ScraperEngine/requirements.txt new file mode 100644 index 0000000..b8a9fdd --- /dev/null +++ b/ScraperEngine/requirements.txt @@ -0,0 +1,3 @@ +beautifulsoup4 +falcon +aiohttp \ No newline at end of file From 1b4c583bd4f533787ab784137e33ff4719f26f42 Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Fri, 22 Apr 2022 12:23:28 -0700 Subject: [PATCH 05/23] Update requirements.txt Update requirements and add all deps --- ScraperEngine/requirements.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ScraperEngine/requirements.txt b/ScraperEngine/requirements.txt index b8a9fdd..322702b 100644 --- a/ScraperEngine/requirements.txt +++ b/ScraperEngine/requirements.txt @@ -1,3 +1,6 @@ beautifulsoup4 falcon -aiohttp \ No newline at end of file +aiohttp +brotli +ujson +pymongo \ No newline at end of file From 188ec1b28a52e13daa82669382863d34c88773d6 Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Fri, 22 Apr 2022 12:23:48 -0700 Subject: [PATCH 06/23] Add Animepisode to main --- ScraperEngine/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ScraperEngine/main.py b/ScraperEngine/main.py index 157db17..699b430 100644 --- a/ScraperEngine/main.py +++ b/ScraperEngine/main.py @@ -4,6 +4,7 @@ from resources.dreamsub import DreamsubResource from resources.gogoanime import GogoanimeResource +from resources.animepisode import AnimepisodeResource from resources.aniplaylist import AniplaylistResource @@ -11,6 +12,7 @@ DreamsubResource(app) AnimeworldResource(app) +AnimepisodeResource(app) GogoanimeResource(app) From 2b3231e7f505bd0f7cd27e337527632c729569d7 Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Fri, 22 Apr 2022 12:24:13 -0700 Subject: [PATCH 07/23] Add Animepisode scraper --- ScraperEngine/resources/animepisode.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ScraperEngine/resources/animepisode.py b/ScraperEngine/resources/animepisode.py index bbf393d..ffba866 100644 --- a/ScraperEngine/resources/animepisode.py +++ b/ScraperEngine/resources/animepisode.py @@ -15,7 +15,7 @@ def __init__(self, app: falcon.App) -> None: # On this line, use the name you used inside MongoDB's websites collection super().__init__(app, "animepisode") - def fix_title(self, title: str) -> str: + def fix_title(self, title: str): # set the variable "is_dubbed" to true if the title contains "Dubbed" in it, otherwise set it to false, do this using regexp is_dubbed = title in "Dubbed" title = title.replace("Dubbed", "").replace("Subbed", "").replace("English", "").strip() @@ -44,12 +44,12 @@ def fix_title(self, title: str) -> str: async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] - url = f"{self.base_url}/search?q={title}" + url = f"{self.base_url}/?s={uri.encode(title)}" try: page = await execute_proxied_request(self, 
url) articles = page.select_one("#main").find_all("article") for article in articles: - content = article.find({ "class": "blog-entry-inner" }).find({"class": "blog-entry-content"}) + content = article.find(class_="blog-entry-inner").find(class_="blog-entry-content") link = content.find("header").find().find("a") title = link.text url = link.get("href") @@ -62,14 +62,18 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List return matchings async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: - episodes = [] + episodes: List[Episode] = [] url = f"{self.base_url}{path}" try: page = await execute_proxied_request(self, url) - video_container = page.select_one("#anime-embed-container") - + embed_url = str(page.find("iframe").get("src")) + page = await execute_proxied_request(self, embed_url, { + "Referer": "https://animepisode.com/" + }) + video = page.select_one("video") + video_url = video.select_one('source').get('src') + episodes.append(Episode(f"Episode {number}", video_url, video_url, format="mp4", quality=None)) - except Exception as e: print(str(e)) From 9b3f028a3e1fafd6eef7bbf4be9fe415c7339616 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 22 Apr 2022 22:28:25 +0200 Subject: [PATCH 08/23] Added desu-online.pl support for ScrapperEngine --- ScraperEngine/main.py | 6 ++- ScraperEngine/resources/desuonline.py | 68 ++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/ScraperEngine/main.py b/ScraperEngine/main.py index 699b430..9856e65 100644 --- a/ScraperEngine/main.py +++ b/ScraperEngine/main.py @@ -8,6 +8,8 @@ from resources.aniplaylist import AniplaylistResource +from resources.desuonline import DesuonlineResource + app = falcon.asgi.App() DreamsubResource(app) @@ -16,4 +18,6 @@ GogoanimeResource(app) -AniplaylistResource(app) \ No newline at end of file +AniplaylistResource(app) + +DesuonlineResource(app) \ No newline at end of file diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py index accd3b6..0cba453 100644 --- a/ScraperEngine/resources/desuonline.py +++ b/ScraperEngine/resources/desuonline.py @@ -1,4 +1,7 @@ import base64 +from enum import Enum +from os import link +from tokenize import String import falcon import aiohttp import re @@ -11,12 +14,74 @@ from models.matching import Matching from utils.session import execute_proxied_request, get_proxied_response_json_get +remove_keys = { "_XDDD", "_CDA", "_ADC", "_CXD", "_QWE", "_Q5", "_IKSDE" } + +regex_link = re.compile("https:\/\/www.cda.pl\/video\/([^\/\s]+)") +regex_file = re.compile("""file"":""(.*?)(?:"")""") + +class VideoQuality(Enum): + auto = 0, + p360 = 360, + p480 = 480, + p720 = 720, + p1080 = 1080 class DesuonlineResource(ScraperResource): def __init__(self, app: falcon.App) -> None: print("DesuOnline initialized!") super().__init__(app, "desuonline") + + async def get_mp4_link(cda_link, quality: VideoQuality.auto, https: False) -> String: + if cda_link.endswith("/vfilm"): + cda_link = cda_link[:len(cda_link)-5] + + if cda_link.endswith("/"): + cda_link = cda_link[:len(cda_link)-1] + + if cda_link.startsWith("http://"): + cutLink = "" + + for x in range(len(cda_link)): + if x >= 7: + continue + else: + cutLink += cda_link[x] + + cda_link = "https://" + cutLink + + if not re.match(regex_link, cda_link): + return None + + cdaPage = await execute_proxied_request(cda_link, { + "Referer": "https://www.cda.pl", + "User-Agent": 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0", + "Accept-Encoding": "identity" + }) + + match = regex_file.match(cdaPage) + + if match and match.groups().count() >= 2: + key = match.groups()[0] + decryptedString = "" + + for vkey in remove_keys: + key = key.replace(vkey, "") + + for c in key: + if (c >= 33 and c <= 126): + decryptedString += (33 + ((c + 14) % 94)) + else: + decryptedString += c + + decryptedString = decryptedString.replace(".cda.mp4", "") + decryptedString = decryptedString.replace(".2cda.pl", ".cda.pl") + decryptedString = decryptedString.replace(".3cda.pl", ".cda.pl") + + if https: + return "https://" + decryptedString + ".mp4" + else: + return "http://" + decryptedString + ".mp4" async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] @@ -87,8 +152,9 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis embedPage = await execute_proxied_request(self, cdaEmbedLink) cdaVidLink = str(embedPage.find("h1", class_="title").find("a")["href"]) - # TODO: GET VIDEO LINK DATA FROM CDA LINK + dlLink = await get_mp4_link(cdaVidLink, VideoQuality.p1080) + episodes.append(Episode(f"Odcinek {number}", url, dlLink, 1080, "mp4")) except Exception as e: print(str(e)) From 97573aca84559bac043641addb6783f78e3f4c20 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 22 Apr 2022 23:00:34 +0200 Subject: [PATCH 09/23] Add new websites to WebsiteScraperService.cs --- .../Models/Websites/AnimepisodeWebsite.cs | 26 +++++++++++++++++++ .../Models/Websites/DesuonlineWebsite.cs | 2 -- SyncService/Services/WebsiteScraperService.cs | 3 +++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 SyncService/Models/Websites/AnimepisodeWebsite.cs diff --git a/SyncService/Models/Websites/AnimepisodeWebsite.cs b/SyncService/Models/Websites/AnimepisodeWebsite.cs new file mode 100644 index 0000000..99ae917 --- /dev/null +++ b/SyncService/Models/Websites/AnimepisodeWebsite.cs @@ -0,0 +1,26 @@ +using Commons; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace SyncService.Models.Websites +{ + public class AnimepisodeWebsite : IWebsite + { + public AnimepisodeWebsite(Website website) : base(website) + { + } + + public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) + { + return base.AnalyzeMatching(anime, matching, sourceTitle); + } + + public override Dictionary GetVideoProxyHeaders(AnimeMatching matching, Dictionary values = null) + { + return null; + } + } +} diff --git a/SyncService/Models/Websites/DesuonlineWebsite.cs b/SyncService/Models/Websites/DesuonlineWebsite.cs index cc02df1..e0d4ecf 100644 --- a/SyncService/Models/Websites/DesuonlineWebsite.cs +++ b/SyncService/Models/Websites/DesuonlineWebsite.cs @@ -15,8 +15,6 @@ public DesuonlineWebsite(Website website) : base(website) public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) { - Console.WriteLine(anime.Titles["en"] + ": " + matching.Title); - return base.AnalyzeMatching(anime, matching, sourceTitle); } diff --git a/SyncService/Services/WebsiteScraperService.cs b/SyncService/Services/WebsiteScraperService.cs index 2add091..202d6c1 100644 --- a/SyncService/Services/WebsiteScraperService.cs +++ b/SyncService/Services/WebsiteScraperService.cs @@ -84,6 +84,9 @@ public override async Task Start(CancellationToken 
cancellationToken) case "gogoanime": iWeb = new GogoanimeWebsite(website); break; + case "animepisode": + iWeb = new AnimepisodeWebsite(website); + break; case "desuonline": iWeb = new DesuonlineWebsite(website); break; From f398706e6859591f0bb3490ca1bc80240fa9c4f6 Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Fri, 22 Apr 2022 15:54:17 -0700 Subject: [PATCH 10/23] Fix anime matching on Animepisode since they use peculiar names for their episodes --- .../Models/Websites/AnimepisodeWebsite.cs | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/SyncService/Models/Websites/AnimepisodeWebsite.cs b/SyncService/Models/Websites/AnimepisodeWebsite.cs index 99ae917..877ce58 100644 --- a/SyncService/Models/Websites/AnimepisodeWebsite.cs +++ b/SyncService/Models/Websites/AnimepisodeWebsite.cs @@ -4,18 +4,54 @@ using System.Linq; using System.Text; using System.Threading.Tasks; +using System.Text.RegularExpressions; + + namespace SyncService.Models.Websites { + class IAnimeInfo { + public string title { get; set; } + public int episodeNumber { get; set; } + public bool IsDub { get; set; } + } public class AnimepisodeWebsite : IWebsite { + private IAnimeInfo FixTitle(string title) + { + // Remove all numbers from the title using regexp + Regex rx = new Regex(@"(\d+)(?!.*\d)"); + + // Use the regex "rx" to remove the last number in title and store it in "episodeNumber" + int episodeNumber = int.Parse(rx.Match(title).Value); + title = rx.Replace(title, ""); + + // Check if the title contains "Dub" or "Dubbed" + bool IsDub = title.Contains("Dubbed"); + + // Remove "Dubbed" and "Subbed" from the title + title = title.Replace("Dubbed", ""); + title = title.Replace("Subbed", ""); + // Remove "Episode" and "English" from the title + title = title.Replace("Episode", ""); + title = title.Replace("English", ""); + + // Return the title, episode number and if it is a dub + return new IAnimeInfo() { title = title, episodeNumber = episodeNumber, IsDub = IsDub }; + + + } public AnimepisodeWebsite(Website website) : base(website) { } public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) { - return base.AnalyzeMatching(anime, matching, sourceTitle); + + + IAnimeInfo animeInfo = FixTitle(sourceTitle); + matching.IsDub = animeInfo.IsDub; + return base.AnalyzeMatching(anime, matching, animeInfo.title); } public override Dictionary GetVideoProxyHeaders(AnimeMatching matching, Dictionary values = null) From 72dae7a42217740ead81ed68ab6ae4b971d7992d Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Sat, 23 Apr 2022 09:32:11 +0200 Subject: [PATCH 11/23] Added loop for all video qualities --- ScraperEngine/resources/desuonline.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py index 0cba453..6ccc554 100644 --- a/ScraperEngine/resources/desuonline.py +++ b/ScraperEngine/resources/desuonline.py @@ -152,9 +152,12 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis embedPage = await execute_proxied_request(self, cdaEmbedLink) cdaVidLink = str(embedPage.find("h1", class_="title").find("a")["href"]) - dlLink = await get_mp4_link(cdaVidLink, VideoQuality.p1080) - - episodes.append(Episode(f"Odcinek {number}", url, dlLink, 1080, "mp4")) + for quality in VideoQuality: + if quality == VideoQuality.auto: + continue + dlLink = await 
get_mp4_link(cdaVidLink, quality) + if dlLink != None: + episodes.append(Episode(f"Odcinek {number}", url, dlLink, quality.value, "mp4")) except Exception as e: print(str(e)) From a7a7384e3328a6cbfd355fd74248db771f092b8c Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Sat, 23 Apr 2022 15:03:47 +0200 Subject: [PATCH 12/23] Added search for desuonline --- ScraperEngine/resources/desuonline.py | 97 +++++++++++++++++---------- 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py index 6ccc554..e3c41db 100644 --- a/ScraperEngine/resources/desuonline.py +++ b/ScraperEngine/resources/desuonline.py @@ -1,10 +1,14 @@ import base64 from enum import Enum +import http from os import link +from tkinter import N from tokenize import String +from urllib import request import falcon import aiohttp import re +import requests from falcon import uri from typing import List @@ -16,14 +20,13 @@ remove_keys = { "_XDDD", "_CDA", "_ADC", "_CXD", "_QWE", "_Q5", "_IKSDE" } -regex_link = re.compile("https:\/\/www.cda.pl\/video\/([^\/\s]+)") -regex_file = re.compile("""file"":""(.*?)(?:"")""") +regex_file = re.compile('"""file"":""(.*?)(?:"")""') class VideoQuality(Enum): - auto = 0, - p360 = 360, - p480 = 480, - p720 = 720, + auto = 0 + p360 = 360 + p480 = 480 + p720 = 720 p1080 = 1080 class DesuonlineResource(ScraperResource): @@ -32,14 +35,14 @@ def __init__(self, app: falcon.App) -> None: print("DesuOnline initialized!") super().__init__(app, "desuonline") - async def get_mp4_link(cda_link, quality: VideoQuality.auto, https: False) -> String: + async def get_mp4_link(self, cda_link, quality: VideoQuality.auto, https) -> String: if cda_link.endswith("/vfilm"): cda_link = cda_link[:len(cda_link)-5] if cda_link.endswith("/"): cda_link = cda_link[:len(cda_link)-1] - if cda_link.startsWith("http://"): + if cda_link.startswith("http://"): cutLink = "" for x in range(len(cda_link)): @@ -48,16 +51,23 @@ async def get_mp4_link(cda_link, quality: VideoQuality.auto, https: False) -> St else: cutLink += cda_link[x] - cda_link = "https://" + cutLink + cda_link = "https://www." 
+ cutLink - if not re.match(regex_link, cda_link): - return None + if quality != VideoQuality.auto: + cda_link = cda_link + f"?wersja={quality.value}p" + + print("Getting page: " + cda_link) + + headers = { + 'Referer': 'https://www.cda.pl', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0', + 'Accept-Encoding': 'identity', + } - cdaPage = await execute_proxied_request(cda_link, { - "Referer": "https://www.cda.pl", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0", - "Accept-Encoding": "identity" - }) + cdaPage = requests.get(cda_link, headers=headers).text + + with open("Output.txt", "w") as text_file: + text_file.write(cdaPage) match = regex_file.match(cdaPage) @@ -82,11 +92,13 @@ async def get_mp4_link(cda_link, quality: VideoQuality.auto, https: False) -> St return "https://" + decryptedString + ".mp4" else: return "http://" + decryptedString + ".mp4" + else: + raise Exception("No regex matches") async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] - url = f"{self.base_url}/?s={uri.encode(title)}" + url = f"{self.base_url}?s={uri.encode(title)}" print(url) try: @@ -97,26 +109,31 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List page = await execute_proxied_request(self, url) try: - show_elements = page.find(class_="bixbox").find("div", class_="listupd").find_all("article") + show_elements = page.find("div", class_="listupd").find_all("article", class_="bs") + print(len(show_elements)) if len(show_elements) == 0: raise Exception for show_element in show_elements: - element = show_element.find(class_="bsx").find("a") - path = str(element["href"]).replace(self.base_url, "") - - matchings.append(Matching(element["oldtitle"], path)) - - url = f"{self.base_url}/page/{page_number}/?s={uri.encode(title)}" - page_number = page_number + 1 + path = str(show_element.find_next("a")["href"]).replace(self.base_url, "") + + print("Adding to matchings") + match = show_element.find("div", class_="tt").find_next("h2").string + matchings.append(Matching(match, path)) + + if len(show_elements == 10): + url = f"{self.base_url}page/{page_number}/?s={uri.encode(title)}" + page_number = page_number + 1 + else: + has_ended = True except: has_ended = True except Exception as e: print(str(e)) raise - + return matchings async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: @@ -127,35 +144,43 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis try: page = await execute_proxied_request(self, url) - epList = page.find("div", class_="eplister").find("ul").find_all("li").reverse() + epList = page.find("div", class_="eplister").find_next("ul").find_all("li") episodeLink = str(epList[number].find("a")["href"]) + print(episodeLink) episodePage = await execute_proxied_request(self, episodeLink) sourcesList = episodePage.find("select", class_="mirror").find_all("option") - cdaEmbedLink = '' + cdaVidLink = '' for option in sourcesList: - decodedString = base64.b64decode(str(option["value"])) + decodedString = base64.b64decode(str(option["value"])).decode('ascii') + cdaEmbedLink = '' if "https://ebd.cda.pl/" in decodedString: - for x in range(13, decodedString.__len__()): + for x in range(13, len(decodedString)): if decodedString[x] == '"': break else: cdaEmbedLink += decodedString[x] - - if cdaEmbedLink == '': + + if cdaEmbedLink == '': + raise Exception("Failed to get CDA 
Embed Link!") + + videoID = cdaEmbedLink.split('/')[-1] + cdaVidLink = f"https://cda.pl/video/{videoID}" + + if cdaVidLink == '': raise Exception("Failed to get CDA Link!") - - embedPage = await execute_proxied_request(self, cdaEmbedLink) - cdaVidLink = str(embedPage.find("h1", class_="title").find("a")["href"]) + + print(cdaVidLink) for quality in VideoQuality: if quality == VideoQuality.auto: continue - dlLink = await get_mp4_link(cdaVidLink, quality) + dlLink = await self.get_mp4_link(cdaVidLink, quality, False) + print(dlLink) if dlLink != None: episodes.append(Episode(f"Odcinek {number}", url, dlLink, quality.value, "mp4")) From 563b61be6eaabacc902c3c79fc92abfc54c4f7ee Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Sat, 23 Apr 2022 16:06:54 +0200 Subject: [PATCH 13/23] Added Full Support for desu-online.pl :D --- ScraperEngine/requirements.txt | 3 +- ScraperEngine/resources/animepisode.py | 12 ------ ScraperEngine/resources/desuonline.py | 57 ++++---------------------- 3 files changed, 9 insertions(+), 63 deletions(-) diff --git a/ScraperEngine/requirements.txt b/ScraperEngine/requirements.txt index 322702b..d76c1cd 100644 --- a/ScraperEngine/requirements.txt +++ b/ScraperEngine/requirements.txt @@ -3,4 +3,5 @@ falcon aiohttp brotli ujson -pymongo \ No newline at end of file +pymongo +cda_downloader \ No newline at end of file diff --git a/ScraperEngine/resources/animepisode.py b/ScraperEngine/resources/animepisode.py index ffba866..043d173 100644 --- a/ScraperEngine/resources/animepisode.py +++ b/ScraperEngine/resources/animepisode.py @@ -24,24 +24,12 @@ def fix_title(self, title: str): episode_number = re.findall(r'\d+', title)[-1] title = title.replace(episode_number, "").strip() - - return { "title": title, "is_dubbed": is_dubbed, "episode_number": episode_number } - - - - - - - - - - async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] url = f"{self.base_url}/?s={uri.encode(title)}" diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py index e3c41db..07514b3 100644 --- a/ScraperEngine/resources/desuonline.py +++ b/ScraperEngine/resources/desuonline.py @@ -18,9 +18,7 @@ from models.matching import Matching from utils.session import execute_proxied_request, get_proxied_response_json_get -remove_keys = { "_XDDD", "_CDA", "_ADC", "_CXD", "_QWE", "_Q5", "_IKSDE" } - -regex_file = re.compile('"""file"":""(.*?)(?:"")""') +from cda_downloader import CDA class VideoQuality(Enum): auto = 0 @@ -55,45 +53,8 @@ async def get_mp4_link(self, cda_link, quality: VideoQuality.auto, https) -> Str if quality != VideoQuality.auto: cda_link = cda_link + f"?wersja={quality.value}p" - - print("Getting page: " + cda_link) - headers = { - 'Referer': 'https://www.cda.pl', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0', - 'Accept-Encoding': 'identity', - } - - cdaPage = requests.get(cda_link, headers=headers).text - - with open("Output.txt", "w") as text_file: - text_file.write(cdaPage) - - match = regex_file.match(cdaPage) - - if match and match.groups().count() >= 2: - key = match.groups()[0] - decryptedString = "" - - for vkey in remove_keys: - key = key.replace(vkey, "") - - for c in key: - if (c >= 33 and c <= 126): - decryptedString += (33 + ((c + 14) % 94)) - else: - decryptedString += c - - decryptedString = decryptedString.replace(".cda.mp4", "") - decryptedString = decryptedString.replace(".2cda.pl", ".cda.pl") - 
decryptedString = decryptedString.replace(".3cda.pl", ".cda.pl") - - if https: - return "https://" + decryptedString + ".mp4" - else: - return "http://" + decryptedString + ".mp4" - else: - raise Exception("No regex matches") + return CDA().get_video_urls(urls=cda_link, only_urls=True, quality=quality.value) async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] @@ -110,15 +71,13 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List try: show_elements = page.find("div", class_="listupd").find_all("article", class_="bs") - print(len(show_elements)) if len(show_elements) == 0: raise Exception for show_element in show_elements: path = str(show_element.find_next("a")["href"]).replace(self.base_url, "") - - print("Adding to matchings") + match = show_element.find("div", class_="tt").find_next("h2").string matchings.append(Matching(match, path)) @@ -140,17 +99,17 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis episodes = [] url = f"{self.base_url}{path}" - print(url) try: page = await execute_proxied_request(self, url) epList = page.find("div", class_="eplister").find_next("ul").find_all("li") + episodeList = [] - episodeLink = str(epList[number].find("a")["href"]) - print(episodeLink) + for i in reversed(range(len(epList))): + episodeList.append(epList[i]) + episodeLink = str(episodeList[number - 1].find("a")["href"]) episodePage = await execute_proxied_request(self, episodeLink) - sourcesList = episodePage.find("select", class_="mirror").find_all("option") cdaVidLink = '' @@ -174,8 +133,6 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis if cdaVidLink == '': raise Exception("Failed to get CDA Link!") - print(cdaVidLink) - for quality in VideoQuality: if quality == VideoQuality.auto: continue From f373ffc814bc8884311b3e08b6500c1aecd12038 Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Sat, 23 Apr 2022 22:58:17 -0700 Subject: [PATCH 14/23] Add Animegg resource --- ScraperEngine/main.py | 2 ++ ScraperEngine/resources/animegg.py | 52 ++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 ScraperEngine/resources/animegg.py diff --git a/ScraperEngine/main.py b/ScraperEngine/main.py index 9856e65..a1d2603 100644 --- a/ScraperEngine/main.py +++ b/ScraperEngine/main.py @@ -1,5 +1,6 @@ import falcon import falcon.asgi +from resources.animegg import AnimeggResource from resources.animeworld import AnimeworldResource from resources.dreamsub import DreamsubResource @@ -15,6 +16,7 @@ DreamsubResource(app) AnimeworldResource(app) AnimepisodeResource(app) +AnimeggResource(app) GogoanimeResource(app) diff --git a/ScraperEngine/resources/animegg.py b/ScraperEngine/resources/animegg.py new file mode 100644 index 0000000..2baca3e --- /dev/null +++ b/ScraperEngine/resources/animegg.py @@ -0,0 +1,52 @@ +import falcon +import aiohttp +from utils.session import execute_proxied_request + +from falcon import uri +from typing import List +from bs4 import BeautifulSoup +from interfaces.resource import ScraperResource +from models.episode import Episode +from models.matching import Matching + +class AnimeggResource(ScraperResource): + + def __init__(self, app: falcon.App) -> None: + # On this line, use the name you used inside MongoDB's websites collection + super().__init__(app, "animegg") + + async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: + matchings = [] 
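+        # AnimeGG's search results are plain HTML: each hit is an anchor with class
+        # "mse" whose href is the series path and whose nested
+        # .searchre > .media-body > .first > h2 node carries the display title;
+        # those are exactly the selectors the loop below relies on.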
+ url = f"{self.base_url}/search/?q={uri.encode(title)}" + try: + page = await execute_proxied_request(self, url) + # Search results class is "mse" + results = page.find_all(class_="mse") + for result in results: + url = result.get("href") + title = result.select_one(".searchre > .media-body > .first > h2").text + matchings.append(Matching(title, url)) + except Exception as e: + print(str(e)) + raise + + return matchings + + async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: + episodes = [] + series_name = path.split("/")[-1] + url = f"{self.base_url}/{series_name}-episode-{number}" + try: + page = await execute_proxied_request(self, url) + title = page.select_one(".e4tit").text + iframe = page.find("iframe", class_="video") + page = await execute_proxied_request(self, f"{self.base_url}{iframe.get('src')}") + video = page.select_one("video") + url = video.get("src") + episodes.append(Episode(title, url, url, format="mp4")) + + except Exception as e: + print(str(e)) + raise + + return episodes \ No newline at end of file From d08b16ce939e707245dac180a0f78e2e2e3221f6 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Mon, 25 Apr 2022 16:44:34 +0200 Subject: [PATCH 15/23] Optimalization for desuonline, and some bug fixes --- ScraperEngine/resources/animegg.py | 20 ++++++-- ScraperEngine/resources/animepisode.py | 6 +-- ScraperEngine/resources/desuonline.py | 63 +++++++++++++------------- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/ScraperEngine/resources/animegg.py b/ScraperEngine/resources/animegg.py index 2baca3e..4463d46 100644 --- a/ScraperEngine/resources/animegg.py +++ b/ScraperEngine/resources/animegg.py @@ -35,15 +35,27 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: episodes = [] series_name = path.split("/")[-1] + + # Thats just some weird shit anime.gg does + if number == 1: + number = 0 + url = f"{self.base_url}/{series_name}-episode-{number}" + print(url) try: page = await execute_proxied_request(self, url) title = page.select_one(".e4tit").text + print(title) iframe = page.find("iframe", class_="video") - page = await execute_proxied_request(self, f"{self.base_url}{iframe.get('src')}") - video = page.select_one("video") - url = video.get("src") - episodes.append(Episode(title, url, url, format="mp4")) + print(iframe) + print(f"{self.base_url}{iframe.get('src')}") + embedPage = await execute_proxied_request(self, f"{self.base_url}{iframe.get('src')}") + print(embedPage) + video = embedPage.find_next("video") + print(video) + dlUrl = video["src"] + print(dlUrl) + episodes.append(Episode(title, url, dlUrl, format="mp4")) except Exception as e: print(str(e)) diff --git a/ScraperEngine/resources/animepisode.py b/ScraperEngine/resources/animepisode.py index 043d173..d9b0f4a 100644 --- a/ScraperEngine/resources/animepisode.py +++ b/ScraperEngine/resources/animepisode.py @@ -40,7 +40,7 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List content = article.find(class_="blog-entry-inner").find(class_="blog-entry-content") link = content.find("header").find().find("a") title = link.text - url = link.get("href") + url = link["href"] matchings.append(Matching(title, url)) except Exception as e: @@ -54,12 +54,12 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis url = f"{self.base_url}{path}" try: page 
= await execute_proxied_request(self, url) - embed_url = str(page.find("iframe").get("src")) + embed_url = str(page.find("iframe")["src"]) page = await execute_proxied_request(self, embed_url, { "Referer": "https://animepisode.com/" }) video = page.select_one("video") - video_url = video.select_one('source').get('src') + video_url = video.select_one('source').get['src'] episodes.append(Episode(f"Episode {number}", video_url, video_url, format="mp4", quality=None)) diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py index 07514b3..51af102 100644 --- a/ScraperEngine/resources/desuonline.py +++ b/ScraperEngine/resources/desuonline.py @@ -21,8 +21,6 @@ from cda_downloader import CDA class VideoQuality(Enum): - auto = 0 - p360 = 360 p480 = 480 p720 = 720 p1080 = 1080 @@ -30,10 +28,9 @@ class VideoQuality(Enum): class DesuonlineResource(ScraperResource): def __init__(self, app: falcon.App) -> None: - print("DesuOnline initialized!") super().__init__(app, "desuonline") - async def get_mp4_link(self, cda_link, quality: VideoQuality.auto, https) -> String: + async def get_mp4_link(self, cda_link, quality, https) -> String: if cda_link.endswith("/vfilm"): cda_link = cda_link[:len(cda_link)-5] @@ -51,10 +48,9 @@ async def get_mp4_link(self, cda_link, quality: VideoQuality.auto, https) -> Str cda_link = "https://www." + cutLink - if quality != VideoQuality.auto: - cda_link = cda_link + f"?wersja={quality.value}p" + cda_link = cda_link + f"?wersja={quality.value}p" - return CDA().get_video_urls(urls=cda_link, only_urls=True, quality=quality.value) + return CDA(use_api=False).get_video_urls(urls=cda_link, only_urls=True, quality=quality.value)[0] async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] @@ -98,48 +94,51 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: episodes = [] - url = f"{self.base_url}{path}" + url = f"{self.base_url}{path}-odcinek-{number}" try: - page = await execute_proxied_request(self, url) - epList = page.find("div", class_="eplister").find_next("ul").find_all("li") - episodeList = [] + # This here works, but theres a faster method + # But in case there are any bugs with current approach u you can use this - for i in reversed(range(len(epList))): - episodeList.append(epList[i]) + # url = f"{self.base_url}{path}" + # page = await execute_proxied_request(self, url) + # epList = page.find("div", class_="eplister").find_next("ul").find_all("li") + # episodeList = [] + + # for i in reversed(range(len(epList))): + # episodeList.append(epList[i]) + + # episodeLink = str(episodeList[number - 1].find("a")["href"]) + # episodePage = await execute_proxied_request(self, episodeLink) + + episodePage = await execute_proxied_request(self, url) - episodeLink = str(episodeList[number - 1].find("a")["href"]) - episodePage = await execute_proxied_request(self, episodeLink) sourcesList = episodePage.find("select", class_="mirror").find_all("option") cdaVidLink = '' for option in sourcesList: decodedString = base64.b64decode(str(option["value"])).decode('ascii') - cdaEmbedLink = '' + embedLink = '' + if "https://ebd.cda.pl/" in decodedString: - for x in range(13, len(decodedString)): - if decodedString[x] == '"': - break - else: - cdaEmbedLink += decodedString[x] + bs = BeautifulSoup(decodedString, 'html.parser') + embedLink = bs.find('iframe')["src"] - if cdaEmbedLink == '': + if 
embedLink == '': raise Exception("Failed to get CDA Embed Link!") - videoID = cdaEmbedLink.split('/')[-1] + videoID = embedLink.split('/')[-1] cdaVidLink = f"https://cda.pl/video/{videoID}" - if cdaVidLink == '': - raise Exception("Failed to get CDA Link!") + if cdaVidLink == '': + raise Exception("Failed to get CDA Link!") - for quality in VideoQuality: - if quality == VideoQuality.auto: - continue - dlLink = await self.get_mp4_link(cdaVidLink, quality, False) - print(dlLink) - if dlLink != None: - episodes.append(Episode(f"Odcinek {number}", url, dlLink, quality.value, "mp4")) + for quality in VideoQuality: + dlLink = await self.get_mp4_link(cdaVidLink, quality, False) + print(dlLink) + if dlLink != None: + episodes.append(Episode(f"Odcinek {number}", url, dlLink, quality.value, "mp4")) except Exception as e: print(str(e)) From 25aaca5434639b3cb2a9d1583572126dfb1e651f Mon Sep 17 00:00:00 2001 From: Artrix <39530102+Artrix9095@users.noreply.github.com> Date: Thu, 28 Apr 2022 12:35:24 -0700 Subject: [PATCH 16/23] Add animegg support --- ScraperEngine/main.py | 3 +- ScraperEngine/resources/animegg.py | 57 +++++++++++++++++++ SyncService/Models/Websites/AnimeggWebsite.cs | 30 ++++++++++ SyncService/Services/WebsiteScraperService.cs | 3 + 4 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 ScraperEngine/resources/animegg.py create mode 100644 SyncService/Models/Websites/AnimeggWebsite.cs diff --git a/ScraperEngine/main.py b/ScraperEngine/main.py index 157db17..2425177 100644 --- a/ScraperEngine/main.py +++ b/ScraperEngine/main.py @@ -1,5 +1,6 @@ import falcon import falcon.asgi +from resources.animegg import AnimeggResource from resources.animeworld import AnimeworldResource from resources.dreamsub import DreamsubResource @@ -11,7 +12,7 @@ DreamsubResource(app) AnimeworldResource(app) - +AnimeggResource(app) GogoanimeResource(app) AniplaylistResource(app) \ No newline at end of file diff --git a/ScraperEngine/resources/animegg.py b/ScraperEngine/resources/animegg.py new file mode 100644 index 0000000..2c2b179 --- /dev/null +++ b/ScraperEngine/resources/animegg.py @@ -0,0 +1,57 @@ +import falcon +import aiohttp +from utils.session import execute_proxied_request + +from falcon import uri +from typing import List +from bs4 import BeautifulSoup +from interfaces.resource import ScraperResource +from models.episode import Episode +from models.matching import Matching + + +class AnimeggResource(ScraperResource): + + def __init__(self, app: falcon.App) -> None: + # On this line, use the name you used inside MongoDB's websites collection + super().__init__(app, "yourwebsitenamehere") + + async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: + matchings = [] + url = f"{self.base_url}/search/?q={uri.encode(title)}" + try: + page = await execute_proxied_request(self, url) + # Search results class is "mse" + results = page.find_all(class_="mse") + for result in results: + url = result.get("href") + title = result.select_one(".searchre > .media-body > .first > h2").text + matchings.append(Matching(title, url)) + except Exception as e: + print(str(e)) + raise + return matchings + + async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: + episodes = [] + series_name = uri.encode(path.split("/")[-1].replace(" ", "-").replace(",", "")) + video_url = f"{self.base_url}/{series_name}-episode-{number}" + try: + page = await execute_proxied_request(self, video_url) + # Search results class is "mse" + iframe = 
page.find("iframe", class_="video") + for video in page.select("ul#videos > li.active > a"): + embed_id = video["data-id"] + is_dub = video["data-version"] == "dubbed" + quality_text = video.select_one("span.btn-hd").text + quality = 1080 if quality_text == "HD" else 480 + embed_url = f"{self.base_url}/embed/{embed_id}" + page = await execute_proxied_request(self, embed_url) + video_url = page.select_one("meta[property='og:video']")["content"] + episodes.append(Episode(f"Episode {number}", embed_url, video_url, quality, "mp4")) + + except Exception as e: + print(str(e)) + raise + + return episodes \ No newline at end of file diff --git a/SyncService/Models/Websites/AnimeggWebsite.cs b/SyncService/Models/Websites/AnimeggWebsite.cs new file mode 100644 index 0000000..38e7bdd --- /dev/null +++ b/SyncService/Models/Websites/AnimeggWebsite.cs @@ -0,0 +1,30 @@ +using Commons; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace SyncService.Models.Websites +{ + public class AnimeggWebsite : IWebsite + { + public AnimeggWebsite(Website website) : base(website) + { + } + + public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) + { + + return base.AnalyzeMatching(anime, matching, sourceTitle); + } + + public override Dictionary GetVideoProxyHeaders(AnimeMatching matching, Dictionary values = null) + { + return new Dictionary + { + { "referer", matching.EpisodePath } + }; + } + } +} diff --git a/SyncService/Services/WebsiteScraperService.cs b/SyncService/Services/WebsiteScraperService.cs index 4331390..50584be 100644 --- a/SyncService/Services/WebsiteScraperService.cs +++ b/SyncService/Services/WebsiteScraperService.cs @@ -84,6 +84,9 @@ public override async Task Start(CancellationToken cancellationToken) case "gogoanime": iWeb = new GogoanimeWebsite(website); break; + case "animegg": + iWeb = new AnimeggWebsite(website); + break; default: throw new Exception($"Website {website.Name} not handled!"); From d61c5de7a177f335797e2ee617349910a2b61270 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 17:34:06 +0200 Subject: [PATCH 17/23] AnimeGG fully scrapped Co-authored-by: Artrix --- ScraperEngine/main.py | 1 - ScraperEngine/resources/animegg.py | 10 ++-- ScraperEngine/resources/animepisode.py | 70 -------------------------- 3 files changed, 5 insertions(+), 76 deletions(-) delete mode 100644 ScraperEngine/resources/animepisode.py diff --git a/ScraperEngine/main.py b/ScraperEngine/main.py index 424782d..7543cac 100644 --- a/ScraperEngine/main.py +++ b/ScraperEngine/main.py @@ -5,7 +5,6 @@ from resources.dreamsub import DreamsubResource from resources.gogoanime import GogoanimeResource -from resources.animepisode import AnimepisodeResource from resources.aniplaylist import AniplaylistResource diff --git a/ScraperEngine/resources/animegg.py b/ScraperEngine/resources/animegg.py index 2c2b179..cacc1de 100644 --- a/ScraperEngine/resources/animegg.py +++ b/ScraperEngine/resources/animegg.py @@ -14,7 +14,7 @@ class AnimeggResource(ScraperResource): def __init__(self, app: falcon.App) -> None: # On this line, use the name you used inside MongoDB's websites collection - super().__init__(app, "yourwebsitenamehere") + super().__init__(app, "animegg") async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: matchings = [] @@ -38,16 +38,16 @@ async def get_episode(self, res: falcon.Response, path: str, 
number: int) -> Lis video_url = f"{self.base_url}/{series_name}-episode-{number}" try: page = await execute_proxied_request(self, video_url) - # Search results class is "mse" - iframe = page.find("iframe", class_="video") - for video in page.select("ul#videos > li.active > a"): + links = page.find("ul", id="videos").find_next("li").find_all("a") + for video in links: embed_id = video["data-id"] is_dub = video["data-version"] == "dubbed" quality_text = video.select_one("span.btn-hd").text quality = 1080 if quality_text == "HD" else 480 embed_url = f"{self.base_url}/embed/{embed_id}" page = await execute_proxied_request(self, embed_url) - video_url = page.select_one("meta[property='og:video']")["content"] + video_path = page.select_one("meta[property='og:video']")["content"] + video_url = f"{self.base_url}{video_path}" episodes.append(Episode(f"Episode {number}", embed_url, video_url, quality, "mp4")) except Exception as e: diff --git a/ScraperEngine/resources/animepisode.py b/ScraperEngine/resources/animepisode.py deleted file mode 100644 index d9b0f4a..0000000 --- a/ScraperEngine/resources/animepisode.py +++ /dev/null @@ -1,70 +0,0 @@ -import falcon -import aiohttp -from utils.session import execute_proxied_request -import re -from falcon import uri -from typing import List -from bs4 import BeautifulSoup -from interfaces.resource import ScraperResource -from models.episode import Episode -from models.matching import Matching - -class AnimepisodeResource(ScraperResource): - - def __init__(self, app: falcon.App) -> None: - # On this line, use the name you used inside MongoDB's websites collection - super().__init__(app, "animepisode") - - def fix_title(self, title: str): - # set the variable "is_dubbed" to true if the title contains "Dubbed" in it, otherwise set it to false, do this using regexp - is_dubbed = title in "Dubbed" - title = title.replace("Dubbed", "").replace("Subbed", "").replace("English", "").strip() - title = title.replace("Episode","").strip() - # Use Regexp to find the last number found in the title, pop it from the title and return it to the variable "episode_number" - episode_number = re.findall(r'\d+', title)[-1] - title = title.replace(episode_number, "").strip() - - return { - "title": title, - "is_dubbed": is_dubbed, - "episode_number": episode_number - } - - async def get_possible_matchings(self, res: falcon.Response, title: str) -> List[Matching]: - matchings = [] - url = f"{self.base_url}/?s={uri.encode(title)}" - try: - page = await execute_proxied_request(self, url) - articles = page.select_one("#main").find_all("article") - for article in articles: - content = article.find(class_="blog-entry-inner").find(class_="blog-entry-content") - link = content.find("header").find().find("a") - title = link.text - url = link["href"] - matchings.append(Matching(title, url)) - - except Exception as e: - print(str(e)) - raise - - return matchings - - async def get_episode(self, res: falcon.Response, path: str, number: int) -> List[Episode]: - episodes: List[Episode] = [] - url = f"{self.base_url}{path}" - try: - page = await execute_proxied_request(self, url) - embed_url = str(page.find("iframe")["src"]) - page = await execute_proxied_request(self, embed_url, { - "Referer": "https://animepisode.com/" - }) - video = page.select_one("video") - video_url = video.select_one('source').get['src'] - episodes.append(Episode(f"Episode {number}", video_url, video_url, format="mp4", quality=None)) - - - except Exception as e: - print(str(e)) - raise - - return episodes \ No newline at 
end of file From 4c143eef14f9142612afe7a5aa60f01a5ce9df1d Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 17:47:04 +0200 Subject: [PATCH 18/23] Added full support for AnimeGG Co-authored-by: Artrix --- ScraperEngine/resources/animegg.py | 8 ++- SyncService/Models/Websites/AnimeggWebsite.cs | 6 +- .../Models/Websites/AnimepisodeWebsite.cs | 62 ------------------- .../Models/Websites/DesuonlineWebsite.cs | 10 ++- SyncService/Services/WebsiteScraperService.cs | 3 - 5 files changed, 20 insertions(+), 69 deletions(-) delete mode 100644 SyncService/Models/Websites/AnimepisodeWebsite.cs diff --git a/ScraperEngine/resources/animegg.py b/ScraperEngine/resources/animegg.py index cacc1de..d8197e3 100644 --- a/ScraperEngine/resources/animegg.py +++ b/ScraperEngine/resources/animegg.py @@ -1,3 +1,4 @@ +from operator import is_ import falcon import aiohttp from utils.session import execute_proxied_request @@ -41,14 +42,19 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis links = page.find("ul", id="videos").find_next("li").find_all("a") for video in links: embed_id = video["data-id"] + is_dub = video["data-version"] == "dubbed" + vid_type = 'subbed' + if is_dub: + vid_type = "dubbed" + quality_text = video.select_one("span.btn-hd").text quality = 1080 if quality_text == "HD" else 480 embed_url = f"{self.base_url}/embed/{embed_id}" page = await execute_proxied_request(self, embed_url) video_path = page.select_one("meta[property='og:video']")["content"] video_url = f"{self.base_url}{video_path}" - episodes.append(Episode(f"Episode {number}", embed_url, video_url, quality, "mp4")) + episodes.append(Episode(f"Episode {number}", f"{embed_url}#{vid_type}", video_url, quality, "mp4")) except Exception as e: print(str(e)) diff --git a/SyncService/Models/Websites/AnimeggWebsite.cs b/SyncService/Models/Websites/AnimeggWebsite.cs index 38e7bdd..0521a0f 100644 --- a/SyncService/Models/Websites/AnimeggWebsite.cs +++ b/SyncService/Models/Websites/AnimeggWebsite.cs @@ -15,7 +15,9 @@ public AnimeggWebsite(Website website) : base(website) public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) { - + if(matching.EpisodePath.Contains("#dubbed")) { + matching.isDub = true; + } return base.AnalyzeMatching(anime, matching, sourceTitle); } @@ -23,7 +25,7 @@ public override Dictionary GetVideoProxyHeaders(AnimeMatching ma { return new Dictionary { - { "referer", matching.EpisodePath } + { "referer", matching.EpisodePath.Replace("#dubbed", "").Replace("#subbed", "") } }; } } diff --git a/SyncService/Models/Websites/AnimepisodeWebsite.cs b/SyncService/Models/Websites/AnimepisodeWebsite.cs deleted file mode 100644 index 877ce58..0000000 --- a/SyncService/Models/Websites/AnimepisodeWebsite.cs +++ /dev/null @@ -1,62 +0,0 @@ -using Commons; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.Text.RegularExpressions; - - - -namespace SyncService.Models.Websites -{ - class IAnimeInfo { - public string title { get; set; } - public int episodeNumber { get; set; } - public bool IsDub { get; set; } - } - public class AnimepisodeWebsite : IWebsite - { - private IAnimeInfo FixTitle(string title) - { - // Remove all numbers from the title using regexp - Regex rx = new Regex(@"(\d+)(?!.*\d)"); - - // Use the regex "rx" to remove the last number in title and store it in "episodeNumber" - int episodeNumber = 
int.Parse(rx.Match(title).Value); - title = rx.Replace(title, ""); - - // Check if the title contains "Dub" or "Dubbed" - bool IsDub = title.Contains("Dubbed"); - - // Remove "Dubbed" and "Subbed" from the title - title = title.Replace("Dubbed", ""); - title = title.Replace("Subbed", ""); - // Remove "Episode" and "English" from the title - title = title.Replace("Episode", ""); - title = title.Replace("English", ""); - - // Return the title, episode number and if it is a dub - return new IAnimeInfo() { title = title, episodeNumber = episodeNumber, IsDub = IsDub }; - - } - public AnimepisodeWebsite(Website website) : base(website) - { - } - - public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) - { - - - IAnimeInfo animeInfo = FixTitle(sourceTitle); - matching.IsDub = animeInfo.IsDub; - return base.AnalyzeMatching(anime, matching, animeInfo.title); - } - - public override Dictionary<string, string> GetVideoProxyHeaders(AnimeMatching matching, Dictionary<string, string> values = null) - { - return null; - } - } -} diff --git a/SyncService/Models/Websites/DesuonlineWebsite.cs b/SyncService/Models/Websites/DesuonlineWebsite.cs index e0d4ecf..e069af0 100644 --- a/SyncService/Models/Websites/DesuonlineWebsite.cs +++ b/SyncService/Models/Websites/DesuonlineWebsite.cs @@ -15,12 +15,20 @@ public DesuonlineWebsite(Website website) : base(website) public override bool AnalyzeMatching(Anime anime, AnimeMatching matching, string sourceTitle) { + if (matching.EpisodePath.Contains("#dubbed")) + { + matching.IsDub = true; + } + return base.AnalyzeMatching(anime, matching, sourceTitle); } public override Dictionary<string, string> GetVideoProxyHeaders(AnimeMatching matching, Dictionary<string, string> values = null) { - return null; + return new Dictionary<string, string> + { + { "referer", matching.EpisodePath.Replace("#dubbed", "").Replace("#subbed", "") } + }; } } } diff --git a/SyncService/Services/WebsiteScraperService.cs b/SyncService/Services/WebsiteScraperService.cs index 31b9074..63f43bf 100644 --- a/SyncService/Services/WebsiteScraperService.cs +++ b/SyncService/Services/WebsiteScraperService.cs @@ -87,9 +87,6 @@ public override async Task Start(CancellationToken cancellationToken) case "animegg": iWeb = new AnimeggWebsite(website); break; - case "animepisode": - iWeb = new AnimepisodeWebsite(website); - break; case "desuonline": iWeb = new DesuonlineWebsite(website); break; From 2e56535bd13f2b236473e243441c159808b23d12 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 17:52:13 +0200 Subject: [PATCH 19/23] Fix some merge conflicts --- WebAPI/Resources/localizations.1.1.5.json | 16 ++++++++++++++++ WebAPI/Resources/localizations.1_0.json | 22 +++++++++++----------- WebAPI/WebAPI.csproj | 2 ++ 3 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 WebAPI/Resources/localizations.1.1.5.json diff --git a/WebAPI/Resources/localizations.1.1.5.json b/WebAPI/Resources/localizations.1.1.5.json new file mode 100644 index 0000000..b4ae5ef --- /dev/null +++ b/WebAPI/Resources/localizations.1.1.5.json @@ -0,0 +1,16 @@ +{ + "localizations": [ + { + "i18n": "en", + "label": "English" + }, + { + "i18n": "it", + "label": "Italian" + }, + { + "i18n": "pl", + "label": "Polish" + } + ] + } \ No newline at end of file diff --git a/WebAPI/Resources/localizations.1_0.json b/WebAPI/Resources/localizations.1_0.json index 0e891cc..36ba13c 100644 --- a/WebAPI/Resources/localizations.1_0.json +++ b/WebAPI/Resources/localizations.1_0.json @@ -1,12 +1,12 @@ { - 
"i18n": "en", - "label": "English" - }, - { - "i18n": "it", - "label": "Italian" - } - ] -} \ No newline at end of file + "localizations": [ + { + "i18n": "en", + "label": "English" + }, + { + "i18n": "it", + "label": "Italian" + } + ] +} \ No newline at end of file diff --git a/WebAPI/WebAPI.csproj b/WebAPI/WebAPI.csproj index 045db2c..7d4dd05 100644 --- a/WebAPI/WebAPI.csproj +++ b/WebAPI/WebAPI.csproj @@ -16,6 +16,7 @@ + @@ -25,6 +26,7 @@ + From 87da999332d1c04defa0ba0ac6330e4e415a0b3e Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 17:58:46 +0200 Subject: [PATCH 20/23] Merge --- ScraperEngine/main.py | 8 +------- ScraperEngine/resources/desuonline.py | 27 +-------------------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/ScraperEngine/main.py b/ScraperEngine/main.py index 2826c06..43ce228 100644 --- a/ScraperEngine/main.py +++ b/ScraperEngine/main.py @@ -14,14 +14,8 @@ DreamsubResource(app) AnimeworldResource(app) -<<<<<<< HEAD -AnimeggResource(app) -GogoanimeResource(app) -AniplaylistResource(app) -DesuonlineResource(app) -======= GogoanimeResource(app) DesuonlineResource(app) +AnimeggResource(app) AniplaylistResource(app) ->>>>>>> d7199668263eb2b75d9a3687805f2f1593ab84ff diff --git a/ScraperEngine/resources/desuonline.py b/ScraperEngine/resources/desuonline.py index e43f92f..d9ad354 100644 --- a/ScraperEngine/resources/desuonline.py +++ b/ScraperEngine/resources/desuonline.py @@ -56,10 +56,6 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List matchings = [] url = f"{self.base_url}?s={uri.encode(title)}" -<<<<<<< HEAD - print(url) -======= ->>>>>>> d7199668263eb2b75d9a3687805f2f1593ab84ff try: has_ended = False @@ -75,11 +71,7 @@ async def get_possible_matchings(self, res: falcon.Response, title: str) -> List raise Exception for show_element in show_elements: -<<<<<<< HEAD - path = str(show_element.find_next("a")["href"]).replace(self.base_url, "") -======= path = str(show_element.find_next("a")["href"]).replace(f"{self.base_url}/anime", "")[:-1] ->>>>>>> d7199668263eb2b75d9a3687805f2f1593ab84ff match = show_element.find("div", class_="tt").find_next("h2").string matchings.append(Matching(match, path)) @@ -102,11 +94,7 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis episodes = [] url = f"{self.base_url}{path}-odcinek-{number}" -<<<<<<< HEAD -======= - print(url) ->>>>>>> d7199668263eb2b75d9a3687805f2f1593ab84ff - + try: # This here works, but theres a faster method # But in case there are any bugs with current approach u you can use this @@ -126,14 +114,6 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis sourcesList = episodePage.find("select", class_="mirror").find_all("option") -<<<<<<< HEAD - cdaVidLink = '' - - for option in sourcesList: - decodedString = base64.b64decode(str(option["value"])).decode('ascii') - embedLink = '' - -======= for option in sourcesList: decodedString = base64.b64decode(str(option["value"])).decode('ascii') @@ -146,7 +126,6 @@ async def get_episode(self, res: falcon.Response, path: str, number: int) -> Lis dlLink = f"https://drive.google.com/u/0/uc?id={videoID}&export=download&confirm=t" episodes.append(Episode(f"Odcinek {number}", url, dlLink, None, "mp4")) ->>>>>>> d7199668263eb2b75d9a3687805f2f1593ab84ff if "https://ebd.cda.pl/" in decodedString: bs = BeautifulSoup(decodedString, 'html.parser') embedLink = bs.find('iframe')["src"] @@ -162,10 +141,6 @@ async def 
get_episode(self, res: falcon.Response, path: str, number: int) -> Lis for quality in VideoQuality: dlLink = await self.get_mp4_link(cdaVidLink, quality, False) -<<<<<<< HEAD - print(dlLink) -======= ->>>>>>> d7199668263eb2b75d9a3687805f2f1593ab84ff if dlLink != None: episodes.append(Episode(f"Odcinek {number}", url, dlLink, quality.value, "mp4")) From 41a04b7ef17a3de7770a63fd082ddeb42c4b8dc3 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 18:03:15 +0200 Subject: [PATCH 21/23] Delete localizations.1.1.5.json --- WebAPI/Resources/localizations.1.1.5.json | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 WebAPI/Resources/localizations.1.1.5.json diff --git a/WebAPI/Resources/localizations.1.1.5.json b/WebAPI/Resources/localizations.1.1.5.json deleted file mode 100644 index b4ae5ef..0000000 --- a/WebAPI/Resources/localizations.1.1.5.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "localizations": [ - { - "i18n": "en", - "label": "English" - }, - { - "i18n": "it", - "label": "Italian" - }, - { - "i18n": "pl", - "label": "Polish" - } - ] - } \ No newline at end of file From 2e489a0d25c8e140883617ab3e4366f120530599 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 18:04:16 +0200 Subject: [PATCH 22/23] Some merge conflicts --- .../{localizations.1.1.5.json => localizations.1_1_5.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename WebAPI/Resources/{localizations.1.1.5.json => localizations.1_1_5.json} (100%) diff --git a/WebAPI/Resources/localizations.1.1.5.json b/WebAPI/Resources/localizations.1_1_5.json similarity index 100% rename from WebAPI/Resources/localizations.1.1.5.json rename to WebAPI/Resources/localizations.1_1_5.json From 87922e41dca4871edb655f35f6e2807fbeaee428 Mon Sep 17 00:00:00 2001 From: Kapilarny <50770669+Kapilarny@users.noreply.github.com> Date: Fri, 29 Apr 2022 18:07:00 +0200 Subject: [PATCH 23/23] a lil fix --- WebAPI/Resources/localizations.1_1_5.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebAPI/Resources/localizations.1_1_5.json b/WebAPI/Resources/localizations.1_1_5.json index b4ae5ef..7bbfd71 100644 --- a/WebAPI/Resources/localizations.1_1_5.json +++ b/WebAPI/Resources/localizations.1_1_5.json @@ -13,4 +13,4 @@ "label": "Polish" } ] - } \ No newline at end of file +} \ No newline at end of file
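A note on the two conventions these later patches settle on, since they are easy to miss in the merge noise. Patch 18 passes the sub/dub flag from the Python scraper to the C# sync service by appending a "#dubbed" or "#subbed" fragment to the embed URL, which AnalyzeMatching reads and GetVideoProxyHeaders strips again. Patch 20 resolves desuonline mirrors by base64-decoding each option of the "mirror" select and parsing the resulting iframe snippet with BeautifulSoup rather than slicing the string character by character as patch 02 did. What follows is a minimal sketch of both, not code from the repository: the helper names strip_version_fragment and find_cda_embed are hypothetical, and mirror_options is assumed to be the result of page.find("select", class_="mirror").find_all("option") as in desuonline.py.

import base64
from typing import Optional

from bs4 import BeautifulSoup


def strip_version_fragment(episode_path: str) -> str:
    # Patch 18 tags each AnimeGG embed URL with "#dubbed" or "#subbed" so
    # AnalyzeMatching can set matching.IsDub on the C# side; the fragment
    # has to be stripped again before the path is reused as a referer header.
    return episode_path.replace("#dubbed", "").replace("#subbed", "")


def find_cda_embed(mirror_options) -> Optional[str]:
    # Patch 20's mirror handling: every <option value> is a base64-encoded
    # iframe snippet. Decode each one and, when it points at CDA, let
    # BeautifulSoup extract the embed URL from the snippet.
    for option in mirror_options:
        decoded = base64.b64decode(str(option["value"])).decode("ascii")
        if "https://ebd.cda.pl/" in decoded:
            iframe = BeautifulSoup(decoded, "html.parser").find("iframe")
            if iframe is not None:
                return str(iframe["src"])
    return None

The fragment trick works because a URL fragment is never sent to the server, so tagging the embed URL costs nothing at request time; it only has to be removed where the path doubles as a referer value, which is exactly what GetVideoProxyHeaders does in AnimeggWebsite.cs and DesuonlineWebsite.cs.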