diff --git a/Cargo.lock b/Cargo.lock index 28c77b4..da51b77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2022,14 +2022,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.7" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3a43854..4f65ea8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,9 @@ edition = "2024" [features] default = ["telegram"] -telegram = ["dep:teloxide", "twitter"] +telegram = ["dep:teloxide", "twitter", "tiktok"] twitter = [] +tiktok = [] [dependencies] async-trait = "0.1.89" diff --git a/README.md b/README.md index 2df1629..b163e77 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A small project to explore the world of Telegram, Discord, Instagram, X (formerl ## ☀️ Overview -Bot-RS is a cross-platform bot framework that scrapes real-time data from Instagram, X (formerly Twitter), and TikTok, delivering it to users through stylish and intuitive bots — whether you’re chatting on Telegram or hanging out on Discord. +Bot-RS is a cross-platform bot framework that scrapes real-time data from Instagram, X, and TikTok, delivering it to users through stylish and intuitive bots — whether you’re chatting on Telegram or hanging out on Discord. ## 🚀 Quick Start @@ -24,7 +24,8 @@ cargo run Once the bot is running, you can interact with it using the following commands: * `/help` (aliases: `/h`, `/?`) - Display available commands -* `/twitter ` (alias: `/t`) - Download media from a X (Twitter) post +* `/twitter ` (alias: `/t`) - Download media from a X post +* `/tiktok ` (alias: `/tk`) - Download media from a TikTok post ## 🌍 Why this project? diff --git a/src/core/error.rs b/src/core/error.rs index 2ea6f2f..1309472 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -1,26 +1,77 @@ #![allow(unused_macros, unused_imports)] use std::fmt; +// --- Structs and Enums --- + +#[allow(dead_code)] +#[derive(Debug)] +pub enum BotError { + CommandNotFound, + NoMediaFound, + InvalidLink, + InvalidUrl, + MediaSendFailed, + InvalidScraperResponse, + FileTypeNotSupported, + InvalidMedia, + Unknown, + Custom(String), +} + +// --- Type Aliases --- + +pub type BotResult = Result; + +// --- Trait Impl --- + +impl fmt::Display for BotError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let msg = match self { + BotError::CommandNotFound => "Command not found.", + BotError::NoMediaFound => { + "No media items found for this link. The post might be private or not contain any media." + } + BotError::InvalidLink => "Invalid link.", + BotError::InvalidUrl => "Invalid URL.", + BotError::MediaSendFailed => { + "Failed to send media. The media might be unavailable or the format unsupported." + } + BotError::InvalidScraperResponse => { + "The scraper returned an unexpected response. The link might be invalid or the content might be unavailable." + } + BotError::FileTypeNotSupported => "The media format is not currently supported.", + BotError::InvalidMedia => "The media might be corrupted or in an unrecognized format.", + BotError::Unknown => "An unexpected error occurred, please retry later...", + BotError::Custom(msg) => msg, + }; + + write!(f, "{msg}") + } +} + +impl std::error::Error for BotError {} + // --- Macros --- macro_rules! error { - ($err:expr, $fmt:expr, $($arg:expr),* $(,)?) => {{ + ($err:expr, $fmt:literal, $($arg:expr),* $(,)?) => {{ let err = $err; let enum_variant = format!("{err:?}"); let cause = format!($fmt, $($arg,)*); - ::tracing::error!("{enum_variant}: {cause}"); + ::tracing::warn!("{enum_variant}: {cause}"); err }}; + ($err:expr, $cause:expr $(,)?) => {{ let err = $err; let enum_variant = format!("{err:?}"); - let cause = format!($cause); - ::tracing::error!("{enum_variant}: {cause}"); + let cause = $cause.to_string(); + ::tracing::warn!("{enum_variant}: {cause}"); err }}; ($err:expr $(,)?) => {{ let err = $err; - ::tracing::error!("{err:?}"); + ::tracing::warn!("{err:?}"); err }}; } @@ -86,53 +137,3 @@ pub(crate) use invalid_url; pub(crate) use media_send_failed; pub(crate) use no_media_found; pub(crate) use unknown; - -// --- Structs and Enums --- - -#[allow(dead_code)] -#[derive(Debug)] -pub enum BotError { - CommandNotFound, - NoMediaFound, - InvalidLink, - InvalidUrl, - MediaSendFailed, - InvalidScraperResponse, - FileTypeNotSupported, - InvalidMedia, - Unknown, - Custom(String), -} - -// --- Type Aliases --- - -pub type BotResult = Result; - -// --- Trait Impl --- - -impl fmt::Display for BotError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let msg = match self { - BotError::CommandNotFound => "Command not found.", - BotError::NoMediaFound => { - "No media items found for this link. The post might be private or not contain any media." - } - BotError::InvalidLink => "Invalid link.", - BotError::InvalidUrl => "Invalid URL.", - BotError::MediaSendFailed => { - "Failed to send media. The media might be unavailable or the format unsupported." - } - BotError::InvalidScraperResponse => { - "The scraper returned an unexpected response. The link might be invalid or the content might be unavailable." - } - BotError::FileTypeNotSupported => "The media format is not currently supported.", - BotError::InvalidMedia => "The media might be corrupted or in an unrecognized format.", - BotError::Unknown => "An unexpected error occured, please retry later...", - BotError::Custom(msg) => msg, - }; - - write!(f, "{msg}") - } -} - -impl std::error::Error for BotError {} diff --git a/src/core/mod.rs b/src/core/mod.rs index 5b8f87c..e105dcc 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,3 +1,5 @@ +#![allow(unused_imports)] + pub mod error; pub mod traits; pub mod types; diff --git a/src/core/traits.rs b/src/core/traits.rs index edfb976..7b69763 100644 --- a/src/core/traits.rs +++ b/src/core/traits.rs @@ -6,7 +6,7 @@ use crate::core::{BotResult, MediaMetadata}; pub trait MediaScraper { type Input; - async fn scrape(input: Self::Input) -> BotResult>>; + async fn get_medias(input: Self::Input) -> BotResult>>; } #[async_trait] diff --git a/src/core/types.rs b/src/core/types.rs index c7528c6..3b0e976 100644 --- a/src/core/types.rs +++ b/src/core/types.rs @@ -9,14 +9,18 @@ pub enum MediaKind { Video, } -#[allow(dead_code)] #[derive(Debug)] pub struct MediaMetadata { - pub id: String, pub kind: MediaKind, pub url: Url, } +impl MediaMetadata { + pub fn new(kind: MediaKind, url: Url) -> Self { + Self { kind, url } + } +} + impl std::fmt::Display for MediaMetadata { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}({})", self.kind, self.url.as_str()) diff --git a/src/lib.rs b/src/lib.rs index 5aff1a7..74dab4f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,9 @@ pub mod telegram; #[cfg(feature = "twitter")] pub mod twitter; +#[cfg(feature = "tiktok")] +pub mod tiktok; + pub mod prelude { pub use crate::core::error::{BotError, BotResult}; pub use crate::core::traits::{MediaScraper, MediaSender}; diff --git a/src/main.rs b/src/main.rs index 8e042b5..708460c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,26 +2,38 @@ use tokio::task::JoinSet; #[cfg(feature = "telegram")] use media_bot::telegram::TelegramBot; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; #[tokio::main] async fn main() { + // load env files dotenvy::dotenv().ok(); - tracing_subscriber::fmt() - // .with_env_filter(EnvFilter::from_default_env()) - .with_env_filter(tracing_subscriber::EnvFilter::new("media_bot=trace")) - .pretty() - .with_line_number(true) - .with_target(true) // Include module target in logs - .init(); + // enable tracing logs + tracing_subscriber().init(); + #[allow(unused_mut)] let mut jobs: JoinSet<()> = JoinSet::new(); #[cfg(feature = "telegram")] { - let telegram_bot = TelegramBot::new(); - jobs.spawn(async move { telegram_bot.run().await }); + jobs.spawn(TelegramBot::run()); } jobs.join_all().await; } + +fn tracing_subscriber() -> impl SubscriberInitExt { + let filter_layer = tracing_subscriber::filter::Targets::new() + // .with_filter(EnvFilter::from_default_env()); + .with_target("media_bot", tracing::Level::DEBUG); + + let fmt_layer = tracing_subscriber::fmt::layer() + .pretty() + .with_line_number(true) + .with_target(true); // Include module target in logs + + tracing_subscriber::registry() + .with(filter_layer) + .with(fmt_layer) +} diff --git a/src/telegram/bot.rs b/src/telegram/bot.rs index 2be770a..a3b5272 100644 --- a/src/telegram/bot.rs +++ b/src/telegram/bot.rs @@ -1,6 +1,8 @@ use teloxide::{prelude::*, utils::command::BotCommands}; -use tracing::{debug, error, info, instrument}; +use tracing::{Span, debug, info, instrument, warn}; +#[cfg(feature = "tiktok")] +use crate::tiktok::TikTokScraper; use crate::{core::*, telegram::*, twitter::TwitterScraper}; #[derive(BotCommands, Clone)] @@ -13,39 +15,47 @@ pub(crate) enum Command { #[command(aliases = ["h", "?"], hide_aliases)] Help, - /// Download medias attached to the post + /// Download media attached to the post #[cfg(feature = "twitter")] #[command(aliases = ["t"], hide_aliases)] Twitter(String), + /// Handle a TikTok link + #[cfg(feature = "tiktok")] + #[command(aliases = ["tk"], hide_aliases)] + Tiktok(String), // /// Handle a insta link // #[command(parse_with = "split", alias = "insta")] // Instagram, - - // /// Handle a tiktok link - // #[command(aliases = ["tk", "tiktok"])] - // Tiktok(String), } #[derive(Debug, Clone)] -pub struct TelegramBot { - bot: teloxide::Bot, -} +pub enum TelegramBot {} impl TelegramBot { - pub fn new() -> Self { - Self { - bot: teloxide::Bot::from_env(), - } + pub async fn run() { + Self::run_with(teloxide::Bot::from_env()).await; } - pub fn from_bot(bot: teloxide::Bot) -> Self { - Self { bot } - } + pub async fn run_with(bot: teloxide::Bot) { + let dptree_entry = { + let command_handler = Update::filter_message() + .filter_command::() + .endpoint(command_handler); + + let default_handler = Update::filter_message().endpoint(default_handler); + + dptree::entry() + .branch(command_handler) + .branch(default_handler) + }; + + let mut dispatcher = Dispatcher::builder(bot, dptree_entry) + .enable_ctrlc_handler() + .build(); - pub async fn run(self) { info!("Bot is running..."); - Command::repl(self.bot, answer).await; + dispatcher.dispatch().await; info!("Bot shutting down..."); } } @@ -57,38 +67,35 @@ impl std::fmt::Display for Command { #[cfg(feature = "twitter")] Self::Twitter(arg) => write!(f, "/twitter {arg}"), - } - } -} -impl Default for TelegramBot { - fn default() -> Self { - Self::new() + #[cfg(feature = "tiktok")] + Self::Tiktok(arg) => write!(f, "/tiktok {arg}"), + } } } #[instrument( skip_all, fields( - user_id = msg.from.as_ref().map(|u| u.id.0).unwrap_or(0), - username = msg.from.as_ref().map(|u| u.username.as_ref()).unwrap_or(Some(&"".to_string())).unwrap_or(&"".to_string()), command = %cmd, chat_id = msg.chat.id.0 ) )] -async fn answer(bot: teloxide::Bot, msg: Message, cmd: Command) -> ResponseResult<()> { +async fn command_handler(bot: teloxide::Bot, msg: Message, cmd: Command) -> ResponseResult<()> { macro_rules! send_msg { ($msg:expr) => {{ if let Err(err) = bot.send_message(msg.chat.id, $msg.to_string()).await { - error!("Failed to send response: {err}"); + warn!("Failed to send response: {err}"); } else { - info!("Response succesfully send") + info!("Response successfully sent"); } ResponseResult::Ok(()) }}; } + record_user_infos_into_span(&msg); + info!("Received command {cmd}"); if matches!(cmd, Command::Help) { @@ -97,7 +104,10 @@ async fn answer(bot: teloxide::Bot, msg: Message, cmd: Command) -> ResponseResul let scraping_results: BotResult>> = match cmd { #[cfg(feature = "twitter")] - Command::Twitter(url) => TwitterScraper::scrape(url).await, + Command::Twitter(arg) => TwitterScraper::get_medias(arg).await, + + #[cfg(feature = "tiktok")] + Command::Tiktok(arg) => TikTokScraper::get_medias(arg).await, _ => return send_msg!(command_not_found!("{cmd}")), }; @@ -120,3 +130,36 @@ async fn answer(bot: teloxide::Bot, msg: Message, cmd: Command) -> ResponseResul Ok(()) } + +#[instrument( + skip_all, + fields( + user_id = msg.from.as_ref().map(|u| u.id.0).unwrap_or(0), + username = msg.from.as_ref().map(|u| u.username.as_ref()).unwrap_or(Some(&"".to_string())).unwrap_or(&"".to_string()), + chat_id = msg.chat.id.0 + ) +)] +async fn default_handler(bot: teloxide::Bot, msg: Message) -> ResponseResult<()> { + warn!("Unknown command received"); + if let Err(err) = bot.send_message(msg.chat.id, "Unknown command").await { + warn!("Failed to send response: {err}"); + } + debug!("Command completed"); + Ok(()) +} + +fn record_user_infos_into_span(msg: &Message) { + let user = match msg.from.as_ref() { + Some(user) => user, + None => return, + }; + + let span = Span::current(); + + let user_id = user.id.0; + span.record("user_id", &user_id); + + if let Some(username) = user.username.as_ref() { + span.record("username", username); + } +} diff --git a/src/telegram/sender.rs b/src/telegram/sender.rs index d0a9b64..a7574cd 100644 --- a/src/telegram/sender.rs +++ b/src/telegram/sender.rs @@ -5,7 +5,7 @@ use teloxide::{ types::{ChatId, InputFile, Message}, }; use tokio::task::JoinSet; -use tracing::{debug, error, info, instrument}; +use tracing::{Instrument, debug, info, instrument, warn}; use crate::core::*; @@ -27,13 +27,26 @@ impl TelegramSender { let bot = Arc::clone(&bot); match result { Ok(metadata) => { - debug!("Processing media item"); - jobs.spawn(Self::download_and_send(bot, chat_id, metadata, item_index)); + jobs.spawn( + Self::download_and_send(bot, chat_id, metadata, item_index) + .in_current_span(), + ); } Err(err) => { - debug!("Processing error for media item: {err}"); - jobs.spawn(async move { bot.send_message(chat_id, err.to_string()).await }); + // Request + jobs.spawn(async move { + let result = bot + .send_message(chat_id, err.to_string()) + .into_future() + .in_current_span() + .await; + if let Err(err) = &result { + warn!("Failed to send error message to chat: {err}"); + } + + result + }); } } } @@ -48,7 +61,7 @@ impl TelegramSender { let result = result.unwrap(); if let Err(err) = result.as_ref() { - error!("Failed to send message: {err}"); + warn!("Failed to send message: {err}"); } results.push(result); @@ -59,7 +72,11 @@ impl TelegramSender { let total = results.len(); let successes = results.iter().filter(|r| r.is_ok()).count(); - info!("Media sending summary: {successes}/{total} items successfully delivered"); + if total == successes { + info!("Media sending summary: {successes}/{total} items successfully delivered"); + } else { + warn!("Media sending summary: {successes}/{total} items successfully delivered"); + } } results @@ -72,8 +89,6 @@ impl TelegramSender { metadata: MediaMetadata, item_index: usize, ) -> ResponseResult { - debug!("Starting media download and send process"); - let input_file = InputFile::url(metadata.url.clone()); let result = match metadata.kind { @@ -93,7 +108,24 @@ impl TelegramSender { Ok(message) } Err(err) => { - error!("Failed to send media to chat"); + warn!("Failed to send media to chat: {err}"); + + use teloxide::{ApiError, RequestError}; + let err_msg = match err { + // scraper error (most likely invalid url given by user) + RequestError::Api(ApiError::Unknown(_)) => { + let mut unknown_err = BotError::Unknown.to_string(); + unknown_err.push_str("\nNote: ensure that the URL is valid"); + BotError::Custom(unknown_err) + }, + + _ => BotError::Unknown, + }; + + if let Err(err) = bot.send_message(chat_id, err_msg.to_string()).await { + warn!("Failed to send error message to chat: {err}"); + } + Err(err) } } diff --git a/src/tiktok/mod.rs b/src/tiktok/mod.rs new file mode 100644 index 0000000..c1a6e0b --- /dev/null +++ b/src/tiktok/mod.rs @@ -0,0 +1,11 @@ +mod scraper; +pub use scraper::TikTokScraper; + +mod config { + pub const TIKTOK_SCRAPER_LINK: &str = "https://www.tikwm.com/video/media/hdplay/"; + pub const TIKTOK_SCRAPER_LINK_END: &str = ".mp4"; + #[expect(unused)] + pub const BROWSER_UA: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; + pub const MINIMAL_USER_AGENT: &str = "curl/8.7.1"; // Minimal user agent string used for TikTok requests; update if TikTok starts requiring a different value + pub const MINIMAL_ACCEPT: &str = "*/*"; +} diff --git a/src/tiktok/scraper.rs b/src/tiktok/scraper.rs new file mode 100644 index 0000000..c882bf6 --- /dev/null +++ b/src/tiktok/scraper.rs @@ -0,0 +1,199 @@ +use async_trait::async_trait; +use reqwest::{Client, Url, header, redirect}; +use tracing::{info, instrument, warn}; + +use crate::{core::*, tiktok}; + +pub enum TikTokScraper {} + +impl TikTokScraper { + #[instrument(skip_all, fields(arg = %arg))] + async fn get_medias_metadata(arg: String) -> BotResult { + let tiktok_url = Self::get_tiktok_url(&arg).await?; + + let (_, video_id) = tiktok_url + .path() + .rsplit_once("/") + .ok_or_else(|| invalid_url!())?; + + if video_id.is_empty() || !video_id.chars().all(|c| c.is_numeric()) { + return Err(invalid_url!("invalid video id: '{video_id}'")); + } + + let media_url = Self::media_url(video_id)?; + + Ok(MediaMetadata::new(MediaKind::Video, media_url)) + } + + async fn get_tiktok_url(arg: &str) -> BotResult { + let url = Url::parse(arg).map_err(|err| invalid_url!(err))?; + + let domain_name = url + .domain() + .ok_or_else(|| invalid_url!("invalid domain name"))?; + + let tiktok_url = match domain_name { + "www.tiktok.com" => url, + + // getting the redirection location return by url + "vm.tiktok.com" | "vt.tiktok.com" => { + warn!("TikTok url is {domain_name}, getting redirection url with reqwest"); + Self::get_tiktok_url_from_redirection(&url).await? + } + + _ => { + return Err(invalid_url!( + "url domain should be www.tiktok.com, vm.tiktok.com or vt.tiktok.com" + )); + } + }; + + Self::validate_tiktok_url(&tiktok_url)?; + Ok(tiktok_url) + } + + #[instrument(name = "from_redirection", skip_all, fields(url = %url))] + async fn get_tiktok_url_from_redirection(url: &Url) -> BotResult { + // TODO: create a BotError variant to replace Unknown + + Self::validate_redirection_url(url)?; + + let reqwest_client = Client::builder() + .redirect(redirect::Policy::none()) + .build() + .map_err(|err| unknown!(err))?; + + let response = reqwest_client + .get(url.clone()) + .header(header::USER_AGENT, tiktok::config::MINIMAL_USER_AGENT) + .header(header::ACCEPT, tiktok::config::MINIMAL_ACCEPT) + .send() + .await + .map_err(|err| invalid_url!(err))?; + + if !response.status().is_redirection() { + return Err(unknown!("response is not a redirection")); + } + + let redirection = response + .headers() + .get(header::LOCATION) + .ok_or_else(|| unknown!("response does not contains a location header"))? + .to_str() + .map_err(|err| unknown!(err))?; + + let redirection = Url::parse(redirection).map_err(|err| unknown!(err))?; + + if redirection.domain() != Some("www.tiktok.com") { + return Err(unknown!( + "location header returned is {:?}, expected www.tiktok.com", + redirection.domain() + )); + } + + Ok(redirection) + } + + #[instrument(name = "validate_url", skip_all, fields(url = %url))] + fn validate_tiktok_url(url: &Url) -> BotResult<()> { + macro_rules! bail_invalid_url { + () => { + return Err(invalid_url!( + "url should look like 'https://www.tiktok.com/[@username/]video/123456789'" + )); + }; + } + + if url.domain() != Some("www.tiktok.com") { + bail_invalid_url!(); + } + + let path = url.path().trim_matches('/'); + + let mut path_segments = path.split('/').collect::>(); + + if path_segments.len() == 3 { + if path_segments[0].len() < 2 { + bail_invalid_url!(); + } + + if !path_segments[0].starts_with('@') { + bail_invalid_url!(); + } + + path_segments.remove(0); + } + + if path_segments.len() != 2 { + bail_invalid_url!(); + } + + if path_segments[0] != "video" { + bail_invalid_url!(); + } + + if path_segments[1].len() < 18 || !path_segments[1].chars().all(|c| c.is_numeric()) { + bail_invalid_url!(); + } + + Ok(()) + } + + #[instrument(name = "validate_url", skip_all, fields(url = %url))] + fn validate_redirection_url(url: &Url) -> BotResult<()> { + let domain = url + .domain() + .ok_or_else(|| invalid_url!("url does not have a domain"))?; + + if !matches!(domain, "vm.tiktok.com" | "vt.tiktok.com") { + return Err(invalid_url!( + "url domain should be vm.tiktok.com or vt.tiktok.com" + )); + } + + let validation_error = + || invalid_url!("url path should look like 'https://{domain}/ABC123'"); + + let segments = match url.path_segments() { + Some(segments) => segments.collect::>(), + None => return Err(validation_error()), + }; + + if segments.len() != 1 { + return Err(validation_error()); + } + + let at_least_6 = segments[0].len() >= 6; + let is_alphanumeric = segments[0].chars().all(|c| c.is_alphanumeric()); + if !at_least_6 || !is_alphanumeric { + return Err(validation_error()); + } + + Ok(()) + } + + fn media_url(video_id: &str) -> Result { + let scraper_link = tiktok::config::TIKTOK_SCRAPER_LINK; + let scraper_link_end = tiktok::config::TIKTOK_SCRAPER_LINK_END; + let media_url = format!("{scraper_link}{video_id}{scraper_link_end}"); + Url::parse(&media_url).map_err(|err| invalid_link!("{media_url}: {err}")) + } +} + +#[async_trait] +impl MediaScraper for TikTokScraper { + type Input = String; + + async fn get_medias(arg: Self::Input) -> BotResult>> { + info!("Starting TikTok media metadata retrieving"); + + let media = TikTokScraper::get_medias_metadata(arg).await; + if media.is_ok() { + info!("TikTok media metadata retrieving results: 1 total, 1 successful, 0 failed"); + } else { + warn!("TikTok media metadata retrieving results: 1 total, 0 successful, 1 failed"); + } + + Ok(vec![media]) + } +} diff --git a/src/twitter/mod.rs b/src/twitter/mod.rs index f125599..7b26bc5 100644 --- a/src/twitter/mod.rs +++ b/src/twitter/mod.rs @@ -1,2 +1,6 @@ mod scraper; pub use scraper::TwitterScraper; + +mod config { + pub const TWITTER_SCRAPER_LINK: &str = "https://www.twitter-viewer.com/api/x/tweet?tweetId="; +} diff --git a/src/twitter/scraper.rs b/src/twitter/scraper.rs index eb8ba87..49324f2 100644 --- a/src/twitter/scraper.rs +++ b/src/twitter/scraper.rs @@ -1,22 +1,16 @@ -use std::str::FromStr; - use async_trait::async_trait; use reqwest::Url; use serde_json::Value; -use tracing::{info, instrument}; +use tracing::{info, instrument, warn}; use crate::core::*; -const X_LINK: &str = "https://www.twitter-viewer.com/api/x/tweet?tweetId="; - -pub struct TwitterScraper; +pub enum TwitterScraper {} impl TwitterScraper { - #[instrument(skip_all, fields(input = %input))] - async fn scrape(input: String) -> BotResult>> { - let url = Url::parse(&input).map_err(|err| invalid_url!("{err}"))?; - - info!("Starting media scraping"); + #[instrument(skip_all, fields(arg = %arg))] + async fn get_medias_metadata(arg: String) -> BotResult>> { + let url = Url::parse(&arg).map_err(|err| invalid_url!("{err}"))?; let scraping_results = { let post_id = url @@ -25,32 +19,19 @@ impl TwitterScraper { .next_back() .ok_or_else(|| invalid_url!("{url}"))?; - let scraper_url = Self::scraper_link(post_id)?; + let scraper_url = Self::media_link(post_id)?; Self::scrape_medias_inner(&scraper_url).await? }; - info!("media scraping finished"); - if scraping_results.is_empty() { return Err(no_media_found!()); } - // Logging - { - let total_count = scraping_results.len(); - let success_count = scraping_results.iter().filter(|r| r.is_ok()).count(); - let error_count = total_count - success_count; - - info!( - "Twitter scraping completed: {} total, {} successful, {} failed", - total_count, success_count, error_count - ); - } - Ok(scraping_results) } + #[instrument(name = "media_metadata_parsing", skip_all)] fn parse_metadata(item: &Value) -> BotResult { let get_index_as_str = |index: &str| -> BotResult<&str> { item.get(index) @@ -71,18 +52,12 @@ impl TwitterScraper { let url = reqwest::Url::parse(url) .map_err(|err| invalid_scraper_response!("invalid url: {err}"))?; - let id = url - .as_str() - .rsplit('/') - .next() - .and_then(|filename: &str| filename.split_once('.').map(|(name, _)| name)) - .ok_or_else(|| invalid_scraper_response!("invalid url: {}", url))? - .to_string(); - - Ok(MediaMetadata { id, url, kind }) + Ok(MediaMetadata::new(kind, url)) } async fn scrape_medias_inner(scraper_url: &Url) -> BotResult>> { + info!("Starting medias scraping"); + let response = reqwest::get(scraper_url.as_str()) .await .map_err(|err| unknown!("{err}"))?; @@ -96,12 +71,10 @@ impl TwitterScraper { .get("success") .and_then(|v: &Value| v.as_bool()) { - let error_msg = response_json - .get("error") - .and_then(|v| v.as_str()) - .unwrap_or("unknown error"); - - return Err(custom!("{error_msg}")); + return match response_json.get("error").and_then(|v| v.as_str()) { + Some(msg) => Err(custom!("{msg}")), + None => Err(unknown!()), + } } let data = response_json @@ -114,15 +87,16 @@ impl TwitterScraper { .as_array() .ok_or_else(|| invalid_scraper_response!("invalid field media"))?; - let medias = json_medias.iter().map(Self::parse_metadata).collect(); + let medias: Vec> = + json_medias.iter().map(Self::parse_metadata).collect(); Ok(medias) } - #[instrument] - fn scraper_link(post_id: &str) -> Result { - let link = format!("{X_LINK}{post_id}"); - Url::from_str(&link).map_err(|err| invalid_link!("{link}: {err}")) + fn media_link(post_id: &str) -> Result { + let scraper_link = crate::twitter::config::TWITTER_SCRAPER_LINK; + let media_link = format!("{scraper_link}{post_id}"); + Url::parse(&media_link).map_err(|err| invalid_link!("{media_link}: {err}")) } } @@ -130,8 +104,31 @@ impl TwitterScraper { impl MediaScraper for TwitterScraper { type Input = String; - #[doc(hidden)] - async fn scrape(input: Self::Input) -> BotResult>> { - TwitterScraper::scrape(input).await + async fn get_medias(arg: Self::Input) -> BotResult>> { + info!("Starting Twitter media metadata retrieving"); + + let result = TwitterScraper::get_medias_metadata(arg).await; + + if let Ok(medias) = &result { + let total_count = medias.len(); + let success_count = medias.iter().filter(|r| r.is_ok()).count(); + let error_count = total_count - success_count; + + if error_count == 0 { + info!( + "Medias scraping results: {} total, {} successful, {} failed", + total_count, success_count, error_count + ); + } else { + warn!( + "Medias scraping results: {} total, {} successful, {} failed", + total_count, success_count, error_count + ); + } + } else { + warn!("Media scraping results: failed to retrieve media"); + } + + result } }