diff --git a/CHANGELOG.md b/CHANGELOG.md index 35ade18..92f849b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,123 @@ All notable changes to this project will be documented in this file. +## Unreleased + +### RPKIviews Historical Data Support + +* **Added RPKIviews as a historical RPKI data source**: Users can now load historical RPKI data from RPKIviews collectors in addition to RIPE NCC archives + - New `RpkiViewsCollector` enum with four collectors: SoborostNet (default), MassarsNet, AttnJp, and KerfuffleNet + - Added `RpkiTrie::from_rpkiviews(collector, date)` method for loading from a specific collector + - Added `RpkiTrie::from_rpkiviews_file(url, date)` and `from_rpkiviews_files(urls, date)` for loading from specific archive URLs + - Added `list_rpkiviews_files(collector, date)` function to discover available archives for a given date + - New `HistoricalRpkiSource` enum to explicitly select between RIPE and RPKIviews sources + +* **Streaming optimization for .tgz archives**: RPKIviews archives are streamed efficiently without downloading the entire file + - `rpki-client.json` is located at position 3-4 in the archive, allowing early termination after ~80MB instead of downloading 300+ MB + - New `extract_file_from_tgz(url, target_path)` function for streaming extraction of specific files + - New `list_files_in_tgz(url, max_entries)` function for listing archive contents with early termination + - New `tgz_contains_file(url, target_path)` function for checking file existence + - Uses `reqwest` for HTTP streaming and external `gunzip` for decompression + - Test completion time reduced from several minutes to ~8 seconds + +* **Unified rpki-client JSON parsing**: Extracted shared parsing logic for rpki-client JSON format + - New internal `rpki_client.rs` module with `RpkiClientData` struct and robust deserializers + - Handles variations in ASN formats (numeric `12345` vs string `"AS12345"`) + - Handles variations in ASPA field names (`customer_asid` vs 
`customer`) + - Handles provider arrays as both numbers and strings + - Used by Cloudflare, RIPE historical, and RPKIviews sources + +* **Public ROA and ASPA structs**: Added stable public API types + - New `Roa` struct with fields: `prefix`, `asn`, `max_length`, `not_before`, `not_after` + - New `Aspa` struct with fields: `customer_asn`, `providers` + - Internal rpki-client format structs are now `pub(crate)` only + +* **Updated RIPE historical to use JSON format**: Changed from CSV to `output.json.xz` for consistency + - Requires `xz` feature in oneio (now enabled by default for rpki feature) + - Provides richer data including expiry timestamps + +* **New BgpkitCommons methods**: + - `load_rpki_historical(source, date)` - Load historical RPKI data from specified source + - `list_rpki_files(source, date)` - List available RPKI files for a date from specified source + - `load_rpki_from_files(urls, date)` - Load and merge RPKI data from multiple file URLs + +* **New example**: Added `examples/rpki_historical.rs` demonstrating historical RPKI data loading + +* **Updated example**: `examples/list_aspas.rs` now counts ASPA objects for first day of years 2020-2025 + +### Dependencies + +* Added `reqwest` (with blocking feature) for HTTP streaming +* Added `tar` crate for reading tar archives +* Enabled `xz` feature in `oneio` for RIPE historical JSON support + +### Crate Consolidation + +* **Migrated `as2org-rs` into bgpkit-commons**: The CAIDA AS-to-Organization mapping functionality previously provided by the external `as2org-rs` crate has been fully integrated into the `asinfo` module + - New `src/asinfo/as2org.rs` module provides `As2org` struct with `new()`, `get_as_info()`, `get_siblings()`, and `are_siblings()` methods + - Removed external `as2org-rs` dependency from Cargo.toml + - Single codebase simplifies maintenance and patch application + +* **Migrated `peeringdb-rs` into bgpkit-commons**: The PeeringDB API access functionality previously provided by the 
external `peeringdb-rs` crate has been fully integrated into the `asinfo` module + - Updated `src/asinfo/peeringdb.rs` with full PeeringDB API client implementation + - Includes `PeeringdbNet` struct and `load_peeringdb_net()` function for direct API access + - Removed external `peeringdb-rs` dependency from Cargo.toml + +* **Updated feature flags**: The `asinfo` feature now uses `regex` instead of external crate dependencies + - Before: `asinfo = ["as2org-rs", "peeringdb-rs", "oneio", "serde_json", "tracing", "chrono"]` + - After: `asinfo = ["oneio", "serde_json", "tracing", "chrono", "regex"]` + +### API Improvements + +* **AsInfoBuilder**: Added a new builder pattern for loading AS information with specific data sources + - New `AsInfoBuilder` struct with fluent API methods: `with_as2org()`, `with_population()`, `with_hegemony()`, `with_peeringdb()`, `with_all()` + - Added `asinfo_builder()` method to `BgpkitCommons` for creating builders + - Added `load_asinfo_with(builder)` method to `BgpkitCommons` for loading with builder configuration + - The existing `load_asinfo(bool, bool, bool, bool)` method is preserved for backward compatibility + +**Before (confusing boolean parameters):** +```rust +commons.load_asinfo(true, false, true, false)?; +``` + +**After (clear builder pattern):** +```rust +let builder = commons.asinfo_builder() + .with_as2org() + .with_hegemony(); +commons.load_asinfo_with(builder)?; +``` + +### Public API Enhancements + +* **asinfo module**: Added `PeeringdbData` to public exports for direct module access +* All modules now consistently support both: + - Central access via `BgpkitCommons` instance + - Direct module access (e.g., `bgpkit_commons::bogons::Bogons::new()`) + +### Testing Improvements + +* **Comprehensive as2org module tests**: Added extensive unit tests for the migrated CAIDA AS-to-Organization functionality + - JSON deserialization tests for `As2orgJsonOrg` and `As2orgJsonAs` structures + - Tests for optional fields and 
default values + - `As2orgAsInfo` struct creation and serialization round-trip tests + - `fix_latin1_misinterpretation` function tests for edge cases + - Integration tests (ignored by default) for `As2org::new()`, `get_as_info()`, `get_siblings()`, and `are_siblings()` methods + +* **Comprehensive peeringdb module tests**: Added extensive unit tests for the migrated PeeringDB functionality + - `PeeringdbData` struct creation, serialization, and deserialization tests + - `PeeringdbNet` struct tests with all optional fields + - `PeeringdbNetResponse` API response deserialization tests + - `Peeringdb` struct tests for `get_data()`, `contains()`, `len()`, `is_empty()`, and `get_all_asns()` methods + - Empty database edge case tests + - Integration tests (ignored by default) for live API access + +* **New Peeringdb helper methods**: Added utility methods to the `Peeringdb` struct for better usability + - `len()`: Get the number of networks in the database + - `is_empty()`: Check if the database is empty + - `contains(asn)`: Check if an ASN exists in PeeringDB + - `get_all_asns()`: Get all ASNs in the database + ## v0.9.6 - 2025-10-29 ### Maintenance diff --git a/Cargo.toml b/Cargo.toml index 8588152..c6c13ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,32 +18,32 @@ keywords = ["bgp", "bgpkit"] thiserror = "2.0" serde = { version = "1.0", features = ["derive"] } -as2org-rs = { version = "1.1.1", optional = true } chrono = { version = "0.4", features = ["serde"], optional = true } ipnet = { version = "2.9", features = ["serde"], optional = true } ipnet-trie = { version = "0.3.0", optional = true } -oneio = { version = "0.20.0", optional = true, features = ["json"] } -peeringdb-rs = { version = "0.1.3", optional = true } +oneio = { version = "0.20.0", optional = true, features = ["json", "xz"] } regex = { version = "1", optional = true } serde_json = { version = "1", optional = true } tracing = { version = "0.1", optional = true } +reqwest = { version = "0.12", optional 
= true, features = ["blocking"] } +tar = { version = "0.4", optional = true } [dev-dependencies] tracing-subscriber = "0.3" serde_json = "1" -oneio = { version = "0.20.0", features = ["json"] } +oneio = { version = "0.20.0", features = ["json", "xz"] } [features] default = ["all"] # Module features -asinfo = ["as2org-rs", "peeringdb-rs", "oneio", "serde_json", "tracing", "chrono"] +asinfo = ["oneio", "serde_json", "tracing", "chrono", "regex"] as2rel = ["oneio", "serde_json", "tracing"] bogons = ["oneio", "ipnet", "regex", "chrono"] countries = ["oneio"] mrt_collectors = ["oneio", "chrono"] -rpki = ["oneio", "ipnet", "ipnet-trie", "chrono", "tracing"] +rpki = ["oneio", "ipnet", "ipnet-trie", "chrono", "tracing", "reqwest", "tar", "serde_json"] # Convenience feature to enable all modules all = ["asinfo", "as2rel", "bogons", "countries", "mrt_collectors", "rpki"] @@ -61,6 +61,10 @@ required-features = ["mrt_collectors"] name = "list_aspas" required-features = ["rpki"] +[[example]] +name = "rpki_historical" +required-features = ["rpki"] + [lints.clippy] uninlined_format_args = "allow" collapsible_if = "allow" diff --git a/examples/list_aspas.rs b/examples/list_aspas.rs index 47b4f9c..f50f3b5 100644 --- a/examples/list_aspas.rs +++ b/examples/list_aspas.rs @@ -1,9 +1,39 @@ -use serde_json::json; +use bgpkit_commons::rpki::{RpkiTrie, RpkiViewsCollector}; +use chrono::NaiveDate; fn main() { - let cf_data = bgpkit_commons::rpki::CfData::new().unwrap(); - println!( - "{}", - serde_json::to_string_pretty(&json!(cf_data.aspas)).unwrap() - ); + println!("Counting ASPA objects on the first day of each year (2020-2025)"); + println!("{}", "=".repeat(60)); + + for year in 2020..=2025 { + let date = NaiveDate::from_ymd_opt(year, 1, 1).unwrap(); + + // Try RIPE historical first + match RpkiTrie::from_ripe_historical(date) { + Ok(trie) => { + println!("{}-01-01: {} ASPAs (from RIPE)", year, trie.aspas.len()); + continue; + } + Err(_) => { + // RIPE failed, try RPKIviews + } + } + + 
// Fallback to RPKIviews + match RpkiTrie::from_rpkiviews(RpkiViewsCollector::default(), date) { + Ok(trie) => { + println!( + "{}-01-01: {} ASPAs (from RPKIviews)", + year, + trie.aspas.len() + ); + } + Err(_) => { + println!("{}-01-01: No data available", year); + } + } + } + + println!("{}", "=".repeat(60)); + println!("Done!"); } diff --git a/examples/rpki_historical.rs b/examples/rpki_historical.rs new file mode 100644 index 0000000..e6d640e --- /dev/null +++ b/examples/rpki_historical.rs @@ -0,0 +1,64 @@ +//! Example demonstrating historical RPKI data loading from RIPE NCC and RPKIviews +//! +//! Run with: cargo run --example rpki_historical --features rpki + +use bgpkit_commons::BgpkitCommons; +use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; +use chrono::NaiveDate; + +fn main() { + // Initialize tracing for debug output + tracing_subscriber::fmt::init(); + + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let commons = BgpkitCommons::new(); + + println!("=== Listing available RPKI files for {} ===\n", date); + + // List files from RIPE NCC (one file per RIR) + println!("RIPE NCC files:"); + match commons.list_rpki_files(date, HistoricalRpkiSource::Ripe) { + Ok(files) => { + for file in &files { + println!( + " - {} (RIR: {})", + file.url, + file.rir + .map(|r| r.to_string()) + .unwrap_or_else(|| "N/A".to_string()) + ); + } + } + Err(e) => println!(" Error listing RIPE files: {}", e), + } + println!(); + + // List files from RPKIviews (multiple snapshots per day) + println!("RPKIviews files (KerfuffleNet collector):"); + let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::KerfuffleNet); + match commons.list_rpki_files(date, source) { + Ok(files) => { + println!(" Found {} files for {}", files.len(), date); + // Show first 5 files + for file in files.iter().take(5) { + println!( + " - {} ({} bytes, timestamp: {})", + file.url, + file.size.unwrap_or(0), + file.timestamp + ); + } + if files.len() > 5 { + println!(" 
... and {} more files", files.len() - 5); + } + } + Err(e) => println!(" Error listing RPKIviews files: {}", e), + } + println!(); + + // Show available collectors + println!("Available RPKIviews collectors:"); + for collector in RpkiViewsCollector::all() { + println!(" - {} ({})", collector, collector.base_url()); + } +} diff --git a/src/asinfo/as2org.rs b/src/asinfo/as2org.rs new file mode 100644 index 0000000..195933b --- /dev/null +++ b/src/asinfo/as2org.rs @@ -0,0 +1,520 @@ +//! AS-to-Organization mapping using CAIDA data +//! +//! This module provides access to CAIDA's AS-to-Organization dataset, allowing +//! lookups of AS information including organization details and sibling relationships. +//! +//! # Data source +//! - CAIDA AS Organizations Dataset: + +use crate::{BgpkitCommonsError, Result}; +use chrono::NaiveDate; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Organization JSON format from CAIDA dataset +#[derive(Debug, Clone, Serialize, Deserialize)] +struct As2orgJsonOrg { + #[serde(alias = "organizationId")] + org_id: String, + + changed: Option, + + #[serde(default)] + name: String, + + country: String, + + /// The RIR or NIR database that contained this entry + source: String, + + #[serde(alias = "type")] + data_type: String, +} + +/// AS JSON format from CAIDA dataset +#[derive(Debug, Clone, Serialize, Deserialize)] +struct As2orgJsonAs { + asn: String, + + changed: Option, + + #[serde(default)] + name: String, + + #[serde(alias = "opaqueId")] + opaque_id: Option, + + #[serde(alias = "organizationId")] + org_id: String, + + /// The RIR or NIR database that contained this entry + source: String, + + #[serde(rename = "type")] + data_type: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +enum As2orgJsonEntry { + Org(As2orgJsonOrg), + As(As2orgJsonAs), +} + +/// Public information for an Autonomous System (AS) enriched with its organization. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct As2orgAsInfo { + /// The AS number + pub asn: u32, + /// The name provided for the individual AS number + pub name: String, + /// The registration country code of the organization + pub country_code: String, + /// Organization identifier (as used in the dataset) + pub org_id: String, + /// Organization name + pub org_name: String, + /// The RIR database that contained this entry + pub source: String, +} + +/// In-memory accessor for CAIDA's AS-to-Organization dataset. +#[allow(dead_code)] +pub struct As2org { + as_map: HashMap, + org_map: HashMap, + as_to_org: HashMap, + org_to_as: HashMap>, +} + +const BASE_URL: &str = "https://publicdata.caida.org/datasets/as-organizations"; + +impl As2org { + /// Create a new `As2org` accessor. + /// + /// - When `data_file_path` is `None`, the constructor fetches the CAIDA + /// index page to discover the most recent `*.as-org2info.jsonl.gz` file + /// and reads it via HTTP(S). + /// - When `Some(path_or_url)` is provided, the path can be a local file or + /// a remote URL. Gzipped files are supported transparently. + pub fn new(data_file_path: Option) -> Result { + let entries = match data_file_path { + Some(path) => parse_as2org_file(path.as_str())?, + None => { + let url = get_most_recent_data()?; + parse_as2org_file(url.as_str())? 
+ } + }; + + let mut as_map: HashMap = HashMap::new(); + let mut org_map: HashMap = HashMap::new(); + + for entry in entries { + match entry { + As2orgJsonEntry::As(as_entry) => { + if let Ok(asn) = as_entry.asn.parse::() { + as_map.insert(asn, as_entry); + } + } + As2orgJsonEntry::Org(org_entry) => { + org_map.insert(org_entry.org_id.clone(), org_entry); + } + } + } + + let mut as_to_org: HashMap = HashMap::new(); + let mut org_to_as: HashMap> = HashMap::new(); + + for (asn, as_entry) in as_map.iter() { + as_to_org.insert(*asn, as_entry.org_id.clone()); + let org_asn = org_to_as.entry(as_entry.org_id.clone()).or_default(); + org_asn.push(*asn); + } + + Ok(Self { + as_map, + org_map, + as_to_org, + org_to_as, + }) + } + + /// List all available dataset files published by CAIDA with their dates. + #[allow(dead_code)] + pub fn get_all_files_with_dates() -> Result> { + get_all_files_with_dates() + } + + /// Returns the URL for the latest AS-to-Organization dataset file. + #[allow(dead_code)] + pub fn get_latest_file_url() -> String { + format!("{BASE_URL}/latest.as-org2info.jsonl.gz") + } + + /// Get enriched information for a specific ASN, if present. + pub fn get_as_info(&self, asn: u32) -> Option { + let as_entry = self.as_map.get(&asn)?; + let org_id = as_entry.org_id.as_str(); + let org_entry = self.org_map.get(org_id)?; + Some(As2orgAsInfo { + asn, + name: as_entry.name.clone(), + country_code: org_entry.country.clone(), + org_id: org_id.to_string(), + org_name: org_entry.name.clone(), + source: org_entry.source.clone(), + }) + } + + /// Return all ASNs that belong to the same organization as the given ASN. + #[allow(dead_code)] + pub fn get_siblings(&self, asn: u32) -> Option> { + let org_id = self.as_to_org.get(&asn)?; + let org_asns = self.org_to_as.get(org_id)?.to_vec(); + Some( + org_asns + .iter() + .filter_map(|asn| self.get_as_info(*asn)) + .collect(), + ) + } + + /// Return `true` if both ASNs belong to the same organization. 
+ #[allow(dead_code)] + pub fn are_siblings(&self, asn1: u32, asn2: u32) -> bool { + let org1 = match self.as_to_org.get(&asn1) { + None => return false, + Some(o) => o, + }; + let org2 = match self.as_to_org.get(&asn2) { + None => return false, + Some(o) => o, + }; + org1 == org2 + } +} + +/// Fixes misinterpretation of strings encoded in Latin-1 that were mistakenly decoded as UTF-8. +fn fix_latin1_misinterpretation(input: &str) -> String { + let mut result = String::new(); + let mut chars = input.chars().peekable(); + + while let Some(c) = chars.next() { + // Check for the pattern of misinterpreted Latin-1 chars + if c == 'Ã' && chars.peek().is_some() { + let next_char = chars.next().unwrap(); + + // Calculate the original Latin-1 character + let byte_value = match next_char { + '\u{0080}'..='\u{00BF}' => 0xC0 + (next_char as u32 - 0x0080), + // Handle other ranges as needed + _ => { + // If it doesn't match the pattern, treat as normal chars + result.push(c); + result.push(next_char); + continue; + } + }; + + // Convert to the correct character + if let Some(correct_char) = char::from_u32(byte_value) { + result.push(correct_char); + } else { + // Fallback for invalid characters + result.push(c); + result.push(next_char); + } + } else { + result.push(c); + } + } + + result +} + +/// Parse remote AS2Org file into Vec of DataEntry +fn parse_as2org_file(path: &str) -> Result> { + let mut res: Vec = vec![]; + + for line in oneio::read_lines(path)? 
{ + let line = fix_latin1_misinterpretation(&line?); + if line.contains(r#""type":"ASN""#) { + let data = serde_json::from_str::(line.as_str()); + match data { + Ok(data) => { + res.push(As2orgJsonEntry::As(data)); + } + Err(e) => { + return Err(BgpkitCommonsError::data_source_error( + crate::errors::data_sources::CAIDA, + format!("error parsing AS line: {}", e), + )); + } + } + } else { + let data = serde_json::from_str::(line.as_str()); + match data { + Ok(data) => { + res.push(As2orgJsonEntry::Org(data)); + } + Err(e) => { + return Err(BgpkitCommonsError::data_source_error( + crate::errors::data_sources::CAIDA, + format!("error parsing Org line: {}", e), + )); + } + } + } + } + Ok(res) +} + +/// Returns a vector of tuples containing the full URLs of AS2Org data files and their corresponding dates. +fn get_all_files_with_dates() -> Result> { + let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*") + .map_err(|e| BgpkitCommonsError::Internal(format!("failed to compile regex: {}", e)))?; + let content = oneio::read_to_string(BASE_URL)?; + let mut res: Vec<(String, NaiveDate)> = data_link + .captures_iter(content.as_str()) + .filter_map(|cap| { + let file = cap[1].to_owned(); + let date = NaiveDate::parse_from_str(&file[..8], "%Y%m%d").ok()?; + Some((format!("{BASE_URL}/{file}"), date)) + }) + .collect(); + res.sort_by_key(|(_, date)| *date); + Ok(res) +} + +fn get_most_recent_data() -> Result { + let files = get_all_files_with_dates()?; + let last_file = files.last().ok_or_else(|| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::CAIDA, + "No dataset files found", + ) + })?; + Ok(last_file.0.clone()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fix_latin1_misinterpretation() { + // Test that the function handles normal strings correctly + let normal = "Hello World"; + assert_eq!(fix_latin1_misinterpretation(normal), normal); + + // Test empty string + assert_eq!(fix_latin1_misinterpretation(""), 
""); + + // Test string without misinterpreted characters + let ascii_only = "ACME Corporation Inc."; + assert_eq!(fix_latin1_misinterpretation(ascii_only), ascii_only); + + // Test string with special characters that shouldn't be modified + let special = "Test @#$%^&*() 123"; + assert_eq!(fix_latin1_misinterpretation(special), special); + } + + #[test] + fn test_as2org_json_org_deserialization() { + let json = r#"{"organizationId":"ORG-TEST","changed":"20240101","name":"Test Org","country":"US","source":"ARIN","type":"Organization"}"#; + let org: As2orgJsonOrg = serde_json::from_str(json).unwrap(); + assert_eq!(org.org_id, "ORG-TEST"); + assert_eq!(org.name, "Test Org"); + assert_eq!(org.country, "US"); + assert_eq!(org.source, "ARIN"); + assert_eq!(org.data_type, "Organization"); + } + + #[test] + fn test_as2org_json_org_with_missing_optional_fields() { + let json = r#"{"organizationId":"ORG-TEST2","name":"Another Org","country":"DE","source":"RIPE","type":"Organization"}"#; + let org: As2orgJsonOrg = serde_json::from_str(json).unwrap(); + assert_eq!(org.org_id, "ORG-TEST2"); + assert!(org.changed.is_none()); + } + + #[test] + fn test_as2org_json_as_deserialization() { + let json = r#"{"asn":"12345","changed":"20240101","name":"Test AS","opaqueId":"opaque123","organizationId":"ORG-TEST","source":"ARIN","type":"ASN"}"#; + let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap(); + assert_eq!(as_entry.asn, "12345"); + assert_eq!(as_entry.name, "Test AS"); + assert_eq!(as_entry.org_id, "ORG-TEST"); + assert_eq!(as_entry.opaque_id, Some("opaque123".to_string())); + assert_eq!(as_entry.source, "ARIN"); + assert_eq!(as_entry.data_type, "ASN"); + } + + #[test] + fn test_as2org_json_as_with_missing_optional_fields() { + let json = r#"{"asn":"67890","name":"Minimal AS","organizationId":"ORG-MIN","source":"APNIC","type":"ASN"}"#; + let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap(); + assert_eq!(as_entry.asn, "67890"); + 
assert!(as_entry.changed.is_none()); + assert!(as_entry.opaque_id.is_none()); + } + + #[test] + fn test_as2org_json_as_with_empty_name() { + // Test the #[serde(default)] attribute for name field + let json = r#"{"asn":"11111","organizationId":"ORG-EMPTY","source":"RIPE","type":"ASN"}"#; + let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap(); + assert_eq!(as_entry.name, ""); // default empty string + } + + #[test] + fn test_as2org_as_info_struct() { + let info = As2orgAsInfo { + asn: 12345, + name: "Test AS".to_string(), + country_code: "US".to_string(), + org_id: "ORG-TEST".to_string(), + org_name: "Test Organization".to_string(), + source: "ARIN".to_string(), + }; + assert_eq!(info.asn, 12345); + assert_eq!(info.name, "Test AS"); + assert_eq!(info.country_code, "US"); + assert_eq!(info.org_id, "ORG-TEST"); + assert_eq!(info.org_name, "Test Organization"); + assert_eq!(info.source, "ARIN"); + } + + #[test] + fn test_as2org_as_info_serialization() { + let info = As2orgAsInfo { + asn: 12345, + name: "Test AS".to_string(), + country_code: "US".to_string(), + org_id: "ORG-TEST".to_string(), + org_name: "Test Organization".to_string(), + source: "ARIN".to_string(), + }; + let json = serde_json::to_string(&info).unwrap(); + assert!(json.contains("\"asn\":12345")); + assert!(json.contains("\"name\":\"Test AS\"")); + + // Test round-trip + let deserialized: As2orgAsInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.asn, info.asn); + assert_eq!(deserialized.name, info.name); + } + + #[test] + fn test_get_latest_file_url() { + let url = As2org::get_latest_file_url(); + assert!(url.starts_with("https://publicdata.caida.org/datasets/as-organizations/")); + assert!(url.ends_with(".as-org2info.jsonl.gz")); + } + + // Integration tests that require network access - marked as ignored by default + #[test] + #[ignore] + fn test_as2org_new_from_latest() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Verify the 
database was loaded by checking if we have some data + assert!(!as2org.as_map.is_empty()); + assert!(!as2org.org_map.is_empty()); + } + + #[test] + #[ignore] + fn test_as2org_get_as_info_existing() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Test with a well-known ASN (Google) + let info = as2org.get_as_info(15169); + assert!(info.is_some()); + let info = info.unwrap(); + assert_eq!(info.asn, 15169); + assert!(!info.org_id.is_empty()); + assert!(!info.org_name.is_empty()); + assert!(!info.country_code.is_empty()); + assert!(!info.source.is_empty()); + } + + #[test] + #[ignore] + fn test_as2org_get_as_info_nonexistent() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Test with a likely non-existent ASN + let info = as2org.get_as_info(999999999); + assert!(info.is_none()); + } + + #[test] + #[ignore] + fn test_as2org_get_siblings() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Test with Google's AS15169 + let siblings = as2org.get_siblings(15169); + assert!(siblings.is_some()); + let siblings = siblings.unwrap(); + // Google should have at least a few sibling ASNs + assert!(!siblings.is_empty()); + // The queried ASN should be included in siblings + assert!(siblings.iter().any(|s| s.asn == 15169)); + } + + #[test] + #[ignore] + fn test_as2org_get_siblings_nonexistent() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + let siblings = as2org.get_siblings(999999999); + assert!(siblings.is_none()); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_true() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Google ASNs 15169 and 36040 are known siblings + assert!(as2org.are_siblings(15169, 36040)); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_false() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Google (15169) and Cloudflare (13335) are not siblings + 
assert!(!as2org.are_siblings(15169, 13335)); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_nonexistent() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Non-existent ASN should return false + assert!(!as2org.are_siblings(15169, 999999999)); + assert!(!as2org.are_siblings(999999999, 15169)); + assert!(!as2org.are_siblings(999999999, 999999998)); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_same_asn() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Same ASN should be its own sibling + assert!(as2org.are_siblings(15169, 15169)); + } + + #[test] + #[ignore] + fn test_as2org_get_all_files_with_dates() { + let files = As2org::get_all_files_with_dates().expect("Failed to get file list"); + assert!(!files.is_empty()); + // Files should be sorted by date (ascending) + for i in 1..files.len() { + assert!(files[i].1 >= files[i - 1].1); + } + // Each URL should point to CAIDA + for (url, _) in &files { + assert!(url.contains("publicdata.caida.org")); + assert!(url.ends_with(".as-org2info.jsonl.gz")); + } + } +} diff --git a/src/asinfo/mod.rs b/src/asinfo/mod.rs index 0ded583..e46a199 100644 --- a/src/asinfo/mod.rs +++ b/src/asinfo/mod.rs @@ -111,6 +111,7 @@ //! let are_siblings = bgpkit.asinfo_are_siblings(3333, 3334).unwrap(); //! 
``` +mod as2org; mod hegemony; mod peeringdb; mod population; @@ -118,13 +119,13 @@ mod sibling_orgs; use crate::errors::{data_sources, load_methods, modules}; use crate::{BgpkitCommons, BgpkitCommonsError, LazyLoadable, Result}; -use peeringdb::PeeringdbData; use serde::{Deserialize, Serialize}; use sibling_orgs::SiblingOrgsUtils; use std::collections::HashMap; use tracing::info; pub use hegemony::HegemonyData; +pub use peeringdb::PeeringdbData; pub use population::AsnPopulationData; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -173,6 +174,77 @@ const RIPE_RIS_ASN_TXT_URL: &str = "https://ftp.ripe.net/ripe/asnames/asn.txt"; const BGPKIT_ASN_TXT_MIRROR_URL: &str = "https://data.bgpkit.com/commons/asn.txt"; const BGPKIT_ASNINFO_URL: &str = "https://data.bgpkit.com/commons/asinfo.jsonl"; +/// Builder for configuring which data sources to load for AS information. +/// +/// # Example +/// +/// ```rust,no_run +/// use bgpkit_commons::asinfo::AsInfoBuilder; +/// +/// let asinfo = AsInfoBuilder::new() +/// .with_as2org() +/// .with_peeringdb() +/// .build() +/// .unwrap(); +/// ``` +#[derive(Default)] +pub struct AsInfoBuilder { + load_as2org: bool, + load_population: bool, + load_hegemony: bool, + load_peeringdb: bool, +} + +impl AsInfoBuilder { + /// Create a new builder with all data sources disabled by default. + pub fn new() -> Self { + Self::default() + } + + /// Enable loading CAIDA AS-to-Organization mapping data. + pub fn with_as2org(mut self) -> Self { + self.load_as2org = true; + self + } + + /// Enable loading APNIC AS population data. + pub fn with_population(mut self) -> Self { + self.load_population = true; + self + } + + /// Enable loading IIJ IHR hegemony score data. + pub fn with_hegemony(mut self) -> Self { + self.load_hegemony = true; + self + } + + /// Enable loading PeeringDB data. + pub fn with_peeringdb(mut self) -> Self { + self.load_peeringdb = true; + self + } + + /// Enable all optional data sources. 
+ pub fn with_all(mut self) -> Self { + self.load_as2org = true; + self.load_population = true; + self.load_hegemony = true; + self.load_peeringdb = true; + self + } + + /// Build the AsInfoUtils with the configured data sources. + pub fn build(self) -> Result { + AsInfoUtils::new( + self.load_as2org, + self.load_population, + self.load_hegemony, + self.load_peeringdb, + ) + } +} + pub struct AsInfoUtils { pub asinfo_map: HashMap, pub sibling_orgs: Option, @@ -291,9 +363,7 @@ pub fn get_asinfo_map( let as2org_utils = if load_as2org { info!("loading as2org data from CAIDA..."); - Some(as2org_rs::As2org::new(None).map_err(|e| { - BgpkitCommonsError::data_source_error(data_sources::CAIDA, e.to_string()) - })?) + Some(as2org::As2org::new(None)?) } else { None }; diff --git a/src/asinfo/peeringdb.rs b/src/asinfo/peeringdb.rs index 7736844..ee2c5af 100644 --- a/src/asinfo/peeringdb.rs +++ b/src/asinfo/peeringdb.rs @@ -1,8 +1,25 @@ +//! PeeringDB data module +//! +//! This module provides access to PeeringDB data via their public API. +//! +//! # Data source +//! - PeeringDB API: +//! +//! # PeeringDB API key required +//! +//! It is strongly recommended to obtain a [PeeringDB API key](https://docs.peeringdb.com/blog/api_keys/) +//! and set the `PEERINGDB_API_KEY` environment variable. +//! Without it, the API call will likely fail due to rate limiting. 
+ use crate::{BgpkitCommonsError, Result}; -use peeringdb_rs::{PeeringdbNet, load_peeringdb_net}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::io::Read; +use tracing::warn; + +const PEERINGDB_NET_API_URL: &str = "https://www.peeringdb.com/api/net"; +/// PeeringDB network data #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PeeringdbData { pub asn: u32, @@ -13,20 +30,114 @@ pub struct PeeringdbData { pub website: Option, } +/// Full PeeringDB network response from API +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeeringdbNet { + pub id: u32, + pub name: Option, + pub name_long: Option, + pub aka: Option, + pub asn: Option, + pub org_id: Option, + pub irr_as_set: Option, + pub website: Option, + pub notes: Option, + pub fac_count: Option, + pub ix_count: Option, + + pub policy_contracts: Option, + pub policy_general: Option, + pub policy_locations: Option, + pub policy_ratio: Option, + pub policy_url: Option, + + pub info_ipv6: Option, + pub info_multicast: Option, + pub info_never_via_route_servers: Option, + pub info_prefixes4: Option, + pub info_prefixes6: Option, + pub info_ratio: Option, + pub info_scope: Option, + pub info_traffic: Option, + pub info_type: Option, + pub info_types: Option>, + pub info_unicast: Option, + + pub rir_status: Option, + pub status: Option, + pub status_dashboard: Option, + pub created: Option, + pub updated: Option, + pub route_server: Option, + pub looking_glass: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct PeeringdbNetResponse { + data: Vec, +} + +/// Get a reader for PeeringDB API with proper authentication headers +fn get_peeringdb_reader(url: &str) -> Result> { + // Try to load API key from environment + let api_key = std::env::var("PEERINGDB_API_KEY").unwrap_or_else(|_| { + warn!("missing PEERINGDB_API_KEY env var, call may fail due to rate limiting"); + "".to_string() + }); + + let client = oneio::remote::create_client_with_headers([ + 
("Authorization".to_string(), format!("Api-Key {}", api_key)), + ( + "User-Agent".to_string(), + format!("bgpkit-commons/{}", env!("CARGO_PKG_VERSION")), + ), + ])?; + + let res = client + .execute(client.get(url).build().map_err(|e| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::PEERINGDB, + format!("failed to build request: {}", e), + ) + })?) + .map_err(|e| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::PEERINGDB, + format!("request failed: {}", e), + ) + })? + .error_for_status() + .map_err(|e| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::PEERINGDB, + format!("API returned error status: {}", e), + ) + })?; + + Ok(Box::new(res)) +} + +/// Load PeeringDB network data from the API +pub fn load_peeringdb_net() -> Result> { + let mut reader = get_peeringdb_reader(PEERINGDB_NET_API_URL)?; + let mut buf = String::new(); + reader.read_to_string(&mut buf)?; + let res: PeeringdbNetResponse = serde_json::from_str(&buf)?; + Ok(res.data) +} + +/// PeeringDB data accessor with cached network information #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Peeringdb { peeringdb_map: HashMap, } impl Peeringdb { + /// Create a new Peeringdb accessor by loading data from the API pub fn new() -> Result { let mut peeringdb_map = HashMap::new(); - let net_vec: Vec = load_peeringdb_net().map_err(|e| { - BgpkitCommonsError::data_source_error( - crate::errors::data_sources::PEERINGDB, - e.to_string(), - ) - })?; + let net_vec = load_peeringdb_net()?; + for net in net_vec { if let Some(asn) = net.asn { peeringdb_map.entry(asn).or_insert(PeeringdbData { @@ -37,13 +148,324 @@ impl Peeringdb { irr_as_set: net.irr_as_set, website: net.website, }); - }; + } } Ok(Self { peeringdb_map }) } + /// Get PeeringDB data for a specific ASN pub fn get_data(&self, asn: u32) -> Option<&PeeringdbData> { self.peeringdb_map.get(&asn) } + + /// Get all ASNs in the PeeringDB data + #[allow(dead_code)] + pub fn 
get_all_asns(&self) -> Vec { + self.peeringdb_map.keys().copied().collect() + } + + /// Check if an ASN exists in PeeringDB + #[allow(dead_code)] + pub fn contains(&self, asn: u32) -> bool { + self.peeringdb_map.contains_key(&asn) + } + + /// Get the number of networks in the database + #[allow(dead_code)] + pub fn len(&self) -> usize { + self.peeringdb_map.len() + } + + /// Check if the database is empty + #[allow(dead_code)] + pub fn is_empty(&self) -> bool { + self.peeringdb_map.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_peeringdb_data_struct() { + let data = PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: Some("Cloudflare, Inc.".to_string()), + aka: Some("CF".to_string()), + irr_as_set: Some("AS-CLOUDFLARE".to_string()), + website: Some("https://cloudflare.com".to_string()), + }; + assert_eq!(data.asn, 13335); + assert_eq!(data.name, Some("Cloudflare".to_string())); + assert_eq!(data.name_long, Some("Cloudflare, Inc.".to_string())); + assert_eq!(data.aka, Some("CF".to_string())); + assert_eq!(data.irr_as_set, Some("AS-CLOUDFLARE".to_string())); + assert_eq!(data.website, Some("https://cloudflare.com".to_string())); + } + + #[test] + fn test_peeringdb_data_with_none_fields() { + let data = PeeringdbData { + asn: 12345, + name: None, + name_long: None, + aka: None, + irr_as_set: None, + website: None, + }; + assert_eq!(data.asn, 12345); + assert!(data.name.is_none()); + assert!(data.name_long.is_none()); + assert!(data.aka.is_none()); + assert!(data.irr_as_set.is_none()); + assert!(data.website.is_none()); + } + + #[test] + fn test_peeringdb_data_serialization() { + let data = PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: Some("Cloudflare, Inc.".to_string()), + aka: None, + irr_as_set: Some("AS-CLOUDFLARE".to_string()), + website: Some("https://cloudflare.com".to_string()), + }; + let json = serde_json::to_string(&data).unwrap(); + 
assert!(json.contains("\"asn\":13335")); + assert!(json.contains("\"name\":\"Cloudflare\"")); + + // Test round-trip + let deserialized: PeeringdbData = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.asn, data.asn); + assert_eq!(deserialized.name, data.name); + } + + #[test] + fn test_peeringdb_data_deserialization() { + let json = r#"{"asn":13335,"name":"Cloudflare","name_long":"Cloudflare, Inc.","aka":null,"irr_as_set":"AS-CLOUDFLARE","website":"https://cloudflare.com"}"#; + let data: PeeringdbData = serde_json::from_str(json).unwrap(); + assert_eq!(data.asn, 13335); + assert_eq!(data.name, Some("Cloudflare".to_string())); + assert_eq!(data.name_long, Some("Cloudflare, Inc.".to_string())); + assert!(data.aka.is_none()); + assert_eq!(data.irr_as_set, Some("AS-CLOUDFLARE".to_string())); + } + + #[test] + fn test_peeringdb_net_struct() { + let net = PeeringdbNet { + id: 1, + name: Some("Test Network".to_string()), + name_long: Some("Test Network Inc.".to_string()), + aka: None, + asn: Some(12345), + org_id: Some(100), + irr_as_set: Some("AS-TEST".to_string()), + website: Some("https://test.com".to_string()), + notes: None, + fac_count: Some(5), + ix_count: Some(3), + policy_contracts: None, + policy_general: Some("Open".to_string()), + policy_locations: None, + policy_ratio: None, + policy_url: None, + info_ipv6: Some(true), + info_multicast: Some(false), + info_never_via_route_servers: Some(false), + info_prefixes4: Some(100), + info_prefixes6: Some(50), + info_ratio: None, + info_scope: Some("Global".to_string()), + info_traffic: None, + info_type: Some("NSP".to_string()), + info_types: None, + info_unicast: Some(true), + rir_status: None, + status: Some("ok".to_string()), + status_dashboard: None, + created: Some("2020-01-01".to_string()), + updated: Some("2024-01-01".to_string()), + route_server: None, + looking_glass: None, + }; + assert_eq!(net.id, 1); + assert_eq!(net.asn, Some(12345)); + assert_eq!(net.name, Some("Test 
Network".to_string())); + assert_eq!(net.info_prefixes4, Some(100)); + } + + #[test] + fn test_peeringdb_net_deserialization() { + let json = r#"{"id":1,"name":"Test","name_long":null,"aka":null,"asn":12345,"org_id":100,"irr_as_set":null,"website":null,"notes":null,"fac_count":null,"ix_count":null,"policy_contracts":null,"policy_general":null,"policy_locations":null,"policy_ratio":null,"policy_url":null,"info_ipv6":null,"info_multicast":null,"info_never_via_route_servers":null,"info_prefixes4":null,"info_prefixes6":null,"info_ratio":null,"info_scope":null,"info_traffic":null,"info_type":null,"info_types":null,"info_unicast":null,"rir_status":null,"status":"ok","status_dashboard":null,"created":null,"updated":null,"route_server":null,"looking_glass":null}"#; + let net: PeeringdbNet = serde_json::from_str(json).unwrap(); + assert_eq!(net.id, 1); + assert_eq!(net.asn, Some(12345)); + assert_eq!(net.name, Some("Test".to_string())); + } + + #[test] + fn test_peeringdb_net_response_deserialization() { + let json = 
r#"{"data":[{"id":1,"name":"Test1","name_long":null,"aka":null,"asn":11111,"org_id":100,"irr_as_set":null,"website":null,"notes":null,"fac_count":null,"ix_count":null,"policy_contracts":null,"policy_general":null,"policy_locations":null,"policy_ratio":null,"policy_url":null,"info_ipv6":null,"info_multicast":null,"info_never_via_route_servers":null,"info_prefixes4":null,"info_prefixes6":null,"info_ratio":null,"info_scope":null,"info_traffic":null,"info_type":null,"info_types":null,"info_unicast":null,"rir_status":null,"status":"ok","status_dashboard":null,"created":null,"updated":null,"route_server":null,"looking_glass":null},{"id":2,"name":"Test2","name_long":null,"aka":null,"asn":22222,"org_id":200,"irr_as_set":null,"website":null,"notes":null,"fac_count":null,"ix_count":null,"policy_contracts":null,"policy_general":null,"policy_locations":null,"policy_ratio":null,"policy_url":null,"info_ipv6":null,"info_multicast":null,"info_never_via_route_servers":null,"info_prefixes4":null,"info_prefixes6":null,"info_ratio":null,"info_scope":null,"info_traffic":null,"info_type":null,"info_types":null,"info_unicast":null,"rir_status":null,"status":"ok","status_dashboard":null,"created":null,"updated":null,"route_server":null,"looking_glass":null}]}"#; + let response: PeeringdbNetResponse = serde_json::from_str(json).unwrap(); + assert_eq!(response.data.len(), 2); + assert_eq!(response.data[0].asn, Some(11111)); + assert_eq!(response.data[1].asn, Some(22222)); + } + + #[test] + fn test_peeringdb_struct_from_hashmap() { + let mut peeringdb_map = HashMap::new(); + peeringdb_map.insert( + 13335, + PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: Some("Cloudflare, Inc.".to_string()), + aka: None, + irr_as_set: Some("AS-CLOUDFLARE".to_string()), + website: Some("https://cloudflare.com".to_string()), + }, + ); + peeringdb_map.insert( + 15169, + PeeringdbData { + asn: 15169, + name: Some("Google".to_string()), + name_long: Some("Google 
LLC".to_string()), + aka: None, + irr_as_set: Some("AS-GOOGLE".to_string()), + website: Some("https://google.com".to_string()), + }, + ); + + let peeringdb = Peeringdb { peeringdb_map }; + + // Test get_data + let cf_data = peeringdb.get_data(13335); + assert!(cf_data.is_some()); + assert_eq!(cf_data.unwrap().name, Some("Cloudflare".to_string())); + + let google_data = peeringdb.get_data(15169); + assert!(google_data.is_some()); + assert_eq!(google_data.unwrap().name, Some("Google".to_string())); + + // Test non-existent ASN + let nonexistent = peeringdb.get_data(999999); + assert!(nonexistent.is_none()); + + // Test contains + assert!(peeringdb.contains(13335)); + assert!(peeringdb.contains(15169)); + assert!(!peeringdb.contains(999999)); + + // Test len + assert_eq!(peeringdb.len(), 2); + + // Test is_empty + assert!(!peeringdb.is_empty()); + + // Test get_all_asns + let asns = peeringdb.get_all_asns(); + assert_eq!(asns.len(), 2); + assert!(asns.contains(&13335)); + assert!(asns.contains(&15169)); + } + + #[test] + fn test_peeringdb_empty() { + let peeringdb = Peeringdb { + peeringdb_map: HashMap::new(), + }; + assert!(peeringdb.is_empty()); + assert_eq!(peeringdb.len(), 0); + assert!(peeringdb.get_all_asns().is_empty()); + assert!(peeringdb.get_data(12345).is_none()); + assert!(!peeringdb.contains(12345)); + } + + #[test] + fn test_peeringdb_serialization() { + let mut peeringdb_map = HashMap::new(); + peeringdb_map.insert( + 13335, + PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: None, + aka: None, + irr_as_set: None, + website: None, + }, + ); + + let peeringdb = Peeringdb { peeringdb_map }; + let json = serde_json::to_string(&peeringdb).unwrap(); + assert!(json.contains("13335")); + assert!(json.contains("Cloudflare")); + + // Test round-trip + let deserialized: Peeringdb = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.len(), 1); + assert!(deserialized.contains(13335)); + } + + // Integration tests that 
require network access - marked as ignored by default + #[test] + #[ignore] + fn test_load_peeringdb_net() { + // This test requires PEERINGDB_API_KEY to be set + let result = load_peeringdb_net(); + assert!(result.is_ok()); + let nets = result.unwrap(); + assert!(!nets.is_empty()); + // Check that we got some networks with ASNs + let with_asn: Vec<_> = nets.iter().filter(|n| n.asn.is_some()).collect(); + assert!(!with_asn.is_empty()); + } + + #[test] + #[ignore] + fn test_peeringdb_new() { + // This test requires PEERINGDB_API_KEY to be set + let result = Peeringdb::new(); + assert!(result.is_ok()); + let peeringdb = result.unwrap(); + assert!(!peeringdb.is_empty()); + + // Test that we can look up well-known networks + // Cloudflare (AS13335) should be in PeeringDB + let cf = peeringdb.get_data(13335); + assert!(cf.is_some()); + } + + #[test] + #[ignore] + fn test_peeringdb_get_data_existing() { + // This test requires PEERINGDB_API_KEY to be set + let peeringdb = Peeringdb::new().expect("Failed to load PeeringDB"); + + // Test with Cloudflare + let data = peeringdb.get_data(13335); + assert!(data.is_some()); + let data = data.unwrap(); + assert_eq!(data.asn, 13335); + // Cloudflare should have a name + assert!(data.name.is_some()); + } + + #[test] + #[ignore] + fn test_peeringdb_get_data_nonexistent() { + // This test requires PEERINGDB_API_KEY to be set + let peeringdb = Peeringdb::new().expect("Failed to load PeeringDB"); + + // Test with a non-existent ASN + let data = peeringdb.get_data(999999999); + assert!(data.is_none()); + } } diff --git a/src/lib.rs b/src/lib.rs index 0bd44d6..c9b2960 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,39 +6,39 @@ //! ## Available Modules //! //! ### [`asinfo`] - Autonomous System Information (requires `asinfo` feature) -//! **Load Method**: `load_asinfo(as2org, population, hegemony, peeringdb)` or `load_asinfo_cached()` -//! **Access Methods**: `asinfo_get(asn)`, `asinfo_all()` -//! 
**Data Sources**: RIPE NCC, CAIDA as2org, APNIC population, IIJ IHR hegemony, PeeringDB +//! **Load Method**: `load_asinfo(as2org, population, hegemony, peeringdb)` or `load_asinfo_cached()` +//! **Access Methods**: `asinfo_get(asn)`, `asinfo_all()` +//! **Data Sources**: RIPE NCC, CAIDA as2org, APNIC population, IIJ IHR hegemony, PeeringDB //! **Functionality**: AS name resolution, country mapping, organization data, population statistics, hegemony scores //! //! ### [`as2rel`] - AS Relationship Data (requires `as2rel` feature) -//! **Load Method**: `load_as2rel()` -//! **Access Methods**: `as2rel_lookup(asn1, asn2)` -//! **Data Sources**: BGPKIT AS relationship inference +//! **Load Method**: `load_as2rel()` +//! **Access Methods**: `as2rel_lookup(asn1, asn2)` +//! **Data Sources**: BGPKIT AS relationship inference //! **Functionality**: Provider-customer, peer-to-peer, and sibling relationships between ASes //! //! ### [`bogons`] - Bogon Detection (requires `bogons` feature) -//! **Load Method**: `load_bogons()` -//! **Access Methods**: `bogons_match(input)`, `bogons_match_prefix(prefix)`, `bogons_match_asn(asn)`, `get_bogon_prefixes()`, `get_bogon_asns()` -//! **Data Sources**: IANA special registries (IPv4, IPv6, ASN) +//! **Load Method**: `load_bogons()` +//! **Access Methods**: `bogons_match(input)`, `bogons_match_prefix(prefix)`, `bogons_match_asn(asn)`, `get_bogon_prefixes()`, `get_bogon_asns()` +//! **Data Sources**: IANA special registries (IPv4, IPv6, ASN) //! **Functionality**: Detect invalid/reserved IP prefixes and ASNs that shouldn't appear in routing //! //! ### [`countries`] - Country Information (requires `countries` feature) -//! **Load Method**: `load_countries()` -//! **Access Methods**: `country_by_code(code)`, country lookup by name -//! **Data Sources**: GeoNames geographical database +//! **Load Method**: `load_countries()` +//! **Access Methods**: `country_by_code(code)`, country lookup by name +//! 
**Data Sources**: GeoNames geographical database //! **Functionality**: ISO country code to name mapping and geographical information //! //! ### [`mrt_collectors`] - MRT Collector Metadata (requires `mrt_collectors` feature) -//! **Load Methods**: `load_mrt_collectors()`, `load_mrt_collector_peers()` -//! **Access Methods**: `mrt_collectors_all()`, `mrt_collector_peers()`, `mrt_collector_peers_full_feed()` -//! **Data Sources**: RouteViews and RIPE RIS official APIs +//! **Load Methods**: `load_mrt_collectors()`, `load_mrt_collector_peers()` +//! **Access Methods**: `mrt_collectors_all()`, `mrt_collector_peers()`, `mrt_collector_peers_full_feed()` +//! **Data Sources**: RouteViews and RIPE RIS official APIs //! **Functionality**: BGP collector information, peer details, full-feed vs partial-feed classification //! //! ### [`rpki`] - RPKI Validation (requires `rpki` feature) -//! **Load Method**: `load_rpki(optional_date)` -//! **Access Methods**: `rpki_validate(prefix, asn)` -//! **Data Sources**: RIPE NCC historical data, Cloudflare real-time data +//! **Load Method**: `load_rpki(optional_date)` +//! **Access Methods**: `rpki_validate(prefix, asn)` +//! **Data Sources**: RIPE NCC historical data, Cloudflare real-time data //! **Functionality**: Route Origin Authorization (ROA) validation, supports multiple ROAs per prefix //! //! ## Quick Start @@ -57,6 +57,8 @@ //! 3. Access the data using the corresponding `xxx_yyy()` methods //! //! ```rust +//! # #[cfg(feature = "bogons")] +//! # fn main() { //! use bgpkit_commons::BgpkitCommons; //! //! let mut commons = BgpkitCommons::new(); @@ -68,11 +70,16 @@ //! if let Ok(is_bogon) = commons.bogons_match("23456") { //! println!("ASN 23456 is a bogon: {}", is_bogon); //! } +//! # } +//! # #[cfg(not(feature = "bogons"))] +//! # fn main() {} //! ``` //! //! ### Working with Multiple Modules //! //! ```rust +//! # #[cfg(all(feature = "asinfo", feature = "countries"))] +//! # fn main() { //! 
use bgpkit_commons::BgpkitCommons; //! //! let mut commons = BgpkitCommons::new(); @@ -85,6 +92,9 @@ //! if let Ok(Some(asinfo)) = commons.asinfo_get(13335) { //! println!("AS13335: {} ({})", asinfo.name, asinfo.country); //! } +//! # } +//! # #[cfg(not(all(feature = "asinfo", feature = "countries")))] +//! # fn main() {} //! ``` //! //! ## Feature Flags @@ -92,12 +102,12 @@ //! ### Module Features //! - `asinfo` - AS information with organization and population data //! - `as2rel` - AS relationship data -//! - `bogons` - Bogon prefix and ASN detection +//! - `bogons` - Bogon prefix and ASN detection //! - `countries` - Country information lookup //! - `mrt_collectors` - MRT collector metadata //! - `rpki` - RPKI validation functionality //! -//! ### Convenience Features +//! ### Convenience Features //! - `all` (default) - Enables all modules for backwards compatibility //! //! ### Minimal Build Example @@ -118,12 +128,17 @@ //! all currently loaded data sources: //! //! ```rust +//! # #[cfg(feature = "bogons")] +//! # fn main() { //! # use bgpkit_commons::BgpkitCommons; //! let mut commons = BgpkitCommons::new(); //! commons.load_bogons().unwrap(); //! //! // Later, reload all loaded data //! commons.reload().unwrap(); +//! # } +//! # #[cfg(not(feature = "bogons"))] +//! # fn main() {} //! ``` #![doc( @@ -283,7 +298,12 @@ impl BgpkitCommons { Ok(()) } - /// Load RPKI data + /// Load RPKI data from Cloudflare (real-time) or historical archives + /// + /// - If `date_opt` is `None`, loads real-time data from Cloudflare + /// - If `date_opt` is `Some(date)`, loads historical data from RIPE NCC by default + /// + /// For more control over the data source, use `load_rpki_historical()` instead. 
#[cfg(feature = "rpki")] pub fn load_rpki(&mut self, date_opt: Option) -> Result<()> { if let Some(date) = date_opt { @@ -294,6 +314,120 @@ impl BgpkitCommons { Ok(()) } + /// Load RPKI data from a specific historical data source + /// + /// This allows you to choose between RIPE NCC and RPKIviews for historical data. + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; + /// use chrono::NaiveDate; + /// + /// let mut commons = BgpkitCommons::new(); + /// let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + /// + /// // Load from RIPE NCC + /// commons.load_rpki_historical(date, HistoricalRpkiSource::Ripe).unwrap(); + /// + /// // Or load from RPKIviews + /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::KerfuffleNet); + /// commons.load_rpki_historical(date, source).unwrap(); + /// ``` + #[cfg(feature = "rpki")] + pub fn load_rpki_historical( + &mut self, + date: chrono::NaiveDate, + source: rpki::HistoricalRpkiSource, + ) -> Result<()> { + match source { + rpki::HistoricalRpkiSource::Ripe => { + self.rpki_trie = Some(rpki::RpkiTrie::from_ripe_historical(date)?); + } + rpki::HistoricalRpkiSource::RpkiViews(collector) => { + self.rpki_trie = Some(rpki::RpkiTrie::from_rpkiviews(collector, date)?); + } + } + Ok(()) + } + + /// Load RPKI data from specific file URLs + /// + /// This allows loading from specific archive files, which is useful when you want + /// to process multiple files or use specific timestamps. 
+ /// + /// # Arguments + /// + /// * `urls` - A slice of URLs pointing to RPKI data files + /// * `source` - The type of data source (RIPE or RPKIviews) - determines how files are parsed + /// * `date` - Optional date to associate with the loaded data + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// use bgpkit_commons::rpki::HistoricalRpkiSource; + /// + /// let mut commons = BgpkitCommons::new(); + /// let urls = vec![ + /// "https://example.com/rpki-20240104T144128Z.tgz".to_string(), + /// ]; + /// commons.load_rpki_from_files(&urls, HistoricalRpkiSource::RpkiViews( + /// bgpkit_commons::rpki::RpkiViewsCollector::KerfuffleNet + /// ), None).unwrap(); + /// ``` + #[cfg(feature = "rpki")] + pub fn load_rpki_from_files( + &mut self, + urls: &[String], + source: rpki::HistoricalRpkiSource, + date: Option, + ) -> Result<()> { + match source { + rpki::HistoricalRpkiSource::Ripe => { + self.rpki_trie = Some(rpki::RpkiTrie::from_ripe_files(urls, date)?); + } + rpki::HistoricalRpkiSource::RpkiViews(_) => { + self.rpki_trie = Some(rpki::RpkiTrie::from_rpkiviews_files(urls, date)?); + } + } + Ok(()) + } + + /// List available RPKI files for a given date from a specific source + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; + /// use chrono::NaiveDate; + /// + /// let commons = BgpkitCommons::new(); + /// let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + /// + /// // List files from RIPE NCC + /// let ripe_files = commons.list_rpki_files(date, HistoricalRpkiSource::Ripe).unwrap(); + /// + /// // List files from RPKIviews + /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::KerfuffleNet); + /// let rpkiviews_files = commons.list_rpki_files(date, source).unwrap(); + /// ``` + #[cfg(feature = "rpki")] + pub fn list_rpki_files( + &self, + date: chrono::NaiveDate, + source: 
rpki::HistoricalRpkiSource, + ) -> Result> { + match source { + rpki::HistoricalRpkiSource::Ripe => rpki::list_ripe_files(date), + rpki::HistoricalRpkiSource::RpkiViews(collector) => { + rpki::list_rpkiviews_files(collector, date) + } + } + } + /// Load MRT mrt_collectors data #[cfg(feature = "mrt_collectors")] pub fn load_mrt_collectors(&mut self) -> Result<()> { @@ -339,6 +473,46 @@ impl BgpkitCommons { Ok(()) } + /// Returns a builder for loading AS information with specific data sources. + /// + /// This provides a more ergonomic way to configure which data sources to load + /// compared to the `load_asinfo()` method with boolean parameters. + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// + /// let mut commons = BgpkitCommons::new(); + /// let builder = commons.asinfo_builder() + /// .with_as2org() + /// .with_peeringdb(); + /// commons.load_asinfo_with(builder).unwrap(); + /// ``` + #[cfg(feature = "asinfo")] + pub fn asinfo_builder(&self) -> crate::asinfo::AsInfoBuilder { + crate::asinfo::AsInfoBuilder::new() + } + + /// Load AS information using a pre-configured builder. + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// + /// let mut commons = BgpkitCommons::new(); + /// let builder = commons.asinfo_builder() + /// .with_as2org() + /// .with_hegemony(); + /// commons.load_asinfo_with(builder).unwrap(); + /// ``` + #[cfg(feature = "asinfo")] + pub fn load_asinfo_with(&mut self, builder: crate::asinfo::AsInfoBuilder) -> Result<()> { + self.asinfo = Some(builder.build()?); + Ok(()) + } + /// Load AS-level relationship data #[cfg(feature = "as2rel")] pub fn load_as2rel(&mut self) -> Result<()> { diff --git a/src/rpki/cloudflare.rs b/src/rpki/cloudflare.rs index c8dd6a1..3b94cd2 100644 --- a/src/rpki/cloudflare.rs +++ b/src/rpki/cloudflare.rs @@ -1,221 +1,36 @@ //! Load current RPKI information from Cloudflare RPKI portal. 
use crate::Result; -use chrono::DateTime; -use ipnet::IpNet; -use serde::{Deserialize, Serialize}; -use std::str::FromStr; -use super::{Rir, RoaEntry, RpkiTrie}; +use super::RpkiTrie; +use super::rpki_client::RpkiClientData; -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CfData { - pub metadata: CfMetaData, - pub roas: Vec, - pub aspas: Vec, - pub bgpsec_keys: Vec, -} - -impl CfData { - pub fn new() -> Result { - let data: CfData = - oneio::read_json_struct::("https://rpki.cloudflare.com/rpki.json")?; - Ok(data) - } -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfMetaData { - pub buildmachine: Option, - pub buildtime: Option, - pub elapsedtime: Option, - pub usertime: Option, - pub systemtime: Option, - pub roas: Option, - pub failedroas: Option, - pub invalidroas: Option, - pub spls: Option, - pub failedspls: Option, - pub invalidspls: Option, - pub aspas: Option, - pub failedaspas: Option, - pub invalidaspas: Option, - pub bgpsec_pubkeys: Option, - pub certificates: Option, - pub invalidcertificates: Option, - pub taks: Option, - pub tals: Option, - pub invalidtals: Option, - pub talfiles: Option>, - pub manifests: Option, - pub failedmanifests: Option, - pub crls: Option, - pub gbrs: Option, - pub repositories: Option, - pub vrps: Option, - pub uniquevrps: Option, - pub vsps: Option, - pub uniquevsps: Option, - pub vaps: Option, - pub uniquevaps: Option, - pub cachedir_new_files: Option, - pub cachedir_del_files: Option, - pub cachedir_del_dirs: Option, - pub cachedir_superfluous_files: Option, - pub cachedir_del_superfluous_files: Option, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfAspaEntry { - pub customer_asid: u32, - pub expires: i64, - pub providers: Vec, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfBgpsecKeysEntry { - pub asn: u32, - pub ski: String, - pub pubkey: String, - pub ta: String, - pub expires: i64, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct 
CfRoaEntry { - pub prefix: String, - #[serde(rename = "maxLength")] - pub max_length: u8, - pub asn: u32, - pub ta: String, - pub expires: u64, -} +const CLOUDFLARE_RPKI_URL: &str = "https://rpki.cloudflare.com/rpki.json"; impl RpkiTrie { + /// Load current RPKI data from Cloudflare RPKI portal. + /// + /// This loads real-time RPKI data from Cloudflare's public RPKI JSON endpoint. + /// The data includes ROAs, ASPAs, and BGPsec keys. pub fn from_cloudflare() -> Result { - let data: CfData = - oneio::read_json_struct::("https://rpki.cloudflare.com/rpki.json")?; - - let mut trie = RpkiTrie { - aspas: data.aspas, - ..Default::default() - }; - - for roa in data.roas { - let prefix = roa.prefix.parse::()?; - let max_length = roa.max_length; - let rir = Rir::from_str(roa.ta.as_str()).ok(); - - // Convert expires timestamp to NaiveDateTime - let not_after = - DateTime::from_timestamp(roa.expires as i64, 0).map(|dt| dt.naive_utc()); - - let roa_entry = RoaEntry { - prefix, - asn: roa.asn, - max_length, - rir, - not_before: None, - not_after, - }; - - trie.insert_roa(roa_entry); - } - Ok(trie) + let data = RpkiClientData::from_url(CLOUDFLARE_RPKI_URL)?; + Self::from_rpki_client_data(data, None) } } #[cfg(test)] mod tests { use super::*; - use chrono::{DateTime, Utc}; #[test] - #[ignore] // This test requires network access and is for manual testing - // Run with: cargo test --release --features rpki test_cloudflare_rpki_expiry_loading -- --ignored --nocapture - fn test_cloudflare_rpki_expiry_loading() { - println!("Loading RPKI data from Cloudflare..."); - - // Load the RPKI data + #[ignore] // Requires network access + fn test_from_cloudflare() { let trie = RpkiTrie::from_cloudflare().expect("Failed to load Cloudflare RPKI data"); - // Count total ROAs let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); - println!("Total ROAs loaded: {}", total_roas); - - // Count ROAs with expiry dates - let mut roas_with_expiry = 0; - let mut expired_roas = 0; - 
let mut future_roas = 0; - let current_time = Utc::now().naive_utc(); - - for (prefix, roas) in trie.trie.iter() { - for roa in roas { - if roa.not_after.is_some() { - roas_with_expiry += 1; - - if let Some(not_after) = roa.not_after { - if not_after < current_time { - expired_roas += 1; - println!( - "Expired ROA found: prefix={}, asn={}, expired={}", - prefix, roa.asn, not_after - ); - } - } - } - - if let Some(not_before) = roa.not_before { - if not_before > current_time { - future_roas += 1; - println!( - "Future ROA found: prefix={}, asn={}, valid_from={}", - prefix, roa.asn, not_before - ); - } - } - } - } - - println!("\nSummary:"); - println!("- ROAs with expiry dates: {}", roas_with_expiry); - println!("- Expired ROAs: {}", expired_roas); - println!("- Future ROAs: {}", future_roas); - - // Test expiry validation with a sample ROA if any have expiry dates - if roas_with_expiry > 0 { - // Find a ROA with expiry date to test - for (prefix, roas) in trie.trie.iter() { - for roa in roas { - if roa.not_after.is_some() { - println!( - "\nTesting validation with expiry check for prefix={}, asn={}", - prefix, roa.asn - ); - - // Test with current time - let validation = trie.validate_check_expiry(&prefix, roa.asn, None); - println!("Validation result (current time): {:?}", validation); - - // Test with a far future time - let future_time = DateTime::from_timestamp(3000000000, 0) - .map(|dt| dt.naive_utc()) - .unwrap(); - let future_validation = - trie.validate_check_expiry(&prefix, roa.asn, Some(future_time)); - println!("Validation result (future time): {:?}", future_validation); - - break; - } - } - if roas_with_expiry > 0 { - break; - } - } - } + println!("Loaded {} ROAs from Cloudflare", total_roas); + println!("Loaded {} ASPAs", trie.aspas.len()); - // Basic sanity check - we should have loaded some ROAs - assert!(total_roas > 0, "No ROAs were loaded from Cloudflare"); - println!("\nTest completed successfully!"); + assert!(total_roas > 0, "Should have loaded 
some ROAs"); } } diff --git a/src/rpki/mod.rs b/src/rpki/mod.rs index e7ae975..2659dc3 100644 --- a/src/rpki/mod.rs +++ b/src/rpki/mod.rs @@ -1,7 +1,7 @@ //! RPKI (Resource Public Key Infrastructure) validation and data structures. //! //! This module provides functionality for loading and validating RPKI data from multiple sources, -//! including real-time data from Cloudflare and historical data from RIPE NCC. +//! including real-time data from Cloudflare and historical data from RIPE NCC or RPKIviews. //! //! # Overview //! @@ -20,7 +20,7 @@ //! //! ## Historical Data (RIPE NCC) //! - **Source**: [RIPE NCC FTP archives](https://ftp.ripe.net/rpki/) -//! - **Format**: CSV files with historical RPKI states +//! - **Format**: JSON files (output.json.xz) with ROAs, ASPAs //! - **Use Case**: Historical analysis and research //! - **Date Range**: Configurable historical date //! - **TAL Sources**: @@ -30,15 +30,26 @@ //! - LACNIC: //! - RIPE NCC: //! +//! ## Historical Data (RPKIviews) +//! - **Source**: [RPKIviews](https://rpkiviews.org/) +//! - **Format**: Compressed tarballs (.tgz) containing rpki-client.json +//! - **Use Case**: Historical analysis from multiple vantage points +//! - **Default Collector**: Kerfuffle (rpkiviews.kerfuffle.net) +//! - **Collectors**: +//! - Josephine: A2B Internet (AS51088), Amsterdam, Netherlands +//! - Amber: Massar (AS57777), Lugano, Switzerland +//! - Dango: Internet Initiative Japan (AS2497), Tokyo, Japan +//! - Kerfuffle: Kerfuffle, LLC (AS35008), Fremont, California, United States +//! //! # Core Data Structures //! //! ## RpkiTrie //! The main data structure that stores RPKI data in a trie for efficient prefix lookups: -//! - **Trie**: `IpnetTrie>` - Maps IP prefixes to lists of ROA entries -//! - **ASPAs**: `Vec` - AS Provider Authorization records +//! - **Trie**: `IpnetTrie>` - Maps IP prefixes to lists of ROA entries +//! - **ASPAs**: `Vec` - AS Provider Authorization records //! 
- **Date**: `Option` - Optional date for historical data //! -//! ## RoaEntry +//! ## Roa //! Represents a Route Origin Authorization with the following fields: //! - `prefix: IpNet` - The IP prefix (e.g., 192.0.2.0/24) //! - `asn: u32` - The authorized ASN (e.g., 64496) @@ -47,38 +58,18 @@ //! - `not_before: Option` - ROA validity start time //! - `not_after: Option` - ROA validity end time (from expires field) //! +//! ## Aspa +//! Represents an AS Provider Authorization with the following fields: +//! - `customer_asn: u32` - The customer AS number +//! - `providers: Vec` - List of provider AS numbers +//! - `expires: Option` - When this ASPA expires +//! //! ## Validation Results //! RPKI validation returns one of three states: //! - **Valid**: The prefix-ASN pair is explicitly authorized by a valid ROA //! - **Invalid**: The prefix has ROAs but none authorize the given ASN //! - **Unknown**: No ROAs exist for the prefix, or all ROAs are outside their validity period //! -//! # Validation Process -//! -//! ## Standard Validation (`validate`) -//! 1. Look up all ROAs that cover the given prefix -//! 2. Check if any ROA authorizes the given ASN with appropriate max_length -//! 3. Return Valid/Invalid/Unknown based on matches -//! -//! ## Expiry-Aware Validation (`validate_check_expiry`) -//! 1. Look up all ROAs that cover the given prefix -//! 2. Filter ROAs to only include those within their validity time window: -//! - Check `not_before` ≤ check_time (if present) -//! - Check `not_after` ≥ check_time (if present) -//! 3. Among time-valid ROAs, check for ASN authorization -//! 4. Return validation result: -//! - **Valid**: Time-valid ROA found for the ASN -//! - **Invalid**: Time-valid ROAs exist but none authorize the ASN -//! - **Unknown**: No ROAs found, or all ROAs are outside validity period -//! -//! # Key Features -//! -//! - **Multiple ROAs per prefix**: A single prefix can have multiple valid ROAs with different ASNs -//! 
- **Duplicate prevention**: ROAs with identical (prefix, asn, max_length) are automatically deduplicated -//! - **Efficient lookup**: Fast prefix matching using a trie data structure -//! - **Temporal validation**: Support for time-aware validation with expiry checking -//! - **Comprehensive validation**: Full RPKI validation against stored ROAs -//! //! # Usage Examples //! //! ## Loading Real-time Data (Cloudflare) @@ -98,131 +89,141 @@ //! bgpkit_commons::rpki::RpkiValidation::Invalid => println!("Route is RPKI invalid"), //! bgpkit_commons::rpki::RpkiValidation::Unknown => println!("No RPKI data for this prefix"), //! } -//! -//! // Validate with expiry checking (current time) -//! let result = commons.rpki_validate_check_expiry(64496, "192.0.2.0/24", None)?; -//! -//! // Validate with expiry checking (specific time) -//! use chrono::{DateTime, Utc}; -//! let check_time = DateTime::from_timestamp(1700000000, 0).unwrap().naive_utc(); -//! let result = commons.rpki_validate_check_expiry(64496, "192.0.2.0/24", Some(check_time))?; //! # Ok(()) //! # } //! ``` //! -//! ## Loading Historical Data (RIPE) +//! ## Loading Historical Data with Source Selection //! ```rust,no_run //! use bgpkit_commons::BgpkitCommons; +//! use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; //! use chrono::NaiveDate; //! //! # fn main() -> Result<(), Box> { //! let mut commons = BgpkitCommons::new(); +//! let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); //! -//! // Load RPKI data for a specific historical date -//! let date = NaiveDate::from_ymd_opt(2023, 6, 15).unwrap(); -//! commons.load_rpki(Some(date))?; +//! // Load from RIPE NCC +//! commons.load_rpki_historical(date, HistoricalRpkiSource::Ripe)?; //! -//! // Validate using historical data -//! let result = commons.rpki_validate(64496, "192.0.2.0/24")?; +//! // Or load from RPKIviews (uses Kerfuffle collector by default) +//! let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::default()); +//! 
commons.load_rpki_historical(date, source)?; //! # Ok(()) //! # } //! ``` //! -//! ## Direct Trie Usage -//! ```rust,no_run -//! use bgpkit_commons::rpki::{RpkiTrie, RpkiValidation}; -//! use ipnet::IpNet; -//! -//! # fn main() -> Result<(), Box> { -//! // Load from Cloudflare directly -//! let trie = RpkiTrie::from_cloudflare()?; -//! -//! // Lookup all ROAs for a prefix -//! let prefix: IpNet = "192.0.2.0/24".parse()?; -//! let roas = trie.lookup_by_prefix(&prefix); -//! println!("Found {} ROAs for prefix", roas.len()); -//! -//! // Validate with expiry checking -//! let result = trie.validate_check_expiry(&prefix, 64496, None); -//! # Ok(()) -//! # } -//! ``` -//! -//! ## Handling Multiple ROAs -//! A single prefix can have multiple ROAs with different ASNs and validity periods: +//! ## Listing Available Files //! ```rust,no_run //! use bgpkit_commons::BgpkitCommons; +//! use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; +//! use chrono::NaiveDate; //! //! # fn main() -> Result<(), Box> { -//! let mut commons = BgpkitCommons::new(); -//! commons.load_rpki(None)?; -//! -//! // Look up all ROAs for a prefix -//! let roas = commons.rpki_lookup_by_prefix("192.0.2.0/24")?; -//! for roa in roas { -//! println!("ASN: {}, Max Length: {}, Expires: {:?}", -//! roa.asn, roa.max_length, roa.not_after); +//! let commons = BgpkitCommons::new(); +//! let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); +//! +//! // List available files from RPKIviews (multiple snapshots per day) +//! let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::default()); +//! let rpkiviews_files = commons.list_rpki_files(date, source)?; +//! for file in &rpkiviews_files { +//! println!("RPKIviews file: {} (timestamp: {})", file.url, file.timestamp); //! } //! # Ok(()) //! # } //! ``` -//! -//! # Performance Considerations -//! -//! - **Trie Structure**: Uses `ipnet-trie` for O(log n) prefix lookups -//! 
- **Memory Usage**: Stores all ROAs in memory for fast access -//! - **Loading Time**: Initial load from Cloudflare takes a few seconds -//! - **Caching**: No automatic caching - reload when fresh data is needed -//! -//! # Error Handling -//! -//! All validation methods return `Result` and can fail due to: -//! - Network errors when loading data -//! - Invalid prefix format in input -//! - RPKI data not loaded (call `load_rpki_*` methods first) mod cloudflare; mod ripe_historical; -// mod rpkiviews; +pub(crate) mod rpki_client; +mod rpkiviews; -use chrono::{NaiveDate, NaiveDateTime, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; use ipnet::IpNet; use ipnet_trie::IpnetTrie; use crate::errors::{load_methods, modules}; use crate::{BgpkitCommons, BgpkitCommonsError, LazyLoadable, Result}; -pub use cloudflare::*; +pub use ripe_historical::list_ripe_files; +use rpki_client::RpkiClientData; +pub use rpkiviews::{RpkiViewsCollector, list_rpkiviews_files}; use serde::{Deserialize, Serialize}; use std::fmt::Display; use std::str::FromStr; -#[derive(Clone)] -pub struct RpkiTrie { - pub trie: IpnetTrie>, - pub aspas: Vec, - date: Option, -} - -impl Default for RpkiTrie { - fn default() -> Self { - Self { - trie: IpnetTrie::new(), - aspas: vec![], - date: None, - } - } -} +// ============================================================================ +// Public Data Structures +// ============================================================================ +/// A validated Route Origin Authorization (ROA). +/// +/// ROAs authorize specific Autonomous Systems to originate specific IP prefixes. 
#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct RoaEntry { +pub struct Roa { + /// The IP prefix (e.g., 192.0.2.0/24 or 2001:db8::/32) pub prefix: IpNet, + /// The AS number authorized to originate this prefix pub asn: u32, + /// Maximum prefix length allowed for announcements pub max_length: u8, + /// Regional Internet Registry that issued this ROA pub rir: Option, + /// ROA validity start time (if available) pub not_before: Option, + /// ROA validity end time (from expires field) pub not_after: Option, } +/// A validated AS Provider Authorization (ASPA). +/// +/// ASPAs specify which ASes are authorized providers for a customer AS. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Aspa { + /// The customer AS number + pub customer_asn: u32, + /// List of provider AS numbers + pub providers: Vec, + /// When this ASPA expires + pub expires: Option, +} + +/// Information about an available RPKI data file. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RpkiFile { + /// Full URL to download the file + pub url: String, + /// Timestamp when the file was created + pub timestamp: DateTime, + /// Size of the file in bytes (if available) + pub size: Option, + /// RIR that this file is for (for RIPE files) + pub rir: Option, + /// Collector that provides this file (for RPKIviews files) + pub collector: Option, +} + +/// Historical RPKI data source. +/// +/// Used to specify which data source to use when loading historical RPKI data. 
+#[derive(Debug, Clone, Default)] +pub enum HistoricalRpkiSource { + /// RIPE NCC historical archives (data from all 5 RIRs) + #[default] + Ripe, + /// RPKIviews collector + RpkiViews(RpkiViewsCollector), +} + +impl std::fmt::Display for HistoricalRpkiSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HistoricalRpkiSource::Ripe => write!(f, "RIPE NCC"), + HistoricalRpkiSource::RpkiViews(collector) => write!(f, "RPKIviews ({})", collector), + } + } +} + +/// Regional Internet Registry (RIR). #[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum Rir { AFRINIC, @@ -271,10 +272,14 @@ impl Rir { } } +/// RPKI validation result. #[derive(Clone, Debug, PartialEq, Eq)] pub enum RpkiValidation { + /// The prefix-ASN pair is explicitly authorized by a valid ROA Valid, + /// The prefix has ROAs but none authorize the given ASN Invalid, + /// No ROAs exist for the prefix, or all ROAs are outside their validity period Unknown, } @@ -288,7 +293,42 @@ impl Display for RpkiValidation { } } +// ============================================================================ +// Backwards Compatibility Type Aliases +// ============================================================================ + +/// Type alias for backwards compatibility. Use [`Roa`] instead. +/// Deprecated since 0.10.0. This alias will be removed in version 0.12.0. +#[deprecated(since = "0.10.0", note = "Use Roa instead")] +pub type RoaEntry = Roa; + +// ============================================================================ +// RpkiTrie Implementation +// ============================================================================ + +/// The main RPKI data structure storing ROAs and ASPAs. 
+#[derive(Clone)] +pub struct RpkiTrie { + /// Trie mapping IP prefixes to ROA entries + pub trie: IpnetTrie>, + /// AS Provider Authorizations + pub aspas: Vec, + /// Date for historical data (None for real-time) + date: Option, +} + +impl Default for RpkiTrie { + fn default() -> Self { + Self { + trie: IpnetTrie::new(), + aspas: vec![], + date: None, + } + } +} + impl RpkiTrie { + /// Create a new empty RpkiTrie. pub fn new(date: Option) -> Self { Self { trie: IpnetTrie::new(), @@ -297,9 +337,9 @@ impl RpkiTrie { } } - /// insert an [RoaEntry]. Returns true if this is a new prefix, false if added to existing prefix. + /// Insert a ROA. Returns true if this is a new prefix, false if added to existing prefix. /// Duplicates are avoided - ROAs with same (prefix, asn, max_length) are considered identical. - pub fn insert_roa(&mut self, roa: RoaEntry) -> bool { + pub fn insert_roa(&mut self, roa: Roa) -> bool { match self.trie.exact_match_mut(roa.prefix) { Some(existing_roas) => { // Check if this ROA already exists (same prefix, asn, max_length) @@ -317,15 +357,71 @@ impl RpkiTrie { } } - /// insert multiple [RoaEntry]s - pub fn insert_roas(&mut self, roas: Vec) { + /// Insert multiple ROAs. + pub fn insert_roas(&mut self, roas: Vec) { for roa in roas { self.insert_roa(roa); } } - /// Lookup all ROAs that match a given prefix, including invalid ones - pub fn lookup_by_prefix(&self, prefix: &IpNet) -> Vec { + /// Convert rpki-client data into an RpkiTrie. + /// + /// This is a shared conversion function used by all data sources + /// (Cloudflare, RIPE, RPKIviews) since they all produce the same + /// rpki-client JSON format. + pub(crate) fn from_rpki_client_data( + data: RpkiClientData, + date: Option, + ) -> Result { + let mut trie = RpkiTrie::new(date); + trie.merge_rpki_client_data(data); + Ok(trie) + } + + /// Merge rpki-client data into this trie. 
+ /// + /// This converts ROAs and ASPAs from rpki-client format and inserts them, + /// avoiding duplicates for ASPAs based on customer_asn. + pub(crate) fn merge_rpki_client_data(&mut self, data: RpkiClientData) { + // Convert and insert ROAs + for roa in data.roas { + let prefix = match roa.prefix.parse::() { + Ok(p) => p, + Err(_) => continue, + }; + let rir = Rir::from_str(&roa.ta).ok(); + let not_after = + DateTime::from_timestamp(roa.expires as i64, 0).map(|dt| dt.naive_utc()); + + self.insert_roa(Roa { + prefix, + asn: roa.asn, + max_length: roa.max_length, + rir, + not_before: None, + not_after, + }); + } + + // Convert and merge ASPAs (avoiding duplicates based on customer_asn) + for aspa in data.aspas { + if !self + .aspas + .iter() + .any(|a| a.customer_asn == aspa.customer_asid) + { + let expires = DateTime::from_timestamp(aspa.expires, 0).map(|dt| dt.naive_utc()); + self.aspas.push(Aspa { + customer_asn: aspa.customer_asid, + providers: aspa.providers, + expires, + }); + } + } + } + + /// Lookup all ROAs that match a given prefix, including invalid ones. + pub fn lookup_by_prefix(&self, prefix: &IpNet) -> Vec { let mut all_matches = vec![]; for (p, roas) in self.trie.matches(prefix) { if p.contains(prefix) { @@ -339,7 +435,7 @@ impl RpkiTrie { all_matches } - /// Validate a prefix with an ASN + /// Validate a prefix with an ASN. /// /// Return values: /// - `RpkiValidation::Valid` if the prefix-asn pair is valid @@ -360,7 +456,7 @@ impl RpkiTrie { RpkiValidation::Invalid } - /// Validate a prefix with an ASN, checking expiry dates + /// Validate a prefix with an ASN, checking expiry dates. /// /// Return values: /// - `RpkiValidation::Valid` if the prefix-asn pair is valid and not expired @@ -423,15 +519,18 @@ impl RpkiTrie { RpkiValidation::Invalid } + /// Reload the RPKI data from its original source. 
pub fn reload(&mut self) -> Result<()> { match self.date { Some(date) => { let trie = RpkiTrie::from_ripe_historical(date)?; self.trie = trie.trie; + self.aspas = trie.aspas; } None => { let trie = RpkiTrie::from_cloudflare()?; self.trie = trie.trie; + self.aspas = trie.aspas; } } @@ -457,8 +556,12 @@ impl LazyLoadable for RpkiTrie { } } +// ============================================================================ +// BgpkitCommons Integration +// ============================================================================ + impl BgpkitCommons { - pub fn rpki_lookup_by_prefix(&self, prefix: &str) -> Result> { + pub fn rpki_lookup_by_prefix(&self, prefix: &str) -> Result> { if self.rpki_trie.is_none() { return Err(BgpkitCommonsError::module_not_loaded( modules::RPKI, @@ -503,6 +606,10 @@ impl BgpkitCommons { } } +// ============================================================================ +// Tests +// ============================================================================ + #[cfg(test)] mod tests { use super::*; @@ -512,179 +619,250 @@ mod tests { fn test_multiple_roas_same_prefix() { let mut trie = RpkiTrie::new(None); - // Create a test prefix - let prefix: IpNet = "10.0.0.0/8".parse().unwrap(); - - // Create multiple ROAs for the same prefix with different ASNs - let roa1 = RoaEntry { - prefix, + // Insert first ROA + let roa1 = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64496, - max_length: 16, - rir: Some(Rir::ARIN), + max_length: 24, + rir: Some(Rir::APNIC), not_before: None, not_after: None, }; + assert!(trie.insert_roa(roa1.clone())); - let roa2 = RoaEntry { - prefix, + // Insert second ROA with different ASN for same prefix + let roa2 = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64497, max_length: 24, - rir: Some(Rir::ARIN), + rir: Some(Rir::APNIC), not_before: None, not_after: None, }; + assert!(!trie.insert_roa(roa2.clone())); - // Create a duplicate ROA (same prefix, asn, max_length as roa1) - let roa1_duplicate = 
RoaEntry { - prefix, + // Insert duplicate ROA (same prefix, asn, max_length) - should be ignored + let roa_dup = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64496, - max_length: 16, - rir: Some(Rir::APNIC), // Different RIR but same (prefix, asn, max_length) + max_length: 24, + rir: Some(Rir::ARIN), // Different RIR shouldn't matter not_before: None, not_after: None, }; + assert!(!trie.insert_roa(roa_dup)); - // Insert ROAs - assert!(trie.insert_roa(roa1)); // Should return true for new prefix - assert!(!trie.insert_roa(roa2)); // Should return false for existing prefix - assert!(!trie.insert_roa(roa1_duplicate)); // Should return false and not add duplicate - - // Lookup should return only 2 ROAs (duplicate should be ignored) - let matches = trie.lookup_by_prefix(&prefix); - assert_eq!(matches.len(), 2); + // Insert ROA with different max_length - should be added + let roa3 = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), + asn: 64496, + max_length: 28, + rir: Some(Rir::APNIC), + not_before: None, + not_after: None, + }; + assert!(!trie.insert_roa(roa3.clone())); - // Check that both ASNs are present - let asns: std::collections::HashSet = matches.iter().map(|r| r.asn).collect(); - assert!(asns.contains(&64496)); - assert!(asns.contains(&64497)); + // Lookup should return 3 ROAs (roa1, roa2, roa3) + let prefix: IpNet = "192.0.2.0/24".parse().unwrap(); + let roas = trie.lookup_by_prefix(&prefix); + assert_eq!(roas.len(), 3); - // Test validation - should be valid for both ASNs + // Validate AS 64496 - should be valid assert_eq!(trie.validate(&prefix, 64496), RpkiValidation::Valid); + + // Validate AS 64497 - should be valid assert_eq!(trie.validate(&prefix, 64497), RpkiValidation::Valid); + + // Validate AS 64498 - should be invalid (prefix has ROAs but not for this ASN) assert_eq!(trie.validate(&prefix, 64498), RpkiValidation::Invalid); + + // Validate unknown prefix - should be unknown + let unknown_prefix: IpNet = "10.0.0.0/8".parse().unwrap(); + 
assert_eq!( + trie.validate(&unknown_prefix, 64496), + RpkiValidation::Unknown + ); } #[test] - fn test_validate_check_expiry() { + fn test_validate_check_expiry_with_time_constraints() { let mut trie = RpkiTrie::new(None); - // Create a test prefix - let prefix: IpNet = "10.0.0.0/8".parse().unwrap(); - - // Create test dates - let past = DateTime::from_timestamp(1000000000, 0).unwrap().naive_utc(); // 2001-09-09 - let present = DateTime::from_timestamp(1700000000, 0).unwrap().naive_utc(); // 2023-11-14 - let future = DateTime::from_timestamp(2000000000, 0).unwrap().naive_utc(); // 2033-05-18 - - // Test 1: ROA with no time constraints - let roa_no_time = RoaEntry { - prefix, + // Time references + let past_time = DateTime::from_timestamp(1600000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); + let current_time = DateTime::from_timestamp(1700000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); + let future_time = DateTime::from_timestamp(1800000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); + + // Insert ROA that's currently valid (not_before in past, not_after in future) + let roa_valid = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64496, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: None, - not_after: None, + max_length: 24, + rir: Some(Rir::APNIC), + not_before: Some(past_time), + not_after: Some(future_time), }; - trie.insert_roa(roa_no_time.clone()); - - // Should be valid at any time - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, Some(past)), - RpkiValidation::Valid - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, Some(present)), - RpkiValidation::Valid - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, Some(future)), - RpkiValidation::Valid - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, None), - RpkiValidation::Valid - ); + trie.insert_roa(roa_valid); - // Test 2: ROA that's expired - let expired_roa = RoaEntry { - prefix, + // Insert ROA that's expired + let roa_expired = Roa { + 
prefix: "198.51.100.0/24".parse().unwrap(), asn: 64497, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: None, - not_after: Some(past), + max_length: 24, + rir: Some(Rir::APNIC), + not_before: Some(past_time), + not_after: Some(past_time), // Expired in the past }; - trie.insert_roa(expired_roa); - - // Should be unknown after expiry (ROA exists but is outside valid time range) - assert_eq!( - trie.validate_check_expiry(&prefix, 64497, Some(present)), - RpkiValidation::Unknown - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64497, None), - RpkiValidation::Unknown - ); + trie.insert_roa(roa_expired); - // Test 3: ROA that's not yet valid - let future_roa = RoaEntry { - prefix, + // Insert ROA that's not yet valid + let roa_future = Roa { + prefix: "203.0.113.0/24".parse().unwrap(), asn: 64498, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: Some(future), + max_length: 24, + rir: Some(Rir::APNIC), + not_before: Some(future_time), // Not valid yet not_after: None, }; - trie.insert_roa(future_roa); + trie.insert_roa(roa_future); - // Should be unknown before validity period (ROA exists but is outside valid time range) + // Test valid ROA at current time + let prefix_valid: IpNet = "192.0.2.0/24".parse().unwrap(); assert_eq!( - trie.validate_check_expiry(&prefix, 64498, Some(present)), - RpkiValidation::Unknown + trie.validate_check_expiry(&prefix_valid, 64496, Some(current_time)), + RpkiValidation::Valid ); + + // Test expired ROA at current time - should return Unknown (was valid but expired) + let prefix_expired: IpNet = "198.51.100.0/24".parse().unwrap(); assert_eq!( - trie.validate_check_expiry(&prefix, 64498, None), + trie.validate_check_expiry(&prefix_expired, 64497, Some(current_time)), RpkiValidation::Unknown ); - // Test 4: ROA with valid time window - let windowed_roa = RoaEntry { - prefix, - asn: 64499, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: Some(past), - not_after: Some(future), - }; - trie.insert_roa(windowed_roa); 
- - // Should be valid within window + // Test not-yet-valid ROA at current time - should return Unknown + let prefix_future: IpNet = "203.0.113.0/24".parse().unwrap(); assert_eq!( - trie.validate_check_expiry(&prefix, 64499, Some(present)), - RpkiValidation::Valid - ); - // Should be unknown outside window (ROA exists but is outside valid time range) - assert_eq!( - trie.validate_check_expiry( - &prefix, - 64499, - Some(DateTime::from_timestamp(900000000, 0).unwrap().naive_utc()) - ), + trie.validate_check_expiry(&prefix_future, 64498, Some(current_time)), RpkiValidation::Unknown ); + + // Test not-yet-valid ROA at future time - should return Valid + let far_future = DateTime::from_timestamp(1900000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); assert_eq!( - trie.validate_check_expiry( - &prefix, - 64499, - Some(DateTime::from_timestamp(2100000000, 0).unwrap().naive_utc()) - ), - RpkiValidation::Unknown + trie.validate_check_expiry(&prefix_future, 64498, Some(far_future)), + RpkiValidation::Valid ); - // Test 5: Ensure Invalid is still returned for wrong ASN + // Test wrong ASN - should return Invalid assert_eq!( - trie.validate_check_expiry(&prefix, 99999, Some(present)), + trie.validate_check_expiry(&prefix_valid, 64499, Some(current_time)), RpkiValidation::Invalid ); } + + #[test] + #[ignore] // Requires network access + fn test_load_from_ripe_historical() { + // Use a recent date that should have data available + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let trie = RpkiTrie::from_ripe_historical(date).expect("Failed to load RIPE data"); + + let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); + println!( + "Loaded {} ROAs from RIPE historical for {}", + total_roas, date + ); + println!("Loaded {} ASPAs", trie.aspas.len()); + + assert!(total_roas > 0, "Should have loaded some ROAs"); + } + + #[test] + #[ignore] // Requires network access + fn test_load_from_rpkiviews() { + // Note: This test streams from a remote tgz 
file but stops early + // once rpki-client.json is found (typically at position 3-4 in the archive). + // Due to streaming optimization, this typically completes in ~8 seconds. + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::default(), date) + .expect("Failed to load RPKIviews data"); + + let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); + println!("Loaded {} ROAs from RPKIviews for {}", total_roas, date); + println!("Loaded {} ASPAs", trie.aspas.len()); + + assert!(total_roas > 0, "Should have loaded some ROAs"); + } + + #[test] + #[ignore] // Requires network access + fn test_rpkiviews_file_position() { + // Verify that rpki-client.json appears early in the archive + // This confirms our early-termination optimization works + use crate::rpki::rpkiviews::list_files_in_tgz; + + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::default(), date) + .expect("Failed to list files"); + + assert!(!files.is_empty(), "Should have found some files"); + + let tgz_url = &files[0].url; + println!("Checking file positions in: {}", tgz_url); + + // List first 50 entries to see where rpki-client.json appears + let entries = list_files_in_tgz(tgz_url, Some(50)).expect("Failed to list tgz entries"); + + let json_position = entries + .iter() + .position(|e| e.path.ends_with("rpki-client.json")); + + println!("First {} entries:", entries.len()); + for (i, entry) in entries.iter().enumerate() { + println!(" [{}] {} ({} bytes)", i, entry.path, entry.size); + } + + if let Some(pos) = json_position { + println!( + "\nrpki-client.json found at position {} (early in archive)", + pos + ); + assert!( + pos < 50, + "rpki-client.json should appear early in the archive" + ); + } else { + println!("\nrpki-client.json not in first 50 entries - may need to stream more"); + } + } + + #[test] + #[ignore] // Requires network access + fn 
test_list_rpkiviews_files() { + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::default(), date) + .expect("Failed to list files"); + + println!("Found {} files for {} from Kerfuffle", files.len(), date); + for file in files.iter().take(3) { + println!( + " {} ({} bytes, {})", + file.url, + file.size.unwrap_or(0), + file.timestamp + ); + } + + assert!(!files.is_empty(), "Should have found some files"); + } } diff --git a/src/rpki/ripe_historical.rs b/src/rpki/ripe_historical.rs index 9ff2cdd..bfa9f13 100644 --- a/src/rpki/ripe_historical.rs +++ b/src/rpki/ripe_historical.rs @@ -1,15 +1,64 @@ -//! load RIPE NCC historical RPKI VRP dump +//! Load RIPE NCC historical RPKI VRP dump using JSON format. +//! +//! RIPE NCC provides historical RPKI data archives for all 5 RIRs: +//! - AFRINIC: +//! - APNIC: +//! - ARIN: +//! - LACNIC: +//! - RIPE NCC: use crate::Result; -use crate::rpki::{Rir, RoaEntry, RpkiTrie}; -use chrono::{Datelike, NaiveDate, NaiveDateTime}; -use ipnet::IpNet; -use std::str::FromStr; +use crate::rpki::rpki_client::RpkiClientData; +use crate::rpki::{Rir, RpkiFile, RpkiTrie}; +use chrono::{Datelike, NaiveDate, Utc}; use tracing::info; +/// List available RIPE output.json.xz files for a given date from all RIRs. 
+pub fn list_ripe_files(date: NaiveDate) -> Result> { + let mut files = vec![]; + + for rir in [ + Rir::AFRINIC, + Rir::APNIC, + Rir::ARIN, + Rir::LACNIC, + Rir::RIPENCC, + ] { + let base_url = rir.to_ripe_ftp_root_url(); + let url = format!( + "{}/{:04}/{:02}/{:02}/output.json.xz", + base_url, + date.year(), + date.month(), + date.day() + ); + + files.push(RpkiFile { + url, + // We don't have exact timestamp for RIPE files, use midnight of the date + timestamp: date + .and_hms_opt(0, 0, 0) + .unwrap() + .and_utc() + .fixed_offset() + .with_timezone(&Utc), + size: None, + rir: Some(rir), + collector: None, + }); + } + + Ok(files) +} + impl RpkiTrie { + /// Load RPKI data from RIPE NCC historical archives for a specific date. + /// + /// This loads data from all 5 RIRs (AFRINIC, APNIC, ARIN, LACNIC, RIPENCC) + /// using the output.json.xz format which contains ROAs and ASPAs. pub fn from_ripe_historical(date: NaiveDate) -> Result { let mut trie = RpkiTrie::new(Some(date)); + for rir in [ Rir::AFRINIC, Rir::APNIC, @@ -17,55 +66,32 @@ impl RpkiTrie { Rir::LACNIC, Rir::RIPENCC, ] { - let roas = Self::load_vrp_from_ripe(rir, date)?; - for roa in roas { - trie.insert_roa(roa); - } + let url = format!( + "{}/{:04}/{:02}/{:02}/output.json.xz", + rir.to_ripe_ftp_root_url(), + date.year(), + date.month(), + date.day() + ); + info!("loading {} ROAs from {}", rir, url); + + let data = RpkiClientData::from_url(&url)?; + trie.merge_rpki_client_data(data); } + Ok(trie) } - fn load_vrp_from_ripe(rir: Rir, date: NaiveDate) -> Result> { - let mut roas = vec![]; - let base_url = rir.to_ripe_ftp_root_url(); - let url = format!( - "{}/{:04}/{:02}/{:02}/roas.csv.xz", - base_url, - date.year(), - date.month(), - date.day() - ); - info!("loading {} ROAs from {}", rir, url); - for line in oneio::read_lines(&url)?.skip(1) { - let line = line?; - let mut fields = line.split(','); - let _uri = fields.next().unwrap(); - let asn = fields - .next() - .unwrap() - .to_lowercase() - 
.strip_prefix("as") - .unwrap() - .parse::()?; - let prefix = IpNet::from_str(fields.next().unwrap())?; - let max_length = match fields.next().unwrap().parse::() { - Ok(l) => l, - Err(_) => continue, - }; - let not_before = - NaiveDateTime::parse_from_str(fields.next().unwrap(), "%Y-%m-%d %H:%M:%S").ok(); - let not_after = - NaiveDateTime::parse_from_str(fields.next().unwrap(), "%Y-%m-%d %H:%M:%S").ok(); - let roa_entry = RoaEntry { - prefix, - asn, - max_length, - rir: Some(rir), - not_before, - not_after, - }; - roas.push(roa_entry); + /// Load RPKI data from specific RIPE file URLs. + pub fn from_ripe_files(urls: &[String], date: Option) -> Result { + let mut trie = RpkiTrie::new(date); + + for url in urls { + info!("loading ROAs from {}", url); + let data = RpkiClientData::from_url(url)?; + trie.merge_rpki_client_data(data); } - Ok(roas) + + Ok(trie) } } diff --git a/src/rpki/rpki_client.rs b/src/rpki/rpki_client.rs new file mode 100644 index 0000000..753c733 --- /dev/null +++ b/src/rpki/rpki_client.rs @@ -0,0 +1,371 @@ +//! Internal data structures for parsing rpki-client JSON output format. +//! +//! The `rpki-client` software produces JSON output that is used by multiple RPKI data sources: +//! - Cloudflare RPKI Portal () +//! - RIPE NCC historical archives (output.json.xz files) +//! - RPKIviews collectors (rpki-client.json inside .tgz files) +//! +//! This module defines the internal data structures for parsing this JSON format. +//! For public access, use the `Roa` and `Aspa` structs from the parent module. + +use serde::{Deserialize, Deserializer, Serialize}; + +/// Custom deserializer for ASN that handles both numeric and string formats. +/// RIPE uses "AS12345" format, while Cloudflare uses numeric 12345. 
+fn deserialize_asn<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + use serde::de::{self, Visitor}; + + struct AsnVisitor; + + impl<'de> Visitor<'de> for AsnVisitor { + type Value = u32; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("an ASN as a number or string like 'AS12345'") + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + u32::try_from(value).map_err(|_| E::custom(format!("ASN {} out of range", value))) + } + + fn visit_i64(self, value: i64) -> Result + where + E: de::Error, + { + u32::try_from(value).map_err(|_| E::custom(format!("ASN {} out of range", value))) + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + // Handle "AS12345" or "as12345" format + let num_str = value + .strip_prefix("AS") + .or_else(|| value.strip_prefix("as")) + .unwrap_or(value); + + num_str + .parse::() + .map_err(|_| E::custom(format!("invalid ASN string: {}", value))) + } + } + + deserializer.deserialize_any(AsnVisitor) +} + +/// Custom deserializer for expires that handles both i64 and u64. +fn deserialize_expires<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + use serde::de::{self, Visitor}; + + struct ExpiresVisitor; + + impl<'de> Visitor<'de> for ExpiresVisitor { + type Value = u64; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a timestamp as a number") + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + Ok(value) + } + + fn visit_i64(self, value: i64) -> Result + where + E: de::Error, + { + if value >= 0 { + Ok(value as u64) + } else { + Err(E::custom(format!("negative timestamp: {}", value))) + } + } + } + + deserializer.deserialize_any(ExpiresVisitor) +} + +/// Custom deserializer for provider list that handles both string array and number array. +/// RIPE uses ["AS123", "AS456"] format, Cloudflare uses [123, 456]. 
+fn deserialize_providers<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + use serde::de::{self, SeqAccess, Visitor}; + + struct ProvidersVisitor; + + impl<'de> Visitor<'de> for ProvidersVisitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a list of ASNs as numbers or strings") + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + let mut providers = Vec::new(); + + while let Some(elem) = seq.next_element::()? { + let asn = match elem { + serde_json::Value::Number(n) => n + .as_u64() + .and_then(|v| u32::try_from(v).ok()) + .ok_or_else(|| de::Error::custom("invalid ASN number"))?, + serde_json::Value::String(s) => { + let num_str = s + .strip_prefix("AS") + .or_else(|| s.strip_prefix("as")) + .unwrap_or(&s); + num_str + .parse::() + .map_err(|_| de::Error::custom(format!("invalid ASN string: {}", s)))? + } + _ => return Err(de::Error::custom("expected number or string")), + }; + providers.push(asn); + } + + Ok(providers) + } + } + + deserializer.deserialize_seq(ProvidersVisitor) +} + +/// The main rpki-client JSON output structure. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub(crate) struct RpkiClientData { + #[serde(default)] + pub metadata: RpkiClientMetadata, + #[serde(default)] + pub roas: Vec, + #[serde(default)] + pub aspas: Vec, + #[serde(default)] + pub bgpsec_keys: Vec, +} + +/// Metadata about the rpki-client validation run. 
+#[derive(Clone, Debug, Serialize, Deserialize, Default)] +pub(crate) struct RpkiClientMetadata { + pub buildmachine: Option, + pub buildtime: Option, + #[serde(default)] + pub generated: Option, + #[serde(rename = "generatedTime", default)] + pub generated_time: Option, + pub elapsedtime: Option, + pub usertime: Option, + pub systemtime: Option, + pub roas: Option, + pub failedroas: Option, + pub invalidroas: Option, + pub spls: Option, + pub failedspls: Option, + pub invalidspls: Option, + pub aspas: Option, + pub failedaspas: Option, + pub invalidaspas: Option, + pub bgpsec_pubkeys: Option, + pub certificates: Option, + pub invalidcertificates: Option, + pub taks: Option, + pub tals: Option, + pub invalidtals: Option, + pub talfiles: Option>, + pub manifests: Option, + pub failedmanifests: Option, + pub crls: Option, + pub gbrs: Option, + pub repositories: Option, + pub vrps: Option, + pub uniquevrps: Option, + pub vsps: Option, + pub uniquevsps: Option, + pub vaps: Option, + pub uniquevaps: Option, + pub cachedir_new_files: Option, + pub cachedir_del_files: Option, + pub cachedir_del_dirs: Option, + pub cachedir_superfluous_files: Option, + pub cachedir_del_superfluous_files: Option, +} + +/// A validated Route Origin Authorization (ROA) entry from rpki-client. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub(crate) struct RpkiClientRoaEntry { + pub prefix: String, + #[serde(rename = "maxLength")] + pub max_length: u8, + #[serde(deserialize_with = "deserialize_asn")] + pub asn: u32, + pub ta: String, + #[serde(default, deserialize_with = "deserialize_expires")] + pub expires: u64, +} + +/// A validated AS Provider Authorization (ASPA) entry from rpki-client. +/// +/// Handles both Cloudflare format (customer_asid as number, providers as numbers) +/// and RIPE format (customer as string "AS123", providers as strings ["AS456"]). 
+#[derive(Clone, Debug, Serialize, Deserialize)] +pub(crate) struct RpkiClientAspaEntry { + /// Customer ASN - Cloudflare uses "customer_asid", RIPE uses "customer" + #[serde(alias = "customer", deserialize_with = "deserialize_asn")] + pub customer_asid: u32, + /// Expiry timestamp - may be missing in RIPE format + #[serde(default)] + pub expires: i64, + /// Provider ASNs - can be numbers or strings like "AS123" + #[serde(deserialize_with = "deserialize_providers")] + pub providers: Vec, +} + +/// A validated BGPsec router key entry from rpki-client. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub(crate) struct RpkiClientBgpsecKeyEntry { + pub asn: u32, + pub ski: String, + pub pubkey: String, + pub ta: String, + pub expires: i64, +} + +impl RpkiClientData { + /// Load rpki-client data from a URL. + /// + /// This uses oneio to handle remote URLs and compression (xz, gz, etc.), + /// then parses the JSON with our custom deserializers. + pub fn from_url(url: &str) -> crate::Result { + let reader = oneio::get_reader(url)?; + let data: RpkiClientData = serde_json::from_reader(reader)?; + Ok(data) + } + + /// Load rpki-client data from a JSON string. 
+ pub fn from_json(json: &str) -> crate::Result { + let data: RpkiClientData = serde_json::from_str(json)?; + Ok(data) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_deserialize_empty() { + let json = r#"{}"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert!(data.roas.is_empty()); + assert!(data.aspas.is_empty()); + assert!(data.bgpsec_keys.is_empty()); + } + + #[test] + fn test_deserialize_roa_numeric_asn() { + let json = r#"{ + "roas": [ + { + "prefix": "192.0.2.0/24", + "maxLength": 24, + "asn": 64496, + "ta": "apnic", + "expires": 1704067200 + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.roas.len(), 1); + assert_eq!(data.roas[0].prefix, "192.0.2.0/24"); + assert_eq!(data.roas[0].max_length, 24); + assert_eq!(data.roas[0].asn, 64496); + assert_eq!(data.roas[0].ta, "apnic"); + } + + #[test] + fn test_deserialize_roa_string_asn() { + // RIPE format uses "AS12345" string format + let json = r#"{ + "roas": [ + { + "prefix": "1.178.112.0/20", + "maxLength": 24, + "asn": "AS12975", + "ta": "ripencc" + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.roas.len(), 1); + assert_eq!(data.roas[0].prefix, "1.178.112.0/20"); + assert_eq!(data.roas[0].max_length, 24); + assert_eq!(data.roas[0].asn, 12975); + assert_eq!(data.roas[0].ta, "ripencc"); + } + + #[test] + fn test_deserialize_roa_lowercase_asn() { + let json = r#"{ + "roas": [ + { + "prefix": "10.0.0.0/8", + "maxLength": 8, + "asn": "as64496", + "ta": "arin" + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.roas[0].asn, 64496); + } + + #[test] + fn test_deserialize_aspa() { + let json = r#"{ + "aspas": [ + { + "customer_asid": 64496, + "expires": 1704067200, + "providers": [64497, 64498] + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.aspas.len(), 1); + 
assert_eq!(data.aspas[0].customer_asid, 64496); + assert_eq!(data.aspas[0].providers, vec![64497, 64498]); + } + + #[test] + fn test_deserialize_ripe_metadata() { + let json = r#"{ + "metadata": { + "generated": 1717215759, + "generatedTime": "2024-06-01T04:22:39Z" + } + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.metadata.generated, Some(1717215759)); + assert_eq!( + data.metadata.generated_time, + Some("2024-06-01T04:22:39Z".to_string()) + ); + } +} diff --git a/src/rpki/rpkiviews.rs b/src/rpki/rpkiviews.rs new file mode 100644 index 0000000..7cf1fb1 --- /dev/null +++ b/src/rpki/rpkiviews.rs @@ -0,0 +1,685 @@ +//! Load RPKI data from RPKIviews collectors. +//! +//! RPKIviews provides historical RPKI data from multiple collectors around the world. +//! See: +//! +//! ## System Requirements +//! +//! This module requires the `gunzip` command to be available in the system PATH +//! for decompressing `.tgz` archives. On most Unix-like systems, this is provided +//! by the `gzip` package. On macOS, it is available by default. On Windows, +//! you may need to install it via WSL, Cygwin, or similar tools. + +use crate::Result; +use crate::rpki::rpki_client::RpkiClientData; +use crate::rpki::{RpkiFile, RpkiTrie}; +use chrono::{DateTime, Datelike, NaiveDate}; +use serde::{Deserialize, Serialize}; +use std::io::{Read, Write}; +use std::process::{Command, Stdio}; +use std::str::FromStr; +use tracing::info; + +/// Available RPKIviews collectors. 
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub enum RpkiViewsCollector { + /// josephine.sobornost.net - A2B Internet (AS51088), Amsterdam, Netherlands + #[default] + SoborostNet, + /// amber.massars.net - Massar (AS57777), Lugano, Switzerland + MassarsNet, + /// dango.attn.jp - Internet Initiative Japan (AS2497), Tokyo, Japan + AttnJp, + /// rpkiviews.kerfuffle.net - Kerfuffle, LLC (AS35008), Fremont, California, United States + KerfuffleNet, +} + +impl RpkiViewsCollector { + /// Get the HTTPS base URL for this collector + pub fn base_url(&self) -> &'static str { + match self { + RpkiViewsCollector::SoborostNet => "https://josephine.sobornost.net/rpkidata", + RpkiViewsCollector::MassarsNet => "https://amber.massars.net/rpkidata", + RpkiViewsCollector::AttnJp => "https://dango.attn.jp/rpkidata", + RpkiViewsCollector::KerfuffleNet => "https://rpkiviews.kerfuffle.net/rpkidata", + } + } + + /// Get the index.txt URL for this collector + pub fn index_url(&self) -> String { + format!("{}/index.txt", self.base_url()) + } + + /// Get all available collectors + pub fn all() -> Vec { + vec![ + RpkiViewsCollector::SoborostNet, + RpkiViewsCollector::MassarsNet, + RpkiViewsCollector::AttnJp, + RpkiViewsCollector::KerfuffleNet, + ] + } +} + +impl std::fmt::Display for RpkiViewsCollector { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RpkiViewsCollector::SoborostNet => write!(f, "sobornost.net"), + RpkiViewsCollector::MassarsNet => write!(f, "massars.net"), + RpkiViewsCollector::AttnJp => write!(f, "attn.jp"), + RpkiViewsCollector::KerfuffleNet => write!(f, "kerfuffle.net"), + } + } +} + +impl FromStr for RpkiViewsCollector { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "sobornost.net" | "josephine.sobornost.net" => Ok(RpkiViewsCollector::SoborostNet), + "massars.net" | "amber.massars.net" => Ok(RpkiViewsCollector::MassarsNet), + 
"attn.jp" | "dango.attn.jp" => Ok(RpkiViewsCollector::AttnJp), + "kerfuffle.net" | "rpkiviews.kerfuffle.net" => Ok(RpkiViewsCollector::KerfuffleNet), + _ => Err(format!("unknown RPKIviews collector: {}", s)), + } + } +} + +/// List available RPKIviews files for a given date from a specific collector. +/// +/// This function reads the index.txt file from the collector to find available +/// archives for the specified date. This is a fast operation as it only downloads +/// a small index file. +pub fn list_rpkiviews_files( + collector: RpkiViewsCollector, + date: NaiveDate, +) -> Result> { + let index_url = collector.index_url(); + let base_url = collector.base_url(); + + // Format the date path prefix we're looking for (e.g., "2024/01/04/") + let date_prefix = format!("{:04}/{:02}/{:02}/", date.year(), date.month(), date.day()); + + let mut files = vec![]; + + for line in oneio::read_lines(&index_url)? { + let line = line?; + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() < 3 { + continue; + } + + let path = parts[0]; + let timestamp_secs: i64 = match parts[1].parse() { + Ok(t) => t, + Err(_) => continue, + }; + let size: u64 = match parts[2].parse() { + Ok(s) => s, + Err(_) => continue, + }; + + // Check if this file is for the requested date and is a .tgz file + if path.starts_with(&date_prefix) && path.ends_with(".tgz") && path.contains("/rpki-") { + let url = format!("{}/{}", base_url, path); + let timestamp = DateTime::from_timestamp(timestamp_secs, 0) + .unwrap_or_else(|| DateTime::from_timestamp(0, 0).unwrap()); + + files.push(RpkiFile { + url, + timestamp, + size: Some(size), + rir: None, + collector: Some(collector), + }); + } + } + + // Sort by timestamp (oldest first) + files.sort_by_key(|f| f.timestamp); + + Ok(files) +} + +/// Information about a file entry found within a .tgz archive. 
+#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct TgzFileEntry { + /// Path of the file within the archive + pub path: String, + /// Size of the file in bytes + pub size: u64, +} + +/// List files within a remote .tgz archive by streaming only the tar headers. +/// +/// This function streams the .tgz archive and reads tar headers to enumerate +/// the files contained within. It skips reading the actual file content, which +/// makes it much faster than downloading the entire archive. +/// +/// **Important**: Due to the nature of gzip compression, we still need to decompress +/// the data sequentially, but we skip over file content rather than buffering it. +/// If `max_entries` is provided, we stop early after finding that many entries. +/// +/// # Arguments +/// * `url` - URL of the .tgz file +/// * `max_entries` - Optional maximum number of entries to return (for early termination) +#[allow(dead_code)] +pub fn list_files_in_tgz(url: &str, max_entries: Option<usize>) -> Result<Vec<TgzFileEntry>> { + info!("listing files in tgz archive: {}", url); + + // Spawn gunzip process + let mut gunzip = Command::new("gunzip") + .arg("-c") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to spawn gunzip: {}", e), + ) + })?; + + let mut gunzip_stdin = gunzip.stdin.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdin") + })?; + + let gunzip_stdout = gunzip.stdout.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdout") + })?; + + // Flag to signal early termination to writer thread + let should_stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let should_stop_writer = should_stop.clone(); + + // Stream the .tgz file using reqwest in a separate thread + let url_owned = url.to_string(); + let writer_thread = 
std::thread::spawn(move || -> Result<()> { + let response = reqwest::blocking::get(&url_owned).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to fetch {}: {}", url_owned, e), + ) + })?; + + if !response.status().is_success() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("HTTP error {} for {}", response.status(), url_owned), + )); + } + + let mut reader = response; + let mut buffer = [0u8; 65536]; + loop { + // Check if we should stop early + if should_stop_writer.load(std::sync::atomic::Ordering::SeqCst) { + break; + } + + let n = match reader.read(&mut buffer) { + Ok(0) => break, + Ok(n) => n, + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => { + tracing::debug!("Network read error during tgz streaming: {}", e); + break; + } + }; + + if gunzip_stdin.write_all(&buffer[..n]).is_err() { + // Pipe closed (reader side done), stop gracefully + break; + } + } + drop(gunzip_stdin); + Ok(()) + }); + + // Read tar entries from gunzip stdout - only reading headers, skipping content + let mut archive = tar::Archive::new(gunzip_stdout); + let mut entries_list = Vec::new(); + + let entries_iter = archive.entries().map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to read tar entries: {}", e), + ) + })?; + + for entry_result in entries_iter { + let entry = match entry_result { + Ok(e) => e, + Err(_) => continue, + }; + + let path = match entry.path() { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => continue, + }; + + let size = entry.size(); + + // Skip directories (they have size 0 and path ends with /) + if !path.ends_with('/') { + entries_list.push(TgzFileEntry { path, size }); + } + + // Check if we've reached max_entries + if let Some(max) = max_entries { + if entries_list.len() >= max { + // Signal writer to stop and break out + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + break; + } 
+ } + } + + // Signal writer to stop (in case we finished iterating) + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + + // Wait for writer thread to finish to ensure clean shutdown + let _ = writer_thread.join(); + + // Kill gunzip process to clean up + let _ = gunzip.kill(); + let _ = gunzip.wait(); + + Ok(entries_list) +} + +/// Check if a .tgz archive contains a specific file path. +/// +/// This is an optimized function that stops as soon as it finds the target file, +/// avoiding the need to decompress the entire archive. +#[allow(dead_code)] +pub fn tgz_contains_file(url: &str, target_path: &str) -> Result<bool> { + info!( + "checking if tgz archive contains file: {} in {}", + target_path, url + ); + + let mut gunzip = Command::new("gunzip") + .arg("-c") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to spawn gunzip: {}", e), + ) + })?; + + let mut gunzip_stdin = gunzip.stdin.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdin") + })?; + + let gunzip_stdout = gunzip.stdout.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdout") + })?; + + let should_stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let should_stop_writer = should_stop.clone(); + + let url_owned = url.to_string(); + let writer_thread = std::thread::spawn(move || -> Result<()> { + let response = reqwest::blocking::get(&url_owned).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to fetch {}: {}", url_owned, e), + ) + })?; + + if !response.status().is_success() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("HTTP error {} for {}", response.status(), url_owned), + )); + } + + let mut reader = response; + let mut buffer = [0u8; 
65536]; + loop { + if should_stop_writer.load(std::sync::atomic::Ordering::SeqCst) { + break; + } + + let n = match reader.read(&mut buffer) { + Ok(0) => break, + Ok(n) => n, + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => { + tracing::debug!("Network read error during tgz streaming: {}", e); + break; + } + }; + + if gunzip_stdin.write_all(&buffer[..n]).is_err() { + break; + } + } + drop(gunzip_stdin); + Ok(()) + }); + + let mut archive = tar::Archive::new(gunzip_stdout); + let mut found = false; + + if let Ok(entries_iter) = archive.entries() { + for entry_result in entries_iter { + let entry = match entry_result { + Ok(e) => e, + Err(_) => continue, + }; + + let path = match entry.path() { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => continue, + }; + + if path.ends_with(target_path) || path == target_path { + found = true; + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + break; + } + } + } + + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + let _ = writer_thread.join(); + let _ = gunzip.kill(); + let _ = gunzip.wait(); + + Ok(found) +} + +/// Extract a specific file from a remote .tgz archive. +/// +/// This function streams the .tgz archive and extracts only the target file, +/// stopping as soon as the file is found and read. This is much faster than +/// downloading and extracting the entire archive. +/// +/// # Arguments +/// * `url` - URL of the .tgz file +/// * `target_path` - Path of the file to extract within the archive (e.g., "output/rpki-client.json") +/// +/// # Returns +/// The content of the target file as a String, or an error if the file is not found. 
+pub fn extract_file_from_tgz(url: &str, target_path: &str) -> Result { + info!("extracting {} from tgz archive: {}", target_path, url); + + let mut gunzip = Command::new("gunzip") + .arg("-c") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to spawn gunzip: {}", e), + ) + })?; + + let mut gunzip_stdin = gunzip.stdin.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdin") + })?; + + let gunzip_stdout = gunzip.stdout.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdout") + })?; + + let should_stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let should_stop_writer = should_stop.clone(); + + let url_owned = url.to_string(); + let writer_thread = std::thread::spawn(move || -> Result<()> { + let response = reqwest::blocking::get(&url_owned).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to fetch {}: {}", url_owned, e), + ) + })?; + + if !response.status().is_success() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("HTTP error {} for {}", response.status(), url_owned), + )); + } + + let mut reader = response; + let mut buffer = [0u8; 65536]; + loop { + if should_stop_writer.load(std::sync::atomic::Ordering::SeqCst) { + break; + } + + let n = match reader.read(&mut buffer) { + Ok(0) => break, + Ok(n) => n, + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => { + tracing::debug!("Network read error during tgz streaming: {}", e); + break; + } + }; + + if gunzip_stdin.write_all(&buffer[..n]).is_err() { + break; + } + } + drop(gunzip_stdin); + Ok(()) + }); + + let mut archive = tar::Archive::new(gunzip_stdout); + let mut content: Option = None; + + let entries_iter = 
archive.entries().map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to read tar entries: {}", e), + ) + })?; + + for entry_result in entries_iter { + let mut entry = match entry_result { + Ok(e) => e, + Err(_) => continue, + }; + + let path = match entry.path() { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => continue, + }; + + // Check if this is the target file + if path.ends_with(target_path) || path == target_path { + let mut file_content = String::new(); + entry.read_to_string(&mut file_content).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to read {}: {}", target_path, e), + ) + })?; + content = Some(file_content); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + break; + } + // Note: if this is not our target file, the tar iterator automatically + // skips past the file content when we move to the next entry, + // so we don't buffer unnecessary data + } + + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + let _ = writer_thread.join(); + let _ = gunzip.kill(); + let _ = gunzip.wait(); + + content.ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("{} not found in archive: {}", target_path, url), + ) + }) +} + +/// Stream and extract rpki-client.json from a .tgz URL. +/// +/// This is a convenience function that extracts the rpki-client.json file +/// from an RPKIviews archive. It stops streaming as soon as the file is found. +fn stream_tgz_and_extract_json(url: &str) -> Result { + let json_str = extract_file_from_tgz(url, "output/rpki-client.json")?; + RpkiClientData::from_json(&json_str) +} + +impl RpkiTrie { + /// Load RPKI data from RPKIviews for a specific date. + /// + /// This will use the first (earliest) available file for the given date from the specified collector. + /// The caller must specify which collector to use. See [`RpkiViewsCollector`] for available options. 
+ pub fn from_rpkiviews(collector: RpkiViewsCollector, date: NaiveDate) -> Result { + let files = list_rpkiviews_files(collector, date)?; + + if files.is_empty() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!( + "No RPKIviews files found for date {} from collector {}", + date, collector + ), + )); + } + + // Use the first (earliest) file for the date + let first_file = files.first().unwrap(); + info!( + "Using RPKIviews file from {} (timestamp: {})", + collector, first_file.timestamp + ); + + Self::from_rpkiviews_file(&first_file.url, Some(date)) + } + + /// Load RPKI data from a specific RPKIviews .tgz file URL. + pub fn from_rpkiviews_file(url: &str, date: Option) -> Result { + let data = stream_tgz_and_extract_json(url)?; + Self::from_rpki_client_data(data, date) + } + + /// Load RPKI data from multiple RPKIviews file URLs. + /// + /// This allows loading and merging data from multiple files into a single trie. + pub fn from_rpkiviews_files(urls: &[String], date: Option) -> Result { + let mut trie = RpkiTrie::new(date); + + for url in urls { + let data = stream_tgz_and_extract_json(url)?; + trie.merge_rpki_client_data(data); + } + + Ok(trie) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_collector_urls() { + assert_eq!( + RpkiViewsCollector::SoborostNet.base_url(), + "https://josephine.sobornost.net/rpkidata" + ); + assert_eq!( + RpkiViewsCollector::KerfuffleNet.index_url(), + "https://rpkiviews.kerfuffle.net/rpkidata/index.txt" + ); + } + + #[test] + fn test_collector_from_str() { + assert_eq!( + RpkiViewsCollector::from_str("sobornost.net").unwrap(), + RpkiViewsCollector::SoborostNet + ); + assert_eq!( + RpkiViewsCollector::from_str("amber.massars.net").unwrap(), + RpkiViewsCollector::MassarsNet + ); + } + + #[test] + fn test_default_collector() { + assert_eq!( + RpkiViewsCollector::default(), + RpkiViewsCollector::SoborostNet + ); + } + + #[test] + #[ignore] // Requires network access + fn 
test_list_rpkiviews_files() { + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::KerfuffleNet, date).unwrap(); + println!("Found {} files for {}", files.len(), date); + for file in &files { + println!(" {} ({} bytes)", file.url, file.size.unwrap_or(0)); + } + assert!(!files.is_empty()); + } + + #[test] + #[ignore] // Requires network access - streams partial archive + fn test_list_files_in_tgz() { + // List first 10 files in a remote tgz to verify streaming works + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::KerfuffleNet, date).unwrap(); + assert!(!files.is_empty()); + + let tgz_url = &files[0].url; + println!("Listing files in: {}", tgz_url); + + // Only get first 10 entries to test early termination + let entries = list_files_in_tgz(tgz_url, Some(10)).unwrap(); + println!("Found {} entries (limited to 10):", entries.len()); + for entry in &entries { + println!(" {} ({} bytes)", entry.path, entry.size); + } + assert!(!entries.is_empty()); + assert!(entries.len() <= 10); + } + + #[test] + #[ignore] // Requires network access - streams partial archive + fn test_tgz_contains_file() { + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::KerfuffleNet, date).unwrap(); + assert!(!files.is_empty()); + + let tgz_url = &files[0].url; + println!("Checking for rpki-client.json in: {}", tgz_url); + + let contains = tgz_contains_file(tgz_url, "output/rpki-client.json").unwrap(); + assert!(contains, "Archive should contain output/rpki-client.json"); + println!("Found rpki-client.json!"); + } + + #[test] + #[ignore] // Requires network access and takes time to download + fn test_from_rpkiviews() { + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::default(), date).unwrap(); + + let total_roas: usize = trie.trie.iter().map(|(_, 
roas)| roas.len()).sum(); + println!("Loaded {} ROAs from RPKIviews", total_roas); + assert!(total_roas > 0); + } +} diff --git a/tests/asinfo_integration.rs b/tests/asinfo_integration.rs index e09f30d..b17d3d1 100644 --- a/tests/asinfo_integration.rs +++ b/tests/asinfo_integration.rs @@ -1,3 +1,5 @@ +#![cfg(feature = "asinfo")] + /// Integration test for basic AS information retrieval. #[test] fn test_basic_info() { diff --git a/tests/mrt_collectors.rs b/tests/mrt_collectors.rs index 6ecb8e0..fbd0b96 100644 --- a/tests/mrt_collectors.rs +++ b/tests/mrt_collectors.rs @@ -1,3 +1,5 @@ +#![cfg(feature = "mrt_collectors")] + #[test] fn test_get_collectors() { let mut commons = bgpkit_commons::BgpkitCommons::new();