From b9028e5aa6e6895197be36f2c6ba2594a0f2f34d Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 11:30:28 -0800 Subject: [PATCH 1/7] chore: introduce AsInfoBuilder for AS info loading - Added a new `AsInfoBuilder` with fluent API methods to load AS information using specific data sources. - Introduced `asinfo_builder()` and `load_asinfo_with(builder)` methods for improved API ergonomics in `BgpkitCommons`. - Retained `load_asinfo(bool, ...)` for backward compatibility. - Exposed `PeeringdbData` for public access. --- CHANGELOG.md | 30 +++++++++++++++++++ src/asinfo/mod.rs | 73 ++++++++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 40 ++++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35ade18..b6710db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,36 @@ All notable changes to this project will be documented in this file. +## Unreleased + +### API Improvements + +* **AsInfoBuilder**: Added a new builder pattern for loading AS information with specific data sources + - New `AsInfoBuilder` struct with fluent API methods: `with_as2org()`, `with_population()`, `with_hegemony()`, `with_peeringdb()`, `with_all()` + - Added `asinfo_builder()` method to `BgpkitCommons` for creating builders + - Added `load_asinfo_with(builder)` method to `BgpkitCommons` for loading with builder configuration + - The existing `load_asinfo(bool, bool, bool, bool)` method is preserved for backward compatibility + +**Before (confusing boolean parameters):** +```rust +commons.load_asinfo(true, false, true, false)?; +``` + +**After (clear builder pattern):** +```rust +let builder = commons.asinfo_builder() + .with_as2org() + .with_hegemony(); +commons.load_asinfo_with(builder)?; +``` + +### Public API Enhancements + +* **asinfo module**: Added `PeeringdbData` to public exports for direct module access +* All modules now consistently support both: + - Central access via `BgpkitCommons` instance + 
- Direct module access (e.g., `bgpkit_commons::bogons::Bogons::new()`) + ## v0.9.6 - 2025-10-29 ### Maintenance diff --git a/src/asinfo/mod.rs b/src/asinfo/mod.rs index 0ded583..afe3af8 100644 --- a/src/asinfo/mod.rs +++ b/src/asinfo/mod.rs @@ -118,13 +118,13 @@ mod sibling_orgs; use crate::errors::{data_sources, load_methods, modules}; use crate::{BgpkitCommons, BgpkitCommonsError, LazyLoadable, Result}; -use peeringdb::PeeringdbData; use serde::{Deserialize, Serialize}; use sibling_orgs::SiblingOrgsUtils; use std::collections::HashMap; use tracing::info; pub use hegemony::HegemonyData; +pub use peeringdb::PeeringdbData; pub use population::AsnPopulationData; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -173,6 +173,77 @@ const RIPE_RIS_ASN_TXT_URL: &str = "https://ftp.ripe.net/ripe/asnames/asn.txt"; const BGPKIT_ASN_TXT_MIRROR_URL: &str = "https://data.bgpkit.com/commons/asn.txt"; const BGPKIT_ASNINFO_URL: &str = "https://data.bgpkit.com/commons/asinfo.jsonl"; +/// Builder for configuring which data sources to load for AS information. +/// +/// # Example +/// +/// ```rust,no_run +/// use bgpkit_commons::asinfo::AsInfoBuilder; +/// +/// let asinfo = AsInfoBuilder::new() +/// .with_as2org() +/// .with_peeringdb() +/// .build() +/// .unwrap(); +/// ``` +#[derive(Default)] +pub struct AsInfoBuilder { + load_as2org: bool, + load_population: bool, + load_hegemony: bool, + load_peeringdb: bool, +} + +impl AsInfoBuilder { + /// Create a new builder with all data sources disabled by default. + pub fn new() -> Self { + Self::default() + } + + /// Enable loading CAIDA AS-to-Organization mapping data. + pub fn with_as2org(mut self) -> Self { + self.load_as2org = true; + self + } + + /// Enable loading APNIC AS population data. + pub fn with_population(mut self) -> Self { + self.load_population = true; + self + } + + /// Enable loading IIJ IHR hegemony score data. 
+ pub fn with_hegemony(mut self) -> Self { + self.load_hegemony = true; + self + } + + /// Enable loading PeeringDB data. + pub fn with_peeringdb(mut self) -> Self { + self.load_peeringdb = true; + self + } + + /// Enable all optional data sources. + pub fn with_all(mut self) -> Self { + self.load_as2org = true; + self.load_population = true; + self.load_hegemony = true; + self.load_peeringdb = true; + self + } + + /// Build the AsInfoUtils with the configured data sources. + pub fn build(self) -> Result { + AsInfoUtils::new( + self.load_as2org, + self.load_population, + self.load_hegemony, + self.load_peeringdb, + ) + } +} + pub struct AsInfoUtils { pub asinfo_map: HashMap, pub sibling_orgs: Option, diff --git a/src/lib.rs b/src/lib.rs index 0bd44d6..79b540d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -339,6 +339,46 @@ impl BgpkitCommons { Ok(()) } + /// Returns a builder for loading AS information with specific data sources. + /// + /// This provides a more ergonomic way to configure which data sources to load + /// compared to the `load_asinfo()` method with boolean parameters. + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// + /// let mut commons = BgpkitCommons::new(); + /// let builder = commons.asinfo_builder() + /// .with_as2org() + /// .with_peeringdb(); + /// commons.load_asinfo_with(builder).unwrap(); + /// ``` + #[cfg(feature = "asinfo")] + pub fn asinfo_builder(&self) -> crate::asinfo::AsInfoBuilder { + crate::asinfo::AsInfoBuilder::new() + } + + /// Load AS information using a pre-configured builder. 
+ /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// + /// let mut commons = BgpkitCommons::new(); + /// let builder = commons.asinfo_builder() + /// .with_as2org() + /// .with_hegemony(); + /// commons.load_asinfo_with(builder).unwrap(); + /// ``` + #[cfg(feature = "asinfo")] + pub fn load_asinfo_with(&mut self, builder: crate::asinfo::AsInfoBuilder) -> Result<()> { + self.asinfo = Some(builder.build()?); + Ok(()) + } + /// Load AS-level relationship data #[cfg(feature = "as2rel")] pub fn load_as2rel(&mut self) -> Result<()> { From fdac67175d78bac1122b79174e331a41351e2906 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 11:49:17 -0800 Subject: [PATCH 2/7] chore: consolidate crates into bgpkit-commons - Migrated `as2org-rs` crate into `bgpkit-commons` as `as2org` module: - Implemented AS-to-Organization mapping with `As2org` struct and core methods. - Removed external `as2org-rs` dependency from `Cargo.toml`. - Migrated `peeringdb-rs` crate into `bgpkit-commons` as `peeringdb` module: - Integrated PeeringDB API client with `PeeringdbNet` struct and utility functions. - Removed external `peeringdb-rs` dependency from `Cargo.toml`. - Updated `asinfo` feature flags to replace external dependencies with `regex`. - Added comprehensive tests for `as2org` and `peeringdb` modules for reliability. - Simplified maintenance by unifying all functionality in the shared repository. --- CHANGELOG.md | 39 +++ Cargo.toml | 4 +- src/asinfo/as2org.rs | 516 ++++++++++++++++++++++++++++++++++++++++ src/asinfo/mod.rs | 5 +- src/asinfo/peeringdb.rs | 434 ++++++++++++++++++++++++++++++++- 5 files changed, 984 insertions(+), 14 deletions(-) create mode 100644 src/asinfo/as2org.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index b6710db..b0602ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file. 
## Unreleased +### Crate Consolidation + +* **Migrated `as2org-rs` into bgpkit-commons**: The CAIDA AS-to-Organization mapping functionality previously provided by the external `as2org-rs` crate has been fully integrated into the `asinfo` module + - New `src/asinfo/as2org.rs` module provides `As2org` struct with `new()`, `get_as_info()`, `get_siblings()`, and `are_siblings()` methods + - Removed external `as2org-rs` dependency from Cargo.toml + - Single codebase simplifies maintenance and patch application + +* **Migrated `peeringdb-rs` into bgpkit-commons**: The PeeringDB API access functionality previously provided by the external `peeringdb-rs` crate has been fully integrated into the `asinfo` module + - Updated `src/asinfo/peeringdb.rs` with full PeeringDB API client implementation + - Includes `PeeringdbNet` struct and `load_peeringdb_net()` function for direct API access + - Removed external `peeringdb-rs` dependency from Cargo.toml + +* **Updated feature flags**: The `asinfo` feature now uses `regex` instead of external crate dependencies + - Before: `asinfo = ["as2org-rs", "peeringdb-rs", "oneio", "serde_json", "tracing", "chrono"]` + - After: `asinfo = ["oneio", "serde_json", "tracing", "chrono", "regex"]` + ### API Improvements * **AsInfoBuilder**: Added a new builder pattern for loading AS information with specific data sources @@ -32,6 +48,29 @@ commons.load_asinfo_with(builder)?; - Central access via `BgpkitCommons` instance - Direct module access (e.g., `bgpkit_commons::bogons::Bogons::new()`) +### Testing Improvements + +* **Comprehensive as2org module tests**: Added extensive unit tests for the migrated CAIDA AS-to-Organization functionality + - JSON deserialization tests for `As2orgJsonOrg` and `As2orgJsonAs` structures + - Tests for optional fields and default values + - `As2orgAsInfo` struct creation and serialization round-trip tests + - `fix_latin1_misinterpretation` function tests for edge cases + - Integration tests (ignored by default) for 
`As2org::new()`, `get_as_info()`, `get_siblings()`, and `are_siblings()` methods + +* **Comprehensive peeringdb module tests**: Added extensive unit tests for the migrated PeeringDB functionality + - `PeeringdbData` struct creation, serialization, and deserialization tests + - `PeeringdbNet` struct tests with all optional fields + - `PeeringdbNetResponse` API response deserialization tests + - `Peeringdb` struct tests for `get_data()`, `contains()`, `len()`, `is_empty()`, and `get_all_asns()` methods + - Empty database edge case tests + - Integration tests (ignored by default) for live API access + +* **New Peeringdb helper methods**: Added utility methods to the `Peeringdb` struct for better usability + - `len()`: Get the number of networks in the database + - `is_empty()`: Check if the database is empty + - `contains(asn)`: Check if an ASN exists in PeeringDB + - `get_all_asns()`: Get all ASNs in the database + ## v0.9.6 - 2025-10-29 ### Maintenance diff --git a/Cargo.toml b/Cargo.toml index 8588152..d40727f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,12 +18,10 @@ keywords = ["bgp", "bgpkit"] thiserror = "2.0" serde = { version = "1.0", features = ["derive"] } -as2org-rs = { version = "1.1.1", optional = true } chrono = { version = "0.4", features = ["serde"], optional = true } ipnet = { version = "2.9", features = ["serde"], optional = true } ipnet-trie = { version = "0.3.0", optional = true } oneio = { version = "0.20.0", optional = true, features = ["json"] } -peeringdb-rs = { version = "0.1.3", optional = true } regex = { version = "1", optional = true } serde_json = { version = "1", optional = true } tracing = { version = "0.1", optional = true } @@ -38,7 +36,7 @@ oneio = { version = "0.20.0", features = ["json"] } default = ["all"] # Module features -asinfo = ["as2org-rs", "peeringdb-rs", "oneio", "serde_json", "tracing", "chrono"] +asinfo = ["oneio", "serde_json", "tracing", "chrono", "regex"] as2rel = ["oneio", "serde_json", "tracing"] bogons = 
["oneio", "ipnet", "regex", "chrono"] countries = ["oneio"] diff --git a/src/asinfo/as2org.rs b/src/asinfo/as2org.rs new file mode 100644 index 0000000..ed6c0e3 --- /dev/null +++ b/src/asinfo/as2org.rs @@ -0,0 +1,516 @@ +//! AS-to-Organization mapping using CAIDA data +//! +//! This module provides access to CAIDA's AS-to-Organization dataset, allowing +//! lookups of AS information including organization details and sibling relationships. +//! +//! # Data source +//! - CAIDA AS Organizations Dataset: + +use crate::{BgpkitCommonsError, Result}; +use chrono::NaiveDate; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Organization JSON format from CAIDA dataset +#[derive(Debug, Clone, Serialize, Deserialize)] +struct As2orgJsonOrg { + #[serde(alias = "organizationId")] + org_id: String, + + changed: Option, + + #[serde(default)] + name: String, + + country: String, + + /// The RIR or NIR database that contained this entry + source: String, + + #[serde(alias = "type")] + data_type: String, +} + +/// AS JSON format from CAIDA dataset +#[derive(Debug, Clone, Serialize, Deserialize)] +struct As2orgJsonAs { + asn: String, + + changed: Option, + + #[serde(default)] + name: String, + + #[serde(alias = "opaqueId")] + opaque_id: Option, + + #[serde(alias = "organizationId")] + org_id: String, + + /// The RIR or NIR database that contained this entry + source: String, + + #[serde(rename = "type")] + data_type: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +enum As2orgJsonEntry { + Org(As2orgJsonOrg), + As(As2orgJsonAs), +} + +/// Public information for an Autonomous System (AS) enriched with its organization. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct As2orgAsInfo { + /// The AS number + pub asn: u32, + /// The name provided for the individual AS number + pub name: String, + /// The registration country code of the organization + pub country_code: String, + /// Organization identifier (as used in the dataset) + pub org_id: String, + /// Organization name + pub org_name: String, + /// The RIR database that contained this entry + pub source: String, +} + +/// In-memory accessor for CAIDA's AS-to-Organization dataset. +pub struct As2org { + as_map: HashMap, + org_map: HashMap, + as_to_org: HashMap, + org_to_as: HashMap>, +} + +const BASE_URL: &str = "https://publicdata.caida.org/datasets/as-organizations"; + +impl As2org { + /// Create a new `As2org` accessor. + /// + /// - When `data_file_path` is `None`, the constructor fetches the CAIDA + /// index page to discover the most recent `*.as-org2info.jsonl.gz` file + /// and reads it via HTTP(S). + /// - When `Some(path_or_url)` is provided, the path can be a local file or + /// a remote URL. Gzipped files are supported transparently. + pub fn new(data_file_path: Option) -> Result { + let entries = match data_file_path { + Some(path) => parse_as2org_file(path.as_str())?, + None => { + let url = get_most_recent_data()?; + parse_as2org_file(url.as_str())? 
+ } + }; + + let mut as_map: HashMap = HashMap::new(); + let mut org_map: HashMap = HashMap::new(); + + for entry in entries { + match entry { + As2orgJsonEntry::As(as_entry) => { + if let Ok(asn) = as_entry.asn.parse::() { + as_map.insert(asn, as_entry); + } + } + As2orgJsonEntry::Org(org_entry) => { + org_map.insert(org_entry.org_id.clone(), org_entry); + } + } + } + + let mut as_to_org: HashMap = HashMap::new(); + let mut org_to_as: HashMap> = HashMap::new(); + + for (asn, as_entry) in as_map.iter() { + as_to_org.insert(*asn, as_entry.org_id.clone()); + let org_asn = org_to_as.entry(as_entry.org_id.clone()).or_default(); + org_asn.push(*asn); + } + + Ok(Self { + as_map, + org_map, + as_to_org, + org_to_as, + }) + } + + /// List all available dataset files published by CAIDA with their dates. + pub fn get_all_files_with_dates() -> Result> { + get_all_files_with_dates() + } + + /// Returns the URL for the latest AS-to-Organization dataset file. + pub fn get_latest_file_url() -> String { + format!("{BASE_URL}/latest.as-org2info.jsonl.gz") + } + + /// Get enriched information for a specific ASN, if present. + pub fn get_as_info(&self, asn: u32) -> Option { + let as_entry = self.as_map.get(&asn)?; + let org_id = as_entry.org_id.as_str(); + let org_entry = self.org_map.get(org_id)?; + Some(As2orgAsInfo { + asn, + name: as_entry.name.clone(), + country_code: org_entry.country.clone(), + org_id: org_id.to_string(), + org_name: org_entry.name.clone(), + source: org_entry.source.clone(), + }) + } + + /// Return all ASNs that belong to the same organization as the given ASN. + pub fn get_siblings(&self, asn: u32) -> Option> { + let org_id = self.as_to_org.get(&asn)?; + let org_asns = self.org_to_as.get(org_id)?.to_vec(); + Some( + org_asns + .iter() + .filter_map(|asn| self.get_as_info(*asn)) + .collect(), + ) + } + + /// Return `true` if both ASNs belong to the same organization. 
+ pub fn are_siblings(&self, asn1: u32, asn2: u32) -> bool { + let org1 = match self.as_to_org.get(&asn1) { + None => return false, + Some(o) => o, + }; + let org2 = match self.as_to_org.get(&asn2) { + None => return false, + Some(o) => o, + }; + org1 == org2 + } +} + +/// Fixes misinterpretation of strings encoded in Latin-1 that were mistakenly decoded as UTF-8. +fn fix_latin1_misinterpretation(input: &str) -> String { + let mut result = String::new(); + let mut chars = input.chars().peekable(); + + while let Some(c) = chars.next() { + // Check for the pattern of misinterpreted Latin-1 chars + if c == 'Ã' && chars.peek().is_some() { + let next_char = chars.next().unwrap(); + + // Calculate the original Latin-1 character + let byte_value = match next_char { + '\u{0080}'..='\u{00BF}' => 0xC0 + (next_char as u32 - 0x0080), + // Handle other ranges as needed + _ => { + // If it doesn't match the pattern, treat as normal chars + result.push(c); + result.push(next_char); + continue; + } + }; + + // Convert to the correct character + if let Some(correct_char) = char::from_u32(byte_value) { + result.push(correct_char); + } else { + // Fallback for invalid characters + result.push(c); + result.push(next_char); + } + } else { + result.push(c); + } + } + + result +} + +/// Parse remote AS2Org file into Vec of DataEntry +fn parse_as2org_file(path: &str) -> Result> { + let mut res: Vec = vec![]; + + for line in oneio::read_lines(path)? 
{ + let line = fix_latin1_misinterpretation(&line?); + if line.contains(r#""type":"ASN""#) { + let data = serde_json::from_str::(line.as_str()); + match data { + Ok(data) => { + res.push(As2orgJsonEntry::As(data)); + } + Err(e) => { + return Err(BgpkitCommonsError::data_source_error( + crate::errors::data_sources::CAIDA, + format!("error parsing AS line: {}", e), + )); + } + } + } else { + let data = serde_json::from_str::(line.as_str()); + match data { + Ok(data) => { + res.push(As2orgJsonEntry::Org(data)); + } + Err(e) => { + return Err(BgpkitCommonsError::data_source_error( + crate::errors::data_sources::CAIDA, + format!("error parsing Org line: {}", e), + )); + } + } + } + } + Ok(res) +} + +/// Returns a vector of tuples containing the full URLs of AS2Org data files and their corresponding dates. +fn get_all_files_with_dates() -> Result> { + let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*").map_err(|e| { + BgpkitCommonsError::Internal(format!("failed to compile regex: {}", e)) + })?; + let content = oneio::read_to_string(BASE_URL)?; + let mut res: Vec<(String, NaiveDate)> = data_link + .captures_iter(content.as_str()) + .filter_map(|cap| { + let file = cap[1].to_owned(); + let date = NaiveDate::parse_from_str(&file[..8], "%Y%m%d").ok()?; + Some((format!("{BASE_URL}/{file}"), date)) + }) + .collect(); + res.sort_by_key(|(_, date)| *date); + Ok(res) +} + +fn get_most_recent_data() -> Result { + let files = get_all_files_with_dates()?; + let last_file = files + .last() + .ok_or_else(|| BgpkitCommonsError::data_source_error( + crate::errors::data_sources::CAIDA, + "No dataset files found", + ))?; + Ok(last_file.0.clone()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fix_latin1_misinterpretation() { + // Test that the function handles normal strings correctly + let normal = "Hello World"; + assert_eq!(fix_latin1_misinterpretation(normal), normal); + + // Test empty string + assert_eq!(fix_latin1_misinterpretation(""), 
""); + + // Test string without misinterpreted characters + let ascii_only = "ACME Corporation Inc."; + assert_eq!(fix_latin1_misinterpretation(ascii_only), ascii_only); + + // Test string with special characters that shouldn't be modified + let special = "Test @#$%^&*() 123"; + assert_eq!(fix_latin1_misinterpretation(special), special); + } + + #[test] + fn test_as2org_json_org_deserialization() { + let json = r#"{"organizationId":"ORG-TEST","changed":"20240101","name":"Test Org","country":"US","source":"ARIN","type":"Organization"}"#; + let org: As2orgJsonOrg = serde_json::from_str(json).unwrap(); + assert_eq!(org.org_id, "ORG-TEST"); + assert_eq!(org.name, "Test Org"); + assert_eq!(org.country, "US"); + assert_eq!(org.source, "ARIN"); + assert_eq!(org.data_type, "Organization"); + } + + #[test] + fn test_as2org_json_org_with_missing_optional_fields() { + let json = r#"{"organizationId":"ORG-TEST2","name":"Another Org","country":"DE","source":"RIPE","type":"Organization"}"#; + let org: As2orgJsonOrg = serde_json::from_str(json).unwrap(); + assert_eq!(org.org_id, "ORG-TEST2"); + assert!(org.changed.is_none()); + } + + #[test] + fn test_as2org_json_as_deserialization() { + let json = r#"{"asn":"12345","changed":"20240101","name":"Test AS","opaqueId":"opaque123","organizationId":"ORG-TEST","source":"ARIN","type":"ASN"}"#; + let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap(); + assert_eq!(as_entry.asn, "12345"); + assert_eq!(as_entry.name, "Test AS"); + assert_eq!(as_entry.org_id, "ORG-TEST"); + assert_eq!(as_entry.opaque_id, Some("opaque123".to_string())); + assert_eq!(as_entry.source, "ARIN"); + assert_eq!(as_entry.data_type, "ASN"); + } + + #[test] + fn test_as2org_json_as_with_missing_optional_fields() { + let json = r#"{"asn":"67890","name":"Minimal AS","organizationId":"ORG-MIN","source":"APNIC","type":"ASN"}"#; + let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap(); + assert_eq!(as_entry.asn, "67890"); + 
assert!(as_entry.changed.is_none()); + assert!(as_entry.opaque_id.is_none()); + } + + #[test] + fn test_as2org_json_as_with_empty_name() { + // Test the #[serde(default)] attribute for name field + let json = r#"{"asn":"11111","organizationId":"ORG-EMPTY","source":"RIPE","type":"ASN"}"#; + let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap(); + assert_eq!(as_entry.name, ""); // default empty string + } + + #[test] + fn test_as2org_as_info_struct() { + let info = As2orgAsInfo { + asn: 12345, + name: "Test AS".to_string(), + country_code: "US".to_string(), + org_id: "ORG-TEST".to_string(), + org_name: "Test Organization".to_string(), + source: "ARIN".to_string(), + }; + assert_eq!(info.asn, 12345); + assert_eq!(info.name, "Test AS"); + assert_eq!(info.country_code, "US"); + assert_eq!(info.org_id, "ORG-TEST"); + assert_eq!(info.org_name, "Test Organization"); + assert_eq!(info.source, "ARIN"); + } + + #[test] + fn test_as2org_as_info_serialization() { + let info = As2orgAsInfo { + asn: 12345, + name: "Test AS".to_string(), + country_code: "US".to_string(), + org_id: "ORG-TEST".to_string(), + org_name: "Test Organization".to_string(), + source: "ARIN".to_string(), + }; + let json = serde_json::to_string(&info).unwrap(); + assert!(json.contains("\"asn\":12345")); + assert!(json.contains("\"name\":\"Test AS\"")); + + // Test round-trip + let deserialized: As2orgAsInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.asn, info.asn); + assert_eq!(deserialized.name, info.name); + } + + #[test] + fn test_get_latest_file_url() { + let url = As2org::get_latest_file_url(); + assert!(url.starts_with("https://publicdata.caida.org/datasets/as-organizations/")); + assert!(url.ends_with(".as-org2info.jsonl.gz")); + } + + // Integration tests that require network access - marked as ignored by default + #[test] + #[ignore] + fn test_as2org_new_from_latest() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Verify the 
database was loaded by checking if we have some data + assert!(!as2org.as_map.is_empty()); + assert!(!as2org.org_map.is_empty()); + } + + #[test] + #[ignore] + fn test_as2org_get_as_info_existing() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Test with a well-known ASN (Google) + let info = as2org.get_as_info(15169); + assert!(info.is_some()); + let info = info.unwrap(); + assert_eq!(info.asn, 15169); + assert!(!info.org_id.is_empty()); + assert!(!info.org_name.is_empty()); + assert!(!info.country_code.is_empty()); + assert!(!info.source.is_empty()); + } + + #[test] + #[ignore] + fn test_as2org_get_as_info_nonexistent() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Test with a likely non-existent ASN + let info = as2org.get_as_info(999999999); + assert!(info.is_none()); + } + + #[test] + #[ignore] + fn test_as2org_get_siblings() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Test with Google's AS15169 + let siblings = as2org.get_siblings(15169); + assert!(siblings.is_some()); + let siblings = siblings.unwrap(); + // Google should have at least a few sibling ASNs + assert!(!siblings.is_empty()); + // The queried ASN should be included in siblings + assert!(siblings.iter().any(|s| s.asn == 15169)); + } + + #[test] + #[ignore] + fn test_as2org_get_siblings_nonexistent() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + let siblings = as2org.get_siblings(999999999); + assert!(siblings.is_none()); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_true() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Google ASNs 15169 and 36040 are known siblings + assert!(as2org.are_siblings(15169, 36040)); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_false() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Google (15169) and Cloudflare (13335) are not siblings + 
assert!(!as2org.are_siblings(15169, 13335)); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_nonexistent() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Non-existent ASN should return false + assert!(!as2org.are_siblings(15169, 999999999)); + assert!(!as2org.are_siblings(999999999, 15169)); + assert!(!as2org.are_siblings(999999999, 999999998)); + } + + #[test] + #[ignore] + fn test_as2org_are_siblings_same_asn() { + let as2org = As2org::new(None).expect("Failed to load AS2org database"); + // Same ASN should be its own sibling + assert!(as2org.are_siblings(15169, 15169)); + } + + #[test] + #[ignore] + fn test_as2org_get_all_files_with_dates() { + let files = As2org::get_all_files_with_dates().expect("Failed to get file list"); + assert!(!files.is_empty()); + // Files should be sorted by date (ascending) + for i in 1..files.len() { + assert!(files[i].1 >= files[i - 1].1); + } + // Each URL should point to CAIDA + for (url, _) in &files { + assert!(url.contains("publicdata.caida.org")); + assert!(url.ends_with(".as-org2info.jsonl.gz")); + } + } +} diff --git a/src/asinfo/mod.rs b/src/asinfo/mod.rs index afe3af8..e46a199 100644 --- a/src/asinfo/mod.rs +++ b/src/asinfo/mod.rs @@ -111,6 +111,7 @@ //! let are_siblings = bgpkit.asinfo_are_siblings(3333, 3334).unwrap(); //! ``` +mod as2org; mod hegemony; mod peeringdb; mod population; @@ -362,9 +363,7 @@ pub fn get_asinfo_map( let as2org_utils = if load_as2org { info!("loading as2org data from CAIDA..."); - Some(as2org_rs::As2org::new(None).map_err(|e| { - BgpkitCommonsError::data_source_error(data_sources::CAIDA, e.to_string()) - })?) + Some(as2org::As2org::new(None)?) } else { None }; diff --git a/src/asinfo/peeringdb.rs b/src/asinfo/peeringdb.rs index 7736844..75c77d5 100644 --- a/src/asinfo/peeringdb.rs +++ b/src/asinfo/peeringdb.rs @@ -1,8 +1,25 @@ +//! PeeringDB data module +//! +//! This module provides access to PeeringDB data via their public API. +//! +//! 
# Data source +//! - PeeringDB API: +//! +//! # PeeringDB API key required +//! +//! It is strongly recommended to obtain a [PeeringDB API key](https://docs.peeringdb.com/blog/api_keys/) +//! and set the `PEERINGDB_API_KEY` environment variable. +//! Without it, the API call will likely fail due to rate limiting. + use crate::{BgpkitCommonsError, Result}; -use peeringdb_rs::{PeeringdbNet, load_peeringdb_net}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::io::Read; +use tracing::warn; + +const PEERINGDB_NET_API_URL: &str = "https://www.peeringdb.com/api/net"; +/// PeeringDB network data #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PeeringdbData { pub asn: u32, @@ -13,20 +30,114 @@ pub struct PeeringdbData { pub website: Option, } +/// Full PeeringDB network response from API +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeeringdbNet { + pub id: u32, + pub name: Option, + pub name_long: Option, + pub aka: Option, + pub asn: Option, + pub org_id: Option, + pub irr_as_set: Option, + pub website: Option, + pub notes: Option, + pub fac_count: Option, + pub ix_count: Option, + + pub policy_contracts: Option, + pub policy_general: Option, + pub policy_locations: Option, + pub policy_ratio: Option, + pub policy_url: Option, + + pub info_ipv6: Option, + pub info_multicast: Option, + pub info_never_via_route_servers: Option, + pub info_prefixes4: Option, + pub info_prefixes6: Option, + pub info_ratio: Option, + pub info_scope: Option, + pub info_traffic: Option, + pub info_type: Option, + pub info_types: Option>, + pub info_unicast: Option, + + pub rir_status: Option, + pub status: Option, + pub status_dashboard: Option, + pub created: Option, + pub updated: Option, + pub route_server: Option, + pub looking_glass: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct PeeringdbNetResponse { + data: Vec, +} + +/// Get a reader for PeeringDB API with proper authentication headers +fn get_peeringdb_reader(url: 
&str) -> Result> { + // Try to load API key from environment + let api_key = std::env::var("PEERINGDB_API_KEY").unwrap_or_else(|_| { + warn!("missing PEERINGDB_API_KEY env var, call may fail due to rate limiting"); + "".to_string() + }); + + let client = oneio::remote::create_client_with_headers([ + ("Authorization".to_string(), format!("Api-Key {}", api_key)), + ( + "User-Agent".to_string(), + format!("bgpkit-commons/{}", env!("CARGO_PKG_VERSION")), + ), + ])?; + + let res = client + .execute(client.get(url).build().map_err(|e| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::PEERINGDB, + format!("failed to build request: {}", e), + ) + })?) + .map_err(|e| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::PEERINGDB, + format!("request failed: {}", e), + ) + })? + .error_for_status() + .map_err(|e| { + BgpkitCommonsError::data_source_error( + crate::errors::data_sources::PEERINGDB, + format!("API returned error status: {}", e), + ) + })?; + + Ok(Box::new(res)) +} + +/// Load PeeringDB network data from the API +pub fn load_peeringdb_net() -> Result> { + let mut reader = get_peeringdb_reader(PEERINGDB_NET_API_URL)?; + let mut buf = String::new(); + reader.read_to_string(&mut buf)?; + let res: PeeringdbNetResponse = serde_json::from_str(&buf)?; + Ok(res.data) +} + +/// PeeringDB data accessor with cached network information #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Peeringdb { peeringdb_map: HashMap, } impl Peeringdb { + /// Create a new Peeringdb accessor by loading data from the API pub fn new() -> Result { let mut peeringdb_map = HashMap::new(); - let net_vec: Vec = load_peeringdb_net().map_err(|e| { - BgpkitCommonsError::data_source_error( - crate::errors::data_sources::PEERINGDB, - e.to_string(), - ) - })?; + let net_vec = load_peeringdb_net()?; + for net in net_vec { if let Some(asn) = net.asn { peeringdb_map.entry(asn).or_insert(PeeringdbData { @@ -37,13 +148,320 @@ impl Peeringdb { irr_as_set: 
net.irr_as_set, website: net.website, }); - }; + } } Ok(Self { peeringdb_map }) } + /// Get PeeringDB data for a specific ASN pub fn get_data(&self, asn: u32) -> Option<&PeeringdbData> { self.peeringdb_map.get(&asn) } + + /// Get all ASNs in the PeeringDB data + pub fn get_all_asns(&self) -> Vec { + self.peeringdb_map.keys().copied().collect() + } + + /// Check if an ASN exists in PeeringDB + pub fn contains(&self, asn: u32) -> bool { + self.peeringdb_map.contains_key(&asn) + } + + /// Get the number of networks in the database + pub fn len(&self) -> usize { + self.peeringdb_map.len() + } + + /// Check if the database is empty + pub fn is_empty(&self) -> bool { + self.peeringdb_map.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_peeringdb_data_struct() { + let data = PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: Some("Cloudflare, Inc.".to_string()), + aka: Some("CF".to_string()), + irr_as_set: Some("AS-CLOUDFLARE".to_string()), + website: Some("https://cloudflare.com".to_string()), + }; + assert_eq!(data.asn, 13335); + assert_eq!(data.name, Some("Cloudflare".to_string())); + assert_eq!(data.name_long, Some("Cloudflare, Inc.".to_string())); + assert_eq!(data.aka, Some("CF".to_string())); + assert_eq!(data.irr_as_set, Some("AS-CLOUDFLARE".to_string())); + assert_eq!(data.website, Some("https://cloudflare.com".to_string())); + } + + #[test] + fn test_peeringdb_data_with_none_fields() { + let data = PeeringdbData { + asn: 12345, + name: None, + name_long: None, + aka: None, + irr_as_set: None, + website: None, + }; + assert_eq!(data.asn, 12345); + assert!(data.name.is_none()); + assert!(data.name_long.is_none()); + assert!(data.aka.is_none()); + assert!(data.irr_as_set.is_none()); + assert!(data.website.is_none()); + } + + #[test] + fn test_peeringdb_data_serialization() { + let data = PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: Some("Cloudflare, 
Inc.".to_string()), + aka: None, + irr_as_set: Some("AS-CLOUDFLARE".to_string()), + website: Some("https://cloudflare.com".to_string()), + }; + let json = serde_json::to_string(&data).unwrap(); + assert!(json.contains("\"asn\":13335")); + assert!(json.contains("\"name\":\"Cloudflare\"")); + + // Test round-trip + let deserialized: PeeringdbData = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.asn, data.asn); + assert_eq!(deserialized.name, data.name); + } + + #[test] + fn test_peeringdb_data_deserialization() { + let json = r#"{"asn":13335,"name":"Cloudflare","name_long":"Cloudflare, Inc.","aka":null,"irr_as_set":"AS-CLOUDFLARE","website":"https://cloudflare.com"}"#; + let data: PeeringdbData = serde_json::from_str(json).unwrap(); + assert_eq!(data.asn, 13335); + assert_eq!(data.name, Some("Cloudflare".to_string())); + assert_eq!(data.name_long, Some("Cloudflare, Inc.".to_string())); + assert!(data.aka.is_none()); + assert_eq!(data.irr_as_set, Some("AS-CLOUDFLARE".to_string())); + } + + #[test] + fn test_peeringdb_net_struct() { + let net = PeeringdbNet { + id: 1, + name: Some("Test Network".to_string()), + name_long: Some("Test Network Inc.".to_string()), + aka: None, + asn: Some(12345), + org_id: Some(100), + irr_as_set: Some("AS-TEST".to_string()), + website: Some("https://test.com".to_string()), + notes: None, + fac_count: Some(5), + ix_count: Some(3), + policy_contracts: None, + policy_general: Some("Open".to_string()), + policy_locations: None, + policy_ratio: None, + policy_url: None, + info_ipv6: Some(true), + info_multicast: Some(false), + info_never_via_route_servers: Some(false), + info_prefixes4: Some(100), + info_prefixes6: Some(50), + info_ratio: None, + info_scope: Some("Global".to_string()), + info_traffic: None, + info_type: Some("NSP".to_string()), + info_types: None, + info_unicast: Some(true), + rir_status: None, + status: Some("ok".to_string()), + status_dashboard: None, + created: Some("2020-01-01".to_string()), + updated: 
Some("2024-01-01".to_string()), + route_server: None, + looking_glass: None, + }; + assert_eq!(net.id, 1); + assert_eq!(net.asn, Some(12345)); + assert_eq!(net.name, Some("Test Network".to_string())); + assert_eq!(net.info_prefixes4, Some(100)); + } + + #[test] + fn test_peeringdb_net_deserialization() { + let json = r#"{"id":1,"name":"Test","name_long":null,"aka":null,"asn":12345,"org_id":100,"irr_as_set":null,"website":null,"notes":null,"fac_count":null,"ix_count":null,"policy_contracts":null,"policy_general":null,"policy_locations":null,"policy_ratio":null,"policy_url":null,"info_ipv6":null,"info_multicast":null,"info_never_via_route_servers":null,"info_prefixes4":null,"info_prefixes6":null,"info_ratio":null,"info_scope":null,"info_traffic":null,"info_type":null,"info_types":null,"info_unicast":null,"rir_status":null,"status":"ok","status_dashboard":null,"created":null,"updated":null,"route_server":null,"looking_glass":null}"#; + let net: PeeringdbNet = serde_json::from_str(json).unwrap(); + assert_eq!(net.id, 1); + assert_eq!(net.asn, Some(12345)); + assert_eq!(net.name, Some("Test".to_string())); + } + + #[test] + fn test_peeringdb_net_response_deserialization() { + let json = 
r#"{"data":[{"id":1,"name":"Test1","name_long":null,"aka":null,"asn":11111,"org_id":100,"irr_as_set":null,"website":null,"notes":null,"fac_count":null,"ix_count":null,"policy_contracts":null,"policy_general":null,"policy_locations":null,"policy_ratio":null,"policy_url":null,"info_ipv6":null,"info_multicast":null,"info_never_via_route_servers":null,"info_prefixes4":null,"info_prefixes6":null,"info_ratio":null,"info_scope":null,"info_traffic":null,"info_type":null,"info_types":null,"info_unicast":null,"rir_status":null,"status":"ok","status_dashboard":null,"created":null,"updated":null,"route_server":null,"looking_glass":null},{"id":2,"name":"Test2","name_long":null,"aka":null,"asn":22222,"org_id":200,"irr_as_set":null,"website":null,"notes":null,"fac_count":null,"ix_count":null,"policy_contracts":null,"policy_general":null,"policy_locations":null,"policy_ratio":null,"policy_url":null,"info_ipv6":null,"info_multicast":null,"info_never_via_route_servers":null,"info_prefixes4":null,"info_prefixes6":null,"info_ratio":null,"info_scope":null,"info_traffic":null,"info_type":null,"info_types":null,"info_unicast":null,"rir_status":null,"status":"ok","status_dashboard":null,"created":null,"updated":null,"route_server":null,"looking_glass":null}]}"#; + let response: PeeringdbNetResponse = serde_json::from_str(json).unwrap(); + assert_eq!(response.data.len(), 2); + assert_eq!(response.data[0].asn, Some(11111)); + assert_eq!(response.data[1].asn, Some(22222)); + } + + #[test] + fn test_peeringdb_struct_from_hashmap() { + let mut peeringdb_map = HashMap::new(); + peeringdb_map.insert( + 13335, + PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: Some("Cloudflare, Inc.".to_string()), + aka: None, + irr_as_set: Some("AS-CLOUDFLARE".to_string()), + website: Some("https://cloudflare.com".to_string()), + }, + ); + peeringdb_map.insert( + 15169, + PeeringdbData { + asn: 15169, + name: Some("Google".to_string()), + name_long: Some("Google 
LLC".to_string()), + aka: None, + irr_as_set: Some("AS-GOOGLE".to_string()), + website: Some("https://google.com".to_string()), + }, + ); + + let peeringdb = Peeringdb { peeringdb_map }; + + // Test get_data + let cf_data = peeringdb.get_data(13335); + assert!(cf_data.is_some()); + assert_eq!(cf_data.unwrap().name, Some("Cloudflare".to_string())); + + let google_data = peeringdb.get_data(15169); + assert!(google_data.is_some()); + assert_eq!(google_data.unwrap().name, Some("Google".to_string())); + + // Test non-existent ASN + let nonexistent = peeringdb.get_data(999999); + assert!(nonexistent.is_none()); + + // Test contains + assert!(peeringdb.contains(13335)); + assert!(peeringdb.contains(15169)); + assert!(!peeringdb.contains(999999)); + + // Test len + assert_eq!(peeringdb.len(), 2); + + // Test is_empty + assert!(!peeringdb.is_empty()); + + // Test get_all_asns + let asns = peeringdb.get_all_asns(); + assert_eq!(asns.len(), 2); + assert!(asns.contains(&13335)); + assert!(asns.contains(&15169)); + } + + #[test] + fn test_peeringdb_empty() { + let peeringdb = Peeringdb { + peeringdb_map: HashMap::new(), + }; + assert!(peeringdb.is_empty()); + assert_eq!(peeringdb.len(), 0); + assert!(peeringdb.get_all_asns().is_empty()); + assert!(peeringdb.get_data(12345).is_none()); + assert!(!peeringdb.contains(12345)); + } + + #[test] + fn test_peeringdb_serialization() { + let mut peeringdb_map = HashMap::new(); + peeringdb_map.insert( + 13335, + PeeringdbData { + asn: 13335, + name: Some("Cloudflare".to_string()), + name_long: None, + aka: None, + irr_as_set: None, + website: None, + }, + ); + + let peeringdb = Peeringdb { peeringdb_map }; + let json = serde_json::to_string(&peeringdb).unwrap(); + assert!(json.contains("13335")); + assert!(json.contains("Cloudflare")); + + // Test round-trip + let deserialized: Peeringdb = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.len(), 1); + assert!(deserialized.contains(13335)); + } + + // Integration tests that 
require network access - marked as ignored by default + #[test] + #[ignore] + fn test_load_peeringdb_net() { + // This test requires PEERINGDB_API_KEY to be set + let result = load_peeringdb_net(); + assert!(result.is_ok()); + let nets = result.unwrap(); + assert!(!nets.is_empty()); + // Check that we got some networks with ASNs + let with_asn: Vec<_> = nets.iter().filter(|n| n.asn.is_some()).collect(); + assert!(!with_asn.is_empty()); + } + + #[test] + #[ignore] + fn test_peeringdb_new() { + // This test requires PEERINGDB_API_KEY to be set + let result = Peeringdb::new(); + assert!(result.is_ok()); + let peeringdb = result.unwrap(); + assert!(!peeringdb.is_empty()); + + // Test that we can look up well-known networks + // Cloudflare (AS13335) should be in PeeringDB + let cf = peeringdb.get_data(13335); + assert!(cf.is_some()); + } + + #[test] + #[ignore] + fn test_peeringdb_get_data_existing() { + // This test requires PEERINGDB_API_KEY to be set + let peeringdb = Peeringdb::new().expect("Failed to load PeeringDB"); + + // Test with Cloudflare + let data = peeringdb.get_data(13335); + assert!(data.is_some()); + let data = data.unwrap(); + assert_eq!(data.asn, 13335); + // Cloudflare should have a name + assert!(data.name.is_some()); + } + + #[test] + #[ignore] + fn test_peeringdb_get_data_nonexistent() { + // This test requires PEERINGDB_API_KEY to be set + let peeringdb = Peeringdb::new().expect("Failed to load PeeringDB"); + + // Test with a non-existent ASN + let data = peeringdb.get_data(999999999); + assert!(data.is_none()); + } } From aa5c9e7de58900ea235c48824fbadfd848e378f2 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 18:31:02 -0800 Subject: [PATCH 3/7] feat(rpki): Add RPKIviews historical data support with streaming optimization This commit adds RPKIviews as an additional historical RPKI data source alongside RIPE NCC archives, with efficient streaming extraction that avoids downloading entire 300+ MB archives. 
- New `RpkiViewsCollector` enum with four collectors: - Josephine - A2B Internet, Amsterdam - Amber - Massar, Lugano - Dango - IIJ, Tokyo - Kerfuffle (default) - Kerfuffle LLC, Fremont - `RpkiTrie::from_rpkiviews(collector, date)` for loading historical data - `list_rpkiviews_files(collector, date)` for discovering available archives - `HistoricalRpkiSource` enum for explicitly selecting data source - `rpki-client.json` is located at position 3-4 in RPKIviews archives - Early termination after ~80MB instead of downloading full 300+ MB - New helper functions: - `extract_file_from_tgz(url, target_path)` - Stream and extract specific file - `list_files_in_tgz(url, max_entries)` - List archive contents with limit - `tgz_contains_file(url, target_path)` - Check file existence - Test completion time reduced from minutes to ~8 seconds - New internal `rpki_client.rs` module with `RpkiClientData` struct - Handles format variations across sources: - ASN formats: numeric (12345) vs string ("AS12345") - ASPA field names: `customer_asid` vs `customer` - Provider arrays: numbers vs strings - Used by Cloudflare, RIPE historical, and RPKIviews sources - New stable `Roa` struct: prefix, asn, max_length, not_before, not_after - New stable `Aspa` struct: customer_asn, providers - Internal rpki-client structs now `pub(crate)` only - New `BgpkitCommons` methods: - `load_rpki_historical(date, source)` - `list_rpki_files(date, source)` - `load_rpki_from_files(urls, source, date)` - Changed from CSV to `output.json.xz` format for consistency - Provides richer data including expiry timestamps - Added `reqwest` (blocking) for HTTP streaming - Added `tar` crate for reading tar archives - Enabled `xz` feature in `oneio` for .xz decompression - New `examples/rpki_historical.rs` demonstrating historical data loading - Updated `examples/list_aspas.rs` to count ASPA objects for 2023-2025 - Fixed clippy warnings (derivable_impls, dead_code) - Added #[allow(dead_code)] for public API functions used by 
consumers Files changed: - Added: src/rpki/rpki_client.rs (371 lines) - Added: src/rpki/rpkiviews.rs (667 lines) - Added: examples/rpki_historical.rs (64 lines) - Modified: src/rpki/mod.rs, src/rpki/cloudflare.rs, src/rpki/ripe_historical.rs - Modified: src/lib.rs, Cargo.toml, CHANGELOG.md - Modified: examples/list_aspas.rs - Modified: src/asinfo/as2org.rs, src/asinfo/peeringdb.rs (clippy fixes) --- Cargo.toml | 12 +- examples/list_aspas.rs | 43 ++- examples/rpki_historical.rs | 64 ++++ src/asinfo/as2org.rs | 18 +- src/asinfo/peeringdb.rs | 8 +- src/lib.rs | 161 +++++++-- src/rpki/cloudflare.rs | 213 +----------- src/rpki/mod.rs | 657 ++++++++++++++++++++++------------- src/rpki/ripe_historical.rs | 126 ++++--- src/rpki/rpki_client.rs | 371 ++++++++++++++++++++ src/rpki/rpkiviews.rs | 667 ++++++++++++++++++++++++++++++++++++ 11 files changed, 1812 insertions(+), 528 deletions(-) create mode 100644 examples/rpki_historical.rs create mode 100644 src/rpki/rpki_client.rs create mode 100644 src/rpki/rpkiviews.rs diff --git a/Cargo.toml b/Cargo.toml index d40727f..c5f8d3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,15 +21,17 @@ serde = { version = "1.0", features = ["derive"] } chrono = { version = "0.4", features = ["serde"], optional = true } ipnet = { version = "2.9", features = ["serde"], optional = true } ipnet-trie = { version = "0.3.0", optional = true } -oneio = { version = "0.20.0", optional = true, features = ["json"] } +oneio = { version = "0.20.0", optional = true, features = ["json", "xz"] } regex = { version = "1", optional = true } serde_json = { version = "1", optional = true } tracing = { version = "0.1", optional = true } +reqwest = { version = "0.12", optional = true, features = ["blocking"] } +tar = { version = "0.4", optional = true } [dev-dependencies] tracing-subscriber = "0.3" serde_json = "1" -oneio = { version = "0.20.0", features = ["json"] } +oneio = { version = "0.20.0", features = ["json", "xz"] } [features] @@ -41,7 +43,7 @@ as2rel = 
["oneio", "serde_json", "tracing"] bogons = ["oneio", "ipnet", "regex", "chrono"] countries = ["oneio"] mrt_collectors = ["oneio", "chrono"] -rpki = ["oneio", "ipnet", "ipnet-trie", "chrono", "tracing"] +rpki = ["oneio", "ipnet", "ipnet-trie", "chrono", "tracing", "reqwest", "tar"] # Convenience feature to enable all modules all = ["asinfo", "as2rel", "bogons", "countries", "mrt_collectors", "rpki"] @@ -59,6 +61,10 @@ required-features = ["mrt_collectors"] name = "list_aspas" required-features = ["rpki"] +[[example]] +name = "rpki_historical" +required-features = ["rpki"] + [lints.clippy] uninlined_format_args = "allow" collapsible_if = "allow" diff --git a/examples/list_aspas.rs b/examples/list_aspas.rs index 47b4f9c..854c54a 100644 --- a/examples/list_aspas.rs +++ b/examples/list_aspas.rs @@ -1,9 +1,40 @@ -use serde_json::json; +use bgpkit_commons::rpki::{RpkiTrie, RpkiViewsCollector}; +use chrono::NaiveDate; fn main() { - let cf_data = bgpkit_commons::rpki::CfData::new().unwrap(); - println!( - "{}", - serde_json::to_string_pretty(&json!(cf_data.aspas)).unwrap() - ); + println!("Counting ASPA objects on the first day of each year (2020-2025)"); + println!("{}", "=".repeat(60)); + + for year in 2023..=2025 { + let date = NaiveDate::from_ymd_opt(year, 1, 1).unwrap(); + + // // Try RIPE historical first + // match RpkiTrie::from_ripe_historical(date) { + // Ok(trie) => { + // println!("{}-01-01: {} ASPAs (from RIPE)", year, trie.aspas.len()); + // continue; + // } + // Err(_) => { + // // RIPE failed, try RPKIviews + // } + // } + + // Fallback to RPKIviews + match RpkiTrie::from_rpkiviews(RpkiViewsCollector::Dango, date) { + Ok(trie) => { + println!( + "{}-01-01: {} ASPAs (from RPKIviews)", + year, + trie.aspas.len() + ); + } + Err(e) => { + eprintln!("{e}"); + println!("{}-01-01: No data available", year); + } + } + } + + println!("{}", "=".repeat(60)); + println!("Done!"); } diff --git a/examples/rpki_historical.rs b/examples/rpki_historical.rs new file mode 
100644 index 0000000..a2bc7bd --- /dev/null +++ b/examples/rpki_historical.rs @@ -0,0 +1,64 @@ +//! Example demonstrating historical RPKI data loading from RIPE NCC and RPKIviews +//! +//! Run with: cargo run --example rpki_historical --features rpki + +use bgpkit_commons::BgpkitCommons; +use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; +use chrono::NaiveDate; + +fn main() { + // Initialize tracing for debug output + tracing_subscriber::fmt::init(); + + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let commons = BgpkitCommons::new(); + + println!("=== Listing available RPKI files for {} ===\n", date); + + // List files from RIPE NCC (one file per RIR) + println!("RIPE NCC files:"); + match commons.list_rpki_files(date, HistoricalRpkiSource::Ripe) { + Ok(files) => { + for file in &files { + println!( + " - {} (RIR: {})", + file.url, + file.rir + .map(|r| r.to_string()) + .unwrap_or_else(|| "N/A".to_string()) + ); + } + } + Err(e) => println!(" Error listing RIPE files: {}", e), + } + println!(); + + // List files from RPKIviews (multiple snapshots per day) + println!("RPKIviews files (Kerfuffle collector):"); + let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::Kerfuffle); + match commons.list_rpki_files(date, source) { + Ok(files) => { + println!(" Found {} files for {}", files.len(), date); + // Show first 5 files + for file in files.iter().take(5) { + println!( + " - {} ({} bytes, timestamp: {})", + file.url, + file.size.unwrap_or(0), + file.timestamp + ); + } + if files.len() > 5 { + println!(" ... 
and {} more files", files.len() - 5); + } + } + Err(e) => println!(" Error listing RPKIviews files: {}", e), + } + println!(); + + // Show available collectors + println!("Available RPKIviews collectors:"); + for collector in RpkiViewsCollector::all() { + println!(" - {} ({})", collector, collector.base_url()); + } +} diff --git a/src/asinfo/as2org.rs b/src/asinfo/as2org.rs index ed6c0e3..195933b 100644 --- a/src/asinfo/as2org.rs +++ b/src/asinfo/as2org.rs @@ -79,6 +79,7 @@ pub struct As2orgAsInfo { } /// In-memory accessor for CAIDA's AS-to-Organization dataset. +#[allow(dead_code)] pub struct As2org { as_map: HashMap, org_map: HashMap, @@ -139,11 +140,13 @@ impl As2org { } /// List all available dataset files published by CAIDA with their dates. + #[allow(dead_code)] pub fn get_all_files_with_dates() -> Result> { get_all_files_with_dates() } /// Returns the URL for the latest AS-to-Organization dataset file. + #[allow(dead_code)] pub fn get_latest_file_url() -> String { format!("{BASE_URL}/latest.as-org2info.jsonl.gz") } @@ -164,6 +167,7 @@ impl As2org { } /// Return all ASNs that belong to the same organization as the given ASN. + #[allow(dead_code)] pub fn get_siblings(&self, asn: u32) -> Option> { let org_id = self.as_to_org.get(&asn)?; let org_asns = self.org_to_as.get(org_id)?.to_vec(); @@ -176,6 +180,7 @@ impl As2org { } /// Return `true` if both ASNs belong to the same organization. + #[allow(dead_code)] pub fn are_siblings(&self, asn1: u32, asn2: u32) -> bool { let org1 = match self.as_to_org.get(&asn1) { None => return false, @@ -266,9 +271,8 @@ fn parse_as2org_file(path: &str) -> Result> { /// Returns a vector of tuples containing the full URLs of AS2Org data files and their corresponding dates. 
fn get_all_files_with_dates() -> Result> { - let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*").map_err(|e| { - BgpkitCommonsError::Internal(format!("failed to compile regex: {}", e)) - })?; + let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*") + .map_err(|e| BgpkitCommonsError::Internal(format!("failed to compile regex: {}", e)))?; let content = oneio::read_to_string(BASE_URL)?; let mut res: Vec<(String, NaiveDate)> = data_link .captures_iter(content.as_str()) @@ -284,12 +288,12 @@ fn get_all_files_with_dates() -> Result> { fn get_most_recent_data() -> Result { let files = get_all_files_with_dates()?; - let last_file = files - .last() - .ok_or_else(|| BgpkitCommonsError::data_source_error( + let last_file = files.last().ok_or_else(|| { + BgpkitCommonsError::data_source_error( crate::errors::data_sources::CAIDA, "No dataset files found", - ))?; + ) + })?; Ok(last_file.0.clone()) } diff --git a/src/asinfo/peeringdb.rs b/src/asinfo/peeringdb.rs index 75c77d5..ee2c5af 100644 --- a/src/asinfo/peeringdb.rs +++ b/src/asinfo/peeringdb.rs @@ -160,21 +160,25 @@ impl Peeringdb { } /// Get all ASNs in the PeeringDB data + #[allow(dead_code)] pub fn get_all_asns(&self) -> Vec { self.peeringdb_map.keys().copied().collect() } /// Check if an ASN exists in PeeringDB + #[allow(dead_code)] pub fn contains(&self, asn: u32) -> bool { self.peeringdb_map.contains_key(&asn) } /// Get the number of networks in the database + #[allow(dead_code)] pub fn len(&self) -> usize { self.peeringdb_map.len() } /// Check if the database is empty + #[allow(dead_code)] pub fn is_empty(&self) -> bool { self.peeringdb_map.is_empty() } @@ -444,7 +448,7 @@ mod tests { fn test_peeringdb_get_data_existing() { // This test requires PEERINGDB_API_KEY to be set let peeringdb = Peeringdb::new().expect("Failed to load PeeringDB"); - + // Test with Cloudflare let data = peeringdb.get_data(13335); assert!(data.is_some()); @@ -459,7 +463,7 @@ mod tests { fn 
test_peeringdb_get_data_nonexistent() { // This test requires PEERINGDB_API_KEY to be set let peeringdb = Peeringdb::new().expect("Failed to load PeeringDB"); - + // Test with a non-existent ASN let data = peeringdb.get_data(999999999); assert!(data.is_none()); diff --git a/src/lib.rs b/src/lib.rs index 79b540d..06ca4c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,39 +6,39 @@ //! ## Available Modules //! //! ### [`asinfo`] - Autonomous System Information (requires `asinfo` feature) -//! **Load Method**: `load_asinfo(as2org, population, hegemony, peeringdb)` or `load_asinfo_cached()` -//! **Access Methods**: `asinfo_get(asn)`, `asinfo_all()` -//! **Data Sources**: RIPE NCC, CAIDA as2org, APNIC population, IIJ IHR hegemony, PeeringDB +//! **Load Method**: `load_asinfo(as2org, population, hegemony, peeringdb)` or `load_asinfo_cached()` +//! **Access Methods**: `asinfo_get(asn)`, `asinfo_all()` +//! **Data Sources**: RIPE NCC, CAIDA as2org, APNIC population, IIJ IHR hegemony, PeeringDB //! **Functionality**: AS name resolution, country mapping, organization data, population statistics, hegemony scores //! //! ### [`as2rel`] - AS Relationship Data (requires `as2rel` feature) -//! **Load Method**: `load_as2rel()` -//! **Access Methods**: `as2rel_lookup(asn1, asn2)` -//! **Data Sources**: BGPKIT AS relationship inference +//! **Load Method**: `load_as2rel()` +//! **Access Methods**: `as2rel_lookup(asn1, asn2)` +//! **Data Sources**: BGPKIT AS relationship inference //! **Functionality**: Provider-customer, peer-to-peer, and sibling relationships between ASes //! //! ### [`bogons`] - Bogon Detection (requires `bogons` feature) -//! **Load Method**: `load_bogons()` -//! **Access Methods**: `bogons_match(input)`, `bogons_match_prefix(prefix)`, `bogons_match_asn(asn)`, `get_bogon_prefixes()`, `get_bogon_asns()` -//! **Data Sources**: IANA special registries (IPv4, IPv6, ASN) +//! **Load Method**: `load_bogons()` +//! 
**Access Methods**: `bogons_match(input)`, `bogons_match_prefix(prefix)`, `bogons_match_asn(asn)`, `get_bogon_prefixes()`, `get_bogon_asns()` +//! **Data Sources**: IANA special registries (IPv4, IPv6, ASN) //! **Functionality**: Detect invalid/reserved IP prefixes and ASNs that shouldn't appear in routing //! //! ### [`countries`] - Country Information (requires `countries` feature) -//! **Load Method**: `load_countries()` -//! **Access Methods**: `country_by_code(code)`, country lookup by name -//! **Data Sources**: GeoNames geographical database +//! **Load Method**: `load_countries()` +//! **Access Methods**: `country_by_code(code)`, country lookup by name +//! **Data Sources**: GeoNames geographical database //! **Functionality**: ISO country code to name mapping and geographical information //! //! ### [`mrt_collectors`] - MRT Collector Metadata (requires `mrt_collectors` feature) -//! **Load Methods**: `load_mrt_collectors()`, `load_mrt_collector_peers()` -//! **Access Methods**: `mrt_collectors_all()`, `mrt_collector_peers()`, `mrt_collector_peers_full_feed()` -//! **Data Sources**: RouteViews and RIPE RIS official APIs +//! **Load Methods**: `load_mrt_collectors()`, `load_mrt_collector_peers()` +//! **Access Methods**: `mrt_collectors_all()`, `mrt_collector_peers()`, `mrt_collector_peers_full_feed()` +//! **Data Sources**: RouteViews and RIPE RIS official APIs //! **Functionality**: BGP collector information, peer details, full-feed vs partial-feed classification //! //! ### [`rpki`] - RPKI Validation (requires `rpki` feature) -//! **Load Method**: `load_rpki(optional_date)` -//! **Access Methods**: `rpki_validate(prefix, asn)` -//! **Data Sources**: RIPE NCC historical data, Cloudflare real-time data +//! **Load Method**: `load_rpki(optional_date)` +//! **Access Methods**: `rpki_validate(prefix, asn)` +//! **Data Sources**: RIPE NCC historical data, Cloudflare real-time data //! 
**Functionality**: Route Origin Authorization (ROA) validation, supports multiple ROAs per prefix //! //! ## Quick Start @@ -92,12 +92,12 @@ //! ### Module Features //! - `asinfo` - AS information with organization and population data //! - `as2rel` - AS relationship data -//! - `bogons` - Bogon prefix and ASN detection +//! - `bogons` - Bogon prefix and ASN detection //! - `countries` - Country information lookup //! - `mrt_collectors` - MRT collector metadata //! - `rpki` - RPKI validation functionality //! -//! ### Convenience Features +//! ### Convenience Features //! - `all` (default) - Enables all modules for backwards compatibility //! //! ### Minimal Build Example @@ -283,7 +283,12 @@ impl BgpkitCommons { Ok(()) } - /// Load RPKI data + /// Load RPKI data from Cloudflare (real-time) or historical archives + /// + /// - If `date_opt` is `None`, loads real-time data from Cloudflare + /// - If `date_opt` is `Some(date)`, loads historical data from RIPE NCC by default + /// + /// For more control over the data source, use `load_rpki_historical()` instead. #[cfg(feature = "rpki")] pub fn load_rpki(&mut self, date_opt: Option) -> Result<()> { if let Some(date) = date_opt { @@ -294,6 +299,120 @@ impl BgpkitCommons { Ok(()) } + /// Load RPKI data from a specific historical data source + /// + /// This allows you to choose between RIPE NCC and RPKIviews for historical data. 
+ /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; + /// use chrono::NaiveDate; + /// + /// let mut commons = BgpkitCommons::new(); + /// let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + /// + /// // Load from RIPE NCC + /// commons.load_rpki_historical(date, HistoricalRpkiSource::Ripe).unwrap(); + /// + /// // Or load from RPKIviews + /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::Kerfuffle); + /// commons.load_rpki_historical(date, source).unwrap(); + /// ``` + #[cfg(feature = "rpki")] + pub fn load_rpki_historical( + &mut self, + date: chrono::NaiveDate, + source: rpki::HistoricalRpkiSource, + ) -> Result<()> { + match source { + rpki::HistoricalRpkiSource::Ripe => { + self.rpki_trie = Some(rpki::RpkiTrie::from_ripe_historical(date)?); + } + rpki::HistoricalRpkiSource::RpkiViews(collector) => { + self.rpki_trie = Some(rpki::RpkiTrie::from_rpkiviews(collector, date)?); + } + } + Ok(()) + } + + /// Load RPKI data from specific file URLs + /// + /// This allows loading from specific archive files, which is useful when you want + /// to process multiple files or use specific timestamps. 
+ /// + /// # Arguments + /// + /// * `urls` - A slice of URLs pointing to RPKI data files + /// * `source` - The type of data source (RIPE or RPKIviews) - determines how files are parsed + /// * `date` - Optional date to associate with the loaded data + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// use bgpkit_commons::rpki::HistoricalRpkiSource; + /// + /// let mut commons = BgpkitCommons::new(); + /// let urls = vec![ + /// "https://example.com/rpki-20240104T144128Z.tgz".to_string(), + /// ]; + /// commons.load_rpki_from_files(&urls, HistoricalRpkiSource::RpkiViews( + /// bgpkit_commons::rpki::RpkiViewsCollector::Kerfuffle + /// ), None).unwrap(); + /// ``` + #[cfg(feature = "rpki")] + pub fn load_rpki_from_files( + &mut self, + urls: &[String], + source: rpki::HistoricalRpkiSource, + date: Option, + ) -> Result<()> { + match source { + rpki::HistoricalRpkiSource::Ripe => { + self.rpki_trie = Some(rpki::RpkiTrie::from_ripe_files(urls, date)?); + } + rpki::HistoricalRpkiSource::RpkiViews(_) => { + self.rpki_trie = Some(rpki::RpkiTrie::from_rpkiviews_files(urls, date)?); + } + } + Ok(()) + } + + /// List available RPKI files for a given date from a specific source + /// + /// # Example + /// + /// ```rust,no_run + /// use bgpkit_commons::BgpkitCommons; + /// use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; + /// use chrono::NaiveDate; + /// + /// let commons = BgpkitCommons::new(); + /// let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + /// + /// // List files from RIPE NCC + /// let ripe_files = commons.list_rpki_files(date, HistoricalRpkiSource::Ripe).unwrap(); + /// + /// // List files from RPKIviews + /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::Kerfuffle); + /// let rpkiviews_files = commons.list_rpki_files(date, source).unwrap(); + /// ``` + #[cfg(feature = "rpki")] + pub fn list_rpki_files( + &self, + date: chrono::NaiveDate, + source: 
rpki::HistoricalRpkiSource, + ) -> Result> { + match source { + rpki::HistoricalRpkiSource::Ripe => rpki::list_ripe_files(date), + rpki::HistoricalRpkiSource::RpkiViews(collector) => { + rpki::list_rpkiviews_files(collector, date) + } + } + } + /// Load MRT mrt_collectors data #[cfg(feature = "mrt_collectors")] pub fn load_mrt_collectors(&mut self) -> Result<()> { diff --git a/src/rpki/cloudflare.rs b/src/rpki/cloudflare.rs index c8dd6a1..3b94cd2 100644 --- a/src/rpki/cloudflare.rs +++ b/src/rpki/cloudflare.rs @@ -1,221 +1,36 @@ //! Load current RPKI information from Cloudflare RPKI portal. use crate::Result; -use chrono::DateTime; -use ipnet::IpNet; -use serde::{Deserialize, Serialize}; -use std::str::FromStr; -use super::{Rir, RoaEntry, RpkiTrie}; +use super::RpkiTrie; +use super::rpki_client::RpkiClientData; -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CfData { - pub metadata: CfMetaData, - pub roas: Vec, - pub aspas: Vec, - pub bgpsec_keys: Vec, -} - -impl CfData { - pub fn new() -> Result { - let data: CfData = - oneio::read_json_struct::("https://rpki.cloudflare.com/rpki.json")?; - Ok(data) - } -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfMetaData { - pub buildmachine: Option, - pub buildtime: Option, - pub elapsedtime: Option, - pub usertime: Option, - pub systemtime: Option, - pub roas: Option, - pub failedroas: Option, - pub invalidroas: Option, - pub spls: Option, - pub failedspls: Option, - pub invalidspls: Option, - pub aspas: Option, - pub failedaspas: Option, - pub invalidaspas: Option, - pub bgpsec_pubkeys: Option, - pub certificates: Option, - pub invalidcertificates: Option, - pub taks: Option, - pub tals: Option, - pub invalidtals: Option, - pub talfiles: Option>, - pub manifests: Option, - pub failedmanifests: Option, - pub crls: Option, - pub gbrs: Option, - pub repositories: Option, - pub vrps: Option, - pub uniquevrps: Option, - pub vsps: Option, - pub uniquevsps: Option, - pub vaps: Option, - pub 
uniquevaps: Option, - pub cachedir_new_files: Option, - pub cachedir_del_files: Option, - pub cachedir_del_dirs: Option, - pub cachedir_superfluous_files: Option, - pub cachedir_del_superfluous_files: Option, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfAspaEntry { - pub customer_asid: u32, - pub expires: i64, - pub providers: Vec, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfBgpsecKeysEntry { - pub asn: u32, - pub ski: String, - pub pubkey: String, - pub ta: String, - pub expires: i64, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct CfRoaEntry { - pub prefix: String, - #[serde(rename = "maxLength")] - pub max_length: u8, - pub asn: u32, - pub ta: String, - pub expires: u64, -} +const CLOUDFLARE_RPKI_URL: &str = "https://rpki.cloudflare.com/rpki.json"; impl RpkiTrie { + /// Load current RPKI data from Cloudflare RPKI portal. + /// + /// This loads real-time RPKI data from Cloudflare's public RPKI JSON endpoint. + /// The data includes ROAs, ASPAs, and BGPsec keys. 
pub fn from_cloudflare() -> Result { - let data: CfData = - oneio::read_json_struct::("https://rpki.cloudflare.com/rpki.json")?; - - let mut trie = RpkiTrie { - aspas: data.aspas, - ..Default::default() - }; - - for roa in data.roas { - let prefix = roa.prefix.parse::()?; - let max_length = roa.max_length; - let rir = Rir::from_str(roa.ta.as_str()).ok(); - - // Convert expires timestamp to NaiveDateTime - let not_after = - DateTime::from_timestamp(roa.expires as i64, 0).map(|dt| dt.naive_utc()); - - let roa_entry = RoaEntry { - prefix, - asn: roa.asn, - max_length, - rir, - not_before: None, - not_after, - }; - - trie.insert_roa(roa_entry); - } - Ok(trie) + let data = RpkiClientData::from_url(CLOUDFLARE_RPKI_URL)?; + Self::from_rpki_client_data(data, None) } } #[cfg(test)] mod tests { use super::*; - use chrono::{DateTime, Utc}; #[test] - #[ignore] // This test requires network access and is for manual testing - // Run with: cargo test --release --features rpki test_cloudflare_rpki_expiry_loading -- --ignored --nocapture - fn test_cloudflare_rpki_expiry_loading() { - println!("Loading RPKI data from Cloudflare..."); - - // Load the RPKI data + #[ignore] // Requires network access + fn test_from_cloudflare() { let trie = RpkiTrie::from_cloudflare().expect("Failed to load Cloudflare RPKI data"); - // Count total ROAs let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); - println!("Total ROAs loaded: {}", total_roas); - - // Count ROAs with expiry dates - let mut roas_with_expiry = 0; - let mut expired_roas = 0; - let mut future_roas = 0; - let current_time = Utc::now().naive_utc(); - - for (prefix, roas) in trie.trie.iter() { - for roa in roas { - if roa.not_after.is_some() { - roas_with_expiry += 1; - - if let Some(not_after) = roa.not_after { - if not_after < current_time { - expired_roas += 1; - println!( - "Expired ROA found: prefix={}, asn={}, expired={}", - prefix, roa.asn, not_after - ); - } - } - } - - if let Some(not_before) = 
roa.not_before { - if not_before > current_time { - future_roas += 1; - println!( - "Future ROA found: prefix={}, asn={}, valid_from={}", - prefix, roa.asn, not_before - ); - } - } - } - } - - println!("\nSummary:"); - println!("- ROAs with expiry dates: {}", roas_with_expiry); - println!("- Expired ROAs: {}", expired_roas); - println!("- Future ROAs: {}", future_roas); - - // Test expiry validation with a sample ROA if any have expiry dates - if roas_with_expiry > 0 { - // Find a ROA with expiry date to test - for (prefix, roas) in trie.trie.iter() { - for roa in roas { - if roa.not_after.is_some() { - println!( - "\nTesting validation with expiry check for prefix={}, asn={}", - prefix, roa.asn - ); - - // Test with current time - let validation = trie.validate_check_expiry(&prefix, roa.asn, None); - println!("Validation result (current time): {:?}", validation); - - // Test with a far future time - let future_time = DateTime::from_timestamp(3000000000, 0) - .map(|dt| dt.naive_utc()) - .unwrap(); - let future_validation = - trie.validate_check_expiry(&prefix, roa.asn, Some(future_time)); - println!("Validation result (future time): {:?}", future_validation); - - break; - } - } - if roas_with_expiry > 0 { - break; - } - } - } + println!("Loaded {} ROAs from Cloudflare", total_roas); + println!("Loaded {} ASPAs", trie.aspas.len()); - // Basic sanity check - we should have loaded some ROAs - assert!(total_roas > 0, "No ROAs were loaded from Cloudflare"); - println!("\nTest completed successfully!"); + assert!(total_roas > 0, "Should have loaded some ROAs"); } } diff --git a/src/rpki/mod.rs b/src/rpki/mod.rs index e7ae975..228d7a9 100644 --- a/src/rpki/mod.rs +++ b/src/rpki/mod.rs @@ -1,7 +1,7 @@ //! RPKI (Resource Public Key Infrastructure) validation and data structures. //! //! This module provides functionality for loading and validating RPKI data from multiple sources, -//! including real-time data from Cloudflare and historical data from RIPE NCC. +//! 
including real-time data from Cloudflare and historical data from RIPE NCC or RPKIviews. //! //! # Overview //! @@ -20,7 +20,7 @@ //! //! ## Historical Data (RIPE NCC) //! - **Source**: [RIPE NCC FTP archives](https://ftp.ripe.net/rpki/) -//! - **Format**: CSV files with historical RPKI states +//! - **Format**: JSON files (output.json.xz) with ROAs, ASPAs //! - **Use Case**: Historical analysis and research //! - **Date Range**: Configurable historical date //! - **TAL Sources**: @@ -30,15 +30,26 @@ //! - LACNIC: //! - RIPE NCC: //! +//! ## Historical Data (RPKIviews) +//! - **Source**: [RPKIviews](https://rpkiviews.org/) +//! - **Format**: Compressed tarballs (.tgz) containing rpki-client.json +//! - **Use Case**: Historical analysis from multiple vantage points +//! - **Default Collector**: Kerfuffle (rpkiviews.kerfuffle.net) +//! - **Collectors**: +//! - Josephine: A2B Internet (AS51088), Amsterdam, Netherlands +//! - Amber: Massar (AS57777), Lugano, Switzerland +//! - Dango: Internet Initiative Japan (AS2497), Tokyo, Japan +//! - Kerfuffle: Kerfuffle, LLC (AS35008), Fremont, California, United States +//! //! # Core Data Structures //! //! ## RpkiTrie //! The main data structure that stores RPKI data in a trie for efficient prefix lookups: -//! - **Trie**: `IpnetTrie>` - Maps IP prefixes to lists of ROA entries -//! - **ASPAs**: `Vec` - AS Provider Authorization records +//! - **Trie**: `IpnetTrie>` - Maps IP prefixes to lists of ROA entries +//! - **ASPAs**: `Vec` - AS Provider Authorization records //! - **Date**: `Option` - Optional date for historical data //! -//! ## RoaEntry +//! ## Roa //! Represents a Route Origin Authorization with the following fields: //! - `prefix: IpNet` - The IP prefix (e.g., 192.0.2.0/24) //! - `asn: u32` - The authorized ASN (e.g., 64496) @@ -47,38 +58,18 @@ //! - `not_before: Option` - ROA validity start time //! - `not_after: Option` - ROA validity end time (from expires field) //! +//! ## Aspa +//! 
Represents an AS Provider Authorization with the following fields: +//! - `customer_asn: u32` - The customer AS number +//! - `providers: Vec` - List of provider AS numbers +//! - `expires: Option` - When this ASPA expires +//! //! ## Validation Results //! RPKI validation returns one of three states: //! - **Valid**: The prefix-ASN pair is explicitly authorized by a valid ROA //! - **Invalid**: The prefix has ROAs but none authorize the given ASN //! - **Unknown**: No ROAs exist for the prefix, or all ROAs are outside their validity period //! -//! # Validation Process -//! -//! ## Standard Validation (`validate`) -//! 1. Look up all ROAs that cover the given prefix -//! 2. Check if any ROA authorizes the given ASN with appropriate max_length -//! 3. Return Valid/Invalid/Unknown based on matches -//! -//! ## Expiry-Aware Validation (`validate_check_expiry`) -//! 1. Look up all ROAs that cover the given prefix -//! 2. Filter ROAs to only include those within their validity time window: -//! - Check `not_before` ≤ check_time (if present) -//! - Check `not_after` ≥ check_time (if present) -//! 3. Among time-valid ROAs, check for ASN authorization -//! 4. Return validation result: -//! - **Valid**: Time-valid ROA found for the ASN -//! - **Invalid**: Time-valid ROAs exist but none authorize the ASN -//! - **Unknown**: No ROAs found, or all ROAs are outside validity period -//! -//! # Key Features -//! -//! - **Multiple ROAs per prefix**: A single prefix can have multiple valid ROAs with different ASNs -//! - **Duplicate prevention**: ROAs with identical (prefix, asn, max_length) are automatically deduplicated -//! - **Efficient lookup**: Fast prefix matching using a trie data structure -//! - **Temporal validation**: Support for time-aware validation with expiry checking -//! - **Comprehensive validation**: Full RPKI validation against stored ROAs -//! //! # Usage Examples //! //! ## Loading Real-time Data (Cloudflare) @@ -98,131 +89,141 @@ //! 
bgpkit_commons::rpki::RpkiValidation::Invalid => println!("Route is RPKI invalid"), //! bgpkit_commons::rpki::RpkiValidation::Unknown => println!("No RPKI data for this prefix"), //! } -//! -//! // Validate with expiry checking (current time) -//! let result = commons.rpki_validate_check_expiry(64496, "192.0.2.0/24", None)?; -//! -//! // Validate with expiry checking (specific time) -//! use chrono::{DateTime, Utc}; -//! let check_time = DateTime::from_timestamp(1700000000, 0).unwrap().naive_utc(); -//! let result = commons.rpki_validate_check_expiry(64496, "192.0.2.0/24", Some(check_time))?; //! # Ok(()) //! # } //! ``` //! -//! ## Loading Historical Data (RIPE) +//! ## Loading Historical Data with Source Selection //! ```rust,no_run //! use bgpkit_commons::BgpkitCommons; +//! use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; //! use chrono::NaiveDate; //! //! # fn main() -> Result<(), Box> { //! let mut commons = BgpkitCommons::new(); +//! let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); //! -//! // Load RPKI data for a specific historical date -//! let date = NaiveDate::from_ymd_opt(2023, 6, 15).unwrap(); -//! commons.load_rpki(Some(date))?; +//! // Load from RIPE NCC +//! commons.load_rpki_historical(date, HistoricalRpkiSource::Ripe)?; //! -//! // Validate using historical data -//! let result = commons.rpki_validate(64496, "192.0.2.0/24")?; +//! // Or load from RPKIviews (uses Kerfuffle collector by default) +//! let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::default()); +//! commons.load_rpki_historical(date, source)?; //! # Ok(()) //! # } //! ``` //! -//! ## Direct Trie Usage -//! ```rust,no_run -//! use bgpkit_commons::rpki::{RpkiTrie, RpkiValidation}; -//! use ipnet::IpNet; -//! -//! # fn main() -> Result<(), Box> { -//! // Load from Cloudflare directly -//! let trie = RpkiTrie::from_cloudflare()?; -//! -//! // Lookup all ROAs for a prefix -//! let prefix: IpNet = "192.0.2.0/24".parse()?; -//! 
let roas = trie.lookup_by_prefix(&prefix); -//! println!("Found {} ROAs for prefix", roas.len()); -//! -//! // Validate with expiry checking -//! let result = trie.validate_check_expiry(&prefix, 64496, None); -//! # Ok(()) -//! # } -//! ``` -//! -//! ## Handling Multiple ROAs -//! A single prefix can have multiple ROAs with different ASNs and validity periods: +//! ## Listing Available Files //! ```rust,no_run //! use bgpkit_commons::BgpkitCommons; +//! use bgpkit_commons::rpki::{HistoricalRpkiSource, RpkiViewsCollector}; +//! use chrono::NaiveDate; //! //! # fn main() -> Result<(), Box> { -//! let mut commons = BgpkitCommons::new(); -//! commons.load_rpki(None)?; -//! -//! // Look up all ROAs for a prefix -//! let roas = commons.rpki_lookup_by_prefix("192.0.2.0/24")?; -//! for roa in roas { -//! println!("ASN: {}, Max Length: {}, Expires: {:?}", -//! roa.asn, roa.max_length, roa.not_after); +//! let commons = BgpkitCommons::new(); +//! let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); +//! +//! // List available files from RPKIviews (multiple snapshots per day) +//! let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::default()); +//! let rpkiviews_files = commons.list_rpki_files(date, source)?; +//! for file in &rpkiviews_files { +//! println!("RPKIviews file: {} (timestamp: {})", file.url, file.timestamp); //! } //! # Ok(()) //! # } //! ``` -//! -//! # Performance Considerations -//! -//! - **Trie Structure**: Uses `ipnet-trie` for O(log n) prefix lookups -//! - **Memory Usage**: Stores all ROAs in memory for fast access -//! - **Loading Time**: Initial load from Cloudflare takes a few seconds -//! - **Caching**: No automatic caching - reload when fresh data is needed -//! -//! # Error Handling -//! -//! All validation methods return `Result` and can fail due to: -//! - Network errors when loading data -//! - Invalid prefix format in input -//! 
- RPKI data not loaded (call `load_rpki_*` methods first) mod cloudflare; mod ripe_historical; -// mod rpkiviews; +pub(crate) mod rpki_client; +mod rpkiviews; -use chrono::{NaiveDate, NaiveDateTime, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; use ipnet::IpNet; use ipnet_trie::IpnetTrie; use crate::errors::{load_methods, modules}; use crate::{BgpkitCommons, BgpkitCommonsError, LazyLoadable, Result}; -pub use cloudflare::*; +pub use ripe_historical::list_ripe_files; +use rpki_client::RpkiClientData; +pub use rpkiviews::{RpkiViewsCollector, list_rpkiviews_files}; use serde::{Deserialize, Serialize}; use std::fmt::Display; use std::str::FromStr; -#[derive(Clone)] -pub struct RpkiTrie { - pub trie: IpnetTrie>, - pub aspas: Vec, - date: Option, -} - -impl Default for RpkiTrie { - fn default() -> Self { - Self { - trie: IpnetTrie::new(), - aspas: vec![], - date: None, - } - } -} +// ============================================================================ +// Public Data Structures +// ============================================================================ +/// A validated Route Origin Authorization (ROA). +/// +/// ROAs authorize specific Autonomous Systems to originate specific IP prefixes. #[derive(Clone, Debug, Serialize, Deserialize)] -pub struct RoaEntry { +pub struct Roa { + /// The IP prefix (e.g., 192.0.2.0/24 or 2001:db8::/32) pub prefix: IpNet, + /// The AS number authorized to originate this prefix pub asn: u32, + /// Maximum prefix length allowed for announcements pub max_length: u8, + /// Regional Internet Registry that issued this ROA pub rir: Option, + /// ROA validity start time (if available) pub not_before: Option, + /// ROA validity end time (from expires field) pub not_after: Option, } +/// A validated AS Provider Authorization (ASPA). +/// +/// ASPAs specify which ASes are authorized providers for a customer AS. 
+#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Aspa { + /// The customer AS number + pub customer_asn: u32, + /// List of provider AS numbers + pub providers: Vec, + /// When this ASPA expires + pub expires: Option, +} + +/// Information about an available RPKI data file. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RpkiFile { + /// Full URL to download the file + pub url: String, + /// Timestamp when the file was created + pub timestamp: DateTime, + /// Size of the file in bytes (if available) + pub size: Option, + /// RIR that this file is for (for RIPE files) + pub rir: Option, + /// Collector that provides this file (for RPKIviews files) + pub collector: Option, +} + +/// Historical RPKI data source. +/// +/// Used to specify which data source to use when loading historical RPKI data. +#[derive(Debug, Clone, Default)] +pub enum HistoricalRpkiSource { + /// RIPE NCC historical archives (data from all 5 RIRs) + #[default] + Ripe, + /// RPKIviews collector + RpkiViews(RpkiViewsCollector), +} + +impl std::fmt::Display for HistoricalRpkiSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HistoricalRpkiSource::Ripe => write!(f, "RIPE NCC"), + HistoricalRpkiSource::RpkiViews(collector) => write!(f, "RPKIviews ({})", collector), + } + } +} + +/// Regional Internet Registry (RIR). #[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum Rir { AFRINIC, @@ -271,10 +272,14 @@ impl Rir { } } +/// RPKI validation result. 
#[derive(Clone, Debug, PartialEq, Eq)] pub enum RpkiValidation { + /// The prefix-ASN pair is explicitly authorized by a valid ROA Valid, + /// The prefix has ROAs but none authorize the given ASN Invalid, + /// No ROAs exist for the prefix, or all ROAs are outside their validity period Unknown, } @@ -288,7 +293,41 @@ impl Display for RpkiValidation { } } +// ============================================================================ +// Backwards Compatibility Type Aliases +// ============================================================================ + +/// Type alias for backwards compatibility. Use [`Roa`] instead. +#[deprecated(since = "0.10.0", note = "Use Roa instead")] +pub type RoaEntry = Roa; + +// ============================================================================ +// RpkiTrie Implementation +// ============================================================================ + +/// The main RPKI data structure storing ROAs and ASPAs. +#[derive(Clone)] +pub struct RpkiTrie { + /// Trie mapping IP prefixes to ROA entries + pub trie: IpnetTrie>, + /// AS Provider Authorizations + pub aspas: Vec, + /// Date for historical data (None for real-time) + date: Option, +} + +impl Default for RpkiTrie { + fn default() -> Self { + Self { + trie: IpnetTrie::new(), + aspas: vec![], + date: None, + } + } +} + impl RpkiTrie { + /// Create a new empty RpkiTrie. pub fn new(date: Option) -> Self { Self { trie: IpnetTrie::new(), @@ -297,9 +336,9 @@ impl RpkiTrie { } } - /// insert an [RoaEntry]. Returns true if this is a new prefix, false if added to existing prefix. + /// Insert a ROA. Returns true if this is a new prefix, false if added to existing prefix. /// Duplicates are avoided - ROAs with same (prefix, asn, max_length) are considered identical. 
- pub fn insert_roa(&mut self, roa: RoaEntry) -> bool { + pub fn insert_roa(&mut self, roa: Roa) -> bool { match self.trie.exact_match_mut(roa.prefix) { Some(existing_roas) => { // Check if this ROA already exists (same prefix, asn, max_length) @@ -317,15 +356,71 @@ impl RpkiTrie { } } - /// insert multiple [RoaEntry]s - pub fn insert_roas(&mut self, roas: Vec) { + /// Insert multiple ROAs. + pub fn insert_roas(&mut self, roas: Vec) { for roa in roas { self.insert_roa(roa); } } - /// Lookup all ROAs that match a given prefix, including invalid ones - pub fn lookup_by_prefix(&self, prefix: &IpNet) -> Vec { + /// Convert rpki-client data into an RpkiTrie. + /// + /// This is a shared conversion function used by all data sources + /// (Cloudflare, RIPE, RPKIviews) since they all produce the same + /// rpki-client JSON format. + pub(crate) fn from_rpki_client_data( + data: RpkiClientData, + date: Option, + ) -> Result { + let mut trie = RpkiTrie::new(date); + trie.merge_rpki_client_data(data); + Ok(trie) + } + + /// Merge rpki-client data into this trie. + /// + /// This converts ROAs and ASPAs from rpki-client format and inserts them, + /// avoiding duplicates for ASPAs based on customer_asn. 
+ pub(crate) fn merge_rpki_client_data(&mut self, data: RpkiClientData) { + // Convert and insert ROAs + for roa in data.roas { + let prefix = match roa.prefix.parse::() { + Ok(p) => p, + Err(_) => continue, + }; + let rir = Rir::from_str(&roa.ta).ok(); + let not_after = + DateTime::from_timestamp(roa.expires as i64, 0).map(|dt| dt.naive_utc()); + + self.insert_roa(Roa { + prefix, + asn: roa.asn, + max_length: roa.max_length, + rir, + not_before: None, + not_after, + }); + } + + // Convert and merge ASPAs (avoiding duplicates based on customer_asn) + for aspa in data.aspas { + if !self + .aspas + .iter() + .any(|a| a.customer_asn == aspa.customer_asid) + { + let expires = DateTime::from_timestamp(aspa.expires, 0).map(|dt| dt.naive_utc()); + self.aspas.push(Aspa { + customer_asn: aspa.customer_asid, + providers: aspa.providers, + expires, + }); + } + } + } + + /// Lookup all ROAs that match a given prefix, including invalid ones. + pub fn lookup_by_prefix(&self, prefix: &IpNet) -> Vec { let mut all_matches = vec![]; for (p, roas) in self.trie.matches(prefix) { if p.contains(prefix) { @@ -339,7 +434,7 @@ impl RpkiTrie { all_matches } - /// Validate a prefix with an ASN + /// Validate a prefix with an ASN. /// /// Return values: /// - `RpkiValidation::Valid` if the prefix-asn pair is valid @@ -360,7 +455,7 @@ impl RpkiTrie { RpkiValidation::Invalid } - /// Validate a prefix with an ASN, checking expiry dates + /// Validate a prefix with an ASN, checking expiry dates. /// /// Return values: /// - `RpkiValidation::Valid` if the prefix-asn pair is valid and not expired @@ -423,15 +518,18 @@ impl RpkiTrie { RpkiValidation::Invalid } + /// Reload the RPKI data from its original source. 
pub fn reload(&mut self) -> Result<()> { match self.date { Some(date) => { let trie = RpkiTrie::from_ripe_historical(date)?; self.trie = trie.trie; + self.aspas = trie.aspas; } None => { let trie = RpkiTrie::from_cloudflare()?; self.trie = trie.trie; + self.aspas = trie.aspas; } } @@ -457,8 +555,12 @@ impl LazyLoadable for RpkiTrie { } } +// ============================================================================ +// BgpkitCommons Integration +// ============================================================================ + impl BgpkitCommons { - pub fn rpki_lookup_by_prefix(&self, prefix: &str) -> Result> { + pub fn rpki_lookup_by_prefix(&self, prefix: &str) -> Result> { if self.rpki_trie.is_none() { return Err(BgpkitCommonsError::module_not_loaded( modules::RPKI, @@ -503,6 +605,10 @@ impl BgpkitCommons { } } +// ============================================================================ +// Tests +// ============================================================================ + #[cfg(test)] mod tests { use super::*; @@ -512,179 +618,250 @@ mod tests { fn test_multiple_roas_same_prefix() { let mut trie = RpkiTrie::new(None); - // Create a test prefix - let prefix: IpNet = "10.0.0.0/8".parse().unwrap(); - - // Create multiple ROAs for the same prefix with different ASNs - let roa1 = RoaEntry { - prefix, + // Insert first ROA + let roa1 = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64496, - max_length: 16, - rir: Some(Rir::ARIN), + max_length: 24, + rir: Some(Rir::APNIC), not_before: None, not_after: None, }; + assert!(trie.insert_roa(roa1.clone())); - let roa2 = RoaEntry { - prefix, + // Insert second ROA with different ASN for same prefix + let roa2 = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64497, max_length: 24, - rir: Some(Rir::ARIN), + rir: Some(Rir::APNIC), not_before: None, not_after: None, }; + assert!(!trie.insert_roa(roa2.clone())); - // Create a duplicate ROA (same prefix, asn, max_length as roa1) - let roa1_duplicate = 
RoaEntry { - prefix, + // Insert duplicate ROA (same prefix, asn, max_length) - should be ignored + let roa_dup = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64496, - max_length: 16, - rir: Some(Rir::APNIC), // Different RIR but same (prefix, asn, max_length) + max_length: 24, + rir: Some(Rir::ARIN), // Different RIR shouldn't matter not_before: None, not_after: None, }; + assert!(!trie.insert_roa(roa_dup)); - // Insert ROAs - assert!(trie.insert_roa(roa1)); // Should return true for new prefix - assert!(!trie.insert_roa(roa2)); // Should return false for existing prefix - assert!(!trie.insert_roa(roa1_duplicate)); // Should return false and not add duplicate - - // Lookup should return only 2 ROAs (duplicate should be ignored) - let matches = trie.lookup_by_prefix(&prefix); - assert_eq!(matches.len(), 2); + // Insert ROA with different max_length - should be added + let roa3 = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), + asn: 64496, + max_length: 28, + rir: Some(Rir::APNIC), + not_before: None, + not_after: None, + }; + assert!(!trie.insert_roa(roa3.clone())); - // Check that both ASNs are present - let asns: std::collections::HashSet = matches.iter().map(|r| r.asn).collect(); - assert!(asns.contains(&64496)); - assert!(asns.contains(&64497)); + // Lookup should return 3 ROAs (roa1, roa2, roa3) + let prefix: IpNet = "192.0.2.0/24".parse().unwrap(); + let roas = trie.lookup_by_prefix(&prefix); + assert_eq!(roas.len(), 3); - // Test validation - should be valid for both ASNs + // Validate AS 64496 - should be valid assert_eq!(trie.validate(&prefix, 64496), RpkiValidation::Valid); + + // Validate AS 64497 - should be valid assert_eq!(trie.validate(&prefix, 64497), RpkiValidation::Valid); + + // Validate AS 64498 - should be invalid (prefix has ROAs but not for this ASN) assert_eq!(trie.validate(&prefix, 64498), RpkiValidation::Invalid); + + // Validate unknown prefix - should be unknown + let unknown_prefix: IpNet = "10.0.0.0/8".parse().unwrap(); + 
assert_eq!( + trie.validate(&unknown_prefix, 64496), + RpkiValidation::Unknown + ); } #[test] fn test_validate_check_expiry() { let mut trie = RpkiTrie::new(None); - // Create a test prefix - let prefix: IpNet = "10.0.0.0/8".parse().unwrap(); - - // Create test dates - let past = DateTime::from_timestamp(1000000000, 0).unwrap().naive_utc(); // 2001-09-09 - let present = DateTime::from_timestamp(1700000000, 0).unwrap().naive_utc(); // 2023-11-14 - let future = DateTime::from_timestamp(2000000000, 0).unwrap().naive_utc(); // 2033-05-18 - - // Test 1: ROA with no time constraints - let roa_no_time = RoaEntry { - prefix, + // Time references + let past_time = DateTime::from_timestamp(1600000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); + let current_time = DateTime::from_timestamp(1700000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); + let future_time = DateTime::from_timestamp(1800000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); + + // Insert ROA that's currently valid (not_before in past, not_after in future) + let roa_valid = Roa { + prefix: "192.0.2.0/24".parse().unwrap(), asn: 64496, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: None, - not_after: None, + max_length: 24, + rir: Some(Rir::APNIC), + not_before: Some(past_time), + not_after: Some(future_time), }; - trie.insert_roa(roa_no_time.clone()); - - // Should be valid at any time - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, Some(past)), - RpkiValidation::Valid - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, Some(present)), - RpkiValidation::Valid - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, Some(future)), - RpkiValidation::Valid - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64496, None), - RpkiValidation::Valid - ); + trie.insert_roa(roa_valid); - // Test 2: ROA that's expired - let expired_roa = RoaEntry { - prefix, + // Insert ROA that's expired + let roa_expired = Roa { + prefix: "198.51.100.0/24".parse().unwrap(), asn: 64497, - 
max_length: 16, - rir: Some(Rir::ARIN), - not_before: None, - not_after: Some(past), + max_length: 24, + rir: Some(Rir::APNIC), + not_before: Some(past_time), + not_after: Some(past_time), // Expired in the past }; - trie.insert_roa(expired_roa); - - // Should be unknown after expiry (ROA exists but is outside valid time range) - assert_eq!( - trie.validate_check_expiry(&prefix, 64497, Some(present)), - RpkiValidation::Unknown - ); - assert_eq!( - trie.validate_check_expiry(&prefix, 64497, None), - RpkiValidation::Unknown - ); + trie.insert_roa(roa_expired); - // Test 3: ROA that's not yet valid - let future_roa = RoaEntry { - prefix, + // Insert ROA that's not yet valid + let roa_future = Roa { + prefix: "203.0.113.0/24".parse().unwrap(), asn: 64498, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: Some(future), + max_length: 24, + rir: Some(Rir::APNIC), + not_before: Some(future_time), // Not valid yet not_after: None, }; - trie.insert_roa(future_roa); + trie.insert_roa(roa_future); - // Should be unknown before validity period (ROA exists but is outside valid time range) + // Test valid ROA at current time + let prefix_valid: IpNet = "192.0.2.0/24".parse().unwrap(); assert_eq!( - trie.validate_check_expiry(&prefix, 64498, Some(present)), - RpkiValidation::Unknown + trie.validate_check_expiry(&prefix_valid, 64496, Some(current_time)), + RpkiValidation::Valid ); + + // Test expired ROA at current time - should return Unknown (was valid but expired) + let prefix_expired: IpNet = "198.51.100.0/24".parse().unwrap(); assert_eq!( - trie.validate_check_expiry(&prefix, 64498, None), + trie.validate_check_expiry(&prefix_expired, 64497, Some(current_time)), RpkiValidation::Unknown ); - // Test 4: ROA with valid time window - let windowed_roa = RoaEntry { - prefix, - asn: 64499, - max_length: 16, - rir: Some(Rir::ARIN), - not_before: Some(past), - not_after: Some(future), - }; - trie.insert_roa(windowed_roa); - - // Should be valid within window + // Test 
not-yet-valid ROA at current time - should return Unknown + let prefix_future: IpNet = "203.0.113.0/24".parse().unwrap(); assert_eq!( - trie.validate_check_expiry(&prefix, 64499, Some(present)), - RpkiValidation::Valid - ); - // Should be unknown outside window (ROA exists but is outside valid time range) - assert_eq!( - trie.validate_check_expiry( - &prefix, - 64499, - Some(DateTime::from_timestamp(900000000, 0).unwrap().naive_utc()) - ), + trie.validate_check_expiry(&prefix_future, 64498, Some(current_time)), RpkiValidation::Unknown ); + + // Test not-yet-valid ROA at future time - should return Valid + let far_future = DateTime::from_timestamp(1900000000, 0) + .map(|dt| dt.naive_utc()) + .unwrap(); assert_eq!( - trie.validate_check_expiry( - &prefix, - 64499, - Some(DateTime::from_timestamp(2100000000, 0).unwrap().naive_utc()) - ), - RpkiValidation::Unknown + trie.validate_check_expiry(&prefix_future, 64498, Some(far_future)), + RpkiValidation::Valid ); - // Test 5: Ensure Invalid is still returned for wrong ASN + // Test wrong ASN - should return Invalid assert_eq!( - trie.validate_check_expiry(&prefix, 99999, Some(present)), + trie.validate_check_expiry(&prefix_valid, 64499, Some(current_time)), RpkiValidation::Invalid ); } + + #[test] + #[ignore] // Requires network access + fn test_load_from_ripe_historical() { + // Use a recent date that should have data available + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let trie = RpkiTrie::from_ripe_historical(date).expect("Failed to load RIPE data"); + + let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); + println!( + "Loaded {} ROAs from RIPE historical for {}", + total_roas, date + ); + println!("Loaded {} ASPAs", trie.aspas.len()); + + assert!(total_roas > 0, "Should have loaded some ROAs"); + } + + #[test] + #[ignore] // Requires network access + fn test_load_from_rpkiviews() { + // Note: This test streams from a remote tgz file but stops early + // once rpki-client.json 
is found (typically at position 3-4 in the archive). + // Due to streaming optimization, this typically completes in ~8 seconds. + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::default(), date) + .expect("Failed to load RPKIviews data"); + + let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); + println!("Loaded {} ROAs from RPKIviews for {}", total_roas, date); + println!("Loaded {} ASPAs", trie.aspas.len()); + + assert!(total_roas > 0, "Should have loaded some ROAs"); + } + + #[test] + #[ignore] // Requires network access + fn test_rpkiviews_file_position() { + // Verify that rpki-client.json appears early in the archive + // This confirms our early-termination optimization works + use crate::rpki::rpkiviews::list_files_in_tgz; + + let date = NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::default(), date) + .expect("Failed to list files"); + + assert!(!files.is_empty(), "Should have found some files"); + + let tgz_url = &files[0].url; + println!("Checking file positions in: {}", tgz_url); + + // List first 50 entries to see where rpki-client.json appears + let entries = list_files_in_tgz(tgz_url, Some(50)).expect("Failed to list tgz entries"); + + let json_position = entries + .iter() + .position(|e| e.path.ends_with("rpki-client.json")); + + println!("First {} entries:", entries.len()); + for (i, entry) in entries.iter().enumerate() { + println!(" [{}] {} ({} bytes)", i, entry.path, entry.size); + } + + if let Some(pos) = json_position { + println!( + "\nrpki-client.json found at position {} (early in archive)", + pos + ); + assert!( + pos < 50, + "rpki-client.json should appear early in the archive" + ); + } else { + println!("\nrpki-client.json not in first 50 entries - may need to stream more"); + } + } + + #[test] + #[ignore] // Requires network access + fn test_list_rpkiviews_files() { + let date = 
NaiveDate::from_ymd_opt(2024, 6, 1).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::default(), date) + .expect("Failed to list files"); + + println!("Found {} files for {} from Kerfuffle", files.len(), date); + for file in files.iter().take(3) { + println!( + " {} ({} bytes, {})", + file.url, + file.size.unwrap_or(0), + file.timestamp + ); + } + + assert!(!files.is_empty(), "Should have found some files"); + } } diff --git a/src/rpki/ripe_historical.rs b/src/rpki/ripe_historical.rs index 9ff2cdd..bfa9f13 100644 --- a/src/rpki/ripe_historical.rs +++ b/src/rpki/ripe_historical.rs @@ -1,15 +1,64 @@ -//! load RIPE NCC historical RPKI VRP dump +//! Load RIPE NCC historical RPKI VRP dump using JSON format. +//! +//! RIPE NCC provides historical RPKI data archives for all 5 RIRs: +//! - AFRINIC: +//! - APNIC: +//! - ARIN: +//! - LACNIC: +//! - RIPE NCC: use crate::Result; -use crate::rpki::{Rir, RoaEntry, RpkiTrie}; -use chrono::{Datelike, NaiveDate, NaiveDateTime}; -use ipnet::IpNet; -use std::str::FromStr; +use crate::rpki::rpki_client::RpkiClientData; +use crate::rpki::{Rir, RpkiFile, RpkiTrie}; +use chrono::{Datelike, NaiveDate, Utc}; use tracing::info; +/// List available RIPE output.json.xz files for a given date from all RIRs. +pub fn list_ripe_files(date: NaiveDate) -> Result> { + let mut files = vec![]; + + for rir in [ + Rir::AFRINIC, + Rir::APNIC, + Rir::ARIN, + Rir::LACNIC, + Rir::RIPENCC, + ] { + let base_url = rir.to_ripe_ftp_root_url(); + let url = format!( + "{}/{:04}/{:02}/{:02}/output.json.xz", + base_url, + date.year(), + date.month(), + date.day() + ); + + files.push(RpkiFile { + url, + // We don't have exact timestamp for RIPE files, use midnight of the date + timestamp: date + .and_hms_opt(0, 0, 0) + .unwrap() + .and_utc() + .fixed_offset() + .with_timezone(&Utc), + size: None, + rir: Some(rir), + collector: None, + }); + } + + Ok(files) +} + impl RpkiTrie { + /// Load RPKI data from RIPE NCC historical archives for a specific date. 
+ /// + /// This loads data from all 5 RIRs (AFRINIC, APNIC, ARIN, LACNIC, RIPENCC) + /// using the output.json.xz format which contains ROAs and ASPAs. pub fn from_ripe_historical(date: NaiveDate) -> Result { let mut trie = RpkiTrie::new(Some(date)); + for rir in [ Rir::AFRINIC, Rir::APNIC, @@ -17,55 +66,32 @@ impl RpkiTrie { Rir::LACNIC, Rir::RIPENCC, ] { - let roas = Self::load_vrp_from_ripe(rir, date)?; - for roa in roas { - trie.insert_roa(roa); - } + let url = format!( + "{}/{:04}/{:02}/{:02}/output.json.xz", + rir.to_ripe_ftp_root_url(), + date.year(), + date.month(), + date.day() + ); + info!("loading {} ROAs from {}", rir, url); + + let data = RpkiClientData::from_url(&url)?; + trie.merge_rpki_client_data(data); } + Ok(trie) } - fn load_vrp_from_ripe(rir: Rir, date: NaiveDate) -> Result> { - let mut roas = vec![]; - let base_url = rir.to_ripe_ftp_root_url(); - let url = format!( - "{}/{:04}/{:02}/{:02}/roas.csv.xz", - base_url, - date.year(), - date.month(), - date.day() - ); - info!("loading {} ROAs from {}", rir, url); - for line in oneio::read_lines(&url)?.skip(1) { - let line = line?; - let mut fields = line.split(','); - let _uri = fields.next().unwrap(); - let asn = fields - .next() - .unwrap() - .to_lowercase() - .strip_prefix("as") - .unwrap() - .parse::()?; - let prefix = IpNet::from_str(fields.next().unwrap())?; - let max_length = match fields.next().unwrap().parse::() { - Ok(l) => l, - Err(_) => continue, - }; - let not_before = - NaiveDateTime::parse_from_str(fields.next().unwrap(), "%Y-%m-%d %H:%M:%S").ok(); - let not_after = - NaiveDateTime::parse_from_str(fields.next().unwrap(), "%Y-%m-%d %H:%M:%S").ok(); - let roa_entry = RoaEntry { - prefix, - asn, - max_length, - rir: Some(rir), - not_before, - not_after, - }; - roas.push(roa_entry); + /// Load RPKI data from specific RIPE file URLs. 
+ pub fn from_ripe_files(urls: &[String], date: Option) -> Result { + let mut trie = RpkiTrie::new(date); + + for url in urls { + info!("loading ROAs from {}", url); + let data = RpkiClientData::from_url(url)?; + trie.merge_rpki_client_data(data); } - Ok(roas) + + Ok(trie) } } diff --git a/src/rpki/rpki_client.rs b/src/rpki/rpki_client.rs new file mode 100644 index 0000000..753c733 --- /dev/null +++ b/src/rpki/rpki_client.rs @@ -0,0 +1,371 @@ +//! Internal data structures for parsing rpki-client JSON output format. +//! +//! The `rpki-client` software produces JSON output that is used by multiple RPKI data sources: +//! - Cloudflare RPKI Portal () +//! - RIPE NCC historical archives (output.json.xz files) +//! - RPKIviews collectors (rpki-client.json inside .tgz files) +//! +//! This module defines the internal data structures for parsing this JSON format. +//! For public access, use the `Roa` and `Aspa` structs from the parent module. + +use serde::{Deserialize, Deserializer, Serialize}; + +/// Custom deserializer for ASN that handles both numeric and string formats. +/// RIPE uses "AS12345" format, while Cloudflare uses numeric 12345. 
+fn deserialize_asn<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + use serde::de::{self, Visitor}; + + struct AsnVisitor; + + impl<'de> Visitor<'de> for AsnVisitor { + type Value = u32; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("an ASN as a number or string like 'AS12345'") + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + u32::try_from(value).map_err(|_| E::custom(format!("ASN {} out of range", value))) + } + + fn visit_i64(self, value: i64) -> Result + where + E: de::Error, + { + u32::try_from(value).map_err(|_| E::custom(format!("ASN {} out of range", value))) + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + // Handle "AS12345" or "as12345" format + let num_str = value + .strip_prefix("AS") + .or_else(|| value.strip_prefix("as")) + .unwrap_or(value); + + num_str + .parse::() + .map_err(|_| E::custom(format!("invalid ASN string: {}", value))) + } + } + + deserializer.deserialize_any(AsnVisitor) +} + +/// Custom deserializer for expires that handles both i64 and u64. +fn deserialize_expires<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + use serde::de::{self, Visitor}; + + struct ExpiresVisitor; + + impl<'de> Visitor<'de> for ExpiresVisitor { + type Value = u64; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a timestamp as a number") + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + Ok(value) + } + + fn visit_i64(self, value: i64) -> Result + where + E: de::Error, + { + if value >= 0 { + Ok(value as u64) + } else { + Err(E::custom(format!("negative timestamp: {}", value))) + } + } + } + + deserializer.deserialize_any(ExpiresVisitor) +} + +/// Custom deserializer for provider list that handles both string array and number array. +/// RIPE uses ["AS123", "AS456"] format, Cloudflare uses [123, 456]. 
+fn deserialize_providers<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + use serde::de::{self, SeqAccess, Visitor}; + + struct ProvidersVisitor; + + impl<'de> Visitor<'de> for ProvidersVisitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a list of ASNs as numbers or strings") + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + let mut providers = Vec::new(); + + while let Some(elem) = seq.next_element::()? { + let asn = match elem { + serde_json::Value::Number(n) => n + .as_u64() + .and_then(|v| u32::try_from(v).ok()) + .ok_or_else(|| de::Error::custom("invalid ASN number"))?, + serde_json::Value::String(s) => { + let num_str = s + .strip_prefix("AS") + .or_else(|| s.strip_prefix("as")) + .unwrap_or(&s); + num_str + .parse::() + .map_err(|_| de::Error::custom(format!("invalid ASN string: {}", s)))? + } + _ => return Err(de::Error::custom("expected number or string")), + }; + providers.push(asn); + } + + Ok(providers) + } + } + + deserializer.deserialize_seq(ProvidersVisitor) +} + +/// The main rpki-client JSON output structure. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub(crate) struct RpkiClientData { + #[serde(default)] + pub metadata: RpkiClientMetadata, + #[serde(default)] + pub roas: Vec, + #[serde(default)] + pub aspas: Vec, + #[serde(default)] + pub bgpsec_keys: Vec, +} + +/// Metadata about the rpki-client validation run. 
+#[derive(Clone, Debug, Serialize, Deserialize, Default)] +pub(crate) struct RpkiClientMetadata { + pub buildmachine: Option, + pub buildtime: Option, + #[serde(default)] + pub generated: Option, + #[serde(rename = "generatedTime", default)] + pub generated_time: Option, + pub elapsedtime: Option, + pub usertime: Option, + pub systemtime: Option, + pub roas: Option, + pub failedroas: Option, + pub invalidroas: Option, + pub spls: Option, + pub failedspls: Option, + pub invalidspls: Option, + pub aspas: Option, + pub failedaspas: Option, + pub invalidaspas: Option, + pub bgpsec_pubkeys: Option, + pub certificates: Option, + pub invalidcertificates: Option, + pub taks: Option, + pub tals: Option, + pub invalidtals: Option, + pub talfiles: Option>, + pub manifests: Option, + pub failedmanifests: Option, + pub crls: Option, + pub gbrs: Option, + pub repositories: Option, + pub vrps: Option, + pub uniquevrps: Option, + pub vsps: Option, + pub uniquevsps: Option, + pub vaps: Option, + pub uniquevaps: Option, + pub cachedir_new_files: Option, + pub cachedir_del_files: Option, + pub cachedir_del_dirs: Option, + pub cachedir_superfluous_files: Option, + pub cachedir_del_superfluous_files: Option, +} + +/// A validated Route Origin Authorization (ROA) entry from rpki-client. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub(crate) struct RpkiClientRoaEntry { + pub prefix: String, + #[serde(rename = "maxLength")] + pub max_length: u8, + #[serde(deserialize_with = "deserialize_asn")] + pub asn: u32, + pub ta: String, + #[serde(default, deserialize_with = "deserialize_expires")] + pub expires: u64, +} + +/// A validated AS Provider Authorization (ASPA) entry from rpki-client. +/// +/// Handles both Cloudflare format (customer_asid as number, providers as numbers) +/// and RIPE format (customer as string "AS123", providers as strings ["AS456"]). 
+#[derive(Clone, Debug, Serialize, Deserialize)] +pub(crate) struct RpkiClientAspaEntry { + /// Customer ASN - Cloudflare uses "customer_asid", RIPE uses "customer" + #[serde(alias = "customer", deserialize_with = "deserialize_asn")] + pub customer_asid: u32, + /// Expiry timestamp - may be missing in RIPE format + #[serde(default)] + pub expires: i64, + /// Provider ASNs - can be numbers or strings like "AS123" + #[serde(deserialize_with = "deserialize_providers")] + pub providers: Vec, +} + +/// A validated BGPsec router key entry from rpki-client. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub(crate) struct RpkiClientBgpsecKeyEntry { + pub asn: u32, + pub ski: String, + pub pubkey: String, + pub ta: String, + pub expires: i64, +} + +impl RpkiClientData { + /// Load rpki-client data from a URL. + /// + /// This uses oneio to handle remote URLs and compression (xz, gz, etc.), + /// then parses the JSON with our custom deserializers. + pub fn from_url(url: &str) -> crate::Result { + let reader = oneio::get_reader(url)?; + let data: RpkiClientData = serde_json::from_reader(reader)?; + Ok(data) + } + + /// Load rpki-client data from a JSON string. 
+ pub fn from_json(json: &str) -> crate::Result { + let data: RpkiClientData = serde_json::from_str(json)?; + Ok(data) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_deserialize_empty() { + let json = r#"{}"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert!(data.roas.is_empty()); + assert!(data.aspas.is_empty()); + assert!(data.bgpsec_keys.is_empty()); + } + + #[test] + fn test_deserialize_roa_numeric_asn() { + let json = r#"{ + "roas": [ + { + "prefix": "192.0.2.0/24", + "maxLength": 24, + "asn": 64496, + "ta": "apnic", + "expires": 1704067200 + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.roas.len(), 1); + assert_eq!(data.roas[0].prefix, "192.0.2.0/24"); + assert_eq!(data.roas[0].max_length, 24); + assert_eq!(data.roas[0].asn, 64496); + assert_eq!(data.roas[0].ta, "apnic"); + } + + #[test] + fn test_deserialize_roa_string_asn() { + // RIPE format uses "AS12345" string format + let json = r#"{ + "roas": [ + { + "prefix": "1.178.112.0/20", + "maxLength": 24, + "asn": "AS12975", + "ta": "ripencc" + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.roas.len(), 1); + assert_eq!(data.roas[0].prefix, "1.178.112.0/20"); + assert_eq!(data.roas[0].max_length, 24); + assert_eq!(data.roas[0].asn, 12975); + assert_eq!(data.roas[0].ta, "ripencc"); + } + + #[test] + fn test_deserialize_roa_lowercase_asn() { + let json = r#"{ + "roas": [ + { + "prefix": "10.0.0.0/8", + "maxLength": 8, + "asn": "as64496", + "ta": "arin" + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.roas[0].asn, 64496); + } + + #[test] + fn test_deserialize_aspa() { + let json = r#"{ + "aspas": [ + { + "customer_asid": 64496, + "expires": 1704067200, + "providers": [64497, 64498] + } + ] + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.aspas.len(), 1); + 
assert_eq!(data.aspas[0].customer_asid, 64496); + assert_eq!(data.aspas[0].providers, vec![64497, 64498]); + } + + #[test] + fn test_deserialize_ripe_metadata() { + let json = r#"{ + "metadata": { + "generated": 1717215759, + "generatedTime": "2024-06-01T04:22:39Z" + } + }"#; + let data: RpkiClientData = serde_json::from_str(json).unwrap(); + assert_eq!(data.metadata.generated, Some(1717215759)); + assert_eq!( + data.metadata.generated_time, + Some("2024-06-01T04:22:39Z".to_string()) + ); + } +} diff --git a/src/rpki/rpkiviews.rs b/src/rpki/rpkiviews.rs new file mode 100644 index 0000000..96d8df2 --- /dev/null +++ b/src/rpki/rpkiviews.rs @@ -0,0 +1,667 @@ +//! Load RPKI data from RPKIviews collectors. +//! +//! RPKIviews provides historical RPKI data from multiple collectors around the world. +//! See: + +use crate::Result; +use crate::rpki::rpki_client::RpkiClientData; +use crate::rpki::{RpkiFile, RpkiTrie}; +use chrono::{DateTime, Datelike, NaiveDate}; +use serde::{Deserialize, Serialize}; +use std::io::{Read, Write}; +use std::process::{Command, Stdio}; +use std::str::FromStr; +use tracing::info; + +/// Available RPKIviews collectors. 
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub enum RpkiViewsCollector { + /// josephine.sobornost.net - A2B Internet (AS51088), Amsterdam, Netherlands + Josephine, + /// amber.massars.net - Massar (AS57777), Lugano, Switzerland + Amber, + /// dango.attn.jp - Internet Initiative Japan (AS2497), Tokyo, Japan + Dango, + /// rpkiviews.kerfuffle.net - Kerfuffle, LLC (AS35008), Fremont, California, United States + #[default] + Kerfuffle, +} + +impl RpkiViewsCollector { + /// Get the HTTPS base URL for this collector + pub fn base_url(&self) -> &'static str { + match self { + RpkiViewsCollector::Josephine => "https://josephine.sobornost.net/rpkidata", + RpkiViewsCollector::Amber => "https://amber.massars.net/rpkidata", + RpkiViewsCollector::Dango => "https://dango.attn.jp/rpkidata", + RpkiViewsCollector::Kerfuffle => "https://rpkiviews.kerfuffle.net/rpkidata", + } + } + + /// Get the index.txt URL for this collector + pub fn index_url(&self) -> String { + format!("{}/index.txt", self.base_url()) + } + + /// Get all available collectors + pub fn all() -> Vec { + vec![ + RpkiViewsCollector::Josephine, + RpkiViewsCollector::Amber, + RpkiViewsCollector::Dango, + RpkiViewsCollector::Kerfuffle, + ] + } +} + +impl std::fmt::Display for RpkiViewsCollector { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RpkiViewsCollector::Josephine => write!(f, "josephine"), + RpkiViewsCollector::Amber => write!(f, "amber"), + RpkiViewsCollector::Dango => write!(f, "dango"), + RpkiViewsCollector::Kerfuffle => write!(f, "kerfuffle"), + } + } +} + +impl FromStr for RpkiViewsCollector { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "josephine" | "josephine.sobornost.net" => Ok(RpkiViewsCollector::Josephine), + "amber" | "amber.massars.net" => Ok(RpkiViewsCollector::Amber), + "dango" | "dango.attn.jp" => Ok(RpkiViewsCollector::Dango), + "kerfuffle" | 
"rpkiviews.kerfuffle.net" => Ok(RpkiViewsCollector::Kerfuffle), + _ => Err(format!("unknown RPKIviews collector: {}", s)), + } + } +} + +/// List available RPKIviews files for a given date from a specific collector. +/// +/// This function reads the index.txt file from the collector to find available +/// archives for the specified date. This is a fast operation as it only downloads +/// a small index file. +pub fn list_rpkiviews_files( + collector: RpkiViewsCollector, + date: NaiveDate, +) -> Result> { + let index_url = collector.index_url(); + let base_url = collector.base_url(); + + // Format the date path prefix we're looking for (e.g., "2024/01/04/") + let date_prefix = format!("{:04}/{:02}/{:02}/", date.year(), date.month(), date.day()); + + let mut files = vec![]; + + for line in oneio::read_lines(&index_url)? { + let line = line?; + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() < 3 { + continue; + } + + let path = parts[0]; + let timestamp_secs: i64 = match parts[1].parse() { + Ok(t) => t, + Err(_) => continue, + }; + let size: u64 = match parts[2].parse() { + Ok(s) => s, + Err(_) => continue, + }; + + // Check if this file is for the requested date and is a .tgz file + if path.starts_with(&date_prefix) && path.ends_with(".tgz") && path.contains("/rpki-") { + let url = format!("{}/{}", base_url, path); + let timestamp = DateTime::from_timestamp(timestamp_secs, 0) + .unwrap_or_else(|| DateTime::from_timestamp(0, 0).unwrap()); + + files.push(RpkiFile { + url, + timestamp, + size: Some(size), + rir: None, + collector: Some(collector), + }); + } + } + + // Sort by timestamp (oldest first) + files.sort_by_key(|f| f.timestamp); + + Ok(files) +} + +/// Information about a file entry found within a .tgz archive. 
+#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct TgzFileEntry { + /// Path of the file within the archive + pub path: String, + /// Size of the file in bytes + pub size: u64, +} + +/// List files within a remote .tgz archive by streaming only the tar headers. +#[allow(dead_code)] +/// +/// This function streams the .tgz archive and reads tar headers to enumerate +/// the files contained within. It skips reading the actual file content, which +/// makes it much faster than downloading the entire archive. +/// +/// **Important**: Due to the nature of gzip compression, we still need to decompress +/// the data sequentially, but we skip over file content rather than buffering it. +/// If `max_entries` is provided, we stop early after finding that many entries. +/// +/// # Arguments +/// * `url` - URL of the .tgz file +/// * `max_entries` - Optional maximum number of entries to return (for early termination) +pub fn list_files_in_tgz(url: &str, max_entries: Option) -> Result> { + info!("listing files in tgz archive: {}", url); + + // Spawn gunzip process + let mut gunzip = Command::new("gunzip") + .arg("-c") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to spawn gunzip: {}", e), + ) + })?; + + let mut gunzip_stdin = gunzip.stdin.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdin") + })?; + + let gunzip_stdout = gunzip.stdout.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdout") + })?; + + // Flag to signal early termination to writer thread + let should_stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let should_stop_writer = should_stop.clone(); + + // Stream the .tgz file using reqwest in a separate thread + let url_owned = url.to_string(); + let writer_thread = 
std::thread::spawn(move || -> Result<()> { + let response = reqwest::blocking::get(&url_owned).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to fetch {}: {}", url_owned, e), + ) + })?; + + if !response.status().is_success() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("HTTP error {} for {}", response.status(), url_owned), + )); + } + + let mut reader = response; + let mut buffer = [0u8; 65536]; + loop { + // Check if we should stop early + if should_stop_writer.load(std::sync::atomic::Ordering::Relaxed) { + break; + } + + let n = match reader.read(&mut buffer) { + Ok(0) => break, + Ok(n) => n, + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(_) => break, // Connection closed or error, stop gracefully + }; + + if gunzip_stdin.write_all(&buffer[..n]).is_err() { + // Pipe closed (reader side done), stop gracefully + break; + } + } + drop(gunzip_stdin); + Ok(()) + }); + + // Read tar entries from gunzip stdout - only reading headers, skipping content + let mut archive = tar::Archive::new(gunzip_stdout); + let mut entries_list = Vec::new(); + + let entries_iter = archive.entries().map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to read tar entries: {}", e), + ) + })?; + + for entry_result in entries_iter { + let entry = match entry_result { + Ok(e) => e, + Err(_) => continue, + }; + + let path = match entry.path() { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => continue, + }; + + let size = entry.size(); + + // Skip directories (they have size 0 and path ends with /) + if !path.ends_with('/') { + entries_list.push(TgzFileEntry { path, size }); + } + + // Check if we've reached max_entries + if let Some(max) = max_entries { + if entries_list.len() >= max { + // Signal writer to stop and break out + should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + break; + } + } + } + + // Signal writer 
to stop (in case we finished iterating) + should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + + // Don't wait for writer thread - it will terminate when pipe closes + // Just detach it + drop(writer_thread); + + // Kill gunzip process to clean up + let _ = gunzip.kill(); + let _ = gunzip.wait(); + + Ok(entries_list) +} + +/// Check if a .tgz archive contains a specific file path. +#[allow(dead_code)] +/// +/// This is an optimized function that stops as soon as it finds the target file, +/// avoiding the need to decompress the entire archive. +pub fn tgz_contains_file(url: &str, target_path: &str) -> Result { + info!( + "checking if tgz archive contains file: {} in {}", + target_path, url + ); + + let mut gunzip = Command::new("gunzip") + .arg("-c") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to spawn gunzip: {}", e), + ) + })?; + + let mut gunzip_stdin = gunzip.stdin.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdin") + })?; + + let gunzip_stdout = gunzip.stdout.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdout") + })?; + + let should_stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let should_stop_writer = should_stop.clone(); + + let url_owned = url.to_string(); + let writer_thread = std::thread::spawn(move || -> Result<()> { + let response = reqwest::blocking::get(&url_owned).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to fetch {}: {}", url_owned, e), + ) + })?; + + if !response.status().is_success() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("HTTP error {} for {}", response.status(), url_owned), + )); + } + + let mut reader = response; + let mut buffer = [0u8; 65536]; + 
loop { + if should_stop_writer.load(std::sync::atomic::Ordering::Relaxed) { + break; + } + + let n = match reader.read(&mut buffer) { + Ok(0) => break, + Ok(n) => n, + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(_) => break, + }; + + if gunzip_stdin.write_all(&buffer[..n]).is_err() { + break; + } + } + drop(gunzip_stdin); + Ok(()) + }); + + let mut archive = tar::Archive::new(gunzip_stdout); + let mut found = false; + + if let Ok(entries_iter) = archive.entries() { + for entry_result in entries_iter { + let entry = match entry_result { + Ok(e) => e, + Err(_) => continue, + }; + + let path = match entry.path() { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => continue, + }; + + if path.ends_with(target_path) || path == target_path { + found = true; + should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + break; + } + } + } + + should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + drop(writer_thread); + let _ = gunzip.kill(); + let _ = gunzip.wait(); + + Ok(found) +} + +/// Extract a specific file from a remote .tgz archive. +/// +/// This function streams the .tgz archive and extracts only the target file, +/// stopping as soon as the file is found and read. This is much faster than +/// downloading and extracting the entire archive. +/// +/// # Arguments +/// * `url` - URL of the .tgz file +/// * `target_path` - Path of the file to extract within the archive (e.g., "output/rpki-client.json") +/// +/// # Returns +/// The content of the target file as a String, or an error if the file is not found. 
+pub fn extract_file_from_tgz(url: &str, target_path: &str) -> Result { + info!("extracting {} from tgz archive: {}", target_path, url); + + let mut gunzip = Command::new("gunzip") + .arg("-c") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to spawn gunzip: {}", e), + ) + })?; + + let mut gunzip_stdin = gunzip.stdin.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdin") + })?; + + let gunzip_stdout = gunzip.stdout.take().ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error("RPKIviews", "Failed to get gunzip stdout") + })?; + + let should_stop = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let should_stop_writer = should_stop.clone(); + + let url_owned = url.to_string(); + let writer_thread = std::thread::spawn(move || -> Result<()> { + let response = reqwest::blocking::get(&url_owned).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to fetch {}: {}", url_owned, e), + ) + })?; + + if !response.status().is_success() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("HTTP error {} for {}", response.status(), url_owned), + )); + } + + let mut reader = response; + let mut buffer = [0u8; 65536]; + loop { + if should_stop_writer.load(std::sync::atomic::Ordering::Relaxed) { + break; + } + + let n = match reader.read(&mut buffer) { + Ok(0) => break, + Ok(n) => n, + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(_) => break, + }; + + if gunzip_stdin.write_all(&buffer[..n]).is_err() { + break; + } + } + drop(gunzip_stdin); + Ok(()) + }); + + let mut archive = tar::Archive::new(gunzip_stdout); + let mut content: Option = None; + + let entries_iter = archive.entries().map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + 
format!("Failed to read tar entries: {}", e), + ) + })?; + + for entry_result in entries_iter { + let mut entry = match entry_result { + Ok(e) => e, + Err(_) => continue, + }; + + let path = match entry.path() { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => continue, + }; + + // Check if this is the target file + if path.ends_with(target_path) || path == target_path { + let mut file_content = String::new(); + entry.read_to_string(&mut file_content).map_err(|e| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("Failed to read {}: {}", target_path, e), + ) + })?; + content = Some(file_content); + should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + break; + } + // Note: if this is not our target file, the tar iterator automatically + // skips past the file content when we move to the next entry, + // so we don't buffer unnecessary data + } + + should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + drop(writer_thread); + let _ = gunzip.kill(); + let _ = gunzip.wait(); + + content.ok_or_else(|| { + crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!("{} not found in archive: {}", target_path, url), + ) + }) +} + +/// Stream and extract rpki-client.json from a .tgz URL. +/// +/// This is a convenience function that extracts the rpki-client.json file +/// from an RPKIviews archive. It stops streaming as soon as the file is found. +fn stream_tgz_and_extract_json(url: &str) -> Result { + let json_str = extract_file_from_tgz(url, "output/rpki-client.json")?; + RpkiClientData::from_json(&json_str) +} + +impl RpkiTrie { + /// Load RPKI data from RPKIviews for a specific date. + /// + /// This will use the first (earliest) available file for the given date from the specified collector. + /// By default, uses the Kerfuffle collector. 
+ pub fn from_rpkiviews(collector: RpkiViewsCollector, date: NaiveDate) -> Result { + let files = list_rpkiviews_files(collector, date)?; + + if files.is_empty() { + return Err(crate::BgpkitCommonsError::data_source_error( + "RPKIviews", + format!( + "No RPKIviews files found for date {} from collector {}", + date, collector + ), + )); + } + + // Use the first (earliest) file for the date + let first_file = files.first().unwrap(); + info!( + "Using RPKIviews file from {} (timestamp: {})", + collector, first_file.timestamp + ); + + Self::from_rpkiviews_file(&first_file.url, Some(date)) + } + + /// Load RPKI data from a specific RPKIviews .tgz file URL. + pub fn from_rpkiviews_file(url: &str, date: Option) -> Result { + let data = stream_tgz_and_extract_json(url)?; + Self::from_rpki_client_data(data, date) + } + + /// Load RPKI data from multiple RPKIviews file URLs. + /// + /// This allows loading and merging data from multiple files into a single trie. + pub fn from_rpkiviews_files(urls: &[String], date: Option) -> Result { + let mut trie = RpkiTrie::new(date); + + for url in urls { + let data = stream_tgz_and_extract_json(url)?; + trie.merge_rpki_client_data(data); + } + + Ok(trie) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_collector_urls() { + assert_eq!( + RpkiViewsCollector::Josephine.base_url(), + "https://josephine.sobornost.net/rpkidata" + ); + assert_eq!( + RpkiViewsCollector::Kerfuffle.index_url(), + "https://rpkiviews.kerfuffle.net/rpkidata/index.txt" + ); + } + + #[test] + fn test_collector_from_str() { + assert_eq!( + RpkiViewsCollector::from_str("josephine").unwrap(), + RpkiViewsCollector::Josephine + ); + assert_eq!( + RpkiViewsCollector::from_str("amber.massars.net").unwrap(), + RpkiViewsCollector::Amber + ); + } + + #[test] + fn test_default_collector() { + assert_eq!(RpkiViewsCollector::default(), RpkiViewsCollector::Josephine); + } + + #[test] + #[ignore] // Requires network access + fn test_list_rpkiviews_files() { 
+ let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::Kerfuffle, date).unwrap(); + println!("Found {} files for {}", files.len(), date); + for file in &files { + println!(" {} ({} bytes)", file.url, file.size.unwrap_or(0)); + } + assert!(!files.is_empty()); + } + + #[test] + #[ignore] // Requires network access - streams partial archive + fn test_list_files_in_tgz() { + // List first 10 files in a remote tgz to verify streaming works + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::Kerfuffle, date).unwrap(); + assert!(!files.is_empty()); + + let tgz_url = &files[0].url; + println!("Listing files in: {}", tgz_url); + + // Only get first 10 entries to test early termination + let entries = list_files_in_tgz(tgz_url, Some(10)).unwrap(); + println!("Found {} entries (limited to 10):", entries.len()); + for entry in &entries { + println!(" {} ({} bytes)", entry.path, entry.size); + } + assert!(!entries.is_empty()); + assert!(entries.len() <= 10); + } + + #[test] + #[ignore] // Requires network access - streams partial archive + fn test_tgz_contains_file() { + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::Kerfuffle, date).unwrap(); + assert!(!files.is_empty()); + + let tgz_url = &files[0].url; + println!("Checking for rpki-client.json in: {}", tgz_url); + + let contains = tgz_contains_file(tgz_url, "output/rpki-client.json").unwrap(); + assert!(contains, "Archive should contain output/rpki-client.json"); + println!("Found rpki-client.json!"); + } + + #[test] + #[ignore] // Requires network access and takes time to download + fn test_from_rpkiviews() { + let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); + let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::Kerfuffle, date).unwrap(); + + let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); + 
println!("Loaded {} ROAs from RPKIviews", total_roas); + assert!(total_roas > 0); + } +} From c18f90598be91b708692d17b84f42196345a920f Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 18:38:10 -0800 Subject: [PATCH 4/7] update RPKIviews historical data support Rename RPKIviews collectors to SoborostNet/MassarsNet/AttnJp/KerfuffleNet. Add streaming tgz extraction, unified rpki-client JSON parsing, public Roa and Aspa types, historical loading/listing APIs, and update examples and CHANGELOG --- CHANGELOG.md | 48 +++++++++++++++++++++++++++ examples/list_aspas.rs | 27 ++++++++------- examples/rpki_historical.rs | 4 +-- src/lib.rs | 6 ++-- src/rpki/rpkiviews.rs | 65 +++++++++++++++++++------------------ 5 files changed, 100 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0602ac..92f849b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,54 @@ All notable changes to this project will be documented in this file. ## Unreleased +### RPKIviews Historical Data Support + +* **Added RPKIviews as a historical RPKI data source**: Users can now load historical RPKI data from RPKIviews collectors in addition to RIPE NCC archives + - New `RpkiViewsCollector` enum with four collectors: SoborostNet (default), MassarsNet, AttnJp, and KerfuffleNet + - Added `RpkiTrie::from_rpkiviews(collector, date)` method for loading from a specific collector + - Added `RpkiTrie::from_rpkiviews_file(url, date)` and `from_rpkiviews_files(urls, date)` for loading from specific archive URLs + - Added `list_rpkiviews_files(collector, date)` function to discover available archives for a given date + - New `HistoricalRpkiSource` enum to explicitly select between RIPE and RPKIviews sources + +* **Streaming optimization for .tgz archives**: RPKIviews archives are streamed efficiently without downloading the entire file + - `rpki-client.json` is located at position 3-4 in the archive, allowing early termination after ~80MB instead of downloading 300+ MB + 
- New `extract_file_from_tgz(url, target_path)` function for streaming extraction of specific files + - New `list_files_in_tgz(url, max_entries)` function for listing archive contents with early termination + - New `tgz_contains_file(url, target_path)` function for checking file existence + - Uses `reqwest` for HTTP streaming and external `gunzip` for decompression + - Test completion time reduced from several minutes to ~8 seconds + +* **Unified rpki-client JSON parsing**: Extracted shared parsing logic for rpki-client JSON format + - New internal `rpki_client.rs` module with `RpkiClientData` struct and robust deserializers + - Handles variations in ASN formats (numeric `12345` vs string `"AS12345"`) + - Handles variations in ASPA field names (`customer_asid` vs `customer`) + - Handles provider arrays as both numbers and strings + - Used by Cloudflare, RIPE historical, and RPKIviews sources + +* **Public ROA and ASPA structs**: Added stable public API types + - New `Roa` struct with fields: `prefix`, `asn`, `max_length`, `not_before`, `not_after` + - New `Aspa` struct with fields: `customer_asn`, `providers` + - Internal rpki-client format structs are now `pub(crate)` only + +* **Updated RIPE historical to use JSON format**: Changed from CSV to `output.json.xz` for consistency + - Requires `xz` feature in oneio (now enabled by default for rpki feature) + - Provides richer data including expiry timestamps + +* **New BgpkitCommons methods**: + - `load_rpki_historical(source, date)` - Load historical RPKI data from specified source + - `list_rpki_files(source, date)` - List available RPKI files for a date from specified source + - `load_rpki_from_files(urls, date)` - Load and merge RPKI data from multiple file URLs + +* **New example**: Added `examples/rpki_historical.rs` demonstrating historical RPKI data loading + +* **Updated example**: `examples/list_aspas.rs` now counts ASPA objects for first day of years 2020-2025 + +### Dependencies + +* Added `reqwest` (with 
blocking feature) for HTTP streaming +* Added `tar` crate for reading tar archives +* Enabled `xz` feature in `oneio` for RIPE historical JSON support + ### Crate Consolidation * **Migrated `as2org-rs` into bgpkit-commons**: The CAIDA AS-to-Organization mapping functionality previously provided by the external `as2org-rs` crate has been fully integrated into the `asinfo` module diff --git a/examples/list_aspas.rs b/examples/list_aspas.rs index 854c54a..f50f3b5 100644 --- a/examples/list_aspas.rs +++ b/examples/list_aspas.rs @@ -5,22 +5,22 @@ fn main() { println!("Counting ASPA objects on the first day of each year (2020-2025)"); println!("{}", "=".repeat(60)); - for year in 2023..=2025 { + for year in 2020..=2025 { let date = NaiveDate::from_ymd_opt(year, 1, 1).unwrap(); - // // Try RIPE historical first - // match RpkiTrie::from_ripe_historical(date) { - // Ok(trie) => { - // println!("{}-01-01: {} ASPAs (from RIPE)", year, trie.aspas.len()); - // continue; - // } - // Err(_) => { - // // RIPE failed, try RPKIviews - // } - // } + // Try RIPE historical first + match RpkiTrie::from_ripe_historical(date) { + Ok(trie) => { + println!("{}-01-01: {} ASPAs (from RIPE)", year, trie.aspas.len()); + continue; + } + Err(_) => { + // RIPE failed, try RPKIviews + } + } // Fallback to RPKIviews - match RpkiTrie::from_rpkiviews(RpkiViewsCollector::Dango, date) { + match RpkiTrie::from_rpkiviews(RpkiViewsCollector::default(), date) { Ok(trie) => { println!( "{}-01-01: {} ASPAs (from RPKIviews)", @@ -28,8 +28,7 @@ fn main() { trie.aspas.len() ); } - Err(e) => { - eprintln!("{e}"); + Err(_) => { println!("{}-01-01: No data available", year); } } diff --git a/examples/rpki_historical.rs b/examples/rpki_historical.rs index a2bc7bd..e6d640e 100644 --- a/examples/rpki_historical.rs +++ b/examples/rpki_historical.rs @@ -34,8 +34,8 @@ fn main() { println!(); // List files from RPKIviews (multiple snapshots per day) - println!("RPKIviews files (Kerfuffle collector):"); - let source = 
HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::Kerfuffle); + println!("RPKIviews files (KerfuffleNet collector):"); + let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::KerfuffleNet); match commons.list_rpki_files(date, source) { Ok(files) => { println!(" Found {} files for {}", files.len(), date); diff --git a/src/lib.rs b/src/lib.rs index 06ca4c1..f090ac2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -317,7 +317,7 @@ impl BgpkitCommons { /// commons.load_rpki_historical(date, HistoricalRpkiSource::Ripe).unwrap(); /// /// // Or load from RPKIviews - /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::Kerfuffle); + /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::KerfuffleNet); /// commons.load_rpki_historical(date, source).unwrap(); /// ``` #[cfg(feature = "rpki")] @@ -359,7 +359,7 @@ impl BgpkitCommons { /// "https://example.com/rpki-20240104T144128Z.tgz".to_string(), /// ]; /// commons.load_rpki_from_files(&urls, HistoricalRpkiSource::RpkiViews( - /// bgpkit_commons::rpki::RpkiViewsCollector::Kerfuffle + /// bgpkit_commons::rpki::RpkiViewsCollector::KerfuffleNet /// ), None).unwrap(); /// ``` #[cfg(feature = "rpki")] @@ -396,7 +396,7 @@ impl BgpkitCommons { /// let ripe_files = commons.list_rpki_files(date, HistoricalRpkiSource::Ripe).unwrap(); /// /// // List files from RPKIviews - /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::Kerfuffle); + /// let source = HistoricalRpkiSource::RpkiViews(RpkiViewsCollector::KerfuffleNet); /// let rpkiviews_files = commons.list_rpki_files(date, source).unwrap(); /// ``` #[cfg(feature = "rpki")] diff --git a/src/rpki/rpkiviews.rs b/src/rpki/rpkiviews.rs index 96d8df2..8af8463 100644 --- a/src/rpki/rpkiviews.rs +++ b/src/rpki/rpkiviews.rs @@ -17,24 +17,24 @@ use tracing::info; #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] pub enum RpkiViewsCollector { /// josephine.sobornost.net - A2B Internet (AS51088), Amsterdam, 
Netherlands - Josephine, + #[default] + SobornostNet, /// amber.massars.net - Massar (AS57777), Lugano, Switzerland - Amber, + MassarsNet, /// dango.attn.jp - Internet Initiative Japan (AS2497), Tokyo, Japan - Dango, + AttnJp, /// rpkiviews.kerfuffle.net - Kerfuffle, LLC (AS35008), Fremont, California, United States - #[default] - Kerfuffle, + KerfuffleNet, } impl RpkiViewsCollector { /// Get the HTTPS base URL for this collector pub fn base_url(&self) -> &'static str { match self { - RpkiViewsCollector::Josephine => "https://josephine.sobornost.net/rpkidata", - RpkiViewsCollector::Amber => "https://amber.massars.net/rpkidata", - RpkiViewsCollector::Dango => "https://dango.attn.jp/rpkidata", - RpkiViewsCollector::Kerfuffle => "https://rpkiviews.kerfuffle.net/rpkidata", + RpkiViewsCollector::SobornostNet => "https://josephine.sobornost.net/rpkidata", + RpkiViewsCollector::MassarsNet => "https://amber.massars.net/rpkidata", + RpkiViewsCollector::AttnJp => "https://dango.attn.jp/rpkidata", + RpkiViewsCollector::KerfuffleNet => "https://rpkiviews.kerfuffle.net/rpkidata", } } @@ -46,10 +46,10 @@ impl RpkiViewsCollector { /// Get all available collectors pub fn all() -> Vec { vec![ - RpkiViewsCollector::Josephine, - RpkiViewsCollector::Amber, - RpkiViewsCollector::Dango, - RpkiViewsCollector::Kerfuffle, + RpkiViewsCollector::SobornostNet, + RpkiViewsCollector::MassarsNet, + RpkiViewsCollector::AttnJp, + RpkiViewsCollector::KerfuffleNet, ] } } @@ -57,10 +57,10 @@ impl RpkiViewsCollector { impl std::fmt::Display for RpkiViewsCollector { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - RpkiViewsCollector::Josephine => write!(f, "josephine"), - RpkiViewsCollector::Amber => write!(f, "amber"), - RpkiViewsCollector::Dango => write!(f, "dango"), - RpkiViewsCollector::Kerfuffle => write!(f, "kerfuffle"), + RpkiViewsCollector::SobornostNet => write!(f, "sobornost.net"), + RpkiViewsCollector::MassarsNet => write!(f, "massars.net"), + 
RpkiViewsCollector::AttnJp => write!(f, "attn.jp"), + RpkiViewsCollector::KerfuffleNet => write!(f, "kerfuffle.net"), } } } @@ -70,10 +70,10 @@ impl FromStr for RpkiViewsCollector { fn from_str(s: &str) -> std::result::Result { match s.to_lowercase().as_str() { - "josephine" | "josephine.sobornost.net" => Ok(RpkiViewsCollector::Josephine), - "amber" | "amber.massars.net" => Ok(RpkiViewsCollector::Amber), - "dango" | "dango.attn.jp" => Ok(RpkiViewsCollector::Dango), - "kerfuffle" | "rpkiviews.kerfuffle.net" => Ok(RpkiViewsCollector::Kerfuffle), + "sobornost.net" | "josephine.sobornost.net" => Ok(RpkiViewsCollector::SobornostNet), + "massars.net" | "amber.massars.net" => Ok(RpkiViewsCollector::MassarsNet), + "attn.jp" | "dango.attn.jp" => Ok(RpkiViewsCollector::AttnJp), + "kerfuffle.net" | "rpkiviews.kerfuffle.net" => Ok(RpkiViewsCollector::KerfuffleNet), _ => Err(format!("unknown RPKIviews collector: {}", s)), } } @@ -580,11 +580,11 @@ mod tests { #[test] fn test_collector_urls() { assert_eq!( - RpkiViewsCollector::Josephine.base_url(), + RpkiViewsCollector::SobornostNet.base_url(), "https://josephine.sobornost.net/rpkidata" ); assert_eq!( - RpkiViewsCollector::Kerfuffle.index_url(), + RpkiViewsCollector::KerfuffleNet.index_url(), "https://rpkiviews.kerfuffle.net/rpkidata/index.txt" ); } @@ -592,25 +592,28 @@ mod tests { #[test] fn test_collector_from_str() { assert_eq!( - RpkiViewsCollector::from_str("josephine").unwrap(), - RpkiViewsCollector::Josephine + RpkiViewsCollector::from_str("sobornost.net").unwrap(), + RpkiViewsCollector::SobornostNet ); assert_eq!( RpkiViewsCollector::from_str("amber.massars.net").unwrap(), - RpkiViewsCollector::Amber + RpkiViewsCollector::MassarsNet ); } #[test] fn test_default_collector() { - assert_eq!(RpkiViewsCollector::default(), RpkiViewsCollector::Josephine); + assert_eq!( + RpkiViewsCollector::default(), + RpkiViewsCollector::SobornostNet + ); } #[test] #[ignore] // Requires network access fn test_list_rpkiviews_files() { let date
= NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); - let files = list_rpkiviews_files(RpkiViewsCollector::Kerfuffle, date).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::KerfuffleNet, date).unwrap(); println!("Found {} files for {}", files.len(), date); for file in &files { println!(" {} ({} bytes)", file.url, file.size.unwrap_or(0)); @@ -623,7 +626,7 @@ mod tests { fn test_list_files_in_tgz() { // List first 10 files in a remote tgz to verify streaming works let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); - let files = list_rpkiviews_files(RpkiViewsCollector::Kerfuffle, date).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::KerfuffleNet, date).unwrap(); assert!(!files.is_empty()); let tgz_url = &files[0].url; @@ -643,7 +646,7 @@ mod tests { #[ignore] // Requires network access - streams partial archive fn test_tgz_contains_file() { let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); - let files = list_rpkiviews_files(RpkiViewsCollector::Kerfuffle, date).unwrap(); + let files = list_rpkiviews_files(RpkiViewsCollector::KerfuffleNet, date).unwrap(); assert!(!files.is_empty()); let tgz_url = &files[0].url; @@ -658,7 +661,7 @@ mod tests { #[ignore] // Requires network access and takes time to download fn test_from_rpkiviews() { let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); - let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::Kerfuffle, date).unwrap(); + let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::KerfuffleNet, date).unwrap(); let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); println!("Loaded {} ROAs from RPKIviews", total_roas); From aeae12551eba81bc7e41f8860601290f127ef1ef Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 20:29:53 -0800 Subject: [PATCH 5/7] Guard examples and tests with feature cfgs --- src/lib.rs | 15 +++++++++++++++ tests/asinfo_integration.rs | 2 ++ tests/mrt_collectors.rs | 2 ++ 3 files changed, 19 insertions(+) diff --git 
a/src/lib.rs b/src/lib.rs index f090ac2..c9b2960 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -57,6 +57,8 @@ //! 3. Access the data using the corresponding `xxx_yyy()` methods //! //! ```rust +//! # #[cfg(feature = "bogons")] +//! # fn main() { //! use bgpkit_commons::BgpkitCommons; //! //! let mut commons = BgpkitCommons::new(); @@ -68,11 +70,16 @@ //! if let Ok(is_bogon) = commons.bogons_match("23456") { //! println!("ASN 23456 is a bogon: {}", is_bogon); //! } +//! # } +//! # #[cfg(not(feature = "bogons"))] +//! # fn main() {} //! ``` //! //! ### Working with Multiple Modules //! //! ```rust +//! # #[cfg(all(feature = "asinfo", feature = "countries"))] +//! # fn main() { //! use bgpkit_commons::BgpkitCommons; //! //! let mut commons = BgpkitCommons::new(); @@ -85,6 +92,9 @@ //! if let Ok(Some(asinfo)) = commons.asinfo_get(13335) { //! println!("AS13335: {} ({})", asinfo.name, asinfo.country); //! } +//! # } +//! # #[cfg(not(all(feature = "asinfo", feature = "countries")))] +//! # fn main() {} //! ``` //! //! ## Feature Flags @@ -118,12 +128,17 @@ //! all currently loaded data sources: //! //! ```rust +//! # #[cfg(feature = "bogons")] +//! # fn main() { //! # use bgpkit_commons::BgpkitCommons; //! let mut commons = BgpkitCommons::new(); //! commons.load_bogons().unwrap(); //! //! // Later, reload all loaded data //! commons.reload().unwrap(); +//! # } +//! # #[cfg(not(feature = "bogons"))] +//! # fn main() {} //! ``` #![doc( diff --git a/tests/asinfo_integration.rs b/tests/asinfo_integration.rs index e09f30d..b17d3d1 100644 --- a/tests/asinfo_integration.rs +++ b/tests/asinfo_integration.rs @@ -1,3 +1,5 @@ +#![cfg(feature = "asinfo")] + /// Integration test for basic AS information retrieval. 
#[test] fn test_basic_info() { diff --git a/tests/mrt_collectors.rs b/tests/mrt_collectors.rs index 6ecb8e0..fbd0b96 100644 --- a/tests/mrt_collectors.rs +++ b/tests/mrt_collectors.rs @@ -1,3 +1,5 @@ +#![cfg(feature = "mrt_collectors")] + #[test] fn test_get_collectors() { let mut commons = bgpkit_commons::BgpkitCommons::new(); From 16aeedee892294458584fb1e8d504fb6d7440af3 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 20:37:16 -0800 Subject: [PATCH 6/7] Add serde_json to rpki feature --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c5f8d3c..c6c13ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ as2rel = ["oneio", "serde_json", "tracing"] bogons = ["oneio", "ipnet", "regex", "chrono"] countries = ["oneio"] mrt_collectors = ["oneio", "chrono"] -rpki = ["oneio", "ipnet", "ipnet-trie", "chrono", "tracing", "reqwest", "tar"] +rpki = ["oneio", "ipnet", "ipnet-trie", "chrono", "tracing", "reqwest", "tar", "serde_json"] # Convenience feature to enable all modules all = ["asinfo", "as2rel", "bogons", "countries", "mrt_collectors", "rpki"] From b85569311059bde1d3b0f3789b212fd7fb63f985 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 4 Dec 2025 21:07:37 -0800 Subject: [PATCH 7/7] Harden tgz streaming shutdown and add docs Use SeqCst ordering for should_stop/should_stop_writer, log network read errors instead of silently breaking, and join the writer thread to ensure a clean shutdown. Add module documentation noting the `gunzip` requirement for RPKIviews. Add a deprecation note to the RoaEntry alias, rename/update related tests, and clarify collector usage in the from_rpkiviews docs and test. 
--- src/rpki/mod.rs | 3 ++- src/rpki/rpkiviews.rs | 53 +++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/rpki/mod.rs b/src/rpki/mod.rs index 228d7a9..2659dc3 100644 --- a/src/rpki/mod.rs +++ b/src/rpki/mod.rs @@ -298,6 +298,7 @@ impl Display for RpkiValidation { // ============================================================================ /// Type alias for backwards compatibility. Use [`Roa`] instead. +/// Deprecated since 0.10.0. This alias will be removed in version 0.12.0. #[deprecated(since = "0.10.0", note = "Use Roa instead")] pub type RoaEntry = Roa; @@ -685,7 +686,7 @@ mod tests { } #[test] - fn test_validate_check_expiry() { + fn test_validate_check_expiry_with_time_constraints() { let mut trie = RpkiTrie::new(None); // Time references diff --git a/src/rpki/rpkiviews.rs b/src/rpki/rpkiviews.rs index 8af8463..7cf1fb1 100644 --- a/src/rpki/rpkiviews.rs +++ b/src/rpki/rpkiviews.rs @@ -2,6 +2,13 @@ //! //! RPKIviews provides historical RPKI data from multiple collectors around the world. //! See: +//! +//! ## System Requirements +//! +//! This module requires the `gunzip` command to be available in the system PATH +//! for decompressing `.tgz` archives. On most Unix-like systems, this is provided +//! by the `gzip` package. On macOS, it is available by default. On Windows, +//! you may need to install it via WSL, Cygwin, or similar tools. 
use crate::Result; use crate::rpki::rpki_client::RpkiClientData; @@ -209,7 +216,7 @@ pub fn list_files_in_tgz(url: &str, max_entries: Option) -> Result) -> Result break, Ok(n) => n, Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, - Err(_) => break, // Connection closed or error, stop gracefully + Err(e) => { + tracing::debug!("Network read error during tgz streaming: {}", e); + break; + } }; if gunzip_stdin.write_all(&buffer[..n]).is_err() { @@ -262,18 +272,17 @@ pub fn list_files_in_tgz(url: &str, max_entries: Option) -> Result= max { // Signal writer to stop and break out - should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); break; } } } // Signal writer to stop (in case we finished iterating) - should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); - // Don't wait for writer thread - it will terminate when pipe closes - // Just detach it - drop(writer_thread); + // Wait for writer thread to finish to ensure clean shutdown + let _ = writer_thread.join(); // Kill gunzip process to clean up let _ = gunzip.kill(); @@ -336,7 +345,7 @@ pub fn tgz_contains_file(url: &str, target_path: &str) -> Result { let mut reader = response; let mut buffer = [0u8; 65536]; loop { - if should_stop_writer.load(std::sync::atomic::Ordering::Relaxed) { + if should_stop_writer.load(std::sync::atomic::Ordering::SeqCst) { break; } @@ -344,7 +353,10 @@ pub fn tgz_contains_file(url: &str, target_path: &str) -> Result { Ok(0) => break, Ok(n) => n, Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, - Err(_) => break, + Err(e) => { + tracing::debug!("Network read error during tgz streaming: {}", e); + break; + } }; if gunzip_stdin.write_all(&buffer[..n]).is_err() { @@ -372,14 +384,14 @@ pub fn tgz_contains_file(url: &str, target_path: &str) -> Result { if path.ends_with(target_path) || path == target_path { found = 
true; - should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); break; } } } - should_stop.store(true, std::sync::atomic::Ordering::Relaxed); - drop(writer_thread); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + let _ = writer_thread.join(); let _ = gunzip.kill(); let _ = gunzip.wait(); @@ -444,7 +456,7 @@ pub fn extract_file_from_tgz(url: &str, target_path: &str) -> Result { let mut reader = response; let mut buffer = [0u8; 65536]; loop { - if should_stop_writer.load(std::sync::atomic::Ordering::Relaxed) { + if should_stop_writer.load(std::sync::atomic::Ordering::SeqCst) { break; } @@ -452,7 +464,10 @@ pub fn extract_file_from_tgz(url: &str, target_path: &str) -> Result { Ok(0) => break, Ok(n) => n, Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, - Err(_) => break, + Err(e) => { + tracing::debug!("Network read error during tgz streaming: {}", e); + break; + } }; if gunzip_stdin.write_all(&buffer[..n]).is_err() { @@ -494,7 +509,7 @@ pub fn extract_file_from_tgz(url: &str, target_path: &str) -> Result { ) })?; content = Some(file_content); - should_stop.store(true, std::sync::atomic::Ordering::Relaxed); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); break; } // Note: if this is not our target file, the tar iterator automatically @@ -502,8 +517,8 @@ pub fn extract_file_from_tgz(url: &str, target_path: &str) -> Result { // so we don't buffer unnecessary data } - should_stop.store(true, std::sync::atomic::Ordering::Relaxed); - drop(writer_thread); + should_stop.store(true, std::sync::atomic::Ordering::SeqCst); + let _ = writer_thread.join(); let _ = gunzip.kill(); let _ = gunzip.wait(); @@ -528,7 +543,7 @@ impl RpkiTrie { /// Load RPKI data from RPKIviews for a specific date. /// /// This will use the first (earliest) available file for the given date from the specified collector. - /// By default, uses the Kerfuffle collector. 
+ /// The caller must specify which collector to use. See [`RpkiViewsCollector`] for available options. pub fn from_rpkiviews(collector: RpkiViewsCollector, date: NaiveDate) -> Result { let files = list_rpkiviews_files(collector, date)?; @@ -661,7 +676,7 @@ mod tests { #[ignore] // Requires network access and takes time to download fn test_from_rpkiviews() { let date = NaiveDate::from_ymd_opt(2024, 1, 4).unwrap(); - let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::KerfuffleNet, date).unwrap(); + let trie = RpkiTrie::from_rpkiviews(RpkiViewsCollector::default(), date).unwrap(); let total_roas: usize = trie.trie.iter().map(|(_, roas)| roas.len()).sum(); println!("Loaded {} ROAs from RPKIviews", total_roas);