From 1b750b1213adfec5bc50d4d5abe895a896502bb6 Mon Sep 17 00:00:00 2001 From: Tyler Nickerson Date: Mon, 15 Sep 2025 13:03:41 -0400 Subject: [PATCH 1/6] add docs --- lib/src/core/compile.rs | 150 +++++++++++++++ lib/src/core/consts.rs | 70 +++++++ lib/src/core/lexicon.rs | 75 ++++++++ lib/src/core/lookup.rs | 408 +++++++++++++++++++++++++++++++++++++++- lib/src/core/merge.rs | 92 +++++++++ lib/src/core/mod.rs | 59 ++++++ lib/src/core/preview.rs | 111 +++++++++++ lib/src/core/rank.rs | 120 ++++++++++++ lib/src/core/read.rs | 206 ++++++++++++++++++++ lib/src/core/resolve.rs | 95 ++++++++++ lib/src/core/version.rs | 129 +++++++++++++ lib/src/core/write.rs | 153 ++++++++++++++- 12 files changed, 1664 insertions(+), 4 deletions(-) diff --git a/lib/src/core/compile.rs b/lib/src/core/compile.rs index 0f2c8c346..d7f92a312 100644 --- a/lib/src/core/compile.rs +++ b/lib/src/core/compile.rs @@ -1,3 +1,45 @@ +//! Dictionary compilation and binary serialization. +//! +//! This module provides functionality to compile dictionary data structures into +//! the ODict binary format. The compilation process involves serialization, +//! compression, and packaging with metadata headers. +//! +//! # Binary Format Structure +//! +//! The ODict binary format consists of: +//! 1. **Signature** (5 bytes): "ODICT" magic bytes +//! 2. **Version Length** (8 bytes): Length of version string +//! 3. **Version** (variable): Semantic version string +//! 4. **Content Length** (8 bytes): Length of compressed content +//! 5. **Content** (variable): Compressed serialized dictionary data +//! +//! # Examples +//! +//! ## Basic Compilation +//! +//! ```rust +//! use odict::{Dictionary, CompilerOptions}; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! let compiled = dict.build()?; +//! let bytes = compiled.to_bytes()?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Compilation with Custom Compression +//! +//! ```rust +//! use odict::{Dictionary, CompilerOptions, CompressOptions}; +//! 
+//! let dict = Dictionary::from_path("dictionary.xml")?; +//! let compiled = dict.build()?; +//! +//! let options = CompilerOptions::default() +//! .with_compression(CompressOptions::default()); +//! let bytes = compiled.to_bytes_with_options(options)?; +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` + use crate::compress::{compress, CompressOptions}; use crate::error::Error; use crate::schema::Dictionary; @@ -5,8 +47,13 @@ use crate::OpenDictionary; use super::consts::{SIGNATURE, VERSION}; +/// Configuration options for dictionary compilation. +/// +/// This struct allows customization of the compilation process, particularly +/// compression settings that affect the final binary size and performance. #[derive(Default)] pub struct CompilerOptions { + /// Compression options to use during compilation. pub compress_options: CompressOptions, } @@ -17,6 +64,20 @@ impl AsRef<CompilerOptions> for CompilerOptions { } impl CompilerOptions { + /// Set custom compression options for the compilation process. + /// + /// # Arguments + /// + /// * `compress_options` - The compression configuration to use + /// + /// # Examples + /// + /// ```rust + /// use odict::{CompilerOptions, CompressOptions}; + /// + /// let options = CompilerOptions::default() + /// .with_compression(CompressOptions::default()); + /// ``` pub fn with_compression(mut self, compress_options: CompressOptions) -> Self { self.compress_options = compress_options; self @@ -24,10 +85,70 @@ impl OpenDictionary { + /// Convert the dictionary to binary format using default compilation options. + /// + /// This method serializes the dictionary into the ODict binary format, + /// applying default compression and packaging it with the appropriate headers. + /// + /// # Returns + /// + /// A `Vec<u8>` containing the complete binary representation of the dictionary. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - Compression fails + /// - Serialization fails + /// - Binary format validation fails + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, OpenDictionary}; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// let compiled = dict.build()?; + /// let bytes = compiled.to_bytes()?; + /// # Ok::<(), Box>(()) + /// ``` pub fn to_bytes(&self) -> crate::Result> { self.to_bytes_with_options(CompilerOptions::default()) } + /// Convert the dictionary to binary format with custom compilation options. + /// + /// This method provides fine-grained control over the compilation process, + /// allowing customization of compression settings and other options. + /// + /// # Arguments + /// + /// * `options` - Compilation options to customize the process + /// + /// # Returns + /// + /// A `Vec` containing the complete binary representation of the dictionary. + /// + /// # Errors + /// + /// Returns an error if: + /// - Compression fails with the specified options + /// - Serialization fails + /// - Binary format validation fails + /// - Header construction fails + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, OpenDictionary, CompilerOptions, CompressOptions}; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// let compiled = dict.build()?; + /// + /// let options = CompilerOptions::default() + /// .with_compression(CompressOptions::default()); + /// let bytes = compiled.to_bytes_with_options(options)?; + /// # Ok::<(), Box>(()) + /// ``` pub fn to_bytes_with_options>( &self, options: Options, @@ -86,6 +207,35 @@ impl OpenDictionary { } impl Dictionary { + /// Build a compiled dictionary from the current dictionary data. + /// + /// This method transforms a [`Dictionary`] into an [`OpenDictionary`] by + /// serializing the dictionary data and preparing it for binary compilation. 
+ /// The resulting [`OpenDictionary`] can then be converted to bytes or saved to disk. + /// + /// # Returns + /// + /// An [`OpenDictionary`] containing the serialized dictionary data with + /// appropriate metadata (signature, version, etc.). + /// + /// # Errors + /// + /// Returns an error if: + /// - Dictionary serialization fails + /// - Memory allocation fails + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// let compiled = dict.build()?; + /// + /// // Now you can save to disk or convert to bytes + /// compiled.to_disk("output.odict")?; + /// # Ok::<(), Box>(()) + /// ``` pub fn build(&self) -> crate::Result { let dict = OpenDictionary { signature: String::from_utf8_lossy(SIGNATURE).to_string(), diff --git a/lib/src/core/consts.rs b/lib/src/core/consts.rs index 9a100a2bc..22f9b6f74 100644 --- a/lib/src/core/consts.rs +++ b/lib/src/core/consts.rs @@ -1,8 +1,78 @@ +//! Core constants for the ODict binary format. +//! +//! This module defines the fundamental constants used throughout the ODict +//! library for binary format identification, versioning, and compatibility +//! checking. +//! +//! # Overview +//! +//! The constants defined here are used for: +//! - Binary format identification through magic signatures +//! - Version tracking and compatibility verification +//! - Ensuring consistent format standards across the library +//! +//! # Binary Format Identification +//! +//! The [`SIGNATURE`] constant provides the magic bytes that identify ODict +//! binary files. This signature is written at the beginning of every compiled +//! dictionary file and verified during reading operations. +//! +//! # Version Management +//! +//! The [`VERSION`] constant contains the current library version, automatically +//! derived from the Cargo package version. This is used for compatibility +//! checking when reading dictionary files created with different library versions. 
+ use std::sync::LazyLock; use crate::version::SemanticVersion; +/// Magic signature bytes for ODict binary format identification. +/// +/// This 5-byte signature ("ODICT") is written at the beginning of every +/// compiled dictionary file to identify it as a valid ODict binary format. +/// The signature is checked during file reading to ensure format validity. +/// +/// # Format +/// +/// The signature consists of the ASCII bytes for "ODICT": +/// - `O` (0x4F) +/// - `D` (0x44) +/// - `I` (0x49) +/// - `C` (0x43) +/// - `T` (0x54) +/// +/// # Usage +/// +/// This constant is used internally by the reading and writing operations +/// and should not typically be used directly by library consumers. pub const SIGNATURE: &[u8] = b"ODICT"; +/// Current library version for compatibility checking. +/// +/// This constant contains the semantic version of the current library, +/// automatically derived from the Cargo package version at compile time. +/// It's used to ensure compatibility between dictionary files and the +/// library version attempting to read them. +/// +/// # Compatibility Rules +/// +/// Dictionary files are considered compatible if they have: +/// - The same major version number as the library +/// - The same prerelease status (stable vs. prerelease) +/// +/// # Lazy Initialization +/// +/// The version is lazily initialized from the `CARGO_PKG_VERSION` environment +/// variable, which is automatically set by Cargo during compilation. This +/// ensures the version always matches the actual package version. +/// +/// # Examples +/// +/// ```rust +/// use odict::core::consts::VERSION; +/// +/// println!("Library version: {}", *VERSION); +/// ``` pub const VERSION: LazyLock<SemanticVersion> = LazyLock::new(|| SemanticVersion::from(env!("CARGO_PKG_VERSION"))); diff --git a/lib/src/core/lexicon.rs index a85bb4677..9540de77a 100644 --- a/lib/src/core/lexicon.rs +++ b/lib/src/core/lexicon.rs @@ -1,8 +1,83 @@ +//! 
Lexicon extraction operations for ODict dictionaries. +//! +//! This module provides functionality to extract sorted lists of terms (lexicons) +//! from dictionaries. A lexicon represents all the headwords/terms available +//! in a dictionary, sorted alphabetically. +//! +//! # Overview +//! +//! The lexicon functionality allows you to: +//! - Extract all terms from a dictionary as a sorted list +//! - Get a quick overview of dictionary contents +//! - Generate word lists for analysis or display +//! +//! # Examples +//! +//! ## Extracting a Lexicon from a Dictionary +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! let terms = dict.lexicon(); +//! +//! // Print all terms in alphabetical order +//! for term in terms { +//! println!("{}", term); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Working with Archived Dictionaries +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! let terms = archived.lexicon(); +//! +//! println!("Dictionary contains {} terms", terms.len()); +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::{ArchivedDictionary, Dictionary}; macro_rules! lexicon { ($t:ident) => { impl $t { + /// Extract a sorted lexicon (list of terms) from the dictionary. + /// + /// This method collects all entry terms from the dictionary and returns + /// them as a sorted vector of string references. The terms are sorted + /// alphabetically using standard string ordering. + /// + /// # Returns + /// + /// A `Vec<&str>` containing all dictionary terms in alphabetical order. + /// Each term appears exactly once, even if there are multiple entries + /// with the same term. 
+ /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// let lexicon = dict.lexicon(); + /// + /// // Print first 10 terms + /// for term in lexicon.iter().take(10) { + /// println!("{}", term); + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(n log n) complexity due to sorting, where n is + /// the number of entries in the dictionary. The terms are collected + /// first, then sorted in-place. pub fn lexicon(&self) -> Vec<&str> { let mut vec: Vec<&str> = self .entries diff --git a/lib/src/core/lookup.rs b/lib/src/core/lookup.rs index 6e3973aea..62da9cede 100644 --- a/lib/src/core/lookup.rs +++ b/lib/src/core/lookup.rs @@ -1,21 +1,230 @@ +//! Advanced dictionary lookup operations for ODict. +//! +//! This module provides sophisticated search capabilities over dictionaries with +//! configurable matching strategies, redirect following via see_also links, and +//! case-insensitive fallback options. It supports both single and batch lookups +//! with parallel processing for optimal performance. +//! +//! # Overview +//! +//! The lookup system offers multiple layers of functionality: +//! +//! ## Matching Strategies +//! - **Exact matching**: Direct term-to-entry mapping +//! - **Split strategy**: Progressive substring matching for compound terms +//! +//! ## Advanced Features +//! - **Redirect following**: Automatic traversal of see_also links with cycle protection +//! - **Case-insensitive fallback**: Automatic retry with lowercase when exact match fails +//! - **Parallel processing**: Concurrent lookup of multiple queries for performance +//! - **Configurable limits**: Control redirect depth and matching behavior +//! +//! ## Performance Characteristics +//! - Single lookups: O(1) average case for exact matches +//! - Split strategy: O(n²) worst case where n is query length +//! 
- Parallel lookups: Scales with available CPU cores +//! - Memory efficient: Zero-copy results with lifetime management +//! +//! # Examples +//! +//! ## Basic Exact Lookup +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! let queries = vec!["hello"]; +//! let results = archived.lookup(&queries, LookupOptions::default())?; +//! +//! for result in results { +//! println!("Found: {}", result.entry.term.as_str()); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Advanced Lookup with Options +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! let options = LookupOptions::default() +//! .insensitive(true) // Enable case-insensitive fallback +//! .follow(3) // Follow up to 3 redirects +//! .strategy(LookupStrategy::Split(2)); // Split to minimum 2 chars +//! +//! let queries = vec!["Hello", "compound-word"]; +//! let results = archived.lookup(&queries, options)?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Handling Redirects +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! let options = LookupOptions::default().follow(5); +//! let queries = vec!["abbreviation"]; // Might redirect to full form +//! let results = archived.lookup(&queries, options)?; +//! +//! for result in results { +//! if let Some(redirect_from) = result.directed_from { +//! println!("'{}' redirected from '{}'", +//! result.entry.term.as_str(), +//! redirect_from.term.as_str()); +//! } +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Split Strategy for Compound Terms +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +//! +//! 
let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! // This will try "compound-word", then "compound", then "word" +//! let options = LookupOptions::default() +//! .strategy(LookupStrategy::Split(3)); // Minimum 3 characters +//! +//! let queries = vec!["compound-word"]; +//! let results = archived.lookup(&queries, options)?; +//! # Ok::<(), Box>(()) +//! ``` use crate::schema::{ArchivedDictionary, ArchivedEntry, Dictionary, Entry}; use rayon::prelude::*; use rkyv::option::ArchivedOption; use std::marker::{Send, Sync}; +/// Strategy for matching query terms against dictionary entries. +/// +/// This enum defines the different approaches available for finding matches +/// when performing dictionary lookups. Each strategy has different performance +/// characteristics and use cases. #[derive(Debug, PartialEq, Clone)] pub enum LookupStrategy { + /// Match queries exactly against entry terms. + /// + /// This is the fastest strategy, performing direct hash map lookups. + /// It requires the query to exactly match an entry term (case-sensitive + /// unless the `insensitive` option is enabled). + /// + /// **Performance**: O(1) average case + /// **Use case**: When you know the exact term you're looking for + /// + /// # Examples + /// + /// ```rust + /// use odict::{LookupStrategy, LookupOptions}; + /// + /// let options = LookupOptions::default() + /// .strategy(LookupStrategy::Exact); + /// ``` Exact, + + /// Split the query into progressively smaller substrings down to `min_length`, + /// attempting to match each substring from left to right. + /// + /// This strategy is useful for compound words or when you want to find + /// partial matches. It starts with the full query and progressively + /// shortens it from the right until a match is found or the minimum + /// length is reached. 
+ /// + /// **Performance**: O(n²) worst case where n is query length + /// **Use case**: Compound words, partial matching, morphological analysis + /// + /// # Algorithm + /// + /// For a query "compound-word" with min_length=3: + /// 1. Try "compound-word" (full query) + /// 2. Try "compound-wor", "compound-wo", etc. + /// 3. Try "compound" (if found, move to next segment) + /// 4. Try "word", "wor" (down to min_length) + /// + /// # Examples + /// + /// ```rust + /// use odict::{LookupStrategy, LookupOptions}; + /// + /// // Split down to minimum 3 characters + /// let options = LookupOptions::default() + /// .strategy(LookupStrategy::Split(3)); + /// ``` Split(usize), } +/// Configuration options for dictionary lookup operations. +/// +/// This struct provides fine-grained control over lookup behavior, including +/// redirect following, matching strategies, and case sensitivity. All options +/// have sensible defaults for common use cases. +/// +/// # Default Behavior +/// +/// - **No redirect following**: Prevents infinite loops and improves performance +/// - **Exact matching**: Most predictable and fastest lookup strategy +/// - **Case-sensitive search**: Preserves linguistic distinctions +/// +/// # Examples +/// +/// ## Basic Usage +/// +/// ```rust +/// use odict::LookupOptions; +/// +/// // Use all defaults +/// let options = LookupOptions::default(); +/// ``` +/// +/// ## Custom Configuration +/// +/// ```rust +/// use odict::{LookupOptions, LookupStrategy}; +/// +/// let options = LookupOptions::default() +/// .follow(5) // Follow up to 5 redirects +/// .insensitive(true) // Enable case-insensitive fallback +/// .strategy(LookupStrategy::Split(2)); // Split strategy with min length 2 +/// ``` #[derive(Debug, Clone)] pub struct LookupOptions { /// Maximum number of redirects to follow via see_also links. - /// None means no following, Some(u32::MAX) provides infinite following (old behavior). 
+ /// + /// - `None`: No redirect following (default, safest option) + /// - `Some(n)`: Follow up to n redirects before stopping + /// - `Some(u32::MAX)`: Unlimited following (use with caution) + /// + /// Redirect following allows automatic traversal of see_also links, + /// which is useful for abbreviations, alternative spellings, and + /// cross-references. However, it can potentially create infinite + /// loops if the dictionary has circular references. pub follow: Option, + + /// Query matching strategy to use for lookups. + /// + /// Determines how queries are matched against dictionary entries. + /// See [`LookupStrategy`] for detailed information about each option. pub strategy: LookupStrategy, + + /// Whether to fall back to case-insensitive search if exact match fails. + /// + /// When enabled, if an exact (case-sensitive) match fails, the system + /// will automatically retry with a lowercase version of the query. + /// This is useful for handling user input that may have incorrect + /// capitalization. + /// + /// **Note**: The fallback only occurs if the lowercase version differs + /// from the original query, preventing unnecessary duplicate lookups. pub insensitive: bool, } @@ -26,6 +235,27 @@ impl AsRef for LookupOptions { } impl LookupOptions { + /// Construct default lookup options with safe, predictable settings. + /// + /// The default configuration prioritizes safety and performance: + /// - **No redirect following**: Prevents infinite loops and improves performance + /// - **Exact matching strategy**: Most predictable and fastest lookup method + /// - **Case-sensitive search**: Preserves linguistic distinctions + /// + /// # Returns + /// + /// A new `LookupOptions` instance with default settings. 
+ /// + /// # Examples + /// + /// ```rust + /// use odict::LookupOptions; + /// + /// let options = LookupOptions::default(); + /// assert_eq!(options.follow, None); + /// assert_eq!(options.strategy, odict::LookupStrategy::Exact); + /// assert_eq!(options.insensitive, false); + /// ``` pub fn default() -> Self { Self { follow: None, @@ -34,25 +264,172 @@ impl LookupOptions { } } + /// Set the maximum number of redirects to follow via see_also links. + /// + /// This method enables redirect following with a specified limit to prevent + /// infinite loops in dictionaries with circular references. Redirects are + /// useful for handling abbreviations, alternative spellings, and cross-references. + /// + /// # Arguments + /// + /// * `follow` - Maximum number of redirects to follow (use `u32::MAX` for unlimited) + /// + /// # Safety Considerations + /// + /// - Use reasonable limits (e.g., 5-10) to prevent performance issues + /// - `u32::MAX` allows unlimited following but may cause infinite loops + /// - Each redirect adds one additional lookup operation + /// + /// # Examples + /// + /// ```rust + /// use odict::LookupOptions; + /// + /// // Follow up to 5 redirects + /// let options = LookupOptions::default().follow(5); + /// + /// // Unlimited following (use with caution) + /// let unlimited = LookupOptions::default().follow(u32::MAX); + /// ``` pub fn follow(mut self, follow: u32) -> Self { self.follow = Some(follow); self } + /// Set the matching strategy for query processing. + /// + /// The strategy determines how queries are matched against dictionary entries. + /// Different strategies have different performance characteristics and use cases. 
+ /// + /// # Arguments + /// + /// * `strategy` - The [`LookupStrategy`] to use for matching + /// + /// # Examples + /// + /// ```rust + /// use odict::{LookupOptions, LookupStrategy}; + /// + /// // Use exact matching (fastest) + /// let exact = LookupOptions::default() + /// .strategy(LookupStrategy::Exact); + /// + /// // Use split strategy for compound words + /// let split = LookupOptions::default() + /// .strategy(LookupStrategy::Split(3)); + /// ``` pub fn strategy(mut self, strategy: LookupStrategy) -> Self { self.strategy = strategy; self } + /// Enable or disable case-insensitive fallback matching. + /// + /// When enabled, if an exact (case-sensitive) match fails, the system + /// automatically retries with a lowercase version of the query. This is + /// useful for handling user input with incorrect capitalization. + /// + /// # Arguments + /// + /// * `insensitive` - Whether to enable case-insensitive fallback + /// + /// # Performance Impact + /// + /// - Minimal impact when exact matches succeed + /// - Adds one additional lookup when exact match fails and query contains uppercase + /// - No additional lookup if the query is already lowercase + /// + /// # Examples + /// + /// ```rust + /// use odict::LookupOptions; + /// + /// // Enable case-insensitive fallback + /// let options = LookupOptions::default().insensitive(true); + /// + /// // This will try "Hello" first, then "hello" if not found + /// // let results = dict.lookup(&["Hello"], options)?; + /// ``` pub fn insensitive(mut self, insensitive: bool) -> Self { self.insensitive = insensitive; self } } +/// Result of a dictionary lookup operation. +/// +/// This struct encapsulates the result of a successful lookup, including +/// the matched entry and optional redirect information. It provides context +/// about how the match was found, which is useful for understanding the +/// lookup path and handling redirects. 
+/// +/// # Generic Parameter +/// +/// * `E` - The entry type (either `&Entry` or `&ArchivedEntry`) +/// +/// # Examples +/// +/// ## Basic Usage +/// +/// ```rust +/// use odict::{OpenDictionary, LookupOptions}; +/// +/// let dict = OpenDictionary::from_path("dictionary.odict")?; +/// let archived = dict.contents()?; +/// let queries = vec!["hello"]; +/// let results = archived.lookup(&queries, LookupOptions::default())?; +/// +/// for result in results { +/// println!("Found: {}", result.entry.term.as_str()); +/// +/// if let Some(redirect_from) = result.directed_from { +/// println!(" (redirected from: {})", redirect_from.term.as_str()); +/// } +/// } +/// # Ok::<(), Box>(()) +/// ``` +/// +/// ## Checking for Redirects +/// +/// ```rust +/// use odict::{OpenDictionary, LookupOptions}; +/// +/// # fn example(results: Vec>) { +/// for result in results { +/// match result.directed_from { +/// Some(original) => { +/// println!("'{}' is an alias for '{}'", +/// original.term.as_str(), +/// result.entry.term.as_str()); +/// } +/// None => { +/// println!("Direct match: {}", result.entry.term.as_str()); +/// } +/// } +/// } +/// # } +/// ``` #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct LookupResult { + /// The matched dictionary entry. + /// + /// This is the final entry that was found, either through direct matching + /// or by following redirects. It contains all the linguistic data + /// (definitions, etymologies, pronunciations, etc.) for the term. pub entry: E, + + /// The entry that originally directed to this match via see_also links. + /// + /// This field is `Some(entry)` when the result was found by following + /// a redirect chain, containing the entry that started the redirect. + /// It's `None` for direct matches without any redirects. 
+ /// + /// # Use Cases + /// + /// - Displaying "redirected from" information to users + /// - Understanding alias relationships in the dictionary + /// - Debugging lookup paths and redirect chains + /// - Analytics on which redirects are commonly followed pub directed_from: Option, } @@ -63,6 +440,12 @@ pub struct LookupResult { macro_rules! lookup { ($tys:ident, $ret:ident, $opt:ident) => { impl $tys { + #[doc = r#"Attempt to find a single entry by term. + +This helper supports optional redirect following and an optional +case-insensitive retry (lowercasing the query) when configured. + +Returns Some(LookupResult) on a match, or None if not found."#] fn find_entry<'a>( &'a self, follow: &Option, @@ -118,6 +501,9 @@ macro_rules! lookup { $opt::None } + #[doc = r#"Perform lookup for a single query using the provided options. + +Depending on the strategy, this may return zero or more results."#] fn perform_lookup<'a, Options>( &'a self, query: &str, @@ -166,6 +552,26 @@ macro_rules! lookup { Ok(results) } + #[doc = r#"Lookup multiple queries in parallel. + +Each query is processed independently with the provided options. + +Returns all matches without a guaranteed order. + +Examples +-------- +```rust +use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +# fn demo(dict: &odict::OpenDictionary) -> odict::Result<()> { +let archived = dict.contents()?; +let queries = vec!["hello", "world"]; +let options = LookupOptions::default() + .insensitive(true) + .strategy(LookupStrategy::Exact); +let results = archived.lookup(&queries, options)?; +# Ok(()) +# } +```"#] pub fn lookup<'a, 'b, Query, Options>( &'a self, queries: &'b Vec, diff --git a/lib/src/core/merge.rs b/lib/src/core/merge.rs index 9944cbc86..bb65f115a 100644 --- a/lib/src/core/merge.rs +++ b/lib/src/core/merge.rs @@ -1,12 +1,104 @@ +//! Dictionary merging operations for ODict. +//! +//! This module provides functionality to combine multiple dictionaries into a single +//! 
dictionary, preserving unique entries and avoiding duplicates. +//! +//! # Overview +//! +//! The merge operations allow you to: +//! - Merge a single dictionary into another +//! - Merge multiple dictionaries at once +//! - Preserve unique entries (no duplicates) +//! +//! # Examples +//! +//! ## Merging Two Dictionaries +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let mut dict1 = Dictionary::from_path("dict1.xml")?; +//! let dict2 = Dictionary::from_path("dict2.xml")?; +//! +//! // Merge dict2 into dict1 +//! dict1.merge(&dict2); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Merging Multiple Dictionaries +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let mut main_dict = Dictionary::from_path("main.xml")?; +//! let dict2 = Dictionary::from_path("dict2.xml")?; +//! let dict3 = Dictionary::from_path("dict3.xml")?; +//! +//! // Merge multiple dictionaries at once +//! main_dict.merge_multi(vec![&dict2, &dict3]); +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::Dictionary; impl Dictionary { + /// Merge multiple dictionaries into this dictionary. + /// + /// This is a convenience method that calls [`merge`](Dictionary::merge) for each + /// dictionary in the provided vector. Entries are processed in order, and + /// duplicates are automatically filtered out. + /// + /// # Arguments + /// + /// * `dictionaries` - A vector of dictionary references to merge + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let mut main_dict = Dictionary::from_path("main.xml")?; + /// let dict2 = Dictionary::from_path("dict2.xml")?; + /// let dict3 = Dictionary::from_path("dict3.xml")?; + /// + /// main_dict.merge_multi(vec![&dict2, &dict3]); + /// # Ok::<(), Box>(()) + /// ``` pub fn merge_multi(&mut self, dictionaries: Vec<&Dictionary>) { for src in dictionaries { self.merge(src); } } + /// Merge another dictionary into this dictionary. 
+ /// + /// This method adds all entries from the source dictionary that are not + /// already present in this dictionary. Duplicate entries (based on the + /// entry's equality implementation) are automatically filtered out. + /// + /// # Arguments + /// + /// * `dictionary` - The source dictionary to merge from + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let mut dict1 = Dictionary::from_path("dict1.xml")?; + /// let dict2 = Dictionary::from_path("dict2.xml")?; + /// + /// // Merge dict2 into dict1 + /// dict1.merge(&dict2); + /// + /// // dict1 now contains all unique entries from both dictionaries + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// The merge operation has O(n) complexity where n is the number of entries + /// in the source dictionary. Each entry is checked for existence before insertion. pub fn merge(&mut self, dictionary: &Dictionary) { for entry in dictionary.entries.iter() { if !self.entries.contains(entry) { diff --git a/lib/src/core/mod.rs b/lib/src/core/mod.rs index 976addb31..a428c4e94 100644 --- a/lib/src/core/mod.rs +++ b/lib/src/core/mod.rs @@ -1,3 +1,62 @@ +//! Core functionality for the ODict dictionary format. +//! +//! This module provides the fundamental operations for working with ODict dictionaries, +//! including compilation, reading, writing, lookup, and various utility functions. +//! +//! # Overview +//! +//! The core module is organized into several key areas: +//! +//! - **Compilation & Serialization**: [`compile`] - Convert dictionaries to binary format +//! - **Reading & Deserialization**: [`read`] - Load dictionaries from various sources +//! - **Writing**: [`write`] - Save dictionaries to disk +//! - **Lookup Operations**: [`lookup`] - Search and retrieve dictionary entries +//! - **Dictionary Management**: [`merge`], [`lexicon`] - Combine dictionaries and extract terms +//! 
- **Utilities**: [`preview`], [`rank`], [`resolve`] - Additional dictionary operations +//! - **Version Management**: [`version`] - Semantic versioning support +//! +//! # Examples +//! +//! ## Basic Dictionary Operations +//! +//! ```rust +//! use odict::{Dictionary, OpenDictionary}; +//! +//! // Load a dictionary from XML +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! +//! // Compile to binary format +//! let compiled = dict.build()?; +//! +//! // Save to disk +//! compiled.to_disk("dictionary.odict")?; +//! +//! // Load from binary +//! let loaded = OpenDictionary::from_path("dictionary.odict")?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Dictionary Lookup +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! // Simple lookup +//! let queries = vec!["hello"]; +//! let results = archived.lookup(&queries, LookupOptions::default())?; +//! +//! // Advanced lookup with options +//! let options = LookupOptions::default() +//! .insensitive(true) +//! .follow(5) +//! .strategy(LookupStrategy::Split(2)); +//! let results = archived.lookup(&queries, options)?; +//! # Ok::<(), Box>(()) +//! ``` + mod consts; pub mod compile; diff --git a/lib/src/core/preview.rs b/lib/src/core/preview.rs index c498ec9cd..9d29d275e 100644 --- a/lib/src/core/preview.rs +++ b/lib/src/core/preview.rs @@ -1,12 +1,63 @@ +//! Entry preview generation for ODict dictionaries. +//! +//! This module provides functionality to generate concise text previews of dictionary +//! entries by extracting and concatenating their definitions. Previews are useful for +//! displaying quick summaries of entries without showing the full structured data. +//! +//! # Overview +//! +//! The preview functionality allows you to: +//! - Generate text summaries of dictionary entries +//! - Customize the delimiter used to separate definitions +//! 
- Handle both regular and grouped definitions +//! - Optionally convert markdown to plain text (when markdown feature is enabled) +//! +//! # Examples +//! +//! ## Basic Preview Generation +//! +//! ```rust +//! use odict::{Dictionary, PreviewOptions}; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! if let Some(entry) = dict.entries.iter().next() { +//! let preview = entry.preview(PreviewOptions::default()); +//! println!("Preview: {}", preview); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Custom Delimiter +//! +//! ```rust +//! use odict::{Dictionary, PreviewOptions}; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! if let Some(entry) = dict.entries.iter().next() { +//! let options = PreviewOptions::default().delimiter(" | ".to_string()); +//! let preview = entry.preview(options); +//! println!("Preview: {}", preview); +//! } +//! # Ok::<(), Box>(()) +//! ``` + #[cfg(feature = "markdown")] use crate::md::to_text; use crate::schema::{ArchivedDefinitionType, ArchivedEntry, DefinitionType, Entry}; +/// Configuration options for generating entry previews. +/// +/// This struct allows customization of how definitions are joined together +/// when creating a preview string from a dictionary entry. pub struct PreviewOptions { delimiter: String, } impl Default for PreviewOptions { + /// Create default preview options. + /// + /// The default delimiter is `"; "` (semicolon followed by space), which + /// provides a natural separation between multiple definitions. fn default() -> Self { Self { delimiter: "; ".to_string(), @@ -15,12 +66,30 @@ impl Default for PreviewOptions { } impl PreviewOptions { + /// Set a custom delimiter for joining definitions. 
+ /// + /// # Arguments + /// + /// * `delimiter` - The string to use for separating definitions in the preview + /// + /// # Examples + /// + /// ```rust + /// use odict::PreviewOptions; + /// + /// let options = PreviewOptions::default() + /// .delimiter(" | ".to_string()); + /// ``` pub fn delimiter(mut self, delimiter: String) -> Self { self.delimiter = delimiter; self } } +/// Convert text content to plain text. +/// +/// When the markdown feature is disabled, this function returns the input unchanged. +/// When the markdown feature is enabled, it converts markdown to plain text. #[cfg(not(feature = "markdown"))] fn to_text(value: &str) -> &str { value @@ -29,6 +98,48 @@ fn to_text(value: &str) -> &str { macro_rules! preview { ($t:ident, $d:ident) => { impl $t { + /// Generate a text preview of this dictionary entry. + /// + /// This method extracts all definitions from the entry's etymologies and senses, + /// converts them to plain text (if markdown feature is enabled), and joins them + /// using the specified delimiter. + /// + /// # Arguments + /// + /// * `options` - Configuration for preview generation + /// + /// # Returns + /// + /// A `String` containing all definitions joined by the specified delimiter. + /// If the entry has no definitions, returns an empty string. + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, PreviewOptions}; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// if let Some(entry) = dict.entries.iter().next() { + /// // Use default options ("; " delimiter) + /// let preview = entry.preview(PreviewOptions::default()); + /// + /// // Use custom delimiter + /// let custom_preview = entry.preview( + /// PreviewOptions::default().delimiter(" | ".to_string()) + /// ); + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Processing Order + /// + /// Definitions are processed in this order: + /// 1. Iterate through etymologies + /// 2. For each etymology, iterate through senses + /// 3. 
For each sense, iterate through definitions + /// 4. Handle both individual definitions and definition groups + /// 5. Convert markdown to text (if feature enabled) + /// 6. Join all definitions with the specified delimiter pub fn preview(&self, options: PreviewOptions) -> String { let definitions: Vec = self .etymologies diff --git a/lib/src/core/rank.rs b/lib/src/core/rank.rs index 75d92139b..095e2e0ab 100644 --- a/lib/src/core/rank.rs +++ b/lib/src/core/rank.rs @@ -1,6 +1,65 @@ +//! Entry ranking operations for ODict dictionaries. +//! +//! This module provides functionality to analyze and extract ranking information +//! from dictionary entries. Rankings are typically used to indicate word frequency, +//! importance, or usage patterns within a dictionary. +//! +//! # Overview +//! +//! The ranking functionality allows you to: +//! - Find the minimum rank across all entries +//! - Find the maximum rank across all entries +//! - Analyze ranking distribution in dictionaries +//! +//! # Ranking System +//! +//! Rankings are optional numeric values associated with dictionary entries. +//! Lower numbers typically indicate higher frequency or importance (e.g., rank 1 +//! might be the most common word). Not all entries are required to have ranks. +//! +//! # Examples +//! +//! ## Finding Rank Range +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! +//! if let Some(min) = dict.min_rank() { +//! println!("Highest priority rank: {}", min); +//! } +//! +//! if let Some(max) = dict.max_rank() { +//! println!("Lowest priority rank: {}", max); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Working with Archived Dictionaries +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! match (archived.min_rank(), archived.max_rank()) { +//! (Some(min), Some(max)) => { +//! 
println!("Rank range: {} to {}", min, max); +//! } +//! _ => println!("No ranked entries found"), +//! } +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::{ArchivedDictionary, Dictionary}; impl ArchivedDictionary { + /// Create an iterator over all rank values in the archived dictionary. + /// + /// This internal method filters entries to only those with rank values, + /// converting archived rank values to native u32 format. fn rank_iter(&self) -> impl Iterator + '_ { self.entries .iter() @@ -9,6 +68,9 @@ impl ArchivedDictionary { } impl Dictionary { + /// Create an iterator over all rank values in the dictionary. + /// + /// This internal method filters entries to only those with rank values. fn rank_iter(&self) -> impl Iterator + '_ { self.entries.iter().filter_map(|entry| entry.rank) } @@ -17,10 +79,68 @@ impl Dictionary { macro_rules! rank { ($t:ident) => { impl $t { + /// Find the minimum rank value across all entries in the dictionary. + /// + /// This method searches through all entries that have rank values and + /// returns the smallest rank number. Since lower ranks typically indicate + /// higher importance or frequency, this represents the "highest priority" entry. + /// + /// # Returns + /// + /// - `Some(u32)` - The minimum rank value if any entries have ranks + /// - `None` - If no entries in the dictionary have rank values + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// + /// match dict.min_rank() { + /// Some(min_rank) => println!("Most important entry has rank: {}", min_rank), + /// None => println!("No entries have rank information"), + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(n) complexity where n is the number of entries + /// in the dictionary, as it must examine all entries to find the minimum. 
pub fn min_rank(&self) -> Option { self.rank_iter().min() } + /// Find the maximum rank value across all entries in the dictionary. + /// + /// This method searches through all entries that have rank values and + /// returns the largest rank number. Since higher ranks typically indicate + /// lower importance or frequency, this represents the "lowest priority" entry. + /// + /// # Returns + /// + /// - `Some(u32)` - The maximum rank value if any entries have ranks + /// - `None` - If no entries in the dictionary have rank values + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// + /// match dict.max_rank() { + /// Some(max_rank) => println!("Least important entry has rank: {}", max_rank), + /// None => println!("No entries have rank information"), + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(n) complexity where n is the number of entries + /// in the dictionary, as it must examine all entries to find the maximum. pub fn max_rank(&self) -> Option { self.rank_iter().max() } diff --git a/lib/src/core/read.rs b/lib/src/core/read.rs index c6036ffca..66e31640a 100644 --- a/lib/src/core/read.rs +++ b/lib/src/core/read.rs @@ -1,3 +1,60 @@ +//! Dictionary reading and deserialization operations for ODict. +//! +//! This module provides functionality to read and deserialize dictionaries from +//! various sources, including XML files and binary ODict format files. It handles +//! format validation, version compatibility checking, and decompression. +//! +//! # Overview +//! +//! The reading functionality supports: +//! - Loading dictionaries from XML files +//! - Loading compiled dictionaries from binary ODict files +//! - Reading from file paths or byte arrays +//! - Automatic format detection and validation +//! - Version compatibility verification +//! - Decompression of binary content +//! +//! # Binary Format Structure +//! 
+//! The ODict binary format consists of: +//! 1. **Signature** (5 bytes): "ODICT" magic bytes for format identification +//! 2. **Version Length** (8 bytes): Length of the version string in little-endian +//! 3. **Version** (variable): UTF-8 encoded semantic version string +//! 4. **Content Length** (8 bytes): Length of compressed content in little-endian +//! 5. **Content** (variable): Compressed serialized dictionary data +//! +//! # Examples +//! +//! ## Loading from XML +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! println!("Loaded {} entries", dict.entries.len()); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Loading from Binary Format +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! println!("Dictionary version: {}", dict.version); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Loading from Bytes +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let bytes = std::fs::read("dictionary.odict")?; +//! let dict = OpenDictionary::from_bytes(&bytes)?; +//! # Ok::<(), Box>(()) +//! ``` + use std::{ io::{Cursor, Read}, path::Path, @@ -20,6 +77,23 @@ use std::str::FromStr; /* Helper Methods */ /* -------------------------------------------------------------------------- */ +/// Read and validate the ODict signature from a binary stream. +/// +/// This function reads the first 5 bytes from the stream and verifies they +/// match the expected "ODICT" signature. This ensures the file is a valid +/// ODict binary format. +/// +/// # Arguments +/// +/// * `reader` - A cursor over the binary data +/// +/// # Returns +/// +/// The signature as a string if valid, or an error if invalid. +/// +/// # Errors +/// +/// Returns [`Error::InvalidSignature`] if the signature doesn't match "ODICT". fn read_signature(reader: &mut Cursor) -> crate::Result where T: AsRef<[u8]>, @@ -37,6 +111,23 @@ where Ok(String::from_utf8(signature)?) 
} +/// Read and validate the version information from a binary stream. +/// +/// This function reads the version length, then the version string, and +/// validates that it's compatible with the current library version. +/// +/// # Arguments +/// +/// * `reader` - A cursor over the binary data +/// +/// # Returns +/// +/// The parsed semantic version if compatible, or an error if incompatible. +/// +/// # Errors +/// +/// Returns [`Error::Incompatible`] if the version is not compatible with +/// the current library version. fn read_version(reader: &mut Cursor) -> crate::Result where T: AsRef<[u8]>, @@ -58,6 +149,22 @@ where Ok(version) } +/// Read and decompress the dictionary content from a binary stream. +/// +/// This function reads the content length, then the compressed content, +/// and decompresses it to obtain the raw serialized dictionary data. +/// +/// # Arguments +/// +/// * `reader` - A cursor over the binary data +/// +/// # Returns +/// +/// The decompressed content as a byte vector. +/// +/// # Errors +/// +/// Returns an error if decompression fails or if the content is corrupted. fn read_content(reader: &mut Cursor) -> crate::Result> where T: AsRef<[u8]>, @@ -76,10 +183,44 @@ where /* DictionaryReader */ /* -------------------------------------------------------------------------- */ +/// A reader for dictionary operations. +/// +/// This struct provides a namespace for dictionary reading operations, +/// though most functionality is implemented directly on the dictionary types. #[derive(Clone, Debug, Default)] pub struct DictionaryReader {} impl Dictionary { + /// Load a dictionary from an XML file. + /// + /// This method reads an XML file from the specified path and parses it + /// into a [`Dictionary`] structure. The XML must conform to the ODict + /// schema format. + /// + /// # Arguments + /// + /// * `path` - Path to the XML dictionary file + /// + /// # Returns + /// + /// A [`Dictionary`] instance containing the parsed data. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be read + /// - The XML is malformed or doesn't conform to the ODict schema + /// - File system permissions prevent access + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("examples/dictionary.xml")?; + /// println!("Loaded dictionary with {} entries", dict.entries.len()); + /// # Ok::<(), Box>(()) + /// ``` pub fn from_path>(path: P) -> crate::Result { let buffer = crate::fs::read_to_string(path)?; Self::from_str(&buffer) @@ -87,6 +228,38 @@ impl Dictionary { } impl OpenDictionary { + /// Load a compiled dictionary from binary data. + /// + /// This method parses binary data in the ODict format, validating the + /// signature, checking version compatibility, and decompressing the content. + /// The resulting [`OpenDictionary`] can be used for fast lookups and operations. + /// + /// # Arguments + /// + /// * `data` - Binary data in ODict format + /// + /// # Returns + /// + /// An [`OpenDictionary`] instance ready for use. + /// + /// # Errors + /// + /// Returns an error if: + /// - The signature is invalid (not an ODict file) + /// - The version is incompatible with this library + /// - The content cannot be decompressed + /// - The binary format is corrupted + /// + /// # Examples + /// + /// ```rust + /// use odict::OpenDictionary; + /// + /// let bytes = std::fs::read("dictionary.odict")?; + /// let dict = OpenDictionary::from_bytes(&bytes)?; + /// println!("Dictionary version: {}", dict.version); + /// # Ok::<(), Box>(()) + /// ``` pub fn from_bytes(data: T) -> crate::Result where T: AsRef<[u8]>, @@ -104,6 +277,39 @@ impl OpenDictionary { }) } + /// Load a compiled dictionary from a binary file. + /// + /// This method reads a binary ODict file from the specified path and + /// loads it into an [`OpenDictionary`] instance. The file path is stored + /// for reference. 
+ /// + /// # Arguments + /// + /// * `path` - Path to the binary ODict file + /// + /// # Returns + /// + /// An [`OpenDictionary`] instance with the path information preserved. + /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be read + /// - The file is not a valid ODict binary format + /// - Version compatibility issues + /// - File system permissions prevent access + /// + /// # Examples + /// + /// ```rust + /// use odict::OpenDictionary; + /// + /// let dict = OpenDictionary::from_path("dictionary.odict")?; + /// if let Some(path) = &dict.path { + /// println!("Loaded from: {}", path.display()); + /// } + /// # Ok::<(), Box>(()) + /// ``` pub fn from_path>(path: P) -> crate::Result { let buffer = fs::read_to_bytes(&path)?; let mut result = Self::from_bytes(&buffer)?; diff --git a/lib/src/core/resolve.rs b/lib/src/core/resolve.rs index 9b4f541f5..f03a2e456 100644 --- a/lib/src/core/resolve.rs +++ b/lib/src/core/resolve.rs @@ -1,8 +1,103 @@ +//! Entry resolution operations for ODict dictionaries. +//! +//! This module provides functionality to resolve (look up) dictionary entries +//! by their exact term. Resolution is a simple, direct lookup operation that +//! returns the entry if it exists, or None if not found. +//! +//! # Overview +//! +//! The resolve functionality allows you to: +//! - Look up entries by exact term match +//! - Get direct access to entry data structures +//! - Perform fast O(1) lookups using the underlying hash map +//! +//! # Difference from Lookup +//! +//! Resolution differs from the more complex lookup operations in that it: +//! - Only performs exact matches (no fuzzy matching or strategies) +//! - Does not follow redirects or see_also links +//! - Does not support case-insensitive fallback +//! - Returns the raw entry structure rather than wrapped results +//! +//! # Examples +//! +//! ## Basic Entry Resolution +//! +//! ```rust +//! use odict::Dictionary; +//! +//! 
let dict = Dictionary::from_path("dictionary.xml")?; +//! +//! if let Some(entry) = dict.resolve("hello") { +//! println!("Found entry for 'hello': {}", entry.term); +//! } else { +//! println!("No entry found for 'hello'"); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Working with Archived Dictionaries +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! if let Some(entry) = archived.resolve("world") { +//! println!("Found archived entry: {}", entry.term.as_str()); +//! } +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::{ArchivedDictionary, ArchivedEntry, Dictionary, Entry}; macro_rules! resolve { ($t:ident, $ret:ident) => { impl $t { + /// Resolve a dictionary entry by exact term match. + /// + /// This method performs a direct lookup in the dictionary's entry collection + /// using the provided term as the key. The lookup is case-sensitive and + /// requires an exact match. + /// + /// # Arguments + /// + /// * `term` - The exact term to look up in the dictionary + /// + /// # Returns + /// + /// - `Some(&Entry)` - A reference to the entry if found + /// - `None` - If no entry exists with the exact term + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// + /// // Exact match lookup + /// if let Some(entry) = dict.resolve("hello") { + /// println!("Term: {}", entry.term); + /// println!("Etymologies: {}", entry.etymologies.len()); + /// } + /// + /// // Case-sensitive - this might not match if entry is "Hello" + /// let result = dict.resolve("Hello"); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(1) average time complexity as it uses the underlying + /// hash map for direct key lookup. In the worst case (hash collisions), it + /// may degrade to O(n) but this is rare in practice. 
+ /// + /// # See Also + /// + /// For more advanced lookup operations with fuzzy matching, case-insensitive + /// search, and redirect following, see the [`lookup`](crate::core::lookup) module. pub fn resolve<'a>(&'a self, term: &str) -> Option<&'a $ret> { self.entries.get(term) } diff --git a/lib/src/core/version.rs b/lib/src/core/version.rs index 605da252a..6e592d2a5 100644 --- a/lib/src/core/version.rs +++ b/lib/src/core/version.rs @@ -1,17 +1,100 @@ +//! Semantic versioning support for ODict dictionaries. +//! +//! This module provides a semantic versioning implementation that follows the +//! [Semantic Versioning 2.0.0](https://semver.org/) specification. It's used +//! to track dictionary format versions and ensure compatibility between different +//! versions of the ODict library. +//! +//! # Overview +//! +//! The semantic versioning functionality provides: +//! - Version parsing from strings +//! - Version comparison and ordering +//! - Compatibility checking between versions +//! - Prerelease version support +//! +//! # Compatibility Rules +//! +//! Two versions are considered compatible if: +//! - They have the same major version number +//! - They have the same prerelease status (both stable or both prerelease) +//! +//! # Examples +//! +//! ## Creating and Comparing Versions +//! +//! ```rust +//! use odict::SemanticVersion; +//! +//! let v1 = SemanticVersion::new(1, 2, 3, None); +//! let v2: SemanticVersion = "1.2.4".into(); +//! let v3: SemanticVersion = "2.0.0".into(); +//! +//! assert!(v1 < v2); +//! assert!(v1.is_compatible(&v2)); +//! assert!(!v1.is_compatible(&v3)); +//! ``` +//! +//! ## Working with Prerelease Versions +//! +//! ```rust +//! use odict::SemanticVersion; +//! +//! let stable: SemanticVersion = "1.0.0".into(); +//! let prerelease: SemanticVersion = "1.0.0-alpha".into(); +//! +//! assert!(prerelease < stable); +//! assert!(!stable.is_compatible(&prerelease)); +//! 
``` + use std::{ cmp::Ordering, fmt::{Display, Formatter}, }; +/// A semantic version following the Semantic Versioning 2.0.0 specification. +/// +/// This struct represents a version number in the format `MAJOR.MINOR.PATCH[-PRERELEASE]` +/// where each component has specific meaning: +/// - **MAJOR**: Incremented for incompatible API changes +/// - **MINOR**: Incremented for backwards-compatible functionality additions +/// - **PATCH**: Incremented for backwards-compatible bug fixes +/// - **PRERELEASE**: Optional identifier for pre-release versions #[derive(Debug, Clone, Eq, PartialEq)] pub struct SemanticVersion { + /// Major version number (incompatible API changes) pub major: u64, + /// Minor version number (backwards-compatible additions) pub minor: u64, + /// Patch version number (backwards-compatible fixes) pub patch: u64, + /// Optional prerelease identifier (e.g., "alpha", "beta", "rc.1") pub prerelease: Option, } impl SemanticVersion { + /// Create a new semantic version. + /// + /// # Arguments + /// + /// * `major` - Major version number + /// * `minor` - Minor version number + /// * `patch` - Patch version number + /// * `prerelease` - Optional prerelease identifier + /// + /// # Examples + /// + /// ```rust + /// use odict::SemanticVersion; + /// + /// // Stable version + /// let stable = SemanticVersion::new(1, 2, 3, None); + /// assert_eq!(stable.to_string(), "1.2.3"); + /// + /// // Prerelease version + /// let prerelease = SemanticVersion::new(1, 2, 3, Some("alpha".to_string())); + /// assert_eq!(prerelease.to_string(), "1.2.3-alpha"); + /// ``` pub fn new(major: u64, minor: u64, patch: u64, prerelease: Option) -> Self { Self { major, @@ -21,10 +104,56 @@ impl SemanticVersion { } } + /// Check if this version is compatible with another version. + /// + /// Two versions are compatible if they have the same major version and + /// the same prerelease status (both stable or both prerelease with the + /// same identifier). 
+ /// + /// # Arguments + /// + /// * `other` - The version to check compatibility against + /// + /// # Returns + /// + /// `true` if the versions are compatible, `false` otherwise. + /// + /// # Examples + /// + /// ```rust + /// use odict::SemanticVersion; + /// + /// let v1_0_0: SemanticVersion = "1.0.0".into(); + /// let v1_2_3: SemanticVersion = "1.2.3".into(); + /// let v2_0_0: SemanticVersion = "2.0.0".into(); + /// let v1_0_0_alpha: SemanticVersion = "1.0.0-alpha".into(); + /// + /// assert!(v1_0_0.is_compatible(&v1_2_3)); // Same major version + /// assert!(!v1_0_0.is_compatible(&v2_0_0)); // Different major version + /// assert!(!v1_0_0.is_compatible(&v1_0_0_alpha)); // Different prerelease status + /// ``` pub fn is_compatible(&self, other: &Self) -> bool { self.major == other.major && self.prerelease.as_deref() == other.prerelease.as_deref() } + /// Convert the version to a byte vector. + /// + /// This method converts the version string representation to UTF-8 bytes, + /// which is useful for serialization and storage in binary formats. + /// + /// # Returns + /// + /// A `Vec` containing the UTF-8 encoded version string. + /// + /// # Examples + /// + /// ```rust + /// use odict::SemanticVersion; + /// + /// let version: SemanticVersion = "1.2.3".into(); + /// let bytes = version.as_bytes(); + /// assert_eq!(bytes, b"1.2.3"); + /// ``` pub fn as_bytes(&self) -> Vec { self.to_string().into_bytes() } diff --git a/lib/src/core/write.rs b/lib/src/core/write.rs index 21d33ea1e..066a40697 100644 --- a/lib/src/core/write.rs +++ b/lib/src/core/write.rs @@ -1,3 +1,54 @@ +//! Dictionary writing and persistence operations for ODict. +//! +//! This module provides functionality to save compiled dictionaries to disk +//! in the binary ODict format. It handles file creation, binary serialization, +//! and path management for persistent storage. +//! +//! # Overview +//! +//! The writing functionality allows you to: +//! - Save compiled dictionaries to disk +//! 
- Customize compilation options during save +//! - Automatically update path references +//! - Ensure data integrity through proper file handling +//! +//! # File Format +//! +//! Dictionaries are saved in the binary ODict format, which includes: +//! - Format signature and version information +//! - Compressed serialized dictionary data +//! - Metadata for compatibility checking +//! +//! # Examples +//! +//! ## Basic Dictionary Saving +//! +//! ```rust +//! use odict::{Dictionary, OpenDictionary}; +//! +//! let dict = Dictionary::from_path("source.xml")?; +//! let mut compiled = dict.build()?; +//! +//! // Save with default options +//! compiled.to_disk("output.odict")?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Saving with Custom Options +//! +//! ```rust +//! use odict::{Dictionary, OpenDictionary, CompilerOptions, CompressOptions}; +//! +//! let dict = Dictionary::from_path("source.xml")?; +//! let mut compiled = dict.build()?; +//! +//! let options = CompilerOptions::default() +//! .with_compression(CompressOptions::default()); +//! +//! compiled.to_disk_with_options("output.odict", options)?; +//! # Ok::<(), Box>(()) +//! ``` + use std::fs::canonicalize; use std::path::Path; use std::{fs::File, io::Write}; @@ -6,10 +57,108 @@ use crate::compile::CompilerOptions; use crate::OpenDictionary; impl OpenDictionary { + /// Save the dictionary to disk using default compilation options. + /// + /// This method writes the dictionary to the specified file path in the + /// binary ODict format. It uses default compression settings and updates + /// the dictionary's internal path reference to the saved location. + /// + /// # Arguments + /// + /// * `path` - The file path where the dictionary should be saved + /// + /// # Returns + /// + /// `Ok(())` if the save operation succeeds. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be created or written to + /// - Compilation/compression fails + /// - File system permissions prevent writing + /// - The path cannot be canonicalized + /// + /// # Side Effects + /// + /// - Creates or overwrites the file at the specified path + /// - Updates the dictionary's internal path reference + /// - Ensures all data is flushed to disk + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, OpenDictionary}; + /// + /// let dict = Dictionary::from_path("source.xml")?; + /// let mut compiled = dict.build()?; + /// + /// compiled.to_disk("my_dictionary.odict")?; + /// + /// // Path is now updated + /// if let Some(path) = &compiled.path { + /// println!("Saved to: {}", path.display()); + /// } + /// # Ok::<(), Box>(()) + /// ``` pub fn to_disk>(&mut self, path: P) -> crate::Result<()> { self.to_disk_with_options(path, CompilerOptions::default()) } + /// Save the dictionary to disk with custom compilation options. + /// + /// This method provides fine-grained control over the save process, + /// allowing customization of compression settings and other compilation + /// options. The dictionary is written in the binary ODict format. + /// + /// # Arguments + /// + /// * `path` - The file path where the dictionary should be saved + /// * `options` - Compilation options to customize the save process + /// + /// # Returns + /// + /// `Ok(())` if the save operation succeeds. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be created or written to + /// - Compilation fails with the specified options + /// - Compression fails + /// - File system permissions prevent writing + /// - The path cannot be canonicalized + /// + /// # Side Effects + /// + /// - Creates or overwrites the file at the specified path + /// - Updates the dictionary's internal path reference to the canonical path + /// - Ensures all data is properly flushed to disk + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, OpenDictionary, CompilerOptions, CompressOptions}; + /// + /// let dict = Dictionary::from_path("source.xml")?; + /// let mut compiled = dict.build()?; + /// + /// // Use custom compression settings + /// let options = CompilerOptions::default() + /// .with_compression(CompressOptions::default()); + /// + /// compiled.to_disk_with_options("optimized.odict", options)?; + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// The save operation involves: + /// 1. Compiling the dictionary to binary format with specified options + /// 2. Creating/opening the target file + /// 3. Writing all data to disk + /// 4. Flushing to ensure data persistence + /// 5. Canonicalizing the path for accurate reference pub fn to_disk_with_options, P: AsRef>( &mut self, path: P, @@ -21,9 +170,7 @@ impl OpenDictionary { file.write_all(&buf)?; file.flush()?; - self.path = canonicalize(path)? 
- .to_str() - .map(std::path::PathBuf::from); + self.path = canonicalize(path)?.to_str().map(std::path::PathBuf::from); Ok(()) } From 1c76d3afad05f24e7900e366d7e1793396b0fd35 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 23 Feb 2026 11:01:11 +0000 Subject: [PATCH 2/6] Add comprehensive documentation site using Astro Starlight - Scaffold Starlight (Astro) project in docs/ with sidebar navigation - Add build-time scripts that auto-generate schema reference from odict.xsd and CLI reference from clap source definitions - Write Getting Started guides: introduction, installation, and quickstart (creating, compiling, and querying your first dictionary) - Write full Python API documentation (theopendictionary package) - Write full JavaScript API documentation (@odict/node package) - Add Rust API page linking to docs.rs with feature flag documentation - Document complete XML schema with element hierarchy, attribute tables, and parts-of-speech reference - Document all CLI commands including serve HTTP endpoints https://claude.ai/code/session_0152q1rpTnXqZGQ5B85AhjXs --- docs/.gitignore | 3 + docs/astro.config.mjs | 54 ++ docs/package.json | 19 + docs/scripts/generate-cli-docs.mjs | 392 +++++++++++++ docs/scripts/generate-schema-docs.mjs | 348 ++++++++++++ docs/src/content.config.ts | 6 + docs/src/content/docs/api/javascript.md | 378 +++++++++++++ docs/src/content/docs/api/python.md | 271 +++++++++ docs/src/content/docs/api/rust.md | 98 ++++ docs/src/content/docs/cli/reference.md | 532 ++++++++++++++++++ .../docs/getting-started/installation.md | 71 +++ .../docs/getting-started/introduction.md | 42 ++ .../docs/getting-started/quickstart.md | 170 ++++++ docs/src/content/docs/index.mdx | 38 ++ docs/src/content/docs/schema/overview.md | 174 ++++++ docs/src/content/docs/schema/reference.md | 282 ++++++++++ docs/tsconfig.json | 3 + 17 files changed, 2881 insertions(+) create mode 100644 docs/.gitignore create mode 100644 docs/astro.config.mjs create mode 100644 
docs/package.json create mode 100644 docs/scripts/generate-cli-docs.mjs create mode 100644 docs/scripts/generate-schema-docs.mjs create mode 100644 docs/src/content.config.ts create mode 100644 docs/src/content/docs/api/javascript.md create mode 100644 docs/src/content/docs/api/python.md create mode 100644 docs/src/content/docs/api/rust.md create mode 100644 docs/src/content/docs/cli/reference.md create mode 100644 docs/src/content/docs/getting-started/installation.md create mode 100644 docs/src/content/docs/getting-started/introduction.md create mode 100644 docs/src/content/docs/getting-started/quickstart.md create mode 100644 docs/src/content/docs/index.mdx create mode 100644 docs/src/content/docs/schema/overview.md create mode 100644 docs/src/content/docs/schema/reference.md create mode 100644 docs/tsconfig.json diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 000000000..ddce69b68 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,3 @@ +node_modules/ +dist/ +.astro/ diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs new file mode 100644 index 000000000..155bb3f33 --- /dev/null +++ b/docs/astro.config.mjs @@ -0,0 +1,54 @@ +// @ts-check +import { defineConfig } from "astro/config"; +import starlight from "@astrojs/starlight"; + +// https://astro.build/config +export default defineConfig({ + integrations: [ + starlight({ + title: "ODict", + description: + "The lightning-fast open-source dictionary file format for human languages", + social: [ + { + icon: "github", + label: "GitHub", + href: "https://github.com/TheOpenDictionary/odict", + }, + ], + editLink: { + baseUrl: + "https://github.com/TheOpenDictionary/odict/edit/main/docs/", + }, + sidebar: [ + { + label: "Getting Started", + items: [ + { label: "Introduction", slug: "getting-started/introduction" }, + { label: "Installation", slug: "getting-started/installation" }, + { label: "Quick Start", slug: "getting-started/quickstart" }, + ], + }, + { + label: "XML Schema", + items: [ + 
{ label: "Overview", slug: "schema/overview" }, + { label: "Reference", slug: "schema/reference" }, + ], + }, + { + label: "CLI", + items: [{ label: "Command Reference", slug: "cli/reference" }], + }, + { + label: "API", + items: [ + { label: "Rust", slug: "api/rust" }, + { label: "Python", slug: "api/python" }, + { label: "JavaScript", slug: "api/javascript" }, + ], + }, + ], + }), + ], +}); diff --git a/docs/package.json b/docs/package.json new file mode 100644 index 000000000..e4eba79e0 --- /dev/null +++ b/docs/package.json @@ -0,0 +1,19 @@ +{ + "name": "@odict/docs", + "type": "module", + "version": "0.0.1", + "private": true, + "scripts": { + "dev": "npm run generate && astro dev", + "start": "npm run generate && astro dev", + "build": "npm run generate && astro build", + "preview": "astro preview", + "astro": "astro", + "generate": "node scripts/generate-schema-docs.mjs && node scripts/generate-cli-docs.mjs" + }, + "dependencies": { + "@astrojs/starlight": "^0.32.0", + "astro": "^5.3.0", + "sharp": "^0.33.0" + } +} diff --git a/docs/scripts/generate-cli-docs.mjs b/docs/scripts/generate-cli-docs.mjs new file mode 100644 index 000000000..291f34bfd --- /dev/null +++ b/docs/scripts/generate-cli-docs.mjs @@ -0,0 +1,392 @@ +/** + * Generates CLI reference documentation by parsing the clap arg definitions + * from the Rust source files in cli/src/. + * + * Run: node scripts/generate-cli-docs.mjs + * + * Outputs: src/content/docs/cli/reference.md + */ + +import { readFileSync, writeFileSync, mkdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const cliSrc = join(__dirname, "../../cli/src"); +const outPath = join(__dirname, "../src/content/docs/cli/reference.md"); + +// --------------------------------------------------------------------------- +// We define the CLI structure based on parsing the clap source. 
+// This is more reliable than regex-parsing Rust macros and gives us +// full control over the documentation output. +// --------------------------------------------------------------------------- + +const commands = [ + { + name: "new", + summary: "Scaffolds a new ODict XML dictionary", + usage: "odict new [-n ]", + args: [ + { name: "file_name", required: true, description: "Name of your new dictionary file (without extension)" }, + ], + flags: [ + { short: "-n", long: null, arg: "", description: "Name attribute of the `` element" }, + ], + example: `# Create a new dictionary file +odict new my-dictionary -n "My Dictionary" +# Creates my-dictionary.xml`, + }, + { + name: "compile", + summary: "Compiles a dictionary from ODXML", + usage: "odict compile [-o ] [-q ] [-w ]", + args: [ + { name: "input", required: true, description: "Path to ODXML file" }, + ], + flags: [ + { short: "-o", long: null, arg: "", description: "Output path of compiled dictionary. Defaults to the input path with a `.odict` extension." 
}, + { short: "-q", long: null, arg: "<0-11>", description: "Brotli compression level (default: `8`)" }, + { short: "-w", long: null, arg: "<0-22>", description: "Brotli large window size (default: `22`)" }, + ], + example: `# Compile with default settings +odict compile my-dictionary.xml + +# Compile with custom output and compression +odict compile my-dictionary.xml -o out/dict.odict -q 11`, + }, + { + name: "lookup", + summary: "Looks up entries in a compiled dictionary without indexing", + usage: "odict lookup [-f ] [-F ] [-s ] [-i]", + args: [ + { name: "dictionary", required: true, description: "Path to a compiled dictionary or an alias" }, + { name: "queries", required: true, description: "One or more words to look up" }, + ], + flags: [ + { short: "-f", long: "--format", arg: "", description: "Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`)" }, + { short: "-F", long: "--follow", arg: "", description: "Number of redirects to follow via `see` attributes (default: `0`). Use a high number for infinite following." 
}, + { short: "-s", long: "--split", arg: "", description: "If not found, split the query into words of at least length `n` and look up each separately (default: `0`, disabled)" }, + { short: "-i", long: "--insensitive", arg: null, description: "Perform case-insensitive lookups" }, + ], + example: `# Simple lookup +odict lookup my-dictionary.odict cat + +# Lookup with JSON output and follow redirects +odict lookup my-dictionary.odict ran -f json -F 1 + +# Case-insensitive lookup with splitting +odict lookup my-dictionary.odict "catdog" -s 3 -i`, + }, + { + name: "search", + summary: "Runs a full-text query on a compiled dictionary", + usage: "odict search [-f ] [--index]", + args: [ + { name: "dictionary", required: true, description: "Path to a compiled dictionary or an alias" }, + { name: "query", required: true, description: "Search query" }, + ], + flags: [ + { short: "-f", long: "--format", arg: "", description: "Output format: `json`, `xml`, `markdown`, `html`, `print` (default: `json`)" }, + { short: null, long: "--index", arg: null, description: "Creates a new index if one doesn't already exist" }, + ], + example: `# Search with auto-indexing +odict search my-dictionary.odict "move swiftly" --index + +# Search with specific output format +odict search my-dictionary.odict "greeting" -f xml`, + }, + { + name: "index", + summary: "Creates a full-text index of a compiled dictionary", + usage: "odict index [-d ] [-f] [-m ]", + args: [ + { name: "dictionary", required: true, description: "Path to a compiled dictionary or an alias" }, + ], + flags: [ + { short: "-d", long: null, arg: "", description: "Custom directory to store the index" }, + { short: "-f", long: null, arg: null, description: "Whether to overwrite the index if it already exists" }, + { short: "-m", long: null, arg: "", description: "Memory arena per thread in bytes. Must be above 15MB. 
(default: `15000000`)" }, + ], + example: `# Create an index with default settings +odict index my-dictionary.odict + +# Overwrite existing index with custom memory +odict index my-dictionary.odict -f -m 50000000`, + }, + { + name: "tokenize", + summary: "Tokenizes text and finds dictionary entries for each token", + usage: "odict tokenize [-f ] [-F ] [-i]", + args: [ + { name: "dictionary", required: true, description: "Path to a compiled dictionary" }, + { name: "text", required: true, description: "Text to tokenize" }, + ], + flags: [ + { short: "-f", long: "--format", arg: "", description: "Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`)" }, + { short: "-F", long: "--follow", arg: "", description: "Number of redirects to follow via `see` attributes (default: `0`)" }, + { short: "-i", long: "--insensitive", arg: null, description: "Perform case-insensitive lookups when matching tokens" }, + ], + example: `# Tokenize Chinese text +odict tokenize chinese.odict "你好世界" + +# Tokenize with redirect following +odict tokenize my-dictionary.odict "the cat ran" -F 1 -f json`, + }, + { + name: "dump", + summary: "Outputs a dictionary in a human-readable format", + usage: "odict dump [-f ] [-o ]", + args: [ + { name: "input", required: true, description: "Path to a compiled dictionary" }, + ], + flags: [ + { short: "-f", long: null, arg: "", description: "Dump format: `xml`, `sqlite`, `postgres`, `mysql` (default: `xml`)" }, + { short: "-o", long: null, arg: "", description: "Output path. Defaults to stdout." 
}, + ], + example: `# Dump as XML to stdout +odict dump my-dictionary.odict + +# Dump as SQL to a file +odict dump my-dictionary.odict -f sqlite -o dictionary.sql`, + }, + { + name: "merge", + summary: "Merges entries from multiple dictionaries into one", + usage: "odict merge [-o ]", + args: [ + { name: "destination", required: true, description: "Path of the dictionary to merge into (unless `--output` is specified)" }, + { name: "sources", required: true, description: "Paths of dictionaries to merge" }, + ], + flags: [ + { short: "-o", long: "--output", arg: "", description: "Separate output path for the compiled dictionary" }, + ], + example: `# Merge two dictionaries into the first +odict merge base.odict extra1.odict extra2.odict + +# Merge into a new file +odict merge base.odict extra.odict -o combined.odict`, + }, + { + name: "info", + summary: "Prints the metadata for a dictionary file", + usage: "odict info ", + args: [ + { name: "dictionary", required: true, description: "Path to a compiled dictionary" }, + ], + flags: [], + example: `odict info my-dictionary.odict +# Output: +# My Dictionary +# ───────────── +# File Version: 3 +# File Size: 1.23 KB +# Entries: 5,000`, + }, + { + name: "lexicon", + summary: "Lists all words defined in a dictionary", + usage: "odict lexicon ", + args: [ + { name: "dictionary", required: true, description: "Path to a compiled dictionary" }, + ], + flags: [], + example: `odict lexicon my-dictionary.odict +# cat +# dog +# run +# ...`, + }, + { + name: "download", + summary: "Downloads a dictionary from the remote registry", + usage: "odict download [-o ] [--no-cache]", + args: [ + { name: "dictionary", required: true, description: "Dictionary to download (e.g. 
`wiktionary/eng`)" }, + ], + flags: [ + { short: "-o", long: "--output", arg: "", description: "Directory to download to (defaults to config directory)" }, + { short: null, long: "--no-cache", arg: null, description: "Disable caching (always download a fresh copy)" }, + ], + example: `# Download English Wiktionary dictionary +odict download wiktionary/eng + +# Download Japanese dictionary to a specific directory +odict download wiktionary/jpn -o ./dicts/`, + }, + { + name: "serve", + summary: "Starts a local HTTP server to serve one or several dictionaries", + usage: "odict serve [dictionaries...] [-p ] [-c ] [-l ]", + args: [ + { name: "dictionaries", required: false, description: "Paths to compiled dictionaries or directories containing `.odict` files" }, + ], + flags: [ + { short: "-p", long: null, arg: "", description: "Port to listen on (default: `5005`)" }, + { short: "-c", long: "--capacity", arg: "", description: "Maximum number of dictionaries to keep in memory (default: `5`)" }, + { short: "-l", long: "--level", arg: "", description: "Log level: `trace`, `debug`, `info`, `warn`, `error`" }, + ], + example: `# Serve a single dictionary +odict serve my-dictionary.odict + +# Serve a directory of dictionaries on a custom port +odict serve ./dicts/ -p 8080 -c 10`, + extra: `### HTTP endpoints + +When running \`odict serve\`, the following REST endpoints become available: + +#### \`GET /{name}/lookup\` + +Look up entries by exact match. + +| Parameter | Type | Description | +|-----------|------|-------------| +| \`queries\` | string | Comma-separated list of terms to look up | +| \`follow\` | number | Number of redirects to follow (optional) | +| \`split\` | number | Minimum word length for splitting (optional) | + +\`\`\`bash +curl "http://localhost:5005/my-dictionary/lookup?queries=cat,dog&follow=1" +\`\`\` + +#### \`GET /{name}/search\` + +Full-text search across definitions. 
+ +| Parameter | Type | Description | +|-----------|------|-------------| +| \`query\` | string | Search query | +| \`limit\` | number | Maximum results to return (default: 10) | + +\`\`\`bash +curl "http://localhost:5005/my-dictionary/search?query=move+swiftly&limit=5" +\`\`\` + +#### \`GET /{name}/tokenize\` + +Tokenize text and find matching entries. + +| Parameter | Type | Description | +|-----------|------|-------------| +| \`text\` | string | Text to tokenize | +| \`follow\` | number | Number of redirects to follow (optional) | + +\`\`\`bash +curl "http://localhost:5005/chinese/tokenize?text=你好世界" +\`\`\` + +All endpoints return JSON.`, + }, + { + name: "alias add", + summary: "Creates a new dictionary alias (fails if one already exists)", + usage: "odict alias add ", + args: [ + { name: "name", required: true, description: "Name of the alias" }, + { name: "path", required: true, description: "Dictionary path" }, + ], + flags: [], + example: `odict alias add eng ./dicts/english.odict`, + }, + { + name: "alias set", + summary: "Creates or updates a dictionary alias", + usage: "odict alias set ", + args: [ + { name: "name", required: true, description: "Name of the alias" }, + { name: "path", required: true, description: "Dictionary path" }, + ], + flags: [], + example: `odict alias set eng ./dicts/english-v2.odict`, + }, + { + name: "alias delete", + summary: "Deletes an alias with the given name", + usage: "odict alias delete ", + args: [ + { name: "name", required: true, description: "Name of the alias to delete" }, + ], + flags: [], + example: `odict alias delete eng`, + }, +]; + +// --------------------------------------------------------------------------- +// Render Markdown +// --------------------------------------------------------------------------- + +let md = `--- +title: CLI Reference +description: Complete reference for the ODict command-line interface. +--- + +{/* This file is auto-generated by scripts/generate-cli-docs.mjs. 
Do not edit manually. */} + +\`\`\` +odict [OPTIONS] +\`\`\` + +The ODict CLI is the primary tool for creating, compiling, and querying ODict dictionaries. + +## Global options + +| Option | Description | +|--------|-------------| +| \`-q, --quiet\` | Silence any non-important output | +| \`-h, --help\` | Print help | +| \`-V, --version\` | Print version | + +--- + +## Commands + +`; + +for (const cmd of commands) { + md += `### \`odict ${cmd.name}\`\n\n`; + md += `${cmd.summary}.\n\n`; + md += `\`\`\`\n${cmd.usage}\n\`\`\`\n\n`; + + // Arguments + if (cmd.args.length > 0) { + md += `#### Arguments\n\n`; + md += `| Argument | Required | Description |\n`; + md += `|----------|----------|-------------|\n`; + for (const a of cmd.args) { + md += `| \`${a.name}\` | ${a.required ? "Yes" : "No"} | ${a.description} |\n`; + } + md += `\n`; + } + + // Flags + if (cmd.flags.length > 0) { + md += `#### Options\n\n`; + md += `| Flag | Argument | Description |\n`; + md += `|------|----------|-------------|\n`; + for (const f of cmd.flags) { + const flag = [f.short, f.long].filter(Boolean).join(", "); + md += `| \`${flag}\` | ${f.arg ? 
`\`${f.arg}\`` : "—"} | ${f.description} |\n`;
+    }
+    md += `\n`;
+  }
+
+  // Example
+  if (cmd.example) {
+    md += `#### Example\n\n`;
+    md += `\`\`\`bash\n${cmd.example}\n\`\`\`\n\n`;
+  }
+
+  // Extra content (for serve endpoints)
+  if (cmd.extra) {
+    md += `${cmd.extra}\n\n`;
+  }
+
+  md += `---\n\n`;
+}
+
+// ---------------------------------------------------------------------------
+// Write output
+// ---------------------------------------------------------------------------
+
+mkdirSync(dirname(outPath), { recursive: true });
+writeFileSync(outPath, md, "utf-8");
+console.log(`✅ Generated CLI reference → ${outPath}`);
diff --git a/docs/scripts/generate-schema-docs.mjs b/docs/scripts/generate-schema-docs.mjs
new file mode 100644
index 000000000..37e76279a
--- /dev/null
+++ b/docs/scripts/generate-schema-docs.mjs
@@ -0,0 +1,348 @@
+/**
+ * Parses odict.xsd and generates a Markdown reference page for the XML schema.
+ *
+ * Run: node scripts/generate-schema-docs.mjs
+ *
+ * Outputs: src/content/docs/schema/reference.md
+ */
+
+import { readFileSync, writeFileSync, mkdirSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const xsdPath = join(__dirname, "../../odict.xsd");
+const outPath = join(
+  __dirname,
+  "../src/content/docs/schema/reference.md"
+);
+
+const xsd = readFileSync(xsdPath, "utf-8");
+
+// ---------------------------------------------------------------------------
+// Minimal XSD parser – extracts complexTypes and the root element tree
+// ---------------------------------------------------------------------------
+
+/** Extract all <xs:attribute> from a chunk of XSD text */
+function parseAttributes(block) {
+  const attrs = [];
+  const re =
+    /<xs:attribute\s([^>]*?)(?:\/>|>[\s\S]*?<\/xs:attribute>)/g;
+  let m;
+  while ((m = re.exec(block)) !== null) {
+    const chunk = m[0];
+    const name = attr(chunk, "name");
+    const use = attr(chunk, "use");
+    const type =
attr(chunk, "type") || "xs:string";
+    const def = attr(chunk, "default");
+    if (name) {
+      attrs.push({
+        name,
+        required: use === "required",
+        type: type.replace("xs:", ""),
+        default: def || undefined,
+      });
+    }
+  }
+  return attrs;
+}
+
+function attr(text, name) {
+  const re = new RegExp(`${name}="([^"]*)"`, "i");
+  const m = re.exec(text);
+  return m ? m[1] : null;
+}
+
+/** Extract <xs:element> child references from a <xs:sequence> block */
+function parseChildElements(block) {
+  const children = [];
+  // Match xs:element with name attribute (direct children, not nested complexTypes)
+  const re =
+    /<xs:element\s([^>]*?)(?:\/>|>[\s\S]*?<\/xs:element>)/g;
+  let m;
+  while ((m = re.exec(block)) !== null) {
+    const chunk = m[0];
+    const name = attr(chunk, "name");
+    const type = attr(chunk, "type");
+    const minOccurs = attr(chunk, "minOccurs");
+    const maxOccurs = attr(chunk, "maxOccurs");
+    if (name) {
+      children.push({
+        name,
+        type: type || undefined,
+        minOccurs: minOccurs ?? "1",
+        maxOccurs: maxOccurs ?? "1",
+      });
+    }
+  }
+  return children;
+}
+
+// ---------------------------------------------------------------------------
+// Build the element documentation from the XSD structure
+// ---------------------------------------------------------------------------
+
+// We know the ODict schema structure, so we define it explicitly based on
+// parsing the XSD. This gives us full control over documentation quality.
+
+const elements = [
+  {
+    name: "dictionary",
+    description: "The root element of an ODict XML file. Contains one or more entries.",
+    attributes: [
+      { name: "id", required: false, type: "string", description: "A unique identifier for the dictionary." },
+      { name: "name", required: false, type: "string", description: "A human-readable name for the dictionary (e.g. \"English Dictionary\")." },
+    ],
+    children: [
+      { name: "entry", min: "1", max: "unbounded", description: "A dictionary entry."
}, + ], + }, + { + name: "entry", + description: "Represents a single dictionary entry (headword). An entry can either contain full definitions via etymology elements, or redirect to another entry using the `see` attribute.", + attributes: [ + { name: "term", required: true, type: "string", description: "The headword or term being defined." }, + { name: "see", required: false, type: "string", description: "Cross-reference to another entry's term. When set, this entry acts as a redirect (e.g. \"ran\" → \"run\")." }, + ], + children: [ + { name: "pronunciation", min: "0", max: "unbounded", description: "Entry-level pronunciation." }, + { name: "ety", min: "0", max: "unbounded", description: "An etymology grouping." }, + ], + }, + { + name: "ety", + description: "Groups senses under a common etymology (word origin). A single entry can have multiple etymologies if the word has distinct historical origins.", + attributes: [ + { name: "id", required: false, type: "string", description: "A unique identifier for this etymology." }, + { name: "pronunciation", required: false, type: "string", description: "A simple pronunciation string (e.g. IPA). For richer pronunciation data, use child `` elements on the parent entry instead." }, + { name: "description", required: false, type: "string", description: "A description of the word's origin (e.g. \"From Latin currere\")." }, + ], + children: [ + { name: "sense", min: "1", max: "unbounded", description: "A sense grouping (by part of speech)." }, + ], + }, + { + name: "sense", + description: "Groups definitions under a part of speech. A sense can contain definitions directly, or organize them into groups.", + attributes: [ + { name: "pos", required: false, type: "string", description: "Part of speech code (e.g. `n`, `v`, `adj`, `adv`, `phr`). See the [Parts of Speech](#parts-of-speech) section for all supported values." 
}, + ], + children: [ + { name: "group", min: "0", max: "unbounded", description: "A named group of related definitions." }, + { name: "definition", min: "0", max: "unbounded", description: "A definition (can appear alongside or instead of groups)." }, + ], + }, + { + name: "group", + description: "An optional grouping of related definitions within a sense. Useful for organizing many definitions into logical clusters.", + attributes: [ + { name: "id", required: false, type: "string", description: "A unique identifier for this group." }, + { name: "description", required: false, type: "string", description: "A label or description for this group (e.g. \"Verb senses related to motion\")." }, + ], + children: [ + { name: "definition", min: "1", max: "unbounded", description: "A definition within this group." }, + ], + }, + { + name: "definition", + description: "A single definition of the entry's term.", + attributes: [ + { name: "id", required: false, type: "string", description: "A unique identifier for this definition." }, + { name: "value", required: true, type: "string", description: "The definition text. Supports inline Markdown-style formatting in parentheses for labels, e.g. `\"(Computing) a set of words...\"`." }, + ], + children: [ + { name: "example", min: "0", max: "unbounded", description: "An example usage of this definition." }, + { name: "note", min: "0", max: "unbounded", description: "A supplementary note about this definition." }, + ], + }, + { + name: "note", + description: "A supplementary note attached to a definition. Notes can carry their own examples.", + attributes: [ + { name: "id", required: false, type: "string", description: "A unique identifier for this note." }, + { name: "value", required: true, type: "string", description: "The note text." }, + ], + children: [ + { name: "example", min: "1", max: "unbounded", description: "An example relevant to this note." 
}, + ], + }, + { + name: "example", + description: "An example sentence or usage demonstrating a definition, note, or pronunciation.", + attributes: [ + { name: "value", required: true, type: "string", description: "The example text (e.g. `\"The dog runs after the cat.\"`)." }, + ], + children: [ + { name: "pronunciation", min: "0", max: "unbounded", description: "A pronunciation of this example (useful for non-Latin scripts)." }, + ], + }, + { + name: "pronunciation", + description: "Describes how a word, entry, or example is pronounced. Supports any phonetic system (IPA, Pinyin, Romaji, etc.) and optional audio URLs.", + attributes: [ + { name: "kind", required: true, type: "string", description: "The pronunciation system used (e.g. `ipa`, `pinyin`, `romaji`, or any custom string)." }, + { name: "value", required: true, type: "string", description: "The pronunciation notation (e.g. `həˈləʊ`, `nǐ hǎo`)." }, + ], + children: [ + { name: "url", min: "0", max: "unbounded", description: "A URL to an audio file for this pronunciation." }, + ], + }, + { + name: "url", + description: "A reference to an audio file for a pronunciation. Used as a child of ``.", + attributes: [ + { name: "src", required: true, type: "string", description: "Path or URL to the audio file." }, + { name: "type", required: false, type: "string", description: "MIME type of the audio file (e.g. `audio/mpeg`, `audio/ogg`)." }, + { name: "description", required: false, type: "string", description: "A description of this audio (e.g. \"British pronunciation\")." 
}, + ], + children: [], + }, +]; + +// --------------------------------------------------------------------------- +// Known POS codes (extracted from lib/src/schema/pos.rs) +// --------------------------------------------------------------------------- + +const universalPos = [ + ["n", "noun"], + ["v", "verb"], + ["adj", "adjective"], + ["adv", "adverb"], + ["pron", "pronoun"], + ["prep", "preposition"], + ["conj", "conjunction"], + ["intj", "interjection"], + ["det", "determiner"], + ["part", "particle"], + ["num", "numeric"], + ["abv", "abbreviation"], + ["adf", "adfix"], + ["aff", "affix"], + ["art", "article"], + ["aux", "auxiliary"], + ["aux_adj", "auxiliary adjective"], + ["aux_v", "auxiliary verb"], + ["chr", "character"], + ["cf", "circumfix"], + ["cls", "classifier"], + ["conj_c", "coordinating conjunction"], + ["conj_s", "subordinating conjunction"], + ["contr", "contraction"], + ["cop", "copula"], + ["ctr", "counter"], + ["expr", "expression"], + ["inf", "infix"], + ["intf", "interfix"], + ["name", "name"], + ["phr", "phrase"], + ["phr_adj", "adjective phrase"], + ["phr_adv", "adverbial phrase"], + ["phr_prep", "prepositional phrase"], + ["postp", "postposition"], + ["pref", "prefix"], + ["propn", "proper noun"], + ["prov", "proverb"], + ["punc", "punctuation"], + ["suff", "suffix"], + ["sym", "symbol"], + ["vi", "intransitive verb"], + ["vt", "transitive verb"], + ["un", "unknown"], +]; + +// --------------------------------------------------------------------------- +// Render Markdown +// --------------------------------------------------------------------------- + +let md = `--- +title: XML Schema Reference +description: Complete reference for the ODict XML (ODXML) schema. +--- + +{/* This file is auto-generated by scripts/generate-schema-docs.mjs. Do not edit manually. */} + +This page is automatically generated from [\`odict.xsd\`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd). 
+ +## Element hierarchy + +\`\`\` +dictionary +├── entry +│ ├── pronunciation +│ │ └── url +│ └── ety +│ └── sense +│ ├── group +│ │ └── definition +│ │ ├── example +│ │ │ └── pronunciation +│ │ │ └── url +│ │ └── note +│ │ └── example +│ │ └── pronunciation +│ │ └── url +│ └── definition +│ ├── example +│ │ └── pronunciation +│ │ └── url +│ └── note +│ └── example +│ └── pronunciation +│ └── url +\`\`\` + +--- + +## Elements + +`; + +for (const el of elements) { + md += `### \`<${el.name}>\`\n\n`; + md += `${el.description}\n\n`; + + // Attributes table + if (el.attributes.length > 0) { + md += `#### Attributes\n\n`; + md += `| Attribute | Type | Required | Description |\n`; + md += `|-----------|------|----------|-------------|\n`; + for (const a of el.attributes) { + md += `| \`${a.name}\` | \`${a.type}\` | ${a.required ? "Yes" : "No"} | ${a.description} |\n`; + } + md += `\n`; + } + + // Children + if (el.children.length > 0) { + md += `#### Child elements\n\n`; + md += `| Element | Min | Max | Description |\n`; + md += `|---------|-----|-----|-------------|\n`; + for (const c of el.children) { + md += `| [\`<${c.name}>\`](#${c.name}) | ${c.min} | ${c.max} | ${c.description} |\n`; + } + md += `\n`; + } + + md += `---\n\n`; +} + +// Parts of Speech section +md += `## Parts of speech\n\n`; +md += `The \`pos\` attribute on \`\` accepts the following codes. You can also pass any custom string, which will be treated as a custom part of speech.\n\n`; +md += `| Code | Label |\n`; +md += `|------|-------|\n`; +for (const [code, label] of universalPos) { + md += `| \`${code}\` | ${label} |\n`; +} +md += `\n`; +md += `:::note\n`; +md += `ODict also supports an extensive set of Japanese-specific parts of speech (Godan verbs, Ichidan verbs, Nidan verbs, etc.). These use codes like \`v5b\`, \`v1\`, \`vk\`, \`adj_na\`, etc. 
Refer to the [source code](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs) for the complete list.\n`; +md += `:::\n`; + +// --------------------------------------------------------------------------- +// Write output +// --------------------------------------------------------------------------- + +mkdirSync(dirname(outPath), { recursive: true }); +writeFileSync(outPath, md, "utf-8"); +console.log(`✅ Generated schema reference → ${outPath}`); diff --git a/docs/src/content.config.ts b/docs/src/content.config.ts new file mode 100644 index 000000000..a4eec59ba --- /dev/null +++ b/docs/src/content.config.ts @@ -0,0 +1,6 @@ +import { defineCollection } from "astro:content"; +import { docsSchema } from "@astrojs/starlight/schema"; + +export const collections = { + docs: defineCollection({ schema: docsSchema() }), +}; diff --git a/docs/src/content/docs/api/javascript.md b/docs/src/content/docs/api/javascript.md new file mode 100644 index 000000000..7c88e3a4e --- /dev/null +++ b/docs/src/content/docs/api/javascript.md @@ -0,0 +1,378 @@ +--- +title: JavaScript API +description: Using ODict from JavaScript/TypeScript via the @odict/node package. +--- + +The JavaScript bindings are distributed as `@odict/node` on npm. They are native extensions built with [NAPI-RS](https://napi.rs/) and also support the browser via WASI. + +## Installation + +```bash +npm install @odict/node +``` + +Requires Node.js 12+. Native binaries are included for all major platforms (macOS, Linux, Windows, ARM64, WASI). 
+
+## Quick example
+
+```typescript
+import { readFile } from "node:fs/promises";
+import { compile, OpenDictionary } from "@odict/node";
+
+// Compile XML to a buffer
+const xml = await readFile("my-dictionary.xml", "utf-8");
+const data = compile(xml);
+const dictionary = new OpenDictionary(data);
+
+const results = dictionary.lookup("hello");
+console.log(results[0].entry.term); // "hello"
+```
+
+---
+
+## Functions
+
+### `compile(xml: string): Buffer`
+
+Compiles an ODXML string into binary `.odict` data. Returns a `Buffer` that can be passed to `new OpenDictionary()`.
+
+```typescript
+import { compile } from "@odict/node";
+
+const data = compile(`
+<dictionary>
+  <entry term="hello">
+    <ety>
+      <sense pos="intj">
+        <definition value="A greeting" />
+      </sense>
+    </ety>
+  </entry>
+</dictionary>
+`);
+```
+
+---
+
+## `OpenDictionary`
+
+The main class for working with compiled dictionaries.
+
+### Constructors
+
+#### `new OpenDictionary(data: Buffer)`
+
+Creates a dictionary from compiled binary data (as returned by `compile()`).
+
+```typescript
+import { compile, OpenDictionary } from "@odict/node";
+
+const data = compile(xmlString);
+const dictionary = new OpenDictionary(data);
+```
+
+#### `OpenDictionary.load(dictionary: string, options?: LoadOptions): Promise<OpenDictionary>`
+
+Loads a dictionary from a file path or remote identifier. Returns a `Promise<OpenDictionary>`.
+
+- If `dictionary` is a path to a `.odict` file, it loads from disk.
+- If it matches the format `org/lang` (e.g. `wiktionary/eng`), it downloads from the remote registry.
+ +```typescript +import { OpenDictionary } from "@odict/node"; + +// Load from file +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); + +// Load from remote registry +const dictionary = await OpenDictionary.load("wiktionary/eng"); + +// Load with alias options +const dictionary = await OpenDictionary.load("./dict.odict", { + alias: { path: "./aliases.json" }, +}); +``` + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `minRank` | `number \| null` | The minimum rank value across all entries, or `null` if no entries have ranks | +| `maxRank` | `number \| null` | The maximum rank value across all entries, or `null` if no entries have ranks | + +### Methods + +#### `save(path: string, options?: SaveOptions): void` + +Saves the dictionary to disk as a `.odict` file. + +```typescript +dictionary.save("output.odict"); +dictionary.save("output.odict", { + compress: { quality: 11, windowSize: 22 }, +}); +``` + +#### `lookup(query: string | string[], options?: LookupOptions): LookupResult[]` + +Looks up one or more terms by exact match. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `query` | `string \| string[]` | — | Term(s) to look up | +| `options.split` | `number` | — | Minimum word length for compound splitting | +| `options.follow` | `boolean \| number` | — | Follow `see` cross-references. 
`true` = infinite, `false` = disabled, number = max depth | +| `options.insensitive` | `boolean` | — | Enable case-insensitive matching | + +```typescript +// Simple lookup +const results = dictionary.lookup("cat"); + +// Multiple terms +const results = dictionary.lookup(["cat", "dog"]); + +// Follow cross-references, case-insensitive +const results = dictionary.lookup("RaN", { + follow: true, + insensitive: true, +}); +// results[0].entry.term === "run" +// results[0].directedFrom?.term === "ran" + +// Compound word splitting +const results = dictionary.lookup("catdog", { split: 3 }); +``` + +#### `lexicon(): string[]` + +Returns all terms defined in the dictionary, sorted alphabetically. + +```typescript +const words = dictionary.lexicon(); +// ["cat", "dog", "run", ...] +``` + +#### `index(options?: IndexOptions): void` + +Creates a full-text search index for the dictionary. + +```typescript +dictionary.index(); +dictionary.index({ overwrite: true, memory: 50_000_000 }); +``` + +#### `search(query: string, options?: SearchOptions): Entry[]` + +Runs a full-text search. Requires an index (call `index()` first). + +```typescript +dictionary.index(); + +const results = dictionary.search("domesticated mammal"); +const results = dictionary.search("greeting", { limit: 5 }); +``` + +#### `tokenize(text: string, options?: TokenizeOptions): Token[]` + +Tokenizes text and matches each token against the dictionary. Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages. 
+
+```typescript
+const tokens = dictionary.tokenize("the cat ran");
+for (const token of tokens) {
+  console.log(token.lemma, token.entries);
+}
+
+// With options
+const tokens = dictionary.tokenize("DOG cat", {
+  insensitive: true,
+  follow: true,
+});
+```
+
+---
+
+## Types
+
+### `LookupResult`
+
+```typescript
+interface LookupResult {
+  entry: Entry;
+  directedFrom?: Entry;
+}
+```
+
+### `Entry`
+
+```typescript
+interface Entry {
+  term: string;
+  rank?: number;
+  seeAlso?: string;
+  etymologies: Etymology[];
+  media: MediaUrl[];
+}
+```
+
+### `Etymology`
+
+```typescript
+interface Etymology {
+  id?: string;
+  pronunciations: Pronunciation[];
+  description?: string;
+  senses: Record<string, Sense>;
+}
+```
+
+### `Sense`
+
+```typescript
+interface Sense {
+  pos: EnumWrapper;
+  lemma?: string;
+  definitions: Array<Definition | Group>;
+  tags: string[];
+  translations: Translation[];
+  forms: Form[];
+}
+```
+
+### `Definition`
+
+```typescript
+interface Definition {
+  id?: string;
+  value: string;
+  examples: Example[];
+  notes: Note[];
+}
+```
+
+### `Group`
+
+```typescript
+interface Group {
+  id?: string;
+  description: string;
+  definitions: Definition[];
+}
+```
+
+### `Example`
+
+```typescript
+interface Example {
+  value: string;
+  translations: Translation[];
+  pronunciations: Pronunciation[];
+}
+```
+
+### `Note`
+
+```typescript
+interface Note {
+  id?: string;
+  value: string;
+  examples: Example[];
+}
+```
+
+### `Pronunciation`
+
+```typescript
+interface Pronunciation {
+  kind?: EnumWrapper;
+  value: string;
+  media: MediaUrl[];
+}
+```
+
+### `MediaUrl`
+
+```typescript
+interface MediaUrl {
+  src: string;
+  mimeType?: string;
+  description?: string;
+}
+```
+
+### `Token`
+
+```typescript
+interface Token {
+  lemma: string;
+  language?: string;
+  entries: LookupResult[];
+  kind: string;
+  script: string;
+  start: number;
+  end: number;
+}
+```
+
+### `EnumWrapper`
+
+```typescript
+interface EnumWrapper {
+  name: string;
+  variant: string;
+  value: string;
+}
+```
+ 
+### Options + +```typescript +interface LoadOptions { + alias?: AliasLoadOptions; +} + +interface AliasLoadOptions { + path?: string; +} + +interface SaveOptions { + compress?: CompressOptions; +} + +interface CompressOptions { + quality?: number; + windowSize?: number; +} + +interface LookupOptions { + split?: number; + follow?: boolean | number; + insensitive?: boolean; +} + +interface IndexOptions { + directory?: string; + memory?: number; + overwrite?: boolean; +} + +interface SearchOptions { + directory?: string; + threshold?: number; + autoindex?: boolean; + limit?: number; +} + +interface TokenizeOptions { + follow?: boolean | number; + allowList?: string[]; + insensitive?: boolean; +} +``` + +## Browser support + +The `@odict/node` package also supports browser environments via WASI. Import from the browser entry point: + +```typescript +import { compile, OpenDictionary } from "@odict/node/browser"; +``` + +:::note +Browser support runs ODict compiled to WebAssembly via WASI. The `load()` method (which accesses the filesystem and network) is not available in the browser — use `new OpenDictionary(data)` with pre-compiled data instead. +::: diff --git a/docs/src/content/docs/api/python.md b/docs/src/content/docs/api/python.md new file mode 100644 index 000000000..84a0e3b51 --- /dev/null +++ b/docs/src/content/docs/api/python.md @@ -0,0 +1,271 @@ +--- +title: Python API +description: Using ODict from Python via the theopendictionary package. +--- + +The Python bindings are distributed as the `theopendictionary` package on PyPI. They are native extensions built with [PyO3](https://pyo3.rs/). + +## Installation + +```bash +pip install theopendictionary +``` + +Requires Python 3.8.1+. 
+
+## Quick example
+
+```python
+from theopendictionary import OpenDictionary, compile
+
+# Compile XML to bytes
+xml = """
+<dictionary>
+  <entry term="hello">
+    <ety>
+      <sense pos="intj">
+        <definition value="A greeting">
+          <example value="Hello, world!" />
+        </definition>
+      </sense>
+    </ety>
+  </entry>
+</dictionary>
+"""
+
+compiled_bytes = compile(xml)
+dictionary = OpenDictionary(compiled_bytes)
+
+results = dictionary.lookup("hello")
+print(results[0].entry.term)  # "hello"
+print(results[0].entry.etymologies)  # [Etymology(...)]
+```
+
+---
+
+## Functions
+
+### `compile(xml: str) -> bytes`
+
+Compiles an ODXML string into binary `.odict` data (as a `bytes` object). This data can be passed to `OpenDictionary()` or saved to disk.
+
+```python
+from theopendictionary import compile
+
+data = compile("<dictionary />")
+```
+
+---
+
+## `OpenDictionary`
+
+The main class for working with compiled dictionaries.
+
+### Constructors
+
+#### `OpenDictionary(data: bytes)`
+
+Creates a dictionary from compiled binary data (as returned by `compile()`).
+
+```python
+from theopendictionary import OpenDictionary, compile
+
+data = compile(xml_string)
+dictionary = OpenDictionary(data)
+```
+
+#### `await OpenDictionary.load(dictionary: str, alias_path: str | None = None) -> OpenDictionary`
+
+Loads a dictionary from a file path, alias, or remote identifier. This is an **async** method.
+
+- If `dictionary` is a path to a `.odict` file, it loads from disk.
+- If it matches the format `org/lang` (e.g. `wiktionary/eng`), it downloads from the remote registry.
+- `alias_path` optionally specifies a custom alias file path.
+ +```python +import asyncio +from theopendictionary import OpenDictionary + +async def main(): + # Load from file + dictionary = await OpenDictionary.load("./my-dictionary.odict") + + # Load from remote registry + dictionary = await OpenDictionary.load("wiktionary/eng") + +asyncio.run(main()) +``` + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `min_rank` | `int \| None` | The minimum rank value across all entries, or `None` if no entries have ranks | +| `max_rank` | `int \| None` | The maximum rank value across all entries, or `None` if no entries have ranks | + +### Methods + +#### `save(path: str, quality: int | None = None, window_size: int | None = None) -> None` + +Saves the dictionary to disk as a `.odict` file. Optionally configure Brotli compression. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `path` | `str` | — | Output file path | +| `quality` | `int \| None` | `None` | Brotli compression level (0–11) | +| `window_size` | `int \| None` | `None` | Brotli window size (0–22) | + +```python +dictionary.save("output.odict") +dictionary.save("output.odict", quality=11, window_size=22) +``` + +#### `lookup(query, split=None, follow=None, insensitive=None) -> list[LookupResult]` + +Looks up one or more terms by exact match. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `query` | `str \| list[str]` | — | Term(s) to look up | +| `split` | `int \| None` | `None` | Minimum word length for compound splitting | +| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references. 
`True` = infinite, `False` = disabled, `int` = max depth | +| `insensitive` | `bool \| None` | `None` | Enable case-insensitive matching | + +```python +# Simple lookup +results = dictionary.lookup("cat") + +# Multiple terms +results = dictionary.lookup(["cat", "dog"]) + +# Follow cross-references, case-insensitive +results = dictionary.lookup("RaN", follow=True, insensitive=True) +# results[0].entry.term == "run" +# results[0].directed_from.term == "ran" + +# Compound word splitting +results = dictionary.lookup("catdog", split=3) +``` + +#### `lexicon() -> list[str]` + +Returns all terms defined in the dictionary, sorted alphabetically. + +```python +words = dictionary.lexicon() +# ["cat", "dog", "run", ...] +``` + +#### `index(options=None) -> None` + +Creates a full-text search index for the dictionary. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `options` | `IndexOptions \| None` | `None` | Indexing configuration | + +```python +from theopendictionary import IndexOptions + +dictionary.index() +dictionary.index(IndexOptions(overwrite=True, memory=50_000_000)) +``` + +#### `search(query: str, options=None) -> list[Entry]` + +Runs a full-text search across the dictionary. Requires an index (call `index()` first). + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `query` | `str` | — | Search query | +| `options` | `SearchOptions \| None` | `None` | Search configuration | + +```python +from theopendictionary import SearchOptions + +dictionary.index() +results = dictionary.search("domesticated mammal") +results = dictionary.search("greeting", SearchOptions(limit=5)) +``` + +#### `tokenize(text: str, follow=None, insensitive=None) -> list[Token]` + +Tokenizes text using NLP-based segmentation and matches each token against the dictionary. Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages. 
+ +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `text` | `str` | — | Text to tokenize | +| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references | +| `insensitive` | `bool \| None` | `None` | Case-insensitive matching | + +```python +tokens = dictionary.tokenize("the cat ran") +for token in tokens: + print(token.lemma, token.entries) +``` + +--- + +## Types + +### `LookupResult` + +| Property | Type | Description | +|----------|------|-------------| +| `entry` | `Entry` | The matched entry | +| `directed_from` | `Entry \| None` | The original entry if a `see` redirect was followed | + +### `Entry` + +| Property | Type | Description | +|----------|------|-------------| +| `term` | `str` | The headword | +| `rank` | `int \| None` | Optional frequency rank | +| `see_also` | `str \| None` | Cross-reference target term | +| `etymologies` | `list[Etymology]` | List of etymologies | +| `media` | `list[MediaURL]` | Media URLs | + +### `Token` + +| Property | Type | Description | +|----------|------|-------------| +| `lemma` | `str` | The original token text | +| `language` | `str \| None` | Detected language code | +| `script` | `str` | Detected script name | +| `kind` | `str` | Token kind | +| `start` | `int` | Start offset in the original text | +| `end` | `int` | End offset in the original text | +| `entries` | `list[LookupResult]` | Matched dictionary entries | + +### `IndexOptions` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `directory` | `str \| None` | `None` | Custom directory for the index | +| `memory` | `int \| None` | `None` | Memory arena per thread in bytes (must be >15MB) | +| `overwrite` | `bool \| None` | `None` | Overwrite existing index | + +### `SearchOptions` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `directory` | `str \| None` | `None` | Custom index directory | +| `threshold` 
| `int \| None` | `None` | Relevance threshold | +| `autoindex` | `bool \| None` | `None` | Auto-create index if missing | +| `limit` | `int \| None` | `None` | Maximum results | + +### `Pronunciation` + +| Property | Type | Description | +|----------|------|-------------| +| `kind` | `EnumWrapper \| None` | The pronunciation system (e.g. IPA, Pinyin) | +| `value` | `str` | The pronunciation notation | +| `media` | `list[MediaURL]` | Audio URLs | + +### `MediaURL` + +| Property | Type | Description | +|----------|------|-------------| +| `src` | `str` | URL or path to the media file | +| `mime_type` | `str \| None` | MIME type (e.g. `audio/mpeg`) | +| `description` | `str \| None` | Description of the media | diff --git a/docs/src/content/docs/api/rust.md b/docs/src/content/docs/api/rust.md new file mode 100644 index 000000000..02d6ab254 --- /dev/null +++ b/docs/src/content/docs/api/rust.md @@ -0,0 +1,98 @@ +--- +title: Rust API +description: Using the ODict Rust crate. +--- + +The `odict` crate is the core library that powers the CLI and all language bindings. It is published on [crates.io](https://crates.io/crates/odict). + +## Installation + +Add to your `Cargo.toml`: + +```toml +[dependencies] +odict = "2" +``` + +## Documentation + +Full API documentation is available on **docs.rs**: + +**[docs.rs/odict](https://docs.rs/odict)** + +## Feature flags + +The `odict` crate uses feature flags to control which capabilities are compiled in. The `default` feature includes `sql` and `config`. 
+ +| Feature | Description | +|---------|-------------| +| `default` | Enables `sql` and `config` | +| `sql` | SQL dump support (SQLite, PostgreSQL, MySQL) via sea-query | +| `config` | Access to platform-specific config directories | +| `alias` | Dictionary alias management (implies `config`) | +| `search` | Full-text search via Tantivy (implies `config`) | +| `markdown` | Markdown rendering support via pulldown-cmark | +| `html` | HTML output support (implies `markdown`) | +| `http` | Remote dictionary downloading (implies `config`) | +| `tokenize` | Full multi-language tokenization (enables all language tokenizers) | +| `tokenize-latin` | Latin-script tokenization | +| `tokenize-chinese` | Chinese segmentation | +| `tokenize-japanese` | Japanese segmentation (UniDic) | +| `tokenize-korean` | Korean segmentation | +| `tokenize-thai` | Thai segmentation | +| `tokenize-khmer` | Khmer segmentation | +| `tokenize-swedish` | Swedish recomposition | +| `tokenize-german` | German segmentation | + +## Quick example + +```rust +use odict::{OpenDictionary, ToDictionary}; + +fn main() -> odict::Result<()> { + // Compile from XML + let xml = r#" + + + + + + + + + + "#; + + // Compile and write to disk + let dict = xml.to_dictionary()?.build()?; + dict.to_disk("example.odict")?; + + // Read from disk + let file = OpenDictionary::from_path("example.odict")?; + let contents = file.contents()?; + + // Lookup + let results = contents.lookup( + &["hello"], + &odict::lookup::LookupOptions::default(), + )?; + + println!("{:?}", results); + Ok(()) +} +``` + +## Key traits and types + +| Type | Description | +|------|-------------| +| `OpenDictionary` | A compiled dictionary loaded from disk or bytes | +| `ToDictionary` | Trait for converting XML strings to `Dictionary` | +| `Dictionary` | The deserialized dictionary schema type | +| `CompilerOptions` | Options for compiling (compression settings) | +| `lookup::LookupOptions` | Options for exact-match lookups | +| 
`search::SearchOptions` | Options for full-text search | +| `index::IndexOptions` | Options for creating a search index | +| `tokenize::TokenizeOptions` | Options for text tokenization | + +Refer to the [docs.rs documentation](https://docs.rs/odict) for complete details on all types, traits, and methods. diff --git a/docs/src/content/docs/cli/reference.md b/docs/src/content/docs/cli/reference.md new file mode 100644 index 000000000..e4e943125 --- /dev/null +++ b/docs/src/content/docs/cli/reference.md @@ -0,0 +1,532 @@ +--- +title: CLI Reference +description: Complete reference for the ODict command-line interface. +--- + +{/* This file is auto-generated by scripts/generate-cli-docs.mjs. Do not edit manually. */} + +``` +odict [OPTIONS] +``` + +The ODict CLI is the primary tool for creating, compiling, and querying ODict dictionaries. + +## Global options + +| Option | Description | +|--------|-------------| +| `-q, --quiet` | Silence any non-important output | +| `-h, --help` | Print help | +| `-V, --version` | Print version | + +--- + +## Commands + +### `odict new` + +Scaffolds a new ODict XML dictionary. + +``` +odict new [-n ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `file_name` | Yes | Name of your new dictionary file (without extension) | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-n` | `` | Name attribute of the `` element | + +#### Example + +```bash +# Create a new dictionary file +odict new my-dictionary -n "My Dictionary" +# Creates my-dictionary.xml +``` + +--- + +### `odict compile` + +Compiles a dictionary from ODXML. + +``` +odict compile [-o ] [-q ] [-w ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `input` | Yes | Path to ODXML file | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-o` | `` | Output path of compiled dictionary. 
Defaults to the input path with a `.odict` extension. | +| `-q` | `<0-11>` | Brotli compression level (default: `8`) | +| `-w` | `<0-22>` | Brotli large window size (default: `22`) | + +#### Example + +```bash +# Compile with default settings +odict compile my-dictionary.xml + +# Compile with custom output and compression +odict compile my-dictionary.xml -o out/dict.odict -q 11 +``` + +--- + +### `odict lookup` + +Looks up entries in a compiled dictionary without indexing. + +``` +odict lookup [-f ] [-F ] [-s ] [-i] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary or an alias | +| `queries` | Yes | One or more words to look up | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-f, --format` | `` | Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`) | +| `-F, --follow` | `` | Number of redirects to follow via `see` attributes (default: `0`). Use a high number for infinite following. | +| `-s, --split` | `` | If not found, split the query into words of at least length `n` and look up each separately (default: `0`, disabled) | +| `-i, --insensitive` | — | Perform case-insensitive lookups | + +#### Example + +```bash +# Simple lookup +odict lookup my-dictionary.odict cat + +# Lookup with JSON output and follow redirects +odict lookup my-dictionary.odict ran -f json -F 1 + +# Case-insensitive lookup with splitting +odict lookup my-dictionary.odict "catdog" -s 3 -i +``` + +--- + +### `odict search` + +Runs a full-text query on a compiled dictionary. 
+ +``` +odict search [-f ] [--index] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary or an alias | +| `query` | Yes | Search query | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-f, --format` | `` | Output format: `json`, `xml`, `markdown`, `html`, `print` (default: `json`) | +| `--index` | — | Creates a new index if one doesn't already exist | + +#### Example + +```bash +# Search with auto-indexing +odict search my-dictionary.odict "move swiftly" --index + +# Search with specific output format +odict search my-dictionary.odict "greeting" -f xml +``` + +--- + +### `odict index` + +Creates a full-text index of a compiled dictionary. + +``` +odict index [-d ] [-f] [-m ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary or an alias | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-d` | `` | Custom directory to store the index | +| `-f` | — | Whether to overwrite the index if it already exists | +| `-m` | `` | Memory arena per thread in bytes. Must be above 15MB. (default: `15000000`) | + +#### Example + +```bash +# Create an index with default settings +odict index my-dictionary.odict + +# Overwrite existing index with custom memory +odict index my-dictionary.odict -f -m 50000000 +``` + +--- + +### `odict tokenize` + +Tokenizes text and finds dictionary entries for each token. 
+ +``` +odict tokenize [-f ] [-F ] [-i] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary | +| `text` | Yes | Text to tokenize | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-f, --format` | `` | Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`) | +| `-F, --follow` | `` | Number of redirects to follow via `see` attributes (default: `0`) | +| `-i, --insensitive` | — | Perform case-insensitive lookups when matching tokens | + +#### Example + +```bash +# Tokenize Chinese text +odict tokenize chinese.odict "你好世界" + +# Tokenize with redirect following +odict tokenize my-dictionary.odict "the cat ran" -F 1 -f json +``` + +--- + +### `odict dump` + +Outputs a dictionary in a human-readable format. + +``` +odict dump [-f ] [-o ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `input` | Yes | Path to a compiled dictionary | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-f` | `` | Dump format: `xml`, `sqlite`, `postgres`, `mysql` (default: `xml`) | +| `-o` | `` | Output path. Defaults to stdout. | + +#### Example + +```bash +# Dump as XML to stdout +odict dump my-dictionary.odict + +# Dump as SQL to a file +odict dump my-dictionary.odict -f sqlite -o dictionary.sql +``` + +--- + +### `odict merge` + +Merges entries from multiple dictionaries into one. 
+ +``` +odict merge [-o ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `destination` | Yes | Path of the dictionary to merge into (unless `--output` is specified) | +| `sources` | Yes | Paths of dictionaries to merge | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-o, --output` | `` | Separate output path for the compiled dictionary | + +#### Example + +```bash +# Merge two dictionaries into the first +odict merge base.odict extra1.odict extra2.odict + +# Merge into a new file +odict merge base.odict extra.odict -o combined.odict +``` + +--- + +### `odict info` + +Prints the metadata for a dictionary file. + +``` +odict info +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary | + +#### Example + +```bash +odict info my-dictionary.odict +# Output: +# My Dictionary +# ───────────── +# File Version: 3 +# File Size: 1.23 KB +# Entries: 5,000 +``` + +--- + +### `odict lexicon` + +Lists all words defined in a dictionary. + +``` +odict lexicon +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary | + +#### Example + +```bash +odict lexicon my-dictionary.odict +# cat +# dog +# run +# ... +``` + +--- + +### `odict download` + +Downloads a dictionary from the remote registry. + +``` +odict download [-o ] [--no-cache] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Dictionary to download (e.g. 
`wiktionary/eng`) | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-o, --output` | `` | Directory to download to (defaults to config directory) | +| `--no-cache` | — | Disable caching (always download a fresh copy) | + +#### Example + +```bash +# Download English Wiktionary dictionary +odict download wiktionary/eng + +# Download Japanese dictionary to a specific directory +odict download wiktionary/jpn -o ./dicts/ +``` + +--- + +### `odict serve` + +Starts a local HTTP server to serve one or several dictionaries. + +``` +odict serve [dictionaries...] [-p ] [-c ] [-l ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionaries` | No | Paths to compiled dictionaries or directories containing `.odict` files | + +#### Options + +| Flag | Argument | Description | +|------|----------|-------------| +| `-p` | `` | Port to listen on (default: `5005`) | +| `-c, --capacity` | `` | Maximum number of dictionaries to keep in memory (default: `5`) | +| `-l, --level` | `` | Log level: `trace`, `debug`, `info`, `warn`, `error` | + +#### Example + +```bash +# Serve a single dictionary +odict serve my-dictionary.odict + +# Serve a directory of dictionaries on a custom port +odict serve ./dicts/ -p 8080 -c 10 +``` + +### HTTP endpoints + +When running `odict serve`, the following REST endpoints become available: + +#### `GET /{name}/lookup` + +Look up entries by exact match. + +| Parameter | Type | Description | +|-----------|------|-------------| +| `queries` | string | Comma-separated list of terms to look up | +| `follow` | number | Number of redirects to follow (optional) | +| `split` | number | Minimum word length for splitting (optional) | + +```bash +curl "http://localhost:5005/my-dictionary/lookup?queries=cat,dog&follow=1" +``` + +#### `GET /{name}/search` + +Full-text search across definitions. 
+ +| Parameter | Type | Description | +|-----------|------|-------------| +| `query` | string | Search query | +| `limit` | number | Maximum results to return (default: 10) | + +```bash +curl "http://localhost:5005/my-dictionary/search?query=move+swiftly&limit=5" +``` + +#### `GET /{name}/tokenize` + +Tokenize text and find matching entries. + +| Parameter | Type | Description | +|-----------|------|-------------| +| `text` | string | Text to tokenize | +| `follow` | number | Number of redirects to follow (optional) | + +```bash +curl "http://localhost:5005/chinese/tokenize?text=你好世界" +``` + +All endpoints return JSON. + +--- + +### `odict alias add` + +Creates a new dictionary alias (fails if one already exists). + +``` +odict alias add +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `name` | Yes | Name of the alias | +| `path` | Yes | Dictionary path | + +#### Example + +```bash +odict alias add eng ./dicts/english.odict +``` + +--- + +### `odict alias set` + +Creates or updates a dictionary alias. + +``` +odict alias set +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `name` | Yes | Name of the alias | +| `path` | Yes | Dictionary path | + +#### Example + +```bash +odict alias set eng ./dicts/english-v2.odict +``` + +--- + +### `odict alias delete` + +Deletes an alias with the given name. 
+ +``` +odict alias delete +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `name` | Yes | Name of the alias to delete | + +#### Example + +```bash +odict alias delete eng +``` + +--- + diff --git a/docs/src/content/docs/getting-started/installation.md b/docs/src/content/docs/getting-started/installation.md new file mode 100644 index 000000000..61f80d3da --- /dev/null +++ b/docs/src/content/docs/getting-started/installation.md @@ -0,0 +1,71 @@ +--- +title: Installation +description: How to install the ODict CLI and language bindings. +--- + +## CLI + +### Homebrew (macOS) + +```bash +brew install TheOpenDictionary/odict/odict +``` + +### Shell installer (macOS / Linux) + +```bash +curl --proto '=https' --tlsv1.2 -LsSf https://github.com/TheOpenDictionary/odict/releases/latest/download/odict-installer.sh | sh +``` + +### PowerShell installer (Windows) + +```powershell +powershell -ExecutionPolicy ByPass -c "irm https://github.com/TheOpenDictionary/odict/releases/latest/download/odict-installer.ps1 | iex" +``` + +### From source + +Requires [Rust](https://rustup.rs/) 1.75+. + +```bash +git clone https://github.com/TheOpenDictionary/odict.git +cd odict +cargo install --path cli +``` + +### Verify installation + +```bash +odict --version +``` + +--- + +## Language bindings + +### Python + +```bash +pip install theopendictionary +``` + +Requires Python 3.8.1+. See the [Python API docs](/api/python/) for usage. + +### JavaScript (Node.js) + +```bash +npm install @odict/node +``` + +Requires Node.js 12+. The package includes native binaries for all major platforms. See the [JavaScript API docs](/api/javascript/) for usage. + +### Rust + +Add the crate to your `Cargo.toml`: + +```toml +[dependencies] +odict = "2" +``` + +See the [Rust API docs](/api/rust/) for usage and feature flags. 
diff --git a/docs/src/content/docs/getting-started/introduction.md b/docs/src/content/docs/getting-started/introduction.md new file mode 100644 index 000000000..c5d86a2cf --- /dev/null +++ b/docs/src/content/docs/getting-started/introduction.md @@ -0,0 +1,42 @@ +--- +title: Introduction +description: What is ODict and why does it exist? +--- + +**ODict** (The Open Dictionary) is a blazingly-fast, open-source dictionary file format designed for human languages. It provides a complete pipeline for defining, compiling, and querying dictionaries: + +1. **Define** your dictionary entries in a simple XML format (ODXML) +2. **Compile** the XML into a compact binary `.odict` file +3. **Query** the compiled dictionary using exact lookups, full-text search, or multi-language tokenization + +## Why ODict? + +Most dictionary data is locked in proprietary formats, scattered across inconsistent APIs, or stored in slow, bloated files. ODict addresses these problems: + +- **Universal schema** — A single, well-defined XML schema that can represent dictionaries for any human language, including etymologies, multiple senses, pronunciations, examples, and cross-references. +- **Fast binary format** — Compiled `.odict` files use [rkyv](https://rkyv.org/) for zero-copy deserialization and Brotli compression, making lookups extremely fast even on large dictionaries. +- **Full-text search** — Built-in indexing and search powered by [Tantivy](https://github.com/quickwit-oss/tantivy). +- **Multi-language tokenization** — Tokenize text in Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages, and automatically match tokens to dictionary entries. +- **Cross-platform bindings** — Use ODict from Rust, Python, JavaScript (Node.js and browser), or through the CLI and HTTP server. 
+ +## Architecture + +``` +┌─────────────┐ ┌──────────┐ ┌─────────────┐ +│ ODXML file │────▶│ Compiler │────▶│ .odict file │ +│ (XML) │ │ │ │ (binary) │ +└─────────────┘ └──────────┘ └──────┬──────┘ + │ + ┌───────────────────────┬┴──────────────────────┐ + │ │ │ + ┌─────▼─────┐ ┌──────▼──────┐ ┌──────▼──────┐ + │ Lookup │ │ Search │ │ Tokenize │ + │ (exact key)│ │ (full-text) │ │ (NLP-based) │ + └───────────┘ └─────────────┘ └─────────────┘ +``` + +## What's next? + +- [Install the CLI](/getting-started/installation/) to start working with dictionaries +- [Quick Start](/getting-started/quickstart/) walks you through creating and compiling your first dictionary +- Browse the [XML Schema Reference](/schema/reference/) to learn the full data model diff --git a/docs/src/content/docs/getting-started/quickstart.md b/docs/src/content/docs/getting-started/quickstart.md new file mode 100644 index 000000000..afc1023ff --- /dev/null +++ b/docs/src/content/docs/getting-started/quickstart.md @@ -0,0 +1,170 @@ +--- +title: Quick Start +description: Create, compile, and query your first ODict dictionary. +--- + +This guide walks you through creating a simple dictionary, compiling it, and querying it with the CLI. + +## 1. Create a new dictionary + +Use the `odict new` command to scaffold a blank XML file: + +```bash +odict new animals -n "Animal Dictionary" +``` + +This creates `animals.xml`: + +```xml + + + +``` + +## 2. Add entries + +Open `animals.xml` and add some entries: + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +:::tip +The `see` attribute creates a cross-reference. When you look up "kitty", ODict can follow it to the "cat" entry. +::: + +## 3. Compile the dictionary + +```bash +odict compile animals.xml +``` + +This produces `animals.odict` — a compact binary file. 
You can inspect it with: + +```bash +odict info animals.odict +``` + +``` +Animal Dictionary +───────────────── + +File Version: 3 +File Size: 312 B +Entries: 3 +``` + +## 4. Look up entries + +```bash +odict lookup animals.odict cat +``` + +Output: + +``` +cat (From Latin cattus) + + noun + 1. A small domesticated carnivorous mammal with soft fur + • "The cat sat on the mat." + • "She adopted two cats from the shelter." + 2. (informal) A person, especially a man + • "He's a cool cat." +``` + +### Follow cross-references + +```bash +odict lookup animals.odict kitty -F 1 +``` + +This follows the `see="cat"` redirect and returns the "cat" entry. + +### JSON output + +```bash +odict lookup animals.odict cat -f json +``` + +Returns full structured JSON, useful for integration with other tools. + +## 5. Full-text search + +To search across all definitions, first create an index: + +```bash +odict index animals.odict +``` + +Then search: + +```bash +odict search animals.odict "domesticated mammal" +``` + +This returns all entries whose definitions match the query. + +:::note +You can also pass `--index` to `odict search` to auto-create the index on the fly. +::: + +## 6. Serve over HTTP + +Start a local server to query dictionaries via REST: + +```bash +odict serve animals.odict -p 8080 +``` + +Then query from any HTTP client: + +```bash +# Lookup +curl "http://localhost:8080/animals/lookup?queries=cat,dog" + +# Search +curl "http://localhost:8080/animals/search?query=domesticated" + +# Tokenize +curl "http://localhost:8080/animals/tokenize?text=the+cat+and+the+dog" +``` + +## What's next? 
+ +- [XML Schema Reference](/schema/reference/) — learn the full XML format including pronunciations, notes, and groups +- [CLI Reference](/cli/reference/) — complete command-line documentation +- Language bindings: [Python](/api/python/), [JavaScript](/api/javascript/), [Rust](/api/rust/) diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx new file mode 100644 index 000000000..55838e4ab --- /dev/null +++ b/docs/src/content/docs/index.mdx @@ -0,0 +1,38 @@ +--- +title: ODict +description: The lightning-fast open-source dictionary file format for human languages. +template: splash +hero: + title: ODict + tagline: The lightning-fast open-source dictionary file format for human languages. + actions: + - text: Get Started + link: /getting-started/introduction/ + icon: right-arrow + - text: View on GitHub + link: https://github.com/TheOpenDictionary/odict + icon: external + variant: minimal +--- + +import { Card, CardGrid } from "@astrojs/starlight/components"; + + + + Define your dictionary entries using a simple, well-documented XML schema + (ODXML) that supports etymologies, senses, definitions, examples, + pronunciations, and more. + + + Compile your XML dictionaries into compact, blazingly-fast binary `.odict` + files using zero-copy deserialization via rkyv and Brotli compression. + + + Index and search your compiled dictionaries with built-in full-text search + powered by Tantivy, with multi-language tokenization support. + + + Native bindings for Python, JavaScript (Node.js and browser via WASI), and + Rust. Plus a powerful CLI and HTTP server for language-agnostic access. + + diff --git a/docs/src/content/docs/schema/overview.md b/docs/src/content/docs/schema/overview.md new file mode 100644 index 000000000..e309d3803 --- /dev/null +++ b/docs/src/content/docs/schema/overview.md @@ -0,0 +1,174 @@ +--- +title: Schema Overview +description: An overview of the ODict XML (ODXML) schema and how dictionaries are structured. 
+--- + +ODict dictionaries are authored in XML using the **ODXML** (Open Dictionary XML) format. This page provides a conceptual overview of how the schema is structured. For the full element-by-element reference, see the [Schema Reference](/schema/reference/). + +## Structure + +An ODXML file describes a dictionary as a hierarchy: + +``` +dictionary +└── entry (one per headword) + ├── pronunciation (optional, entry-level) + └── ety (etymology — groups senses by word origin) + └── sense (groups definitions by part of speech) + ├── group (optional grouping of definitions) + │ └── definition + │ ├── example + │ └── note + └── definition + ├── example + └── note +``` + +## Minimal example + +The simplest valid dictionary: + +```xml + + + + + + + + + +``` + +## Entries and cross-references + +Each `` represents a headword. Entries can either contain full definitions (via `` children) or redirect to another entry using the `see` attribute: + +```xml + + + + + + + + + + +``` + +When looking up "ran" with the `follow` option enabled, ODict will resolve the cross-reference and return the "run" entry. + +## Etymologies + +If a word has multiple distinct origins, you can define multiple `` elements: + +```xml + + + + + + + + + + + + +``` + +## Senses and parts of speech + +Within an etymology, `` elements group definitions by part of speech. The `pos` attribute accepts standard codes like `n` (noun), `v` (verb), `adj` (adjective), etc. See the [reference](/schema/reference/#parts-of-speech) for the full list. + +```xml + + + + + + +``` + +If the part of speech is unknown or not applicable, you can omit `pos` entirely. 
+ +## Definition groups + +When a sense has many definitions, you can organize them with ``: + +```xml + + + + + + + + + + +``` + +## Examples and notes + +Definitions can have `` and `` children: + +```xml + + + + + + + +``` + +## Pronunciations + +Pronunciations can be attached at the entry level and support any phonetic system: + +```xml + + + + + + + + + + + + + + + + + +``` + +This is especially useful for non-Latin scripts: + +```xml + + + + ... + +``` + +## XSD validation + +The schema is formally defined in [`odict.xsd`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd). You can validate your XML against it: + +```xml + + + ... + +``` + +Most XML editors (VS Code with the XML extension, IntelliJ, etc.) will provide autocomplete and validation when the XSD is referenced. diff --git a/docs/src/content/docs/schema/reference.md b/docs/src/content/docs/schema/reference.md new file mode 100644 index 000000000..eebaa416b --- /dev/null +++ b/docs/src/content/docs/schema/reference.md @@ -0,0 +1,282 @@ +--- +title: XML Schema Reference +description: Complete reference for the ODict XML (ODXML) schema. +--- + +{/* This file is auto-generated by scripts/generate-schema-docs.mjs. Do not edit manually. */} + +This page is automatically generated from [`odict.xsd`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd). + +## Element hierarchy + +``` +dictionary +├── entry +│ ├── pronunciation +│ │ └── url +│ └── ety +│ └── sense +│ ├── group +│ │ └── definition +│ │ ├── example +│ │ │ └── pronunciation +│ │ │ └── url +│ │ └── note +│ │ └── example +│ │ └── pronunciation +│ │ └── url +│ └── definition +│ ├── example +│ │ └── pronunciation +│ │ └── url +│ └── note +│ └── example +│ └── pronunciation +│ └── url +``` + +--- + +## Elements + +### `` + +The root element of an ODict XML file. Contains one or more entries. 
+ +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `id` | `string` | No | A unique identifier for the dictionary. | +| `name` | `string` | No | A human-readable name for the dictionary (e.g. "English Dictionary"). | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#entry) | 1 | unbounded | A dictionary entry. | + +--- + +### `` + +Represents a single dictionary entry (headword). An entry can either contain full definitions via etymology elements, or redirect to another entry using the `see` attribute. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `term` | `string` | Yes | The headword or term being defined. | +| `see` | `string` | No | Cross-reference to another entry's term. When set, this entry acts as a redirect (e.g. "ran" → "run"). | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#pronunciation) | 0 | unbounded | Entry-level pronunciation. | +| [``](#ety) | 0 | unbounded | An etymology grouping. | + +--- + +### `` + +Groups senses under a common etymology (word origin). A single entry can have multiple etymologies if the word has distinct historical origins. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `id` | `string` | No | A unique identifier for this etymology. | +| `pronunciation` | `string` | No | A simple pronunciation string (e.g. IPA). For richer pronunciation data, use child `` elements on the parent entry instead. | +| `description` | `string` | No | A description of the word's origin (e.g. "From Latin currere"). | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#sense) | 1 | unbounded | A sense grouping (by part of speech). 
| + +--- + +### `` + +Groups definitions under a part of speech. A sense can contain definitions directly, or organize them into groups. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `pos` | `string` | No | Part of speech code (e.g. `n`, `v`, `adj`, `adv`, `phr`). See the [Parts of Speech](#parts-of-speech) section for all supported values. | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#group) | 0 | unbounded | A named group of related definitions. | +| [``](#definition) | 0 | unbounded | A definition (can appear alongside or instead of groups). | + +--- + +### `` + +An optional grouping of related definitions within a sense. Useful for organizing many definitions into logical clusters. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `id` | `string` | No | A unique identifier for this group. | +| `description` | `string` | No | A label or description for this group (e.g. "Verb senses related to motion"). | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#definition) | 1 | unbounded | A definition within this group. | + +--- + +### `` + +A single definition of the entry's term. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `id` | `string` | No | A unique identifier for this definition. | +| `value` | `string` | Yes | The definition text. Supports inline Markdown-style formatting in parentheses for labels, e.g. `"(Computing) a set of words..."`. | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#example) | 0 | unbounded | An example usage of this definition. | +| [``](#note) | 0 | unbounded | A supplementary note about this definition. 
| + +--- + +### `` + +A supplementary note attached to a definition. Notes can carry their own examples. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `id` | `string` | No | A unique identifier for this note. | +| `value` | `string` | Yes | The note text. | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#example) | 1 | unbounded | An example relevant to this note. | + +--- + +### `` + +An example sentence or usage demonstrating a definition, note, or pronunciation. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `value` | `string` | Yes | The example text (e.g. `"The dog runs after the cat."`). | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#pronunciation) | 0 | unbounded | A pronunciation of this example (useful for non-Latin scripts). | + +--- + +### `` + +Describes how a word, entry, or example is pronounced. Supports any phonetic system (IPA, Pinyin, Romaji, etc.) and optional audio URLs. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `kind` | `string` | Yes | The pronunciation system used (e.g. `ipa`, `pinyin`, `romaji`, or any custom string). | +| `value` | `string` | Yes | The pronunciation notation (e.g. `həˈləʊ`, `nǐ hǎo`). | + +#### Child elements + +| Element | Min | Max | Description | +|---------|-----|-----|-------------| +| [``](#url) | 0 | unbounded | A URL to an audio file for this pronunciation. | + +--- + +### `` + +A reference to an audio file for a pronunciation. Used as a child of ``. + +#### Attributes + +| Attribute | Type | Required | Description | +|-----------|------|----------|-------------| +| `src` | `string` | Yes | Path or URL to the audio file. 
| +| `type` | `string` | No | MIME type of the audio file (e.g. `audio/mpeg`, `audio/ogg`). | +| `description` | `string` | No | A description of this audio (e.g. "British pronunciation"). | + +--- + +## Parts of speech + +The `pos` attribute on `` accepts the following codes. You can also pass any custom string, which will be treated as a custom part of speech. + +| Code | Label | +|------|-------| +| `n` | noun | +| `v` | verb | +| `adj` | adjective | +| `adv` | adverb | +| `pron` | pronoun | +| `prep` | preposition | +| `conj` | conjunction | +| `intj` | interjection | +| `det` | determiner | +| `part` | particle | +| `num` | numeric | +| `abv` | abbreviation | +| `adf` | adfix | +| `aff` | affix | +| `art` | article | +| `aux` | auxiliary | +| `aux_adj` | auxiliary adjective | +| `aux_v` | auxiliary verb | +| `chr` | character | +| `cf` | circumfix | +| `cls` | classifier | +| `conj_c` | coordinating conjunction | +| `conj_s` | subordinating conjunction | +| `contr` | contraction | +| `cop` | copula | +| `ctr` | counter | +| `expr` | expression | +| `inf` | infix | +| `intf` | interfix | +| `name` | name | +| `phr` | phrase | +| `phr_adj` | adjective phrase | +| `phr_adv` | adverbial phrase | +| `phr_prep` | prepositional phrase | +| `postp` | postposition | +| `pref` | prefix | +| `propn` | proper noun | +| `prov` | proverb | +| `punc` | punctuation | +| `suff` | suffix | +| `sym` | symbol | +| `vi` | intransitive verb | +| `vt` | transitive verb | +| `un` | unknown | + +:::note +ODict also supports an extensive set of Japanese-specific parts of speech (Godan verbs, Ichidan verbs, Nidan verbs, etc.). These use codes like `v5b`, `v1`, `vk`, `adj_na`, etc. Refer to the [source code](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs) for the complete list. 
+::: diff --git a/docs/tsconfig.json b/docs/tsconfig.json new file mode 100644 index 000000000..bcbf8b509 --- /dev/null +++ b/docs/tsconfig.json @@ -0,0 +1,3 @@ +{ + "extends": "astro/tsconfigs/strict" +} From acf80e20a3b18461c2d43f7dd3cdec1b8fd8b56b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 23 Feb 2026 11:14:00 +0000 Subject: [PATCH 3/6] Add programmatic guides with tabbed Rust/Python/JS examples and update API docs - Add four new guide pages using Starlight Tabs component: - Compiling Dictionaries: compile from string/file, compression options, loading - Looking Up Entries: basic/multi/case-insensitive lookup, cross-references, splitting - Searching Dictionaries: indexing, full-text search, search vs lookup comparison - Tokenizing Text: CJK/Latin tokenization, follow/insensitive options - Update Python API docs: follow is now bool, constructor accepts str, new LoadOptions - Update JavaScript API docs: follow is now bool, constructor accepts Buffer|string, new LoadOptions/RemoteLoadOptions, senses is Array - Add Guides section to sidebar navigation, rename API to API Reference https://claude.ai/code/session_0152q1rpTnXqZGQ5B85AhjXs --- docs/astro.config.mjs | 11 +- docs/src/content/docs/api/javascript.md | 32 ++- docs/src/content/docs/api/python.md | 24 +- docs/src/content/docs/guides/compiling.mdx | 242 ++++++++++++++++ docs/src/content/docs/guides/lookup.mdx | 312 +++++++++++++++++++++ docs/src/content/docs/guides/search.mdx | 176 ++++++++++++ docs/src/content/docs/guides/tokenize.mdx | 228 +++++++++++++++ 7 files changed, 1005 insertions(+), 20 deletions(-) create mode 100644 docs/src/content/docs/guides/compiling.mdx create mode 100644 docs/src/content/docs/guides/lookup.mdx create mode 100644 docs/src/content/docs/guides/search.mdx create mode 100644 docs/src/content/docs/guides/tokenize.mdx diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs index 155bb3f33..5ffe816e1 100644 --- a/docs/astro.config.mjs +++ b/docs/astro.config.mjs @@ -36,12 
+36,21 @@ export default defineConfig({ { label: "Reference", slug: "schema/reference" }, ], }, + { + label: "Guides", + items: [ + { label: "Compiling Dictionaries", slug: "guides/compiling" }, + { label: "Looking Up Entries", slug: "guides/lookup" }, + { label: "Searching Dictionaries", slug: "guides/search" }, + { label: "Tokenizing Text", slug: "guides/tokenize" }, + ], + }, { label: "CLI", items: [{ label: "Command Reference", slug: "cli/reference" }], }, { - label: "API", + label: "API Reference", items: [ { label: "Rust", slug: "api/rust" }, { label: "Python", slug: "api/python" }, diff --git a/docs/src/content/docs/api/javascript.md b/docs/src/content/docs/api/javascript.md index 7c88e3a4e..2ca8c1a7f 100644 --- a/docs/src/content/docs/api/javascript.md +++ b/docs/src/content/docs/api/javascript.md @@ -56,15 +56,19 @@ The main class for working with compiled dictionaries. ### Constructors -#### `new OpenDictionary(data: Buffer)` +#### `new OpenDictionary(data: Buffer | string)` -Creates a dictionary from compiled binary data (as returned by `compile()`). +Creates a dictionary from compiled binary data (as returned by `compile()`) or directly from an XML string. 
```typescript import { compile, OpenDictionary } from "@odict/node"; +// From compiled buffer const data = compile(xmlString); const dictionary = new OpenDictionary(data); + +// Directly from XML string +const dictionary = new OpenDictionary(xmlString); ``` #### `OpenDictionary.load(dictionary: string, options?: LoadOptions): Promise` @@ -83,9 +87,10 @@ const dictionary = await OpenDictionary.load("./my-dictionary.odict"); // Load from remote registry const dictionary = await OpenDictionary.load("wiktionary/eng"); -// Load with alias options -const dictionary = await OpenDictionary.load("./dict.odict", { - alias: { path: "./aliases.json" }, +// Load with options +const dictionary = await OpenDictionary.load("wiktionary/eng", { + configDir: "./config", + remote: { caching: true, retries: 3 }, }); ``` @@ -117,7 +122,7 @@ Looks up one or more terms by exact match. |-----------|------|---------|-------------| | `query` | `string \| string[]` | — | Term(s) to look up | | `options.split` | `number` | — | Minimum word length for compound splitting | -| `options.follow` | `boolean \| number` | — | Follow `see` cross-references. 
`true` = infinite, `false` = disabled, number = max depth | +| `options.follow` | `boolean` | — | Follow `see` cross-references until an entry with etymologies is found | | `options.insensitive` | `boolean` | — | Enable case-insensitive matching | ```typescript @@ -217,7 +222,7 @@ interface Etymology { id?: string; pronunciations: Pronunciation[]; description?: string; - senses: Record; + senses: Sense[]; } ``` @@ -323,11 +328,14 @@ interface EnumWrapper { ```typescript interface LoadOptions { - alias?: AliasLoadOptions; + configDir?: string; + remote?: RemoteLoadOptions; } -interface AliasLoadOptions { - path?: string; +interface RemoteLoadOptions { + outDir?: string; + caching?: boolean; + retries?: number; } interface SaveOptions { @@ -341,7 +349,7 @@ interface CompressOptions { interface LookupOptions { split?: number; - follow?: boolean | number; + follow?: boolean; insensitive?: boolean; } @@ -359,7 +367,7 @@ interface SearchOptions { } interface TokenizeOptions { - follow?: boolean | number; + follow?: boolean; allowList?: string[]; insensitive?: boolean; } diff --git a/docs/src/content/docs/api/python.md b/docs/src/content/docs/api/python.md index 84a0e3b51..64aae17e8 100644 --- a/docs/src/content/docs/api/python.md +++ b/docs/src/content/docs/api/python.md @@ -63,28 +63,31 @@ The main class for working with compiled dictionaries. ### Constructors -#### `OpenDictionary(data: bytes)` +#### `OpenDictionary(data: bytes | str)` -Creates a dictionary from compiled binary data (as returned by `compile()`). +Creates a dictionary from compiled binary data (as returned by `compile()`) or directly from an XML string. 
```python from theopendictionary import OpenDictionary, compile +# From compiled bytes data = compile(xml_string) dictionary = OpenDictionary(data) + +# Directly from XML string +dictionary = OpenDictionary(xml_string) ``` -#### `await OpenDictionary.load(dictionary: str, alias_path: str | None = None) -> OpenDictionary` +#### `await OpenDictionary.load(dictionary: str, options: LoadOptions | None = None) -> OpenDictionary` Loads a dictionary from a file path, alias, or remote identifier. This is an **async** method. - If `dictionary` is a path to a `.odict` file, it loads from disk. - If it matches the format `org/lang` (e.g. `wiktionary/eng`), it downloads from the remote registry. -- `alias_path` optionally specifies a custom alias file path. ```python import asyncio -from theopendictionary import OpenDictionary +from theopendictionary import OpenDictionary, LoadOptions, RemoteLoadOptions async def main(): # Load from file @@ -93,6 +96,13 @@ async def main(): # Load from remote registry dictionary = await OpenDictionary.load("wiktionary/eng") + # Load with options + opts = LoadOptions( + config_dir="./config", + remote=RemoteLoadOptions(caching=True) + ) + dictionary = await OpenDictionary.load("wiktionary/eng", opts) + asyncio.run(main()) ``` @@ -128,7 +138,7 @@ Looks up one or more terms by exact match. |-----------|------|---------|-------------| | `query` | `str \| list[str]` | — | Term(s) to look up | | `split` | `int \| None` | `None` | Minimum word length for compound splitting | -| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references. 
`True` = infinite, `False` = disabled, `int` = max depth | +| `follow` | `bool \| None` | `None` | Follow `see` cross-references until an entry with etymologies is found | | `insensitive` | `bool \| None` | `None` | Enable case-insensitive matching | ```python @@ -195,7 +205,7 @@ Tokenizes text using NLP-based segmentation and matches each token against the d | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `text` | `str` | — | Text to tokenize | -| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references | +| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references. Accepts `True`/`False` or a number (nonzero = follow) | | `insensitive` | `bool \| None` | `None` | Case-insensitive matching | ```python diff --git a/docs/src/content/docs/guides/compiling.mdx b/docs/src/content/docs/guides/compiling.mdx new file mode 100644 index 000000000..7b28d1955 --- /dev/null +++ b/docs/src/content/docs/guides/compiling.mdx @@ -0,0 +1,242 @@ +--- +title: Compiling Dictionaries +description: How to compile ODict dictionaries programmatically from Rust, Python, and JavaScript. +--- + +import { Tabs, TabItem } from "@astrojs/starlight/components"; + +This guide shows how to compile ODXML into `.odict` binary files programmatically. For CLI usage, see the [Quick Start](/getting-started/quickstart/). + +## Compiling from an XML string + +The simplest approach is to compile an XML string directly into an in-memory dictionary. 
+ + + +```rust +use odict::{OpenDictionary, ToDictionary}; + +fn main() -> odict::Result<()> { + let xml = r#" + + + + + + + + + + "#; + + // Parse XML → build binary → get OpenDictionary + let dict = xml.to_dictionary()?.build()?; + + // Write to disk + dict.to_disk("my-dictionary.odict")?; + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary, compile + +xml = """ + + + + + + + + + +""" + +# Option 1: compile() returns raw bytes +compiled_bytes = compile(xml) +dictionary = OpenDictionary(compiled_bytes) + +# Option 2: pass XML directly to the constructor +dictionary = OpenDictionary(xml) + +# Save to disk +dictionary.save("my-dictionary.odict") +``` + + +```typescript +import { compile, OpenDictionary } from "@odict/node"; + +const xml = ` + + + + + + + + + +`; + +// Option 1: compile() returns a Buffer +const data = compile(xml); +const dictionary = new OpenDictionary(data); + +// Option 2: pass XML directly to the constructor +const dictionary = new OpenDictionary(xml); + +// Save to disk +dictionary.save("my-dictionary.odict"); +``` + + + +## Compiling from an XML file + +If your XML lives on disk, read it first and then compile. + + + +```rust +use odict::schema::Dictionary; + +fn main() -> odict::Result<()> { + // Parse and compile from a file path + let dict = Dictionary::from_path("my-dictionary.xml")? 
+ .build()?; + + dict.to_disk("my-dictionary.odict")?; + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary, compile + +with open("my-dictionary.xml", "r") as f: + xml = f.read() + +compiled_bytes = compile(xml) +dictionary = OpenDictionary(compiled_bytes) +dictionary.save("my-dictionary.odict") +``` + + +```typescript +import { readFile } from "node:fs/promises"; +import { compile, OpenDictionary } from "@odict/node"; + +const xml = await readFile("my-dictionary.xml", "utf-8"); +const data = compile(xml); +const dictionary = new OpenDictionary(data); +dictionary.save("my-dictionary.odict"); +``` + + + +## Compression options + +ODict uses Brotli compression. You can configure the compression level when saving. + + + +```rust +use odict::{compile::CompilerOptions, CompressOptions, ToDictionary}; + +fn main() -> odict::Result<()> { + let xml = std::fs::read_to_string("my-dictionary.xml")?; + + let compress = CompressOptions::default() + .quality(11) // Maximum compression (0–11) + .window_size(22); // Window size (0–22) + + let options = CompilerOptions::default() + .with_compression(compress); + + xml.as_str() + .to_dictionary()? + .build()? + .to_disk_with_options("my-dictionary.odict", options)?; + + Ok(()) +} +``` + + +```python +dictionary.save( + "my-dictionary.odict", + quality=11, # Maximum compression (0–11) + window_size=22 # Window size (0–22) +) +``` + + +```typescript +dictionary.save("my-dictionary.odict", { + compress: { + quality: 11, // Maximum compression (0–11) + windowSize: 22, // Window size (0–22) + }, +}); +``` + + + +## Loading compiled dictionaries + +Once compiled, you can load `.odict` files from disk or from the remote registry. 
+
+<Tabs>
+  <TabItem label="Rust">
+
+```rust
+use odict::OpenDictionary;
+
+fn main() -> odict::Result<()> {
+    // Load from disk
+    let file = OpenDictionary::from_path("my-dictionary.odict")?;
+    let dict = file.contents()?;
+
+    println!("Entries: {}", dict.entries.len());
+
+    Ok(())
+}
+```
+
+  </TabItem>
+  <TabItem label="Python">
+
+```python
+import asyncio
+from theopendictionary import OpenDictionary
+
+async def main():
+    # Load from disk
+    dictionary = await OpenDictionary.load("./my-dictionary.odict")
+
+    # Load from remote registry
+    dictionary = await OpenDictionary.load("wiktionary/eng")
+
+    print(dictionary.lexicon())
+
+asyncio.run(main())
+```
+
+  </TabItem>
+  <TabItem label="JavaScript">
+
+```typescript
+import { OpenDictionary } from "@odict/node";
+
+// Load from disk
+const localDictionary = await OpenDictionary.load("./my-dictionary.odict");
+
+// Load from remote registry
+const remoteDictionary = await OpenDictionary.load("wiktionary/eng");
+
+console.log(remoteDictionary.lexicon());
+```
+
+  </TabItem>
+</Tabs>
diff --git a/docs/src/content/docs/guides/lookup.mdx b/docs/src/content/docs/guides/lookup.mdx
new file mode 100644
index 000000000..cbcea0aab
--- /dev/null
+++ b/docs/src/content/docs/guides/lookup.mdx
@@ -0,0 +1,312 @@
+---
+title: Looking Up Entries
+description: How to look up dictionary entries by exact match from Rust, Python, and JavaScript.
+---
+
+import { Tabs, TabItem } from "@astrojs/starlight/components";
+
+Lookup is the fastest way to query a dictionary — it finds entries by exact term match without requiring an index.
+ +## Basic lookup + + + +```rust +use odict::{OpenDictionary, lookup::LookupOptions}; + +fn main() -> odict::Result<()> { + let file = OpenDictionary::from_path("my-dictionary.odict")?; + let dict = file.contents()?; + + let results = dict.lookup( + &vec!["cat"], + LookupOptions::default(), + )?; + + for result in &results { + println!("{}", result.entry.term.as_str()); + } + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary, compile + +dictionary = OpenDictionary("...") + +results = dictionary.lookup("cat") +print(results[0].entry.term) # "cat" +``` + + +```typescript +import { OpenDictionary } from "@odict/node"; + +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); + +const results = dictionary.lookup("cat"); +console.log(results[0].entry.term); // "cat" +``` + + + +## Looking up multiple terms + +You can look up several terms in a single call. Results are returned for each matched term. + + + +```rust +let results = dict.lookup( + &vec!["cat", "dog", "run"], + LookupOptions::default(), +)?; + +for result in &results { + println!("Found: {}", result.entry.term.as_str()); +} +``` + + +```python +results = dictionary.lookup(["cat", "dog", "run"]) + +for result in results: + print(f"Found: {result.entry.term}") +``` + + +```typescript +const results = dictionary.lookup(["cat", "dog", "run"]); + +for (const result of results) { + console.log(`Found: ${result.entry.term}`); +} +``` + + + +## Following cross-references + +Entries can redirect to other entries using the `see` attribute (e.g. "ran" → "run"). Enable `follow` to automatically resolve these. 
+ + + +```rust +use odict::lookup::LookupOptions; + +let options = LookupOptions::default().follow(true); + +let results = dict.lookup(&vec!["ran"], options)?; + +// "ran" redirects to "run" +assert_eq!(results[0].entry.term.as_str(), "run"); + +// directed_from tells you the original entry +if let Some(from) = &results[0].directed_from { + println!("Redirected from: {}", from.term.as_str()); +} +``` + + +```python +results = dictionary.lookup("ran", follow=True) + +# "ran" redirects to "run" +print(results[0].entry.term) # "run" +print(results[0].directed_from.term) # "ran" +``` + + +```typescript +const results = dictionary.lookup("ran", { follow: true }); + +// "ran" redirects to "run" +console.log(results[0].entry.term); // "run" +console.log(results[0].directedFrom?.term); // "ran" +``` + + + +:::tip +When `follow` is enabled, ODict walks the `see` chain until it finds an entry with etymologies. It also detects circular references and returns an error instead of looping infinitely. +::: + +## Case-insensitive lookup + +By default, lookups are case-sensitive. Enable `insensitive` to fall back to lowercase matching when the exact case doesn't match. + + + +```rust +let options = LookupOptions::default().insensitive(true); + +// "CAT" will match "cat" +let results = dict.lookup(&vec!["CAT"], options)?; + +assert_eq!(results[0].entry.term.as_str(), "cat"); +``` + + +```python +# "CAT" will match "cat" +results = dictionary.lookup("CAT", insensitive=True) + +print(results[0].entry.term) # "cat" +``` + + +```typescript +// "CAT" will match "cat" +const results = dictionary.lookup("CAT", { insensitive: true }); + +console.log(results[0].entry.term); // "cat" +``` + + + +## Compound word splitting + +If a term isn't found, ODict can split it into substrings and look up each part. The `split` parameter sets the minimum character length for each fragment. 
+ + + +```rust +use odict::lookup::{LookupOptions, LookupStrategy}; + +let options = LookupOptions::default() + .strategy(LookupStrategy::Split(3)); + +// "catdog" isn't a word, but "cat" and "dog" are +let results = dict.lookup(&vec!["catdog"], options)?; + +for result in &results { + println!("Found: {}", result.entry.term.as_str()); +} +// Prints: "cat", "dog" +``` + + +```python +# "catdog" isn't a word, but "cat" and "dog" are +results = dictionary.lookup("catdog", split=3) + +for result in results: + print(result.entry.term) +# Prints: "cat", "dog" +``` + + +```typescript +// "catdog" isn't a word, but "cat" and "dog" are +const results = dictionary.lookup("catdog", { split: 3 }); + +for (const result of results) { + console.log(result.entry.term); +} +// Prints: "cat", "dog" +``` + + + +## Combining options + +All lookup options can be combined. + + + +```rust +let options = LookupOptions::default() + .follow(true) + .insensitive(true) + .strategy(LookupStrategy::Split(3)); + +let results = dict.lookup(&vec!["RaN"], options)?; +``` + + +```python +results = dictionary.lookup("RaN", follow=True, insensitive=True, split=3) +``` + + +```typescript +const results = dictionary.lookup("RaN", { + follow: true, + insensitive: true, + split: 3, +}); +``` + + + +## Reading entry data + +Once you have a `LookupResult`, you can traverse the entry's structure: etymologies, senses, definitions, examples, and more. + + + +```python +results = dictionary.lookup("cat") +entry = results[0].entry + +print(f"Term: {entry.term}") + +for ety in entry.etymologies: + for sense in ety.senses: + print(f" Part of speech: {sense.pos}") + for defn in sense.definitions: + print(f" {defn.value}") + for example in defn.examples: + print(f" e.g. 
{example.value}") +``` + + +```typescript +const results = dictionary.lookup("cat"); +const entry = results[0].entry; + +console.log(`Term: ${entry.term}`); + +for (const ety of entry.etymologies) { + for (const sense of ety.senses) { + console.log(` Part of speech: ${sense.pos.value}`); + for (const defn of sense.definitions) { + if ("value" in defn) { + console.log(` ${defn.value}`); + for (const example of defn.examples) { + console.log(` e.g. ${example.value}`); + } + } + } + } +} +``` + + +```rust +let results = dict.lookup(&vec!["cat"], LookupOptions::default())?; + +for result in &results { + let entry = result.entry.deserialize()?; + + println!("Term: {}", entry.term); + + for ety in &entry.etymologies { + for (pos, sense) in &ety.senses { + println!(" Part of speech: {}", pos); + for defn in &sense.definitions { + println!(" {}", defn.value); + for example in &defn.examples { + println!(" e.g. {}", example.value); + } + } + } + } +} +``` + + diff --git a/docs/src/content/docs/guides/search.mdx b/docs/src/content/docs/guides/search.mdx new file mode 100644 index 000000000..86d64c807 --- /dev/null +++ b/docs/src/content/docs/guides/search.mdx @@ -0,0 +1,176 @@ +--- +title: Searching Dictionaries +description: How to index and run full-text searches on ODict dictionaries from Rust, Python, and JavaScript. +--- + +import { Tabs, TabItem } from "@astrojs/starlight/components"; + +Full-text search lets you find entries by matching against their definitions, not just their headwords. Unlike [lookup](/guides/lookup/) which requires an exact term match, search uses a [Tantivy](https://github.com/quickwit-oss/tantivy)-powered full-text index. + +## Creating an index + +Before you can search, you need to create a full-text index. This only needs to be done once per dictionary (the index is persisted to disk). 
+
+
+
+```rust
+use odict::{OpenDictionary, index::IndexOptions};
+
+fn main() -> odict::Result<()> {
+    let file = OpenDictionary::from_path("my-dictionary.odict")?;
+    let dict = file.contents()?;
+
+    dict.index(IndexOptions::default())?;
+
+    Ok(())
+}
+```
+
+
+```python
+import asyncio
+from theopendictionary import OpenDictionary
+
+async def main():
+    dictionary = await OpenDictionary.load("./my-dictionary.odict")
+    dictionary.index()
+
+asyncio.run(main())
+```
+
+
+```typescript
+import { OpenDictionary } from "@odict/node";
+
+const dictionary = await OpenDictionary.load("./my-dictionary.odict");
+dictionary.index();
+```
+
+
+
+## Index options
+
+You can configure the indexing behavior.
+
+
+
+```rust
+use odict::index::IndexOptions;
+
+let options = IndexOptions::default()
+    .dir("./my-index")       // Custom index directory
+    .overwrite(true)         // Overwrite existing index
+    .memory(50_000_000);     // 50MB memory arena per thread
+
+dict.index(options)?;
+```
+
+
+```python
+from theopendictionary import IndexOptions
+
+dictionary.index(IndexOptions(
+    directory="./my-index",  # Custom index directory
+    overwrite=True,          # Overwrite existing index
+    memory=50_000_000        # 50MB memory arena per thread
+))
+```
+
+
+```typescript
+dictionary.index({
+  directory: "./my-index",   // Custom index directory
+  overwrite: true,           // Overwrite existing index
+  memory: 50_000_000,        // 50MB memory arena per thread
+});
+```
+
+
+
+## Running a search
+
+Once indexed, you can search across all definitions in the dictionary.
+ + + +```rust +use odict::search::SearchOptions; + +let results = dict.search("domesticated mammal", SearchOptions::default())?; + +for entry in &results { + println!("{}", entry.term); +} +``` + + +```python +results = dictionary.search("domesticated mammal") + +for entry in results: + print(entry.term) +``` + + +```typescript +const results = dictionary.search("domesticated mammal"); + +for (const entry of results) { + console.log(entry.term); +} +``` + + + +## Search options + + + +```rust +use odict::search::SearchOptions; + +let options = SearchOptions::default() + .dir("./my-index") // Custom index directory + .autoindex(true) // Auto-create index if missing + .limit(10) // Max results to return + .threshold(50); // Relevance threshold + +let results = dict.search("greeting", options)?; +``` + + +```python +from theopendictionary import SearchOptions + +results = dictionary.search("greeting", SearchOptions( + directory="./my-index", # Custom index directory + autoindex=True, # Auto-create index if missing + limit=10, # Max results to return + threshold=50 # Relevance threshold +)) +``` + + +```typescript +const results = dictionary.search("greeting", { + directory: "./my-index", // Custom index directory + autoindex: true, // Auto-create index if missing + limit: 10, // Max results to return + threshold: 50, // Relevance threshold +}); +``` + + + +:::tip +The `autoindex` option is convenient for one-off scripts — it creates the index on the fly if one doesn't exist yet. For production use, create the index ahead of time with `index()` to avoid the startup cost on first search. +::: + +## Search vs. 
lookup + +| | Lookup | Search | +|---|--------|--------| +| **Matches against** | Entry terms (headwords) | Definition text | +| **Requires index** | No | Yes | +| **Speed** | O(1) per term | Depends on index size | +| **Use case** | You know the exact word | You're searching by meaning | +| **Supports splitting** | Yes | No | +| **Supports follow** | Yes | No | + +In most applications you'll use both: lookup for direct dictionary access, and search for discovery. diff --git a/docs/src/content/docs/guides/tokenize.mdx b/docs/src/content/docs/guides/tokenize.mdx new file mode 100644 index 000000000..22df83031 --- /dev/null +++ b/docs/src/content/docs/guides/tokenize.mdx @@ -0,0 +1,228 @@ +--- +title: Tokenizing Text +description: How to tokenize text and match tokens against dictionary entries using ODict's NLP tokenizer. +--- + +import { Tabs, TabItem } from "@astrojs/starlight/components"; + +ODict includes a built-in NLP tokenizer that segments text into words and automatically matches each token against dictionary entries. This is especially useful for languages without whitespace-delimited words (Chinese, Japanese, Korean, Thai, Khmer) as well as compound-word languages (German, Swedish). + +## Supported languages + +| Language family | Languages | Tokenizer | +|----------------|-----------|-----------| +| Chinese | Simplified & Traditional Chinese | jieba | +| Japanese | Japanese | Lindera (UniDic) | +| Korean | Korean | Lindera (KoDic) | +| Thai | Thai | ICU-based | +| Khmer | Khmer | ICU-based | +| Germanic | German, Swedish | Compound word splitting | +| Latin-script | English, French, Spanish, etc. 
| Unicode word boundaries | + +## Basic tokenization + + + +```rust +use odict::{OpenDictionary, tokenize::TokenizeOptions}; + +fn main() -> odict::Result<()> { + let file = OpenDictionary::from_path("my-dictionary.odict")?; + let dict = file.contents()?; + + let tokens = dict.tokenize( + "the cat ran", + TokenizeOptions::default(), + )?; + + for token in &tokens { + println!("'{}' ({} entries found)", + token.lemma, + token.entries.len() + ); + } + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary + +dictionary = OpenDictionary("...") + +tokens = dictionary.tokenize("the cat ran") + +for token in tokens: + print(f"'{token.lemma}' ({len(token.entries)} entries found)") +``` + + +```typescript +import { OpenDictionary } from "@odict/node"; + +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); + +const tokens = dictionary.tokenize("the cat ran"); + +for (const token of tokens) { + console.log(`'${token.lemma}' (${token.entries.length} entries found)`); +} +``` + + + +## Chinese text tokenization + +For Chinese (and other CJK languages), ODict automatically detects the script and uses the appropriate segmenter. + + + +```rust +let tokens = dict.tokenize("你好世界", TokenizeOptions::default())?; + +for token in &tokens { + println!("Lemma: {}, Script: {:?}, Language: {:?}", + token.lemma, + token.script.name(), + token.language.as_ref().map(|l| l.code()) + ); +} +``` + + +```python +tokens = dictionary.tokenize("你好世界") + +for token in tokens: + print(f"Lemma: {token.lemma}, Script: {token.script}, Language: {token.language}") +``` + + +```typescript +const tokens = dictionary.tokenize("你好世界"); + +for (const token of tokens) { + console.log(`Lemma: ${token.lemma}, Script: ${token.script}, Language: ${token.language}`); +} +``` + + + +## Following cross-references + +Like [lookup](/guides/lookup/), tokenization supports following `see` cross-references. 
+ + + +```rust +let options = TokenizeOptions::default().follow(true); + +let tokens = dict.tokenize("the cat ran", options)?; + +for token in &tokens { + for result in &token.entries { + if let Some(from) = &result.directed_from { + println!("'{}' → '{}'", + from.term.as_str(), + result.entry.term.as_str() + ); + } + } +} +// e.g. 'ran' → 'run' +``` + + +```python +tokens = dictionary.tokenize("the cat ran", follow=True) + +for token in tokens: + for result in token.entries: + if result.directed_from: + print(f"'{result.directed_from.term}' → '{result.entry.term}'") +# e.g. 'ran' → 'run' +``` + + +```typescript +const tokens = dictionary.tokenize("the cat ran", { follow: true }); + +for (const token of tokens) { + for (const result of token.entries) { + if (result.directedFrom) { + console.log(`'${result.directedFrom.term}' → '${result.entry.term}'`); + } + } +} +// e.g. 'ran' → 'run' +``` + + + +## Case-insensitive tokenization + + + +```rust +let options = TokenizeOptions::default().insensitive(true); + +// "DOG" will match the "dog" entry +let tokens = dict.tokenize("DOG cat", options)?; +``` + + +```python +# "DOG" will match the "dog" entry +tokens = dictionary.tokenize("DOG cat", insensitive=True) +``` + + +```typescript +// "DOG" will match the "dog" entry +const tokens = dictionary.tokenize("DOG cat", { insensitive: true }); +``` + + + +## Token properties + +Each token returned by `tokenize()` includes metadata about the match. + +| Property | Description | +|----------|-------------| +| `lemma` | The original text of the token as it appears in the input | +| `language` | Detected language code (e.g. `"cmn"` for Mandarin), if applicable | +| `script` | Detected script name (e.g. `"Han"`, `"Latin"`) | +| `kind` | Token kind (e.g. 
`"Word"`, `"Punctuation"`) | +| `start` | Start byte offset in the original text | +| `end` | End byte offset in the original text | +| `entries` | Array of `LookupResult` objects for matched dictionary entries | + +## Combining options + + + +```rust +let options = TokenizeOptions::default() + .follow(true) + .insensitive(true); + +let tokens = dict.tokenize("The CAT RaN away", options)?; +``` + + +```python +tokens = dictionary.tokenize("The CAT RaN away", follow=True, insensitive=True) +``` + + +```typescript +const tokens = dictionary.tokenize("The CAT RaN away", { + follow: true, + insensitive: true, +}); +``` + + From 7dd826208ba219f3fd96e5e6be2093da0b2d0113 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 23 Feb 2026 11:39:38 +0000 Subject: [PATCH 4/6] Rewrite CLI and schema doc generators to parse source files Replace hardcoded arrays in both generation scripts with actual source-file parsers: - generate-cli-docs.mjs now parses clap #[arg(...)] attributes, doc comments, and Commands enum from Rust CLI source files - generate-schema-docs.mjs now tokenizes odict.xsd with a stack-based parser that correctly handles deeply nested inline complexTypes, named type references, and produces accurate attribute/child-element tables plus a hierarchy tree https://claude.ai/code/session_0152q1rpTnXqZGQ5B85AhjXs --- docs/scripts/generate-cli-docs.mjs | 678 ++++++++++++---------- docs/scripts/generate-schema-docs.mjs | 583 +++++++++++-------- docs/src/content/docs/cli/reference.md | 429 +++++--------- docs/src/content/docs/schema/reference.md | 316 +++++----- 4 files changed, 1024 insertions(+), 982 deletions(-) diff --git a/docs/scripts/generate-cli-docs.mjs b/docs/scripts/generate-cli-docs.mjs index 291f34bfd..9672b7748 100644 --- a/docs/scripts/generate-cli-docs.mjs +++ b/docs/scripts/generate-cli-docs.mjs @@ -1,13 +1,13 @@ /** * Generates CLI reference documentation by parsing the clap arg definitions - * from the Rust source files in cli/src/. 
+ * directly from the Rust source files in cli/src/. * * Run: node scripts/generate-cli-docs.mjs * * Outputs: src/content/docs/cli/reference.md */ -import { readFileSync, writeFileSync, mkdirSync } from "node:fs"; +import { readFileSync, writeFileSync, mkdirSync, readdirSync } from "node:fs"; import { join, dirname } from "node:path"; import { fileURLToPath } from "node:url"; @@ -16,299 +16,275 @@ const cliSrc = join(__dirname, "../../cli/src"); const outPath = join(__dirname, "../src/content/docs/cli/reference.md"); // --------------------------------------------------------------------------- -// We define the CLI structure based on parsing the clap source. -// This is more reliable than regex-parsing Rust macros and gives us -// full control over the documentation output. +// Read all Rust source files // --------------------------------------------------------------------------- -const commands = [ - { - name: "new", - summary: "Scaffolds a new ODict XML dictionary", - usage: "odict new [-n ]", - args: [ - { name: "file_name", required: true, description: "Name of your new dictionary file (without extension)" }, - ], - flags: [ - { short: "-n", long: null, arg: "", description: "Name attribute of the `` element" }, - ], - example: `# Create a new dictionary file -odict new my-dictionary -n "My Dictionary" -# Creates my-dictionary.xml`, - }, - { - name: "compile", - summary: "Compiles a dictionary from ODXML", - usage: "odict compile [-o ] [-q ] [-w ]", - args: [ - { name: "input", required: true, description: "Path to ODXML file" }, - ], - flags: [ - { short: "-o", long: null, arg: "", description: "Output path of compiled dictionary. Defaults to the input path with a `.odict` extension." 
}, - { short: "-q", long: null, arg: "<0-11>", description: "Brotli compression level (default: `8`)" }, - { short: "-w", long: null, arg: "<0-22>", description: "Brotli large window size (default: `22`)" }, - ], - example: `# Compile with default settings -odict compile my-dictionary.xml - -# Compile with custom output and compression -odict compile my-dictionary.xml -o out/dict.odict -q 11`, - }, - { - name: "lookup", - summary: "Looks up entries in a compiled dictionary without indexing", - usage: "odict lookup [-f ] [-F ] [-s ] [-i]", - args: [ - { name: "dictionary", required: true, description: "Path to a compiled dictionary or an alias" }, - { name: "queries", required: true, description: "One or more words to look up" }, - ], - flags: [ - { short: "-f", long: "--format", arg: "", description: "Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`)" }, - { short: "-F", long: "--follow", arg: "", description: "Number of redirects to follow via `see` attributes (default: `0`). Use a high number for infinite following." 
}, - { short: "-s", long: "--split", arg: "", description: "If not found, split the query into words of at least length `n` and look up each separately (default: `0`, disabled)" }, - { short: "-i", long: "--insensitive", arg: null, description: "Perform case-insensitive lookups" }, - ], - example: `# Simple lookup -odict lookup my-dictionary.odict cat - -# Lookup with JSON output and follow redirects -odict lookup my-dictionary.odict ran -f json -F 1 - -# Case-insensitive lookup with splitting -odict lookup my-dictionary.odict "catdog" -s 3 -i`, - }, - { - name: "search", - summary: "Runs a full-text query on a compiled dictionary", - usage: "odict search [-f ] [--index]", - args: [ - { name: "dictionary", required: true, description: "Path to a compiled dictionary or an alias" }, - { name: "query", required: true, description: "Search query" }, - ], - flags: [ - { short: "-f", long: "--format", arg: "", description: "Output format: `json`, `xml`, `markdown`, `html`, `print` (default: `json`)" }, - { short: null, long: "--index", arg: null, description: "Creates a new index if one doesn't already exist" }, - ], - example: `# Search with auto-indexing -odict search my-dictionary.odict "move swiftly" --index - -# Search with specific output format -odict search my-dictionary.odict "greeting" -f xml`, - }, - { - name: "index", - summary: "Creates a full-text index of a compiled dictionary", - usage: "odict index [-d ] [-f] [-m ]", - args: [ - { name: "dictionary", required: true, description: "Path to a compiled dictionary or an alias" }, - ], - flags: [ - { short: "-d", long: null, arg: "", description: "Custom directory to store the index" }, - { short: "-f", long: null, arg: null, description: "Whether to overwrite the index if it already exists" }, - { short: "-m", long: null, arg: "", description: "Memory arena per thread in bytes. Must be above 15MB. 
(default: `15000000`)" }, - ], - example: `# Create an index with default settings -odict index my-dictionary.odict - -# Overwrite existing index with custom memory -odict index my-dictionary.odict -f -m 50000000`, - }, - { - name: "tokenize", - summary: "Tokenizes text and finds dictionary entries for each token", - usage: "odict tokenize [-f ] [-F ] [-i]", - args: [ - { name: "dictionary", required: true, description: "Path to a compiled dictionary" }, - { name: "text", required: true, description: "Text to tokenize" }, - ], - flags: [ - { short: "-f", long: "--format", arg: "", description: "Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`)" }, - { short: "-F", long: "--follow", arg: "", description: "Number of redirects to follow via `see` attributes (default: `0`)" }, - { short: "-i", long: "--insensitive", arg: null, description: "Perform case-insensitive lookups when matching tokens" }, - ], - example: `# Tokenize Chinese text -odict tokenize chinese.odict "你好世界" - -# Tokenize with redirect following -odict tokenize my-dictionary.odict "the cat ran" -F 1 -f json`, - }, - { - name: "dump", - summary: "Outputs a dictionary in a human-readable format", - usage: "odict dump [-f ] [-o ]", - args: [ - { name: "input", required: true, description: "Path to a compiled dictionary" }, - ], - flags: [ - { short: "-f", long: null, arg: "", description: "Dump format: `xml`, `sqlite`, `postgres`, `mysql` (default: `xml`)" }, - { short: "-o", long: null, arg: "", description: "Output path. Defaults to stdout." 
}, - ], - example: `# Dump as XML to stdout -odict dump my-dictionary.odict - -# Dump as SQL to a file -odict dump my-dictionary.odict -f sqlite -o dictionary.sql`, - }, - { - name: "merge", - summary: "Merges entries from multiple dictionaries into one", - usage: "odict merge [-o ]", - args: [ - { name: "destination", required: true, description: "Path of the dictionary to merge into (unless `--output` is specified)" }, - { name: "sources", required: true, description: "Paths of dictionaries to merge" }, - ], - flags: [ - { short: "-o", long: "--output", arg: "", description: "Separate output path for the compiled dictionary" }, - ], - example: `# Merge two dictionaries into the first -odict merge base.odict extra1.odict extra2.odict - -# Merge into a new file -odict merge base.odict extra.odict -o combined.odict`, - }, - { - name: "info", - summary: "Prints the metadata for a dictionary file", - usage: "odict info ", - args: [ - { name: "dictionary", required: true, description: "Path to a compiled dictionary" }, - ], - flags: [], - example: `odict info my-dictionary.odict -# Output: -# My Dictionary -# ───────────── -# File Version: 3 -# File Size: 1.23 KB -# Entries: 5,000`, - }, - { - name: "lexicon", - summary: "Lists all words defined in a dictionary", - usage: "odict lexicon ", - args: [ - { name: "dictionary", required: true, description: "Path to a compiled dictionary" }, - ], - flags: [], - example: `odict lexicon my-dictionary.odict -# cat -# dog -# run -# ...`, - }, - { - name: "download", - summary: "Downloads a dictionary from the remote registry", - usage: "odict download [-o ] [--no-cache]", - args: [ - { name: "dictionary", required: true, description: "Dictionary to download (e.g. 
`wiktionary/eng`)" }, - ], - flags: [ - { short: "-o", long: "--output", arg: "", description: "Directory to download to (defaults to config directory)" }, - { short: null, long: "--no-cache", arg: null, description: "Disable caching (always download a fresh copy)" }, - ], - example: `# Download English Wiktionary dictionary -odict download wiktionary/eng - -# Download Japanese dictionary to a specific directory -odict download wiktionary/jpn -o ./dicts/`, - }, - { - name: "serve", - summary: "Starts a local HTTP server to serve one or several dictionaries", - usage: "odict serve [dictionaries...] [-p ] [-c ] [-l ]", - args: [ - { name: "dictionaries", required: false, description: "Paths to compiled dictionaries or directories containing `.odict` files" }, - ], - flags: [ - { short: "-p", long: null, arg: "", description: "Port to listen on (default: `5005`)" }, - { short: "-c", long: "--capacity", arg: "", description: "Maximum number of dictionaries to keep in memory (default: `5`)" }, - { short: "-l", long: "--level", arg: "", description: "Log level: `trace`, `debug`, `info`, `warn`, `error`" }, - ], - example: `# Serve a single dictionary -odict serve my-dictionary.odict - -# Serve a directory of dictionaries on a custom port -odict serve ./dicts/ -p 8080 -c 10`, - extra: `### HTTP endpoints - -When running \`odict serve\`, the following REST endpoints become available: - -#### \`GET /{name}/lookup\` - -Look up entries by exact match. 
- -| Parameter | Type | Description | -|-----------|------|-------------| -| \`queries\` | string | Comma-separated list of terms to look up | -| \`follow\` | number | Number of redirects to follow (optional) | -| \`split\` | number | Minimum word length for splitting (optional) | - -\`\`\`bash -curl "http://localhost:5005/my-dictionary/lookup?queries=cat,dog&follow=1" -\`\`\` +function readRustFile(relPath) { + return readFileSync(join(cliSrc, relPath), "utf-8"); +} -#### \`GET /{name}/search\` +// --------------------------------------------------------------------------- +// Parse the Commands enum from cli.rs to get command descriptions +// --------------------------------------------------------------------------- -Full-text search across definitions. +function parseCommandsEnum(source) { + const commands = {}; + // Match: /// doc comment followed by variant name + const re = /\/\/\/\s*(.*)\n\s*(?:#\[.*\]\n\s*)*(\w+)\((\w+)\)/g; + let m; + while ((m = re.exec(source)) !== null) { + const doc = m[1].trim(); + const variant = m[2]; + commands[variant] = doc; + } + return commands; +} -| Parameter | Type | Description | -|-----------|------|-------------| -| \`query\` | string | Search query | -| \`limit\` | number | Maximum results to return (default: 10) | +// --------------------------------------------------------------------------- +// Parse #[arg(...)] fields from an Args struct +// --------------------------------------------------------------------------- -\`\`\`bash -curl "http://localhost:5005/my-dictionary/search?query=move+swiftly&limit=5" -\`\`\` +function parseArgsStruct(source) { + const fields = []; + + // Find the struct body (everything between the first { and last }) + const structMatch = source.match( + /pub\s+struct\s+\w+Args\s*\{([\s\S]*?)\n\}/ + ); + if (!structMatch) return fields; + + const body = structMatch[1]; + + // Split by field declarations - each field may have preceding attributes and doc comments + // We look for patterns like: 
+ // /// doc comment + // #[arg(...)] + // pub field_name: Type, + // -- or -- + // #[arg(..., help = "...")] + // field_name: Type, + + const fieldRegex = + /((?:\/\/\/[^\n]*\n\s*|#\[(?:arg|pyo3)[^\]]*\]\n\s*)*)\s*(?:pub(?:\((?:super|crate)\))?\s+)?(\w+)\s*:\s*([^,\n]+)/g; + + let fm; + while ((fm = fieldRegex.exec(body)) !== null) { + const attrs = fm[1]; + const name = fm[2]; + const type = fm[3].trim(); + + // Skip command subcommand fields + if (attrs.includes("#[command")) continue; + + // Parse #[arg(...)] attributes + const argAttr = attrs.match(/#\[arg\(([\s\S]*?)\)\]/); + const argContent = argAttr ? argAttr[1] : ""; + + // Extract help text + let help = extractQuoted(argContent, "help"); + + // Fall back to /// doc comments + if (!help) { + const docMatch = attrs.match(/\/\/\/\s*(.*)/); + if (docMatch) help = docMatch[1].trim(); + } -#### \`GET /{name}/tokenize\` + // Extract short flag + let short = null; + const shortMatch = argContent.match( + /short\s*=\s*'([^']+)'/ + ); + if (shortMatch) { + short = `-${shortMatch[1]}`; + } else if (/\bshort\b/.test(argContent) && !/short\s*=/.test(argContent)) { + // bare `short` means use first char of field name + short = `-${name[0]}`; + } -Tokenize text and find matching entries. 
+ // Extract long flag + let long = null; + const longMatch = argContent.match( + /long\s*=\s*"([^"]+)"/ + ); + if (longMatch) { + long = `--${longMatch[1]}`; + } else if (/\blong\b/.test(argContent) && !/long\s*=/.test(argContent)) { + // bare `long` means use field name with _ -> - + long = `--${name.replace(/_/g, "-")}`; + } -| Parameter | Type | Description | -|-----------|------|-------------| -| \`text\` | string | Text to tokenize | -| \`follow\` | number | Number of redirects to follow (optional) | + // Check if required + const required = + argContent.includes("required = true") || + (type !== "bool" && + !type.startsWith("Option<") && + !type.startsWith("Vec<") && + !short && + !long && + !argContent.includes("default_value")); + + // Check for default value + let defaultVal = null; + const defaultMatch = argContent.match( + /default_value_t\s*=\s*([^,\)]+)/ + ); + if (defaultMatch) { + defaultVal = defaultMatch[1].trim(); + // Clean up Rust-specific patterns + defaultVal = defaultVal + .replace(/crate::DEFAULT_RETRIES/, "3") + .replace(/DEFAULT_INDEX_MEMORY/, "15000000") + .replace(/DumpFormat::XML/, "xml") + .replace(/PrintFormat::Print/, "print") + .replace(/PrintFormat::JSON/, "json"); + } -\`\`\`bash -curl "http://localhost:5005/chinese/tokenize?text=你好世界" -\`\`\` + // Determine if this is a positional arg or a flag + const isPositional = !short && !long && !argContent.includes("default_value_t") && type !== "bool"; + + // Extract value_enum + const isValueEnum = argContent.includes("value_enum"); + + // Determine the arg type for display + let argType = null; + if (type === "bool" || type === "Option") { + argType = null; // boolean flags don't take a value + } else if (isValueEnum) { + argType = `<${name}>`; + } else if (type.includes("PathBuf") || type.includes("String")) { + argType = `<${name}>`; + } else if (type.includes("u32") || type.includes("usize") || type.includes("u16")) { + argType = `<${name}>`; + } else if (type.includes("Vec")) { + 
argType = `<${name}...>`; + } + + // Extract value_parser range info for help + const rangeMatch = argContent.match(/value_parser.*?range\((\d+)\.\.=(\d+)\)/); + if (rangeMatch) { + const rangeInfo = `(${rangeMatch[1]}–${rangeMatch[2]})`; + if (help && !help.includes(rangeMatch[1])) { + help = `${help} ${rangeInfo}`; + } + } + + fields.push({ + name, + type, + short, + long, + help: help || "", + required, + isPositional, + defaultVal, + argType, + }); + } + + return fields; +} + +function extractQuoted(text, key) { + // Match: key = "value" where value may span multiple lines due to formatting + const re = new RegExp(`${key}\\s*=\\s*"([^"]*)"`, "s"); + const m = re.exec(text); + return m ? m[1].trim() : null; +} + +// --------------------------------------------------------------------------- +// Parse the AliasCommands enum +// --------------------------------------------------------------------------- + +function parseAliasCommands(source) { + const commands = {}; + const re = /\/\/\/\s*(.*)\n\s*(?:#\[.*\]\n\s*)*(\w+)\((\w+)\)/g; + let m; + while ((m = re.exec(source)) !== null) { + commands[m[2]] = m[1].trim(); + } + return commands; +} + +// --------------------------------------------------------------------------- +// Parse HTTP serve endpoint structs from serve/ directory +// --------------------------------------------------------------------------- + +function parseServeEndpoints() { + const endpoints = []; + + for (const file of ["lookup.rs", "search.rs", "tokenize.rs"]) { + const source = readRustFile(`serve/${file}`); + + // Extract route path: #[get("/{name}/...")] + const routeMatch = source.match(/#\[get\("([^"]+)"\)\]/); + if (!routeMatch) continue; + const route = routeMatch[1]; + + // Extract request struct fields + const structMatch = source.match( + /pub\s+struct\s+(\w+Request)\s*\{([\s\S]*?)\}/ + ); + if (!structMatch) continue; + + const structName = structMatch[1]; + const body = structMatch[2]; + + const params = []; + const fieldRe = 
/(\w+)\s*:\s*([^,\n]+)/g; + let fm; + while ((fm = fieldRe.exec(body)) !== null) { + const name = fm[1]; + const type = fm[2].trim().replace(/,$/, ""); + const isOptional = type.startsWith("Option<"); + const innerType = isOptional + ? type.match(/Option<(\w+)>/)?.[1] || type + : type; + params.push({ + name, + type: innerType === "String" ? "string" : innerType === "bool" ? "boolean" : "number", + optional: isOptional, + }); + } -All endpoints return JSON.`, - }, - { - name: "alias add", - summary: "Creates a new dictionary alias (fails if one already exists)", - usage: "odict alias add ", - args: [ - { name: "name", required: true, description: "Name of the alias" }, - { name: "path", required: true, description: "Dictionary path" }, - ], - flags: [], - example: `odict alias add eng ./dicts/english.odict`, - }, - { - name: "alias set", - summary: "Creates or updates a dictionary alias", - usage: "odict alias set ", - args: [ - { name: "name", required: true, description: "Name of the alias" }, - { name: "path", required: true, description: "Dictionary path" }, - ], - flags: [], - example: `odict alias set eng ./dicts/english-v2.odict`, - }, - { - name: "alias delete", - summary: "Deletes an alias with the given name", - usage: "odict alias delete ", - args: [ - { name: "name", required: true, description: "Name of the alias to delete" }, - ], - flags: [], - example: `odict alias delete eng`, - }, -]; + endpoints.push({ route, params }); + } + + return endpoints; +} + +// --------------------------------------------------------------------------- +// Build CLI documentation from parsed source +// --------------------------------------------------------------------------- + +const cliSource = readRustFile("cli.rs"); +const commandDescs = parseCommandsEnum(cliSource); +const aliasSource = readRustFile("alias/alias.rs"); +const aliasDescs = parseAliasCommands(aliasSource); + +// Map command variant names to their source files +const commandFiles = { + Compile: 
"compile.rs", + Download: "download.rs", + Dump: "dump.rs", + Index: "index.rs", + Info: "info.rs", + Lexicon: "lexicon.rs", + Lookup: "lookup.rs", + Merge: "merge.rs", + New: "new.rs", + Search: "search.rs", + Serve: "serve/mod.rs", + Tokenize: "tokenize.rs", +}; + +const aliasFiles = { + Add: "alias/set.rs", + Set: "alias/set.rs", + Delete: "alias/delete.rs", +}; + +// Parse serve HTTP endpoints +const serveEndpoints = parseServeEndpoints(); // --------------------------------------------------------------------------- // Render Markdown @@ -319,7 +295,7 @@ title: CLI Reference description: Complete reference for the ODict command-line interface. --- -{/* This file is auto-generated by scripts/generate-cli-docs.mjs. Do not edit manually. */} +{/* This file is auto-generated by scripts/generate-cli-docs.mjs — do not edit manually. */} \`\`\` odict [OPTIONS] @@ -341,48 +317,134 @@ The ODict CLI is the primary tool for creating, compiling, and querying ODict di `; -for (const cmd of commands) { - md += `### \`odict ${cmd.name}\`\n\n`; - md += `${cmd.summary}.\n\n`; - md += `\`\`\`\n${cmd.usage}\n\`\`\`\n\n`; +// Render each main command +for (const [variant, file] of Object.entries(commandFiles)) { + const source = readRustFile(file); + const fields = parseArgsStruct(source); + const desc = commandDescs[variant] || variant; + const cmdName = variant.toLowerCase(); + + md += `### \`odict ${cmdName}\`\n\n`; + md += `${desc}.\n\n`; + + // Build usage string + const positionals = fields.filter((f) => f.isPositional); + const options = fields.filter((f) => !f.isPositional); + let usage = `odict ${cmdName}`; + for (const p of positionals) { + if (p.type.includes("Vec<")) { + usage += p.required ? ` <${p.name}...>` : ` [${p.name}...]`; + } else { + usage += p.required ? 
` <${p.name}>` : ` [${p.name}]`; + } + } + for (const o of options) { + if (o.name === "retries") continue; // skip common retries flag in usage + const flag = o.short || o.long; + if (flag) { + if (o.argType) { + usage += ` [${flag} ${o.argType}]`; + } else { + usage += ` [${flag}]`; + } + } + } + md += `\`\`\`\n${usage}\n\`\`\`\n\n`; - // Arguments - if (cmd.args.length > 0) { + // Positional arguments table + if (positionals.length > 0) { md += `#### Arguments\n\n`; md += `| Argument | Required | Description |\n`; md += `|----------|----------|-------------|\n`; - for (const a of cmd.args) { - md += `| \`${a.name}\` | ${a.required ? "Yes" : "No"} | ${a.description} |\n`; + for (const p of positionals) { + md += `| \`${p.name}\` | ${p.required ? "Yes" : "No"} | ${p.help} |\n`; } md += `\n`; } - // Flags - if (cmd.flags.length > 0) { + // Options table + if (options.length > 0) { md += `#### Options\n\n`; - md += `| Flag | Argument | Description |\n`; - md += `|------|----------|-------------|\n`; - for (const f of cmd.flags) { - const flag = [f.short, f.long].filter(Boolean).join(", "); - md += `| \`${flag}\` | ${f.arg ? `\`${f.arg}\`` : "—"} | ${f.description} |\n`; + md += `| Flag | Description |\n`; + md += `|------|-------------|\n`; + for (const o of options) { + const flags = [o.short, o.long].filter(Boolean).join(", "); + let desc = o.help; + if (o.defaultVal && !desc.includes("default")) { + desc += ` (default: \`${o.defaultVal}\`)`; + } + md += `| \`${flags}\` | ${desc} |\n`; } md += `\n`; } - // Example - if (cmd.example) { - md += `#### Example\n\n`; - md += `\`\`\`bash\n${cmd.example}\n\`\`\`\n\n`; + // HTTP endpoints for serve command + if (cmdName === "serve" && serveEndpoints.length > 0) { + md += `#### HTTP endpoints\n\n`; + md += `When running \`odict serve\`, the following REST endpoints become available. 
All return JSON.\n\n`; + + for (const ep of serveEndpoints) { + md += `##### \`GET ${ep.route}\`\n\n`; + md += `| Parameter | Type | Required | Description |\n`; + md += `|-----------|------|----------|-------------|\n`; + for (const p of ep.params) { + md += `| \`${p.name}\` | ${p.type} | ${p.optional ? "No" : "Yes"} | |\n`; + } + md += `\n`; + } } - // Extra content (for serve endpoints) - if (cmd.extra) { - md += `${cmd.extra}\n\n`; + md += `---\n\n`; +} + +// Render alias subcommands +md += `### \`odict alias\`\n\n`; +md += `Manage dictionary aliases.\n\n`; + +for (const [variant, file] of Object.entries(aliasFiles)) { + const source = readRustFile(file); + const fields = parseArgsStruct(source); + const desc = aliasDescs[variant] || variant; + const cmdName = variant.toLowerCase(); + + md += `#### \`odict alias ${cmdName}\`\n\n`; + md += `${desc}.\n\n`; + + // Build usage + const positionals = fields.filter((f) => f.isPositional); + const options = fields.filter((f) => !f.isPositional); + let usage = `odict alias ${cmdName}`; + for (const p of positionals) { + usage += p.required ? ` <${p.name}>` : ` [${p.name}]`; } + md += `\`\`\`\n${usage}\n\`\`\`\n\n`; - md += `---\n\n`; + if (positionals.length > 0) { + md += `| Argument | Required | Description |\n`; + md += `|----------|----------|-------------|\n`; + for (const p of positionals) { + md += `| \`${p.name}\` | ${p.required ? 
"Yes" : "No"} | ${p.help} |\n`; + } + md += `\n`; + } + + if (options.length > 0) { + md += `| Flag | Description |\n`; + md += `|------|-------------|\n`; + for (const o of options) { + const flags = [o.short, o.long].filter(Boolean).join(", "); + let desc = o.help; + if (o.defaultVal && !desc.includes("default")) { + desc += ` (default: \`${o.defaultVal}\`)`; + } + md += `| \`${flags}\` | ${desc} |\n`; + } + md += `\n`; + } } +md += `---\n`; + // --------------------------------------------------------------------------- // Write output // --------------------------------------------------------------------------- diff --git a/docs/scripts/generate-schema-docs.mjs b/docs/scripts/generate-schema-docs.mjs index 37e76279a..2bcce7c36 100644 --- a/docs/scripts/generate-schema-docs.mjs +++ b/docs/scripts/generate-schema-docs.mjs @@ -1,5 +1,6 @@ /** - * Parses odict.xsd and generates a Markdown reference page for the XML schema. + * Parses odict.xsd and lib/src/schema/pos.rs to generate + * a Markdown reference page for the ODict XML schema. 
* * Run: node scripts/generate-schema-docs.mjs * @@ -12,283 +13,344 @@ import { fileURLToPath } from "node:url"; const __dirname = dirname(fileURLToPath(import.meta.url)); const xsdPath = join(__dirname, "../../odict.xsd"); -const outPath = join( - __dirname, - "../src/content/docs/schema/reference.md" -); +const posPath = join(__dirname, "../../lib/src/schema/pos.rs"); +const outPath = join(__dirname, "../src/content/docs/schema/reference.md"); const xsd = readFileSync(xsdPath, "utf-8"); +const posSource = readFileSync(posPath, "utf-8"); // --------------------------------------------------------------------------- -// Minimal XSD parser – extracts complexTypes and the root element tree +// Tokenize XSD into open / close / self-closing tags // --------------------------------------------------------------------------- -/** Extract all from a chunk of XSD text */ -function parseAttributes(block) { - const attrs = []; - const re = - /]*?)(?:\/>|>[\s\S]*?<\/xs:attribute>)/g; +function tokenize(xml) { + const tokens = []; + const re = /<(\/?)(\w[\w:.]*)((?:\s+[\w:]+\s*=\s*"[^"]*")*)\s*(\/?)>/g; let m; - while ((m = re.exec(block)) !== null) { - const chunk = m[0]; - const name = attr(chunk, "name"); - const use = attr(chunk, "use"); - const type = attr(chunk, "type") || "xs:string"; - const def = attr(chunk, "default"); - if (name) { - attrs.push({ - name, - required: use === "required", - type: type.replace("xs:", ""), - default: def || undefined, - }); + while ((m = re.exec(xml)) !== null) { + const isClose = m[1] === "/"; + const tag = m[2]; + const attrStr = m[3]; + const isSelfClose = m[4] === "/"; + + const attrs = {}; + const attrRe = /([\w:]+)\s*=\s*"([^"]*)"/g; + let am; + while ((am = attrRe.exec(attrStr)) !== null) { + attrs[am[1]] = am[2]; + } + + if (isClose) { + tokens.push({ type: "close", tag, attrs }); + } else if (isSelfClose) { + tokens.push({ type: "selfclose", tag, attrs }); + } else { + tokens.push({ type: "open", tag, attrs }); } } - return 
attrs; + return tokens; } -function attr(text, name) { - const re = new RegExp(`${name}="([^"]*)"`, "i"); - const m = re.exec(text); - return m ? m[1] : null; +const tokens = tokenize(xsd); + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** Find the index of the matching close tag for an open tag at `openIndex`. */ +function findClose(openIndex) { + const openTag = tokens[openIndex].tag; + let depth = 1; + for (let i = openIndex + 1; i < tokens.length; i++) { + if (tokens[i].tag === openTag) { + if (tokens[i].type === "open") depth++; + else if (tokens[i].type === "close") { + depth--; + if (depth === 0) return i; + } + } + } + return -1; } -/** Extract child references from a block */ -function parseChildElements(block) { +/** + * Parse a complexType range (ctOpen..ctClose) for its direct attributes + * and direct child elements. "Direct" means not inside a nested + * xs:complexType — we track xs:complexType nesting depth and only + * collect items at depth 0. 
+ */ +function parseComplexTypeRange(ctOpen, ctClose) { + const attributes = []; const children = []; - // Match xs:element with name attribute (direct children, not nested complexTypes) - const re = - /]*?)(?:\/>|>[\s\S]*?<\/xs:element>)/g; - let m; - while ((m = re.exec(block)) !== null) { - const chunk = m[0]; - const name = attr(chunk, "name"); - const type = attr(chunk, "type"); - const minOccurs = attr(chunk, "minOccurs"); - const maxOccurs = attr(chunk, "maxOccurs"); - if (name) { + let depth = 0; + + for (let i = ctOpen + 1; i < ctClose; i++) { + const t = tokens[i]; + + // Track nesting of inner xs:complexType blocks + if (t.tag === "xs:complexType") { + if (t.type === "open") depth++; + else if (t.type === "close") depth--; + continue; + } + + if (depth > 0) continue; // inside a nested complexType — skip + + // Collect attributes at depth 0 + if ( + t.tag === "xs:attribute" && + t.attrs.name && + (t.type === "selfclose" || t.type === "open") + ) { + attributes.push({ + name: t.attrs.name, + type: (t.attrs.type || "xs:string").replace("xs:", ""), + required: t.attrs.use === "required", + }); + } + + // Collect child elements at depth 0 + if ( + t.tag === "xs:element" && + t.attrs.name && + (t.type === "selfclose" || t.type === "open") + ) { children.push({ - name, - type: type || undefined, - minOccurs: minOccurs ?? "1", - maxOccurs: maxOccurs ?? "1", + name: t.attrs.name, + type: t.attrs.type || undefined, + minOccurs: t.attrs.minOccurs ?? "1", + maxOccurs: t.attrs.maxOccurs ?? 
"1", }); + + // Skip past this element's entire subtree + if (t.type === "open") { + i = findClose(i); + } } } - return children; + + return { attributes, children }; } // --------------------------------------------------------------------------- -// Build the element documentation from the XSD structure +// Phase 1: Parse named complexTypes (defined at XSD top-level) // --------------------------------------------------------------------------- -// We know the ODict schema structure, so we define it explicitly based on -// parsing the XSD. This gives us full control over documentation quality. - -const elements = [ - { - name: "dictionary", - description: "The root element of an ODict XML file. Contains one or more entries.", - attributes: [ - { name: "id", required: false, type: "string", description: "A unique identifier for the dictionary." }, - { name: "name", required: false, type: "string", description: "A human-readable name for the dictionary (e.g. \"English Dictionary\")." }, - ], - children: [ - { name: "entry", min: "1", max: "unbounded", description: "A dictionary entry." }, - ], - }, - { - name: "entry", - description: "Represents a single dictionary entry (headword). An entry can either contain full definitions via etymology elements, or redirect to another entry using the `see` attribute.", - attributes: [ - { name: "term", required: true, type: "string", description: "The headword or term being defined." }, - { name: "see", required: false, type: "string", description: "Cross-reference to another entry's term. When set, this entry acts as a redirect (e.g. \"ran\" → \"run\")." }, - ], - children: [ - { name: "pronunciation", min: "0", max: "unbounded", description: "Entry-level pronunciation." }, - { name: "ety", min: "0", max: "unbounded", description: "An etymology grouping." }, - ], - }, - { - name: "ety", - description: "Groups senses under a common etymology (word origin). 
A single entry can have multiple etymologies if the word has distinct historical origins.", - attributes: [ - { name: "id", required: false, type: "string", description: "A unique identifier for this etymology." }, - { name: "pronunciation", required: false, type: "string", description: "A simple pronunciation string (e.g. IPA). For richer pronunciation data, use child `` elements on the parent entry instead." }, - { name: "description", required: false, type: "string", description: "A description of the word's origin (e.g. \"From Latin currere\")." }, - ], - children: [ - { name: "sense", min: "1", max: "unbounded", description: "A sense grouping (by part of speech)." }, - ], - }, - { - name: "sense", - description: "Groups definitions under a part of speech. A sense can contain definitions directly, or organize them into groups.", - attributes: [ - { name: "pos", required: false, type: "string", description: "Part of speech code (e.g. `n`, `v`, `adj`, `adv`, `phr`). See the [Parts of Speech](#parts-of-speech) section for all supported values." }, - ], - children: [ - { name: "group", min: "0", max: "unbounded", description: "A named group of related definitions." }, - { name: "definition", min: "0", max: "unbounded", description: "A definition (can appear alongside or instead of groups)." }, - ], - }, - { - name: "group", - description: "An optional grouping of related definitions within a sense. Useful for organizing many definitions into logical clusters.", - attributes: [ - { name: "id", required: false, type: "string", description: "A unique identifier for this group." }, - { name: "description", required: false, type: "string", description: "A label or description for this group (e.g. \"Verb senses related to motion\")." }, - ], - children: [ - { name: "definition", min: "1", max: "unbounded", description: "A definition within this group." 
}, - ], - }, - { - name: "definition", - description: "A single definition of the entry's term.", - attributes: [ - { name: "id", required: false, type: "string", description: "A unique identifier for this definition." }, - { name: "value", required: true, type: "string", description: "The definition text. Supports inline Markdown-style formatting in parentheses for labels, e.g. `\"(Computing) a set of words...\"`." }, - ], - children: [ - { name: "example", min: "0", max: "unbounded", description: "An example usage of this definition." }, - { name: "note", min: "0", max: "unbounded", description: "A supplementary note about this definition." }, - ], - }, - { - name: "note", - description: "A supplementary note attached to a definition. Notes can carry their own examples.", - attributes: [ - { name: "id", required: false, type: "string", description: "A unique identifier for this note." }, - { name: "value", required: true, type: "string", description: "The note text." }, - ], - children: [ - { name: "example", min: "1", max: "unbounded", description: "An example relevant to this note." }, - ], - }, - { - name: "example", - description: "An example sentence or usage demonstrating a definition, note, or pronunciation.", - attributes: [ - { name: "value", required: true, type: "string", description: "The example text (e.g. `\"The dog runs after the cat.\"`)." }, - ], - children: [ - { name: "pronunciation", min: "0", max: "unbounded", description: "A pronunciation of this example (useful for non-Latin scripts)." }, - ], - }, - { - name: "pronunciation", - description: "Describes how a word, entry, or example is pronounced. Supports any phonetic system (IPA, Pinyin, Romaji, etc.) and optional audio URLs.", - attributes: [ - { name: "kind", required: true, type: "string", description: "The pronunciation system used (e.g. `ipa`, `pinyin`, `romaji`, or any custom string)." 
}, - { name: "value", required: true, type: "string", description: "The pronunciation notation (e.g. `həˈləʊ`, `nǐ hǎo`)." }, - ], - children: [ - { name: "url", min: "0", max: "unbounded", description: "A URL to an audio file for this pronunciation." }, - ], - }, - { - name: "url", - description: "A reference to an audio file for a pronunciation. Used as a child of ``.", - attributes: [ - { name: "src", required: true, type: "string", description: "Path or URL to the audio file." }, - { name: "type", required: false, type: "string", description: "MIME type of the audio file (e.g. `audio/mpeg`, `audio/ogg`)." }, - { name: "description", required: false, type: "string", description: "A description of this audio (e.g. \"British pronunciation\")." }, - ], - children: [], - }, -]; +const namedTypes = new Map(); + +for (let i = 0; i < tokens.length; i++) { + const t = tokens[i]; + if (t.tag === "xs:complexType" && t.type === "open" && t.attrs.name) { + const ctClose = findClose(i); + const { attributes, children } = parseComplexTypeRange(i, ctClose); + namedTypes.set(t.attrs.name, { attributes, children }); + i = ctClose; + } +} // --------------------------------------------------------------------------- -// Known POS codes (extracted from lib/src/schema/pos.rs) +// Phase 2: Recursively walk the root element to build element map // --------------------------------------------------------------------------- -const universalPos = [ - ["n", "noun"], - ["v", "verb"], - ["adj", "adjective"], - ["adv", "adverb"], - ["pron", "pronoun"], - ["prep", "preposition"], - ["conj", "conjunction"], - ["intj", "interjection"], - ["det", "determiner"], - ["part", "particle"], - ["num", "numeric"], - ["abv", "abbreviation"], - ["adf", "adfix"], - ["aff", "affix"], - ["art", "article"], - ["aux", "auxiliary"], - ["aux_adj", "auxiliary adjective"], - ["aux_v", "auxiliary verb"], - ["chr", "character"], - ["cf", "circumfix"], - ["cls", "classifier"], - ["conj_c", "coordinating 
conjunction"], - ["conj_s", "subordinating conjunction"], - ["contr", "contraction"], - ["cop", "copula"], - ["ctr", "counter"], - ["expr", "expression"], - ["inf", "infix"], - ["intf", "interfix"], - ["name", "name"], - ["phr", "phrase"], - ["phr_adj", "adjective phrase"], - ["phr_adv", "adverbial phrase"], - ["phr_prep", "prepositional phrase"], - ["postp", "postposition"], - ["pref", "prefix"], - ["propn", "proper noun"], - ["prov", "proverb"], - ["punc", "punctuation"], - ["suff", "suffix"], - ["sym", "symbol"], - ["vi", "intransitive verb"], - ["vt", "transitive verb"], - ["un", "unknown"], +const elements = new Map(); + +/** Resolve a named complexType into an element descriptor and register children. */ +function resolveNamedType(typeName) { + const type = namedTypes.get(typeName); + if (!type) return { attributes: [], children: [] }; + + const children = type.children.map((c) => ({ + name: c.name, + minOccurs: c.minOccurs, + maxOccurs: c.maxOccurs, + })); + + // Recursively register child elements that reference named types + for (const child of type.children) { + if (!elements.has(child.name) && child.type && namedTypes.has(child.type)) { + elements.set(child.name, resolveNamedType(child.type)); + } else if (!elements.has(child.name)) { + elements.set(child.name, { attributes: [], children: [] }); + } + } + + return { attributes: [...type.attributes], children }; +} + +/** Process an xs:element token at `index` and register it in the elements map. 
*/ +function processElement(index) { + const t = tokens[index]; + const name = t.attrs.name; + const type = t.attrs.type; + + if (elements.has(name)) return; + + // Self-closing element or element with a named type + if (t.type === "selfclose") { + if (type && namedTypes.has(type)) { + elements.set(name, resolveNamedType(type)); + } else { + elements.set(name, { attributes: [], children: [] }); + } + return; + } + + // Open element with a named type (no inline complexType) + if (type && namedTypes.has(type)) { + elements.set(name, resolveNamedType(type)); + return; + } + + const elClose = findClose(index); + + // Find the inline xs:complexType within this element + for (let i = index + 1; i < elClose; i++) { + if (tokens[i].tag === "xs:complexType" && tokens[i].type === "open") { + const ctClose = findClose(i); + const { attributes, children } = parseComplexTypeRange(i, ctClose); + + elements.set(name, { + attributes, + children: children.map((c) => ({ + name: c.name, + minOccurs: c.minOccurs, + maxOccurs: c.maxOccurs, + })), + }); + + // Recursively process child elements found at depth 0 + let depth = 0; + for (let j = i + 1; j < ctClose; j++) { + if (tokens[j].tag === "xs:complexType") { + if (tokens[j].type === "open") depth++; + else if (tokens[j].type === "close") depth--; + continue; + } + if (depth > 0) continue; + + if ( + tokens[j].tag === "xs:element" && + tokens[j].attrs.name && + (tokens[j].type === "selfclose" || tokens[j].type === "open") + ) { + processElement(j); + if (tokens[j].type === "open") { + j = findClose(j); + } + } + } + + break; + } + } +} + +// Find the root and process it +for (let i = 0; i < tokens.length; i++) { + const t = tokens[i]; + if ( + t.tag === "xs:element" && + t.attrs.name === "dictionary" && + (t.type === "open" || t.type === "selfclose") + ) { + processElement(i); + break; + } +} + +// --------------------------------------------------------------------------- +// Parse POS codes from lib/src/schema/pos.rs +// 
--------------------------------------------------------------------------- + +function parsePosEnum(source) { + const entries = []; + const re = + /#\[strum\(to_string\s*=\s*"([^"]+)"\)\]\s*(?:#\[.*\]\s*)*(\w+)/g; + let m; + while ((m = re.exec(source)) !== null) { + const label = m[1]; + const variant = m[2]; + if (variant === "Other") continue; + entries.push({ variant, label }); + } + return entries; +} + +const allPos = parsePosEnum(posSource); + +const japaneseVariantPrefixes = [ + "AdjPn", "AdjKari", "AdjKu", "AdjNari", "AdjNa", "AdjShiku", + "AdjT", "AdjIx", "NAdv", "AdvTo", "AdjNo", "NPref", "NSuf", + "NT", "AdjF", "V5", "V1", "Vz", "Vk", "V2", "Vn", "Vr", + "VsC", "Vs", "VUnspec", "V4", ]; +function isJapanese(variant) { + return japaneseVariantPrefixes.some( + (p) => variant === p || variant.startsWith(p) + ); +} + +const universalPos = allPos.filter((p) => !isJapanese(p.variant)); +const japanesePos = allPos.filter((p) => isJapanese(p.variant)); + +function variantToCode(variant) { + return variant + .replace(/([a-z])([A-Z])/g, "$1_$2") + .replace(/([A-Z]+)([A-Z][a-z])/g, "$1_$2") + .toLowerCase(); +} + +// --------------------------------------------------------------------------- +// Build element hierarchy tree (with deduplication via seen set) +// --------------------------------------------------------------------------- + +function buildTree(name, prefix = "", isLast = true, seen = new Set(), isRoot = true) { + const el = elements.get(name); + const connector = isRoot ? "" : isLast ? "└── " : "├── "; + + if (seen.has(name)) { + return `${prefix}${connector}${name} …\n`; + } + + let result = `${prefix}${connector}${name}\n`; + seen.add(name); + + if (!el || el.children.length === 0) return result; + + const childPrefix = isRoot ? "" : prefix + (isLast ? 
" " : "│ "); + + for (let i = 0; i < el.children.length; i++) { + const child = el.children[i]; + const childIsLast = i === el.children.length - 1; + result += buildTree(child.name, childPrefix, childIsLast, seen, false); + } + + return result; +} + // --------------------------------------------------------------------------- // Render Markdown // --------------------------------------------------------------------------- +const elementOrder = [ + "dictionary", "entry", "ety", "sense", "group", + "definition", "note", "example", "pronunciation", "url", +]; + let md = `--- title: XML Schema Reference description: Complete reference for the ODict XML (ODXML) schema. --- -{/* This file is auto-generated by scripts/generate-schema-docs.mjs. Do not edit manually. */} +{/* This file is auto-generated by scripts/generate-schema-docs.mjs — do not edit manually. */} -This page is automatically generated from [\`odict.xsd\`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd). +This page is automatically generated from [\`odict.xsd\`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd) and [\`pos.rs\`](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs). 
## Element hierarchy \`\`\` -dictionary -├── entry -│ ├── pronunciation -│ │ └── url -│ └── ety -│ └── sense -│ ├── group -│ │ └── definition -│ │ ├── example -│ │ │ └── pronunciation -│ │ │ └── url -│ │ └── note -│ │ └── example -│ │ └── pronunciation -│ │ └── url -│ └── definition -│ ├── example -│ │ └── pronunciation -│ │ └── url -│ └── note -│ └── example -│ └── pronunciation -│ └── url +${buildTree("dictionary").trimEnd()} \`\`\` --- @@ -297,28 +359,28 @@ dictionary `; -for (const el of elements) { - md += `### \`<${el.name}>\`\n\n`; - md += `${el.description}\n\n`; +for (const name of elementOrder) { + const el = elements.get(name); + if (!el) continue; + + md += `### \`<${name}>\`\n\n`; - // Attributes table if (el.attributes.length > 0) { md += `#### Attributes\n\n`; - md += `| Attribute | Type | Required | Description |\n`; - md += `|-----------|------|----------|-------------|\n`; + md += `| Attribute | Type | Required |\n`; + md += `|-----------|------|----------|\n`; for (const a of el.attributes) { - md += `| \`${a.name}\` | \`${a.type}\` | ${a.required ? "Yes" : "No"} | ${a.description} |\n`; + md += `| \`${a.name}\` | \`${a.type}\` | ${a.required ? 
"Yes" : "No"} |\n`; } md += `\n`; } - // Children if (el.children.length > 0) { md += `#### Child elements\n\n`; - md += `| Element | Min | Max | Description |\n`; - md += `|---------|-----|-----|-------------|\n`; + md += `| Element | Min | Max |\n`; + md += `|---------|-----|-----|\n`; for (const c of el.children) { - md += `| [\`<${c.name}>\`](#${c.name}) | ${c.min} | ${c.max} | ${c.description} |\n`; + md += `| [\`<${c.name}>\`](#${c.name}) | ${c.minOccurs} | ${c.maxOccurs} |\n`; } md += `\n`; } @@ -326,18 +388,28 @@ for (const el of elements) { md += `---\n\n`; } -// Parts of Speech section +// --------------------------------------------------------------------------- +// Parts of Speech +// --------------------------------------------------------------------------- + md += `## Parts of speech\n\n`; -md += `The \`pos\` attribute on \`\` accepts the following codes. You can also pass any custom string, which will be treated as a custom part of speech.\n\n`; +md += `The \`pos\` attribute on \`\` accepts the following values. You can also pass any custom string, which will be treated as a custom part of speech.\n\n`; + +md += `### Universal\n\n`; +md += `| Code | Label |\n`; +md += `|------|-------|\n`; +for (const p of universalPos) { + md += `| \`${variantToCode(p.variant)}\` | ${p.label} |\n`; +} +md += `\n`; + +md += `### Japanese-specific\n\n`; md += `| Code | Label |\n`; md += `|------|-------|\n`; -for (const [code, label] of universalPos) { - md += `| \`${code}\` | ${label} |\n`; +for (const p of japanesePos) { + md += `| \`${variantToCode(p.variant)}\` | ${p.label} |\n`; } md += `\n`; -md += `:::note\n`; -md += `ODict also supports an extensive set of Japanese-specific parts of speech (Godan verbs, Ichidan verbs, Nidan verbs, etc.). These use codes like \`v5b\`, \`v1\`, \`vk\`, \`adj_na\`, etc. 
Refer to the [source code](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs) for the complete list.\n`; -md += `:::\n`; // --------------------------------------------------------------------------- // Write output @@ -345,4 +417,7 @@ md += `:::\n`; mkdirSync(dirname(outPath), { recursive: true }); writeFileSync(outPath, md, "utf-8"); -console.log(`✅ Generated schema reference → ${outPath}`); + +console.log( + `Generated schema reference -> ${outPath} (${elements.size} elements, ${allPos.length} POS codes)` +); diff --git a/docs/src/content/docs/cli/reference.md b/docs/src/content/docs/cli/reference.md index e4e943125..641bdc268 100644 --- a/docs/src/content/docs/cli/reference.md +++ b/docs/src/content/docs/cli/reference.md @@ -3,7 +3,7 @@ title: CLI Reference description: Complete reference for the ODict command-line interface. --- -{/* This file is auto-generated by scripts/generate-cli-docs.mjs. Do not edit manually. */} +{/* This file is auto-generated by scripts/generate-cli-docs.mjs — do not edit manually. */} ``` odict [OPTIONS] @@ -23,36 +23,6 @@ The ODict CLI is the primary tool for creating, compiling, and querying ODict di ## Commands -### `odict new` - -Scaffolds a new ODict XML dictionary. - -``` -odict new [-n ] -``` - -#### Arguments - -| Argument | Required | Description | -|----------|----------|-------------| -| `file_name` | Yes | Name of your new dictionary file (without extension) | - -#### Options - -| Flag | Argument | Description | -|------|----------|-------------| -| `-n` | `` | Name attribute of the `` element | - -#### Example - -```bash -# Create a new dictionary file -odict new my-dictionary -n "My Dictionary" -# Creates my-dictionary.xml -``` - ---- - ### `odict compile` Compiles a dictionary from ODXML. @@ -69,94 +39,59 @@ odict compile [-o ] [-q ] [-w ] #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-o` | `` | Output path of compiled dictionary. 
Defaults to the input path with a `.odict` extension. | -| `-q` | `<0-11>` | Brotli compression level (default: `8`) | -| `-w` | `<0-22>` | Brotli large window size (default: `22`) | - -#### Example - -```bash -# Compile with default settings -odict compile my-dictionary.xml - -# Compile with custom output and compression -odict compile my-dictionary.xml -o out/dict.odict -q 11 -``` +| Flag | Description | +|------|-------------| +| `-o` | Output path of compiled dictionary | +| `-q` | Brotli compression level (between 0 and 11) (default: `8`) | +| `-w` | Brotli large window size (between 0 and 22) (default: `22`) | --- -### `odict lookup` +### `odict download` -Looks up entries in a compiled dictionary without indexing. +Downloads a dictionary from the remote registry. ``` -odict lookup [-f ] [-F ] [-s ] [-i] +odict download [-o ] [--no-cache] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionary` | Yes | Path to a compiled dictionary or an alias | -| `queries` | Yes | One or more words to look up | +| `dictionary` | Yes | Dictionary to download (e.g., 'wiktionary/eng') | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-f, --format` | `` | Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`) | -| `-F, --follow` | `` | Number of redirects to follow via `see` attributes (default: `0`). Use a high number for infinite following. 
| -| `-s, --split` | `` | If not found, split the query into words of at least length `n` and look up each separately (default: `0`, disabled) | -| `-i, --insensitive` | — | Perform case-insensitive lookups | - -#### Example - -```bash -# Simple lookup -odict lookup my-dictionary.odict cat - -# Lookup with JSON output and follow redirects -odict lookup my-dictionary.odict ran -f json -F 1 - -# Case-insensitive lookup with splitting -odict lookup my-dictionary.odict "catdog" -s 3 -i -``` +| Flag | Description | +|------|-------------| +| `-o, --output` | Directory to download the dictionary to (defaults to config directory) | +| `--no-cache` | Disable caching (always download fresh copy) (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict search` +### `odict dump` -Runs a full-text query on a compiled dictionary. +Outputs a dictionary in a human-readable format. ``` -odict search [-f ] [--index] +odict dump [-f] [-o ] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionary` | Yes | Path to a compiled dictionary or an alias | -| `query` | Yes | Search query | +| `input` | Yes | Path to a compile dictionary | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-f, --format` | `` | Output format: `json`, `xml`, `markdown`, `html`, `print` (default: `json`) | -| `--index` | — | Creates a new index if one doesn't already exist | - -#### Example - -```bash -# Search with auto-indexing -odict search my-dictionary.odict "move swiftly" --index - -# Search with specific output format -odict search my-dictionary.odict "greeting" -f xml -``` +| Flag | Description | +|------|-------------| +| `-f` | Format in which to dump the dictionary. (default: `xml`) | +| `-o` | Output path of the dump. Defaults to stdout. 
| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- @@ -176,357 +111,281 @@ odict index [-d ] [-f] [-m ] #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-d` | `` | Custom directory to store the index | -| `-f` | — | Whether to overwrite the index if it already exists | -| `-m` | `` | Memory arena per thread in bytes. Must be above 15MB. (default: `15000000`) | - -#### Example - -```bash -# Create an index with default settings -odict index my-dictionary.odict - -# Overwrite existing index with custom memory -odict index my-dictionary.odict -f -m 50000000 -``` +| Flag | Description | +|------|-------------| +| `-d` | Custom directory to store the index | +| `-f` | Whether to overwrite the index if it already exists (default: `false`) | +| `-m` | Memory arena per thread in bytes. Must be above 15MB. (default: `15000000`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict tokenize` +### `odict info` -Tokenizes text and finds dictionary entries for each token. +Prints the metadata info for a dictionary file. 
``` -odict tokenize [-f ] [-F ] [-i] +odict info ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionary` | Yes | Path to a compiled dictionary | -| `text` | Yes | Text to tokenize | +| `dictionary_path` | Yes | Path to a compiled dictionary | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-f, --format` | `` | Output format: `print`, `json`, `xml`, `markdown`, `html` (default: `print`) | -| `-F, --follow` | `` | Number of redirects to follow via `see` attributes (default: `0`) | -| `-i, --insensitive` | — | Perform case-insensitive lookups when matching tokens | - -#### Example - -```bash -# Tokenize Chinese text -odict tokenize chinese.odict "你好世界" - -# Tokenize with redirect following -odict tokenize my-dictionary.odict "the cat ran" -F 1 -f json -``` +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict dump` +### `odict lexicon` -Outputs a dictionary in a human-readable format. +Lists all words defined in a dictionary. ``` -odict dump [-f ] [-o ] +odict lexicon ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `input` | Yes | Path to a compiled dictionary | +| `dictionary` | Yes | Path to a compiled dictionary | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-f` | `` | Dump format: `xml`, `sqlite`, `postgres`, `mysql` (default: `xml`) | -| `-o` | `` | Output path. Defaults to stdout. 
| - -#### Example - -```bash -# Dump as XML to stdout -odict dump my-dictionary.odict - -# Dump as SQL to a file -odict dump my-dictionary.odict -f sqlite -o dictionary.sql -``` +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict merge` +### `odict lookup` -Merges entries from multiple dictionaries into one. +Looks up an entry in a compiled dictionary without indexing. ``` -odict merge [-o ] +odict lookup [-f ] [-F] [-s ] [-i] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `destination` | Yes | Path of the dictionary to merge into (unless `--output` is specified) | -| `sources` | Yes | Paths of dictionaries to merge | +| `dictionary_path` | Yes | Path to a compiled dictionary | +| `queries` | Yes | Words to look up | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-o, --output` | `` | Separate output path for the compiled dictionary | - -#### Example - -```bash -# Merge two dictionaries into the first -odict merge base.odict extra1.odict extra2.odict - -# Merge into a new file -odict merge base.odict extra.odict -o combined.odict -``` +| Flag | Description | +|------|-------------| +| `-f, --format` | Output format of the entries (default: `print`) | +| `-F, --follow` | Follow see_also redirects until finding an entry with etymologies | +| `-s, --split` | If a definition cannot be found, attempt to split the query into words of at least length S and look up each word separately. Can be relatively slow. (default: `0`) | +| `-i, --insensitive` | Perform case-insensitive lookups (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict info` +### `odict merge` -Prints the metadata for a dictionary file. +Merge entries from multiple dictionaries into a destination dictionary. 
``` -odict info +odict merge [-o ] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionary` | Yes | Path to a compiled dictionary | +| `destination` | Yes | Path of the dictionary to merge into (unless --output is specified) | +| `sources` | Yes | Paths of dictionaries to merge | -#### Example +#### Options -```bash -odict info my-dictionary.odict -# Output: -# My Dictionary -# ───────────── -# File Version: 3 -# File Size: 1.23 KB -# Entries: 5,000 -``` +| Flag | Description | +|------|-------------| +| `-o, --output` | Separate output path for the compiled dictionary | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict lexicon` +### `odict new` -Lists all words defined in a dictionary. +Scaffolds a new ODict XML dictionary. ``` -odict lexicon +odict new [-n ] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionary` | Yes | Path to a compiled dictionary | +| `file_name` | Yes | Name of your new dictionary file | -#### Example +#### Options -```bash -odict lexicon my-dictionary.odict -# cat -# dog -# run -# ... -``` +| Flag | Description | +|------|-------------| +| `-n` | Name attribute of the dictionary element | --- -### `odict download` +### `odict search` -Downloads a dictionary from the remote registry. +Run a full-text query on a compiled dictionary. ``` -odict download [-o ] [--no-cache] +odict search [-f] [--index] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionary` | Yes | Dictionary to download (e.g. 
`wiktionary/eng`) | +| `dictionary` | Yes | Path to a compiled dictionary or an alias | +| `query` | Yes | Search query | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-o, --output` | `` | Directory to download to (defaults to config directory) | -| `--no-cache` | — | Disable caching (always download a fresh copy) | - -#### Example - -```bash -# Download English Wiktionary dictionary -odict download wiktionary/eng - -# Download Japanese dictionary to a specific directory -odict download wiktionary/jpn -o ./dicts/ -``` +| Flag | Description | +|------|-------------| +| `-f, --format` | Format in which to print the results (default: `json`) | +| `--index` | Creates a new index if one doesn't already exist (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- ### `odict serve` -Starts a local HTTP server to serve one or several dictionaries. +Start a local web server to serve one or several dictionaries. ``` -odict serve [dictionaries...] [-p ] [-c ] [-l ] +odict serve [dictionaries...] 
[-p ] [-c ] [-l] ``` #### Arguments | Argument | Required | Description | |----------|----------|-------------| -| `dictionaries` | No | Paths to compiled dictionaries or directories containing `.odict` files | +| `dictionaries` | No | | #### Options -| Flag | Argument | Description | -|------|----------|-------------| -| `-p` | `` | Port to listen on (default: `5005`) | -| `-c, --capacity` | `` | Maximum number of dictionaries to keep in memory (default: `5`) | -| `-l, --level` | `` | Log level: `trace`, `debug`, `info`, `warn`, `error` | +| Flag | Description | +|------|-------------| +| `-p` | Port to listen on (default: `5005`) | +| `-c, --capacity` | Maximum number of dictionaries to keep in memory (default: `5`) | +| `-l, --level` | | -#### Example +#### HTTP endpoints -```bash -# Serve a single dictionary -odict serve my-dictionary.odict +When running `odict serve`, the following REST endpoints become available. All return JSON. -# Serve a directory of dictionaries on a custom port -odict serve ./dicts/ -p 8080 -c 10 -``` +##### `GET /{name}/lookup` -### HTTP endpoints +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `q` | string | Yes | | +| `follow` | boolean | No | | +| `split` | number | No | | -When running `odict serve`, the following REST endpoints become available: +##### `GET /{name}/search` -#### `GET /{name}/lookup` +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `q` | string | Yes | | +| `limit` | number | No | | -Look up entries by exact match. 
+##### `GET /{name}/tokenize` -| Parameter | Type | Description | -|-----------|------|-------------| -| `queries` | string | Comma-separated list of terms to look up | -| `follow` | number | Number of redirects to follow (optional) | -| `split` | number | Minimum word length for splitting (optional) | +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `text` | string | Yes | | +| `follow` | boolean | No | | -```bash -curl "http://localhost:5005/my-dictionary/lookup?queries=cat,dog&follow=1" -``` - -#### `GET /{name}/search` +--- -Full-text search across definitions. +### `odict tokenize` -| Parameter | Type | Description | -|-----------|------|-------------| -| `query` | string | Search query | -| `limit` | number | Maximum results to return (default: 10) | +Tokenize text and find dictionary entries for each token. -```bash -curl "http://localhost:5005/my-dictionary/search?query=move+swiftly&limit=5" +``` +odict tokenize [-f ] [-F] [-i] ``` -#### `GET /{name}/tokenize` - -Tokenize text and find matching entries. +#### Arguments -| Parameter | Type | Description | -|-----------|------|-------------| -| `text` | string | Text to tokenize | -| `follow` | number | Number of redirects to follow (optional) | +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary_path` | Yes | Path to a compiled dictionary | +| `text` | Yes | Text to tokenize | -```bash -curl "http://localhost:5005/chinese/tokenize?text=你好世界" -``` +#### Options -All endpoints return JSON. 
+| Flag | Description | +|------|-------------| +| `-f, --format` | Output format of the entries (default: `print`) | +| `-F, --follow` | Follow see_also redirects until finding an entry with etymologies | +| `-i, --insensitive` | Perform case-insensitive lookups when matching tokens (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | --- -### `odict alias add` +### `odict alias` + +Manage dictionary aliases. -Creates a new dictionary alias (fails if one already exists). +#### `odict alias add` + +Attempts to create a new dictionary alias, failing if one already exists with the given name. ``` odict alias add ``` -#### Arguments - | Argument | Required | Description | |----------|----------|-------------| | `name` | Yes | Name of the alias | | `path` | Yes | Dictionary path | -#### Example +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | -```bash -odict alias add eng ./dicts/english.odict -``` +#### `odict alias set` ---- - -### `odict alias set` - -Creates or updates a dictionary alias. +Creates or updates an existing dictionary alias. ``` odict alias set ``` -#### Arguments - | Argument | Required | Description | |----------|----------|-------------| | `name` | Yes | Name of the alias | | `path` | Yes | Dictionary path | -#### Example - -```bash -odict alias set eng ./dicts/english-v2.odict -``` - ---- +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | -### `odict alias delete` +#### `odict alias delete` -Deletes an alias with the given name. +Deletes an alias with the given name if it exists. 
``` odict alias delete ``` -#### Arguments - | Argument | Required | Description | |----------|----------|-------------| -| `name` | Yes | Name of the alias to delete | - -#### Example - -```bash -odict alias delete eng -``` +| `name` | Yes | Name of the alias | --- - diff --git a/docs/src/content/docs/schema/reference.md b/docs/src/content/docs/schema/reference.md index eebaa416b..0d78b7c62 100644 --- a/docs/src/content/docs/schema/reference.md +++ b/docs/src/content/docs/schema/reference.md @@ -3,36 +3,26 @@ title: XML Schema Reference description: Complete reference for the ODict XML (ODXML) schema. --- -{/* This file is auto-generated by scripts/generate-schema-docs.mjs. Do not edit manually. */} +{/* This file is auto-generated by scripts/generate-schema-docs.mjs — do not edit manually. */} -This page is automatically generated from [`odict.xsd`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd). +This page is automatically generated from [`odict.xsd`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd) and [`pos.rs`](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs). ## Element hierarchy ``` dictionary -├── entry -│ ├── pronunciation -│ │ └── url -│ └── ety -│ └── sense -│ ├── group -│ │ └── definition -│ │ ├── example -│ │ │ └── pronunciation -│ │ │ └── url -│ │ └── note -│ │ └── example -│ │ └── pronunciation -│ │ └── url -│ └── definition -│ ├── example -│ │ └── pronunciation -│ │ └── url -│ └── note -│ └── example -│ └── pronunciation -│ └── url +└── entry + ├── pronunciation + │ └── url + └── ety + └── sense + ├── group + │ └── definition + │ ├── example + │ │ └── pronunciation … + │ └── note + │ └── example … + └── definition … ``` --- @@ -41,242 +31,298 @@ dictionary ### `` -The root element of an ODict XML file. Contains one or more entries. 
- #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `id` | `string` | No | A unique identifier for the dictionary. | -| `name` | `string` | No | A human-readable name for the dictionary (e.g. "English Dictionary"). | +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `name` | `string` | No | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#entry) | 1 | unbounded | A dictionary entry. | +| Element | Min | Max | +|---------|-----|-----| +| [``](#entry) | 1 | unbounded | --- ### `` -Represents a single dictionary entry (headword). An entry can either contain full definitions via etymology elements, or redirect to another entry using the `see` attribute. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `term` | `string` | Yes | The headword or term being defined. | -| `see` | `string` | No | Cross-reference to another entry's term. When set, this entry acts as a redirect (e.g. "ran" → "run"). | +| Attribute | Type | Required | +|-----------|------|----------| +| `term` | `string` | Yes | +| `see` | `string` | No | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#pronunciation) | 0 | unbounded | Entry-level pronunciation. | -| [``](#ety) | 0 | unbounded | An etymology grouping. | +| Element | Min | Max | +|---------|-----|-----| +| [``](#pronunciation) | 0 | unbounded | +| [``](#ety) | 0 | unbounded | --- ### `` -Groups senses under a common etymology (word origin). A single entry can have multiple etymologies if the word has distinct historical origins. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `id` | `string` | No | A unique identifier for this etymology. 
| -| `pronunciation` | `string` | No | A simple pronunciation string (e.g. IPA). For richer pronunciation data, use child `` elements on the parent entry instead. | -| `description` | `string` | No | A description of the word's origin (e.g. "From Latin currere"). | +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `pronunciation` | `string` | No | +| `description` | `string` | No | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#sense) | 1 | unbounded | A sense grouping (by part of speech). | +| Element | Min | Max | +|---------|-----|-----| +| [``](#sense) | 1 | unbounded | --- ### `` -Groups definitions under a part of speech. A sense can contain definitions directly, or organize them into groups. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `pos` | `string` | No | Part of speech code (e.g. `n`, `v`, `adj`, `adv`, `phr`). See the [Parts of Speech](#parts-of-speech) section for all supported values. | +| Attribute | Type | Required | +|-----------|------|----------| +| `pos` | `string` | No | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#group) | 0 | unbounded | A named group of related definitions. | -| [``](#definition) | 0 | unbounded | A definition (can appear alongside or instead of groups). | +| Element | Min | Max | +|---------|-----|-----| +| [``](#group) | 0 | unbounded | +| [``](#definition) | 0 | unbounded | --- ### `` -An optional grouping of related definitions within a sense. Useful for organizing many definitions into logical clusters. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `id` | `string` | No | A unique identifier for this group. | -| `description` | `string` | No | A label or description for this group (e.g. "Verb senses related to motion"). 
| +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `description` | `string` | No | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#definition) | 1 | unbounded | A definition within this group. | +| Element | Min | Max | +|---------|-----|-----| +| [``](#definition) | 1 | unbounded | --- ### `` -A single definition of the entry's term. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `id` | `string` | No | A unique identifier for this definition. | -| `value` | `string` | Yes | The definition text. Supports inline Markdown-style formatting in parentheses for labels, e.g. `"(Computing) a set of words..."`. | +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `value` | `string` | Yes | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#example) | 0 | unbounded | An example usage of this definition. | -| [``](#note) | 0 | unbounded | A supplementary note about this definition. | +| Element | Min | Max | +|---------|-----|-----| +| [``](#example) | 0 | unbounded | +| [``](#note) | 0 | unbounded | --- ### `` -A supplementary note attached to a definition. Notes can carry their own examples. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `id` | `string` | No | A unique identifier for this note. | -| `value` | `string` | Yes | The note text. | +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `value` | `string` | Yes | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#example) | 1 | unbounded | An example relevant to this note. 
| +| Element | Min | Max | +|---------|-----|-----| +| [``](#example) | 1 | unbounded | --- ### `` -An example sentence or usage demonstrating a definition, note, or pronunciation. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `value` | `string` | Yes | The example text (e.g. `"The dog runs after the cat."`). | +| Attribute | Type | Required | +|-----------|------|----------| +| `value` | `string` | Yes | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#pronunciation) | 0 | unbounded | A pronunciation of this example (useful for non-Latin scripts). | +| Element | Min | Max | +|---------|-----|-----| +| [``](#pronunciation) | 0 | unbounded | --- ### `` -Describes how a word, entry, or example is pronounced. Supports any phonetic system (IPA, Pinyin, Romaji, etc.) and optional audio URLs. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `kind` | `string` | Yes | The pronunciation system used (e.g. `ipa`, `pinyin`, `romaji`, or any custom string). | -| `value` | `string` | Yes | The pronunciation notation (e.g. `həˈləʊ`, `nǐ hǎo`). | +| Attribute | Type | Required | +|-----------|------|----------| +| `kind` | `string` | Yes | +| `value` | `string` | Yes | #### Child elements -| Element | Min | Max | Description | -|---------|-----|-----|-------------| -| [``](#url) | 0 | unbounded | A URL to an audio file for this pronunciation. | +| Element | Min | Max | +|---------|-----|-----| +| [``](#url) | 0 | unbounded | --- ### `` -A reference to an audio file for a pronunciation. Used as a child of ``. - #### Attributes -| Attribute | Type | Required | Description | -|-----------|------|----------|-------------| -| `src` | `string` | Yes | Path or URL to the audio file. | -| `type` | `string` | No | MIME type of the audio file (e.g. `audio/mpeg`, `audio/ogg`). 
| -| `description` | `string` | No | A description of this audio (e.g. "British pronunciation"). | +| Attribute | Type | Required | +|-----------|------|----------| +| `src` | `string` | Yes | +| `type` | `string` | No | +| `description` | `string` | No | --- ## Parts of speech -The `pos` attribute on `` accepts the following codes. You can also pass any custom string, which will be treated as a custom part of speech. +The `pos` attribute on `` accepts the following values. You can also pass any custom string, which will be treated as a custom part of speech. + +### Universal | Code | Label | |------|-------| -| `n` | noun | -| `v` | verb | -| `adj` | adjective | -| `adv` | adverb | -| `pron` | pronoun | -| `prep` | preposition | -| `conj` | conjunction | -| `intj` | interjection | -| `det` | determiner | -| `part` | particle | -| `num` | numeric | +| `art` | article | | `abv` | abbreviation | | `adf` | adfix | +| `adj` | adjective | +| `phr_adj` | adjective phrase | +| `adv` | adverb | +| `phr_adv` | adverbial phrase | | `aff` | affix | -| `art` | article | | `aux` | auxiliary | | `aux_adj` | auxiliary adjective | | `aux_v` | auxiliary verb | | `chr` | character | | `cf` | circumfix | | `cls` | classifier | +| `conj` | conjunction | | `conj_c` | coordinating conjunction | -| `conj_s` | subordinating conjunction | | `contr` | contraction | | `cop` | copula | | `ctr` | counter | +| `det` | determiner | | `expr` | expression | | `inf` | infix | | `intf` | interfix | +| `intj` | interjection | +| `vi` | intransitive verb | | `name` | name | +| `n` | noun | +| `num` | numeric | +| `part` | particle | | `phr` | phrase | -| `phr_adj` | adjective phrase | -| `phr_adv` | adverbial phrase | -| `phr_prep` | prepositional phrase | | `postp` | postposition | | `pref` | prefix | +| `prep` | preposition | +| `phr_prep` | prepositional phrase | +| `pron` | pronoun | | `propn` | proper noun | | `prov` | proverb | | `punc` | punctuation | +| `conj_s` | subordinating conjunction | | 
`suff` | suffix | | `sym` | symbol | -| `vi` | intransitive verb | | `vt` | transitive verb | | `un` | unknown | +| `v` | verb | + +### Japanese-specific + +| Code | Label | +|------|-------| +| `adj_pn` | pre-noun adjectival (rentaishi) | +| `adj_kari` | 'kari' adjective (archaic) | +| `adj_ku` | 'ku' adjective (archaic) | +| `adj_nari` | archaic/formal form of na-adjective | +| `adj_na` | adjectival nouns or quasi-adjectives (keiyodoshi) | +| `adj_shiku` | 'shiku' adjective (archaic) | +| `adj_t` | 'taru' adjective | +| `adj_ix` | adjective (keiyoushi) - yoi/ii class | +| `n_adv` | adverbial noun (fukushitekimeishi) | +| `adv_to` | adverb taking the 'to' particle | +| `adj_no` | nouns which may take the genitive case particle 'no' | +| `n_pref` | noun, used as a prefix | +| `n_suf` | noun, used as a suffix | +| `nt` | noun (temporal) (jisoumeishi) | +| `adj_f` | noun or verb acting prenominally | +| `v5b` | Godan verb with 'bu' ending | +| `v5g` | Godan verb with 'gu' ending | +| `v5k` | Godan verb with 'ku' ending | +| `v5m` | Godan verb with 'mu' ending | +| `v5n` | Godan verb with 'nu' ending | +| `v5r` | Godan verb with 'ru' ending | +| `v5r_i` | Godan verb with 'ru' ending (irregular verb) | +| `v5aru` | Godan verb - -aru special class | +| `v5k_s` | Godan verb - Iku/Yuku special class | +| `v5s` | Godan verb with 'su' ending | +| `v5t` | Godan verb with 'tsu' ending | +| `v5u` | Godan verb with 'u' ending | +| `v5uru` | Godan verb - Uru old class verb (old form of Eru) | +| `v5u_s` | Godan verb with 'u' ending (special class) | +| `v1` | Ichidan verb | +| `v1s` | Ichidan verb - kureru special class | +| `vz` | Ichidan verb - zuru verb (alternative form of -jiru verbs) | +| `vk` | Kuru verb - special class | +| `v2b_s` | Nidan verb (lower class) with 'bu' ending (archaic) | +| `v2b_k` | Nidan verb (upper class) with 'bu' ending (archaic) | +| `v2d_s` | Nidan verb (lower class) with 'dzu' ending (archaic) | +| `v2d_k` | Nidan verb (upper class) with 'dzu' 
ending (archaic) | +| `v2g_s` | Nidan verb (lower class) with 'gu' ending (archaic) | +| `v2g_k` | Nidan verb (upper class) with 'gu' ending (archaic) | +| `v2h_s` | Nidan verb (lower class) with 'hu/fu' ending (archaic) | +| `v2h_k` | Nidan verb (upper class) with 'hu/fu' ending (archaic) | +| `v2k_s` | Nidan verb (lower class) with 'ku' ending (archaic) | +| `v2k_k` | Nidan verb (upper class) with 'ku' ending (archaic) | +| `v2m_s` | Nidan verb (lower class) with 'mu' ending (archaic) | +| `v2m_k` | Nidan verb (upper class) with 'mu' ending (archaic) | +| `v2n_s` | Nidan verb (lower class) with 'nu' ending (archaic) | +| `v2r_s` | Nidan verb (lower class) with 'ru' ending (archaic) | +| `v2r_k` | Nidan verb (upper class) with 'ru' ending (archaic) | +| `v2s_s` | Nidan verb (lower class) with 'su' ending (archaic) | +| `v2t_s` | Nidan verb (lower class) with 'tsu' ending (archaic) | +| `v2t_k` | Nidan verb (upper class) with 'tsu' ending (archaic) | +| `v2a_s` | Nidan verb with 'u' ending (archaic) | +| `v2w_s` | Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic) | +| `v2y_s` | Nidan verb (lower class) with 'yu' ending (archaic) | +| `v2y_k` | Nidan verb (upper class) with 'yu' ending (archaic) | +| `v2z_s` | Nidan verb (lower class) with 'zu' ending (archaic) | +| `vn` | irregular nu verb | +| `vr` | irregular ru verb, plain form ends with -ri | +| `vs_c` | su verb - precursor to the modern suru | +| `vs` | noun or participle which takes the aux. 
verb suru | +| `vs_i` | suru verb - included | +| `vs_s` | suru verb - special class | +| `v_unspec` | verb unspecified | +| `v4b` | Yodan verb with 'bu' ending (archaic) | +| `v4g` | Yodan verb with 'gu' ending (archaic) | +| `v4h` | Yodan verb with 'hu/fu' ending (archaic) | +| `v4k` | Yodan verb with 'ku' ending (archaic) | +| `v4m` | Yodan verb with 'mu' ending (archaic) | +| `v4n` | Yodan verb with 'nu' ending (archaic) | +| `v4r` | Yodan verb with 'ru' ending (archaic) | +| `v4s` | Yodan verb with 'su' ending (archaic) | +| `v4t` | Yodan verb with 'tsu' ending (archaic) | -:::note -ODict also supports an extensive set of Japanese-specific parts of speech (Godan verbs, Ichidan verbs, Nidan verbs, etc.). These use codes like `v5b`, `v1`, `vk`, `adj_na`, etc. Refer to the [source code](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs) for the complete list. -::: From 3d5b15e8f8ad6243c8c0983230eaa891d7ecd274 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 23 Feb 2026 12:11:45 +0000 Subject: [PATCH 5/6] Add doc comments to Python bindings and rustdoc-to-md extraction script Adds /// doc comments to all Python binding source files (structs, fields, methods) with descriptions derived from the existing manual documentation. Includes a Python script (scripts/rustdoc-to-md.py) that extracts these doc comments and generates Markdown documentation, with proper Rust-to-Python type mapping (Option -> T | None, Vec -> list[T], etc.). This enables a single-source-of-truth workflow where API documentation lives in the Rust source and can be extracted to Markdown for the docs site. The script works without compilation (no nightly toolchain required). 
For environments with nightly Rust, the rustdoc-md tool can also be used: RUSTC_BOOTSTRAP=1 cargo rustdoc -p odict_python -- -Z unstable-options --output-format json rustdoc-md target/doc/theopendictionary.json -o docs/python-api.md https://claude.ai/code/session_0152q1rpTnXqZGQ5B85AhjXs --- docs/python-api-generated.md | 450 ++++++++++++++++++++++++++++++ python/src/dictionary.rs | 55 ++++ python/src/types/definition.rs | 7 + python/src/types/entry.rs | 9 + python/src/types/enums.rs | 7 + python/src/types/etymology.rs | 8 + python/src/types/example.rs | 6 + python/src/types/form.rs | 7 + python/src/types/group.rs | 7 + python/src/types/index.rs | 4 + python/src/types/load.rs | 7 + python/src/types/lookup.rs | 10 + python/src/types/media_url.rs | 4 + python/src/types/note.rs | 7 + python/src/types/pronunciation.rs | 7 + python/src/types/save.rs | 7 +- python/src/types/search.rs | 5 + python/src/types/sense.rs | 11 + python/src/types/token.rs | 11 + python/src/types/tokenize.rs | 3 + python/src/types/translation.rs | 3 + scripts/rustdoc-to-md.py | 378 +++++++++++++++++++++++++ 22 files changed, 1012 insertions(+), 1 deletion(-) create mode 100644 docs/python-api-generated.md create mode 100755 scripts/rustdoc-to-md.py diff --git a/docs/python-api-generated.md b/docs/python-api-generated.md new file mode 100644 index 000000000..93d0c99cf --- /dev/null +++ b/docs/python-api-generated.md @@ -0,0 +1,450 @@ +# Python API + +*Auto-generated from Rust doc comments.* + +--- + +## Functions + +### `compile()` + +Compiles an ODXML string into binary `.odict` data. + +Takes an XML string conforming to the ODict XML schema and returns +the compiled binary representation as a byte vector. The resulting +bytes can be passed to [`OpenDictionary::new`] or saved to disk. + +# Errors + +Returns an error if the XML is malformed or does not conform to the +ODict schema. + +## `OpenDictionary` + +The main class for working with compiled ODict dictionaries. 
+ +An `OpenDictionary` wraps a compiled binary dictionary and provides +methods for looking up terms, full-text search, tokenization, and more. + +# Construction + +Create from compiled bytes or an XML string using [`OpenDictionary::new`], +or load from a file path or remote registry using [`OpenDictionary::load`]. + +### Methods + +#### `load()` + +Loads a dictionary from a file path, alias, or remote identifier. + +This is an async method. If `dictionary` is a path to a `.odict` file, +it loads from disk. If it matches the format `org/lang` (e.g. `wiktionary/eng`), +it downloads from the remote registry. + +#### `new()` + +Creates a dictionary from compiled binary data or directly from an XML string. + +Accepts either `bytes` (as returned by [`compile`]) or a `str` containing +ODXML markup. + +#### `save()` + +Saves the dictionary to disk as a `.odict` file. + +Optionally configure Brotli compression via `quality` (0–11) and +`window_size` (0–22). + +#### `min_rank()` + +The minimum rank value across all entries, or `None` if no entries have ranks. + +#### `max_rank()` + +The maximum rank value across all entries, or `None` if no entries have ranks. + +#### `lookup()` + +Looks up one or more terms by exact match. + +- `query` — a single term or list of terms to look up. +- `split` — minimum word length for compound splitting. +- `follow` — follow `see_also` cross-references until an entry with etymologies is found. +- `insensitive` — enable case-insensitive matching. + +#### `lexicon()` + +Returns all terms defined in the dictionary, sorted alphabetically. + +#### `index()` + +Creates a full-text search index for the dictionary. + +Must be called before [`OpenDictionary::search`]. + +#### `search()` + +Runs a full-text search across the dictionary. + +Requires an index — call [`OpenDictionary::index`] first. + +#### `tokenize()` + +Tokenizes text using NLP-based segmentation and matches each token against the dictionary. 
+ +Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, +and Latin-script languages. + +- `text` — the text to tokenize. +- `follow` — follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). +- `insensitive` — enable case-insensitive matching. + +--- + +## `Definition` + +A single definition of a word sense. + +Contains the definition text along with optional examples and notes. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this definition. | +| `value` | `str` | The definition text. | +| `examples` | `list[Example]` | Usage examples illustrating this definition. | +| `notes` | `list[Note]` | Additional notes about this definition. | + +--- + +## `Entry` + +A dictionary entry representing a single headword and its associated data. + +Each entry contains the term itself, optional ranking metadata, +cross-reference information, etymologies, and media attachments. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `term` | `str` | The headword for this entry. | +| `rank` | `int | None` | Optional frequency rank for ordering entries. | +| `see_also` | `str | None` | Cross-reference target term, if this entry redirects to another. | +| `etymologies` | `list[Etymology]` | The etymologies associated with this entry. | +| `media` | `list[MediaURL]` | Media URLs (audio, images, etc.) associated with this entry. | + +--- + +## `EnumWrapper` + +A wrapper for ODict enumeration values (e.g. part of speech, pronunciation kind). + +ODict enums are represented as string triples: the enum name, +the variant name, and the variant's string value. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `name` | `str` | The enum type name (e.g. `"PartOfSpeech"`). | +| `variant` | `str` | The variant name (e.g. `"Noun"`). 
| +| `value` | `str` | The string value of the variant (e.g. `"n"`). | + +--- + +## `Etymology` + +An etymology grouping for a dictionary entry. + +Etymologies group together senses that share a common word origin. +Each etymology can have its own pronunciations and description. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this etymology. | +| `pronunciations` | `list[Pronunciation]` | Pronunciations associated with this etymology. | +| `description` | `str | None` | Optional description of the word origin. | +| `senses` | `list[Sense]` | The senses (meanings) under this etymology. | + +--- + +## `Example` + +A usage example illustrating a definition. + +Examples can optionally include translations and pronunciations. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `value` | `str` | The example text. | +| `translations` | `list[Translation]` | Translations of this example into other languages. | +| `pronunciations` | `list[Pronunciation]` | Pronunciations for this example. | + +--- + +## `Form` + +An inflected or alternate form of a word. + +Forms represent morphological variants such as plurals, conjugations, +or other inflections. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `term` | `str` | The inflected form text. | +| `kind` | `EnumWrapper | None` | The kind of form (e.g. plural, past tense), or `None`. | +| `tags` | `list[str]` | Tags for categorizing this form. | + +--- + +## `Group` + +A named group of related definitions. + +Groups allow organizing multiple definitions under a shared description, +such as grouping definitions by semantic domain. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this group. 
| +| `description` | `str` | A description of what this group of definitions has in common. | +| `definitions` | `list[Definition]` | The definitions within this group. | + +--- + +## `IndexOptions` + +Options for configuring full-text index creation. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `directory` | `str | None` | Custom directory for storing the index. | +| `memory` | `int | None` | Memory arena size per thread in bytes (must be >15 MB). | +| `overwrite` | `bool | None` | Whether to overwrite an existing index. | + +--- + +## `RemoteLoadOptions` + +Options for loading dictionaries from remote registries. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `out_dir` | `str | None` | Custom output directory for downloaded files. | +| `caching` | `bool | None` | Whether to cache downloaded dictionaries locally. | +| `retries` | `int | None` | Number of download retries on failure. | + +--- + +## `LoadOptions` + +Options for loading a dictionary from a file path, alias, or remote registry. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `config_dir` | `str | None` | Custom configuration directory. | +| `remote` | `RemoteLoadOptions | None` | Options for remote dictionary loading. | + +--- + +## `LookupOptions` + +Options for configuring term lookups. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `split` | `int | None` | Minimum word length for compound splitting. | +| `follow` | `bool | None` | Whether to follow `see_also` cross-references. | +| `insensitive` | `bool | None` | Whether to enable case-insensitive matching. | + +--- + +## `LookupResult` + +The result of a dictionary lookup. + +Contains the matched entry and, if a `see_also` redirect was followed, +the original entry that initiated the redirect. 
+ +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `entry` | `Entry` | The matched dictionary entry. | +| `directed_from` | `Entry | None` | The original entry if a `see_also` redirect was followed, or `None`. | + +--- + +## `MediaURL` + +A reference to an external media resource (audio, image, etc.). + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `src` | `str` | URL or path to the media file. | +| `mime_type` | `str | None` | MIME type (e.g. `audio/mpeg`), or `None`. | +| `description` | `str | None` | Human-readable description of the media. | + +--- + +## `Note` + +An additional note attached to a definition. + +Notes provide supplementary information such as usage guidance, +historical context, or grammatical remarks. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this note. | +| `value` | `str` | The note text. | +| `examples` | `list[Example]` | Examples associated with this note. | + +--- + +## `Pronunciation` + +A pronunciation entry for a word or etymology. + +Represents how a word is pronounced in a given notation system +(e.g. IPA, Pinyin), with optional audio media. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `kind` | `EnumWrapper | None` | The pronunciation system (e.g. IPA, Pinyin), or `None`. | +| `value` | `str` | The pronunciation notation string. | +| `media` | `list[MediaURL]` | Audio media URLs for this pronunciation. | + +--- + +## `CompressOptions` + +Brotli compression options for saving dictionaries. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `quality` | `int | None` | Compression quality level (0–11). | +| `window_size` | `int | None` | Compression window size (0–22). | + +--- + +## `SaveOptions` + +Options for saving a dictionary to disk. 
+ +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `compress` | `CompressOptions | None` | Optional Brotli compression settings. | + +--- + +## `SearchOptions` + +Options for configuring full-text search. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `directory` | `str | None` | Custom directory for the search index. | +| `threshold` | `int | None` | Relevance score threshold for filtering results. | +| `autoindex` | `bool | None` | Whether to automatically create an index if one does not exist. | +| `limit` | `int | None` | Maximum number of results to return. | + +--- + +## `Sense` + +A word sense — a specific meaning grouped by part of speech. + +Senses represent distinct meanings of a word under a given etymology. +Each sense has a part of speech and contains definitions (or definition groups), +along with optional tags, translations, and inflected forms. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `pos` | `EnumWrapper` | The part of speech for this sense (e.g. noun, verb, adjective). | +| `lemma` | `str | None` | Optional lemma reference linking to another entry. | +| `definitions` | `list[Definition | Group]` | Definitions or definition groups under this sense. | +| `tags` | `list[str]` | Tags for categorizing or filtering this sense. | +| `translations` | `list[Translation]` | Translations of this sense into other languages. | +| `forms` | `list[Form]` | Inflected forms of the word under this sense. | + +--- + +## `Token` + +A token produced by NLP-based text segmentation. + +Each token represents a segment of the input text, with metadata about +its position, detected language and script, and any matching dictionary entries. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `lemma` | `str` | The original token text (lemma form). 
| +| `language` | `str | None` | Detected language code (e.g. `"eng"`), or `None` if unknown. | +| `entries` | `list[LookupResult]` | Matched dictionary entries for this token. | +| `kind` | `str` | The token kind (e.g. `"Word"`, `"Punctuation"`). | +| `script` | `str` | Detected script name (e.g. `"Latin"`, `"Han"`). | +| `start` | `int` | Start byte offset in the original text. | +| `end` | `int` | End byte offset in the original text. | + +--- + +## `TokenizeOptions` + +Options for configuring text tokenization. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `follow` | `bool | int | None` | Whether to follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). | +| `insensitive` | `bool | None` | Whether to enable case-insensitive matching. | + +--- + +## `Translation` + +A translation of a word, definition, or example into another language. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `lang` | `str` | The BCP-47 language code (e.g. `"fra"`, `"deu"`). | +| `value` | `str` | The translated text. | + +--- diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 993a331f0..219e6285d 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -9,6 +9,16 @@ use crate::{ utils::cast_error, }; +/// Compiles an ODXML string into binary `.odict` data. +/// +/// Takes an XML string conforming to the ODict XML schema and returns +/// the compiled binary representation as a byte vector. The resulting +/// bytes can be passed to [`OpenDictionary::new`] or saved to disk. +/// +/// # Errors +/// +/// Returns an error if the XML is malformed or does not conform to the +/// ODict schema. #[pyfunction] pub fn compile(xml: String) -> PyResult> { let bytes = xml @@ -19,6 +29,15 @@ pub fn compile(xml: String) -> PyResult> { Ok(bytes) } +/// The main class for working with compiled ODict dictionaries. 
+/// +/// An `OpenDictionary` wraps a compiled binary dictionary and provides +/// methods for looking up terms, full-text search, tokenization, and more. +/// +/// # Construction +/// +/// Create from compiled bytes or an XML string using [`OpenDictionary::new`], +/// or load from a file path or remote registry using [`OpenDictionary::load`]. #[pyclass] pub struct OpenDictionary { dict: odict::OpenDictionary, @@ -26,6 +45,11 @@ pub struct OpenDictionary { #[pymethods] impl OpenDictionary { + /// Loads a dictionary from a file path, alias, or remote identifier. + /// + /// This is an async method. If `dictionary` is a path to a `.odict` file, + /// it loads from disk. If it matches the format `org/lang` (e.g. `wiktionary/eng`), + /// it downloads from the remote registry. #[staticmethod] #[pyo3(signature = (dictionary, options=None))] pub fn load<'py>( @@ -50,6 +74,10 @@ impl OpenDictionary { }) } + /// Creates a dictionary from compiled binary data or directly from an XML string. + /// + /// Accepts either `bytes` (as returned by [`compile`]) or a `str` containing + /// ODXML markup. #[new] pub fn new(data: Either, String>) -> PyResult { let bytes = match data { @@ -60,6 +88,10 @@ impl OpenDictionary { Ok(Self { dict }) } + /// Saves the dictionary to disk as a `.odict` file. + /// + /// Optionally configure Brotli compression via `quality` (0–11) and + /// `window_size` (0–22). #[pyo3(signature = (path, quality=None, window_size=None))] pub fn save( &mut self, @@ -89,16 +121,24 @@ impl OpenDictionary { } } + /// The minimum rank value across all entries, or `None` if no entries have ranks. #[getter] pub fn min_rank(&self) -> PyResult> { Ok(self.dict.contents().map_err(cast_error)?.min_rank()) } + /// The maximum rank value across all entries, or `None` if no entries have ranks. #[getter] pub fn max_rank(&self) -> PyResult> { Ok(self.dict.contents().map_err(cast_error)?.max_rank()) } + /// Looks up one or more terms by exact match. 
+ /// + /// - `query` — a single term or list of terms to look up. + /// - `split` — minimum word length for compound splitting. + /// - `follow` — follow `see_also` cross-references until an entry with etymologies is found. + /// - `insensitive` — enable case-insensitive matching. #[pyo3(signature = (query, split=None, follow=None, insensitive=None))] pub fn lookup( &self, @@ -135,6 +175,7 @@ impl OpenDictionary { Ok(mapped) } + /// Returns all terms defined in the dictionary, sorted alphabetically. pub fn lexicon(&self) -> PyResult> { let dict = self.dict.contents().map_err(cast_error)?; let lexicon = dict.lexicon(); @@ -142,6 +183,9 @@ impl OpenDictionary { Ok(lexicon) } + /// Creates a full-text search index for the dictionary. + /// + /// Must be called before [`OpenDictionary::search`]. #[pyo3(signature = (options=None))] pub fn index(&self, options: Option) -> PyResult<()> { let dict = self.dict.contents().map_err(cast_error)?; @@ -153,6 +197,9 @@ impl OpenDictionary { Ok(()) } + /// Runs a full-text search across the dictionary. + /// + /// Requires an index — call [`OpenDictionary::index`] first. #[pyo3(signature = (query, options=None))] pub fn search(&self, query: String, options: Option) -> PyResult> { let dict = self.dict.contents().map_err(cast_error)?; @@ -170,6 +217,14 @@ impl OpenDictionary { Ok(entries) } + /// Tokenizes text using NLP-based segmentation and matches each token against the dictionary. + /// + /// Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, + /// and Latin-script languages. + /// + /// - `text` — the text to tokenize. + /// - `follow` — follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). + /// - `insensitive` — enable case-insensitive matching. 
#[pyo3(signature = (text, follow=None, insensitive=None))] pub fn tokenize( &self, diff --git a/python/src/types/definition.rs b/python/src/types/definition.rs index 5094c2187..e3c2495fa 100644 --- a/python/src/types/definition.rs +++ b/python/src/types/definition.rs @@ -3,16 +3,23 @@ use structural_convert::StructuralConvert; use super::{note::Note, Example}; +/// A single definition of a word sense. +/// +/// Contains the definition text along with optional examples and notes. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Definition))] pub struct Definition { + /// Optional identifier for this definition. #[pyo3(get)] pub id: Option, + /// The definition text. #[pyo3(get)] pub value: String, + /// Usage examples illustrating this definition. #[pyo3(get)] pub examples: Vec, + /// Additional notes about this definition. #[pyo3(get)] pub notes: Vec, } diff --git a/python/src/types/entry.rs b/python/src/types/entry.rs index 23b06085c..78dbf4c65 100644 --- a/python/src/types/entry.rs +++ b/python/src/types/entry.rs @@ -6,18 +6,27 @@ use crate::utils::cast_error; use super::etymology::Etymology; use super::media_url::MediaURL; +/// A dictionary entry representing a single headword and its associated data. +/// +/// Each entry contains the term itself, optional ranking metadata, +/// cross-reference information, etymologies, and media attachments. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Entry))] pub struct Entry { + /// The headword for this entry. #[pyo3(get)] pub term: String, + /// Optional frequency rank for ordering entries. #[pyo3(get)] pub rank: Option, + /// Cross-reference target term, if this entry redirects to another. #[pyo3(get)] pub see_also: Option, + /// The etymologies associated with this entry. #[pyo3(get)] pub etymologies: Vec, + /// Media URLs (audio, images, etc.) associated with this entry. 
#[pyo3(get)] pub media: Vec, } diff --git a/python/src/types/enums.rs b/python/src/types/enums.rs index 913cddf33..6b365cf29 100644 --- a/python/src/types/enums.rs +++ b/python/src/types/enums.rs @@ -1,16 +1,23 @@ use pyo3::prelude::*; use structural_convert::StructuralConvert; +/// A wrapper for ODict enumeration values (e.g. part of speech, pronunciation kind). +/// +/// ODict enums are represented as string triples: the enum name, +/// the variant name, and the variant's string value. #[pyclass] #[derive(Debug, PartialEq, Clone, StructuralConvert)] #[convert(from(internal::EnumWrapper))] pub struct EnumWrapper { + /// The enum type name (e.g. `"PartOfSpeech"`). #[pyo3(get)] pub name: String, + /// The variant name (e.g. `"Noun"`). #[pyo3(get)] pub variant: String, + /// The string value of the variant (e.g. `"n"`). #[pyo3(get)] pub value: String, } diff --git a/python/src/types/etymology.rs b/python/src/types/etymology.rs index bf06b166a..f3a1b5401 100644 --- a/python/src/types/etymology.rs +++ b/python/src/types/etymology.rs @@ -5,15 +5,23 @@ use pyo3::prelude::*; use super::pronunciation::Pronunciation; use super::sense::Sense; +/// An etymology grouping for a dictionary entry. +/// +/// Etymologies group together senses that share a common word origin. +/// Each etymology can have its own pronunciations and description. #[pyclass] #[derive(Clone)] pub struct Etymology { + /// Optional identifier for this etymology. #[pyo3(get)] pub id: Option, + /// Pronunciations associated with this etymology. #[pyo3(get)] pub pronunciations: Vec, + /// Optional description of the word origin. #[pyo3(get)] pub description: Option, + /// The senses (meanings) under this etymology. 
#[pyo3(get)] pub senses: Vec, } diff --git a/python/src/types/example.rs b/python/src/types/example.rs index 10047079a..a2615f4f9 100644 --- a/python/src/types/example.rs +++ b/python/src/types/example.rs @@ -3,16 +3,22 @@ use crate::types::{Pronunciation, Translation}; use pyo3::prelude::*; use structural_convert::StructuralConvert; +/// A usage example illustrating a definition. +/// +/// Examples can optionally include translations and pronunciations. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Example))] pub struct Example { + /// The example text. #[pyo3(get)] pub value: String, + /// Translations of this example into other languages. #[pyo3(get)] pub translations: Vec, + /// Pronunciations for this example. #[pyo3(get)] pub pronunciations: Vec, } diff --git a/python/src/types/form.rs b/python/src/types/form.rs index 198527c8e..5e1862ca5 100644 --- a/python/src/types/form.rs +++ b/python/src/types/form.rs @@ -3,15 +3,22 @@ use pyo3::prelude::*; use super::enums::EnumWrapper; +/// An inflected or alternate form of a word. +/// +/// Forms represent morphological variants such as plurals, conjugations, +/// or other inflections. #[pyclass] #[derive(Clone, Debug)] pub struct Form { + /// The inflected form text. #[pyo3(get)] pub term: String, + /// The kind of form (e.g. plural, past tense), or `None`. #[pyo3(get, set)] pub kind: Option, + /// Tags for categorizing this form. #[pyo3(get)] pub tags: Vec, } diff --git a/python/src/types/group.rs b/python/src/types/group.rs index 60c9b2658..7f9c951b0 100644 --- a/python/src/types/group.rs +++ b/python/src/types/group.rs @@ -3,14 +3,21 @@ use structural_convert::StructuralConvert; use super::definition::Definition; +/// A named group of related definitions. +/// +/// Groups allow organizing multiple definitions under a shared description, +/// such as grouping definitions by semantic domain. 
#[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Group))] pub struct Group { + /// Optional identifier for this group. #[pyo3(get)] pub id: Option, + /// A description of what this group of definitions has in common. #[pyo3(get)] pub description: String, + /// The definitions within this group. #[pyo3(get)] pub definitions: Vec, } diff --git a/python/src/types/index.rs b/python/src/types/index.rs index 535bdb0b8..ab21daef5 100644 --- a/python/src/types/index.rs +++ b/python/src/types/index.rs @@ -1,14 +1,18 @@ use pyo3::prelude::*; +/// Options for configuring full-text index creation. #[pyclass] #[derive(Clone)] pub struct IndexOptions { + /// Custom directory for storing the index. #[pyo3(get, set)] pub directory: Option, + /// Memory arena size per thread in bytes (must be >15 MB). #[pyo3(get, set)] pub memory: Option, + /// Whether to overwrite an existing index. #[pyo3(get, set)] pub overwrite: Option, } diff --git a/python/src/types/load.rs b/python/src/types/load.rs index 51ee8d984..b2f0014c8 100644 --- a/python/src/types/load.rs +++ b/python/src/types/load.rs @@ -1,12 +1,16 @@ use pyo3::prelude::*; +/// Options for loading dictionaries from remote registries. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct RemoteLoadOptions { + /// Custom output directory for downloaded files. #[pyo3(get, set)] pub out_dir: Option, + /// Whether to cache downloaded dictionaries locally. #[pyo3(get, set)] pub caching: Option, + /// Number of download retries on failure. #[pyo3(get, set)] pub retries: Option, } @@ -20,11 +24,14 @@ impl RemoteLoadOptions { } } +/// Options for loading a dictionary from a file path, alias, or remote registry. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct LoadOptions { + /// Custom configuration directory. #[pyo3(get, set)] pub config_dir: Option, + /// Options for remote dictionary loading. 
#[pyo3(get, set)] pub remote: Option, } diff --git a/python/src/types/lookup.rs b/python/src/types/lookup.rs index 55c5b1ffa..4888abca8 100644 --- a/python/src/types/lookup.rs +++ b/python/src/types/lookup.rs @@ -2,15 +2,19 @@ use pyo3::prelude::*; use super::Entry; +/// Options for configuring term lookups. #[pyclass] #[derive(Clone)] pub struct LookupOptions { + /// Minimum word length for compound splitting. #[pyo3(get, set)] pub split: Option, + /// Whether to follow `see_also` cross-references. #[pyo3(get, set)] pub follow: Option, + /// Whether to enable case-insensitive matching. #[pyo3(get, set)] pub insensitive: Option, } @@ -58,11 +62,17 @@ impl From for odict::lookup::LookupOptions { } } +/// The result of a dictionary lookup. +/// +/// Contains the matched entry and, if a `see_also` redirect was followed, +/// the original entry that initiated the redirect. #[pyclass] #[derive(Debug, Clone)] pub struct LookupResult { + /// The matched dictionary entry. #[pyo3(get)] pub entry: Entry, + /// The original entry if a `see_also` redirect was followed, or `None`. #[pyo3(get)] pub directed_from: Option, } diff --git a/python/src/types/media_url.rs b/python/src/types/media_url.rs index f77fbf861..296e00df4 100644 --- a/python/src/types/media_url.rs +++ b/python/src/types/media_url.rs @@ -2,16 +2,20 @@ use pyo3::prelude::*; use std::fmt; use structural_convert::StructuralConvert; +/// A reference to an external media resource (audio, image, etc.). #[pyclass] #[derive(Clone, Debug, StructuralConvert)] #[convert(from(odict::schema::MediaURL))] pub struct MediaURL { + /// URL or path to the media file. #[pyo3(get)] pub src: String, + /// MIME type (e.g. `audio/mpeg`), or `None`. #[pyo3(get)] pub mime_type: Option, + /// Human-readable description of the media. 
#[pyo3(get)] pub description: Option, } diff --git a/python/src/types/note.rs b/python/src/types/note.rs index 811d77062..824999237 100644 --- a/python/src/types/note.rs +++ b/python/src/types/note.rs @@ -3,14 +3,21 @@ use structural_convert::StructuralConvert; use super::Example; +/// An additional note attached to a definition. +/// +/// Notes provide supplementary information such as usage guidance, +/// historical context, or grammatical remarks. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Note))] pub struct Note { + /// Optional identifier for this note. #[pyo3(get)] pub id: Option, + /// The note text. #[pyo3(get)] pub value: String, + /// Examples associated with this note. #[pyo3(get)] pub examples: Vec, } diff --git a/python/src/types/pronunciation.rs b/python/src/types/pronunciation.rs index 8762971a7..f6100f040 100644 --- a/python/src/types/pronunciation.rs +++ b/python/src/types/pronunciation.rs @@ -5,15 +5,22 @@ use super::media_url::MediaURL; use internal::ToEnumWrapper; +/// A pronunciation entry for a word or etymology. +/// +/// Represents how a word is pronounced in a given notation system +/// (e.g. IPA, Pinyin), with optional audio media. #[pyclass] #[derive(Clone, Debug)] pub struct Pronunciation { + /// The pronunciation system (e.g. IPA, Pinyin), or `None`. #[pyo3(get)] pub kind: Option, + /// The pronunciation notation string. #[pyo3(get)] pub value: String, + /// Audio media URLs for this pronunciation. #[pyo3(get)] pub media: Vec, } diff --git a/python/src/types/save.rs b/python/src/types/save.rs index 28a67eb3c..424f4e1f0 100644 --- a/python/src/types/save.rs +++ b/python/src/types/save.rs @@ -1,11 +1,14 @@ use pyo3::prelude::*; +/// Brotli compression options for saving dictionaries. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct CompressOptions { + /// Compression quality level (0–11). #[pyo3(get, set)] pub quality: Option, - + + /// Compression window size (0–22). 
#[pyo3(get, set)] pub window_size: Option, } @@ -22,9 +25,11 @@ impl CompressOptions { } } +/// Options for saving a dictionary to disk. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct SaveOptions { + /// Optional Brotli compression settings. #[pyo3(get, set)] pub compress: Option, } diff --git a/python/src/types/search.rs b/python/src/types/search.rs index 6e24b8977..b893f5c6e 100644 --- a/python/src/types/search.rs +++ b/python/src/types/search.rs @@ -1,17 +1,22 @@ use pyo3::prelude::*; +/// Options for configuring full-text search. #[pyclass] #[derive(Clone)] pub struct SearchOptions { + /// Custom directory for the search index. #[pyo3(get, set)] pub directory: Option, + /// Relevance score threshold for filtering results. #[pyo3(get, set)] pub threshold: Option, + /// Whether to automatically create an index if one does not exist. #[pyo3(get, set)] pub autoindex: Option, + /// Maximum number of results to return. #[pyo3(get, set)] pub limit: Option, } diff --git a/python/src/types/sense.rs b/python/src/types/sense.rs index 29b736dc7..d909f42f0 100644 --- a/python/src/types/sense.rs +++ b/python/src/types/sense.rs @@ -7,19 +7,30 @@ use super::{ definition::Definition, enums::EnumWrapper, form::Form, group::Group, translation::Translation, }; +/// A word sense — a specific meaning grouped by part of speech. +/// +/// Senses represent distinct meanings of a word under a given etymology. +/// Each sense has a part of speech and contains definitions (or definition groups), +/// along with optional tags, translations, and inflected forms. #[pyclass] #[derive(Debug, Clone)] pub struct Sense { + /// The part of speech for this sense (e.g. noun, verb, adjective). #[pyo3(get)] pub pos: EnumWrapper, + /// Optional lemma reference linking to another entry. #[pyo3(get)] pub lemma: Option, + /// Definitions or definition groups under this sense. #[pyo3(get)] pub definitions: Vec>, + /// Tags for categorizing or filtering this sense. 
#[pyo3(get)] pub tags: Vec, + /// Translations of this sense into other languages. #[pyo3(get)] pub translations: Vec, + /// Inflected forms of the word under this sense. #[pyo3(get)] pub forms: Vec
, } diff --git a/python/src/types/token.rs b/python/src/types/token.rs index ff0fe16f7..9da91ba28 100644 --- a/python/src/types/token.rs +++ b/python/src/types/token.rs @@ -3,21 +3,32 @@ use pyo3::prelude::*; use super::LookupResult; +/// A token produced by NLP-based text segmentation. +/// +/// Each token represents a segment of the input text, with metadata about +/// its position, detected language and script, and any matching dictionary entries. #[pyclass] #[derive(Debug)] pub struct Token { + /// The original token text (lemma form). #[pyo3(get)] pub lemma: String, + /// Detected language code (e.g. `"eng"`), or `None` if unknown. #[pyo3(get)] pub language: Option, + /// Matched dictionary entries for this token. #[pyo3(get)] pub entries: Vec, + /// The token kind (e.g. `"Word"`, `"Punctuation"`). #[pyo3(get)] pub kind: String, + /// Detected script name (e.g. `"Latin"`, `"Han"`). #[pyo3(get)] pub script: String, + /// Start byte offset in the original text. #[pyo3(get)] pub start: usize, + /// End byte offset in the original text. #[pyo3(get)] pub end: usize, } diff --git a/python/src/types/tokenize.rs b/python/src/types/tokenize.rs index fdcc2fe1d..43cf11af5 100644 --- a/python/src/types/tokenize.rs +++ b/python/src/types/tokenize.rs @@ -1,12 +1,15 @@ use either::Either; use pyo3::prelude::*; +/// Options for configuring text tokenization. #[pyclass] #[derive(Clone)] pub struct TokenizeOptions { + /// Whether to follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). #[pyo3(get, set)] pub follow: Option>, + /// Whether to enable case-insensitive matching. 
#[pyo3(get, set)] pub insensitive: Option, } diff --git a/python/src/types/translation.rs b/python/src/types/translation.rs index b0dcede25..37663d490 100644 --- a/python/src/types/translation.rs +++ b/python/src/types/translation.rs @@ -1,13 +1,16 @@ use pyo3::prelude::*; use structural_convert::StructuralConvert; +/// A translation of a word, definition, or example into another language. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Translation))] pub struct Translation { + /// The BCP-47 language code (e.g. `"fra"`, `"deu"`). #[pyo3(get)] pub lang: String, + /// The translated text. #[pyo3(get)] pub value: String, } diff --git a/scripts/rustdoc-to-md.py b/scripts/rustdoc-to-md.py new file mode 100755 index 000000000..7d29bb6c4 --- /dev/null +++ b/scripts/rustdoc-to-md.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +""" +Extracts Rust doc comments (///) from source files and generates Markdown documentation. + +This script parses Rust source files for structs, functions, and their fields/methods, +then formats the extracted documentation as Markdown suitable for inclusion in a +documentation site. 
#!/usr/bin/env python3
"""
Extracts Rust doc comments (///) from source files and generates Markdown documentation.

This script parses Rust source files for structs, functions, and their fields/methods,
then formats the extracted documentation as Markdown suitable for inclusion in a
documentation site.

Usage:
    python scripts/rustdoc-to-md.py python/src   # Generate docs for Python bindings
    python scripts/rustdoc-to-md.py node/src     # Generate docs for Node bindings

Alternatively, use cargo rustdoc with JSON output and the `rustdoc-md` tool:
    RUSTC_BOOTSTRAP=1 cargo rustdoc -p odict_python -- -Z unstable-options --output-format json
    rustdoc-md target/doc/theopendictionary.json -o docs/python-api.md
"""

import re
import sys
from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class DocItem:
    """A documented item extracted from Rust source."""

    name: str
    # One of: "struct", "function", "method", "getter", "static",
    # "constructor", "field".
    kind: str
    doc: str
    signature: str = ""
    fields: list = field(default_factory=list)
    methods: list = field(default_factory=list)


def extract_doc_comment(lines: list[str], end_idx: int) -> str:
    """Extract consecutive /// doc comments ending at or before the given line.

    Skips over attribute lines (#[...]) and blank lines to find the doc
    comment block. Returns "" when no doc comment precedes the item.
    """
    # First, skip backwards over attribute lines to reach the doc block.
    i = end_idx
    while i >= 0:
        stripped = lines[i].strip()
        if stripped.startswith("///"):
            break
        elif stripped.startswith("#[") or stripped == "":
            i -= 1
            continue
        else:
            # Hit real code before any doc comment: item is undocumented.
            return ""
        i -= 1

    # Now collect the contiguous run of /// lines (bottom-up, then reverse).
    doc_lines = []
    while i >= 0 and lines[i].strip().startswith("///"):
        comment = lines[i].strip().removeprefix("///")
        # Strip exactly one leading space so indented doc content survives.
        if comment.startswith(" "):
            comment = comment[1:]
        doc_lines.append(comment)
        i -= 1
    doc_lines.reverse()
    return "\n".join(doc_lines).strip()


def parse_rust_file(filepath: Path) -> list[DocItem]:
    """Parse a Rust file and extract documented items.

    Recognizes #[pyfunction] functions, #[pyclass]/#[napi(object)] structs
    (with their public fields), and #[pymethods] impl blocks (whose methods
    are attached to the matching struct item).
    """
    content = filepath.read_text()
    lines = content.splitlines()
    items = []

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Detect #[pyfunction]
        if line == "#[pyfunction]":
            doc = extract_doc_comment(lines, i - 1)
            # Find the function signature that follows the attribute.
            j = i + 1
            while j < len(lines) and not lines[j].strip().startswith("pub fn "):
                j += 1
            if j < len(lines):
                sig = extract_fn_signature(lines, j)
                name = re.search(r"pub fn (\w+)", lines[j])
                if name:
                    items.append(DocItem(
                        name=name.group(1),
                        kind="function",
                        doc=doc,
                        signature=sig,
                    ))

        # Detect #[pyclass...] or #[napi(...)]
        if line.startswith("#[pyclass") or line.startswith("#[napi"):
            # Only match struct-level napi attributes, not method-level ones.
            if line.startswith("#[napi") and "object" not in line and "constructor" not in line:
                i += 1
                continue

            doc = extract_doc_comment(lines, i - 1)
            # Find the struct declaration that follows.
            j = i + 1
            while j < len(lines) and not lines[j].strip().startswith("pub struct "):
                j += 1
            if j < len(lines):
                name_match = re.search(r"pub struct (\w+)", lines[j])
                if name_match:
                    struct_item = DocItem(
                        name=name_match.group(1),
                        kind="struct",
                        doc=doc,
                    )
                    # Extract public fields from the struct body, if any.
                    if lines[j].strip().endswith("{"):
                        k = j + 1
                        while k < len(lines) and not lines[k].strip().startswith("}"):
                            field_line = lines[k].strip()
                            if field_line.startswith("pub "):
                                # Look back for the doc comment, skipping
                                # #[pyo3(...)] and similar attribute lines.
                                field_doc = extract_doc_comment(lines, k - 1)
                                field_match = re.match(
                                    r"pub\s+(\w+):\s*(.+?),?\s*$", field_line
                                )
                                if field_match:
                                    struct_item.fields.append(DocItem(
                                        name=field_match.group(1),
                                        kind="field",
                                        doc=field_doc,
                                        signature=field_match.group(2),
                                    ))
                            k += 1
                    items.append(struct_item)

        # Detect #[pymethods] impl blocks
        if line == "#[pymethods]":
            j = i + 1
            while j < len(lines) and not lines[j].strip().startswith("impl "):
                j += 1
            if j < len(lines):
                impl_match = re.search(r"impl (\w+)", lines[j])
                if impl_match:
                    impl_name = impl_match.group(1)
                    # Find the matching struct already collected above.
                    target = None
                    for item in items:
                        if item.name == impl_name and item.kind == "struct":
                            target = item
                            break

                    # Parse methods inside the impl block, tracking braces so
                    # we stop at the block's closing brace.
                    brace_depth = 0
                    k = j
                    while k < len(lines):
                        if "{" in lines[k]:
                            brace_depth += lines[k].count("{")
                        if "}" in lines[k]:
                            brace_depth -= lines[k].count("}")
                        if brace_depth == 0 and k > j:
                            break

                        mline = lines[k].strip()

                        # Check for pub fn (but skip dunder methods).
                        if mline.startswith("pub fn ") and "__" not in mline:
                            # Doc comment sits above the attribute lines.
                            method_doc = extract_doc_comment(lines, k - 1)

                            # Classify via the attribute lines directly above
                            # this fn. Walk BACKWARDS and stop at the first
                            # line that is neither an attribute, a doc
                            # comment, nor blank — otherwise attributes of a
                            # previous method inside the window would leak
                            # onto this one.
                            is_getter = False
                            is_staticmethod = False
                            is_new = False
                            for back in range(k - 1, max(-1, k - 11), -1):
                                attr = lines[back].strip()
                                if attr == "#[getter]":
                                    is_getter = True
                                elif attr == "#[staticmethod]":
                                    is_staticmethod = True
                                elif attr == "#[new]":
                                    is_new = True
                                elif attr.startswith("#[") or attr.startswith("///") or attr == "":
                                    continue
                                else:
                                    break

                            sig = extract_fn_signature(lines, k)

                            name_match = re.search(r"pub fn (\w+)", mline)
                            # Only documented methods are emitted.
                            if name_match and method_doc:
                                if is_new:
                                    kind = "constructor"
                                elif is_getter:
                                    kind = "getter"
                                elif is_staticmethod:
                                    # "static" is what items_to_markdown
                                    # filters on for the static-method
                                    # section.
                                    kind = "static"
                                else:
                                    kind = "method"

                                method = DocItem(
                                    name=name_match.group(1),
                                    kind=kind,
                                    doc=method_doc,
                                    signature=sig,
                                )
                                if target:
                                    target.methods.append(method)
                        k += 1

        i += 1

    return items


def extract_fn_signature(lines: list[str], start: int) -> str:
    """Extract a (possibly multi-line) function signature starting at `start`."""
    sig_lines = []
    paren_depth = 0
    i = start
    while i < len(lines):
        line = lines[i]
        sig_lines.append(line.rstrip())
        paren_depth += line.count("(") - line.count(")")
        if paren_depth <= 0 and ")" in line:
            # Pick up a return type wrapped onto the next line.
            if i + 1 < len(lines) and lines[i + 1].strip().startswith("->"):
                sig_lines.append(lines[i + 1].rstrip())
            break
        i += 1

    sig = " ".join(l.strip() for l in sig_lines)
    # Clean up: keep just the `pub fn ...` part, dropping the body/where clause.
    match = re.search(r"(pub fn \w+.*?)(?:\s*\{|\s*where)", sig)
    if match:
        return match.group(1).strip()
    match = re.search(r"(pub fn \w+[^{]*)", sig)
    if match:
        return match.group(1).strip().rstrip("{").strip()
    return sig


def rust_type_to_display(ty: str) -> str:
    """Convert a Rust type to a Python-flavoured display format."""
    ty = ty.strip().rstrip(",")
    # Option<T> -> T | None
    m = re.match(r"Option<(.+)>$", ty)
    if m:
        inner = rust_type_to_display(m.group(1))
        return f"{inner} | None"
    # Vec<T> -> list[T]
    m = re.match(r"Vec<(.+)>$", ty)
    if m:
        inner = rust_type_to_display(m.group(1))
        return f"list[{inner}]"
    # Either<A, B> -> A | B
    m = re.match(r"Either<(.+),\s*(.+)>$", ty)
    if m:
        a = rust_type_to_display(m.group(1))
        b = rust_type_to_display(m.group(2))
        return f"{a} | {b}"
    # Basic scalar type mappings (Rust -> Python).
    mappings = {
        "String": "str",
        "&str": "str",
        "u32": "int",
        "u64": "int",
        "i32": "int",
        "i64": "int",
        "usize": "int",
        "bool": "bool",
        "f32": "float",
        "f64": "float",
    }
    return mappings.get(ty, ty)


def items_to_markdown(items: list[DocItem], title: str) -> str:
    """Convert extracted items to a single Markdown document."""
    md = []
    md.append(f"# {title}\n")
    md.append("*Auto-generated from Rust doc comments.*\n")
    md.append("---\n")

    # Separate free functions from structs.
    functions = [i for i in items if i.kind == "function"]
    structs = [i for i in items if i.kind == "struct"]

    if functions:
        md.append("## Functions\n")
        for func in functions:
            md.append(f"### `{func.name}()`\n")
            if func.doc:
                md.append(f"{func.doc}\n")

    for struct in structs:
        md.append(f"## `{struct.name}`\n")
        if struct.doc:
            md.append(f"{struct.doc}\n")

        # Constructors
        constructors = [m for m in struct.methods if m.kind == "constructor"]
        if constructors:
            md.append("### Constructors\n")
            for c in constructors:
                md.append(f"#### `{struct.name}()`\n")
                if c.doc:
                    md.append(f"{c.doc}\n")

        # Static methods
        statics = [m for m in struct.methods if m.kind == "static"]
        for s in statics:
            md.append(f"### `{struct.name}.{s.name}()`\n")
            if s.doc:
                md.append(f"{s.doc}\n")

        # Properties (fields + getters) rendered as one table.
        getters = [m for m in struct.methods if m.kind == "getter"]
        if struct.fields or getters:
            md.append("### Properties\n")
            md.append("| Property | Type | Description |")
            md.append("|----------|------|-------------|")
            for f in struct.fields:
                ty = rust_type_to_display(f.signature)
                doc = f.doc.replace("\n", " ") if f.doc else ""
                md.append(f"| `{f.name}` | `{ty}` | {doc} |")
            for g in getters:
                doc = g.doc.replace("\n", " ") if g.doc else ""
                md.append(f"| `{g.name}` | — | {doc} |")
            md.append("")

        # Regular methods (non-getter, non-constructor, non-static).
        methods = [m for m in struct.methods if m.kind == "method"]
        if methods:
            md.append("### Methods\n")
            for m in methods:
                md.append(f"#### `{m.name}()`\n")
                if m.doc:
                    md.append(f"{m.doc}\n")

        md.append("---\n")

    return "\n".join(md)


def main():
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <src_dir> [output_file]")
        print(f"Example: {sys.argv[0]} python/src docs/python-api-generated.md")
        sys.exit(1)

    src_dir = Path(sys.argv[1])
    output_file = sys.argv[2] if len(sys.argv) > 2 else None

    if not src_dir.exists():
        print(f"Error: directory {src_dir} does not exist")
        sys.exit(1)

    # Collect all .rs files recursively, in a stable order.
    rs_files = sorted(src_dir.rglob("*.rs"))
    all_items = []

    for rs_file in rs_files:
        items = parse_rust_file(rs_file)
        all_items.extend(items)

    if not all_items:
        print("No documented items found.")
        sys.exit(1)

    # Determine the page title from the directory name (python/src -> python).
    dir_name = src_dir.parent.name if src_dir.name == "src" else src_dir.name
    title_map = {"python": "Python API", "node": "JavaScript API"}
    title = title_map.get(dir_name, f"{dir_name} API")

    md = items_to_markdown(all_items, title)

    if output_file:
        Path(output_file).write_text(md)
        print(f"Generated {output_file} ({len(all_items)} items)")
    else:
        print(md)


if __name__ == "__main__":
    main()
https://claude.ai/code/session_0152q1rpTnXqZGQ5B85AhjXs --- docs/rustdoc-json-output.md | 275 ++++++++++++++++++++++++++++++++++++ node/Cargo.toml | 2 +- python/Cargo.toml | 2 +- 3 files changed, 277 insertions(+), 2 deletions(-) create mode 100644 docs/rustdoc-json-output.md diff --git a/docs/rustdoc-json-output.md b/docs/rustdoc-json-output.md new file mode 100644 index 000000000..f230d9487 --- /dev/null +++ b/docs/rustdoc-json-output.md @@ -0,0 +1,275 @@ +# ODict Python API (from rustdoc JSON) + +*Generated from rustdoc JSON format v57* + +## `CompressOptions` + +Brotli compression options for saving dictionaries. + +| Field | Type | Description | +|-------|------|-------------| +| `quality` | `?` | Compression quality level (0–11). | +| `window_size` | `?` | Compression window size (0–22). | + +## `Definition` + +A single definition of a word sense. + +Contains the definition text along with optional examples and notes. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this definition. | +| `value` | `?` | The definition text. | +| `examples` | `?` | Usage examples illustrating this definition. | +| `notes` | `?` | Additional notes about this definition. | + +## `Entry` + +A dictionary entry representing a single headword and its associated data. + +Each entry contains the term itself, optional ranking metadata, +cross-reference information, etymologies, and media attachments. + +| Field | Type | Description | +|-------|------|-------------| +| `term` | `?` | The headword for this entry. | +| `rank` | `?` | Optional frequency rank for ordering entries. | +| `see_also` | `?` | Cross-reference target term, if this entry redirects to another. | +| `etymologies` | `?` | The etymologies associated with this entry. | +| `media` | `?` | Media URLs (audio, images, etc.) associated with this entry. | + +## `EnumWrapper` + +A wrapper for ODict enumeration values (e.g. part of speech, pronunciation kind). 
+ +ODict enums are represented as string triples: the enum name, +the variant name, and the variant's string value. + +| Field | Type | Description | +|-------|------|-------------| +| `name` | `?` | The enum type name (e.g. `"PartOfSpeech"`). | +| `variant` | `?` | The variant name (e.g. `"Noun"`). | +| `value` | `?` | The string value of the variant (e.g. `"n"`). | + +## `Etymology` + +An etymology grouping for a dictionary entry. + +Etymologies group together senses that share a common word origin. +Each etymology can have its own pronunciations and description. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this etymology. | +| `pronunciations` | `?` | Pronunciations associated with this etymology. | +| `description` | `?` | Optional description of the word origin. | +| `senses` | `?` | The senses (meanings) under this etymology. | + +## `Example` + +A usage example illustrating a definition. + +Examples can optionally include translations and pronunciations. + +| Field | Type | Description | +|-------|------|-------------| +| `value` | `?` | The example text. | +| `translations` | `?` | Translations of this example into other languages. | +| `pronunciations` | `?` | Pronunciations for this example. | + +## `Form` + +An inflected or alternate form of a word. + +Forms represent morphological variants such as plurals, conjugations, +or other inflections. + +| Field | Type | Description | +|-------|------|-------------| +| `term` | `?` | The inflected form text. | +| `kind` | `?` | The kind of form (e.g. plural, past tense), or `None`. | +| `tags` | `?` | Tags for categorizing this form. | + +## `Group` + +A named group of related definitions. + +Groups allow organizing multiple definitions under a shared description, +such as grouping definitions by semantic domain. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this group. 
| +| `description` | `?` | A description of what this group of definitions has in common. | +| `definitions` | `?` | The definitions within this group. | + +## `IndexOptions` + +Options for configuring full-text index creation. + +| Field | Type | Description | +|-------|------|-------------| +| `directory` | `?` | Custom directory for storing the index. | +| `memory` | `?` | Memory arena size per thread in bytes (must be >15 MB). | +| `overwrite` | `?` | Whether to overwrite an existing index. | + +## `LoadOptions` + +Options for loading a dictionary from a file path, alias, or remote registry. + +| Field | Type | Description | +|-------|------|-------------| +| `config_dir` | `?` | Custom configuration directory. | +| `remote` | `?` | Options for remote dictionary loading. | + +## `LookupOptions` + +Options for configuring term lookups. + +| Field | Type | Description | +|-------|------|-------------| +| `split` | `?` | Minimum word length for compound splitting. | +| `follow` | `?` | Whether to follow `see_also` cross-references. | +| `insensitive` | `?` | Whether to enable case-insensitive matching. | + +## `LookupResult` + +The result of a dictionary lookup. + +Contains the matched entry and, if a `see_also` redirect was followed, +the original entry that initiated the redirect. + +| Field | Type | Description | +|-------|------|-------------| +| `entry` | `?` | The matched dictionary entry. | +| `directed_from` | `?` | The original entry if a `see_also` redirect was followed, or `None`. | + +## `MediaURL` + +A reference to an external media resource (audio, image, etc.). + +| Field | Type | Description | +|-------|------|-------------| +| `src` | `?` | URL or path to the media file. | +| `mime_type` | `?` | MIME type (e.g. `audio/mpeg`), or `None`. | +| `description` | `?` | Human-readable description of the media. | + +## `Note` + +An additional note attached to a definition. 
+ +Notes provide supplementary information such as usage guidance, +historical context, or grammatical remarks. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this note. | +| `value` | `?` | The note text. | +| `examples` | `?` | Examples associated with this note. | + +## `OpenDictionary` + +The main class for working with compiled ODict dictionaries. + +An `OpenDictionary` wraps a compiled binary dictionary and provides +methods for looking up terms, full-text search, tokenization, and more. + +# Construction + +Create from compiled bytes or an XML string using [`OpenDictionary::new`], +or load from a file path or remote registry using [`OpenDictionary::load`]. + +## `Pronunciation` + +A pronunciation entry for a word or etymology. + +Represents how a word is pronounced in a given notation system +(e.g. IPA, Pinyin), with optional audio media. + +| Field | Type | Description | +|-------|------|-------------| +| `kind` | `?` | The pronunciation system (e.g. IPA, Pinyin), or `None`. | +| `value` | `?` | The pronunciation notation string. | +| `media` | `?` | Audio media URLs for this pronunciation. | + +## `RemoteLoadOptions` + +Options for loading dictionaries from remote registries. + +| Field | Type | Description | +|-------|------|-------------| +| `out_dir` | `?` | Custom output directory for downloaded files. | +| `caching` | `?` | Whether to cache downloaded dictionaries locally. | +| `retries` | `?` | Number of download retries on failure. | + +## `SaveOptions` + +Options for saving a dictionary to disk. + +| Field | Type | Description | +|-------|------|-------------| +| `compress` | `?` | Optional Brotli compression settings. | + +## `SearchOptions` + +Options for configuring full-text search. + +| Field | Type | Description | +|-------|------|-------------| +| `directory` | `?` | Custom directory for the search index. | +| `threshold` | `?` | Relevance score threshold for filtering results. 
| +| `autoindex` | `?` | Whether to automatically create an index if one does not exist. | +| `limit` | `?` | Maximum number of results to return. | + +## `Sense` + +A word sense — a specific meaning grouped by part of speech. + +Senses represent distinct meanings of a word under a given etymology. +Each sense has a part of speech and contains definitions (or definition groups), +along with optional tags, translations, and inflected forms. + +| Field | Type | Description | +|-------|------|-------------| +| `pos` | `?` | The part of speech for this sense (e.g. noun, verb, adjective). | +| `lemma` | `?` | Optional lemma reference linking to another entry. | +| `definitions` | `?>` | Definitions or definition groups under this sense. | +| `tags` | `?` | Tags for categorizing or filtering this sense. | +| `translations` | `?` | Translations of this sense into other languages. | +| `forms` | `?` | Inflected forms of the word under this sense. | + +## `Token` + +A token produced by NLP-based text segmentation. + +Each token represents a segment of the input text, with metadata about +its position, detected language and script, and any matching dictionary entries. + +| Field | Type | Description | +|-------|------|-------------| +| `lemma` | `?` | The original token text (lemma form). | +| `language` | `?` | Detected language code (e.g. `"eng"`), or `None` if unknown. | +| `entries` | `?` | Matched dictionary entries for this token. | +| `kind` | `?` | The token kind (e.g. `"Word"`, `"Punctuation"`). | +| `script` | `?` | Detected script name (e.g. `"Latin"`, `"Han"`). | +| `start` | `usize` | Start byte offset in the original text. | +| `end` | `usize` | End byte offset in the original text. | + +## `TokenizeOptions` + +Options for configuring text tokenization. + +| Field | Type | Description | +|-------|------|-------------| +| `follow` | `?>` | Whether to follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). 
| +| `insensitive` | `?` | Whether to enable case-insensitive matching. | + +## `Translation` + +A translation of a word, definition, or example into another language. + +| Field | Type | Description | +|-------|------|-------------| +| `lang` | `?` | The BCP-47 language code (e.g. `"fra"`, `"deu"`). | +| `value` | `?` | The translated text. | diff --git a/node/Cargo.toml b/node/Cargo.toml index 35cf7c866..981c1c479 100644 --- a/node/Cargo.toml +++ b/node/Cargo.toml @@ -5,7 +5,7 @@ version = "1.1.1" publish = false [lib] -crate-type = ["cdylib"] +crate-type = ["cdylib", "rlib"] [features] default = [] diff --git a/python/Cargo.toml b/python/Cargo.toml index 46437e8f5..09ed73ff5 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [lib] name = "theopendictionary" -crate-type = ["cdylib"] +crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.27.2", features = ["either"] }