Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
297 changes: 287 additions & 10 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion crates/cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use clap::{crate_version, Parser, Subcommand};
use crate::alias::AliasCommands;
use crate::{
CompileArgs, DownloadArgs, DumpArgs, IndexArgs, InfoArgs, LexiconArgs, LookupArgs, MergeArgs,
NewArgs, SearchArgs, ServeArgs, TokenizeArgs,
NewArgs, SearchArgs, ServeArgs, SplitArgs, TokenizeArgs,
};

#[derive(Debug, Parser)]
Expand Down Expand Up @@ -71,6 +71,10 @@ pub enum Commands {
#[command(arg_required_else_help = true)]
Serve(ServeArgs),

/// Splits text into component dictionary words without attempting a whole-word lookup first
#[command(arg_required_else_help = true)]
Split(SplitArgs),

/// Tokenize text and find dictionary entries for each token
#[command(arg_required_else_help = true)]
Tokenize(TokenizeArgs),
Expand Down
2 changes: 2 additions & 0 deletions crates/cli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mod new;
mod print;
mod search;
mod serve;
mod split;
mod tokenize;
mod utils;

Expand All @@ -34,5 +35,6 @@ pub use new::*;
pub use print::*;
pub use search::*;
pub use serve::*;
pub use split::*;
pub use tokenize::*;
pub use utils::*;
3 changes: 2 additions & 1 deletion crates/cli/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use clap::Parser;
use console::style;
use odict_cli::{
alias, compile, download, dump, index, info, lexicon, lookup, merge, new, search, serve,
alias, compile, download, dump, index, info, lexicon, lookup, merge, new, search, serve, split,
tokenize, CLIContext, Commands, CLI,
};

Expand All @@ -23,6 +23,7 @@ async fn main() {
Commands::Search(ref args) => search(&mut ctx, args).await,
Commands::Serve(ref args) => serve(&mut ctx, args).await,
Commands::Info(ref args) => info(&mut ctx, args).await,
Commands::Split(ref args) => split(&mut ctx, args).await,
Commands::Tokenize(ref args) => tokenize(&mut ctx, args).await,
};

Expand Down
2 changes: 2 additions & 0 deletions crates/cli/src/serve/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::CLIContext;

mod lookup;
mod search;
mod split;
mod tokenize;

#[derive(Debug, Clone, ValueEnum)]
Expand Down Expand Up @@ -174,6 +175,7 @@ pub async fn serve<'a>(ctx: &mut CLIContext<'a>, args: &ServeArgs) -> anyhow::Re
.app_data(Data::clone(&data))
.service(lookup::handle_lookup)
.service(search::handle_search)
.service(split::handle_split)
.service(tokenize::handle_tokenize)
})
.bind(("0.0.0.0", *port))?
Expand Down
104 changes: 104 additions & 0 deletions crates/cli/src/serve/split.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
use actix_web::{
get,
http::{header::ContentType, StatusCode},
web::{Data, Path, Query},
HttpResponse, Responder, ResponseError,
};
use derive_more::{Display, Error};
use odict::{format::json::ToJSON, split::SplitOptions};
use serde::Deserialize;

use crate::get_lookup_entries;

/// Query-string parameters accepted by the `/{name}/split` endpoint.
#[derive(Debug, Deserialize)]
pub struct SplitRequest {
/// Text to split; the handler breaks this value on `,` into separate queries.
q: String,
/// Forwarded to `SplitOptions::follow`; the handler uses `false` when omitted.
follow: Option<bool>,
/// Forwarded to `SplitOptions::threshold`; the handler uses `1` when omitted.
min_length: Option<usize>,
}

/// Failures the split endpoint can report to the HTTP client.
///
/// The `Display` text of each variant is what gets sent as the response body.
#[derive(Debug, Display, Error)]
enum SplitError {
/// The dictionary cache returned no entry for the requested name (HTTP 404).
#[display("Dictionary not found: {}", name)]
DictionaryNotFound { name: String },

/// The cache lookup or reading the dictionary contents failed (HTTP 500).
#[display("Failed to read dictionary: {}", name)]
DictionaryReadError { name: String },

/// The dictionary's `split` call returned an error; `message` carries its text (HTTP 500).
#[display("Split error: {}", message)]
SplitError { message: String },

/// Converting the resulting entries to JSON failed (HTTP 500).
#[display("Failed to serialize response")]
SerializeError,
}

impl ResponseError for SplitError {
    /// Builds the HTTP response for this error: the status from
    /// [`status_code`](Self::status_code) with the `Display` text as the body.
    fn error_response(&self) -> HttpResponse {
        // The message can echo the user-supplied dictionary name, so it is
        // served as plain text; labelling it HTML would let a crafted path
        // segment inject markup into the response.
        HttpResponse::build(self.status_code())
            .insert_header(ContentType::plaintext())
            .body(self.to_string())
    }

    /// Maps each variant to its HTTP status: 404 for an unknown dictionary,
    /// 500 for every other failure.
    fn status_code(&self) -> StatusCode {
        match self {
            SplitError::DictionaryNotFound { .. } => StatusCode::NOT_FOUND,
            SplitError::DictionaryReadError { .. }
            | SplitError::SplitError { .. }
            | SplitError::SerializeError => StatusCode::INTERNAL_SERVER_ERROR,
        }
    }
}

/// Handles `GET /{name}/split`.
///
/// Splits the comma-separated `q` parameter into queries, runs the
/// dictionary's `split` over them, and returns the resulting entries as JSON.
///
/// # Errors
///
/// * `SplitError::DictionaryReadError` when the cache lookup or reading the
///   dictionary contents fails.
/// * `SplitError::DictionaryNotFound` when the cache has no dictionary with
///   the requested name.
/// * `SplitError::SplitError` when the split itself fails.
/// * `SplitError::SerializeError` when the entries cannot be rendered as JSON.
#[get("/{name}/split")]
async fn handle_split(
    params: Query<SplitRequest>,
    dict: Path<String>,
    dictionary_cache: Data<crate::serve::DictionaryCache>,
) -> Result<impl Responder, SplitError> {
    let SplitRequest {
        q: raw_queries,
        follow,
        min_length,
    } = params.0;

    // Borrow each segment from the request string instead of copying it.
    let queries: Vec<&str> = raw_queries.split(',').collect();

    let dictionary_name = dict.into_inner();

    let file = dictionary_cache
        .get(&dictionary_name)
        .await
        .map_err(|_e| SplitError::DictionaryReadError {
            name: dictionary_name.clone(),
        })?
        // Lazily build the error so the name is only cloned on a miss.
        .ok_or_else(|| SplitError::DictionaryNotFound {
            name: dictionary_name.clone(),
        })?;

    let dictionary = file
        .contents()
        .map_err(|_e| SplitError::DictionaryReadError {
            name: dictionary_name.clone(),
        })?;

    let opts = SplitOptions::default()
        .threshold(min_length.unwrap_or(1))
        .follow(follow.unwrap_or(false));

    let entries = dictionary
        .split(&queries, &opts)
        .map_err(|e| SplitError::SplitError {
            message: e.to_string(),
        })?;

    let json = get_lookup_entries(entries)
        .to_json(true)
        .map_err(|_e| SplitError::SerializeError)?;

    Ok(HttpResponse::Ok()
        .content_type("application/json")
        .body(json))
}
98 changes: 98 additions & 0 deletions crates/cli/src/split.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
use std::time::Duration;

use crate::enums::PrintFormat;
use crate::get_lookup_entries;
use crate::{context::CLIContext, print_entries};
use clap::Args;
use odict::{download::DictionaryDownloader, split::SplitOptions, LoadOptions, OpenDictionary};

// Command-line arguments for the `split` subcommand.
// Plain `//` comments are used here on purpose: clap's derive turns `///`
// doc comments into help text, which would change the CLI output.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
#[command(flatten_help = true)]
pub struct SplitArgs {
// First positional argument: where to load the dictionary from.
#[arg(required = true, help = "Path to a compiled dictionary")]
dictionary_path: String,

// Remaining positional arguments: one or more texts to split.
#[arg(required = true, help = "Text to split into dictionary words")]
queries: Vec<String>,

// Output format handed to `print_entries`.
#[arg(
short,
long,
value_enum,
default_value_t = PrintFormat::Print,
help = "Output format of the entries"
)]
format: PrintFormat,

// Forwarded to `SplitOptions::follow`.
#[arg(
short = 'F',
long,
help = "Follow see_also redirects until finding an entry with etymologies"
)]
follow: bool,

// Forwarded to `SplitOptions::threshold`.
#[arg(
short = 'm',
long,
default_value_t = 1,
help = "Minimum character length of each split segment"
)]
min_length: usize,

// Forwarded to `SplitOptions::insensitive`.
#[arg(
short = 'i',
long,
default_value_t = false,
help = "Perform case-insensitive lookups"
)]
insensitive: bool,

// Forwarded to `DictionaryDownloader::with_retries`.
#[arg(
short = 'r',
long,
default_value_t = crate::DEFAULT_RETRIES,
help = "Number of times to retry loading the dictionary (remote-only)"
)]
retries: u32,
}

/// Runs the `split` subcommand: loads the dictionary at the given path,
/// splits every query into dictionary entries, and prints them in the
/// requested format.
///
/// A spinner is shown while the dictionary loads and the split runs. It is
/// cleared on every exit path, including failures, so an error message is
/// never printed next to a leftover spinner frame.
///
/// # Errors
///
/// Returns an error if the dictionary cannot be loaded or read, if the split
/// fails, or if printing the entries fails.
pub async fn split<'a>(ctx: &mut CLIContext<'a>, args: &SplitArgs) -> anyhow::Result<()> {
    let SplitArgs {
        dictionary_path: path,
        queries,
        format,
        follow,
        min_length,
        insensitive,
        retries,
    } = args;

    let spinner = indicatif::ProgressBar::new_spinner();

    spinner.enable_steady_tick(Duration::from_millis(100));

    let file = OpenDictionary::load_with_options(
        path,
        LoadOptions::default()
            .with_downloader(DictionaryDownloader::default().with_retries(*retries)),
    )
    .await
    .map_err(|err| {
        // Clear the spinner before the error propagates to the caller.
        spinner.finish_and_clear();
        err
    })?;

    let dictionary = file.contents().map_err(|err| {
        spinner.finish_and_clear();
        err
    })?;

    let opts = SplitOptions::default()
        .threshold(*min_length)
        .follow(*follow)
        .insensitive(*insensitive);

    let result = dictionary.split(queries, opts);

    // Stop the spinner before anything is printed, whether or not the split succeeded.
    spinner.finish_and_clear();

    let entries = result?;

    print_entries(ctx, get_lookup_entries(entries), format)?;

    Ok(())
}
99 changes: 63 additions & 36 deletions crates/lib/src/core/lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,47 @@ macro_rules! lookup {
Ok($opt::None)
}

/// Greedily splits `query` into dictionary entries, scanning left to right.
///
/// Starting from the whole remaining text, the candidate is shortened one
/// character at a time from the right until `find_entry` matches it. A match
/// is recorded and scanning resumes right after it. When the candidate has
/// shrunk to the threshold length (in characters) without matching, that
/// candidate is skipped and scanning resumes after it.
///
/// A threshold of `0` is treated as `1`. Otherwise the give-up check could
/// never fire, the candidate would shrink to nothing, and the rest of the
/// query would be dropped without being looked up.
///
/// # Errors
///
/// Propagates any error returned by `find_entry`.
fn perform_split<'a>(
    &'a self,
    query: &str,
    options: &crate::split::SplitOptions,
) -> crate::Result<Vec<LookupResult<&'a $ret>>> {
    let crate::split::SplitOptions {
        threshold,
        follow,
        insensitive,
    } = options;

    // Guarantee forward progress even when the caller passes a zero threshold.
    let min_len = (*threshold).max(1);

    // Byte offset of every character boundary, plus the end of the string,
    // so candidates can be sliced straight out of `query` by character
    // position without allocating a new string per attempt.
    let boundaries: Vec<usize> = query
        .char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(query.len()))
        .collect();
    let char_count = boundaries.len() - 1;

    let mut results: Vec<LookupResult<&'a $ret>> = Vec::new();
    let mut start = 0;
    let mut end = char_count;

    while start < end {
        let candidate = &query[boundaries[start]..boundaries[end]];
        let mut path = Vec::new();

        match self.find_entry(follow, insensitive, candidate, None, &mut path)? {
            $opt::Some(result) => {
                results.push(result);
                start = end;
                end = char_count;
            }
            $opt::None => {
                if end - start <= min_len {
                    // Nothing this short matched: skip it and carry on.
                    start = end;
                    end = char_count;
                } else {
                    end -= 1;
                }
            }
        }
    }

    Ok(results)
}

fn perform_lookup<'a, Options>(
&'a self,
query: &str,
Expand All @@ -154,46 +195,32 @@ macro_rules! lookup {
return Ok(vec![result]);
}

let mut results: Vec<LookupResult<&$ret>> = Vec::new();

if let LookupStrategy::Split(min_length) = strategy {
let chars: Vec<_> = query.chars().collect();
let mut start = 0;
let mut end = chars.len();

while start < end {
let substr: String = chars[start..end].iter().collect();
let mut substr_path = Vec::new();
let maybe_entry = self.find_entry(
follow,
insensitive,
substr.as_str(),
None,
&mut substr_path,
);

match maybe_entry {
Ok($opt::Some(result)) => {
results.push(result);
start = end;
end = chars.len();
continue;
}
Ok($opt::None) => {
if substr.len() <= *min_length {
start = end;
end = chars.len();
continue;
}
}
Err(e) => return Err(e),
}
let split_opts = crate::split::SplitOptions::default()
.threshold(*min_length)
.follow(*follow)
.insensitive(*insensitive);

end -= 1;
}
return self.perform_split(query, &split_opts);
}

Ok(results)
Ok(vec![])
}

/// Splits each of `queries` into dictionary entries and returns all matches
/// in a single flat list.
///
/// Every query is handed to `perform_split` through a parallel iterator.
///
/// # Errors
///
/// Returns an error if splitting any query fails.
pub fn split<'a, Query, Options>(
    &'a self,
    queries: &Vec<Query>,
    options: Options,
) -> crate::Result<Vec<LookupResult<&'a $ret>>>
where
    Query: AsRef<str> + Send + Sync,
    Options: AsRef<crate::split::SplitOptions> + Send + Sync,
{
    // One result list per query; bail out if any query failed.
    let per_query = queries
        .par_iter()
        .map(|query| self.perform_split(query.as_ref(), options.as_ref()))
        .collect::<crate::Result<Vec<_>>>()?;

    Ok(per_query.into_iter().flatten().collect())
}

pub fn lookup<'a, 'b, Query, Options>(
Expand Down
1 change: 1 addition & 0 deletions crates/lib/src/core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod compile;
pub mod lexicon;
pub mod lookup;
pub mod merge;
pub mod split;
pub mod preview;
pub mod rank;
pub mod read;
Expand Down
Loading
Loading