From 543494107bafcb87056d9384c74bc87c7eb034f0 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 2 Dec 2025 03:22:48 +0800 Subject: [PATCH 1/6] chore: start working on irregular plural module --- harper-core/src/linting/irregular_verbs.json | 114 ++++++++ harper-core/src/linting/irregular_verbs.rs | 80 +++++ harper-core/src/linting/mod.rs | 1 + harper-core/src/linting/will_non_lemma.rs | 293 +++++++++++++++++++ 4 files changed, 488 insertions(+) create mode 100644 harper-core/src/linting/irregular_verbs.json create mode 100644 harper-core/src/linting/irregular_verbs.rs create mode 100644 harper-core/src/linting/will_non_lemma.rs diff --git a/harper-core/src/linting/irregular_verbs.json b/harper-core/src/linting/irregular_verbs.json new file mode 100644 index 000000000..49c8b3c32 --- /dev/null +++ b/harper-core/src/linting/irregular_verbs.json @@ -0,0 +1,114 @@ +[ + "// comment lines look like this", + ["arise", "arose", "arisen"], + ["awake", "awoke", "awoken"], + "// be -- was/were -- been", + ["become", "became", "become"], + ["begin", "began", "begun"], + ["bend", "bent", "bent"], + ["bet", "bet", "bet"], + ["bid", "bade", "bidden"], + ["bind", "bound", "bound"], + ["bite", "bit", "bitten"], + ["bleed", "bled", "bled"], + ["blow", "blew", "blown"], + ["break", "broke", "broken"], + ["breed", "bred", "bred"], + ["bring", "brought", "brought"], + ["build", "built", "built"], + ["burst", "burst", "burst"], + ["buy", "bought", "bought"], + ["catch", "caught", "caught"], + ["choose", "chose", "chosen"], + ["come", "came", "come"], + ["cost", "cost", "cost"], + ["cut", "cut", "cut"], + ["dive", "dove", "dove"], + ["do", "did", "done"], + ["drink", "drank", "drunk"], + ["drive", "drove", "driven"], + ["eat", "ate", "eaten"], + ["fall", "fell", "fallen"], + ["feed", "fed", "fed"], + ["feel", "felt", "felt"], + ["fight", "fought", "fought"], + ["find", "found", "found"], + ["fly", "flew", "flown"], + ["forgo", "forwent", "forgone"], + ["freeze", "froze", "frozen"], + 
["get", "got", "got"], + ["go", "went", "gone"], + ["have", "had", "had"], + ["hear", "heard", "heard"], + ["hit", "hit", "hit"], + ["hold", "held", "held"], + ["hurt", "hurt", "hurt"], + ["input", "input", "input"], + ["keep", "kept", "kept"], + ["know", "knew", "known"], + ["lay", "laid", "lain"], + ["light", "lit", "lit"], + ["lose", "lost", "lost"], + ["make", "made", "made"], + ["mistake", "mistook", "mistaken"], + ["overtake", "overtook", "overtaken"], + ["overthrow", "overthrew", "overthrown"], + ["overwrite", "overwrote", "overwritten"], + ["partake", "partook", "partaken"], + ["pay", "paid", "paid"], + ["put", "put", "put"], + ["read", "read", "read"], + ["redo", "redid", "redone"], + ["remake", "remade", "remade"], + ["reread", "reread", "reread"], + ["reset", "reset", "reset"], + ["ride", "rode", "ridden"], + ["ring", "rang", "rung"], + ["rise", "rose", "risen"], + ["run", "ran", "run"], + ["see", "saw", "seen"], + ["sell", "sold", "sold"], + ["send", "sent", "sent"], + ["set", "set", "set"], + ["shake", "shook", "shaken"], + ["shed", "shed", "shed"], + ["shine", "shone", "shone"], + ["shoe", "shod", "shod"], + ["shoot", "shot", "shot"], + ["show", "showed", "shown"], + ["shrink", "shrank", "shrunk"], + ["shut", "shut", "shut"], + ["sing", "sang", "sung"], + ["sink", "sank", "sunk"], + ["sit", "sat", "sat"], + ["slay", "slew", "slain"], + ["sleep", "slept", "slept"], + ["slide", "slid", "slid"], + ["slit", "slit", "slit"], + ["speak", "spoke", "spoken"], + ["spin", "spun", "spun"], + ["spit", "spat", "spat"], + ["split", "split", "split"], + ["spread", "spread", "spread"], + ["spring", "sprang", "sprung"], + ["stand", "stood", "stood"], + ["steal", "stole", "stolen"], + ["stick", "stuck", "stuck"], + ["sting", "stung", "stung"], + ["stink", "stank", "stunk"], + ["stride", "strode", "stridden"], + ["strike", "struck", "stricken"], + ["string", "strung", "strung"], + ["swim", "swam", "swum"], + ["swing", "swung", "swung"], + ["take", "took", "taken"], + 
["teach", "taught", "taught"], + ["tear", "tore", "torn"], + ["think", "thought", "thought"], + ["throw", "threw", "thrown"], + ["tread", "trod", "trodden"], + ["undo", "undid", "undone"], + ["wear", "wore", "worn"], + ["wind", "wound", "wound"], + ["write", "wrote", "written"] +] \ No newline at end of file diff --git a/harper-core/src/linting/irregular_verbs.rs b/harper-core/src/linting/irregular_verbs.rs new file mode 100644 index 000000000..be6d5e63d --- /dev/null +++ b/harper-core/src/linting/irregular_verbs.rs @@ -0,0 +1,80 @@ +use hashbrown::HashSet; +use lazy_static::lazy_static; +use serde::Deserialize; +use std::sync::Arc; + +type Verb = (String, String, String); + +#[derive(Debug, Deserialize)] +pub struct VerbTable { + verbs: Vec, +} + +/// The uncached function that is used to produce the original copy of the +/// irregular verb table. +fn uncached_inner_new() -> Arc { + VerbTable::from_json_file( + include_str!("irregular_verbs.json") + ) + .map(Arc::new) + .unwrap_or_else(|e| panic!("Failed to load irregular verb table: {}", e)) +} + +lazy_static! 
{ + static ref VERBS: Arc = uncached_inner_new(); +} + +impl VerbTable { + pub fn new() -> Self { + Self { + verbs: VerbTable::default() + } + } + + pub fn from_json_file(json: &str) -> Result { + // Deserialize into Vec to handle mixed types + let values: Vec = + serde_json::from_str(json).expect("Failed to parse irregular verbs JSON"); + + let mut verbs = Vec::new(); + + for value in values { + match value { + serde_json::Value::Array(arr) if arr.len() == 3 => { + // Handle array of 3 strings + if let (Some(lemma), Some(preterite), Some(past_participle)) = + (arr[0].as_str(), arr[1].as_str(), arr[2].as_str()) + { + verbs.push(( + lemma.to_string(), + preterite.to_string(), + past_participle.to_string(), + )); + } + } + // Strings are used for comments to guide contributors editing the file + serde_json::Value::String(_) => {} + _ => {} + } + } + + Ok(Self { verbs }) + } +} + +impl Default for VerbTable { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_verb_table() { + let vt = VerbTable::from_json_file(include_str!("irregular_verbs.json")); + eprintln!("{:#?}", vt); + } +} diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 52347c41b..368b9e614 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -78,6 +78,7 @@ mod inflected_verb_after_to; mod initialism_linter; mod initialisms; mod interested_in; +mod irregular_verbs; mod it_is; mod it_looks_like_that; mod it_would_be; diff --git a/harper-core/src/linting/will_non_lemma.rs b/harper-core/src/linting/will_non_lemma.rs new file mode 100644 index 000000000..e0b69adb6 --- /dev/null +++ b/harper-core/src/linting/will_non_lemma.rs @@ -0,0 +1,293 @@ +use hashbrown::HashMap; + +use crate::expr::{Expr, SequenceExpr}; +use crate::linting::expr_linter::Chunk; +use crate::linting::irregular_verbs; +use crate::linting::{ExprLinter, LintKind, Suggestion}; +use crate::spell::Dictionary; +use 
crate::{Lint, Token, TokenStringExt}; + +/// Maps irregular simple past verb forms to their lemma forms +const IRREGULAR_VERBS: &[(&str, &str)] = &[ + ("ate", "eat"), + ("awoke", "awake"), + ("broke", "break"), + ("burnt", "burn"), + ("came", "come"), + ("did", "do"), + ("dove", "dive"), + ("drank", "drink"), + ("drove", "drive"), + ("flew", "fly"), + ("forwent", "forgo"), + ("froze", "freeze"), + ("got", "get"), + ("had", "have"), + ("hit", "hit"), + // ("hurt", "hurt"), + ("knew", "know"), + ("laid", "lay"), + ("lit", "light"), + ("lost", "lose"), + ("made", "make"), + ("mistook", "mistake"), + ("overthrew", "overthrow"), + ("overtook", "overtake"), + ("overwrote", "overwrite"), + ("ran", "run"), + // ("read", "read"), + ("redid", "redo"), + // ("reread", "reread"), + ("rode", "ride"), + ("rose", "rise"), + ("saw", "see"), + ("taught", "teach"), + ("thought", "think"), + ("threw", "throw"), + ("took", "take"), + ("tore", "tear"), + ("undid", "undo"), + ("went", "go"), + ("wore", "wear"), + ("wrote", "write"), +]; + +lazy_static::lazy_static! 
{ + static ref IRREGULAR_VERB_MAP: HashMap<&'static str, &'static str> = + IRREGULAR_VERBS.iter().copied().collect(); +} + +pub struct WillNonLemma +where + D: Dictionary, +{ + expr: Box, + dict: D, +} + +impl WillNonLemma +where + D: Dictionary, +{ + pub fn new(dict: D) -> Self { + Self { + expr: Box::new( + SequenceExpr::word_set(&["will", "shall"]) + .t_ws() + .then_kind_where(|kind| { + kind.is_verb() + && !kind.is_verb_lemma() + && (!kind.is_noun() || kind.is_verb_progressive_form()) + }), + ), + dict, + } + } +} + +impl ExprLinter for WillNonLemma { + type Unit = Chunk; + + fn expr(&self) -> &dyn Expr { + self.expr.as_ref() + } + + fn match_to_lint_with_context( + &self, + toks: &[Token], + src: &[char], + ctx: Option<(&[Token], &[Token])>, + ) -> Option { + let matched_chars = toks.span()?.get_content(src); + + // 'modal' is the 3rd last token, verb is the last token + let verb_idx = toks.len() - 1; + let verb_tok = &toks[verb_idx]; + let verb_str = verb_tok.span.get_content_string(src); + + let suggest = + |text: &str| Suggestion::replace_with_match_case(text.chars().collect(), matched_chars); + + let maybe_prev_word_tok: Option<&Token> = match ctx { + Some((prev, _)) if prev.len() >= 2 => { + let last = &prev[prev.len() - 1]; + let potential_word = &prev[prev.len() - 2]; + if last.kind.is_whitespace() && potential_word.kind.is_word() { + Some(potential_word) + } else { + None + } + } + _ => None, + }; + + let mut suggestions = vec![]; + + if verb_tok.kind.is_verb_simple_past_form() + && let Some(&lemma) = IRREGULAR_VERB_MAP.get(verb_str.as_str()) + && self + .dict + .get_word_metadata_str(lemma) + .is_some_and(|m| m.is_verb_lemma()) + { + suggestions.push(suggest(&format!("will {}", lemma))); + suggestions.push(suggest(&verb_str)); + } + if verb_tok.kind.is_verb_third_person_singular_present_form() { + let candidate = &verb_str[..verb_str.len() - 1]; + if self + .dict + .get_word_metadata_str(candidate) + .is_some_and(|m| m.is_verb_lemma()) + { + 
suggestions.push(suggest(&format!("will {}", candidate))); + suggestions.push(suggest(&verb_str)); + + // Add suggestion for plural nouns + if maybe_prev_word_tok.is_some_and(|tok| tok.kind.is_plural_nominal()) { + suggestions.push(suggest(candidate)); + } + } + } + if verb_tok.kind.is_verb_progressive_form() { + if let Some(stem) = verb_str.strip_suffix("ing") { + // Check regular form (e.g., 'walking' -> 'walk') + if self + .dict + .get_word_metadata_str(stem) + .is_some_and(|m| m.is_verb_lemma()) + { + suggestions.push(Suggestion::replace_with_match_case( + format!("will {}", stem).chars().collect(), + matched_chars, + )); + } + + // Check form that adds 'e' (e.g., 'coming' -> 'come') + let stem_with_e = format!("{}e", stem); + if self + .dict + .get_word_metadata_str(&stem_with_e) + .is_some_and(|m| m.is_verb_lemma()) + { + suggestions.push(Suggestion::replace_with_match_case( + format!("will {}", stem_with_e).chars().collect(), + matched_chars, + )); + } + } + + let v_ing = Suggestion::replace_with_match_case( + verb_tok.span.get_content(src).to_vec(), + toks.span()?.get_content(src), + ); + suggestions.push(v_ing); + let will_be_v_ing = Suggestion::replace_with_match_case( + format!("will be {}", verb_str) + .chars() + .collect::>(), + toks.span()?.get_content(src), + ); + suggestions.push(will_be_v_ing); + } + + Some(Lint { + span: toks.span()?, + lint_kind: LintKind::Grammar, + suggestions, + message: "`Will` and `shall` should be followed by a verb in its base form." 
+ .to_string(), + ..Default::default() + }) + } + + fn description(&self) -> &str { + "Flags wrong verb forms after `will` or `shall`" + } +} + +#[cfg(test)] +mod tests { + use super::WillNonLemma; + use crate::linting::tests::{assert_good_and_bad_suggestions, assert_lint_count}; + use crate::spell::FstDictionary; + + #[test] + fn fix_will_ran() { + // singular + will + irregular preterite + assert_good_and_bad_suggestions( + "The brown fox will ran thru the meadow.", + WillNonLemma::new(FstDictionary::curated()), + &[ + "The brown fox will run thru the meadow.", + "The brown fox ran thru the meadow.", + ], + &[], + ); + } + + #[test] + fn fix_will_exists() { + // plural + will + 3rd person singular present + assert_good_and_bad_suggestions( + "there is a good chance duplicate Rule IDs will exists.", + WillNonLemma::new(FstDictionary::curated()), + &[ + "there is a good chance duplicate Rule IDs will exist.", + "there is a good chance duplicate Rule IDs exists.", + "there is a good chance duplicate Rule IDs exist.", + ], + &[], + ); + } + + #[test] + fn ignore_shall_vessels() { + // "nor" + shall + (3rd person singular present == plural noun) + assert_lint_count( + "No Preference shall be given by any Regulation of Commerce or Revenue to the Ports of one State over those of another; nor shall Vessels bound to, or from, one State, be obliged to enter, clear, or pay Duties in another.", + WillNonLemma::new(FstDictionary::curated()), + 0, + ); + } + + #[test] + fn ignore_will_tools() { + // "free will" + (3rd person singular present == plural noun) + assert_lint_count( + "Give your AI free will tools.", + WillNonLemma::new(FstDictionary::curated()), + 0, + ); + } + + #[test] + fn fix_will_coming_soon() { + // plural + will + progressive + assert_good_and_bad_suggestions( + "More advanced features will coming soon, so stay tuned!", + WillNonLemma::new(FstDictionary::curated()), + &[ + "More advanced features will come soon, so stay tuned!", + "More advanced features 
coming soon, so stay tuned!", + "More advanced features will be coming soon, so stay tuned!", + ], + &[], + ); + } + + #[test] + fn fix_will_coming_next() { + // singular + will + progressive + assert_good_and_bad_suggestions( + "on CPU and GPU (NPU support will coming next)", + WillNonLemma::new(FstDictionary::curated()), + &[ + "on CPU and GPU (NPU support will come next)", + "on CPU and GPU (NPU support coming next)", + "on CPU and GPU (NPU support will be coming next)", + ], + &[], + ); + } +} From 3c5d46d40beb33dd561d0424a61a89ffc813683a Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 2 Dec 2025 13:32:06 +0800 Subject: [PATCH 2/6] feat: irregular verb module --- harper-core/src/linting/irregular_verbs.json | 17 ++- harper-core/src/linting/irregular_verbs.rs | 74 +++++++--- harper-core/src/linting/mod.rs | 1 + .../linting/simple_past_to_past_participle.rs | 130 ++++-------------- 4 files changed, 98 insertions(+), 124 deletions(-) diff --git a/harper-core/src/linting/irregular_verbs.json b/harper-core/src/linting/irregular_verbs.json index 49c8b3c32..fce374e06 100644 --- a/harper-core/src/linting/irregular_verbs.json +++ b/harper-core/src/linting/irregular_verbs.json @@ -1,8 +1,9 @@ [ - "// comment lines look like this", + "// comments can appear in the line before an entry", + "// or in place of an entry", ["arise", "arose", "arisen"], ["awake", "awoke", "awoken"], - "// be -- was/were -- been", + "// be/am/are/is -- was/were -- been", ["become", "became", "become"], ["begin", "began", "begun"], ["bend", "bent", "bent"], @@ -34,10 +35,14 @@ ["fight", "fought", "fought"], ["find", "found", "found"], ["fly", "flew", "flown"], + ["forget", "forgot", "forgotten"], ["forgo", "forwent", "forgone"], ["freeze", "froze", "frozen"], + "// get -- got -- gotten", ["get", "got", "got"], + ["give", "gave", "given"], ["go", "went", "gone"], + ["grow", "grew", "grown"], ["have", "had", "had"], ["hear", "heard", "heard"], ["hit", "hit", "hit"], @@ -47,10 +52,12 @@ 
["keep", "kept", "kept"], ["know", "knew", "known"], ["lay", "laid", "lain"], + ["lead", "led", "led"], ["light", "lit", "lit"], ["lose", "lost", "lost"], ["make", "made", "made"], ["mistake", "mistook", "mistaken"], + ["output", "output", "output"], ["overtake", "overtook", "overtaken"], ["overthrow", "overthrew", "overthrown"], ["overwrite", "overwrote", "overwritten"], @@ -79,12 +86,14 @@ ["shrink", "shrank", "shrunk"], ["shut", "shut", "shut"], ["sing", "sang", "sung"], + "// sink -- sank -- sunken??", ["sink", "sank", "sunk"], ["sit", "sat", "sat"], ["slay", "slew", "slain"], ["sleep", "slept", "slept"], ["slide", "slid", "slid"], ["slit", "slit", "slit"], + "// sneak -- sneaked/snuck -- sneaked/snuck", ["speak", "spoke", "spoken"], ["spin", "spun", "spun"], ["spit", "spat", "spat"], @@ -99,6 +108,8 @@ ["stride", "strode", "stridden"], ["strike", "struck", "stricken"], ["string", "strung", "strung"], + ["sew", "sewed", "sewn"], + ["swear", "swore", "sworn"], ["swim", "swam", "swum"], ["swing", "swung", "swung"], ["take", "took", "taken"], @@ -108,7 +119,9 @@ ["throw", "threw", "thrown"], ["tread", "trod", "trodden"], ["undo", "undid", "undone"], + ["wake", "woke", "woken"], ["wear", "wore", "worn"], + ["weave", "wove", "woven"], ["wind", "wound", "wound"], ["write", "wrote", "written"] ] \ No newline at end of file diff --git a/harper-core/src/linting/irregular_verbs.rs b/harper-core/src/linting/irregular_verbs.rs index be6d5e63d..a1860168a 100644 --- a/harper-core/src/linting/irregular_verbs.rs +++ b/harper-core/src/linting/irregular_verbs.rs @@ -1,4 +1,3 @@ -use hashbrown::HashSet; use lazy_static::lazy_static; use serde::Deserialize; use std::sync::Arc; @@ -6,29 +5,25 @@ use std::sync::Arc; type Verb = (String, String, String); #[derive(Debug, Deserialize)] -pub struct VerbTable { +pub struct IrregularVerbs { verbs: Vec, } /// The uncached function that is used to produce the original copy of the /// irregular verb table. 
-fn uncached_inner_new() -> Arc { - VerbTable::from_json_file( - include_str!("irregular_verbs.json") - ) - .map(Arc::new) - .unwrap_or_else(|e| panic!("Failed to load irregular verb table: {}", e)) +fn uncached_inner_new() -> Arc { + IrregularVerbs::from_json_file(include_str!("irregular_verbs.json")) + .map(Arc::new) + .unwrap_or_else(|e| panic!("Failed to load irregular verb table: {}", e)) } lazy_static! { - static ref VERBS: Arc = uncached_inner_new(); + static ref VERBS: Arc = uncached_inner_new(); } -impl VerbTable { +impl IrregularVerbs { pub fn new() -> Self { - Self { - verbs: VerbTable::default() - } + Self { verbs: vec![] } } pub fn from_json_file(json: &str) -> Result { @@ -60,9 +55,20 @@ impl VerbTable { Ok(Self { verbs }) } + + pub fn get() -> Arc { + (*VERBS).clone() + } + + pub fn get_past_participle_for_preterite(&self, preterite: &str) -> Option<&str> { + self.verbs + .iter() + .find(|(_, pt, _)| pt.eq_ignore_ascii_case(preterite)) + .map(|(_, _, pp)| pp.as_str()) + } } -impl Default for VerbTable { +impl Default for IrregularVerbs { fn default() -> Self { Self::new() } @@ -73,8 +79,42 @@ mod tests { use super::*; #[test] - fn test_verb_table() { - let vt = VerbTable::from_json_file(include_str!("irregular_verbs.json")); - eprintln!("{:#?}", vt); + fn can_find_irregular_past_participle_for_preterite_lowercase() { + assert_eq!( + IrregularVerbs::get().get_past_participle_for_preterite("arose"), + Some("arisen") + ); + } + + #[test] + fn can_find_irregular_past_participle_for_preterite_uppercase() { + assert_eq!( + IrregularVerbs::get().get_past_participle_for_preterite("WENT"), + Some("gone") + ); + } + + #[test] + fn can_find_irregular_past_participle_same_as_past_tense() { + assert_eq!( + IrregularVerbs::get().get_past_participle_for_preterite("taught"), + Some("taught") + ); + } + + #[test] + fn cant_find_regular_past_participle() { + assert_eq!( + IrregularVerbs::get().get_past_participle_for_preterite("walked"), + None + ); + } + + #[test] + 
fn cant_find_non_verb() { + assert_eq!( + IrregularVerbs::get().get_past_participle_for_preterite("the"), + None + ); } } diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 368b9e614..e5753fe86 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -202,6 +202,7 @@ mod would_never_have; pub use expr_linter::ExprLinter; pub use initialism_linter::InitialismLinter; +pub use irregular_verbs::IrregularVerbs; pub use lint::Lint; pub use lint_group::{LintGroup, LintGroupConfig}; pub use lint_kind::LintKind; diff --git a/harper-core/src/linting/simple_past_to_past_participle.rs b/harper-core/src/linting/simple_past_to_past_participle.rs index 7217f4c0c..ce4098848 100644 --- a/harper-core/src/linting/simple_past_to_past_participle.rs +++ b/harper-core/src/linting/simple_past_to_past_participle.rs @@ -1,82 +1,11 @@ use crate::linting::expr_linter::Chunk; use crate::{ Token, - char_string::CharStringExt, expr::{All, Expr, FirstMatchOf, SequenceExpr}, - linting::{ExprLinter, Lint, LintKind, Suggestion}, + linting::{ExprLinter, IrregularVerbs, Lint, LintKind, Suggestion}, patterns::{InflectionOfBe, WordSet}, }; -/// Maps common irregular verbs between their simple past and past participle forms. 
-const IRREGULAR_VERBS: &[(&str, &str)] = &[ - ("arose", "arisen"), - ("ate", "eaten"), - ("awoke", "awoken"), - ("bade", "bidden"), - ("became", "become"), - ("began", "begun"), - ("bit", "bitten"), - ("blew", "blown"), - ("bought", "bought"), - ("brang", "brung"), - ("broke", "broken"), - ("brought", "brought"), - ("came", "come"), - ("chose", "chosen"), - ("did", "done"), - ("drank", "drunk"), - ("drove", "driven"), - ("fell", "fallen"), - ("felt", "felt"), - ("flew", "flown"), - ("forgot", "forgotten"), - ("forwent", "forgone"), - ("gave", "given"), - ("grew", "grown"), - ("had", "had"), - ("heard", "heard"), - ("hit", "hit"), - ("input", "input"), - ("knew", "known"), - ("led", "led"), - ("mistook", "mistaken"), - ("output", "output"), - ("overtook", "overtaken"), - ("paid", "paid"), - ("partook", "partaken"), - // proved, proved/proven - ("put", "put"), - ("ran", "run"), - ("rang", "rung"), - ("read", "read"), - ("reset", "reset"), - ("rode", "ridden"), - ("rose", "risen"), - ("sang", "sung"), - ("sank", "sunken"), - ("saw", "seen"), - ("set", "set"), - ("sewed", "sewn"), - ("slew", "slain"), - ("slid", "slid"), - ("spoke", "spoken"), - ("sprang", "sprung"), - ("stank", "stunk"), - ("stole", "stolen"), - ("stood", "stood"), - ("swam", "swum"), - ("swore", "sworn"), - ("thought", "thought"), - ("trod", "trodden"), - ("took", "taken"), - // was, been - // were, been - ("went", "gone"), - ("woke", "woken"), - ("wove", "woven"), - ("wrote", "written"), -]; - /// Corrects simple past tense verbs to past participle after auxiliary verbs like "have" or "be". 
pub struct SimplePastToPastParticiple { expr: Box, @@ -141,41 +70,32 @@ impl ExprLinter for SimplePastToPastParticiple { let verb_tok = &toks[2]; - let verb_ch = verb_tok.span.get_content(src); - if !IRREGULAR_VERBS - .iter() - .any(|(t, p)| verb_ch.eq_ignore_ascii_case_str(t) && p != t) - { - return None; - } + let simple_past = verb_tok.span.get_content_string(src); - let (simple_past, past_participle) = IRREGULAR_VERBS - .iter() - .find(|(simple_past, _)| { - verb_tok - .span - .get_content(src) - .eq_ignore_ascii_case_str(simple_past) + if let Some(past_participle) = IrregularVerbs::get() + .get_past_participle_for_preterite(&simple_past) + .filter(|pp| pp != &simple_past) + { + let suggestions = vec![Suggestion::replace_with_match_case( + past_participle.chars().collect(), + verb_tok.span.get_content(src), + )]; + + let message = format!( + "Use the past participle `{}` instead of `{}` when using compound tenses or passive voice.", + past_participle, simple_past + ); + + Some(Lint { + span: verb_tok.span, + lint_kind: LintKind::Grammar, + suggestions, + message, + ..Default::default() }) - .unwrap(); - - let suggestions = vec![Suggestion::replace_with_match_case( - past_participle.chars().collect(), - verb_tok.span.get_content(src), - )]; - - let message = format!( - "Use the past participle `{}` instead of `{}` when using compound tenses or passive voice.", - past_participle, simple_past - ); - - Some(Lint { - span: verb_tok.span, - lint_kind: LintKind::Grammar, - suggestions, - message, - ..Default::default() - }) + } else { + None + } } fn description(&self) -> &str { From 4c5ee0c5c54aa2edaf5c97ad056efd8cd28c8690 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 2 Dec 2025 14:16:49 +0800 Subject: [PATCH 3/6] fix: `will_non_lemma.rs` shouldn't be included here --- harper-core/src/linting/will_non_lemma.rs | 293 ---------------------- 1 file changed, 293 deletions(-) delete mode 100644 harper-core/src/linting/will_non_lemma.rs diff --git 
a/harper-core/src/linting/will_non_lemma.rs b/harper-core/src/linting/will_non_lemma.rs deleted file mode 100644 index e0b69adb6..000000000 --- a/harper-core/src/linting/will_non_lemma.rs +++ /dev/null @@ -1,293 +0,0 @@ -use hashbrown::HashMap; - -use crate::expr::{Expr, SequenceExpr}; -use crate::linting::expr_linter::Chunk; -use crate::linting::irregular_verbs; -use crate::linting::{ExprLinter, LintKind, Suggestion}; -use crate::spell::Dictionary; -use crate::{Lint, Token, TokenStringExt}; - -/// Maps irregular simple past verb forms to their lemma forms -const IRREGULAR_VERBS: &[(&str, &str)] = &[ - ("ate", "eat"), - ("awoke", "awake"), - ("broke", "break"), - ("burnt", "burn"), - ("came", "come"), - ("did", "do"), - ("dove", "dive"), - ("drank", "drink"), - ("drove", "drive"), - ("flew", "fly"), - ("forwent", "forgo"), - ("froze", "freeze"), - ("got", "get"), - ("had", "have"), - ("hit", "hit"), - // ("hurt", "hurt"), - ("knew", "know"), - ("laid", "lay"), - ("lit", "light"), - ("lost", "lose"), - ("made", "make"), - ("mistook", "mistake"), - ("overthrew", "overthrow"), - ("overtook", "overtake"), - ("overwrote", "overwrite"), - ("ran", "run"), - // ("read", "read"), - ("redid", "redo"), - // ("reread", "reread"), - ("rode", "ride"), - ("rose", "rise"), - ("saw", "see"), - ("taught", "teach"), - ("thought", "think"), - ("threw", "throw"), - ("took", "take"), - ("tore", "tear"), - ("undid", "undo"), - ("went", "go"), - ("wore", "wear"), - ("wrote", "write"), -]; - -lazy_static::lazy_static! 
{ - static ref IRREGULAR_VERB_MAP: HashMap<&'static str, &'static str> = - IRREGULAR_VERBS.iter().copied().collect(); -} - -pub struct WillNonLemma -where - D: Dictionary, -{ - expr: Box, - dict: D, -} - -impl WillNonLemma -where - D: Dictionary, -{ - pub fn new(dict: D) -> Self { - Self { - expr: Box::new( - SequenceExpr::word_set(&["will", "shall"]) - .t_ws() - .then_kind_where(|kind| { - kind.is_verb() - && !kind.is_verb_lemma() - && (!kind.is_noun() || kind.is_verb_progressive_form()) - }), - ), - dict, - } - } -} - -impl ExprLinter for WillNonLemma { - type Unit = Chunk; - - fn expr(&self) -> &dyn Expr { - self.expr.as_ref() - } - - fn match_to_lint_with_context( - &self, - toks: &[Token], - src: &[char], - ctx: Option<(&[Token], &[Token])>, - ) -> Option { - let matched_chars = toks.span()?.get_content(src); - - // 'modal' is the 3rd last token, verb is the last token - let verb_idx = toks.len() - 1; - let verb_tok = &toks[verb_idx]; - let verb_str = verb_tok.span.get_content_string(src); - - let suggest = - |text: &str| Suggestion::replace_with_match_case(text.chars().collect(), matched_chars); - - let maybe_prev_word_tok: Option<&Token> = match ctx { - Some((prev, _)) if prev.len() >= 2 => { - let last = &prev[prev.len() - 1]; - let potential_word = &prev[prev.len() - 2]; - if last.kind.is_whitespace() && potential_word.kind.is_word() { - Some(potential_word) - } else { - None - } - } - _ => None, - }; - - let mut suggestions = vec![]; - - if verb_tok.kind.is_verb_simple_past_form() - && let Some(&lemma) = IRREGULAR_VERB_MAP.get(verb_str.as_str()) - && self - .dict - .get_word_metadata_str(lemma) - .is_some_and(|m| m.is_verb_lemma()) - { - suggestions.push(suggest(&format!("will {}", lemma))); - suggestions.push(suggest(&verb_str)); - } - if verb_tok.kind.is_verb_third_person_singular_present_form() { - let candidate = &verb_str[..verb_str.len() - 1]; - if self - .dict - .get_word_metadata_str(candidate) - .is_some_and(|m| m.is_verb_lemma()) - { - 
suggestions.push(suggest(&format!("will {}", candidate))); - suggestions.push(suggest(&verb_str)); - - // Add suggestion for plural nouns - if maybe_prev_word_tok.is_some_and(|tok| tok.kind.is_plural_nominal()) { - suggestions.push(suggest(candidate)); - } - } - } - if verb_tok.kind.is_verb_progressive_form() { - if let Some(stem) = verb_str.strip_suffix("ing") { - // Check regular form (e.g., 'walking' -> 'walk') - if self - .dict - .get_word_metadata_str(stem) - .is_some_and(|m| m.is_verb_lemma()) - { - suggestions.push(Suggestion::replace_with_match_case( - format!("will {}", stem).chars().collect(), - matched_chars, - )); - } - - // Check form that adds 'e' (e.g., 'coming' -> 'come') - let stem_with_e = format!("{}e", stem); - if self - .dict - .get_word_metadata_str(&stem_with_e) - .is_some_and(|m| m.is_verb_lemma()) - { - suggestions.push(Suggestion::replace_with_match_case( - format!("will {}", stem_with_e).chars().collect(), - matched_chars, - )); - } - } - - let v_ing = Suggestion::replace_with_match_case( - verb_tok.span.get_content(src).to_vec(), - toks.span()?.get_content(src), - ); - suggestions.push(v_ing); - let will_be_v_ing = Suggestion::replace_with_match_case( - format!("will be {}", verb_str) - .chars() - .collect::>(), - toks.span()?.get_content(src), - ); - suggestions.push(will_be_v_ing); - } - - Some(Lint { - span: toks.span()?, - lint_kind: LintKind::Grammar, - suggestions, - message: "`Will` and `shall` should be followed by a verb in its base form." 
- .to_string(), - ..Default::default() - }) - } - - fn description(&self) -> &str { - "Flags wrong verb forms after `will` or `shall`" - } -} - -#[cfg(test)] -mod tests { - use super::WillNonLemma; - use crate::linting::tests::{assert_good_and_bad_suggestions, assert_lint_count}; - use crate::spell::FstDictionary; - - #[test] - fn fix_will_ran() { - // singular + will + irregular preterite - assert_good_and_bad_suggestions( - "The brown fox will ran thru the meadow.", - WillNonLemma::new(FstDictionary::curated()), - &[ - "The brown fox will run thru the meadow.", - "The brown fox ran thru the meadow.", - ], - &[], - ); - } - - #[test] - fn fix_will_exists() { - // plural + will + 3rd person singular present - assert_good_and_bad_suggestions( - "there is a good chance duplicate Rule IDs will exists.", - WillNonLemma::new(FstDictionary::curated()), - &[ - "there is a good chance duplicate Rule IDs will exist.", - "there is a good chance duplicate Rule IDs exists.", - "there is a good chance duplicate Rule IDs exist.", - ], - &[], - ); - } - - #[test] - fn ignore_shall_vessels() { - // "nor" + shall + (3rd person singular present == plural noun) - assert_lint_count( - "No Preference shall be given by any Regulation of Commerce or Revenue to the Ports of one State over those of another; nor shall Vessels bound to, or from, one State, be obliged to enter, clear, or pay Duties in another.", - WillNonLemma::new(FstDictionary::curated()), - 0, - ); - } - - #[test] - fn ignore_will_tools() { - // "free will" + (3rd person singular present == plural noun) - assert_lint_count( - "Give your AI free will tools.", - WillNonLemma::new(FstDictionary::curated()), - 0, - ); - } - - #[test] - fn fix_will_coming_soon() { - // plural + will + progressive - assert_good_and_bad_suggestions( - "More advanced features will coming soon, so stay tuned!", - WillNonLemma::new(FstDictionary::curated()), - &[ - "More advanced features will come soon, so stay tuned!", - "More advanced features 
coming soon, so stay tuned!", - "More advanced features will be coming soon, so stay tuned!", - ], - &[], - ); - } - - #[test] - fn fix_will_coming_next() { - // singular + will + progressive - assert_good_and_bad_suggestions( - "on CPU and GPU (NPU support will coming next)", - WillNonLemma::new(FstDictionary::curated()), - &[ - "on CPU and GPU (NPU support will come next)", - "on CPU and GPU (NPU support coming next)", - "on CPU and GPU (NPU support will be coming next)", - ], - &[], - ); - } -} From 84546fdcfb174227de6535dc3ade9c65647edddb Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 2 Dec 2025 15:33:18 +0800 Subject: [PATCH 4/6] fix: `just format` --- harper-core/src/linting/irregular_verbs.json | 252 +++++++++---------- 1 file changed, 126 insertions(+), 126 deletions(-) diff --git a/harper-core/src/linting/irregular_verbs.json b/harper-core/src/linting/irregular_verbs.json index fce374e06..5d710a5c1 100644 --- a/harper-core/src/linting/irregular_verbs.json +++ b/harper-core/src/linting/irregular_verbs.json @@ -1,127 +1,127 @@ [ - "// comments can appear in the line before an entry", - "// or in place of an entry", - ["arise", "arose", "arisen"], - ["awake", "awoke", "awoken"], - "// be/am/are/is -- was/were -- been", - ["become", "became", "become"], - ["begin", "began", "begun"], - ["bend", "bent", "bent"], - ["bet", "bet", "bet"], - ["bid", "bade", "bidden"], - ["bind", "bound", "bound"], - ["bite", "bit", "bitten"], - ["bleed", "bled", "bled"], - ["blow", "blew", "blown"], - ["break", "broke", "broken"], - ["breed", "bred", "bred"], - ["bring", "brought", "brought"], - ["build", "built", "built"], - ["burst", "burst", "burst"], - ["buy", "bought", "bought"], - ["catch", "caught", "caught"], - ["choose", "chose", "chosen"], - ["come", "came", "come"], - ["cost", "cost", "cost"], - ["cut", "cut", "cut"], - ["dive", "dove", "dove"], - ["do", "did", "done"], - ["drink", "drank", "drunk"], - ["drive", "drove", "driven"], - ["eat", "ate", "eaten"], - 
["fall", "fell", "fallen"], - ["feed", "fed", "fed"], - ["feel", "felt", "felt"], - ["fight", "fought", "fought"], - ["find", "found", "found"], - ["fly", "flew", "flown"], - ["forget", "forgot", "forgotten"], - ["forgo", "forwent", "forgone"], - ["freeze", "froze", "frozen"], - "// get -- got -- gotten", - ["get", "got", "got"], - ["give", "gave", "given"], - ["go", "went", "gone"], - ["grow", "grew", "grown"], - ["have", "had", "had"], - ["hear", "heard", "heard"], - ["hit", "hit", "hit"], - ["hold", "held", "held"], - ["hurt", "hurt", "hurt"], - ["input", "input", "input"], - ["keep", "kept", "kept"], - ["know", "knew", "known"], - ["lay", "laid", "lain"], - ["lead", "led", "led"], - ["light", "lit", "lit"], - ["lose", "lost", "lost"], - ["make", "made", "made"], - ["mistake", "mistook", "mistaken"], - ["output", "output", "output"], - ["overtake", "overtook", "overtaken"], - ["overthrow", "overthrew", "overthrown"], - ["overwrite", "overwrote", "overwritten"], - ["partake", "partook", "partaken"], - ["pay", "paid", "paid"], - ["put", "put", "put"], - ["read", "read", "read"], - ["redo", "redid", "redone"], - ["remake", "remade", "remade"], - ["reread", "reread", "reread"], - ["reset", "reset", "reset"], - ["ride", "rode", "ridden"], - ["ring", "rang", "rung"], - ["rise", "rose", "risen"], - ["run", "ran", "run"], - ["see", "saw", "seen"], - ["sell", "sold", "sold"], - ["send", "sent", "sent"], - ["set", "set", "set"], - ["shake", "shook", "shaken"], - ["shed", "shed", "shed"], - ["shine", "shone", "shone"], - ["shoe", "shod", "shod"], - ["shoot", "shot", "shot"], - ["show", "showed", "shown"], - ["shrink", "shrank", "shrunk"], - ["shut", "shut", "shut"], - ["sing", "sang", "sung"], - "// sink -- sank -- sunken??", - ["sink", "sank", "sunk"], - ["sit", "sat", "sat"], - ["slay", "slew", "slain"], - ["sleep", "slept", "slept"], - ["slide", "slid", "slid"], - ["slit", "slit", "slit"], - "// sneak -- sneaked/snuck -- sneaked/snuck", - ["speak", "spoke", "spoken"], - 
["spin", "spun", "spun"], - ["spit", "spat", "spat"], - ["split", "split", "split"], - ["spread", "spread", "spread"], - ["spring", "sprang", "sprung"], - ["stand", "stood", "stood"], - ["steal", "stole", "stolen"], - ["stick", "stuck", "stuck"], - ["sting", "stung", "stung"], - ["stink", "stank", "stunk"], - ["stride", "strode", "stridden"], - ["strike", "struck", "stricken"], - ["string", "strung", "strung"], - ["sew", "sewed", "sewn"], - ["swear", "swore", "sworn"], - ["swim", "swam", "swum"], - ["swing", "swung", "swung"], - ["take", "took", "taken"], - ["teach", "taught", "taught"], - ["tear", "tore", "torn"], - ["think", "thought", "thought"], - ["throw", "threw", "thrown"], - ["tread", "trod", "trodden"], - ["undo", "undid", "undone"], - ["wake", "woke", "woken"], - ["wear", "wore", "worn"], - ["weave", "wove", "woven"], - ["wind", "wound", "wound"], - ["write", "wrote", "written"] -] \ No newline at end of file + "// comments can appear in the line before an entry", + "// or in place of an entry", + ["arise", "arose", "arisen"], + ["awake", "awoke", "awoken"], + "// be/am/are/is -- was/were -- been", + ["become", "became", "become"], + ["begin", "began", "begun"], + ["bend", "bent", "bent"], + ["bet", "bet", "bet"], + ["bid", "bade", "bidden"], + ["bind", "bound", "bound"], + ["bite", "bit", "bitten"], + ["bleed", "bled", "bled"], + ["blow", "blew", "blown"], + ["break", "broke", "broken"], + ["breed", "bred", "bred"], + ["bring", "brought", "brought"], + ["build", "built", "built"], + ["burst", "burst", "burst"], + ["buy", "bought", "bought"], + ["catch", "caught", "caught"], + ["choose", "chose", "chosen"], + ["come", "came", "come"], + ["cost", "cost", "cost"], + ["cut", "cut", "cut"], + ["dive", "dove", "dove"], + ["do", "did", "done"], + ["drink", "drank", "drunk"], + ["drive", "drove", "driven"], + ["eat", "ate", "eaten"], + ["fall", "fell", "fallen"], + ["feed", "fed", "fed"], + ["feel", "felt", "felt"], + ["fight", "fought", "fought"], + ["find", 
"found", "found"], + ["fly", "flew", "flown"], + ["forget", "forgot", "forgotten"], + ["forgo", "forwent", "forgone"], + ["freeze", "froze", "frozen"], + "// get -- got -- gotten", + ["get", "got", "got"], + ["give", "gave", "given"], + ["go", "went", "gone"], + ["grow", "grew", "grown"], + ["have", "had", "had"], + ["hear", "heard", "heard"], + ["hit", "hit", "hit"], + ["hold", "held", "held"], + ["hurt", "hurt", "hurt"], + ["input", "input", "input"], + ["keep", "kept", "kept"], + ["know", "knew", "known"], + ["lay", "laid", "lain"], + ["lead", "led", "led"], + ["light", "lit", "lit"], + ["lose", "lost", "lost"], + ["make", "made", "made"], + ["mistake", "mistook", "mistaken"], + ["output", "output", "output"], + ["overtake", "overtook", "overtaken"], + ["overthrow", "overthrew", "overthrown"], + ["overwrite", "overwrote", "overwritten"], + ["partake", "partook", "partaken"], + ["pay", "paid", "paid"], + ["put", "put", "put"], + ["read", "read", "read"], + ["redo", "redid", "redone"], + ["remake", "remade", "remade"], + ["reread", "reread", "reread"], + ["reset", "reset", "reset"], + ["ride", "rode", "ridden"], + ["ring", "rang", "rung"], + ["rise", "rose", "risen"], + ["run", "ran", "run"], + ["see", "saw", "seen"], + ["sell", "sold", "sold"], + ["send", "sent", "sent"], + ["set", "set", "set"], + ["shake", "shook", "shaken"], + ["shed", "shed", "shed"], + ["shine", "shone", "shone"], + ["shoe", "shod", "shod"], + ["shoot", "shot", "shot"], + ["show", "showed", "shown"], + ["shrink", "shrank", "shrunk"], + ["shut", "shut", "shut"], + ["sing", "sang", "sung"], + "// sink -- sank -- sunken??", + ["sink", "sank", "sunk"], + ["sit", "sat", "sat"], + ["slay", "slew", "slain"], + ["sleep", "slept", "slept"], + ["slide", "slid", "slid"], + ["slit", "slit", "slit"], + "// sneak -- sneaked/snuck -- sneaked/snuck", + ["speak", "spoke", "spoken"], + ["spin", "spun", "spun"], + ["spit", "spat", "spat"], + ["split", "split", "split"], + ["spread", "spread", "spread"], + 
["spring", "sprang", "sprung"], + ["stand", "stood", "stood"], + ["steal", "stole", "stolen"], + ["stick", "stuck", "stuck"], + ["sting", "stung", "stung"], + ["stink", "stank", "stunk"], + ["stride", "strode", "stridden"], + ["strike", "struck", "stricken"], + ["string", "strung", "strung"], + ["sew", "sewed", "sewn"], + ["swear", "swore", "sworn"], + ["swim", "swam", "swum"], + ["swing", "swung", "swung"], + ["take", "took", "taken"], + ["teach", "taught", "taught"], + ["tear", "tore", "torn"], + ["think", "thought", "thought"], + ["throw", "threw", "thrown"], + ["tread", "trod", "trodden"], + ["undo", "undid", "undone"], + ["wake", "woke", "woken"], + ["wear", "wore", "worn"], + ["weave", "wove", "woven"], + ["wind", "wound", "wound"], + ["write", "wrote", "written"] +] From 360bfc747ae34827882f4bc601277613ba3a7d9e Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 3 Dec 2025 13:16:57 +0800 Subject: [PATCH 5/6] refactor: in response to PR feedback --- harper-core/irregular_nouns.json | 162 ++++++++++++++++++ .../{src/linting => }/irregular_verbs.json | 0 harper-core/src/irregular_nouns.rs | 115 +++++++++++++ .../src/{linting => }/irregular_verbs.rs | 2 +- harper-core/src/lib.rs | 2 + harper-core/src/linting/mod.rs | 2 - .../linting/simple_past_to_past_participle.rs | 3 +- 7 files changed, 282 insertions(+), 4 deletions(-) create mode 100644 harper-core/irregular_nouns.json rename harper-core/{src/linting => }/irregular_verbs.json (100%) create mode 100644 harper-core/src/irregular_nouns.rs rename harper-core/src/{linting => }/irregular_verbs.rs (97%) diff --git a/harper-core/irregular_nouns.json b/harper-core/irregular_nouns.json new file mode 100644 index 000000000..edc61312f --- /dev/null +++ b/harper-core/irregular_nouns.json @@ -0,0 +1,162 @@ +[ + "// comments can appear in the line before an entry", + "// or in place of an entry", + ["child", "children"], + ["foot", "feet"], + ["goose", "geese"], + ["man", "men"], + ["mouse", "mice"], + ["ox", "oxen"], + 
["person", "people"], + ["seraph", "seraphim"], + ["woman", "women"], + ["addendum", "addenda"], + ["aircraft", "aircraft"], + ["aircraftman", "aircraftmen"], + ["aircraftwoman", "aircraftwomen"], + ["airman", "airmen"], + ["alderman", "aldermen"], + ["alga", "algae"], + ["alveolus", "alveoli"], + ["anchorman", "anchormen"], + ["anchorwoman", "anchorwomen"], + ["atrium", "atria"], + ["axis", "axes"], + ["bacillus", "bacilli"], + ["bacterium", "bacteria"], + ["bandsman", "bandsmen"], + ["bargeman", "bargemen"], + ["bellman", "bellmen"], + ["biceps", "biceps"], + ["boatman", "boatmen"], + ["bronchus", "bronchi"], + ["businesswoman", "businesswomen"], + ["cactus", "cacti"], + ["cameraperson", "camerapeople"], + ["candelabrum", "candelabra"], + ["catharsis", "catharses"], + ["chairman", "chairmen"], + ["chairwoman", "chairwomen"], + ["churchwoman", "churchwomen"], + ["clansman", "clansmen"], + ["clanswoman", "clanswomen"], + ["committeeman", "committeemen"], + ["committeewoman", "committeewomen"], + ["continuum", "continua"], + ["corpus", "corpora"], + ["craftsman", "craftsmen"], + ["craftswoman", "craftswomen"], + ["crisis", "crises"], + ["cyclops", "cyclopes"], + ["datum", "data"], + ["diaeresis", "diaereses"], + ["diagnosis", "diagnoses"], + ["dominatrix", "dominatrices"], + ["draughtsman", "draughtsmen"], + ["draughtswoman", "draughtswomen"], + ["effluvium", "effluvia"], + ["emphasis", "emphases"], + ["esophagus", "esophagi"], + ["extremum", "extrema"], + ["fish", "fish"], + ["footman", "footmen"], + ["formula", "formulae"], + ["forum", "fora"], + ["freeman", "freemen"], + ["frontiersman", "frontiersmen"], + ["frontierswoman", "frontierswomen"], + ["garbageman", "garbagemen"], + ["genesis", "geneses"], + ["genie", "genii"], + ["genius", "genii"], + ["genus", "genera"], + ["glissando", "glissandi"], + ["graffito", "graffiti"], + ["grandchild", "grandchildren"], + ["handyman", "handymen"], + ["hitman", "hitmen"], + ["houseman", "housemen"], + ["iceman", "icemen"], + 
["ilium", "ilia"], + ["index", "indices"], + ["intermezzo", "intermezzi"], + ["journeyman", "journeymen"], + ["labium", "labia"], + ["lamina", "laminae"], + ["laundrywoman", "laundrywomen"], + ["laywoman", "laywomen"], + ["linesman", "linesmen"], + ["lira", "lire"], + ["longshoreman", "longshoremen"], + ["louse", "lice"], + ["madman", "madmen"], + ["mailman", "mailmen"], + ["memorandum", "memoranda"], + ["metathesis", "metatheses"], + ["minimum", "minima"], + ["mitosis", "mitoses"], + ["motorman", "motormen"], + ["muscleman", "musclemen"], + ["nemesis", "nemeses"], + ["nightwatchman", "nightwatchmen"], + ["oarsman", "oarsmen"], + ["oarswoman", "oarswomen"], + ["oasis", "oases"], + ["ombudsman", "ombudsmen"], + ["optimum", "optima"], + ["palazzo", "palazzi"], + ["papyrus", "papyri"], + ["parenthesis", "parentheses"], + ["patina", "patinae"], + ["patrolman", "patrolmen"], + ["pericardium", "pericardia"], + ["periphrasis", "periphrases"], + ["pharynx", "pharynges"], + ["phenomenon", "phenomena"], + ["plainclothesman", "plainclothesmen"], + ["pneumococcus", "pneumococci"], + ["pressman", "pressmen"], + ["prosthesis", "prostheses"], + ["quantum", "quanta"], + ["radius", "radii"], + ["radix", "radices"], + ["repairman", "repairmen"], + ["salesman", "salesmen"], + ["saleswoman", "saleswomen"], + ["sandman", "sandmen"], + ["schema", "schemata"], + ["sheep", "sheep"], + ["shoreman", "shoremen"], + ["signore", "signori"], + ["simulacrum", "simulacra"], + ["solarium", "solaria"], + ["spokesman", "spokesmen"], + ["spokesperson", "spokespeople"], + ["spokeswoman", "spokeswomen"], + ["statesman", "statesmen"], + ["stateswoman", "stateswomen"], + ["steersman", "steersmen"], + ["stratum", "strata"], + ["streptococcus", "streptococci"], + ["succubus", "succubi"], + ["symbiosis", "symbioses"], + ["tarsus", "tarsi"], + ["taxon", "taxa"], + ["testatrix", "testatrices"], + ["testis", "testes"], + ["thesis", "theses"], + ["thrombosis", "thromboses"], + ["tooth", "teeth"], + ["townsman",
"townsmen"], + ["townswoman", "townswomen"], + ["tradesman", "tradesmen"], + ["tradeswoman", "tradeswomen"], + ["uterus", "uteri"], + ["vertebra", "vertebrae"], + ["vertex", "vertices"], + ["vivarium", "vivaria"], + ["washerwoman", "washerwomen"], + ["woodlouse", "woodlice"], + ["workingwoman", "workingwomen"], + ["workman", "workmen"] +] diff --git a/harper-core/src/linting/irregular_verbs.json b/harper-core/irregular_verbs.json similarity index 100% rename from harper-core/src/linting/irregular_verbs.json rename to harper-core/irregular_verbs.json diff --git a/harper-core/src/irregular_nouns.rs b/harper-core/src/irregular_nouns.rs new file mode 100644 index 000000000..eb4694808 --- /dev/null +++ b/harper-core/src/irregular_nouns.rs @@ -0,0 +1,115 @@ +use lazy_static::lazy_static; +use serde::Deserialize; +use std::sync::Arc; + +type Noun = (String, String); + +#[derive(Debug, Deserialize)] +pub struct IrregularNouns { + nouns: Vec<Noun>, +} + +/// The uncached function that is used to produce the original copy of the +/// irregular noun table. +fn uncached_inner_new() -> Arc<IrregularNouns> { + IrregularNouns::from_json_file(include_str!("../irregular_nouns.json")) + .map(Arc::new) + .unwrap_or_else(|e| panic!("Failed to load irregular noun table: {}", e)) +} + +lazy_static!
{ + static ref NOUNS: Arc<IrregularNouns> = uncached_inner_new(); +} + +impl IrregularNouns { + pub fn new() -> Self { + Self { nouns: vec![] } + } + + pub fn from_json_file(json: &str) -> Result<Self, serde_json::Error> { + // Deserialize into Vec<serde_json::Value> to handle mixed types + let values: Vec<serde_json::Value> = + serde_json::from_str(json).expect("Failed to parse irregular nouns JSON"); + + let mut nouns = Vec::new(); + + for value in values { + match value { + serde_json::Value::Array(arr) if arr.len() == 2 => { + // Handle array of 2 strings + if let (Some(singular), Some(plural)) = (arr[0].as_str(), arr[1].as_str()) { + nouns.push((singular.to_string(), plural.to_string())); + } + } + // Strings are used for comments to guide contributors editing the file + serde_json::Value::String(_) => {} + _ => {} + } + } + + Ok(Self { nouns }) + } + + pub fn get() -> Arc<IrregularNouns> { + (*NOUNS).clone() + } + + pub fn get_plural_for_singular(&self, singular: &str) -> Option<&str> { + self.nouns + .iter() + .find(|(sg, _)| sg.eq_ignore_ascii_case(singular)) + .map(|(_, pl)| pl.as_str()) + } + + pub fn get_singular_for_plural(&self, plural: &str) -> Option<&str> { + self.nouns + .iter() + .find(|(_, pl)| pl.eq_ignore_ascii_case(plural)) + .map(|(sg, _)| sg.as_str()) + } +} + +impl Default for IrregularNouns { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_find_irregular_plural_for_singular_lowercase() { + assert_eq!( + IrregularNouns::get().get_plural_for_singular("man"), + Some("men") + ); + } + + #[test] + fn can_find_irregular_plural_for_singular_uppercase() { + assert_eq!( + IrregularNouns::get().get_plural_for_singular("WOMAN"), + Some("women") + ); + } + + #[test] + fn can_find_singular_for_irregular_plural() { + assert_eq!( + IrregularNouns::get().get_singular_for_plural("children"), + Some("child") + ); + } + + #[test] + fn cant_find_regular_plural() { + assert_eq!(IrregularNouns::get().get_plural_for_singular("car"), None); + } + + #[test] + fn cant_find_non_noun() { +
assert_eq!(IrregularNouns::get().get_plural_for_singular("the"), None); + } +} diff --git a/harper-core/src/linting/irregular_verbs.rs b/harper-core/src/irregular_verbs.rs similarity index 97% rename from harper-core/src/linting/irregular_verbs.rs rename to harper-core/src/irregular_verbs.rs index a1860168a..7c1eba567 100644 --- a/harper-core/src/linting/irregular_verbs.rs +++ b/harper-core/src/irregular_verbs.rs @@ -12,7 +12,7 @@ pub struct IrregularVerbs { /// The uncached function that is used to produce the original copy of the /// irregular verb table. fn uncached_inner_new() -> Arc<IrregularVerbs> { - IrregularVerbs::from_json_file(include_str!("irregular_verbs.json")) + IrregularVerbs::from_json_file(include_str!("../irregular_verbs.json")) .map(Arc::new) .unwrap_or_else(|e| panic!("Failed to load irregular verb table: {}", e)) } diff --git a/harper-core/src/lib.rs b/harper-core/src/lib.rs index b08bbb78a..11ecaf948 100644 --- a/harper-core/src/lib.rs +++ b/harper-core/src/lib.rs @@ -11,6 +11,8 @@ mod edit_distance; pub mod expr; mod fat_token; mod ignored_lints; +pub mod irregular_nouns; +pub mod irregular_verbs; pub mod language_detection; mod lexing; pub mod linting; diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index e5753fe86..52347c41b 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -78,7 +78,6 @@ mod inflected_verb_after_to; mod initialism_linter; mod initialisms; mod interested_in; -mod irregular_verbs; mod it_is; mod it_looks_like_that; mod it_would_be; @@ -202,7 +201,6 @@ mod would_never_have; pub use expr_linter::ExprLinter; pub use initialism_linter::InitialismLinter; -pub use irregular_verbs::IrregularVerbs; pub use lint::Lint; pub use lint_group::{LintGroup, LintGroupConfig}; pub use lint_kind::LintKind; diff --git a/harper-core/src/linting/simple_past_to_past_participle.rs b/harper-core/src/linting/simple_past_to_past_participle.rs index ce4098848..e9753937b 100644 ---
a/harper-core/src/linting/simple_past_to_past_participle.rs +++ b/harper-core/src/linting/simple_past_to_past_participle.rs @@ -2,7 +2,8 @@ use crate::linting::expr_linter::Chunk; use crate::{ Token, expr::{All, Expr, FirstMatchOf, SequenceExpr}, - linting::{ExprLinter, IrregularVerbs, Lint, LintKind, Suggestion}, + irregular_verbs::IrregularVerbs, + linting::{ExprLinter, Lint, LintKind, Suggestion}, patterns::{InflectionOfBe, WordSet}, }; From 4445a2924d6827156158033e1e33e3303c52de79 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Thu, 4 Dec 2025 01:29:27 +0800 Subject: [PATCH 6/6] fix: implement @elijah's requested changes --- harper-core/src/irregular_nouns.rs | 18 ++++++++++++------ harper-core/src/irregular_verbs.rs | 12 ++++++------ harper-core/src/lib.rs | 6 ++++-- .../linting/simple_past_to_past_participle.rs | 2 +- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/harper-core/src/irregular_nouns.rs b/harper-core/src/irregular_nouns.rs index eb4694808..0396616e5 100644 --- a/harper-core/src/irregular_nouns.rs +++ b/harper-core/src/irregular_nouns.rs @@ -50,7 +50,7 @@ impl IrregularNouns { Ok(Self { nouns }) } - pub fn get() -> Arc<IrregularNouns> { + pub fn curated() -> Arc<IrregularNouns> { (*NOUNS).clone() } @@ -82,7 +82,7 @@ mod tests { #[test] fn can_find_irregular_plural_for_singular_lowercase() { assert_eq!( - IrregularNouns::get().get_plural_for_singular("man"), + IrregularNouns::curated().get_plural_for_singular("man"), Some("men") ); } @@ -90,7 +90,7 @@ mod tests { #[test] fn can_find_irregular_plural_for_singular_uppercase() { assert_eq!( - IrregularNouns::get().get_plural_for_singular("WOMAN"), + IrregularNouns::curated().get_plural_for_singular("WOMAN"), Some("women") ); } @@ -98,18 +98,24 @@ mod tests { #[test] fn can_find_singular_for_irregular_plural() { assert_eq!( - IrregularNouns::get().get_singular_for_plural("children"), + IrregularNouns::curated().get_singular_for_plural("children"), Some("child") ); } #[test] fn cant_find_regular_plural() { -
assert_eq!(IrregularNouns::get().get_plural_for_singular("car"), None); + assert_eq!( + IrregularNouns::curated().get_plural_for_singular("car"), + None + ); } #[test] fn cant_find_non_noun() { - assert_eq!(IrregularNouns::get().get_plural_for_singular("the"), None); + assert_eq!( + IrregularNouns::curated().get_plural_for_singular("the"), + None + ); } } diff --git a/harper-core/src/irregular_verbs.rs b/harper-core/src/irregular_verbs.rs index 7c1eba567..304ff0ec2 100644 --- a/harper-core/src/irregular_verbs.rs +++ b/harper-core/src/irregular_verbs.rs @@ -56,7 +56,7 @@ impl IrregularVerbs { Ok(Self { verbs }) } - pub fn get() -> Arc<IrregularVerbs> { + pub fn curated() -> Arc<IrregularVerbs> { (*VERBS).clone() } @@ -81,7 +81,7 @@ mod tests { #[test] fn can_find_irregular_past_participle_for_preterite_lowercase() { assert_eq!( - IrregularVerbs::get().get_past_participle_for_preterite("arose"), + IrregularVerbs::curated().get_past_participle_for_preterite("arose"), Some("arisen") ); } @@ -89,7 +89,7 @@ mod tests { #[test] fn can_find_irregular_past_participle_for_preterite_uppercase() { assert_eq!( - IrregularVerbs::get().get_past_participle_for_preterite("WENT"), + IrregularVerbs::curated().get_past_participle_for_preterite("WENT"), Some("gone") ); } @@ -97,7 +97,7 @@ mod tests { #[test] fn can_find_irregular_past_participle_same_as_past_tense() { assert_eq!( - IrregularVerbs::get().get_past_participle_for_preterite("taught"), + IrregularVerbs::curated().get_past_participle_for_preterite("taught"), Some("taught") ); } @@ -105,7 +105,7 @@ mod tests { #[test] fn cant_find_regular_past_participle() { assert_eq!( - IrregularVerbs::get().get_past_participle_for_preterite("walked"), + IrregularVerbs::curated().get_past_participle_for_preterite("walked"), None ); } @@ -113,7 +113,7 @@ mod tests { #[test] fn cant_find_non_verb() { assert_eq!( - IrregularVerbs::get().get_past_participle_for_preterite("the"), + IrregularVerbs::curated().get_past_participle_for_preterite("the"), None ); } diff --git
a/harper-core/src/lib.rs b/harper-core/src/lib.rs index 11ecaf948..33ea32b2a 100644 --- a/harper-core/src/lib.rs +++ b/harper-core/src/lib.rs @@ -11,8 +11,8 @@ mod edit_distance; pub mod expr; mod fat_token; mod ignored_lints; -pub mod irregular_nouns; -pub mod irregular_verbs; +mod irregular_nouns; +mod irregular_verbs; pub mod language_detection; mod lexing; pub mod linting; @@ -44,6 +44,8 @@ pub use dict_word_metadata_orthography::{OrthFlags, Orthography}; pub use document::Document; pub use fat_token::{FatStringToken, FatToken}; pub use ignored_lints::{IgnoredLints, LintContext}; +pub use irregular_nouns::IrregularNouns; +pub use irregular_verbs::IrregularVerbs; use linting::Lint; pub use mask::{Mask, Masker}; pub use number::{Number, OrdinalSuffix}; diff --git a/harper-core/src/linting/simple_past_to_past_participle.rs b/harper-core/src/linting/simple_past_to_past_participle.rs index e9753937b..bf5ba7dd7 100644 --- a/harper-core/src/linting/simple_past_to_past_participle.rs +++ b/harper-core/src/linting/simple_past_to_past_participle.rs @@ -73,7 +73,7 @@ impl ExprLinter for SimplePastToPastParticiple { let simple_past = verb_tok.span.get_content_string(src); - if let Some(past_participle) = IrregularVerbs::get() + if let Some(past_participle) = IrregularVerbs::curated() .get_past_participle_for_preterite(&simple_past) .filter(|pp| pp != &simple_past) {