diff --git a/harper-core/irregular_nouns.json b/harper-core/irregular_nouns.json new file mode 100644 index 000000000..edc61312f --- /dev/null +++ b/harper-core/irregular_nouns.json @@ -0,0 +1,162 @@ +[ + "// comments can appear in the line before an entry", + "// or in place of an entry", + ["child", "children"], + ["foot", "feet"], + ["goose", "geese"], + ["man", "men"], + ["mouse", "mice"], + ["ox", "oxen"], + ["person", "people"], + ["seraph", "seraphim"], + ["woman", "women"], + ["addendum", "addenda"], + ["aircraft", "aircraft"], + ["aircraftman", "aircraftmen"], + ["aircraftwoman", "aircraftwomen"], + ["airman", "airmen"], + ["alderman", "aldermen"], + ["alga", "algae"], + ["alveolus", "alveoli"], + ["anchorman", "anchormen"], + ["anchorwoman", "anchorwomen"], + ["atrium", "atria"], + ["axis", "axes"], + ["bacillus", "bacilli"], + ["bacterium", "bacteria"], + ["bandsman", "bandsmen"], + ["bargeman", "bargemen"], + ["bellman", "bellmen"], + ["biceps", "biceps"], + ["boatman", "boatmen"], + ["bronchus", "bronchi"], + ["businesswoman", "businesswomen"], + ["cactus", "cacti"], + ["cameraperson", "camerapeople"], + ["candelabrum", "candelabra"], + ["catharsis", "catharses"], + ["chairman", "chairmen"], + ["chairwoman", "chairwomen"], + ["churchwoman", "churchwomen"], + ["clansman", "clansmen"], + ["clanswoman", "clanswomen"], + ["committeeman", "committeemen"], + ["committeewoman", "committeewomen"], + ["continuum", "continua"], + ["corpus", "corpora"], + ["craftsman", "craftsmen"], + ["craftswoman", "craftswomen"], + ["crisis", "crises"], + ["cyclops", "cyclopes"], + ["datum", "data"], + ["diaeresis", "diaereses"], + ["diagnosis", "diagnoses"], + ["dominatrix", "dominatrices"], + ["draughtsman", "draughtsmen"], + ["draughtswoman", "draughtswomen"], + ["effluvium", "effluvia"], + ["emphasis", "emphases"], + ["esophagus", "esophagi"], + ["extremum", "extrema"], + ["fish", "fish"], + ["footman", "footmen"], + ["formula", "formulae"], + ["forum", "fora"], + 
["freeman", "freemen"], + ["frontiersman", "frontiersmen"], + ["frontierswoman", "frontierswomen"], + ["garbageman", "garbagemen"], + ["genesis", "geneses"], + ["genie", "genii"], + ["genius", "genii"], + ["genus", "genera"], + ["glissando", "glissandi"], + ["graffito", "graffiti"], + ["grandchild", "grandchildren"], + ["handyman", "handymen"], + ["hitman", "hitmen"], + ["houseman", "housemen"], + ["iceman", "icemen"], + ["ilium", "ilia"], + ["index", "indices"], + ["intermezzo", "intermezzi"], + ["journeyman", "journeymen"], + ["labium", "labia"], + ["lamina", "laminae"], + ["laundrywoman", "laundrywomen"], + ["laywoman", "laywomen"], + ["linesman", "linesmen"], + ["lira", " lire"], + ["longshoreman", "longshoremen"], + ["louse", "lice"], + ["madman", "madmen"], + ["mailman", "mailmen"], + ["memorandum", "memoranda"], + ["metathesis", "metatheses"], + ["minimum", "minima"], + ["mitosis", "mitoses"], + ["motorman", "motormen"], + ["muscleman", "musclemen"], + ["nemesis", "nemeses"], + ["nightwatchman", "nightwatchmen"], + ["oarsman", "oarsmen"], + ["oarswoman", "oarswomen"], + ["oasis", "oases"], + ["ombudsman", "ombudsmen"], + ["optimum", "optima"], + ["palazzo", "palazzi"], + ["papyrus", "papyri"], + ["parenthesis", "parentheses"], + ["patina", "patinae"], + ["patrolman", "patrolmen"], + ["pericardium", "pericardia"], + ["periphrasis", "periphrases"], + ["pharynx", "pharynges"], + ["phenomenon", "phenomena"], + ["plainclothesman", "plainclothesmen"], + ["pneumococcus", "pneumococci"], + ["pressman", "pressmen"], + ["prosthesis", "protheses"], + ["quantum", "quanta"], + ["radius", "radii"], + ["radix", "radices"], + ["repairman", "repairmen"], + ["salesman", "salesmen"], + ["saleswoman", "saleswomen"], + ["sandman", "sandmen"], + ["schema", "schemata"], + ["sheep", "sheep"], + ["shoreman", "shoremen"], + ["signore", "signori"], + ["simulacrum", "simulacra"], + ["solarium", "solaria"], + ["spokesman", "spokesmen"], + ["spokesperson", "spokespeople"], + 
["spokeswoman", "spokeswomen"], + ["statesman", "statesmen"], + ["stateswoman", "stateswomen"], + ["steersman", "steersmen"], + ["stratum", "strata"], + ["streptococcus", "streptococci"], + ["succubus", "succubi"], + ["symbiosis", "symbioses"], + ["tarsus", "tarsi"], + ["taxon", "taxa"], + ["testatrix", "testatrices"], + ["testis", "testes"], + ["thesis", "theses"], + ["thrombosis", "thromboses"], + ["tooth", "teeth"], + ["townsman", "townsmen"], + ["townswoman", "townswomen"], + ["tradesman", "tradesmen"], + ["tradeswoman", "tradeswomen"], + ["uterus", "uteri"], + ["vertebra", "vertebrae"], + ["vertex", "vertices"], + ["vivarium", "vivaria"], + ["washerwoman", "washerwomen"], + ["woodlouse", "woodlice"], + ["workingwoman", "workingwomen"], + ["workman", "workmen"] +] diff --git a/harper-core/irregular_verbs.json b/harper-core/irregular_verbs.json new file mode 100644 index 000000000..5d710a5c1 --- /dev/null +++ b/harper-core/irregular_verbs.json @@ -0,0 +1,127 @@ +[ + "// comments can appear in the line before an entry", + "// or in place of an entry", + ["arise", "arose", "arisen"], + ["awake", "awoke", "awoken"], + "// be/am/are/is -- was/were -- been", + ["become", "became", "become"], + ["begin", "began", "begun"], + ["bend", "bent", "bent"], + ["bet", "bet", "bet"], + ["bid", "bade", "bidden"], + ["bind", "bound", "bound"], + ["bite", "bit", "bitten"], + ["bleed", "bled", "bled"], + ["blow", "blew", "blown"], + ["break", "broke", "broken"], + ["breed", "bred", "bred"], + ["bring", "brought", "brought"], + ["build", "built", "built"], + ["burst", "burst", "burst"], + ["buy", "bought", "bought"], + ["catch", "caught", "caught"], + ["choose", "chose", "chosen"], + ["come", "came", "come"], + ["cost", "cost", "cost"], + ["cut", "cut", "cut"], + ["dive", "dove", "dove"], + ["do", "did", "done"], + ["drink", "drank", "drunk"], + ["drive", "drove", "driven"], + ["eat", "ate", "eaten"], + ["fall", "fell", "fallen"], + ["feed", "fed", "fed"], + ["feel", "felt", 
"felt"], + ["fight", "fought", "fought"], + ["find", "found", "found"], + ["fly", "flew", "flown"], + ["forget", "forgot", "forgotten"], + ["forgo", "forwent", "forgone"], + ["freeze", "froze", "frozen"], + "// get -- got -- gotten", + ["get", "got", "got"], + ["give", "gave", "given"], + ["go", "went", "gone"], + ["grow", "grew", "grown"], + ["have", "had", "had"], + ["hear", "heard", "heard"], + ["hit", "hit", "hit"], + ["hold", "held", "held"], + ["hurt", "hurt", "hurt"], + ["input", "input", "input"], + ["keep", "kept", "kept"], + ["know", "knew", "known"], + ["lay", "laid", "lain"], + ["lead", "led", "led"], + ["light", "lit", "lit"], + ["lose", "lost", "lost"], + ["make", "made", "made"], + ["mistake", "mistook", "mistaken"], + ["output", "output", "output"], + ["overtake", "overtook", "overtaken"], + ["overthrow", "overthrew", "overthrown"], + ["overwrite", "overwrote", "overwritten"], + ["partake", "partook", "partaken"], + ["pay", "paid", "paid"], + ["put", "put", "put"], + ["read", "read", "read"], + ["redo", "redid", "redone"], + ["remake", "remade", "remade"], + ["reread", "reread", "reread"], + ["reset", "reset", "reset"], + ["ride", "rode", "ridden"], + ["ring", "rang", "rung"], + ["rise", "rose", "risen"], + ["run", "ran", "run"], + ["see", "saw", "seen"], + ["sell", "sold", "sold"], + ["send", "sent", "sent"], + ["set", "set", "set"], + ["shake", "shook", "shaken"], + ["shed", "shed", "shed"], + ["shine", "shone", "shone"], + ["shoe", "shod", "shod"], + ["shoot", "shot", "shot"], + ["show", "showed", "shown"], + ["shrink", "shrank", "shrunk"], + ["shut", "shut", "shut"], + ["sing", "sang", "sung"], + "// sink -- sank -- sunken??", + ["sink", "sank", "sunk"], + ["sit", "sat", "sat"], + ["slay", "slew", "slain"], + ["sleep", "slept", "slept"], + ["slide", "slid", "slid"], + ["slit", "slit", "slit"], + "// sneak -- sneaked/snuck -- sneaked/snuck", + ["speak", "spoke", "spoken"], + ["spin", "spun", "spun"], + ["spit", "spat", "spat"], + ["split", 
"split", "split"], + ["spread", "spread", "spread"], + ["spring", "sprang", "sprung"], + ["stand", "stood", "stood"], + ["steal", "stole", "stolen"], + ["stick", "stuck", "stuck"], + ["sting", "stung", "stung"], + ["stink", "stank", "stunk"], + ["stride", "strode", "stridden"], + ["strike", "struck", "stricken"], + ["string", "strung", "strung"], + ["sew", "sewed", "sewn"], + ["swear", "swore", "sworn"], + ["swim", "swam", "swum"], + ["swing", "swung", "swung"], + ["take", "took", "taken"], + ["teach", "taught", "taught"], + ["tear", "tore", "torn"], + ["think", "thought", "thought"], + ["throw", "threw", "thrown"], + ["tread", "trod", "trodden"], + ["undo", "undid", "undone"], + ["wake", "woke", "woken"], + ["wear", "wore", "worn"], + ["weave", "wove", "woven"], + ["wind", "wound", "wound"], + ["write", "wrote", "written"] +] diff --git a/harper-core/src/irregular_nouns.rs b/harper-core/src/irregular_nouns.rs new file mode 100644 index 000000000..0396616e5 --- /dev/null +++ b/harper-core/src/irregular_nouns.rs @@ -0,0 +1,121 @@ +use lazy_static::lazy_static; +use serde::Deserialize; +use std::sync::Arc; + +type Noun = (String, String); + +#[derive(Debug, Deserialize)] +pub struct IrregularNouns { + nouns: Vec, +} + +/// The uncached function that is used to produce the original copy of the +/// irregular noun table. +fn uncached_inner_new() -> Arc { + IrregularNouns::from_json_file(include_str!("../irregular_nouns.json")) + .map(Arc::new) + .unwrap_or_else(|e| panic!("Failed to load irregular noun table: {}", e)) +} + +lazy_static! 
{ + static ref NOUNS: Arc = uncached_inner_new(); +} + +impl IrregularNouns { + pub fn new() -> Self { + Self { nouns: vec![] } + } + + pub fn from_json_file(json: &str) -> Result { + // Deserialize into Vec to handle mixed types + let values: Vec = + serde_json::from_str(json).expect("Failed to parse irregular nouns JSON"); + + let mut nouns = Vec::new(); + + for value in values { + match value { + serde_json::Value::Array(arr) if arr.len() == 2 => { + // Handle array of 2 strings + if let (Some(singular), Some(plural)) = (arr[0].as_str(), arr[1].as_str()) { + nouns.push((singular.to_string(), plural.to_string())); + } + } + // Strings are used for comments to guide contributors editing the file + serde_json::Value::String(_) => {} + _ => {} + } + } + + Ok(Self { nouns }) + } + + pub fn curated() -> Arc { + (*NOUNS).clone() + } + + pub fn get_plural_for_singular(&self, singular: &str) -> Option<&str> { + self.nouns + .iter() + .find(|(sg, _)| sg.eq_ignore_ascii_case(singular)) + .map(|(_, pl)| pl.as_str()) + } + + pub fn get_singular_for_plural(&self, plural: &str) -> Option<&str> { + self.nouns + .iter() + .find(|(_, pl)| pl.eq_ignore_ascii_case(plural)) + .map(|(sg, _)| sg.as_str()) + } +} + +impl Default for IrregularNouns { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_find_irregular_plural_for_singular_lowercase() { + assert_eq!( + IrregularNouns::curated().get_plural_for_singular("man"), + Some("men") + ); + } + + #[test] + fn can_find_irregular_plural_for_singular_uppercase() { + assert_eq!( + IrregularNouns::curated().get_plural_for_singular("WOMAN"), + Some("women") + ); + } + + #[test] + fn can_find_singular_for_irregular_plural() { + assert_eq!( + IrregularNouns::curated().get_singular_for_plural("children"), + Some("child") + ); + } + + #[test] + fn cant_find_regular_plural() { + assert_eq!( + IrregularNouns::curated().get_plural_for_singular("car"), + None + ); + } + + #[test] + fn 
cant_find_non_noun() { + assert_eq!( + IrregularNouns::curated().get_plural_for_singular("the"), + None + ); + } +} diff --git a/harper-core/src/irregular_verbs.rs b/harper-core/src/irregular_verbs.rs new file mode 100644 index 000000000..304ff0ec2 --- /dev/null +++ b/harper-core/src/irregular_verbs.rs @@ -0,0 +1,120 @@ +use lazy_static::lazy_static; +use serde::Deserialize; +use std::sync::Arc; + +type Verb = (String, String, String); + +#[derive(Debug, Deserialize)] +pub struct IrregularVerbs { + verbs: Vec<Verb>, +} + +/// The uncached function that is used to produce the original copy of the +/// irregular verb table. +fn uncached_inner_new() -> Arc<IrregularVerbs> { + IrregularVerbs::from_json_file(include_str!("../irregular_verbs.json")) + .map(Arc::new) + .unwrap_or_else(|e| panic!("Failed to load irregular verb table: {}", e)) +} + +lazy_static! { + static ref VERBS: Arc<IrregularVerbs> = uncached_inner_new(); +} + +impl IrregularVerbs { + pub fn new() -> Self { + Self { verbs: vec![] } + } + + pub fn from_json_file(json: &str) -> Result<Self, serde_json::Error> { + // Deserialize into Vec<serde_json::Value> to handle mixed types; a parse failure is returned to the caller as Err + let values: Vec<serde_json::Value> = + serde_json::from_str(json)?; + + let mut verbs = Vec::new(); + + for value in values { + match value { + serde_json::Value::Array(arr) if arr.len() == 3 => { + // Handle array of 3 strings + if let (Some(lemma), Some(preterite), Some(past_participle)) = + (arr[0].as_str(), arr[1].as_str(), arr[2].as_str()) + { + verbs.push(( + lemma.to_string(), + preterite.to_string(), + past_participle.to_string(), + )); + } + } + // Strings are used for comments to guide contributors editing the file + serde_json::Value::String(_) => {} + _ => {} + } + } + + Ok(Self { verbs }) + } + + pub fn curated() -> Arc<IrregularVerbs> { + (*VERBS).clone() + } + + pub fn get_past_participle_for_preterite(&self, preterite: &str) -> Option<&str> { + self.verbs + .iter() + .find(|(_, pt, _)| pt.eq_ignore_ascii_case(preterite)) + .map(|(_, _, pp)| pp.as_str()) + } +} + +impl Default for 
IrregularVerbs { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_find_irregular_past_participle_for_preterite_lowercase() { + assert_eq!( + IrregularVerbs::curated().get_past_participle_for_preterite("arose"), + Some("arisen") + ); + } + + #[test] + fn can_find_irregular_past_participle_for_preterite_uppercase() { + assert_eq!( + IrregularVerbs::curated().get_past_participle_for_preterite("WENT"), + Some("gone") + ); + } + + #[test] + fn can_find_irregular_past_participle_same_as_past_tense() { + assert_eq!( + IrregularVerbs::curated().get_past_participle_for_preterite("taught"), + Some("taught") + ); + } + + #[test] + fn cant_find_regular_past_participle() { + assert_eq!( + IrregularVerbs::curated().get_past_participle_for_preterite("walked"), + None + ); + } + + #[test] + fn cant_find_non_verb() { + assert_eq!( + IrregularVerbs::curated().get_past_participle_for_preterite("the"), + None + ); + } +} diff --git a/harper-core/src/lib.rs b/harper-core/src/lib.rs index b08bbb78a..33ea32b2a 100644 --- a/harper-core/src/lib.rs +++ b/harper-core/src/lib.rs @@ -11,6 +11,8 @@ mod edit_distance; pub mod expr; mod fat_token; mod ignored_lints; +mod irregular_nouns; +mod irregular_verbs; pub mod language_detection; mod lexing; pub mod linting; @@ -42,6 +44,8 @@ pub use dict_word_metadata_orthography::{OrthFlags, Orthography}; pub use document::Document; pub use fat_token::{FatStringToken, FatToken}; pub use ignored_lints::{IgnoredLints, LintContext}; +pub use irregular_nouns::IrregularNouns; +pub use irregular_verbs::IrregularVerbs; use linting::Lint; pub use mask::{Mask, Masker}; pub use number::{Number, OrdinalSuffix}; diff --git a/harper-core/src/linting/simple_past_to_past_participle.rs b/harper-core/src/linting/simple_past_to_past_participle.rs index 7217f4c0c..bf5ba7dd7 100644 --- a/harper-core/src/linting/simple_past_to_past_participle.rs +++ b/harper-core/src/linting/simple_past_to_past_participle.rs @@ 
-1,82 +1,12 @@ use crate::linting::expr_linter::Chunk; use crate::{ Token, - char_string::CharStringExt, expr::{All, Expr, FirstMatchOf, SequenceExpr}, + irregular_verbs::IrregularVerbs, linting::{ExprLinter, Lint, LintKind, Suggestion}, patterns::{InflectionOfBe, WordSet}, }; -/// Maps common irregular verbs between their simple past and past participle forms. -const IRREGULAR_VERBS: &[(&str, &str)] = &[ - ("arose", "arisen"), - ("ate", "eaten"), - ("awoke", "awoken"), - ("bade", "bidden"), - ("became", "become"), - ("began", "begun"), - ("bit", "bitten"), - ("blew", "blown"), - ("bought", "bought"), - ("brang", "brung"), - ("broke", "broken"), - ("brought", "brought"), - ("came", "come"), - ("chose", "chosen"), - ("did", "done"), - ("drank", "drunk"), - ("drove", "driven"), - ("fell", "fallen"), - ("felt", "felt"), - ("flew", "flown"), - ("forgot", "forgotten"), - ("forwent", "forgone"), - ("gave", "given"), - ("grew", "grown"), - ("had", "had"), - ("heard", "heard"), - ("hit", "hit"), - ("input", "input"), - ("knew", "known"), - ("led", "led"), - ("mistook", "mistaken"), - ("output", "output"), - ("overtook", "overtaken"), - ("paid", "paid"), - ("partook", "partaken"), - // proved, proved/proven - ("put", "put"), - ("ran", "run"), - ("rang", "rung"), - ("read", "read"), - ("reset", "reset"), - ("rode", "ridden"), - ("rose", "risen"), - ("sang", "sung"), - ("sank", "sunken"), - ("saw", "seen"), - ("set", "set"), - ("sewed", "sewn"), - ("slew", "slain"), - ("slid", "slid"), - ("spoke", "spoken"), - ("sprang", "sprung"), - ("stank", "stunk"), - ("stole", "stolen"), - ("stood", "stood"), - ("swam", "swum"), - ("swore", "sworn"), - ("thought", "thought"), - ("trod", "trodden"), - ("took", "taken"), - // was, been - // were, been - ("went", "gone"), - ("woke", "woken"), - ("wove", "woven"), - ("wrote", "written"), -]; - /// Corrects simple past tense verbs to past participle after auxiliary verbs like "have" or "be". 
pub struct SimplePastToPastParticiple { expr: Box<dyn Expr>, @@ -141,41 +71,32 @@ impl ExprLinter for SimplePastToPastParticiple { let verb_tok = &toks[2]; - let verb_ch = verb_tok.span.get_content(src); - if !IRREGULAR_VERBS - .iter() - .any(|(t, p)| verb_ch.eq_ignore_ascii_case_str(t) && p != t) - { - return None; - } + let simple_past = verb_tok.span.get_content_string(src); - let (simple_past, past_participle) = IRREGULAR_VERBS - .iter() - .find(|(simple_past, _)| { - verb_tok - .span - .get_content(src) - .eq_ignore_ascii_case_str(simple_past) + if let Some(past_participle) = IrregularVerbs::curated() + .get_past_participle_for_preterite(&simple_past) + .filter(|pp| !pp.eq_ignore_ascii_case(&simple_past)) // compare ignoring case: "PUT" equals its participle "put" and must not be flagged + { + let suggestions = vec![Suggestion::replace_with_match_case( + past_participle.chars().collect(), + verb_tok.span.get_content(src), + )]; + + let message = format!( + "Use the past participle `{}` instead of `{}` when using compound tenses or passive voice.", + past_participle, simple_past + ); + + Some(Lint { + span: verb_tok.span, + lint_kind: LintKind::Grammar, + suggestions, + message, + ..Default::default() }) - .unwrap(); - - let suggestions = vec![Suggestion::replace_with_match_case( - past_participle.chars().collect(), - verb_tok.span.get_content(src), - )]; - - let message = format!( - "Use the past participle `{}` instead of `{}` when using compound tenses or passive voice.", - past_participle, simple_past - ); - - Some(Lint { - span: verb_tok.span, - lint_kind: LintKind::Grammar, - suggestions, - message, - ..Default::default() - }) + } else { + None + } } fn description(&self) -> &str {