Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
008c534
test(core): add failing test
86xsk Jan 20, 2026
a5f4a1b
test(comments): don't expect 'lin' to be marked as a spelling error
86xsk Jan 28, 2026
5face23
test(core): move tests
86xsk Jan 28, 2026
fe37ba6
test(core): don't expect `SpellCheck` to mark capitalization issues
86xsk Jan 29, 2026
6f02acd
deps(core): add `indexmap`
86xsk Jan 29, 2026
03b1f5b
feat(core)!: more explicit handling of case-sensitivity in dictionaries
86xsk Jan 29, 2026
ce666c5
chore: update snapshots
86xsk Jan 29, 2026
4443a4e
Partially revert "fix(core): PR getting flagged as 'misspelled' (#2476)"
86xsk Jan 29, 2026
e982f23
test(core): merge tests and add test
86xsk Jan 29, 2026
5688aee
Merge branch 'master' into fix-dict-casing2
86xsk Jan 29, 2026
0a1e2d4
test(core): move test
86xsk Jan 30, 2026
cf9a90a
fix(core): fix logic in `OrthographicConsistency`
86xsk Jan 30, 2026
007df6e
test(core): add failing test
86xsk Jan 30, 2026
7518350
fix(core): allow all case-variants in `OrthographicConsistency`
86xsk Jan 30, 2026
3c9d54e
test(core): remove Lego -> LEGO test in `OrthographicConsistency`
86xsk Jan 30, 2026
8b426d9
chore: update snapshots
86xsk Jan 30, 2026
3381fba
test(core): add test
86xsk Jan 30, 2026
e11a2d6
test(core): fix incorrect test expectation
86xsk Jan 30, 2026
b23f652
refactor(core): appease Clippy
86xsk Jan 30, 2026
3f068bb
feat(core): support multiple `derived_from`
86xsk Feb 1, 2026
18ba296
perf(core): reduce Vec cloning
86xsk Feb 2, 2026
aeba563
refactor(core): reuse code from similar function
86xsk Feb 2, 2026
f30cfff
Merge branch 'master' into fix-dict-casing2
86xsk Feb 3, 2026
32ce68c
refactor(core): remove dead code
86xsk Feb 3, 2026
5bd11c1
Merge branch 'master' into fix-dict-casing2
86xsk Feb 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 11 additions & 7 deletions harper-cli/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![doc = include_str!("../README.md")]

use harper_core::spell::{Dictionary, FstDictionary, MutableDictionary, WordId};
use harper_core::spell::{CanonicalWordId, Dictionary, FstDictionary, MutableDictionary};
use hashbrown::HashMap;
use std::collections::BTreeMap;
use std::fs::File;
Expand Down Expand Up @@ -363,7 +363,7 @@ fn main() -> anyhow::Result<()> {
];

for word in words {
let meta = curated_dictionary.get_word_metadata_str(&word);
let meta = curated_dictionary.get_word_metadata_str_exact(&word);
let (flags, emojis) = meta.as_ref().map_or_else(
|| (String::new(), String::new()),
|md| {
Expand Down Expand Up @@ -859,7 +859,7 @@ fn main() -> anyhow::Result<()> {
let mut processed_words = HashMap::new();
let mut longest_word = 0;
for word in curated_dictionary.words_iter() {
if let Some(metadata) = curated_dictionary.get_word_metadata(word) {
if let Some(metadata) = curated_dictionary.get_word_metadata_exact(word) {
let orth = metadata.orth_info;
let bits = orth.bits() & case_bitmask.bits();

Expand Down Expand Up @@ -961,11 +961,15 @@ fn line_to_parts(line: &str) -> (String, String) {
fn print_word_derivations(word: &str, annot: &str, dictionary: &impl Dictionary) {
println!("{word}/{annot}");

let id = WordId::from_word_str(word);
let id = CanonicalWordId::from_word_str(word);

let children = dictionary
.words_iter()
.filter(|e| dictionary.get_word_metadata(e).unwrap().derived_from == Some(id));
let children = dictionary.words_iter().filter(|e| {
dictionary
.get_word_metadata_exact(e)
.unwrap()
.derived_from
.contains(id)
});

println!(" - {word}");

Expand Down
2 changes: 1 addition & 1 deletion harper-comments/tests/language_support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ create_test!(jsdoc.ts, 4);
create_test!(issue_96.lua, 0);
create_test!(merged_lines.ts, 1);
create_test!(javadoc_clean_simple.java, 0);
create_test!(javadoc_complex.java, 5);
create_test!(javadoc_complex.java, 4);
create_test!(issue_132.rs, 1);
create_test!(laravel_app.php, 2);
create_test!(ignore_shebang_1.sh, 0);
Expand Down
1 change: 1 addition & 0 deletions harper-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ harper-brill = { path = "../harper-brill", version = "1.0.0" }
harper-thesaurus = { path = "../harper-thesaurus", version = "1.4.1", optional = true }
bitflags = { version = "2.10.0", features = ["serde"] }
trie-rs = "0.4.2"
indexmap = "2.12.1"
zip = { version = "2.2.0", default-features = false, features = ["deflate"] }

[dev-dependencies]
Expand Down
1 change: 1 addition & 0 deletions harper-core/dictionary.dict
Original file line number Diff line number Diff line change
Expand Up @@ -8252,6 +8252,7 @@ PowerPoint/ONgV
Powers/NOg
Powhatan/NOg
Poznan/Og
Pr/ # Praseodymium
Prada/g
Prado/Og
Praetorian/Ng
Expand Down
86 changes: 44 additions & 42 deletions harper-core/src/dict_word_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ use strum_macros::{Display, EnumCount, EnumIter, EnumString, VariantArray};
use std::convert::TryFrom;

use crate::dict_word_metadata_orthography::OrthFlags;
use crate::spell::WordId;
use crate::{Document, TokenKind, TokenStringExt};

pub mod derived_from;
use derived_from::DerivedFrom;

/// This represents a "lexeme" or "headword" which is case-folded but affix-expanded.
/// So not only lemmata but also inflected forms are stored here, with "horn" and "horns" each
/// having their own lexeme, but "Ivy" and "ivy" sharing the same lexeme.
Expand Down Expand Up @@ -44,8 +46,8 @@ pub struct DictWordMetadata {
/// Whether the word is considered especially common.
#[serde(default = "default_false")]
pub common: bool,
#[serde(default = "default_none")]
pub derived_from: Option<WordId>,
#[serde(default = "DerivedFrom::default")]
pub derived_from: DerivedFrom,
/// Generated by a chunker. Declares whether the word is a member of a nominal phrase. Using
/// this should be preferred over the similarly named `Pattern`.
///
Expand All @@ -60,11 +62,6 @@ fn default_false() -> bool {
false
}

/// Needed for `serde`
fn default_none<T>() -> Option<T> {
None
}

/// Needed for `serde`
fn default_default<T: Default>() -> T {
T::default()
Expand Down Expand Up @@ -185,35 +182,9 @@ impl DictWordMetadata {

/// Produce a copy of `self` with the known properties of `other` set.
pub fn or(&self, other: &Self) -> Self {
macro_rules! merge {
($a:expr, $b:expr) => {
match ($a, $b) {
(Some(a), Some(b)) => Some(a.or(&b)),
(Some(a), None) => Some(a),
(None, Some(b)) => Some(b),
(None, None) => None,
}
};
}

Self {
noun: merge!(self.noun, other.noun),
pronoun: merge!(self.pronoun, other.pronoun),
verb: merge!(self.verb, other.verb),
adjective: merge!(self.adjective, other.adjective),
adverb: merge!(self.adverb, other.adverb),
conjunction: merge!(self.conjunction, other.conjunction),
determiner: merge!(self.determiner, other.determiner),
affix: merge!(self.affix, other.affix),
preposition: self.preposition || other.preposition,
dialects: self.dialects | other.dialects,
orth_info: self.orth_info | other.orth_info,
swear: self.swear.or(other.swear),
common: self.common || other.common,
derived_from: self.derived_from.or(other.derived_from),
pos_tag: self.pos_tag.or(other.pos_tag),
np_member: self.np_member.or(other.np_member),
}
let mut clone = self.clone();
clone.append(other);
clone
}

/// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
Expand Down Expand Up @@ -760,7 +731,34 @@ impl DictWordMetadata {

/// Same thing as [`Self::or`], except in-place rather than a clone.
pub fn append(&mut self, other: &Self) -> &mut Self {
*self = self.or(other);
macro_rules! merge {
($a:expr, $b:expr) => {
match ($a, $b) {
(Some(a), Some(b)) => Some(a.or(&b)),
(Some(a), None) => Some(a),
(None, Some(b)) => Some(b),
(None, None) => None,
}
};
}

self.noun = merge!(self.noun, other.noun);
self.pronoun = merge!(self.pronoun, other.pronoun);
self.verb = merge!(self.verb, other.verb);
self.adjective = merge!(self.adjective, other.adjective);
self.adverb = merge!(self.adverb, other.adverb);
self.conjunction = merge!(self.conjunction, other.conjunction);
self.determiner = merge!(self.determiner, other.determiner);
self.affix = merge!(self.affix, other.affix);
self.preposition |= other.preposition;
self.dialects |= other.dialects;
self.orth_info |= other.orth_info;
self.swear = self.swear.or(other.swear);
self.common |= other.common;
self.derived_from.extend(other.derived_from.iter());
self.pos_tag = self.pos_tag.or(other.pos_tag);
self.np_member = self.np_member.or(other.np_member);

self
}
}
Expand Down Expand Up @@ -1195,15 +1193,19 @@ impl Default for DialectFlags {

#[cfg(test)]
pub mod tests {
use std::borrow::Cow;
use std::sync::{Arc, LazyLock};

use crate::DictWordMetadata;
use crate::spell::{Dictionary, FstDictionary};

// Helper function to get metadata from the curated dictionary
pub fn md(word: &str) -> DictWordMetadata {
FstDictionary::curated()
.get_word_metadata_str(word)
pub fn md(word: &str) -> Cow<'_, DictWordMetadata> {
static CURATED_DICT: LazyLock<Arc<FstDictionary>> = LazyLock::new(FstDictionary::curated);

CURATED_DICT
.get_word_metadata_combined_str(word)
.unwrap_or_else(|| panic!("Word '{word}' not found in dictionary"))
.into_owned()
}

mod dialect {
Expand Down
75 changes: 75 additions & 0 deletions harper-core/src/dict_word_metadata/derived_from.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
use std::iter::Extend;
use std::slice::Iter;

use serde::{Deserialize, Serialize};

use crate::spell::CanonicalWordId;

/// A container for storing word IDs that a word is considered to be derived from.
///
/// The IDs live in a plain `Vec`, so they are kept in the order they were added.
/// `insert` and the `Extend` implementations skip IDs that are already present, which
/// makes the container behave like a small ordered set when it is filled through them.
///
/// The derived `PartialEq`, `PartialOrd` and `Hash` impls operate on the underlying `Vec`
/// and are therefore sensitive to the order of the stored IDs.
///
/// NOTE(review): the derived `Deserialize` fills the list without going through `insert`,
/// so serialized data is presumably expected to be duplicate-free already — confirm.
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
pub struct DerivedFrom {
/// The stored word IDs, in insertion order.
inner: Vec<CanonicalWordId>,
}

impl DerivedFrom {
    /// Build a `DerivedFrom` that starts out holding exactly one word ID.
    pub fn from_canonical_word_id(word_id: CanonicalWordId) -> Self {
        Self {
            inner: Vec::from([word_id]),
        }
    }

    /// Returns `true` when no word IDs are stored, i.e. the word has no known origin words.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Returns `true` when `id` is one of the words this word is derived from.
    pub fn contains(&self, id: CanonicalWordId) -> bool {
        self.inner.iter().any(|known| *known == id)
    }

    /// Add `id` to the list of origin words.
    ///
    /// Each ID is stored at most once: when `id` is already present the call does nothing.
    pub fn insert(&mut self, id: CanonicalWordId) {
        if self.contains(id) {
            return;
        }

        self.inner.push(id);
    }

    /// Iterate over the stored [`CanonicalWordId`]s by reference, in insertion order.
    pub fn iter(&self) -> Iter<'_, CanonicalWordId> {
        self.inner.as_slice().iter()
    }
}

impl Extend<CanonicalWordId> for DerivedFrom {
    /// Add every ID yielded by `iter`, skipping any that are already stored.
    ///
    /// Routing each item through `insert` gives the same "no duplicates" behavior a
    /// `HashSet` would, while keeping the IDs in the order they first appeared.
    fn extend<T: IntoIterator<Item = CanonicalWordId>>(&mut self, iter: T) {
        for id in iter {
            self.insert(id);
        }
    }
}

impl<'a> Extend<&'a CanonicalWordId> for DerivedFrom {
    /// Add a copy of every ID yielded by `iter`, skipping any that are already stored.
    ///
    /// Routing each item through `insert` gives the same "no duplicates" behavior a
    /// `HashSet` would, while keeping the IDs in the order they first appeared.
    fn extend<T: IntoIterator<Item = &'a CanonicalWordId>>(&mut self, iter: T) {
        for &id in iter {
            self.insert(id);
        }
    }
}

impl IntoIterator for DerivedFrom {
    type Item = CanonicalWordId;

    type IntoIter = std::vec::IntoIter<CanonicalWordId>;

    /// Consume the container and yield the stored IDs by value, in insertion order.
    fn into_iter(self) -> Self::IntoIter {
        let Self { inner } = self;
        inner.into_iter()
    }
}
2 changes: 1 addition & 1 deletion harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ impl Document {
if let TokenKind::Word(meta) = &mut token.kind {
let word_source = token.span.get_content(&self.source);
let mut found_meta = dictionary
.get_word_metadata(word_source)
.get_word_metadata_combined(word_source)
.map(|c| c.into_owned());

if let Some(inner) = &mut found_meta {
Expand Down
6 changes: 3 additions & 3 deletions harper-core/src/expr/mergeable_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@ impl MergeableWords {
let mut compound = a_chars.clone();
compound.push(' ');
compound.extend_from_slice(&b_chars);
let meta_open = self.dict.get_word_metadata(&compound);
let meta_open = self.dict.get_word_metadata(&compound).first().copied();

// Then check if the closed compound exists in the dictionary
compound.remove(a_chars.len());
let meta_closed = self.dict.get_word_metadata(&compound);
let meta_closed = self.dict.get_word_metadata(&compound).first().copied();

if (self.predicate)(meta_closed.as_deref(), meta_open.as_deref()) {
if (self.predicate)(meta_closed, meta_open) {
return Some(compound);
}

Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/linting/inflected_verb_after_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ impl<T: Dictionary> Linter for InflectedVerbAfterTo<T> {
}

let check_stem = |stem: &[char]| {
if let Some(metadata) = self.dictionary.get_word_metadata(stem)
if let Some(metadata) = self.dictionary.get_word_metadata_combined(stem)
&& metadata.is_verb()
&& !metadata.is_noun()
{
Expand Down Expand Up @@ -79,7 +79,7 @@ impl<T: Dictionary> Linter for InflectedVerbAfterTo<T> {
let ed_specific_heuristics = || {
if let Some(prev) = document.get_next_word_from_offset(pi, -1) {
let prev_chars = document.get_span_content(&prev.span);
if let Some(metadata) = self.dictionary.get_word_metadata(prev_chars) {
if let Some(metadata) = self.dictionary.get_word_metadata_combined(prev_chars) {
// adj: "able to" expects an infinitive verb
// verb: "have/had/has/having to" expect an infinitive verb
if metadata.is_adjective() || metadata.is_verb() {
Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/linting/mass_nouns/mass_plurals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ where

fn is_mass_noun_in_dictionary(&self, chars: &[char]) -> bool {
self.dict
.get_word_metadata(chars)
.get_word_metadata_combined(chars)
.is_some_and(|wmd| wmd.is_mass_noun_only())
}

fn is_mass_noun_in_dictionary_str(&self, s: &str) -> bool {
self.dict
.get_word_metadata_str(s)
.get_word_metadata_combined_str(s)
.is_some_and(|wmd| wmd.is_mass_noun_only())
}
}
Expand Down
2 changes: 1 addition & 1 deletion harper-core/src/linting/more_adjective.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ where
}

fn add_valid_candidate(&self, candidates: &mut Vec<String>, candidate: String) -> bool {
if let Some(metadata) = self.dict.get_word_metadata_str(&candidate)
if let Some(metadata) = self.dict.get_word_metadata_str_exact(&candidate)
&& (metadata.is_comparative_adjective() || metadata.is_superlative_adjective())
{
candidates.push(candidate);
Expand Down
8 changes: 4 additions & 4 deletions harper-core/src/linting/one_of_the_singular.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,14 @@ impl<D: Dictionary + 'static> ExprLinter for OneOfTheSingular<D> {

if self
.dict
.get_word_metadata(&plural_s)
.get_word_metadata_combined(&plural_s)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_s, singular));
}
if self
.dict
.get_word_metadata(&plural_es)
.get_word_metadata_combined(&plural_es)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_es, singular));
Expand All @@ -117,7 +117,7 @@ impl<D: Dictionary + 'static> ExprLinter for OneOfTheSingular<D> {
plural_ies.extend(['i', 'e', 's']);
if self
.dict
.get_word_metadata(&plural_ies)
.get_word_metadata_combined(&plural_ies)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_ies, singular));
Expand All @@ -130,7 +130,7 @@ impl<D: Dictionary + 'static> ExprLinter for OneOfTheSingular<D> {
plural_ves.extend(['v', 'e', 's']);
if self
.dict
.get_word_metadata(&plural_ves)
.get_word_metadata_combined(&plural_ves)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_ves, singular));
Expand Down
Loading
Loading