From 868ab495c356b5eb7fc26c0f2e1f4f854bb8a171 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Sun, 13 Aug 2023 23:21:44 +0300 Subject: [PATCH 1/4] attempt to port my changes from DAWG-py to DAWG --- setup.py | 2 +- src/dawg.pyx | 57 +++++++++++++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index 8698e45..e990952 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name="DAWG2", - version="0.9.0", + version="0.9.1", description="Fast and memory efficient DAWG (DAFSA) for Python", long_description=open('README.rst').read() + '\n\n' + open('CHANGES.rst').read(), author='Mikhail Korobov', diff --git a/src/dawg.pyx b/src/dawg.pyx index 37258d9..5a27fb3 100644 --- a/src/dawg.pyx +++ b/src/dawg.pyx @@ -199,13 +199,13 @@ cdef class DAWG: b_step = (key[word_pos].encode('utf8')) if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] - - if self.dct.Follow(b_replace_char, &next_index): - prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) - res.extend(extra_keys) + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index + next_index = self.dct.Follow(b_replace_char, &next_index) + if next_index: + prefix = current_prefix + key[start_pos:word_pos] + u_replace_char + extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) + res.extend(extra_keys) if not self.dct.Follow(b_step, &index): break @@ -225,7 +225,7 @@ cdef class DAWG: ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode striings to (one or more) single-char unicode strings. This may be useful e.g. for handling single-character umlauts. @@ -273,13 +273,16 @@ cdef class DAWG: def compile_replaces(cls, replaces): for k,v in replaces.items(): - if len(k) != 1 or len(v) != 1: - raise ValueError("Keys and values must be single-char unicode strings.") - + if len(k) != 1: + raise ValueError("Keys must be single-char unicode strings.") + if (isinstance(v, str) and len(v) != 1): + raise ValueError("Values must be single-char unicode strings or non-empty lists of such.") + if isinstance(v, list) and (any(len(v_entry) != 1 for v_entry in v) or len(v) < 1): + raise ValueError("Values must be single-char unicode strings or non-empty lists of such.") return dict( ( k.encode('utf8'), - (v.encode('utf8'), unicode(v)) + [(v_entry.encode('utf8'), unicode(v_entry)) for v_entry in v] ) for k, v in replaces.items() ) @@ -725,13 +728,13 @@ cdef class BytesDAWG(CompletionDAWG): b_step = (key[word_pos].encode('utf8')) if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] - - if self.dct.Follow(b_replace_char, &next_index): - prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_items = self._similar_items(prefix, key, next_index, replace_chars) - res.extend(extra_items) + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index + next_index = self.dct.Follow(b_replace_char, &next_index) + if next_index: + prefix = current_prefix + key[start_pos:word_pos] + u_replace_char + extra_items = self._similar_items(prefix, key, next_index, replace_chars) + res.extend(extra_items) if not self.dct.Follow(b_step, &index): break @@ -752,7 +755,7 @@ cdef class BytesDAWG(CompletionDAWG): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. """ return self._similar_items("", key, self.dct.root(), replaces) @@ -772,12 +775,12 @@ cdef class BytesDAWG(CompletionDAWG): b_step = (key[word_pos].encode('utf8')) if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] - - if self.dct.Follow(b_replace_char, &next_index): - extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) - res.extend(extra_items) + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index + next_index = self.dct.Follow(b_replace_char, &next_index) + if next_index: + extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) + res.extend(extra_items) if not self.dct.Follow(b_step, &index): break @@ -797,7 +800,7 @@ cdef class BytesDAWG(CompletionDAWG): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. """ return self._similar_item_values(0, key, self.dct.root(), replaces) From 381e9aa7bdafd8e45ec5ac40bf239a6117d93ee2 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Mon, 14 Aug 2023 00:02:52 +0300 Subject: [PATCH 2/4] change tabs --- src/dawg.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dawg.pyx b/src/dawg.pyx index 5a27fb3..5335a78 100644 --- a/src/dawg.pyx +++ b/src/dawg.pyx @@ -225,7 +225,7 @@ cdef class DAWG: ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode striings to (one or more) single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. This may be useful e.g. for handling single-character umlauts. @@ -282,7 +282,7 @@ cdef class DAWG: return dict( ( k.encode('utf8'), - [(v_entry.encode('utf8'), unicode(v_entry)) for v_entry in v] + [(v_entry.encode('utf8'), unicode(v_entry)) for v_entry in v] ) for k, v in replaces.items() ) @@ -733,8 +733,8 @@ cdef class BytesDAWG(CompletionDAWG): next_index = self.dct.Follow(b_replace_char, &next_index) if next_index: prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_items = self._similar_items(prefix, key, next_index, replace_chars) - res.extend(extra_items) + extra_items = self._similar_items(prefix, key, next_index, replace_chars) + res.extend(extra_items) if not self.dct.Follow(b_step, &index): break @@ -779,8 +779,8 @@ cdef class BytesDAWG(CompletionDAWG): next_index = index next_index = self.dct.Follow(b_replace_char, &next_index) if next_index: - extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) - res.extend(extra_items) + extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) + res.extend(extra_items) if not self.dct.Follow(b_step, &index): break From e18837991e1d54438f1c61c97c936c9c610a649e Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Mon, 14 Aug 2023 01:08:00 +0300 Subject: [PATCH 3/4] add tests --- tests/test_prediction.py | 65 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tests/test_prediction.py b/tests/test_prediction.py index b20986e..8b46eae 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -43,6 +43,71 @@ class TestPrediction(object): ] + @pytest.mark.parametrize(("word", "prediction"), SUITE) + def test_dawg_prediction(self, word, prediction): + d = dawg.DAWG(self.DATA) + assert d.similar_keys(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE) + def test_record_dawg_prediction(self, word, prediction): + d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) + assert d.similar_keys(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS) + def test_record_dawg_items(self, word, prediction): + d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) + assert d.similar_items(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES) + def test_record_dawg_items_values(self, word, prediction): + d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) + assert d.similar_item_values(word, self.REPLACES) == prediction + +class TestMultiValuedPrediction(object): + DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён".split(" ") + LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA))) + + REPLACES = dawg.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) + + SUITE = [ + ('осел', []), + ('ель', ['ель']), + ('ёль', []), + ('хлеб', ['хлѣб']), + ('елка', ['ёлка']), + ('лесное', ['лѣсное']), + ('лесноё', []), + ('лёсное', []), + ('изобретен', ['изобрѣтён']), + ('беленая', ['бѣлёная']), + ('белёная', ['бѣлёная']), + ('бѣленая', ['бѣлёная']), + ('бѣлёная', ['бѣлёная']), + ('белѣная', []), + ('бѣлѣная', []), + ('все', ['всё', 'всѣ']), + ] + + SUITE_ITEMS = [ + ( + it[0], # key + [ + (w, [(len(w),)]) # item, value pair + for w in it[1] + ] + ) + for it in SUITE + ] + + SUITE_VALUES = [ + ( + it[0], # key + [[(len(w),)] for w in it[1]] + ) + for it in SUITE + ] + + @pytest.mark.parametrize(("word", "prediction"), SUITE) def test_dawg_prediction(self, word, prediction): d = dawg.DAWG(self.DATA) From 8c91c2f495e4738b400abefde3798e83f91fe46b Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Thu, 17 Aug 2023 01:47:02 +0300 Subject: [PATCH 4/4] fix --- src/dawg.pyx | 20 ++++++++++---------- tests/test_prediction.py | 4 +++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/dawg.pyx b/src/dawg.pyx index 5335a78..26fb847 100644 --- a/src/dawg.pyx +++ b/src/dawg.pyx @@ -201,8 +201,8 @@ cdef class DAWG: if b_step in replace_chars: for (b_replace_char, u_replace_char) in replace_chars[b_step]: next_index = index - next_index = self.dct.Follow(b_replace_char, &next_index) - if next_index: + is_followed = self.dct.Follow(b_replace_char, &next_index) + if is_followed: prefix = current_prefix + key[start_pos:word_pos] + u_replace_char extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) res.extend(extra_keys) @@ -233,9 +233,9 @@ cdef class DAWG: return self._similar_keys("", key, self.dct.root(), replaces) cpdef list prefixes(self, unicode key): - ''' + """ Return a list with keys of this DAWG that are prefixes of the ``key``. - ''' + """ return [p.decode('utf8') for p in self.b_prefixes(key.encode('utf8'))] cpdef list b_prefixes(self, bytes b_key): @@ -254,9 +254,9 @@ cdef class DAWG: return res def iterprefixes(self, unicode key): - ''' + """ Return a generator with keys of this DAWG that are prefixes of the ``key``. - ''' + """ cdef BaseType index = self.dct.root() cdef bytes b_key = key.encode('utf8') cdef int pos = 1 @@ -730,8 +730,8 @@ cdef class BytesDAWG(CompletionDAWG): if b_step in replace_chars: for (b_replace_char, u_replace_char) in replace_chars[b_step]: next_index = index - next_index = self.dct.Follow(b_replace_char, &next_index) - if next_index: + is_followed = self.dct.Follow(b_replace_char, &next_index) + if is_followed: prefix = current_prefix + key[start_pos:word_pos] + u_replace_char extra_items = self._similar_items(prefix, key, next_index, replace_chars) res.extend(extra_items) @@ -777,8 +777,8 @@ cdef class BytesDAWG(CompletionDAWG): if b_step in replace_chars: for (b_replace_char, u_replace_char) in replace_chars[b_step]: next_index = index - next_index = self.dct.Follow(b_replace_char, &next_index) - if next_index: + is_followed = self.dct.Follow(b_replace_char, &next_index) + if is_followed: extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) res.extend(extra_items) diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 8b46eae..cea8551 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -64,7 +64,7 @@ def test_record_dawg_items_values(self, word, prediction): assert d.similar_item_values(word, self.REPLACES) == prediction class TestMultiValuedPrediction(object): - DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён".split(" ") + DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ") LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA))) REPLACES = dawg.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) @@ -86,6 +86,8 @@ class TestMultiValuedPrediction(object): ('белѣная', []), ('бѣлѣная', []), ('все', ['всё', 'всѣ']), + ('лев', ['лев', 'лёв', 'лѣв']), + ('венский', ['вѣнскій']), ] SUITE_ITEMS = [