From 868ab495c356b5eb7fc26c0f2e1f4f854bb8a171 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Sun, 13 Aug 2023 23:21:44 +0300
Subject: [PATCH 1/4] attempt to port multi-valued replaces support from DAWG-py to DAWG

---
 setup.py     |  2 +-
 src/dawg.pyx | 57 +++++++++++++++++++++++++++-------------------------
 2 files changed, 31 insertions(+), 28 deletions(-)
diff --git a/setup.py b/setup.py
index 8698e45..e990952 100755
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name="DAWG2",
-    version="0.9.0",
+    version="0.9.1",
     description="Fast and memory efficient DAWG (DAFSA) for Python",
     long_description=open('README.rst').read() + '\n\n' + open('CHANGES.rst').read(),
     author='Mikhail Korobov',
diff --git a/src/dawg.pyx b/src/dawg.pyx
index 37258d9..5a27fb3 100644
--- a/src/dawg.pyx
+++ b/src/dawg.pyx
@@ -199,13 +199,13 @@ cdef class DAWG:
             b_step = <bytes>(key[word_pos].encode('utf8'))
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = <tuple>replace_chars[b_step]
-
-                if self.dct.Follow(b_replace_char, &next_index):
-                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-                    extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
-                    res.extend(extra_keys)
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
+                    next_index = self.dct.Follow(b_replace_char, &next_index)
+                    if next_index:
+                        prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
+                        extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
+                        res.extend(extra_keys)
 
             if not self.dct.Follow(b_step, &index):
                 break
@@ -225,7 +225,7 @@ cdef class DAWG:
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode striings to (one or more) single-char
         unicode strings.
 
         This may be useful e.g. for handling single-character umlauts.
@@ -273,13 +273,16 @@ cdef class DAWG:
     def compile_replaces(cls, replaces):
 
         for k,v in replaces.items():
-            if len(k) != 1 or len(v) != 1:
-                raise ValueError("Keys and values must be single-char unicode strings.")
-
+            if len(k) != 1:
+                raise ValueError("Keys must be single-char unicode strings.")
+            if (isinstance(v, str) and len(v) != 1):
+                raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
+            if isinstance(v, list) and (any(len(v_entry) != 1 for v_entry in v) or len(v) < 1):
+                raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
         return dict(
             (
                 k.encode('utf8'),
-                (v.encode('utf8'), unicode(v))
+				[(v_entry.encode('utf8'), unicode(v_entry)) for v_entry in v]
             )
             for k, v in replaces.items()
         )
@@ -725,13 +728,13 @@ cdef class BytesDAWG(CompletionDAWG):
             b_step = <bytes>(key[word_pos].encode('utf8'))
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = <tuple>replace_chars[b_step]
-
-                if self.dct.Follow(b_replace_char, &next_index):
-                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-                    extra_items = self._similar_items(prefix, key, next_index, replace_chars)
-                    res.extend(extra_items)
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
+                    next_index = self.dct.Follow(b_replace_char, &next_index)
+                    if next_index:
+                        prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
+						extra_items = self._similar_items(prefix, key, next_index, replace_chars)
+						res.extend(extra_items)
 
             if not self.dct.Follow(b_step, &index):
                 break
@@ -752,7 +755,7 @@ cdef class BytesDAWG(CompletionDAWG):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
         """
         return self._similar_items("", key, self.dct.root(), replaces)
@@ -772,12 +775,12 @@ cdef class BytesDAWG(CompletionDAWG):
             b_step = <bytes>(key[word_pos].encode('utf8'))
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = <tuple>replace_chars[b_step]
-
-                if self.dct.Follow(b_replace_char, &next_index):
-                    extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
-                    res.extend(extra_items)
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
+                    next_index = self.dct.Follow(b_replace_char, &next_index)
+                    if next_index:
+						extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
+						res.extend(extra_items)
 
             if not self.dct.Follow(b_step, &index):
                 break
@@ -797,7 +800,7 @@ cdef class BytesDAWG(CompletionDAWG):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
         """
         return self._similar_item_values(0, key, self.dct.root(), replaces)

From 381e9aa7bdafd8e45ec5ac40bf239a6117d93ee2 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Mon, 14 Aug 2023 00:02:52 +0300
Subject: [PATCH 2/4] replace tab indentation with spaces and fix docstring typo

---
 src/dawg.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/dawg.pyx b/src/dawg.pyx
index 5a27fb3..5335a78 100644
--- a/src/dawg.pyx
+++ b/src/dawg.pyx
@@ -225,7 +225,7 @@ cdef class DAWG:
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode striings to (one or more) single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
 
         This may be useful e.g. for handling single-character umlauts.
@@ -282,7 +282,7 @@ cdef class DAWG:
         return dict(
             (
                 k.encode('utf8'),
-				[(v_entry.encode('utf8'), unicode(v_entry)) for v_entry in v]
+                [(v_entry.encode('utf8'), unicode(v_entry)) for v_entry in v]
             )
             for k, v in replaces.items()
         )
@@ -733,8 +733,8 @@ cdef class BytesDAWG(CompletionDAWG):
                     next_index = self.dct.Follow(b_replace_char, &next_index)
                     if next_index:
                         prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-						extra_items = self._similar_items(prefix, key, next_index, replace_chars)
-						res.extend(extra_items)
+                        extra_items = self._similar_items(prefix, key, next_index, replace_chars)
+                        res.extend(extra_items)
 
             if not self.dct.Follow(b_step, &index):
                 break
@@ -779,8 +779,8 @@ cdef class BytesDAWG(CompletionDAWG):
                     next_index = index
                     next_index = self.dct.Follow(b_replace_char, &next_index)
                     if next_index:
-						extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
-						res.extend(extra_items)
+                        extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
+                        res.extend(extra_items)
 
             if not self.dct.Follow(b_step, &index):
                 break

From e18837991e1d54438f1c61c97c936c9c610a649e Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Mon, 14 Aug 2023 01:08:00 +0300
Subject: [PATCH 3/4] add tests

---
 tests/test_prediction.py | 65 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index b20986e..8b46eae 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -43,6 +43,71 @@ class TestPrediction(object):
     ]
 
 
+    @pytest.mark.parametrize(("word", "prediction"), SUITE)
+    def test_dawg_prediction(self, word, prediction):
+        d = dawg.DAWG(self.DATA)
+        assert d.similar_keys(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE)
+    def test_record_dawg_prediction(self, word, prediction):
+        d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
+        assert d.similar_keys(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
+    def test_record_dawg_items(self, word, prediction):
+        d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
+        assert d.similar_items(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
+    def test_record_dawg_items_values(self, word, prediction):
+        d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
+        assert d.similar_item_values(word, self.REPLACES) == prediction
+
+class TestMultiValuedPrediction(object):
+    DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён".split(" ")
+    LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA)))
+
+    REPLACES = dawg.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
+
+    SUITE = [
+        ('осел', []),
+        ('ель', ['ель']),
+        ('ёль', []),
+        ('хлеб', ['хлѣб']),
+        ('елка', ['ёлка']),
+        ('лесное', ['лѣсное']),
+        ('лесноё', []),
+        ('лёсное', []),
+        ('изобретен', ['изобрѣтён']),
+        ('беленая', ['бѣлёная']),
+        ('белёная', ['бѣлёная']),
+        ('бѣленая', ['бѣлёная']),
+        ('бѣлёная', ['бѣлёная']),
+        ('белѣная', []),
+        ('бѣлѣная', []),
+        ('все', ['всё', 'всѣ']),
+    ]
+
+    SUITE_ITEMS = [
+        (
+            it[0], # key
+            [
+                (w, [(len(w),)]) # item, value pair
+                for w in it[1]
+            ]
+        )
+        for it in SUITE
+    ]
+
+    SUITE_VALUES = [
+        (
+            it[0], # key
+            [[(len(w),)] for w in it[1]]
+        )
+        for it in SUITE
+    ]
+
+
     @pytest.mark.parametrize(("word", "prediction"), SUITE)
     def test_dawg_prediction(self, word, prediction):
         d = dawg.DAWG(self.DATA)

From 8c91c2f495e4738b400abefde3798e83f91fe46b Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Thu, 17 Aug 2023 01:47:02 +0300
Subject: [PATCH 4/4] fix: keep Follow() result separate from next_index in similar_* lookups; extend tests

---
 src/dawg.pyx             | 20 ++++++++++----------
 tests/test_prediction.py |  4 +++-
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/dawg.pyx b/src/dawg.pyx
index 5335a78..26fb847 100644
--- a/src/dawg.pyx
+++ b/src/dawg.pyx
@@ -201,8 +201,8 @@ cdef class DAWG:
             if b_step in replace_chars:
                 for (b_replace_char, u_replace_char) in replace_chars[b_step]:
                     next_index = index
-                    next_index = self.dct.Follow(b_replace_char, &next_index)
-                    if next_index:
+                    is_followed = self.dct.Follow(b_replace_char, &next_index)
+                    if is_followed:
                         prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
                         extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
                         res.extend(extra_keys)
@@ -233,9 +233,9 @@ cdef class DAWG:
         return self._similar_keys("", key, self.dct.root(), replaces)
 
     cpdef list prefixes(self, unicode key):
-        '''
+        """
         Return a list with keys of this DAWG that are prefixes of the ``key``.
-        '''
+        """
         return [p.decode('utf8') for p in self.b_prefixes(<bytes>key.encode('utf8'))]
 
     cpdef list b_prefixes(self, bytes b_key):
@@ -254,9 +254,9 @@ cdef class DAWG:
         return res
 
     def iterprefixes(self, unicode key):
-        '''
+        """
         Return a generator with keys of this DAWG that are prefixes of the ``key``.
-        '''
+        """
         cdef BaseType index = self.dct.root()
         cdef bytes b_key = <bytes>key.encode('utf8')
         cdef int pos = 1
@@ -730,8 +730,8 @@ cdef class BytesDAWG(CompletionDAWG):
             if b_step in replace_chars:
                 for (b_replace_char, u_replace_char) in replace_chars[b_step]:
                     next_index = index
-                    next_index = self.dct.Follow(b_replace_char, &next_index)
-                    if next_index:
+                    is_followed = self.dct.Follow(b_replace_char, &next_index)
+                    if is_followed:
                         prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
                         extra_items = self._similar_items(prefix, key, next_index, replace_chars)
                         res.extend(extra_items)
@@ -777,8 +777,8 @@ cdef class BytesDAWG(CompletionDAWG):
             if b_step in replace_chars:
                 for (b_replace_char, u_replace_char) in replace_chars[b_step]:
                     next_index = index
-                    next_index = self.dct.Follow(b_replace_char, &next_index)
-                    if next_index:
+                    is_followed = self.dct.Follow(b_replace_char, &next_index)
+                    if is_followed:
                         extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
                         res.extend(extra_items)
 
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index 8b46eae..cea8551 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -64,7 +64,7 @@ def test_record_dawg_items_values(self, word, prediction):
         assert d.similar_item_values(word, self.REPLACES) == prediction
 
 class TestMultiValuedPrediction(object):
-    DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён".split(" ")
+    DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")
     LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA)))
 
     REPLACES = dawg.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
@@ -86,6 +86,8 @@ class TestMultiValuedPrediction(object):
         ('белѣная', []),
         ('бѣлѣная', []),
         ('все', ['всё', 'всѣ']),
+        ('лев', ['лев', 'лёв', 'лѣв']),
+        ('венский', ['вѣнскій']),
     ]
 
     SUITE_ITEMS = [