From 7a77a9423ae56c2916d980438f7a840f88e0c351 Mon Sep 17 00:00:00 2001 From: maxmerben Date: Tue, 3 Feb 2026 18:39:09 +0300 Subject: [PATCH 1/6] new (preliminary) converter for Avar --- converter.py | 166 ++++++++++++++++++++++++ static/ortho_ava.txt | 278 +++++++++++++++++++++++++++++++++++++++++ static/ortho_table.csv | 110 ++++++++++++++++ 3 files changed, 554 insertions(+) create mode 100644 converter.py create mode 100644 static/ortho_ava.txt create mode 100644 static/ortho_table.csv diff --git a/converter.py b/converter.py new file mode 100644 index 0000000..91403cd --- /dev/null +++ b/converter.py @@ -0,0 +1,166 @@ +import re, os, csv + +current_folder = os.path.dirname(os.path.abspath(__file__)) +ortho_table_path = os.path.join(current_folder, os.path.join("static", "ortho_table.csv")) +ortho_txt_default_name = "ortho" + +all_letters = "A-Za-zÀ-ÖØ-öø-ӿԀ-ԯⷩ" +punct = "\!-/:-@\[-`{-~ -¿\‐-⁞" + +#cyr_check = "[ПпБДдЛлЖжШшЩщФфЦцЧчИиЙйЬьЪъЫыЭэЮюЯя]" +#lat_check = "[SsVvFfIiGgZzQqNR]" + +_cyr = ["cyr", "cyrillic", "кир", "кириллица"] +#_lat = ["lat", "latin", "лат", "латиница"] +other_targets = ["ipa", "cauc"] + +possible_targets = _cyr + other_targets +recommended_targets = [_cyr[0]] + other_targets + + +def raise_error_wrong_argument(token, recommended): + raise ValueError(f"Invalid '{token}' argument given! Should be one of these: {recommended}") + + +class ConverterOutput: + """The output of the Converter.convert() function.""" + + def __init__(self, text, lang, orig, target): + """ + Parameters + ---------- + """ + + self.text = text + self.lang = lang + self.orig = orig + self.target = target + + + def __repr__(self): + return self.text + + + def full(self): + """Prints all the settings of the converted text.""" + return f"ConverterOutput(\n\ttext='{self.text}',\n\tlang={self.lang}, orig={self.orig}, target={self.target}\n)" + + +class Converter: + """The main class to convert from alphabet X to alphabet Y. + + Make a `Converter` object. Optionally, set the default target alphabet as an argument: + > c = Converter(target="cyr") + + Then use the method `convert` to change the alphabet of a text: + > converted_text = c.convert(original_text) + """ + + def __init__(self, lang, orig=None, target=None): + """ + Parameters + ---------- + """ + + with open(ortho_table_path, 'r', encoding='utf-8-sig') as f: + reader = list(csv.reader(f, delimiter=",")) + self._ortho_table = {} + headers = [] + for i in range(len(reader)): + row = reader[i] + if i == 0: + headers = row + else: + self._ortho_table[row[0]] = { + headers[i]: row[i] for i in range(1, len(row))} + + possible_langs = sorted(set([x.split("_")[0] for x in list(list(self._ortho_table.values())[0].keys())])) + if lang in possible_langs: + self.lang = lang + else: + raise_error_wrong_argument("lang", possible_langs) + + if orig in possible_targets: + self.orig = orig + elif orig is None: + self.orig = None + else: + raise_error_wrong_argument("orig", recommended_targets) + + if target in _cyr: + self._lang_target = f"{self.lang}_{_cyr[0]}" + elif target in other_targets: + self._lang_target = f"{self.lang}_{target}" + elif target is None: + self._lang_target = None + else: + raise_error_wrong_argument("target", recommended_targets) + + self.target = target + + with open( + os.path.join(current_folder, os.path.join("static", f"ortho_{lang}.txt")), + "r", encoding="utf-8-sig") as file: + ortho_txt_file = file.readlines() + + self._ortho_to_meta = {} + for line in ortho_txt_file: + if not line.startswith("#"): + bad, good = line.split("\t") + good = good.strip("\r\n") + self._ortho_to_meta[bad] = good + + + def __repr__(self): + return f"Converter(lang={self.lang}, orig={self.orig}, target={self.target})" + + + def convert(self, text, orig=None, target=None): + """ + Converts a text. + + Parameters + ---------- + """ + if text is None: + return None + if orig is None: + if self.orig is None: + raise ValueError("No 'orig' value given!") + else: + orig = self.orig + elif orig not in possible_targets: + raise_error_wrong_argument("orig", recommended_targets) + + lang_target = None + if target is None: + if self._lang_target is None: + raise ValueError("No 'target' value given!") + else: + lang_target = self._lang_target + target = self.target + elif target in possible_targets: + lang_target = f"{self.lang}_{target}" + else: + raise_error_wrong_argument("target", recommended_targets) + + ###################### + + text = re.sub(f"(?<=[{all_letters}])[|1]|[|1](?=[{all_letters}])", "ӏ", text) + + tokens = re.findall(f"[{all_letters}]+|[{punct}]+|[0-9]+|[^{all_letters}{punct}0-9]+", text) + + for i in range(len(tokens)): + if re.fullmatch(f"[{all_letters}]+", tokens[i]): + for bad, good in self._ortho_to_meta.items(): + tokens[i] = re.sub(bad, good, tokens[i]) + text = "".join(tokens) + + for letter in self._ortho_table: + text = re.sub(letter, self._ortho_table[letter][lang_target], text) + + if target in ("ipa", "cauc"): + re.sub("’", "ʼ", text) + + return ConverterOutput(text, lang=self.lang, orig=orig, target=target) + diff --git a/static/ortho_ava.txt b/static/ortho_ava.txt new file mode 100644 index 0000000..d97e5b0 --- /dev/null +++ b/static/ortho_ava.txt @@ -0,0 +1,278 @@ +# абруптивность ʼ → долгота ː → лабиализация +# ʼ modifier letter apostrophe +# ’ right single quotation mark +# +# +# +# палочка +Ӏ ӏ +# +# +# +# латиница +k[ʼ’]ː kʼː +ɬː łː +c[ʼ’]ː cʼː +č[ʼ’]ː čʼː +ʁ ğ +h h +ʕ ḣ +kː kː +q[ʼ’] qʼ +ƛ[ʼ’] ḷʼ +k[ʼ’] kʼ +ƛ ḷ +ɬ ł +p[ʼ’] pʼ +t[ʼ’] tʼ +χː ẍː +q q +x ẋ +ħ ħ +cː cː +c[ʼ’] cʼ +čː čː +č[ʼ’] čʼ +je ё +(? Date: Tue, 3 Feb 2026 19:02:05 +0300 Subject: [PATCH 2/6] small fix concerning palochkas --- converter.py | 10 ++- static/ortho_ava.txt | 148 ++++++++++++++++++++++--------------------- 2 files changed, 83 insertions(+), 75 deletions(-) diff --git a/converter.py b/converter.py index 91403cd..26a0f7e 100644 --- a/converter.py +++ b/converter.py @@ -146,21 +146,25 @@ def convert(self, text, orig=None, target=None): ###################### + # Fix palochkas which are incorrectly capital but within a word + text = re.sub(f"(?<=[{all_letters}])Ӏ", "ӏ", text) + + # Fix palochkas which are written as '1' or '|' text = re.sub(f"(?<=[{all_letters}])[|1]|[|1](?=[{all_letters}])", "ӏ", text) + # Split into tokens tokens = re.findall(f"[{all_letters}]+|[{punct}]+|[0-9]+|[^{all_letters}{punct}0-9]+", text) + # Convert to the meta-orthography for i in range(len(tokens)): if re.fullmatch(f"[{all_letters}]+", tokens[i]): for bad, good in self._ortho_to_meta.items(): tokens[i] = re.sub(bad, good, tokens[i]) text = "".join(tokens) + # Convert to the target orthography for letter in self._ortho_table: text = re.sub(letter, self._ortho_table[letter][lang_target], text) - if target in ("ipa", "cauc"): - re.sub("’", "ʼ", text) - return ConverterOutput(text, lang=self.lang, orig=orig, target=target) diff --git a/static/ortho_ava.txt b/static/ortho_ava.txt index d97e5b0..02b3db2 100644 --- a/static/ortho_ava.txt +++ b/static/ortho_ava.txt @@ -5,41 +5,53 @@ # # # палочка -Ӏ ӏ +#Ӏ ӏ # # # # латиница -k[ʼ’]ː kʼː -ɬː łː +(t[͜͡]?s|ʦ)[ʼ’]ː cʼː +(t[͜͡]?ʃ|ʧ)[ʼ’]ː čʼː c[ʼ’]ː cʼː č[ʼ’]ː čʼː +k[ʼ’]ː kʼː +@k[͜͡]?[ʼ’]xː kʼː +(t[͜͡]?s|ʦ)[ʼ’] cʼ +(t[͜͡]?ʃ|ʧ)[ʼ’] čʼ +t[͜͡]?ɬ[ʼ’] ḷʼ +c[ʼ’] cʼ +č[ʼ’] čʼ +k[ʼ’] kʼ +ƛ[ʼ’] ḷʼ +p[ʼ’] pʼ +q[ʼ’] qʼ +t[ʼ’] tʼ +(t[͜͡]?s|ʦ)ː cː +(t[͜͡]?ʃ|ʧ)ː čː +(t[͜͡]?s|ʦ) c +(t[͜͡]?ʃ|ʧ) č +t[͜͡]?ɬ ḷ +kː kː +@k[͜͡]?xː kː +ɬː łː +χː ẍː ʁ ğ h h ʕ ḣ -kː kː -q[ʼ’] qʼ -ƛ[ʼ’] ḷʼ -k[ʼ’] kʼ ƛ ḷ ɬ ł -p[ʼ’] pʼ -t[ʼ’] tʼ -χː ẍː q q x ẋ ħ ħ cː cː -c[ʼ’] cʼ čː čː -č[ʼ’] čʼ -je ё (? Date: Tue, 3 Feb 2026 19:47:59 +0300 Subject: [PATCH 3/6] small fix regarding the meta orthography --- static/ortho_ava.txt | 187 +++++++++++++++++++++++++++-------------- static/ortho_table.csv | 66 +++++++-------- 2 files changed, 158 insertions(+), 95 deletions(-) diff --git a/static/ortho_ava.txt b/static/ortho_ava.txt index 02b3db2..54872c7 100644 --- a/static/ortho_ava.txt +++ b/static/ortho_ava.txt @@ -1,4 +1,7 @@ # абруптивность ʼ → долгота ː → лабиализация +# +# ʼː → ٯ +# # ʼ modifier letter apostrophe # ’ right single quotation mark # @@ -10,99 +13,159 @@ # # # латиница -(t[͜͡]?s|ʦ)[ʼ’]ː cʼː -(t[͜͡]?ʃ|ʧ)[ʼ’]ː čʼː -c[ʼ’]ː cʼː -č[ʼ’]ː čʼː -k[ʼ’]ː kʼː -@k[͜͡]?[ʼ’]xː kʼː +(t[͜͡]?s|ʦ)[ʼ’]ː cٯ +(T[͜͡]?[Ss]|ʦ)[ʼ’]ː Cٯ +(t[͜͡]?ʃ|ʧ)[ʼ’]ː čٯ +(T[͜͡]?ʃ|ʧ)[ʼ’]ː Čٯ +c[ʼ’]ː cٯ +C[ʼ’]ː Cٯ +č[ʼ’]ː čٯ +Č[ʼ’]ː Čٯ +k[ʼ’]ː ƙٯ +K[ʼ’]ː Ƙٯ +@k[͜͡]?[ʼ’]xː ƙٯ +@K[͜͡]?[ʼ’]xː Ƙٯ (t[͜͡]?s|ʦ)[ʼ’] cʼ +(T[͜͡]?[Ss]|ʦ)[ʼ’] Cʼ (t[͜͡]?ʃ|ʧ)[ʼ’] čʼ +(T[͜͡]?ʃ|ʧ)[ʼ’] Čʼ t[͜͡]?ɬ[ʼ’] ḷʼ +T[͜͡]?ɬ[ʼ’] Ḷʼ c[ʼ’] cʼ +C[ʼ’] Cʼ č[ʼ’] čʼ -k[ʼ’] kʼ +Č[ʼ’] Čʼ +k[ʼ’] ƙʼ +K[ʼ’] Ƙʼ ƛ[ʼ’] ḷʼ +ƛ[ʼ’] Ḷʼ p[ʼ’] pʼ +P[ʼ’] Pʼ q[ʼ’] qʼ -t[ʼ’] tʼ -(t[͜͡]?s|ʦ)ː cː -(t[͜͡]?ʃ|ʧ)ː čː -(t[͜͡]?s|ʦ) c -(t[͜͡]?ʃ|ʧ) č -t[͜͡]?ɬ ḷ -kː kː -@k[͜͡]?xː kː +Q[ʼ’] Qʼ +t[ʼ’] تʼ +T[ʼ’] طʼ +(t[͜͡]?s|ʦ)ʰ?ːʰ? cː +(T[͜͡]?[Ss]|ʦ)ʰ?ːʰ? Cː +(t[͜͡]?ʃ|ʧ)ʰ?ːʰ? čː +(T[͜͡]?ʃ|ʧ)ʰ?ːʰ? Čː +(t[͜͡]?s|ʦ)ʰ? c +(t[͜͡]?[Ss]|ʦ)ʰ? c +(t[͜͡]?ʃ|ʧ)ʰ? č +(T[͜͡]?ʃ|ʧ)ʰ? Č +t[͜͡]?ɬʰ? ḷ +T[͜͡]?ɬʰ? Ḷ +kʰ?ːʰ? ƙː +Kʰ?ːʰ? Ƙː +@k[͜͡]?xː ƙː +@K[͜͡]?[Xx]ː Ƙː ɬː łː χː ẍː +Χː Ẍː ʁ ğ h h +H H ʕ ḣ ƛ ḷ ɬ ł -q q +qʰ? q +Qʰ? Q x ẋ +X Ẋ ħ ħ -cː cː -čː čː +Ħ Ħ +cʰ?ːʰ? cː +Cʰ?ːʰ? Cː +čʰ?ːʰ? čː +Čʰ?ːʰ? Čː (? Date: Tue, 3 Feb 2026 21:05:25 +0300 Subject: [PATCH 4/6] Standard Dargwa language added + small fixes in Avar --- converter.py | 5 + static/ortho_ava.txt | 106 +++++++++--------- static/ortho_dar.txt | 238 +++++++++++++++++++++++++++++++++++++++++ static/ortho_table.csv | 228 ++++++++++++++++++++------------------- 4 files changed, 415 insertions(+), 162 deletions(-) create mode 100644 static/ortho_dar.txt diff --git a/converter.py b/converter.py index 26a0f7e..60f5998 100644 --- a/converter.py +++ b/converter.py @@ -146,6 +146,11 @@ def convert(self, text, orig=None, target=None): ###################### + # TO-DO: Fix palochkas which are incorrectly written as I if Cyrillic + # + # + # + # Fix palochkas which are incorrectly capital but within a word text = re.sub(f"(?<=[{all_letters}])Ӏ", "ӏ", text) diff --git a/static/ortho_ava.txt b/static/ortho_ava.txt index 54872c7..a1b36c5 100644 --- a/static/ortho_ava.txt +++ b/static/ortho_ava.txt @@ -1,6 +1,6 @@ -# абруптивность ʼ → долгота ː → лабиализация +# абруптивность ʼ → долгота ː → лабиализация ʷ # -# ʼː → ٯ +# ʼː → ٮ # # ʼ modifier letter apostrophe # ’ right single quotation mark @@ -13,18 +13,18 @@ # # # латиница -(t[͜͡]?s|ʦ)[ʼ’]ː cٯ -(T[͜͡]?[Ss]|ʦ)[ʼ’]ː Cٯ -(t[͜͡]?ʃ|ʧ)[ʼ’]ː čٯ -(T[͜͡]?ʃ|ʧ)[ʼ’]ː Čٯ -c[ʼ’]ː cٯ -C[ʼ’]ː Cٯ -č[ʼ’]ː čٯ -Č[ʼ’]ː Čٯ -k[ʼ’]ː ƙٯ -K[ʼ’]ː Ƙٯ -@k[͜͡]?[ʼ’]xː ƙٯ -@K[͜͡]?[ʼ’]xː Ƙٯ +(t[͜͡]?s|ʦ)[ʼ’]ː cٮ +(T[͜͡]?[Ss]|ʦ)[ʼ’]ː Cٮ +(t[͜͡]?ʃ|ʧ)[ʼ’]ː čٮ +(T[͜͡]?ʃ|ʧ)[ʼ’]ː Čٮ +c[ʼ’]ː cٮ +C[ʼ’]ː Cٮ +č[ʼ’]ː čٮ +Č[ʼ’]ː Čٮ +k[ʼ’]ː ƙٮ +K[ʼ’]ː Ƙٮ +@k[͜͡]?[ʼ’]xː ƙٮ +@K[͜͡]?[ʼ’]xː Ƙٮ (t[͜͡]?s|ʦ)[ʼ’] cʼ (T[͜͡]?[Ss]|ʦ)[ʼ’] Cʼ (t[͜͡]?ʃ|ʧ)[ʼ’] čʼ @@ -41,8 +41,8 @@ K[ʼ’] Ƙʼ ƛ[ʼ’] Ḷʼ p[ʼ’] pʼ P[ʼ’] Pʼ -q[ʼ’] qʼ -Q[ʼ’] Qʼ +q[ʼ’] ƣ +Q[ʼ’] Ƣ t[ʼ’] تʼ T[ʼ’] طʼ (t[͜͡]?s|ʦ)ʰ?ːʰ? cː @@ -68,8 +68,8 @@ H H ʕ ḣ ƛ ḷ ɬ ł -qʰ? q -Qʰ? Q +qʰ?ː?ʰ? qː +Qʰ?ː?ʰ? Qː x ẋ X Ẋ ħ ħ @@ -80,8 +80,8 @@ Cʰ?ːʰ? Cː Čʰ?ːʰ? Čː (? Date: Wed, 4 Feb 2026 02:19:02 +0300 Subject: [PATCH 5/6] Lezgian added + small fixes for other languages --- static/ortho_ava.txt | 28 ++--- static/ortho_dar.txt | 2 - static/ortho_lez.txt | 264 +++++++++++++++++++++++++++++++++++++++++ static/ortho_table.csv | 240 +++++++++++++++++++------------------ 4 files changed, 400 insertions(+), 134 deletions(-) create mode 100644 static/ortho_lez.txt diff --git a/static/ortho_ava.txt b/static/ortho_ava.txt index a1b36c5..d916942 100644 --- a/static/ortho_ava.txt +++ b/static/ortho_ava.txt @@ -62,14 +62,16 @@ Kʰ?ːʰ? Ƙː ɬː łː χː ẍː Χː Ẍː +sː sː +Sː Sː ʁ ğ h h H H ʕ ḣ ƛ ḷ ɬ ł -qʰ?ː?ʰ? qː -Qʰ?ː?ʰ? Qː +qʰ?ː?ʰ? ق +Qʰ?ː?ʰ? ف x ẋ X Ẋ ħ ħ @@ -142,8 +144,6 @@ J[Aa] Я ʒ ž j j J J -y j -Y J # # # @@ -174,13 +174,9 @@ Y J г[Ъъ] ğ Г[Ъъ][Вв] Ğʷ Г[Ъъ] Ğ -г[Ьь]в hʷ г[Ьь] h -Г[Ьь][Вв] Hʷ Г[Ьь] H -г[Ӏӏ]в ḣʷ г[Ӏӏ] ḣ -Г[Ӏӏ][Вв] Ḣʷ Г[Ӏӏ] Ḣ к[Кк]в ƙːʷ к[Кк] ƙː @@ -210,6 +206,10 @@ Y J п[Ӏӏ] pʼ П[Ӏӏ][Вв] Pʼʷ П[Ӏӏ] Pʼ +ссв sːʷ +С[Сс][Вв] Sːʷ +сс sː +С[Сс] Sː т[Ӏӏ]в تʼʷ т[Ӏӏ] تʼ Т[Ӏӏ][Вв] طʼʷ @@ -218,10 +218,10 @@ Y J х[Хх] ẍː Х[Хх][Вв] Ẍːʷ Х[Хх] Ẍː -х[Ъъ]в qːʷ -х[Ъъ] qː -Х[Ъъ][Вв] Qːʷ -Х[Ъъ] Qː +х[Ъъ]в قʷ +х[Ъъ] ق +Х[Ъъ][Вв] فʷ +Х[Ъъ] ف х[Ьь]в ẋʷ х[Ьь] ẋ Х[Ьь][Вв] Ẋʷ @@ -261,8 +261,6 @@ Y J А A б b Б B -в v -В V гв gʷ г g Г[Вв] Gʷ @@ -343,5 +341,7 @@ Y J [ЪъЬь]?Ю Ю [ъь]?я я [ЪъЬь]?Я Я +в v +В V ӏ ɂ Ӏ Ɂ \ No newline at end of file diff --git a/static/ortho_dar.txt b/static/ortho_dar.txt index 0784f5e..ee9485c 100644 --- a/static/ortho_dar.txt +++ b/static/ortho_dar.txt @@ -102,8 +102,6 @@ J[Aa] Я ʒ ž j j J J -y j -Y J # # # diff --git a/static/ortho_lez.txt b/static/ortho_lez.txt new file mode 100644 index 0000000..91b8f73 --- /dev/null +++ b/static/ortho_lez.txt @@ -0,0 +1,264 @@ +# абруптивность ʼ → долгота ː → лабиализация ʷ +# +# ʼː → ٮ +# +# ʼ modifier letter apostrophe +# ’ right single quotation mark +# +# +# +# палочка +#Ӏ ӏ +# +# +# +# латиница +(t[͜͡]?ʃ|ʧ)[ʼ’]ː čٮ +(T[͜͡]?ʃ|ʧ)[ʼ’]ː Čٮ +č[ʼ’]ː čٮ +Č[ʼ’]ː Čٮ +k[ʼ’]ː ƙٮ +K[ʼ’]ː Ƙٮ +@k[͜͡]?[ʼ’]xː ƙٮ +@K[͜͡]?[ʼ’]xː Ƙٮ +(t[͜͡]?s|ʦ)[ʼ’] cʼ +(T[͜͡]?[Ss]|ʦ)[ʼ’] Cʼ +(t[͜͡]?ʃ|ʧ)[ʼ’] čʼ +(T[͜͡]?ʃ|ʧ)[ʼ’] Čʼ +c[ʼ’] cʼ +C[ʼ’] Cʼ +č[ʼ’] čʼ +Č[ʼ’] Čʼ +k[ʼ’] ƙʼ +K[ʼ’] Ƙʼ +p[ʼ’] pʼ +P[ʼ’] Pʼ +q[ʼ’] ƣ +Q[ʼ’] Ƣ +t[ʼ’] تʼ +T[ʼ’] طʼ +(t[͜͡]?s|ʦ)ʰ? c +(t[͜͡]?[Ss]|ʦ)ʰ? c +(t[͜͡]?ʃ|ʧ)ʰ? č +(T[͜͡]?ʃ|ʧ)ʰ? Č +ʁ ğ +h h +H H +qʰ q +Qʰ Q +q ق +Q ف +x ẋ +X Ẋ +(? Date: Wed, 4 Feb 2026 16:25:40 +0300 Subject: [PATCH 6/6] Update README.md --- README.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 026e473..a6bc983 100644 --- a/README.md +++ b/README.md @@ -1 +1,19 @@ -This is for transliterators + # Transliterators for Caucasian languages + + ### Languages available + - East Caucasian + - Avar [ava] + - Standard Dargwa [dar] + - Lezgian [lez] + + ### To-do + - East Caucasian + - Botlikh + - Godoberi + - Chamalal + - Chechen + - Hinuq + - Ingush + - Khwarshi + - Lak + - Tsakhur