import csv
import os
import re

current_folder = os.path.dirname(os.path.abspath(__file__))
ortho_table_path = os.path.join(current_folder, "static", "ortho_table.csv")
ortho_txt_default_name = "ortho"

# Bodies of regex character classes; always used as f"[{all_letters}]" etc.
all_letters = "A-Za-zÀ-ÖØ-öø-ӿԀ-ԯⷩ"
# Raw string so the regex escapes are not (invalid) Python string escapes.
# Ranges: ASCII punctuation, U+00A0-U+00BF, U+2010-U+205E.
# NOTE(review): the original literal held an invisible character before "-¿";
# it is presumed to be U+00A0 (no-break space) and is spelled out here.
punct = r"\!-/:-@\[-`{-~\u00a0-\u00bf\u2010-\u205e"

#cyr_check = "[ПпБДдЛлЖжШшЩщФфЦцЧчИиЙйЬьЪъЫыЭэЮюЯя]"
#lat_check = "[SsVvFfIiGgZzQqNR]"

# Accepted spellings of the Cyrillic target; the first one is the canonical
# name used as the column suffix in the orthography table.
_cyr = ["cyr", "cyrillic", "кир", "кириллица"]
#_lat = ["lat", "latin", "лат", "латиница"]
other_targets = ["ipa", "cauc"]

possible_targets = _cyr + other_targets
recommended_targets = [_cyr[0]] + other_targets


def raise_error_wrong_argument(token, recommended):
    """Raise a ValueError saying that argument `token` must be one of `recommended`."""
    raise ValueError(f"Invalid '{token}' argument given! Should be one of these: {recommended}")


def _canonical_target(target):
    """Map any Cyrillic alias to the canonical name; return other targets unchanged."""
    return _cyr[0] if target in _cyr else target


def _read_ortho_table(path):
    """Read the orthography CSV at `path`.

    Returns a tuple ``(table, columns)`` where ``table`` maps the first cell
    of every data row to a ``{column header: cell}`` dict and ``columns`` is
    the list of headers after the first one.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        # csv.reader yields [] for blank lines; those carry no data.
        rows = [row for row in csv.reader(f, delimiter=",") if row]
    if not rows:
        raise ValueError(f"Orthography table '{path}' is empty!")
    headers, *body = rows
    columns = headers[1:]
    table = {row[0]: dict(zip(columns, row[1:])) for row in body}
    return table, columns


def _read_ortho_rules(path):
    """Read ``pattern<TAB>replacement`` rules from `path`, in file order.

    Blank lines and lines starting with ``#`` are skipped. Any other line
    that does not contain exactly one tab raises ValueError.
    """
    rules = {}
    with open(path, "r", encoding="utf-8-sig") as f:
        for number, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip("\r\n")
            if not line or line.startswith("#"):
                continue
            bad, tab, good = line.partition("\t")
            if not tab or "\t" in good:
                raise ValueError(
                    f"Malformed rule in '{path}', line {number}: "
                    "expected 'pattern<TAB>replacement'")
            rules[bad] = good
    return rules


class ConverterOutput:
    """The output of the Converter.convert() function."""

    def __init__(self, text, lang, orig, target):
        """
        Parameters
        ----------
        text
            The converted text.
        lang
            Language code the converter was created with.
        orig
            Name of the source alphabet.
        target
            Name of the target alphabet.
        """

        self.text = text
        self.lang = lang
        self.orig = orig
        self.target = target


    def __repr__(self):
        return self.text


    def full(self):
        """Returns a string with the converted text and all its settings."""
        return f"ConverterOutput(\n\ttext='{self.text}',\n\tlang={self.lang}, orig={self.orig}, target={self.target}\n)"


class Converter:
    """The main class to convert from alphabet X to alphabet Y.

    Make a `Converter` object for a language.
    Optionally, set the default target alphabet as an argument:
    > c = Converter("ava", target="cyr")

    Then use the method `convert` to change the alphabet of a text:
    > converted_text = c.convert(original_text, orig="ipa")
    """

    def __init__(self, lang, orig=None, target=None):
        """
        Parameters
        ----------
        lang
            Language code; must be the prefix (the part before "_") of a
            column header in the orthography table.
        orig
            Default source alphabet, one of `possible_targets`, or None.
        target
            Default target alphabet, one of `possible_targets`, or None.

        Raises
        ------
        ValueError
            If `lang`, `orig` or `target` is not an accepted value, or if a
            data file is malformed.
        """

        self._ortho_table, columns = _read_ortho_table(ortho_table_path)

        possible_langs = sorted({column.split("_")[0] for column in columns})
        if lang not in possible_langs:
            raise_error_wrong_argument("lang", possible_langs)
        self.lang = lang

        if orig is not None and orig not in possible_targets:
            raise_error_wrong_argument("orig", recommended_targets)
        self.orig = orig

        if target is None:
            self._lang_target = None
        elif target in possible_targets:
            self._lang_target = f"{self.lang}_{_canonical_target(target)}"
        else:
            raise_error_wrong_argument("target", recommended_targets)
        self.target = target

        rules_path = os.path.join(current_folder, "static", f"ortho_{lang}.txt")
        self._ortho_to_meta = _read_ortho_rules(rules_path)


    def __repr__(self):
        return f"Converter(lang={self.lang}, orig={self.orig}, target={self.target})"


    def convert(self, text, orig=None, target=None):
        """
        Converts a text.

        Parameters
        ----------
        text
            The text to convert. None is passed through as None.
        orig
            Source alphabet; falls back to the default given to the
            constructor.
        target
            Target alphabet; falls back to the default given to the
            constructor. Every alias of the Cyrillic target is accepted.

        Returns
        -------
        ConverterOutput, or None if `text` is None.

        Raises
        ------
        ValueError
            If `orig` or `target` is missing or not an accepted value.
        """
        if text is None:
            return None

        if orig is None:
            if self.orig is None:
                raise ValueError("No 'orig' value given!")
            orig = self.orig
        elif orig not in possible_targets:
            raise_error_wrong_argument("orig", recommended_targets)

        lang_target = None
        if target is None:
            if self._lang_target is None:
                raise ValueError("No 'target' value given!")
            lang_target = self._lang_target
            target = self.target
        elif target in possible_targets:
            # Table columns exist only under the canonical name, so aliases
            # such as "cyrillic" must be normalised before the lookup.
            lang_target = f"{self.lang}_{_canonical_target(target)}"
        else:
            raise_error_wrong_argument("target", recommended_targets)

        ######################

        # TO-DO: Fix palochkas which are incorrectly written as I if Cyrillic

        # Fix palochkas which are incorrectly capital but within a word
        text = re.sub(f"(?<=[{all_letters}])Ӏ", "ӏ", text)

        # Fix palochkas which are written as '1' or '|'
        text = re.sub(f"(?<=[{all_letters}])[|1]|[|1](?=[{all_letters}])", "ӏ", text)

        # Split into tokens
        tokens = re.findall(f"[{all_letters}]+|[{punct}]+|[0-9]+|[^{all_letters}{punct}0-9]+", text)

        # Convert to the meta-orthography; only word tokens are rewritten,
        # and the rules are applied in file order.
        is_word = re.compile(f"[{all_letters}]+").fullmatch
        converted = []
        for token in tokens:
            if is_word(token):
                for bad, good in self._ortho_to_meta.items():
                    token = re.sub(bad, good, token)
            converted.append(token)
        text = "".join(converted)

        # Convert to the target orthography
        for letter, columns in self._ortho_table.items():
            text = re.sub(letter, columns[lang_target], text)

        return ConverterOutput(text, lang=self.lang, orig=orig, target=target)
+č[ʼ’]ː čٮ +Č[ʼ’]ː Čٮ +k[ʼ’]ː ƙٮ +K[ʼ’]ː Ƙٮ +@k[͜͡]?[ʼ’]xː ƙٮ +@K[͜͡]?[ʼ’]xː Ƙٮ +(t[͜͡]?s|ʦ)[ʼ’] cʼ +(T[͜͡]?[Ss]|ʦ)[ʼ’] Cʼ +(t[͜͡]?ʃ|ʧ)[ʼ’] čʼ +(T[͜͡]?ʃ|ʧ)[ʼ’] Čʼ +t[͜͡]?ɬ[ʼ’] ḷʼ +T[͜͡]?ɬ[ʼ’] Ḷʼ +c[ʼ’] cʼ +C[ʼ’] Cʼ +č[ʼ’] čʼ +Č[ʼ’] Čʼ +k[ʼ’] ƙʼ +K[ʼ’] Ƙʼ +ƛ[ʼ’] ḷʼ +ƛ[ʼ’] Ḷʼ +p[ʼ’] pʼ +P[ʼ’] Pʼ +q[ʼ’] ƣ +Q[ʼ’] Ƣ +t[ʼ’] تʼ +T[ʼ’] طʼ +(t[͜͡]?s|ʦ)ʰ?ːʰ? cː +(T[͜͡]?[Ss]|ʦ)ʰ?ːʰ? Cː +(t[͜͡]?ʃ|ʧ)ʰ?ːʰ? čː +(T[͜͡]?ʃ|ʧ)ʰ?ːʰ? Čː +(t[͜͡]?s|ʦ)ʰ? c +(t[͜͡]?[Ss]|ʦ)ʰ? c +(t[͜͡]?ʃ|ʧ)ʰ? č +(T[͜͡]?ʃ|ʧ)ʰ? Č +t[͜͡]?ɬʰ? ḷ +T[͜͡]?ɬʰ? Ḷ +kʰ?ːʰ? ƙː +Kʰ?ːʰ? Ƙː +@k[͜͡]?xː ƙː +@K[͜͡]?[Xx]ː Ƙː +ɬː łː +χː ẍː +Χː Ẍː +sː sː +Sː Sː +ʁ ğ +h h +H H +ʕ ḣ +ƛ ḷ +ɬ ł +qʰ?ː?ʰ? ق +Qʰ?ː?ʰ? ف +x ẋ +X Ẋ +ħ ħ +Ħ Ħ +cʰ?ːʰ? cː +Cʰ?ːʰ? Cː +čʰ?ːʰ? čː +Čʰ?ːʰ? Čː +(?