diff --git a/docs/FrequencyListFormats.md b/docs/FrequencyListFormats.md new file mode 100644 index 00000000..034957fa --- /dev/null +++ b/docs/FrequencyListFormats.md @@ -0,0 +1,62 @@ +# Example frequency list formats + + +#### Study plan frequency list created with Readability Analyzer +``` +#study_plan_frequency 1.0 +мистер мистер мистер мистер UNKNOWN UNKNOWN [morph_freq 99992, master_freq 99992] +похоже похоже похоже похоже UNKNOWN UNKNOWN [morph_freq 71157, master_freq 71157] +доктор доктор доктор доктор UNKNOWN UNKNOWN [morph_freq 67918, master_freq 67918] +. +. +. +``` + +#### Frequency report format +This is generated by Readability Analyzer for morph_freq_report.txt or instance_freq_report.txt +Note the new header, which is automatically created by the new Morphman version. Add the header manually if needed. + +``` +#frequency_report 1.0 +1401 я я я UNKNOWN UNKNOWN 1 1 3.40098073 3.40098073 matches 1 +1139 не не не UNKNOWN UNKNOWN 2 2 2.76496577 6.16594650 matches 1 +992 в в в UNKNOWN UNKNOWN 3 3 2.40811769 8.57406418 matches 1 +798 что что что UNKNOWN UNKNOWN 4 4 1.93717532 10.51123950 matches 1 +798 и и и UNKNOWN UNKNOWN 4 5 1.93717532 12.44841482 matches 1 +660 это это это UNKNOWN UNKNOWN 5 6 1.60217507 14.05058989 matches 1 +. +. +``` + +#### Custom frequency list type, which also has a frequency count +``` +#HEADERTYPE_count_word +5081568 я +4334804 не +3552532 что +2953981 в +2917112 и +2723798 ты +. +. +``` +This is handy if you want to use frequency lists from external sources. + +#### List consisting of a single word per line without frequency +This is the fall-back format +``` +の +だ +は +た +に +を +が +する +ない +. +. +. 
+``` + +If you need to add a custom format, it can be done quite easily by editing `loadFrequencyList` in language.py \ No newline at end of file diff --git a/docs/MultiLanguage.md b/docs/MultiLanguage.md new file mode 100644 index 00000000..dc05645e --- /dev/null +++ b/docs/MultiLanguage.md @@ -0,0 +1,26 @@ +# Multi-language support + +If you want to study multiple languages but keep using the same Anki profile, it's now possible with the new multi-language support. This means that each target language has its own frequency list and databases, neatly separated into distinct files. + + +## Setup + +The *Morphman Preferences -> Note Filter* window now has a new column for selecting the target language for each filter. By selecting, for example, '*Japanese*', Morphman will use the following files for that specific filter: + + - frequency_Japanese.txt + - all_Japanese.db + - seen_Japanese.db + - known_Japanese.db + - mature_Japanese.db + - external_Japanese.db + - priority_Japanese.db + +The **Default** setting means that Morphman will keep using the existing frequency.txt, all.db, known.db, ... files for backwards compatibility when processing that filter. + +The language list is currently hard-coded, but if you need to add a new one you can do that easily by editing *preferences.py*, or by selecting **Other**. In the latter case Morphman would use files such as *known_Other.db* + +If you have existing **frequency.txt** and **external.db** files, you can rename them to reflect the target language (e.g. *frequency_Japanese.txt* and *external_Japanese.db*). You can then delete the rest of the database files and do a Recalc. + +## Changes in Readability Analyzer + +When using *Readability Analyzer* you must now explicitly select both the Known and Mature database files (because it will no longer try to infer the mature morph database file name from the known database file name). 
If generating frequency lists the output file name is currently fixed *frequency.txt* so you will need to manually rename it for the specific language. \ No newline at end of file diff --git a/morph/UI/__init__.py b/morph/UI/__init__.py index 4756ee06..18ecc309 100644 --- a/morph/UI/__init__.py +++ b/morph/UI/__init__.py @@ -1,3 +1,4 @@ # pylint: disable=W0611 from .morphemizerComboBox import MorphemizerComboBox +from .languageComboBox import LanguageComboBox diff --git a/morph/UI/languageComboBox.py b/morph/UI/languageComboBox.py new file mode 100644 index 00000000..c483cbcf --- /dev/null +++ b/morph/UI/languageComboBox.py @@ -0,0 +1,31 @@ + +from PyQt6.QtWidgets import QComboBox + + +class LanguageComboBox(QComboBox): + + def setLanguages(self, languages): + if type(languages) == list: + self.languages = languages + else: + self.languages = ['Default'] + + for language in self.languages: + self.addItem(language) + + self.setCurrentIndex(0) + + def getCurrent(self): + try: + return self.languages[self.currentIndex()] + except IndexError: + return None + + def setCurrentByName(self, name): + active = False + for i, language in enumerate(self.languages): + if language == name: + active = i + if active: + self.setCurrentIndex(active) + diff --git a/morph/config.py b/morph/config.py index 2373e542..4ff0ca33 100644 --- a/morph/config.py +++ b/morph/config.py @@ -5,13 +5,13 @@ # 4th (lowest) priority default = { 'path_dbs': os.path.join(mw.pm.profileFolder(), 'dbs'), - 'path_priority': os.path.join(mw.pm.profileFolder(), 'dbs', 'priority.db'), - 'path_ext': os.path.join(mw.pm.profileFolder(), 'dbs', 'external.db'), - 'path_frequency': os.path.join(mw.pm.profileFolder(), 'dbs', 'frequency.txt'), - 'path_all': os.path.join(mw.pm.profileFolder(), 'dbs', 'all.db'), - 'path_mature': os.path.join(mw.pm.profileFolder(), 'dbs', 'mature.db'), - 'path_known': os.path.join(mw.pm.profileFolder(), 'dbs', 'known.db'), - 'path_seen': os.path.join(mw.pm.profileFolder(), 'dbs', 
'seen.db'), + 'path_priority': os.path.join(mw.pm.profileFolder(), 'dbs', 'priority%s.db'), + 'path_ext': os.path.join(mw.pm.profileFolder(), 'dbs', 'external%s.db'), + 'path_frequency': os.path.join(mw.pm.profileFolder(), 'dbs','frequency%s.txt'), + 'path_all': os.path.join(mw.pm.profileFolder(), 'dbs', 'all%s.db'), + 'path_mature': os.path.join(mw.pm.profileFolder(), 'dbs', 'mature%s.db'), + 'path_known': os.path.join(mw.pm.profileFolder(), 'dbs', 'known%s.db'), + 'path_seen': os.path.join(mw.pm.profileFolder(), 'dbs', 'seen%s.db'), 'path_log': os.path.join(mw.pm.profileFolder(), 'morphman.log'), 'path_stats': os.path.join(mw.pm.profileFolder(), 'morphman.stats'), diff --git a/morph/graphs.py b/morph/graphs.py index 6a16a4fc..a1ce46ad 100644 --- a/morph/graphs.py +++ b/morph/graphs.py @@ -8,6 +8,7 @@ from .morphemes import AnkiDeck from .preferences import get_preference as cfg from .util import mw +from .language import getAllDb colYoung = "#7c7" colCard = "#282" @@ -221,7 +222,8 @@ def get_stats(self, db_table, bucket_size_days, day_cutoff_seconds, num_buckets= if not all_reviews_for_bucket: return stats_by_name - all_db = util.allDb() + # TODO! 
Process all.db for each language + all_db = getAllDb("Default") nid_to_morphs = defaultdict(set) for m, ls in all_db.db.items(): diff --git a/morph/language.py b/morph/language.py new file mode 100644 index 00000000..645e5ad7 --- /dev/null +++ b/morph/language.py @@ -0,0 +1,131 @@ +from aqt.utils import tooltip +import os +import io +import csv +import itertools +from .preferences import get_preference as cfg +from .morphemes import Morpheme + +# Each language has its own MorphDb +_allDb = {} + +class FrequencyList: + def __init__(self): + self.map = dict() + self.len = 0 + self.has_morphemes = False + self.has_frequency_count = False + self.master_total_instances = 0 + +def getLanguageList(): + languages = set() + rowData = cfg('Filter') + try: + for row in rowData: + languages.add(row['Language']) + except: + # language per filter not yet configured + pass + if len(languages) == 0: + languages.add('Default') + return list(languages) + + +def getPathByLanguage(path, language): + if (language == 'Default'): + return path % ('') + else: + return path % ('_' + language ) + + +def getAllDb(language): + global _allDb + + # Force reload if all.db got deleted + all_db_path = getPathByLanguage(cfg('path_all'),language) + reload = not os.path.isfile(all_db_path) + + if reload or (language not in _allDb): + from .morphemes import MorphDb + _allDb[language] = MorphDb(all_db_path, ignoreErrors=True) + return _allDb[language] + + +def getTotalKnownSet(): + from .morphemes import MorphDb + + # Load known.db and get total morphemes known + totalVariations = 0 + totalKnown = 0 + languages = getLanguageList() + for language in languages: + known_db = MorphDb(getPathByLanguage(cfg('path_known'),language), ignoreErrors=True) + totalVariations += len(known_db.db) + totalKnown += len(known_db.groups) + + d = {'totalVariations': totalVariations, 'totalKnown': totalKnown} + return d + + +""" +See docs/FrequencyListFormats.md for specific info about the file format +""" +def 
loadFrequencyList(frequencyListPath, force_morphemes=False): + + print("Loading Frequency List for file %s.." % frequencyListPath) + fl = FrequencyList() + + try: + with io.open(frequencyListPath, encoding='utf-8-sig') as csvfile: + csvreader = csv.reader(csvfile, delimiter="\t") + rows = [row for row in csvreader] + print("First line: [%s]" % rows[0][0]) + + if rows[0][0] == "#study_plan_frequency": + print("Detected Study plan frequency format") + fl.has_morphemes = True + fl.map = dict( + zip([Morpheme(row[0], row[1], row[2], row[3], row[4], row[5]) for row in rows[1:]], + itertools.count(0))) + + elif rows[0][0] == "#frequency_report": + print("Detected Frequency report format") + fl.has_morphemes = True + fl.has_frequency_count = True + for row in rows[1:]: + fl.map[ Morpheme(row[1], row[2], row[2], row[3], row[4], row[5]) ] = int(row[0]) + + elif rows[0][0] == "#HEADERTYPE_count_word": + print("Detected frequency + word format") + fl.has_frequency_count = True + if force_morphemes: + fl.has_morphemes = True + for row in rows[1:]: + fl.map[ Morpheme(row[1], row[1], row[1], row[1], "UNKNOWN","UNKNOWN") ] = int(row[0]) + else: + for row in rows[1:]: + fl.map[ row[1] ] = int(row[0]) + else: + print("Assuming one-word-per-line format") + if force_morphemes: + fl.has_morphemes = True + fl.map = dict(zip([Morpheme(row[1], row[1], row[1], row[1], "UNKNOWN","UNKNOWN") for row in rows], itertools.count(0))) + else: + fl.map = dict(zip([row[0] for row in rows], itertools.count(0))) + + fl.len = len(fl.map) + if fl.has_frequency_count: + fl.master_total_instances = sum(fl.map.values()) + + except (FileNotFoundError, IndexError) as e: + err = "Warning! 
Couldn't not read frequency list %s" % (frequencyListPath) + print(err) + tooltip(err) + pass + + return fl + +def loadFrequencyListByLanguage(language): + frequencyListPath = getPathByLanguage(cfg('path_frequency'), language) + return loadFrequencyList(frequencyListPath) + diff --git a/morph/main.py b/morph/main.py index 9148d715..8d92e1cc 100644 --- a/morph/main.py +++ b/morph/main.py @@ -20,6 +20,7 @@ from .util import printf, mw, errorMsg, getFilterByMidAndTags, getReadEnabledModels, getModifyEnabledModels from .preferences import get_preference as cfg, get_preferences from .util_external import memoize +from .language import * # hack: typing is compile time anyway, so, nothing bad happens if it fails, the try is to support anki < 2.1.16 try: @@ -118,7 +119,7 @@ def notesToUpdate(last_updated, included_mids): return mw.col.db.execute(query) -def mkAllDb(all_db=None): +def mkAllDb(all_db, language): from . import config importlib.reload(config) t_0, db, TAG = time.time(), mw.col.db, mw.col.tags @@ -127,7 +128,6 @@ def mkAllDb(all_db=None): # for providing an error message if there is no note that is used for processing N_enabled_notes = 0 - if not all_db: all_db = MorphDb() @@ -164,6 +164,10 @@ def mkAllDb(all_db=None): if mid_cfg is None: continue + if 'Language' in mid_cfg: + if (mid_cfg['Language'] != language): + continue + N_enabled_notes += 1 mName = mid_cfg['Morphemizer'] @@ -230,7 +234,7 @@ def filterDbByMat(db, mat): return newDb -def updateNotes(allDb): +def updateNotes(allDb, language): t_0, now, db = time.time(), int_time(), mw.col.db TAG = mw.col.tags # type: TagManager @@ -252,30 +256,10 @@ def updateNotes(allDb): knownDb = filterDbByMat(allDb, cfg('threshold_known')) matureDb = filterDbByMat(allDb, cfg('threshold_mature')) mw.progress.update(label='Loading priority.db') - priorityDb = MorphDb(cfg('path_priority'), ignoreErrors=True) - - mw.progress.update(label='Loading frequency.txt') - frequencyListPath = cfg('path_frequency') - frequency_map = 
{} - frequency_has_morphemes = False - - try: - with io.open(frequencyListPath, encoding='utf-8-sig') as csvfile: - csvreader = csv.reader(csvfile, delimiter="\t") - rows = [row for row in csvreader] - - if rows[0][0] == "#study_plan_frequency": - frequency_has_morphemes = True - frequency_map = dict( - zip([Morpheme(row[0], row[1], row[2], row[3], row[4], row[5]) for row in rows[1:]], - itertools.count(0))) - else: - frequency_map = dict(zip([row[0] for row in rows], itertools.count(0))) - - except (FileNotFoundError, IndexError) as e: - pass + priorityDb = MorphDb(getPathByLanguage(cfg('path_priority'),language), ignoreErrors=True) - frequencyListLength = len(frequency_map) + mw.progress.update(label='Loading frequency list') + frequency_list = loadFrequencyListByLanguage(language) # prefetch cfg for fields field_focus_morph = cfg('Field_FocusMorph') @@ -352,6 +336,10 @@ def updateNotes(allDb): if notecfg is None or not notecfg['Modify']: continue + if 'Language' in notecfg: + if (notecfg['Language'] != language): + continue + # Get all morphemes for note morphemes = set() for fieldName in notecfg['Fields']: @@ -408,16 +396,16 @@ def updateNotes(allDb): isPriority = True usefulness += priorityDbWeight - if frequency_has_morphemes: - focusMorphIndex = frequency_map.get(focusMorph, -1) + if frequency_list.has_morphemes: + focusMorphIndex = frequency_list.map.get(focusMorph, -1) else: - focusMorphIndex = frequency_map.get(focusMorph.base, -1) + focusMorphIndex = frequency_list.map.get(focusMorph.base, -1) if focusMorphIndex >= 0: isFrequency = True # The bigger this number, the lower mmi becomes - usefulness += int(round( frequencyBonus * (1 - focusMorphIndex / frequencyListLength) )) + usefulness += int(round( frequencyBonus * (1 - focusMorphIndex / frequency_list.len) )) # average frequency of unknowns (ie. 
how common the word is within your collection) F_k_avg = F_k // N_k if N_k > 0 else F_k @@ -559,10 +547,10 @@ def updateNotes(allDb): if cfg('saveDbs'): mw.progress.update(label='Saving all/seen/known/mature dbs') - allDb.save(cfg('path_all')) - seenDb.save(cfg('path_seen')) - knownDb.save(cfg('path_known')) - matureDb.save(cfg('path_mature')) + allDb.save(getPathByLanguage(cfg('path_all'),language)) + seenDb.save(getPathByLanguage(cfg('path_seen'),language)) + knownDb.save(getPathByLanguage(cfg('path_known'),language)) + matureDb.save(getPathByLanguage(cfg('path_mature'),language)) printf('Updated %d notes + saved dbs in %f sec' % (N_notes, time.time() - t_0)) mw.progress.finish() @@ -575,35 +563,36 @@ def main(): pr = cProfile.Profile() pr.enable() - # load existing all.db - mw.progress.start(label='Loading existing all.db', immediate=True) - t_0 = time.time() - cur = util.allDb() if cfg('loadAllDb') else None - printf('Loaded all.db in %f sec' % (time.time() - t_0)) - mw.progress.finish() - - # update all.db - allDb = mkAllDb(cur) - # there was an (non-critical-/non-"exception"-)error but error message was already displayed - if not allDb: + # load existing all.db for each language (all.db for default language, all_Japanese.db for Japanese etc..) 
+ languages = getLanguageList() + for language in languages: + fname = getPathByLanguage('all%s.db', language) + mw.progress.start(label='Loading existing %s' % fname, immediate=True) + t_0 = time.time() + cur = getAllDb(language) if cfg('loadAllDb') else None + printf('Loaded %s in %f sec' % (fname, time.time() - t_0)) mw.progress.finish() - return - # merge in external.db - mw.progress.start(label='Merging ext.db', immediate=True) - ext = MorphDb(cfg('path_ext'), ignoreErrors=True) - allDb.merge(ext) - mw.progress.finish() + # update all.db + allDb = mkAllDb(cur, language) + # there was an (non-critical-/non-"exception"-)error but error message was already displayed + if not allDb: + mw.progress.finish() + else: + + # merge in external.db (or external_Japanese.db, external_French.db etc) + mw.progress.start(label='Merging ext.db', immediate=True) + ext = MorphDb(getPathByLanguage(cfg('path_ext'),language), ignoreErrors=True) + allDb.merge(ext) + mw.progress.finish() - # update notes - knownDb = updateNotes(allDb) + # update notes + knownDb = updateNotes(allDb, language) - # update stats and refresh display - stats.updateStats(knownDb) - mw.toolbar.draw() + # update stats and refresh display + stats.updateStats(knownDb) + mw.toolbar.draw() - # set global allDb - util._allDb = allDb # finish------------------- if doProfile: diff --git a/morph/morphemizer.py b/morph/morphemizer.py index fcf2a0c3..95b59895 100644 --- a/morph/morphemizer.py +++ b/morph/morphemizer.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import re +import unicodedata from functools import lru_cache from .morphemes import Morpheme @@ -52,7 +53,7 @@ def getAllMorphemizers(): # type: () -> [Morphemizer] global morphemizers if morphemizers is None: - morphemizers = [SpaceMorphemizer(), MecabMorphemizer(), JiebaMorphemizer(), CjkCharMorphemizer()] + morphemizers = [SpaceMorphemizer(), MecabMorphemizer(), JiebaMorphemizer(), CjkCharMorphemizer(), DeaccentMorphemizer()] for m in morphemizers: 
morphemizers_by_name[m.getName()] = m @@ -111,6 +112,50 @@ def getDescription(self): return 'Language w/ Spaces' + +#################################################################################################### +# Morphemizer that removes accents. This is especially useful for learning Russian. +# Some of the learning material might use words with accent marks (молоко́) for emphasis when usually they +# are omitted in literature and subtitles (молоко). +# When using the default SpaceMorphemizer these two words would be regarded as different words so you might +# end up wasting a lot of time. +# With DeaccentMorphemizer all accents are removed, avoiding this annoyance. +# +# WARNING! There are some words which DO have identical writing but different emphasis (for example +# замо́к = lock and за́мок = castle) but this is a fairly rare situation. If all of your cards have accent +# markings, it's better to use SpaceMorphemizer so you will be forced to learn both meanings for these words:) +#################################################################################################### + +ACCENT_MAPPING = { + 'а́': 'а', + 'е́': 'е', + 'и́': 'и', + 'о́': 'о', + 'у́': 'у', + 'ы́': 'ы', + 'э́': 'э', + 'ю́': 'ю', + 'я́': 'я', +} +ACCENT_MAPPING = {unicodedata.normalize('NFKC', i): j for i, j in ACCENT_MAPPING.items()} + +def deaccentify(s): + source = unicodedata.normalize('NFKC', s) + for old, new in ACCENT_MAPPING.items(): + source = source.replace(old, new) + return source + +class DeaccentMorphemizer(Morphemizer): + + def _getMorphemesFromExpr(self, expression): + word_list = [deaccentify(word.lower()) + for word in re.findall(r"\b[^\s\d]+\b", expression, re.UNICODE)] + return [Morpheme(word, word, word, word, 'UNKNOWN', 'UNKNOWN') for word in word_list] + + def getDescription(self): + return 'Deaccented words w/ spaces' + + #################################################################################################### # CJK Character 
Morphemizer #################################################################################################### diff --git a/morph/newMorphHelper.py b/morph/newMorphHelper.py index 51b75750..ca4f80fc 100644 --- a/morph/newMorphHelper.py +++ b/morph/newMorphHelper.py @@ -23,8 +23,9 @@ from aqt.utils import tooltip from . import main -from .util import mw, allDb +from .util import mw from .preferences import get_preference as cfg +from .language import getAllDb, getPathByLanguage, loadFrequencyListByLanguage assert isinstance(mw, aqt.main.AnkiQt) @@ -266,14 +267,6 @@ def nonSpanSub(sub, repl, string): return ''.join(re.sub(sub, repl, s, flags=re.IGNORECASE) if not s.startswith(')', string)) - frequency_list_path = cfg('path_frequency') - try: - with codecs.open(frequency_list_path, encoding='utf-8') as f: - frequency_list = [line.strip().split('\t')[0] for line in f.readlines()] - except: - frequency_list = [] - - priority_db = main.MorphDb(cfg('path_priority'), ignoreErrors=True).db note = ctx.note() tags = note.stringTags() @@ -284,12 +277,18 @@ def nonSpanSub(sub, repl, string): if morphemizer is None: return txt + language = filter['Language'] + allDb = getAllDb(language) + + frequency_list = loadFrequencyListByLanguage(language).keys() + priority_db = main.MorphDb(getPathByLanguage(cfg('path_priority'),language), ignoreErrors=True).db + ms = getMorphemes(morphemizer, txt, tags) proper_nouns_known = cfg('Option_ProperNounsAlreadyKnown') for m in sorted(ms, key=lambda x: len(x.inflected), reverse=True): # largest subs first - locs = allDb().getMatchingLocs(m) + locs = allDb.getMatchingLocs(m) mat = max(loc.maturity for loc in locs) if locs else 0 if proper_nouns_known and m.isProperNoun(): diff --git a/morph/preferences.py b/morph/preferences.py index d89e19d7..1cb8c76e 100644 --- a/morph/preferences.py +++ b/morph/preferences.py @@ -110,13 +110,16 @@ def jcfg_default(): 'Tag_TooLong': 'mm_tooLong', # set if sentence is above optimal length range 'Tag_Frequency': 
'mm_frequency', # set if sentence is above optimal length range + # Language list + 'Languages' : ['Default','Japanese','Chinese','Korean','German','French','Russian','Other'], + # filter for cards that should be analyzed, higher entries have higher priority 'Filter': [ # note type (None means all note types), list of tags, list of morph fields for this note type -> morphemizer, analyze only or modify? {'Type': 'SubtitleMemorize', 'TypeId': None, 'Tags': ['japanese'], 'Fields': ['Expression'], - 'Morphemizer': 'MecabMorphemizer', 'Read': True, 'Modify': True}, + 'Language': 'Default', 'Morphemizer': 'MecabMorphemizer', 'Read': True, 'Modify': True}, {'Type': 'SubtitleMemorize', 'TypeId': None, 'Tags': [], 'Fields': ['Expression'], - 'Morphemizer': 'SpaceMorphemizer', 'Read': True, 'Modify': True}, + 'Language': 'Default', 'Morphemizer': 'SpaceMorphemizer', 'Read': True, 'Modify': True}, ], # This field lets you dictate string-to-morpheme conversions. This is useful for cases @@ -160,6 +163,8 @@ def jcfg_default(): # Readability Analyzer options 'Option_AnalysisInputPath': '', 'Option_MasterFrequencyListPath': '', + 'Option_KnownMorphListPath': '', + 'Option_MatureMorphListPath': '', 'Option_DefaultMinimumMasterFrequency': 0, 'Option_DefaultStudyTarget': 98.0, 'Option_OptimalMasterTarget': 0.0, diff --git a/morph/preferencesDialog.py b/morph/preferencesDialog.py index 394a007f..9b7daef1 100644 --- a/morph/preferencesDialog.py +++ b/morph/preferencesDialog.py @@ -9,6 +9,7 @@ from .preferences import get_preference, update_preferences from .morphemizer import getAllMorphemizers from .UI import MorphemizerComboBox +from .UI import LanguageComboBox # only for jedi-auto-completion import aqt.main @@ -44,7 +45,7 @@ def createNoteFilterTab(self): self.frame1.setLayout(vbox) vbox.setContentsMargins(0, 20, 0, 0) - self.tableModel = QStandardItemModel(0, 6) + self.tableModel = QStandardItemModel(0, 7) self.tableView = QTableView() self.tableView.setModel(self.tableModel) 
self.tableView.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch) @@ -53,9 +54,10 @@ def createNoteFilterTab(self): self.tableModel.setHeaderData(0, Qt.Orientation.Horizontal, "Note type") self.tableModel.setHeaderData(1, Qt.Orientation.Horizontal, "Tags") self.tableModel.setHeaderData(2, Qt.Orientation.Horizontal, "Fields") - self.tableModel.setHeaderData(3, Qt.Orientation.Horizontal, "Morphemizer") - self.tableModel.setHeaderData(4, Qt.Orientation.Horizontal, "Read?") - self.tableModel.setHeaderData(5, Qt.Orientation.Horizontal, "Modify?") + self.tableModel.setHeaderData(3, Qt.Orientation.Horizontal, "Language") + self.tableModel.setHeaderData(4, Qt.Orientation.Horizontal, "Morphemizer") + self.tableModel.setHeaderData(5, Qt.Orientation.Horizontal, "Read?") + self.tableModel.setHeaderData(6, Qt.Orientation.Horizontal, "Modify?") rowData = get_preference('Filter') self.tableModel.setRowCount(len(rowData)) @@ -280,6 +282,13 @@ def setTableRow(self, rowIndex, data): morphemizerComboBox.setMorphemizers(getAllMorphemizers()) morphemizerComboBox.setCurrentByName(data['Morphemizer']) + languageComboBox = LanguageComboBox() + languageComboBox.setLanguages(get_preference('Languages')) + try: + languageComboBox.setCurrentByName(data['Language']) + except: + pass + readItem = QStandardItem() readItem.setCheckable(True) readItem.setCheckState(Qt.CheckState.Checked if data.get('Read', True) else Qt.CheckState.Unchecked) @@ -292,6 +301,7 @@ def setTableRow(self, rowIndex, data): rowGui['tagsEntry'] = QLineEdit(', '.join(data['Tags'])) rowGui['fieldsEntry'] = QLineEdit(', '.join(data['Fields'])) rowGui['morphemizerComboBox'] = morphemizerComboBox + rowGui['languageComboBox'] = languageComboBox rowGui['readCheckBox'] = readItem rowGui['modifyCheckBox'] = modifyItem @@ -301,9 +311,10 @@ def setColumn(col, widget): setColumn(0, rowGui['modelComboBox']) setColumn(1, rowGui['tagsEntry']) setColumn(2, rowGui['fieldsEntry']) - setColumn(3, 
rowGui['morphemizerComboBox']) - self.tableModel.setItem(rowIndex, 4, readItem) - self.tableModel.setItem(rowIndex, 5, modifyItem) + setColumn(3, rowGui['languageComboBox']) + setColumn(4, rowGui['morphemizerComboBox']) + self.tableModel.setItem(rowIndex, 5, readItem) + self.tableModel.setItem(rowIndex, 6, modifyItem) if len(self.rowGui) == rowIndex: self.rowGui.append(rowGui) @@ -326,6 +337,7 @@ def rowGuiToFilter(row_gui): filter['Fields'] = [ x for x in row_gui['fieldsEntry'].text().split(', ') if x] + filter['Language'] = row_gui['languageComboBox'].getCurrent() filter['Morphemizer'] = row_gui['morphemizerComboBox'].getCurrent().getName() filter['Read'] = row_gui['readCheckBox'].checkState() != Qt.CheckState.Unchecked filter['Modify'] = row_gui['modifyCheckBox'].checkState() != Qt.CheckState.Unchecked diff --git a/morph/readability.py b/morph/readability.py index 1c8f587c..37ce325e 100644 --- a/morph/readability.py +++ b/morph/readability.py @@ -35,6 +35,8 @@ from . import readability_ui from . 
import readability_settings_ui +from .language import loadFrequencyList, FrequencyList + importlib.reload(customTableWidget) importlib.reload(readability_ui) importlib.reload(readability_settings_ui) @@ -395,8 +397,10 @@ def __init__(self, parent=None): self.ui.masterFreqEdit.setText(cfg('Option_MasterFrequencyListPath')) self.ui.masterFreqButton.clicked.connect( lambda le: getPath(self.ui.masterFreqEdit, "Select Master Frequency List")) - self.ui.knownMorphsEdit.setText(cfg('path_known')) + self.ui.knownMorphsEdit.setText(cfg('Option_KnownMorphListPath')) self.ui.knownMorphsButton.clicked.connect(lambda le: getPath(self.ui.knownMorphsEdit, "Select Known Morphs DB")) + self.ui.matureMorphsEdit.setText(cfg('Option_MatureMorphListPath')) + self.ui.matureMorphsButton.clicked.connect(lambda le: getPath(self.ui.matureMorphsEdit, "Select Mature Morphs DB")) self.ui.outputFrequencyEdit.setText(cfg('path_dbs')) self.ui.outputFrequencyButton.clicked.connect( lambda le: getPath(self.ui.outputFrequencyEdit, "Select Output Directory", True)) @@ -597,6 +601,7 @@ def saveWordReport(self, known_db, morphs, path): group_idx = 0 morph_total = 0.0 master_morphs_count = sum(n for n in master_morphs.values()) + f.write("#frequency_report\t1.0\n") for m in sorted(master_morphs.items(), key=operator.itemgetter(1), reverse=True): if m[1] != last_count: @@ -718,7 +723,7 @@ def onAnalyze(self): self.readability_target = float(self.ui.targetSpinBox.value()) master_freq_path = self.ui.masterFreqEdit.text() known_words_path = self.ui.knownMorphsEdit.text() - mature_words_path = os.path.normpath(os.path.dirname(known_words_path) + '/mature.db') + mature_words_path = self.ui.matureMorphsEdit.text() output_path = self.ui.outputFrequencyEdit.text() save_word_report = self.ui.wordReportCheckBox.isChecked() save_study_plan = self.ui.studyPlanCheckBox.isChecked() @@ -730,6 +735,8 @@ def onAnalyze(self): pref = {} pref['Option_AnalysisInputPath'] = input_path pref['Option_MasterFrequencyListPath'] = 
master_freq_path + pref['Option_KnownMorphListPath'] = known_words_path + pref['Option_MatureMorphListPath'] = mature_words_path pref['Option_DefaultMinimumMasterFrequency'] = self.minimum_master_frequency pref['Option_DefaultStudyTarget'] = self.readability_target pref['Option_SaveWordReport'] = save_word_report @@ -802,19 +809,18 @@ def onAnalyze(self): all_morph_sample = {} if os.path.isfile(master_freq_path): - with io.open(master_freq_path, encoding='utf-8-sig') as csvfile: - csvreader = csv.reader(csvfile, delimiter="\t") - for row in csvreader: - try: - instances = int(row[0]) - m = Morpheme(row[1], row[2], row[2], row[3], row[4], row[5]) - - self.master_db.addMorph(m, instances) - self.master_total_instances += instances - except: - pass - self.writeOutput("Master morphs loaded: K %d V %d\n" % ( - self.master_db.getTotalNormMorphs(), self.master_db.getTotalVariationMorphs())) + + frequency_list = loadFrequencyList(master_freq_path, force_morphemes=True) + if frequency_list.has_frequency_count: + + for m, instances in frequency_list.map.items(): + self.master_db.addMorph(m, instances) + self.master_total_instances += instances + self.writeOutput("Master morphs loaded: K %d V %d\n" % ( + self.master_db.getTotalNormMorphs(), self.master_db.getTotalVariationMorphs())) + else: + self.writeOutput("Master frequency file '%s' has no frequency data or wrong format!\n" % master_freq_path) + self.minimum_master_frequency = 0 else: self.writeOutput("Master frequency file '%s' not found.\n" % master_freq_path) self.minimum_master_frequency = 0 diff --git a/morph/readability.ui b/morph/readability.ui index 47d74eba..a357bd18 100644 --- a/morph/readability.ui +++ b/morph/readability.ui @@ -325,6 +325,34 @@ The expected format is that of a instance_freq_report.txt file. + + + + Mature Morphs DB + + + Qt::AlignBottom|Qt::AlignLeading|Qt::AlignLeft + + + + + + + + + ... + + + + + + + Path to use as your 'Mature' morphs database. 
+ + + + + diff --git a/morph/readability_ui.py b/morph/readability_ui.py index 327ab314..28a71e3f 100644 --- a/morph/readability_ui.py +++ b/morph/readability_ui.py @@ -137,6 +137,19 @@ def setupUi(self, ReadabilityDialog): self.knownMorphsEdit.setObjectName("knownMorphsEdit") self.horizontalLayout_KnownMorphs.addWidget(self.knownMorphsEdit) self.verticalLayout_3.addLayout(self.horizontalLayout_KnownMorphs) + self.matureMorphsLabel = QtWidgets.QLabel(self.generalSettingsGroupBox) + self.matureMorphsLabel.setAlignment(QtCore.Qt.AlignmentFlag.AlignBottom|QtCore.Qt.AlignmentFlag.AlignLeading|QtCore.Qt.AlignmentFlag.AlignLeft) + self.matureMorphsLabel.setObjectName("matureMorphsLabel") + self.verticalLayout_3.addWidget(self.matureMorphsLabel) + self.horizontalLayout_MatureMorphs = QtWidgets.QHBoxLayout() + self.horizontalLayout_MatureMorphs.setObjectName("horizontalLayout_MatureMorphs") + self.matureMorphsButton = QtWidgets.QPushButton(self.generalSettingsGroupBox) + self.matureMorphsButton.setObjectName("matureMorphsButton") + self.horizontalLayout_MatureMorphs.addWidget(self.matureMorphsButton) + self.matureMorphsEdit = QtWidgets.QLineEdit(self.generalSettingsGroupBox) + self.matureMorphsEdit.setObjectName("matureMorphsEdit") + self.horizontalLayout_MatureMorphs.addWidget(self.matureMorphsEdit) + self.verticalLayout_3.addLayout(self.horizontalLayout_MatureMorphs) self.outputFreqLabel = QtWidgets.QLabel(self.generalSettingsGroupBox) self.outputFreqLabel.setObjectName("outputFreqLabel") self.verticalLayout_3.addWidget(self.outputFreqLabel) @@ -261,9 +274,15 @@ def retranslateUi(self, ReadabilityDialog): self.masterFreqButton.setText(_translate("ReadabilityDialog", "...")) self.masterFreqEdit.setToolTip(_translate("ReadabilityDialog", "Specity a Master Frequency List\n" "The expected format is that of a instance_freq_report.txt file.")) + self.knownMorphsLabel.setText(_translate("ReadabilityDialog", "Known Morphs DB")) 
self.knownMorphsButton.setText(_translate("ReadabilityDialog", "...")) self.knownMorphsEdit.setToolTip(_translate("ReadabilityDialog", "Path to use as your \'Known\' morphs database.")) + + self.matureMorphsLabel.setText(_translate("ReadabilityDialog", "Mature Morphs DB")) + self.matureMorphsButton.setText(_translate("ReadabilityDialog", "...")) + self.matureMorphsEdit.setToolTip(_translate("ReadabilityDialog", "Path to use as your \'Mature\' morphs database.")) + self.outputFreqLabel.setText(_translate("ReadabilityDialog", "Output Directory")) self.outputFrequencyButton.setText(_translate("ReadabilityDialog", "...")) self.outputFrequencyEdit.setToolTip(_translate("ReadabilityDialog", "Path where all outputs are written.")) diff --git a/morph/stats.py b/morph/stats.py index 9b1adc57..c422f408 100644 --- a/morph/stats.py +++ b/morph/stats.py @@ -11,6 +11,7 @@ from .preferences import get_preference as cfg from .errors.profileNotYetLoadedException import ProfileNotYetLoadedException +from .language import getTotalKnownSet def getStatsPath(): return cfg('path_stats') @@ -36,14 +37,7 @@ def saveStats(d): def updateStats(known_db=None): mw.progress.start(label='Updating stats', immediate=True) - from .morphemes import MorphDb - - # Load known.db and get total morphemes known - if known_db is None: - known_db = MorphDb(cfg('path_known'), ignoreErrors=True) - - d = {'totalVariations': len(known_db.db), 'totalKnown': len(known_db.groups)} - + d = getTotalKnownSet() saveStats(d) mw.progress.finish() return d diff --git a/morph/text_utils.py b/morph/text_utils.py index 24d33847..6fd18019 100644 --- a/morph/text_utils.py +++ b/morph/text_utils.py @@ -4,7 +4,8 @@ from .morphemes import getMorphemes from .morphemizer import getMorphemizerByName from .preferences import get_preference as cfg -from .util import getFilterByMidAndTags, allDb +from .util import getFilterByMidAndTags +from .language import getAllDb def nonSpanSub(sub, repl, string): return ''.join(re.sub(sub, repl, 
s, flags=re.IGNORECASE) if not s.startswith('= cfg('threshold_known')): diff --git a/morph/util.py b/morph/util.py index a7768452..5bfb3f00 100644 --- a/morph/util.py +++ b/morph/util.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import codecs import datetime -from os import path from anki.hooks import addHook from anki.notes import Note @@ -21,24 +20,6 @@ except ImportError: pass -############################################################################### -# Global data -############################################################################### -_allDb = None - - -def allDb(): - global _allDb - - # Force reload if all.db got deleted - all_db_path = get_preference('path_all') - reload = not path.isfile(all_db_path) - - if reload or (_allDb is None): - from .morphemes import MorphDb - _allDb = MorphDb(all_db_path, ignoreErrors=True) - return _allDb - ############################################################################### # Preferences