diff --git a/daba/dabased.py b/daba/dabased.py old mode 100644 new mode 100755 index 4e0f009..b06038b --- a/daba/dabased.py +++ b/daba/dabased.py @@ -172,15 +172,17 @@ def parse_gloss(self, gloss_string): daba.grammar.tokenize(gloss_string) ) gt = daba.formats.WordToken([gloss], stage='dabased') + #print("*** parse_gloss / gt :",gt) return gt def parse_token(self, token_expression): try: toktype, tokvalue = token_expression[1:].split(':') - tokvalue = re.compile("^" + tokvalue + "\Z", re.UNICODE) + tokvalue = re.compile(r"^" + tokvalue + r"\Z", re.UNICODE) except (ValueError): toktype = token_expression[1:] tokvalue = '' + #print("--- parse_token returns:",daba.formats.PlainToken((toktype, tokvalue))) return daba.formats.PlainToken((toktype, tokvalue)) def parse_expr(self, expr): @@ -201,9 +203,18 @@ def parse_expr(self, expr): return result def parse_command(self, command): - command = u.normalize( - 'NFKD', command - ).strip('\n') + # command = u.normalize('NFKD', command).strip('\n') + # normalize should concern lemmas, not gloss + def normalizeLex(m): + lx=m.groups()[0] + lx=u.normalize('NFKD', lx) + psgloss=m.groups()[1] + psgloss=u.normalize('NFC',psgloss) # is this faster than tomonolith() ? 
+ return lx+psgloss + + command = command.strip('\n') + command = re.sub(r'([^\:\< ]+)(\:[^\:\< ]*\:[^\:\< ]*)',normalizeLex,command) + m = re.match(r'\s*(.+?)\s*>>\s*(.+?)\s*$', command, re.U) try: source, target = m.groups() @@ -212,6 +223,7 @@ def parse_command(self, command): return sourcelist = self.parse_expr(source) targetlist = self.parse_expr(target) + #print("=== parse_command / targetlist:", targetlist) # OK return ReplaceRule(sourcelist, targetlist) @@ -253,50 +265,516 @@ def replace(self, token, target): outgloss = token.union(target) if outgloss.gloss is not None: target = outgloss + #print("replace / target:", target) return target - def recursive_replace(self, gloss, pattern, target): + def recursive_replace(self, gloss, pattern, target, filler): + #if gloss is None : return gloss # why do I need this protection??? + filler+=" " + #print(filler+"gloss, pattern, target :\n"+filler,gloss,"\n"+filler, pattern,"\n"+filler, target) if gloss.matches(pattern, psstrict=True): out = gloss.union(target, psoverride=True) else: out = gloss - if gloss.morphemes: + # print(filler+"+++ recursive_replace / before gloss.morphemes : out :",out) + + lpattern=len(str(pattern)) + # question: is it relevant to compare size of pattern and size of glosses in all cases, are there exceptions ? + if out.morphemes: + # print(filler+"+++ recursive_replace / gloss.morphemes :",out.morphemes) + outstr0=str(out) + if lpattern<=len(outstr0): + outstr=outstr0[:outstr0.find("[")+1] #: "kɛ́ra:v: [" + isNone=False + for subgloss in out.morphemes: + if subgloss is None: + print("recursive_replace WHY? 
outstr0=",outstr0) + isNone=True + else: + if lpattern<=len(str(subgloss)): + subgloss=self.recursive_replace(subgloss,pattern,target,filler) + outstr+=str(subgloss)+" " + outstr=outstr[:-1]+"]" + # print(filler+"outstr:",outstr) + if isNone or outstr[outstr.find("["):].startswith("[ ")\ + or outstr[outstr.find("["):].endswith("[ "): # suppression cases + print(filler+"recursive_replace strange outstr=",outstr) + outstr,ignore=outstr.split(" [",1) + if outstr != outstr0 : + toks = daba.grammar.str_tokenize(outstr) + try: + out = daba.grammar.stringgloss_parser().parse(toks) + # print(filler+"daba.grammar out=",out) + except: + print("stringloss_parser failed on:",outstr) + # no change to out ? + + # else: print(filler+"no gloss.morphemes") + + """ + print(filler+"self.recursive_replace(morph, pattern, target) for morph in gloss.morphemes :\n"+filler,\ + tuple(self.recursive_replace(morph, pattern, target, filler) for morph in gloss.morphemes)) out = out._replace( morphemes=tuple( - self.recursive_replace(morph, pattern, target) - for morph in gloss.morphemes) + self.recursive_replace(morph, pattern, target,filler) + for morph in out.morphemes) ) + """ + + + # print(filler+"+++ recursive_replace / returns out :",out) return out + + def make_replace_func(self, rule): + # print("\n!!! 
make_replace_func rule:",self.getstr(rule.inlist)," >> ",self.getstr(rule.outlist)) + global textscript if not rule.symmetric: def replace_func(tokens, rule): + # in rule.outlist, words should be compatible with meta-data text-script + # example outlist: + # [w ('bɛ́', 'dabased', [Gloss(form='bɛ́', ps=('cop',), gloss='BE', morphemes=())]) None, + # w ('kà', 'dabased', [Gloss(form='kà', ps=('pm',), gloss='INF', morphemes=())]) None] + # print("modify outlist per text-script:", textscript) + for w in rule.outlist: + # print ("w.token:",w.token) + if textscript=='Ancien orthographe malien': # ref corbama/meta.xml + w.token=re.sub(r'[̀́̌̂]','',w.token) # remove tones + w.token=w.token.replace('ɛɛ','èe') + w.token=w.token.replace('ɔɔ','òo') + w.token=w.token.replace('ɛ','è') + w.token=w.token.replace('ɔ','ò') + w.token=w.token.replace('ɲ','ny') + w.token=w.token.replace('ƐƐ','ÈE') + w.token=w.token.replace('ƆƆ','ÒO') + w.token=w.token.replace('Ɛ','È') + w.token=w.token.replace('Ɔ','Ò') + + w.token=w.token.replace('Ɲ','Ny') + elif textscript=="N’Ko": # ref cormani/meta.xml + def s(a,b,text): + return(re.sub(a,b,text,0,re.U|re.MULTILINE)) + + def gbarali(m): + #text=s(r'([^ߊߍߋߌߐߏߎ ])(?P[ߊߍߋߌߐߏߎ])([́̀̂̌]*)([^ߊߍߋߌߐߏߎ́̀̂̌])(?P=WOV1)([^\u07F2])','\g<1>\g<4>\g<2>\g<3>\g<5>',text) + cons1=m.groups()[0] + cons2=m.groups()[3] + voy=m.groups()[1] # unique since voy1=voy2 + ton1=m.groups()[2] + nonasal=m.groups()[4] + ton2="" + if nonasal in '\u0300\u0301\u0302\u030C\u07EF\u07F0\u07EE\u07F1': ton2=nonasal # latin tones + nko tones for already identified long wovels + + if cons1==cons2 : return cons1+voy+ton1+cons2+voy+nonasal + else: + if ton2==ton1 : return cons1+cons2+voy+nonasal + else : + if ton1=='\u0301': # high tone + if ton2=="": return cons1+cons2+voy+nonasal # pas de ton2=high tone too + else: return cons1+voy+ton1+cons2+voy+nonasal + # is the reverse situation ton1=="" and ton2="\u301" the same ??? 
+ else: return cons1+voy+ton1+cons2+voy+nonasal + + def lat2nko(text): + text=text.strip().lower() + + texts="" + # + # text should end with a punctuation + if text[-1] not in " ,.;:!?\n": text=text+"\n" + scan=re.findall(r'([^ ,\.;:!\?\n]+)([ ,\.;:!\?\n]+)',text) + #print(scan) + nitem=len(scan) + iitem=0 + for item in scan: + iitem+=1 + w=item[0] + punct=item[1] + + w=re.sub(r'([aeiouɛɔ][\u0300\u0301\u0302\u030C]*)n([^aeiouɛɔ])',r'\g<1>~\g<2>',w) + w=re.sub(r'([aeiouɛɔ][\u0300\u0301\u0302\u030C]*)n$',r'\g<1>~',w) + + first=re.findall(r'(^([aeiouɛɔ])([\u0300\u0301\u0302\u030C]*)(~?))',w) + lfirst=0 + if first: + lfirst=len(first[0][0]) + # print(w,'first',first[0][0]) + syllabs=re.findall(r'(([^aeiouɛɔ])([aeiouɛɔ])([\u0300\u0301\u0302\u030C]*)([aeiouɛɔ]?)(~?))',w[lfirst:]) + # caution long vowels : formula above assumes up to two vowels but they could be different!!! + # print(w,"\nsyllabs",syllabs) + + # do something useful with syllabs data + nsyl=len(syllabs) + if nsyl>1: + if first: basetone=first[0][2] + else: basetone=syllabs[0][3] + # print("basetone",basetone.encode("unicode_escape")) + # is basetone a low tone ? 
+ if basetone=="\u0300": + # check tone of next word if any + basetone2="" + if iitem!=nitem: # next word not to be checked on last word + w2=scan[iitem][0] + # print("next word",w2) + if w2[0] in "aeiouɛɔ": basetone2=w2[1] + else: basetone2=w2[2] + # print("basetone2",basetone2.encode("unicode_escape")) + if basetone2!="\u0301": + # update final tone to high + wnew='' + if first: wnew=first[0][0] + isyl=0 + for syl in syllabs: + isyl+=1 + if isyl==nsyl: + wnew=wnew+syl[1]+syl[2]+"\u0301"+syl[4]+syl[5] + else: + wnew=wnew+syl[0] + w=wnew + + w=w.replace("~","n") + texts=texts+w+punct + text=texts + # print("new text\n",text,"\n") + + ###### NY closed list follows: ###### + text=s(r'\bbɛnnyɔɔnya\b','ߓߍ߲ߢߐ߲߱ߧߊ',text) + text=s(r'\bdɛnnyɔgɔnnya\b','ߘߍ߲߬ߢߐ߬ߜߐ߲߬ߧߊ',text) + text=s(r'\bsiginyɔgɔnnya\b','ߛߜߌ߬ߢߐ߬ߜߐ߲߬ߧߊ',text) + text=s(r'\byilanyilan\b','ߦߌߟߊ߲ߦߌߟߊ߲',text) + + text=s(r'\bdɛnyɔonnu\b','ߘߍ߲߬ߢߐ߲߰ߣߎ',text) + # опечатка, должно быть dɛ̀nɲɔɔnnu + text=s(r'\bdɛnyɔɔ\b','ߘߍ߬ߢߐ߲߰',text) + # неправильное написание, надо dɛ̀ɲɔɔn + text=s(r'\bnadanya\b','ߣߊߘߊ߲ߢߊ',text) + # nadanya = nadannya + text=s(r'(ɲinynkali|ɲunynkali)','ɲininkali',text) + # some typo + ##################################### + text=s(r'\bny','ߢ',text) + + + ###### nny -> nɲ -> n+NYA-woloso + text=s(r'\bnny','ߒߧ',text) + text=s(r'nny','\u07F2\u07E7',text) # ߲ Nazalization Mark + ߧ Nya Woloso + ###### ny -> ɲ + text=s(r'ny','ߢ',text) + ##################################### + + #text=s(r'ng','ߢ߭',text) # incompatible avec le gb ! julakolongbɛ + text=s(r'\bng','ߢ߭',text) # est seul possible + text=s(r'ŋ','ߢ߭',text) + + text=s(r'\bn\'','ߣߴ',text) + + #? essai de traitement des nasales voyelle + U+07F2=߲ + #!!! LES TONS NE SONT PAS TRAITES DANS lat2nko.pl !!! + # NB: IMPORTANT! 
la nasalisation doit précéder le ton + text=s(r'([aeiouɛɔ])([́̀̂]*)n\b',r'\g<1>\u07F2\g<2>',text) + text=s(r'([aeiouɛɔ])([́̀̂]*)n([bcdefghjklmnprstwyz])',r'\g<1>\u07F2\g<2>\g<3>',text) + + # Replace initial n followed by a wovel with ߒ , others initial n with ߣ + text=s(r'\bn([^aeiouɛɔn])',r'ߒ\g<1>',text) + text=s(r'\bn','ߣ',text) + + # où sont traitées les nasales ? <---------------------------- + + # Replace n at the end of a word with ߒ # ??? + text=s(r'([aeiouɛɔ])n\b',r'\g<1>ߒ',text) + text=s(r'([aeiouɛɔ])n([^aeiouɛɔ])',r'\g<1>ߒ\g<2>',text) + text=s(r'\bn\b','ߒ',text) + text=s(r'n','ߣ',text) + + text=s(r'gb','ߜ',text) + text=s(r'sh','ߛ߭',text) + text=s(r'ʃ','ߛ߭',text) + text=s(r'th','ߛ߳',text) + text=s(r'θ','ߛ߳',text) + text=s(r'kp','ߜ߳',text) + text=s(r'rr','ߚ',text) + text=s(r'g','ߜ߭',text) + text=s(r'v','ߝ߭',text) + text=s(r'z','ߖ߭',text) + text=s(r'ħ','ߤ߭',text) + text=s(r'kh','ߞ߭',text) + text=s(r'x','ߞ߭',text) + text=s(r'q','ߞ߫',text) + text=s(r'gh','ߜ߫',text) + text=s(r'ɣ','ߜ߫',text) + text=s(r'zh','ߗ߭',text) + text=s(r'dj','ߗ߭',text) + text=s(r'ʒ','ߗ߭',text) + text=s(r'ð','ߗ߭',text) + text=s(r'ʕa','ߊ߳',text) + text=s(r'ʕ','ߊ߳',text) + text=s(r'bh','ߓ߭',text) + text=s(r'ɓ','ߓ߭',text) + text=s(r'dh','ߘ߳',text) + text=s(r'ɗ','ߘ߳',text) + + text=s(r'b','ߓ',text) + text=s(r'c','ߗ',text) + text=s(r'd','ߘ',text) + text=s(r'f','ߝ',text) + text=s(r'h','ߤ',text) + text=s(r'j','ߖ',text) + text=s(r'k','ߞ',text) + text=s(r'l','ߟ',text) + text=s(r'm','ߡ',text) + text=s(r'ɲ','ߢ',text) + text=s(r'p','ߔ',text) + text=s(r's','ߛ',text) + text=s(r't','ߕ',text) + text=s(r'r','ߙ',text) + text=s(r'w','ߥ',text) + + text=s(r'y','ߦ',text) + + text=s(r'aa','ߊ߯',text) + text=s(r'ɛɛ','ߍ߯',text) + text=s(r'ee','ߋ߯',text) + text=s(r'ii','ߌ߯',text) + text=s(r'ɔɔ','ߐ߰',text) # erreur ? 
+ text=s(r'oo','ߏ߯',text) + text=s(r'uu','ߎ߯',text) + + # add transforms with inside latin tone + # high + # with tonal article + text=s(r'áa`','ߊ߮',text) + text=s(r'ɛ́ɛ`','ߍ߮',text) + text=s(r'ée`','ߋ߮',text) + text=s(r'íi`','ߌ߮',text) + text=s(r'ɔ́ɔ`','ߐ߮',text) + text=s(r'óo`','ߏ߮',text) + text=s(r'úu`','ߎ߮',text) + # without + text=s(r'áa','ߊ߯',text) + text=s(r'ɛ́ɛ','ߍ߯',text) + text=s(r'ée','ߋ߯',text) + text=s(r'íi','ߌ߯',text) + text=s(r'ɔ́ɔ','ߐ߯',text) + text=s(r'óo','ߏ߯',text) + text=s(r'úu','ߎ߯',text) + # low + # with tonal article + text=s(r'àa`','ߊ߱',text) + text=s(r'ɛ̀ɛ`','ߍ߱',text) + text=s(r'èe`','ߋ߱',text) + text=s(r'ìi`','ߌ߱',text) + text=s(r'ɔ̀ɔ`','ߐ߱',text) + text=s(r'òo`','ߏ߱',text) + text=s(r'ùu`','ߎ߱',text) + # without + text=s(r'àa','ߊ߰',text) + text=s(r'ɛ̀ɛ','ߍ߰',text) + text=s(r'èe','ߋ߰',text) + text=s(r'ìi','ߌ߰',text) + text=s(r'ɔ̀ɔ','ߐ߰',text) + text=s(r'òo','ߏ߰',text) + text=s(r'ùu','ߎ߰',text) + + # single wowels + text=s(r'a','ߊ',text) + text=s(r'á','ߊ',text) # usage ??? 
+ text=s(r'ɛ','ߍ',text) + text=s(r'e','ߋ',text) + text=s(r'i','ߌ',text) + text=s(r'ɔ','ߐ',text) + text=s(r'o','ߏ',text) + text=s(r'u','ߎ',text) + + # text=s(r'^(.)',' \1',text) ; text=s(r'(.)$','\1 ',text) + # text=s(r'([^\d])(\d)(\d)(\d)(\d)(\d)(\d)(\d)([^\d])','\1\8\7\6\5\4\3\2\9',text) + # text=s(r'([^\d])(\d)(\d)(\d)(\d)(\d)(\d)([^\d])','\1\7\6\5\4\3\2\8',text) + # text=s(r'([^\d])(\d)(\d)(\d)(\d)(\d)([^\d])','\1\6\5\4\3\2\7',text) + # text=s(r'([^\d])(\d)(\d)(\d)(\d)([^\d])','\1\5\4\3\2\6',text) + # text=s(r'([^\d])(\d)(\d)(\d)([^\d])','\1\4\3\2\5',text) + # text=s(r'(\D)(\d)(\d)(\D)','\1\2\3\4',text) + # + # s/^ //g ; s/ $//g + + text=s(r'0','߀',text) + text=s(r'1','߁',text) + text=s(r'2','߂',text) + text=s(r'3','߃',text) + text=s(r'4','߄',text) + text=s(r'5','߅',text) + text=s(r'6','߆',text) + text=s(r'7','߇',text) + text=s(r'8','߈',text) + text=s(r'9','߉',text) + + text=s(r'<ߤ>','',text) + text=s(r'<\/ߤ>','',text) + # s/<\/ߤ>/<\/h>/g + text=s(r'<ߛ>','',text) + text=s(r'<\/ߛ>','',text) + # s/<\/ߛ>/<\/s>/g + text=s(r'<ߕ>','',text) + text=s(r'<\/ߕ>','',text) + # s/<\/ߕ>/<\/t>/g + text=s(r'<ߕߓ>','',text) + text=s(r'<\/ߕߓ>','',text) + # s/<\/ߕߓ>/<\/tb>/g + text=s(r'<ߓߙ>','
',text) + text=s(r'<\/ߓߙ>','
',text) + text=s(r'<ߓߙ\/>','
',text) + # s/<ߓߙ\/>//g + + # GBARALI - ex bála + # NB: check wovels with latin tones included, + # skipped if nazalisation character ex bálan + # (1st wovel :not included with tones) + # (2d wovel: explicit) + + # checkit=re.findall(r'([^ߊߍߋߌߐߏߎ])(?P[ߊߍߋߌߐߏߎ])([́̀̂̌]*)([^ߊߍߋߌߐߏߎ́̀̂̌])(?P=WOV1)([^\u07F2])',text) + # if checkit: print(checkit) + # else : print("not found") + + # à revoir : le ton initial n'est pas à reporter, du coup la règle nɔnɔ ne peut pas être traitée comme ça. + #text=s(r'([^ߊߍߋߌߐߏߎ ])(?P[ߊߍߋߌߐߏߎ])([́̀̂̌]*)([^ߊߍߋߌߐߏߎ́̀̂̌])(?P=WOV1)([^\u07F2])','\g<1>\g<4>\g<2>\g<3>\g<5>',text) + # solution : renvoi à une fonction adhoc - tons latins : [\u0300\u0301\u0302\u030C] + tones for long wovels: 07EF et 07F0 + text=re.sub(r'([^ߊߍߋߌߐߏߎ ])(?P[ߊߍߋߌߐߏߎ])([\u0300\u0301\u0302\u030C\u07EF\u07F0\u07EE\u07F1]*)([^ߊߍߋߌߐߏߎ́̀̂̌])(?P=WOV1)([^\u07F2])',\ + gbarali,text,0,re.U|re.MULTILINE) + + # tones character references checked / https://unicodeplus.com + # reference latin + # high ́ Combining Acute Accent (U+0301) + # low ̀ Combining Grave Accent (U+0300) + # descending ̂ Combining Circumflex Accent (U+0302) + # raising ̌ Combining Caron (U+030C) + # tonal article Grave Accent (U+0060) ` + # + # reference N'ko + # long wovels + # H+flottant B Nko Combining Long Descending Tone (U+07EE) ߮ + # H Nko Combining Long High Tone (U+07EF) ߯ + # B Nko Combining Long Low Tone (U+07F0) ߰ + # Ascendant+flottant B Nko Combining Long Rising Tone (U+07F1) ߱ + # + # short wovels + # H Nko Combining Short High Tone (U+07EB) ߫ + # H+flottant B (none, default) + # B Nko Combining Short Low Tone (U+07EC) ߬ + # Ascendant+flottant B Nko Combining Short Rising Tone (U+07ED) ߭ + + # remove high tones + #? text=s('́','',text) + + # high tones + # long wovels + # no final tone=high tone??? + text=s(r'(?P[ߊߍߋߌߐߏߎ])(?P=WOV1)\b',r'\g<1>\u07EE',text) + # high tone expressed + text=s(r'(?P[ߊߍߋߌߐߏߎ])\u0301(?P=WOV1)',r'\g<1>\u07EF',text) + + # short wovels + # no final tone=high tone??? 
+ text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)([ ,;\.\:\?\!])',r'\g<1>\u07EB\g<2>',text) # en fin de mot sans autre indication !!! fin=espace ici ??? \b fails + text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)$',r'\g<1>\u07EB',text) # en fin de mot sans autre indication !!! fin=espace ici ??? \b fails + + text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)\u0060',r'\g<1>',text) + # high tone expressed + text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)\u0301([ ,;\.\:\?\!])',r'\g<1>\u07EB\g<2>',text) # ex bɔ́ en fin de mot ON SUPPOSE QUE C'EST UN VERBE!!!!!! fin=espace ici ??? \b fails + text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)\u0301\u0060',r'\g<1>',text) # ex bɔ́` art tonal indique un NOM: supprimer + text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)\u0301',r'\g<1>',text) # ailleurs: supprimer + + # low tones + # long wovels + text=s(r'(?P[ߊߍߋߌߐߏߎ])\u0300(?P=WOV1)\u0060',r'\g<1>\u07F1',text) # mɔ̀ɔ` + text=s(r'(?P[ߊߍߋߌߐߏߎ])\u0300(?P=WOV1)',r'\g<1>\u07F0',text) # mɔ̀ɔnin + # short wovels + text=s(r'([ߊߍߋߌߐߏߎ][\u07F2]*)\u0300\u0060',r'\g<1>\u07ED',text) # l'art tonal indique un NOM + text=s(r'([ߊߍߋߌߐߏߎ][\u07F2]*)\u0300',r'\g<1>\u07EC',text) + + # descending tones + # long wovels + text=s(r'(?P[ߊߍߋߌߐߏߎ])\u0302(?P=WOV1)',r'\g<1>\u07F1',text) # aâ ? test case to find! + # short wovels + text=s(r'([ߊߍߋߌߐߏߎߒ][\u07F2]*)\u0302',r'\g<1>',text) # supprimer? (comme ce qui est fait pour ân 1PL) + + # remove leftover art. tonal [some cases not handled properly] + text=s(r'[\u0060\u0300\u0301\u0302]','',text) + + # 1SG et 2SG sans ton # 07/01/2023 : + # NB: done in sentence scan (latin script preparation) but no effect ??? + # \b not working here but ok in Sublime text ??? 
+ # text=s(r'\b([ߒ|ߌ])\u07EB\b','\g<1>',text) # \b([ߒ|ߌ])߫\b + text=s(r'^([ߒ|ߌ])\u07EB ',r'\g<1> ',text) + text=s(r' ([ߒ|ߌ])\u07EB ',r' \g<1> ',text) # relies on proper punctuation ;-) + + + # déplacer les ponctuations APRES LES TONS as it kills \b and latin punctuation tests fail + + text=s(r',','،',text) + text=s(r'\?','؟',text) + text=s(r'!','߹',text) + text=s(r';','؛',text) + text=s(r'\'','ߴ',text) + + text=s(r'([߹؟،߸!\.:\(\)\-\u2329\u232A«»])\s*$',r'\g<1>\u200F',text) # 〈 Left-Pointing Angle Bracket + 〉 Right-Pointing Angle Bracket + # RTL mark after punctuation (U+200F Right-To-Left Mark) + text=s(r'([߹؟،߸!\.:\(\)\-\u2329\u232A«»])\s*<',r'\g<1>\u200F<',text) + # RTL mark after punctuation + + return(text[:-1]) # in dabased, remove trailing \n + + w.token=lat2nko(w.token) # simplified version ignores -la -da in verbs - still too big a hammer! + # print ("-> w.token:",w.token) return rule.outlist domatch = True else: # FIXME special case for 1:1 replacement: allows deep matching + # now supposedly fixed : need restructuring all this code instead of the following dirty fix! 
if rule.winsize == 1 and rule.inlist[0].type == 'w': - if rule.inlist[0].gloss.morphemes and rule.outlist[0].gloss.morphemes: + if rule.inlist[0].gloss.morphemes and rule.outlist[0].gloss.morphemes and False: # ============dirty fix + # print("¹¹¹ rule si morphemes G et D :",self.getstr(rule.inlist)," >> ",self.getstr(rule.outlist)) def replace_func(tokens, rule): token = tokens[0] target = rule.outlist[0] gt = self.replace(token, target) outgloss = gt.gloss._replace(morphemes=target.gloss.morphemes) gt = daba.formats.WordToken([outgloss], token=tokens[0].token, stage='dabased') + # print("### make_replace_func / gt:",gt) return [gt] domatch = True + #tried domatch = False else: + # print("²²² rule si pas morphemes G et D :",self.getstr(rule.inlist)," >> ",self.getstr(rule.outlist)) def replace_func(tokens, rule): + # print("###2 make_replace_func / tokens, rule:",tokens, rule) token = tokens[0].gloss + # print("###2 make_replace_func / token:",token) pattern = rule.inlist[0].gloss + # print("###2 make_replace_func / pattern :",pattern) target = rule.outlist[0].gloss - outgloss = self.recursive_replace(token, pattern, target) + # print("###2 make_replace_func / target:",target) + outgloss = self.recursive_replace(token, pattern, target,"") + # print("###2 make_replace_func / outgloss:",outgloss) # NOT OK gt = daba.formats.WordToken([outgloss], token=tokens[0].token, stage='dabased') + # print("replace_func gt:",gt) + + """ + # what is the justification for this test, it blocks changes + # like prn -> mrph in gloss like in + # jɛ̀ɲɔgɔn:n:associé [jɛ̀:v:assembler ɲɔ́gɔn:prn:RECP] >> jɛ̀ɲɔgɔn:n:associé [jɛ̀:v:assembler ɲɔgɔn:mrph:RECP] + # it will return jɛ̀ɲɔgɔn:n:associé [jɛ̀:v:assembler ] + # ... 
which fails if pattern.ps == target.ps: + print("###2 make_replace_func / same ps / tokens[0].union(gt):",tokens[0].union(gt)) + print("###2 make_replace_func / same ps / au lieu de gt :",gt) return [tokens[0].union(gt)] + else: + print("###2 make_replace_func / different ps / gt:",gt) return [gt] + """ + return [gt] domatch = False + # tried (stops early!) domatch = True else: def replace_func(tokens, rule): return [self.replace(token, target) @@ -306,9 +784,13 @@ def replace_func(tokens, rule): return (domatch, replace_func) def apply_rule(self, rule, stream): + global nchanges + # print("\n??? apply_rule / rule :",self.getstr(rule.inlist)," >> ",self.getstr(rule.outlist),"\n") # OK domatch, replace_func = self.make_replace_func(rule) # sys.stderr.write(u'Domatch {}\n'.format(str(domatch))) success = -rule.winsize + #print("rule, rule.winsize, stream:",rule, rule.winsize, stream) + #print("self.feed_tokens(rule.winsize, stream) : ",self.feed_tokens(rule.winsize, stream)) for pos, tokens in self.feed_tokens(rule.winsize, stream): if pos < success + rule.winsize: continue @@ -320,19 +802,27 @@ def apply_rule(self, rule, stream): # sys.stderr.write( # u'match: {}\n'.format(self.getstr(tokens)) # ) + # print("??? apply_rule / tokens passed to replace_tokens :",self.getstr(tokens)) # NOT OK + # print("??? apply_rule / rule passed to replace_rule :",self.getstr(rule.inlist)," >> ",self.getstr(rule.outlist)) replacement = replace_func(tokens, rule) + # print("??? 
apply_rule / replacement :",self.getstr(replacement)) # NOT OK # sys.stderr.write( # u'replacement: {}\n'.format(self.getstr(replacement)) # ) + # print("check all =") + # for g, r in zip_longest(tokens, replacement, fillvalue=daba.formats.PlainToken()): + # print("g, r, = : ",g,r, g == r) + # print("-?-",all(g == r for g, r in zip_longest(tokens, replacement, fillvalue=daba.formats.PlainToken()))) if not all(g == r for g, r in zip_longest( tokens, replacement, fillvalue=daba.formats.PlainToken())): self.dirty = True + nchanges+=1 if self.verbose: sys.stderr.write( - u'{0} -> {1}\n'.format( + u'\033[1m{0}\033[0m -> \033[92m{1}\033[0m\n'.format( self.getstr(tokens), self.getstr(replacement)) ) @@ -343,13 +833,16 @@ def apply_rule(self, rule, stream): continue yield tokens[0] else: - for token in tokens[1:]: - yield token + if 'tokens' in locals(): # why did I need to protect from an error here? + for token in tokens[1:]: + yield token def apply_script(self, script, stream): tokens = stream for rule in script: + # if self.verbose: print("~~~ apply_script / rule in script :",rule, "list(tokens):",list(tokens)) # OK tokens = self.apply_rule(rule, list(tokens)) + #if self.verbose: print("~~~ apply_script / returns tokens :",list(tokens)) return tokens @@ -369,12 +862,19 @@ def main(): sed = StreamEditor(verbose=args.verbose) script = ScriptParser(args.script) in_handler = daba.formats.HtmlReader(args.infile, compatibility_mode=False) + #print("metadata:",in_handler.metadata) + #print("textscript",in_handler.metadata['text:script']) + global textscript + textscript=in_handler.metadata['text:script'] + # print("script, in_handler:",script, in_handler) + global nchanges + nchanges=0 processed_tokens = list(sed.apply_script(script, in_handler)) if sed.dirty: out_handler = daba.formats.HtmlWriter((in_handler.metadata, in_handler.make_compatible_glosses(processed_tokens)), args.outfile) out_handler.write() if args.verbose: - sys.stderr.write(u'Written 
{0}\n'.format(args.outfile)) + sys.stderr.write(u'Written {0} : \033[3m{1} changes\033[0m\n'.format(args.outfile,nchanges)) if __name__ == '__main__': diff --git a/daba/formats.py b/daba/formats.py old mode 100644 new mode 100755 index 07fd70b..6b2b98d --- a/daba/formats.py +++ b/daba/formats.py @@ -37,7 +37,7 @@ from abc import abstractmethod import daba.grammar -from daba.ntgloss import Gloss +from daba.ntgloss import Gloss, __str__ from daba.orthography import detone #FIXME: duplicate, move to common util @@ -205,7 +205,7 @@ class SentenceListReader(BaseReader): def __init__(self, filename, encoding="utf-8"): self.isdummy = True self.metadata = {} - sent_re = '(?P)(?P(.|\n(?!)' + sent_re = r'(?P)(?P(.|\n(?!)' out = [] with open(filename, encoding=encoding) as f: txt = f.read() @@ -479,6 +479,7 @@ def __init__(self, metadata_para, filename, encoding="utf-8"): html = e.Element('html') head = e.SubElement(html, 'head') e.SubElement(head, 'meta', {'http-equiv': "Content-Type", 'content': "text/html; charset={0}".format(self.encoding)}) + # what about sorting metadata by key ? sortedmetadata=dict(sorted(metadata.items())) for (name, content) in self.metadata.items(): md = e.SubElement(head, 'meta', {'name': name, 'content': content}) body = e.SubElement(html, 'body') @@ -524,6 +525,7 @@ def _make_header(self): root = e.Element('html') head = e.SubElement(root, 'head') meta = e.SubElement(head, 'meta', {'http-equiv': 'Content-Type', 'content': 'text/html; charset={0}'.format(self.encoding)}) + # what about sorting metadata by key ? 
sortedmetadata=dict(sorted(metadata.items())) for (name, content) in self.metadata.items(): md = e.SubElement(head, 'meta', {'name': name, 'content': content}) style = e.SubElement(head, 'style', {'type': 'text/css'}) @@ -660,17 +662,38 @@ def __init__(self, udict, filename, lang='', name='', ver='', add=False, encodin def write(self): def makeGlossSfm(gloss,morpheme=False): - if not morpheme: - sfm = r""" -\lx {0} -\ps {1} -\ge {2} - """.format(gloss.form, '/'.join(gloss.ps), gloss.gloss) - for m in gloss.morphemes: - sfm = sfm + makeGlossSfm(m, morpheme=True) - else: - sfm = r'\mm ' + ':'.join([gloss.form or '', '/'.join(gloss.ps or ()), gloss.gloss or '']) + os.linesep + # example str(gloss): álalandiya:n:piété3 [álalandi:n:personne.pieuse [Ála:n:Dieu landi:adj:qui.aime] ya:mrph:ABSTR] + sfm="\n\\lx "+gloss.form + sfm+="\n\\ps "+'/'.join(gloss.ps) + sfm+="\n\\gf "+gloss.gloss + if gloss.morphemes: + lxroot,mrphx=str(gloss).split(" ",1) + sfm+=mmlist(str(mrphx[1:-1])) + sfm+="\n" return sfm + + def mmlist(mrphx): # can handle multiple level mm ( source: wordparser1 ) + mrphx=mrphx.replace("[","[ ") + mrphx=mrphx.replace("]"," ]") + mrphelem=mrphx.split(" ") + mmprefix="\n\\mm" + level=0 + mms="" + for elem in mrphelem: + if elem=="[": + level+=1 + mmprefix=mmprefix+"m" + elif elem=="]": + level-=1 + mmprefix=mmprefix[:-1] + else: + if ":" in elem: + mmlx,mmps,mmgloss=elem.split(":",2) + mms+=mmprefix+" "+mmlx+":"+mmps+":"+mmgloss # or tomonolith(mmgloss) ? + else: + mms+=mmprefix+" "+elem # ??? what happened ??? 
+ return mms+"\n" + with codecs.open(self.filename, 'w', encoding=self.encoding) as dictfile: dictfile.write(u'\\lang {0}\n'.format(self.lang)) @@ -761,7 +784,8 @@ def __getitem__(self, gloss): if gs: lookup.append((form, (ps, gs))) if ms: - stems = [m for m in ms if 'mrph' not in m.ps] + stems = [m for m in ms if 'mrph' not in m.ps] # tried replace by JJM - tried keepmrph first + # stems = [m for m in ms ] # JJ removed: if 'mrph' not in m.ps if len(stems) == 1: g = stems[0] lookup.append((g.form, (g.ps, g.gloss))) @@ -799,11 +823,33 @@ def __delitem__(self, gloss): class DictReader(object): def __init__(self, filename, encoding='utf-8', store=True, variants=False, polisemy=False, keepmrph=False, - normalize=True, ignorelist=('i',), inverse=False, - lemmafields=('lx', 'le'), - variantfields=('ve', 'va', 'vc', 'a'), - glossfields=('gf', 'ge', 'dff'), canonical=False): - + normalize=True, ignorelist=('i'), inverse=False, + lemmafields=('lx', 'le', 'va'), # JJM changed 'va',) to 'va') + variantfields=('vc', 'a'), # JJM removed 've', 28/12/2024 'vt', 06/04/2025 (but consequences if has mm: aded to lx/va twice! see edit 26-06-2025) + conditionalavoidfields=('ve'), # JJM added 06/01/2025 + glossfields=('gf', 'ge', 'gr', 'dff'), canonical=False): + # JJM moved va vt from variantfields to lemmafields as per our discussion 17 jan 2022 + # JJM added 'gr' to glossfields (malidaba) 22/6/2024 + # JJM question : are gvf gve gvr relevant here (gvf should be for polysemy) + # JJM 28/12/2024 remove 've' completely, not wanted as possible choice in gdisamb: kó = gó + # IMPORTANT! : 've' moved to ignorelist / existings tests and paragraphs about 've' to be ignored + # 06/01/2025: rolled back! + # JJM 06/01/2025 've' conditional avoid fields ("variants to avoid"): + # important to keep for gparser to guess. 
Example twa = tɔgɔ (bam) + # but some introduce too much unwanted ambiguity : + # Example (bam) : kó ve of gó (bad) as there are already other legitimate "ko" (lemmafields) + # => need to create a first pass to load all values of lemmafields in a list (avoidlist) + # then in 2d (normal) pass, check if ve already in avoidlist, then avoid, else add + # In summary, ve is ignored if it is already a legitimate lemma. + # This handling is not ideal : + # One could with that "a man ko" would display ve "ko" (="go") if preceded by "man" or "ka" + # but this sentence analysis is beyond the scope of gparser + # 29/09/2025 CAVEAT : tone included: já is not in the lemmas, so this \ve of díya is NOT Ignored. + # if tone is exluded - this behaviour is acceptable for a build + # however it is not for a normal work in gdisamb + # a better handling should be in mparser: + # list forms that are different from the original, but only at the end of possible lemma vars + # => rolled back self._dict = DabaDict() self._variants = VariantsDict(canonical=canonical) self._polisemy = defaultdict(ddlist) @@ -814,6 +860,7 @@ def __init__(self, filename, encoding='utf-8', store=True, self.inverse = inverse self.lemmafields = lemmafields self.variantfields = variantfields + self.conditionalavoidfields = conditionalavoidfields self.glossfields = glossfields ignore = False lemmalist = [] @@ -835,7 +882,8 @@ def parsemm(v): def normalize(value): try: - return normalizeText(value.translate({ord(u'.'): None, ord(u'-'):None}).lower()) + #return normalizeText(value.translate({ord(u'.'): None, ord(u'-'):None}).lower()) # JJM removes lower + return normalizeText(value.translate({ord(u'.'): None, ord(u'-'):None})) except AttributeError: return value @@ -891,57 +939,211 @@ def process_record(key, lemmalist, ps, glossdict): self._variants.add(list(zip(*lemmalist))[1]) with codecs.open(filename, 'r', encoding=encoding) as dictfile: + """ removed 29/09/2025 + # print("open 1st pass:",filename) + # first pass: 
created avoid list later used to filter through conditionalavoidfields + avoidlist=[] + for line in dictfile: + if line.startswith('\\'): + #line = unicodedata.normalize('NFKD', line) = normalizeText + tag, space, value = line[1:].partition(' ') + value = value.strip() + if tag in self.lemmafields: + value = normalizeText(value) + if value not in avoidlist: + avoidlist.append(value) + """ + # normal pass: + with codecs.open(filename, 'r', encoding=encoding) as dictfile: + # print("open 2d pass:",filename) + mmlevel=0 + morphemetext="" + key="" for line in dictfile: self.line = self.line + 1 - # end of the artice/dictionary + # end of the article/dictionary if not line or line.isspace(): - if not ignore: - process_record(key, lemmalist, ps, glossdict) + if key and not ignore: + lemmalist2=[] # build lemmalist2 with key and Gloss + for x in lemmalist: + lkey=x[0] + lxkey=lkey + if ":" in lkey: lkey,lxkey=lkey.split(":",1) # special case generated by 've' + morphemetext=x[1] + if morphemetext: + #balance closing brackets + nrb=morphemetext.count("]") + nlb=morphemetext.count("[") + if nlb > nrb: + morphemetext+="]"*(nlb-nrb) + # elif nlb < nrb : # should not happen + + morphemetext=morphemetext.replace(" ]","]").strip() + morphemetext=" ["+morphemetext+"]" + morphemetext=lxkey+":"+pstext+":"+glosstext+morphemetext + + # morpheme as a Gloss + #print("morphemetext:",morphemetext) + try: + toks = daba.grammar.str_tokenize(morphemetext) + g = daba.grammar.stringgloss_parser().parse(toks) + except: + #print("erreur de mm sur : ",morphemetext) + continue + + lemmalist1=[] + lemmalist1.append(lkey) + lemmalist1.append(g) + lemmalist2.append(lemmalist1) + + process_record(key, lemmalist2, ps, glossdict) + + mmlevel=0 + morphemetext="" + ignore = False lemmalist = [] ps = () + glosstext="" glossdict = {} key = None lemma = None + elif line.startswith('\\'): - line = unicodedata.normalize('NFKD', line) + # line = unicodedata.normalize('NFKD', line) tag, space, value = 
line[1:].partition(' ') value = value.strip() + if tag in glossfields: + value = unicodedata.normalize('NFC',value) + else: + value = normalizeText(value) + if tag in ['lang', 'ver', 'name']: self._dict.__setattr__(tag, value) + elif tag in self.ignorelist: ignore = True + elif tag in self.lemmafields: if self.normalize: key = normalize(value) else: key = value - lemmalist.append(make_item(value)) - if not lemma: - lemma = value - elif tag in self.variantfields: - if lemma: - lemmalist.append(make_item(lemma, key=value)) - else: + + if " " in key : + key=key.replace(" "," ") # replace by hard space (cf enciclop "famous names") + # print("\033[1mILLEGAL\033[0m space in ",key," replaced by hard space") + + lemmalist1=[] + lemmalist1.append(key) + lemmalist1.append('') # Gloss in text form (morphemes) + lemmalist.append(lemmalist1) + morphemetext="" + mmlevel=0 + ignoremm=False + + elif tag in self.variantfields and tag not in self.conditionalavoidfields: # ve should inherit gloss from lx + if " " in value : + value=value.replace(" "," ") # replace by hard space (cf enciclop "famous names") + # print("\033[1mILLEGAL\033[0m space in ",key," '",tag,"' variant for ",value," replaced by hard space") + lemmalist1=[] + lemmalist1.append(value) + lemmalist1.append('') # Gloss in text form (morphemes) + lemmalist.append(lemmalist1) + # print("va? 
lemmalist:",lemmalist) + morphemetext="" + mmlevel=0 + ignoremm=False + #else: # shouldn't happen: variant should not come before lemma - lemmalist.append(make_item(value)) - elif tag in ['mm']: - lemmalist[-1][1] = lemmalist[-1][1]._replace(morphemes=lemmalist[-1][1].morphemes+(parsemm(value),)) + #lemmalist.append(make_item(value)) + + elif tag in self.conditionalavoidfields: + # removed 29/09/2025: + # if value not in avoidlist: + # print("add ve:",value) + if " " in value : + value=value.replace(" "," ") # replace by hard space (cf enciclop "famous names") + # print("\033[1mILLEGAL\033[0m space in ",key," 've' variant for ",value," replaced by hard space") + try: + lemmalist1=[] + #lemmalist1.append(value) # try: value+":"+key and split before process to keep key ? (check for ":" in lkey string) + lemmalist1.append(value+":"+key) # tried lemmalist1.append("?"+value+":"+key)= never picked in gparser + lemmalist1.append(lemmalist[0][1]) # inherits lx gloss + lemmalist.append(lemmalist1) + # print("ve? 
lemmalist:",lemmalist) + # [['gó', ''], ['kó:gó', '']] + except: + print("error / value, key",value,key) + ignoremm=True + + elif tag=="vt": # JJM 26-06-2025 side effect of removing vt from variantfields + morphemetext="" + mmlevel=0 + ignoremm=True + + elif tag.startswith("mm") and not ignoremm : # caveat : supposed to be something like "mmmm" BUT could be as well "mmaa" + morphemetext="" + thislevel=len(tag)-2 + if mmlevelthislevel: + morphemetext+="]"*(mmlevel-thislevel)+" " + mmlevel=thislevel + morphemetext+=value+" " + lemmalist[-1][1] = lemmalist[-1][1]+morphemetext + elif tag in ['ps'] and not ps: if value: ps = tuple(value.split('/')) + pstext=value else: ps = () + elif tag in self.glossfields: + + if " " in value : + value=value.replace(" ",".") + # print("\033[1mILLEGAL\033[0m space in gloss for ",key," replaced by dot:",value) + glossdict[tag] = value - elif tag in ['gv']: + if tag=="gf": glosstext=value # otherwise the last glossfields will be published, eg russian! + + elif tag in ['gv','gvf']: # JJM added gvf 22/6/24 if polisemy: self._polisemy[key][select_gloss(glossdict)].append(value) dk = detone(key) if not dk == key: self._polisemy[dk][select_gloss(glossdict)].append(value) - else: - process_record(key, lemmalist, ps, glossdict) + + else: # when the for loop is finished, do the following (last line) + if key and not ignore: + lemmalist2=[] # build lemmalist2 with key and Gloss + for x in lemmalist: + lkey=x[0] + lxkey=lkey + if ":" in lkey: lkey,lxkey=lkey.split(":",1) # special case generated by conditionalavoidfields + morphemetext=x[1] + if morphemetext: + if mmlevel!=0: + morphemetext+="]"*mmlevel + morphemetext=morphemetext.replace(" ]","]").strip() + morphemetext=" ["+morphemetext+"]" + morphemetext=lxkey+":"+pstext+":"+glosstext+morphemetext + + # morpheme as a Gloss + try: + toks = daba.grammar.str_tokenize(morphemetext) + g = daba.grammar.stringgloss_parser().parse(toks) + except: + # print("ERR erreur de mm sur : ",morphemetext) + 
continue + lemmalist1=[] + lemmalist1.append(key) + lemmalist1.append(g) + lemmalist2.append(lemmalist1) + + process_record(key, lemmalist2, ps, glossdict) if not self._dict.attributed(): print(r"Dictionary does not contain obligatory \lang, \name or \ver fields.\ diff --git a/daba/gdisamb.py b/daba/gdisamb.py old mode 100644 new mode 100755 index 5839fcc..dfd5dd3 --- a/daba/gdisamb.py +++ b/daba/gdisamb.py @@ -54,6 +54,9 @@ LocaldictLookupEvent, EVT_LOCALDICT_LOOKUP = wx.lib.newevent.NewCommandEvent() LocaldictSaveEvent, EVT_LOCALDICT_SAVE = wx.lib.newevent.NewCommandEvent() +global bamananGV, maninkaGV +bamananGV=True # JJM - default value for Gloss Validations specific to Bamanan +maninkaGV=False # JJM 24/03/2024 Settings should be saved # UTILITY functions and no-interface classes @@ -70,8 +73,11 @@ def get_basename(fname): like .pars.html and .dis.html""" basename = os.path.splitext(os.path.basename(fname))[0] pars = basename.rfind('.pars') + repl = basename.rfind('.repl') # added JJM if pars > 0: return basename[:pars] + elif repl > 0: + return basename[:repl] dis = basename.rfind('.dis') if dis > 0 and len(basename)-dis <= 7: return basename[:dis] @@ -96,6 +102,9 @@ def SetValue(self, string): def makeGlossString(gloss, morphemes=False): """string representation of the Gloss object (for labelling buttons and the like)""" + # print("makeGlossString morphemes passed:",morphemes) + # print("makeGlossString gloss.morphemes passed:",gloss.morphemes) + # print("makeGlossString gloss._str_ passed:",str(gloss)) if not ''.join(gloss.ps) and not gloss.gloss and not gloss.morphemes: return gloss.form elif morphemes and gloss.morphemes: @@ -114,9 +123,9 @@ class SentAnnot(object): ---------- pnum (int) : paragraph number (0-based) snum (int) : sentence number (0-based) - senntoken (PlainToken) : sentence token + senttoken (PlainToken) : sentence token senttext (str) : sentence text - glosslist ([WordToken]) : list of anntotations for each token in a sentence + glosslist 
([WordToken]) : list of annotations for each token in a sentence selectlist ([[WordToken]]) : list of annotations selected by user (for each token) attrs (dict) : sentence-level attributes (proxy to senttoken.attrs) """ @@ -218,9 +227,18 @@ def write(self, filename): glosstoken.setGlosslist(selectlist) outgloss.append(glosstoken) out[-1].append((sent.senttoken, outgloss)) + # added JJM from format HtmlReader (removed numpar, unkown here) + # these were not updated on save (only on load) + for k, v in [ + ('_auto:words', self.numwords), + ('_auto:sentences', self.numsent) + ]: + self.metadata[k] = str(v) + fwriter = daba.formats.HtmlWriter((self.metadata, out), filename) fwriter.write() + class EditLogger(object): """log token edit operations""" @@ -263,7 +281,7 @@ def __init__(self, processor): @property def nmatches(self): - """property holding the number of mathces""" + """property holding the number of matches""" return len(self.matches) def _searcher(self, searchstr, searchtype, startsent): @@ -272,7 +290,7 @@ def _searcher(self, searchstr, searchtype, startsent): self.matches = [] self.searchstr = searchstr self.history.append(self.searchstr) - if self.ignorecase: + if searchtype not in ('gf','in','re',) and self.ignorecase: searchstr = searchstr.lower() glosses = self.processor.glosses if startsent: @@ -287,6 +305,63 @@ def _searcher(self, searchstr, searchtype, startsent): # FIXME: should not happen if all words are proper GlossTokens except (AttributeError): print(word) + elif searchtype == "ps": + for wnum, word in enumerate(sent.glosslist): + try: + s0=":"+self.searchstr+":" + if s0 in str(word.gloss) : + match = (sent.snum, wnum) + self.matches.append(match) + # FIXME: should not happen if all words are proper GlossTokens + except (AttributeError): + print(word) + elif searchtype == "gf": + for wnum, word in enumerate(sent.glosslist): + try: + s1=":"+self.searchstr+" " + s2=":"+self.searchstr+"]" + sgloss=str(word.gloss)+" " + if s1 in sgloss or s2 in 
sgloss : + match = (sent.snum, wnum) + self.matches.append(match) + # FIXME: should not happen if all words are proper GlossTokens + except (AttributeError): + print("except in searcher searchtype 'gf' :",word) + elif searchtype == "lx": + for wnum, word in enumerate(sent.glosslist): + try: + s1=self.searchstr+":" + s2="["+s1 + s3=" "+s1 + sgloss=str(word.gloss) + if sgloss.startswith(s1) or s2 in sgloss or s3 in sgloss: + match = (sent.snum, wnum) + self.matches.append(match) + # FIXME: should not happen if all words are proper GlossTokens + except (AttributeError): + print("except in searcher searchtype 'lx' :",word) + elif searchtype == "in": + for wnum, word in enumerate(sent.glosslist): + try: + sgloss=str(word.gloss) + if searchstr in sgloss: + match = (sent.snum, wnum) + self.matches.append(match) + # FIXME: should not happen if all words are proper GlossTokens + except (AttributeError): + print("except in searcher searchtype 'in' :",searchstr,word) + elif searchtype == "re": + # print("_searcher 're' searchstr:",searchstr) + for wnum, word in enumerate(sent.glosslist): + try: + sgloss=str(word.gloss) + # print("_searcher 're' sgloss:",sgloss, re.search(searchstr,sgloss)) + if re.search(searchstr,sgloss): + match = (sent.snum, wnum) + self.matches.append(match) + # FIXME: should not happen if all words are proper GlossTokens + except (AttributeError): + print("except in searcher searchtype 're' :",searchstr,word) elif searchtype == 'sentence part': for matchobj in re.finditer(self.searchstr, sent.senttext): self.matches.append((sent.snum, matchobj)) @@ -302,6 +377,21 @@ def find(self, searchstr, startsent=0): """ if ' ' in searchstr: searchtype = 'sentence part' + elif searchstr.startswith(':::'): + searchtype = "gf" + searchstr=normalizeText(searchstr[3:]) + elif searchstr.startswith('::'): + searchtype = "ps" + searchstr=searchstr[2:] + elif searchstr.startswith(":"): + searchtype='lx' + searchstr=searchstr[1:] + elif searchstr.startswith('**'): + 
searchtype = "re" + searchstr=searchstr[2:] + elif searchstr.startswith("*"): + searchtype='in' + searchstr=searchstr[1:] else: searchtype = 'word part' matches = self._searcher(searchstr, searchtype, startsent) @@ -349,9 +439,31 @@ def onMouseEvent(self, event): self.GetTopLevelParent().sentpanel.OnSaveResults(event) self.GetTopLevelParent().ShowSent(self.num) self.GetTopLevelParent().Layout() - + self.GetTopLevelParent().notebook.ChangeSelection(0) # added JJM: switches to sentpanel event.Skip() +class MetaText(wx.StaticText): + """Meta data overview widget""" + def __init__(self, parent, id, num=None, *args, **kwargs): + wx.StaticText.__init__(self, parent, *args, **kwargs) + self.num = num + self.parent = parent + + font = wx.Font(12, wx.FONTFAMILY_MODERN, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL) + self.SetFont(font) + + + """def onMouseEvent(self, event): + + if event.Moving(): + self.SetCursor(wx.StockCursor(wx.CURSOR_HAND)) + elif event.LeftDown(): + self.GetTopLevelParent().metapanel.OnSaveResults(event) + self.GetTopLevelParent().ShowSent(self.num) + self.GetTopLevelParent().Layout() + + event.Skip() + """ class GlossButton(wx.Panel): """Single button widget for selecting a gloss variant @@ -365,13 +477,13 @@ class GlossButton(wx.Panel): children (list) : a list of the nested morphemes of a gloss gloss (Gloss) : widget's Gloss """ - def __init__(self, parent, gloss, statecolours, disabled=False, + def __init__(self, parent, gloss, statecolours, disabled=False, addbylocaldict=False, *args, **kwargs): """GlossButton constructor :param gloss: Gloss to be displayed on the button :type gloss: Gloss - :param statecolors: colors for vairous state of the selector + :param statecolors: colors for various state of the selector :type statecolors: dict""" wx.Panel.__init__(self, parent, *args, **kwargs) self.selected = False @@ -379,18 +491,24 @@ def __init__(self, parent, gloss, statecolours, disabled=False, self.gloss = gloss self.disabled = disabled 
self.statecolours = statecolours + self.addbylocaldict = addbylocaldict box = wx.BoxSizer(wx.VERTICAL) # prepare main gloss button + textforbutton=makeGlossString(gloss) + if self.addbylocaldict: textforbutton="*"+textforbutton if self.disabled: - self.main = wx.Button(self, -1, makeGlossString(gloss)) + self.main = wx.Button(self, -1, textforbutton) self.main.Disable() else: - self.main = wx.ToggleButton(self, -1, makeGlossString(gloss)) + self.main = wx.ToggleButton(self, -1, textforbutton) self.main.Bind(wx.EVT_TOGGLEBUTTON, self.OnToggled) fore, back = self.statecolours['deselected'] self.main.SetForegroundColour(fore) - self.main.SetBackgroundColour(back) + if self.addbylocaldict: + self.main.SetBackgroundColour((255, 230, 200, 255)) # some sort of beige? + else: + self.main.SetBackgroundColour(back) self.Refresh() box.Add(self.main, 0, wx.EXPAND) # prepare morphemes buttons recursively @@ -418,11 +536,17 @@ def DoToggle(self): if self.selected: fore, back = self.statecolours['selected'] self.main.SetForegroundColour(fore) - self.main.SetBackgroundColour(back) + if self.addbylocaldict: + self.main.SetBackgroundColour((255, 230, 200, 255)) # some sort of beige? + else: + self.main.SetBackgroundColour(back) else: fore, back = self.statecolours['deselected'] self.main.SetForegroundColour(fore) - self.main.SetBackgroundColour(back) + if self.addbylocaldict: + self.main.SetBackgroundColour((255, 230, 200, 255)) # some sort of beige? 
+ else: + self.main.SetBackgroundColour(back) self.Refresh() self.ToggleChildren() @@ -453,6 +577,7 @@ def __init__(self, parent, id, title, gloss, *args, **kwargs): wx.Dialog.__init__(self, parent, id, title, style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER, *args, **kwargs) + self.as_gloss = gloss self.morphemes = [] self.save = True @@ -463,8 +588,12 @@ def __init__(self, parent, id, title, gloss, *args, **kwargs): config.Read('colors/deselected/back', 'White'))} vbox_top = wx.BoxSizer(wx.VERTICAL) + global bamananGV,maninkaGV + texte_à_afficher="Composez votre glose | x:ps:gloss [y...] | Gloss string" + if bamananGV: texte_à_afficher="(bam) "+texte_à_afficher + elif maninkaGV: texte_à_afficher="(emk) "+texte_à_afficher vbox_top.Add(wx.StaticText(self, wx.ID_ANY, - "Gloss string (edit inplace):")) + texte_à_afficher,size=(600,20))) # JJM dirty trick to enlarge dialog glossstring = str(self.as_gloss) self.glosstext = wx.ComboBox(self, wx.ID_ANY, glossstring, choices=[glossstring]) @@ -491,12 +620,43 @@ def FitGlosstextWidth(self): self.GetSizer().SetItemMinSize(self.glosstext, (gwidth + 15, gheight + 10)) self.Layout() self.Fit() + + def tomonolith(m) : + mapping = { 'à':'à', 'â':'â', 'é':'é', 'ê':'ê', 'è':'è', 'ë':'ë', 'î':'î', 'ï':'ï', 'ô':'ô', 'û':'û', 'ù':'ù', 'ç':'ç', 'À':'À', 'Ç':'Ç', 'Ê':'Ê', 'Ô':'Ô'} + lxps=m.groups()[0] + gloss=m.groups()[1] + for k, v in mapping.items(): + if k in gloss: + gloss = gloss.replace(k, v) + return lxps+gloss def UpdateInterface(self, gloss): + """ + def tomonolith(m) : + mapping = { 'à':'à', 'â':'â', 'é':'é', 'ê':'ê', 'è':'è', 'ë':'ë', 'î':'î', 'ï':'ï', 'ô':'ô', 'û':'û', 'ù':'ù', 'ç':'ç', 'À':'À', 'Ç':'Ç', 'Ê':'Ê', 'Ô':'Ô'} + lxps=m.groups()[0] + gloss=m.groups()[1] + for k, v in mapping.items(): + if k in gloss: + gloss = gloss.replace(k, v) + return lxps+gloss + """ """update dialog (gbutton, glosstext) given a gloss""" self.freeze = True - glossstring = str(gloss) + glossstring = str(gloss) # ou bien self.glosstext ? 
+ # glossstring = re.sub(r'([^\:\[ ]+\:[^\:\[ ]*\:)([^\: ]+)',tomonolith,glossstring) # JJM temp fix cursor = self.glosstext.GetInsertionPoint() + #print("'"+glossstring+"', len(glossstring):",len(glossstring), ", cursor:", cursor) + diacritics="\u0301\u0300\u0302\u030c\u00B8\u0308\u005e\u02c6" # high, low, decreasing, increasing tone diacritics + cedilla and diaeresis (trema as in ë) + #print("fin de glossstring=",glossstring[-1]) + if glossstring[-1] in diacritics: + cursor=len(glossstring) # JJM in case monolith characters are split : ê -> e ̂ + elif cursor",savedglosses) + # font = wx.Font(10, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_BOLD, True) + # N'ko: essayer aussi en enlevant l'article tonal final (?), ou en l'ajoutant + elif textscript=="N’Ko" : + if formlookup[-1]=="`" : + formlookup=formlookup[:-1] + if formlookup in self.GetTopLevelParent().localdict: + savedglosses = self.GetTopLevelParent().localdict[formlookup] + else: + formlookup=formlookup+"`" + if formlookup in self.GetTopLevelParent().localdict: + savedglosses = self.GetTopLevelParent().localdict[formlookup] + + if len(savedglosses)>0: + for gloss in savedglosses: + if gloss not in alreadyAdded: + self.gbutton = GlossButton(self, gloss, self.statecolours,addbylocaldict=True) + self.children.append(self.gbutton) + self.sizer.Add(self.gbutton, 0, *self.sizerflags) + alreadyAdded.append(gloss) + # end JJM + if len(self.glosslist) > 1: + # above JJM code was here in 1st implementation - but red button could not benefit from localdict + for gloss in glosslist: - gbutton = GlossButton(self, gloss, self.statecolours) - self.children.append(gbutton) - self.sizer.Add(gbutton, 0, *self.sizerflags) + # JJM add : screen already added buttons + if gloss not in alreadyAdded: + gbutton = GlossButton(self, gloss, self.statecolours) + self.children.append(gbutton) + self.sizer.Add(gbutton, 0, *self.sizerflags) self.SetSizer(self.sizer) self.Layout() @@ -868,10 +1213,10 @@ def 
GetWordToken(self): def OnContextMenu(self, evt): """context menu shown on right-click on selector's area""" if not hasattr(self, "joinfwID"): - self.joinfwID = wx.NewId() - self.joinbwID = wx.NewId() - self.splitID = wx.NewId() - self.changeID = wx.NewId() + self.joinfwID = wx.NewIdRef(count=1) # was NewId() JJM + self.joinbwID = wx.NewIdRef(count=1) # was NewId() + self.splitID = wx.NewIdRef(count=1) # was NewId() + self.changeID = wx.NewIdRef(count=1) # was NewId() self.Bind(wx.EVT_MENU, self.OnJoinForward, id=self.joinfwID) self.Bind(wx.EVT_MENU, self.OnJoinBackward, id=self.joinbwID) @@ -1130,19 +1475,28 @@ def calcCharSpans(self, tokenbuttons): self.charspans = [] startchar = 0 charlength = 0 + metadict=self.GetTopLevelParent().processor.metadata # added JJM 03/10/24 + + textscript="" + if "text:script" in metadict: + textscript=metadict["text:script"] + for btn in tokenbuttons: token = btn.token.token charlength = len(token) tokenindex = self.text[startchar:].find(token) - if tokenindex == -1: - # FIXME: handle missing tokens properly - tokenindex = startchar - charlength = 0 - notfound = wx.MessageDialog(self, u'Token not found in the source sentence: ' + token, 'Token not found', wx.OK) - notfound.ShowModal() - notfound.Destroy() - else: + if textscript=="N’Ko": # FIX THIS (charlength irrelevant)/do not call calcCharSpans for N'ko ? 
tokenindex += startchar + else: + if tokenindex == -1: + # FIXME: handle missing tokens properly + tokenindex = startchar + charlength = 0 + notfound = wx.MessageDialog(self, u'Token not found in the source sentence: ' + token, 'Token not found', wx.OK) + notfound.ShowModal() + notfound.Destroy() + else: + tokenindex += startchar charspan = (tokenindex, charlength) startchar = tokenindex+charlength self.charspans.append(charspan) @@ -1172,6 +1526,16 @@ def SetSentence(self, senttoken, tokenbuttons): """typeset and color sentence text, attach button ids""" self.token = senttoken self.text = senttoken.value + # two fixes (should be fixed elsewhere) - are there more "sentences" ? + self.GetTopLevelParent().sentpanel.senttext = self.text # fixed JJM : after token split/join OnCopyToClipboard was returning old sentence + # now fixed below : if subsequent sentence split, old sentence before after token split/join still there! created havok + self.GetTopLevelParent().processor.glosses[self.GetTopLevelParent().sentpanel.snum].senttext = self.text + #print("SetSentence - processor.glosses[snum]:") + #sent=self.GetTopLevelParent().processor.glosses[self.GetTopLevelParent().sentpanel.snum] + #print("SetSentence - glosses... 
senttext=",sent.senttext) + #for g in sent.glosslist: + # print(str(g)) + self.calcCharSpans(tokenbuttons) self.SetText(self.text) self.SetReadOnly(True) @@ -1240,6 +1604,8 @@ def OnTokenJoin(self, evt): snum = evt.snum startfirst, lenfirst = self.charspans[evt.first] startsecond, lensecond = self.charspans[evt.second] + # print("OnTokenJoin evt.first,second:",startfirst, startsecond) + #AttributeError: print("OnTokenJoin evt.index:",evt.index) first = self.text[startfirst:startfirst+lenfirst] second = self.text[startsecond:startsecond+lensecond] self.UpdateText(startfirst, startsecond+lensecond, u''.join((first, second)), snum) @@ -1258,6 +1624,8 @@ def UpdateText(self, start, end, newtext, snum): """replace characters between start and end with newtext, update colors and button bindings""" self.text = ''.join([self.text[:start], newtext, self.text[end:]]) self.token.value = self.text + #print("UpdateText self.token.value",self.token.value) + #print("UpdateText self.token",self.token) sentevent = SentenceEditEvent(self.GetId(), snum=snum, sent=self.token) wx.PostEvent(self.GetEventHandler(), sentevent) @@ -1268,9 +1636,9 @@ def Highlight(self, start, end): def OnContextMenu(self, evt): """pop-up sentence context menu on right-click""" if not hasattr(self, "joinfwID"): - self.splitID = wx.NewId() - self.joinfwID = wx.NewId() - self.joinbwID = wx.NewId() + self.splitID = wx.NewIdRef(count=1) # was NewId() JJM + self.joinfwID = wx.NewIdRef(count=1) # was NewId() + self.joinbwID = wx.NewIdRef(count=1) # was NewId() self.Bind(wx.EVT_MENU, self.OnJoinForward, id=self.joinfwID) self.Bind(wx.EVT_MENU, self.OnJoinBackward, id=self.joinbwID) @@ -1311,12 +1679,15 @@ def OnSplitSentence(self, evt): bytepos = self.GetCurrentPos() charpos = self.calcCharPos(bytepos) last = len(self.text) + #print("OnSplitSentence - text:",self.text) if charpos < last: first = self.intervals.overlap(0, charpos) tnum = len(first) if self.intervals[charpos]: charpos = charpos-1 if 
self.intervals[charpos]: + #print("OnSplitSentence -impossible- intervals[charpos]=",self.intervals[charpos]) + # On ne peut splitter qu'entre deux "boutons": pas de boutons pour les ponctuations et les Tags... dommage! self.SplitImpossibleError(evt) return # make sure that both parts contain tokens @@ -1325,6 +1696,7 @@ def OnSplitSentence(self, evt): ssplitevent = SentenceSplitEvent(self.GetId(), snum=snum, tnum=tnum, charpos=charpos) wx.PostEvent(self.GetEventHandler(), ssplitevent) else: + #print("OnSplitSentence -impossible- tnum, len(charspan)",tnum,len(self.charspans)) self.SplitImpossibleError(evt) def SplitImpossibleError(self, e): @@ -1373,13 +1745,15 @@ def SetSentence(self, senttoken, snum): self.snum = snum if senttoken.attrs: alist = senttoken.attrs.items() - alist.sort() + #alist.sort() # no longer works in Python 3 + alist=sorted(alist) # mod JJM 6/11/24 for keytext, value in alist: key = wx.StaticText(self, wx.ID_ANY, keytext) field = wx.TextCtrl(self, wx.ID_ANY, value) field.Bind(wx.EVT_TEXT, self.OnEditValue) delbutton = wx.Button(self, wx.ID_ANY, style=wx.BU_EXACTFIT | wx.BU_NOTEXT) - delbutton.SetBitmapLabel(wx.ArtProvider.GetBitmap(wx.ART_DELETE | wx.ART_MENU)) + #delbutton.SetBitmapLabel(wx.ArtProvider.GetBitmap(wx.ART_DELETE | wx.ART_MENU)) + delbutton.SetBitmapLabel(wx.ArtProvider.GetBitmap(wx.ART_DELETE)) # JJM 6/11/24 delbutton.Bind(wx.EVT_BUTTON, self.OnDeleteAttribute) self.fields[keytext] = field self.attrs[keytext] = value @@ -1451,20 +1825,185 @@ def __init__(self, parent, *args, **kwargs): wx.ScrolledWindow.__init__(self, parent, *args, **kwargs) self.SetScrollRate(20, 20) self.parent = parent + self.Sizer = wx.BoxSizer(wx.VERTICAL) + self.isFileShown=False + self.st = SentText(self, -1, num=-1, style=wx.ST_NO_AUTORESIZE) def ShowFile(self, sentlist): """show source text for a file""" - Sizer = wx.BoxSizer(wx.VERTICAL) + + if self.isFileShown: + # print("ShowFile : Clear & Remove") + self.Sizer.Clear(delete_windows=True) # that's 
the key element missing : delete_windows=True + self.Sizer.Remove(self.Sizer) # proably does not do a thing... + + # added JJM : load html file & html sentences and gloss + #fileIN = open(os.path.join(self.GetTopLevelParent().dirname, self.GetTopLevelParent().filename), "r") # OK on Unix, not windows + fileIN=codecs.open(os.path.join(self.GetTopLevelParent().dirname, self.GetTopLevelParent().filename), 'r', encoding="UTF-8") + htmlfile=fileIN.read() + fileIN.close() + htmlfile=re.sub(r"\r\n","\n",htmlfile,0,re.U|re.MULTILINE) # takes care of windows line endings? + + head,body=htmlfile.split("") + # old format ? + # check if body has strange sentence ending sequence + body=re.sub(r'\n\n','\n\n\n',body,0,re.U|re.MULTILINE) + + if '\n\n\n\n") + + #Sizer = wx.BoxSizer(wx.VERTICAL) for n, senttoken in enumerate(sentlist): - st = SentText(self, -1, num=n, style=wx.ST_NO_AUTORESIZE) - st.SetLabel(senttoken.value) - st.Wrap(self.GetClientSize().GetWidth()-20) - st.Bind(wx.EVT_LEFT_DOWN, st.onMouseEvent) - Sizer.Add(st, 1, wx.EXPAND) + self.st = SentText(self, -1, num=n, style=wx.ST_NO_AUTORESIZE) + stv=senttoken.value # JJM : stv and stv handling - add marker for ambiguity in sentence + stv=re.sub(r'\n','␤',stv) + stv=str(n+1)+". 
"+stv + padleft="\t \t" - self.SetSizer(Sizer) + # broken html structures may break here + try: + if "lemma var" in sentences[n]: padleft="\t*\t" + except IndexError: + pass + + stv=padleft+stv + self.st.SetLabel(stv) + #st.SetLabel(senttoken.value) + self.st.Wrap(self.GetClientSize().GetWidth()-20) + self.st.Bind(wx.EVT_LEFT_DOWN, self.st.onMouseEvent) + #Sizer.Add(st, 1, wx.EXPAND) + self.Sizer.Add(self.st, 1, wx.EXPAND) + + #self.SetSizer(Sizer) + self.SetSizer(self.Sizer) self.Layout() + self.isFileShown=True +class MetaPanel(wx.ScrolledWindow): + def __init__(self, parent, *args, **kwargs): + wx.ScrolledWindow.__init__(self, parent, *args, **kwargs) + self.SetScrollRate(20, 20) + self.parent = parent + self.isMetaShown=False + self.Sizer = wx.BoxSizer(wx.VERTICAL) + + def ShowMetas(self, metadata): + + if self.isMetaShown: + self.Sizer.Clear(delete_windows=True) # that's the key element missing : delete_windows=True + self.Sizer.Remove(self.Sizer) # probably does not do a thing... + + metas={} + items=[] + authors=False + for x,y in metadata.items(): + item,subitem=x.split(":") + if item=="author": + if subitem=="name": subitem="_name" # force sorting as 1st + elif item in ["source","text"] : + if subitem=="title": subitem="_title" + + if item in items: + if item=="author" and authors: + vy=y.split("|") + authindex=0 + for v in vy: + authindex+=1 + metas[item][authindex][subitem]=v + else: + metas[item][subitem]=y + else: + items.append(item) + if item=="author": + if "|" in y: + vy=y.split("|") + authors=True + authindex=0 + for v in vy: + authindex+=1 + if authindex==1: + metas[item]={authindex: {subitem:v}} + else : metas[item][authindex]={subitem:v} + else: + metas[item]={subitem:y} + #print("\n",metas,"\n") + else: + metas[item]={subitem:y} + + # print sorted metas + metatxt="\n" + for x,y in sorted(metas.items()): + metatxt+=x+"\n" + for w,z in sorted(y.items()): + if x=="author" and authors: + metatxt+="\t"+str(w)+"\n" + for wn,zn in sorted(z.items()): 
+ metatxt+="\t\t"+wn.strip("_")+" :\t "+zn+"\n" + else: + metatxt+="\t"+w.strip("_")+" :\t "+z+"\n" + metatxt=metatxt.replace("\n","\n ") + + self.st= wx.TextCtrl(self,style=wx.TE_MULTILINE|wx.TE_DONTWRAP) + self.st.SetValue(metatxt) + #font = wx.Font(12, wx.FONTFAMILY_MODERN, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL) + #self.SetFont(font) + self.metafont = self.GetFont() + self.metafont.SetPointSize(self.metafont.GetPointSize() + 2) + self.st.SetFont(self.metafont) + self.Sizer.Add(self.st, 1, wx.EXPAND) + + self.SetSizer(self.Sizer) + self.Layout() + self.isMetaShown=True + +class DictPanel(wx.ScrolledWindow): + def __init__(self, parent, *args, **kwargs): + wx.ScrolledWindow.__init__(self, parent, *args, **kwargs) + self.SetScrollRate(20, 20) + self.parent = parent + self.isDictShown=False + self.Sizer = wx.BoxSizer(wx.VERTICAL) + + def ShowDict(self,localdictfile): + global ldtext + if self.isDictShown: + # print("ShowDict : Clear & Remove") + self.Sizer.Clear(delete_windows=True) # that's the key element missing : delete_windows=True + self.Sizer.Remove(self.Sizer) # probably does not do a thing... + + if os.path.exists(localdictfile): + #ldfile=open(localdictfile,'r') # OK only on UNix + ldfile=codecs.open(localdictfile, 'r', encoding="UTF-8") + ldtext=ldfile.read() + ldtext=re.sub(r"\r\n","\n",ldtext,0,re.U|re.MULTILINE) # takes care of windows line endings? 
+ ldfile.close() + else : ldtext="vous n'avez pas de localdict pour l'instant" + self.ldfile=localdictfile + + centeredLabel = wx.StaticText(self, -1, localdictfile) + self.Sizer.Add(centeredLabel, flag=wx.ALIGN_CENTER_HORIZONTAL) + self.st= wx.TextCtrl(self,style=wx.TE_MULTILINE) + self.st.SetValue(ldtext) + self.Sizer.Add(self.st, 1, wx.EXPAND) + submitButton = wx.Button(self, wx.ID_SAVE,'Save') + submitButton.Bind(wx.EVT_BUTTON, self.OnSave) + self.Sizer.Add(submitButton,0,wx.ALIGN_CENTER) + self.SetSizer(self.Sizer) + self.Layout() + self.isDictShown=True + + def OnSave(self,e): + ldfile=codecs.open(self.ldfile, 'w', encoding="UTF-8") + ldtext=self.st.GetValue() + ldfile.write(ldtext) + ldfile.close() + self.GetTopLevelParent().SetLocaldict(self.ldfile) + class SentPanel(wx.Panel): """Manual disambiguation panel @@ -1488,6 +2027,7 @@ class SentPanel(wx.Panel): searchbutton (wx.SearchCtrl) : search query input field findprevbutton : find previous findnextbutton : find next + nextambigbutton : jump to next sentence with ambiguous words navsizer : navigation sizer sentsource (SentenceText) : sentence text widget sentattrs (SentAttributes) : sentence-level attributes panel @@ -1516,8 +2056,9 @@ def __init__(self, parent, vertical=True, *args, **kwargs): savebutton = wx.Button(self, wx.ID_ANY, 'Save results') savebutton.Bind(wx.EVT_BUTTON, self.OnSaveResults) self.searchbutton = wx.SearchCtrl(self, size=(200, -1), style=wx.TE_PROCESS_ENTER) - self.findprevbutton = wx.Button(self, wx.ID_ANY, '') + self.findprevbutton = wx.Button(self, wx.ID_ANY, '←ꙭ') + self.findnextbutton = wx.Button(self, wx.ID_ANY, 'ꙭ→') + self.nextambigbutton = wx.Button(self, wx.ID_ANY, '???►►') self.navsizer = wx.BoxSizer(wx.HORIZONTAL) sentenceno = wx.StaticText(self, wx.ID_ANY, "Sentence No") sentenceno.SetFont(self.sentfont) @@ -1530,6 +2071,7 @@ def __init__(self, parent, vertical=True, *args, **kwargs): self.navsizer.Add(self.sentof, 0) self.navsizer.Add(prevbutton, 0) 
self.navsizer.Add(nextbutton, 0) + self.navsizer.Add(self.nextambigbutton, 0) self.navsizer.Add(savebutton, 0) self.navsizer.Add(self.searchbutton, 0, wx.EXPAND) self.navsizer.Add(self.findprevbutton, 0) @@ -1537,11 +2079,17 @@ def __init__(self, parent, vertical=True, *args, **kwargs): copybutton = wx.Button(self, wx.ID_COPY) copybutton.Bind(wx.EVT_BUTTON, self.OnCopyToClipboard) self.navsizer.Add(copybutton) + copybutton2 = wx.Button(self, wx.ID_ANY, "Copy2") + copybutton2.Bind(wx.EVT_BUTTON, self.OnCopyToClipboard2) + self.navsizer.Add(copybutton2) + copybuttonrepl = wx.Button(self, wx.ID_ANY, "Copyrepl") # added 14/nov/2024 + copybuttonrepl.Bind(wx.EVT_BUTTON, self.OnCopyToClipboardRepl) + self.navsizer.Add(copybuttonrepl) self.sentsource = SentenceText(self) self.sentattrs = SentAttributes(self) self.Sizer.Add(self.navsizer) self.Sizer.Add(self.sentsource, 0, wx.EXPAND) - self.Sizer.Add(self.sentattrs, 0, wx.EXPAND) + self.Sizer.Add(self.sentattrs, 0, wx.EXPAND) # reset by JJM 6/nov/24 self.SetSizer(self.Sizer) self.Layout() @@ -1559,6 +2107,7 @@ def CreateGlossButtons(self): tokenbuttons = [] self.annotlist = wx.lib.scrolledpanel.ScrolledPanel(self, wx.ID_ANY) self.annotlist.SetScrollRate(20, 20) + self.annotlist.SetBackgroundColour((236, 211, 211, 255)) # was (60, 25, 25, 25) if self.vertical: annotsizer = wx.BoxSizer(wx.HORIZONTAL) else: @@ -1569,17 +2118,39 @@ def CreateGlossButtons(self): annotsizer.Add(abox) self.annotlist.SetSizer(annotsizer) self.annotlist.Layout() + #NOT WORKING #self.annotlist.sb=wx.ScrollBar(self, wx.ID_ANY) + #NOT WORKING self.annotlist.sb=wx.ScrollBar() + """#NOT WORKING + self.sb=wx.ScrollBar(self.annotlist, wx.ID_ANY) + self.sb.Layout() + thumbpos=self.sb.GetThumbPosition() + print("thumbpos",thumbpos) + self.sb.SetThumbPosition(40) + thumbpos=self.sb.GetThumbPosition() + print("new thumbpos",thumbpos) + """ return tokenbuttons def ShowSent(self, sentannot): """set sentence data attributes and show widgets""" self.senttoken, 
self.selectlist, self.tokenlist, self.sentindex = sentannot.as_tuple() self.senttext = sentannot.senttext.strip() + #print ("ShowSent *** sentannot :",sentannot) + """ + print ("*** self.senttoken :",self.senttoken) + print ("*** self.selectlist :",self.selectlist) + #print ("*** self.tokenlist :",self.tokenlist) + print ("*** self.sentindex :", self.sentindex) + print ("self.senttext :",self.senttext) + print ("self.isshown :",self.isshown) + """ + if self.isshown: self.sentsource.ClearSentence() self.sentattrs.ClearSentence() self.Sizer.Remove(self.annotlist.GetSizer()) self.annotlist.Destroy() + self.snum = sentannot.snum self.sentnumbutton.SetValue(self.snum+1) tokenbuttons = self.CreateGlossButtons() @@ -1622,6 +2193,189 @@ def OnCopyToClipboard(self, evt): wx.TheClipboard.SetData(clipdata) wx.TheClipboard.Close() + def OnCopyToClipboard2(self, evt): + """copy sentence text to a clipboard""" + global copy2spacer + if self.senttext: + clipdata2 = wx.TextDataObject() + #clipdata2.SetText(self.senttext) - build it from ligne2 + l2orig="" + l2lx="" + l2ps="" + l2gloss="" + #print("OnCopyToClipboard2: CHECK toknum, (token, selectlist):\n") + glosses = self.GetTopLevelParent().processor.glosses + sent= glosses[self.snum] + # is there a way to generate table only for selected text ??? 
+ # try to get selection + # frm, to = sent.GetSelection() # AttributeError: 'SentAnnot' object has no attribute 'GetSelection' + # frm, to = self.sentsource.GetSelection() + # myselection=self.sentsource.GetStringSelection() + myselectedtext=self.sentsource.GetSelectedText() + myselectedtext=myselectedtext.strip() + myselectedtext=myselectedtext.replace('\u07f8', ",") # N'Ko COMMA - as done in line 2 + myselectedtext=myselectedtext.replace('\u060c', ",") # Arabic comma + myselectedtext=myselectedtext.replace(' , ', ", ") + myselectedtext=myselectedtext.replace('\u061f', "?") # Arabic question mark + myselectedtext=myselectedtext.replace('؛', r":") # U+061B ؛ ARABIC SEMICOLON + myselectedtext=myselectedtext.replace('\u066a', "%") # Arabic Percent sign ٪ + myselectedtext=myselectedtext.replace('\u07f9', "!") # N'Ko EXLAMATION MARK + # selstart=self.sentsource.GetSelectionStart() # same values as frm,to + # selend=self.sentsource.GetSelectionEnd() + # print("GetSelection: frm,to=",frm,to) + # print("selstart, selend=",selstart, selend) + # frm=int(frm/2) # these values seem erratic and unusable!!! + # to=int(to/2) + # rem ; senttext =? sentsource.text - no: sentsource.text should be used, but still, results are impredictable + # print('senttext="'+self.senttext+'"') + # print('sentsource.text="'+self.sentsource.text+'"') + # print("GetSelection:self.senttext[",frm,":",to,"]=",self.senttext[frm:to]) + # print("GetSelection:self.sentsource.text[",frm,":",to,"]=",self.sentsource.text[frm:to]) + # print("GetStringSelection:myselection",myselection) # same as above + #print("GetSelectedText:myselectedtext",myselectedtext) # results OK but position in sentence is unknown- can only select 1st occurrence + # tried and failed with encoder/decoder + # print(self.sentsource.charspans) # how did they get this right ? ah, they don't use mouse selections! 
+ # end try + # print("sent.glosslist=",sent.glosslist) + mysentlist=[] + for w in sent.glosslist: + mysentlist.append(w.token) + """ + print("senttoken=",self.senttoken) + print("senttoken.value=",self.senttoken.value) + #mytoks=enumerate(zip(self.senttoken)) + #print("mytoks=",mytoks) + print("senttext=",self.senttext) + mysent=self.senttext # need a better way!!!?? ????????????? + # this approach fails in case of split / join + #mysent=self.senttoken.value + #mysent=mysent.strip() + mysent=mysent.replace("ߵ","ߵ ") + mysent=mysent.replace("."," .") + mysent=mysent.replace('\u07f8', " \u07f8") # N'Ko COMMA + mysent=mysent.replace('\u060c', " \u060c") # Arabic comma + mysent=mysent.replace('\u061f', " \u061f") # Arabic question mark + mysent=mysent.replace('؛', r" ؛") # U+061B ؛ ARABIC SEMICOLON + mysent=mysent.replace('\u07fa', " \u07fa") # N'KO LAJANYALAN + mysent=mysent.replace('\u066a', " \u066a") # Arabic Percent sign ٪ + mysent=mysent.replace('\u07f9', " \u07f9") # N'Ko EXLAMATION MARK + mysent=mysent.replace('\n', " ") # Newline side-effects + while " " in mysent: + mysent=mysent.replace(" "," ") + mysent=mysent.strip() + mysentlist=mysent.split(" ") + """ + mysentlistselected="" + for (toknum, (token, selectlist)) in enumerate(zip(self.tokenlist, self.selectlist)): + if selectlist: + mytokenlist=selectlist + else: + mytokenlist=token.glosslist + mytoken=mytokenlist[0] + # print("mytoken:", mytoken) + if myselectedtext!="": + if mysentlistselected=="": mysentlistselected=mysentlistselected+mysentlist[toknum] + else: + if mytoken.gloss=="c" or mysentlistselected[-1] in ["'","’","ߴ","ߵ"]: # added N'ko apostrophes (high & low tone) + mysentlistselected=mysentlistselected+mysentlist[toknum] + else: mysentlistselected=mysentlistselected+" "+mysentlist[toknum] + # print("mysentlistselected:",mysentlistselected) + # CAUTION this only handles some punctuations !!! 
+ if not myselectedtext.startswith(mysentlistselected) : + mysentlistselected="" # wrong sequence, start over again - Note this will only get the first occurrence of a selected sequence! + l2orig="" + l2lx="" + l2ps="" + l2gloss="" + continue + mylx=mytoken.form + myps="/".join(mytoken.ps) + mygloss=mytoken.gloss + if mygloss==None: mygloss="" + if mygloss=="" : + for mymorph in mytoken.morphemes: + if mymorph.gloss!=None: mygloss+=mymorph.gloss+"_" + mygloss=mygloss[:-1] + l2orig=l2orig+mysentlist[toknum]+copy2spacer + l2lx+=mylx+copy2spacer + l2ps+=myps+copy2spacer + #print("mygloss:",mygloss) + l2gloss+=mygloss+copy2spacer + if myselectedtext!="": + if mysentlistselected==myselectedtext: break + if myselectedtext!="" and mysentlistselected=="": clipdata2.SetText("Sorry, couln't find "+myselectedtext) + else: clipdata2.SetText(l2orig[:-1]+"\n"+l2lx[:-1]+"\n"+l2ps[:-1]+"\n"+l2gloss[:-1]) + if not wx.TheClipboard.IsOpened(): + wx.TheClipboard.Open() + wx.TheClipboard.SetData(clipdata2) + wx.TheClipboard.Close() + + def OnCopyToClipboardRepl(self, evt): + """copy sentence text to a clipboard""" + metadict=self.GetTopLevelParent().processor.metadata + if "text:script" in metadict: + textscript=metadict["text:script"] + if textscript=="": textscript="Nouvel orthographe malien" # default + else: textscript="Nouvel orthographe malien" # default + if self.senttext: + clipdata2 = wx.TextDataObject() + l2orig="" + l2token="" + glosses = self.GetTopLevelParent().processor.glosses + sent= glosses[self.snum] + myselectedtext=self.sentsource.GetSelectedText() + myselectedtext=myselectedtext.strip() + if textscript=="N’Ko": + myselectedtext=myselectedtext.replace('\u07f8', ",") # N'Ko COMMA - as done in line 2 + myselectedtext=myselectedtext.replace('\u060c', ",") # Arabic comma + myselectedtext=myselectedtext.replace(' , ', ", ") + myselectedtext=myselectedtext.replace('\u061f', "?") # Arabic question mark + myselectedtext=myselectedtext.replace('؛', r":") # U+061B ؛ ARABIC 
SEMICOLON + myselectedtext=myselectedtext.replace('\u066a', "%") # Arabic Percent sign ٪ + myselectedtext=myselectedtext.replace('\u07f9', "!") # N'Ko EXLAMATION MARK + #print("GetSelectedText:myselectedtext",myselectedtext) # results OK but position in sentence is unknown- can only select 1st occurrence + mysentlist=[] + for w in sent.glosslist: + mysentlist.append(w.token) + mysentlistselected="" + for (toknum, (token, selectlist)) in enumerate(zip(self.tokenlist, self.selectlist)): + if selectlist: + mytokenlist=selectlist + else: + mytokenlist=token.glosslist + mytoken=mytokenlist[0] + # print("mytoken:", mytoken) + if myselectedtext!="": + if mysentlistselected=="": mysentlistselected=mysentlistselected+mysentlist[toknum] + else: + if mytoken.gloss=="c" or mysentlistselected[-1] in ["'","’","ߴ","ߵ"]: # added N'ko apostrophes (high & low tone) + mysentlistselected=mysentlistselected+mysentlist[toknum] + else: mysentlistselected=mysentlistselected+" "+mysentlist[toknum] + # print("mysentlistselected:",mysentlistselected) + # CAUTION this only handles some punctuations !!! + if not myselectedtext.startswith(mysentlistselected) : + mysentlistselected="" # wrong sequence, start over again - Note this will only get the first occurrence of a selected sequence! 
+ l2orig="" + l2token="" + continue + if textscript=="N’Ko": + l2orig+=mysentlist[toknum]+"_" + else: + l2orig+=mytoken.form+"_" + l2token+=str(mytoken)+"_" + if myselectedtext!="": + if mysentlistselected==myselectedtext: break + if myselectedtext!="" and mysentlistselected=="": + clipdata2.SetText("Sorry, couln't find "+myselectedtext) + else: + l2orig=l2orig.replace(",","COMMA") + l2token=l2token.replace(",::c","COMMA") + clipdata2.SetText(l2orig[:-1]+"==="+l2token[:-1]) + if not wx.TheClipboard.IsOpened(): + wx.TheClipboard.Open() + wx.TheClipboard.SetData(clipdata2) + wx.TheClipboard.Close() + def OnSelectorUpdate(self, evt): """pass selector update event to the sentence text widget""" self.sentsource.OnSelectorUpdate(evt) @@ -1687,6 +2441,16 @@ def __init__(self, parent, *args, **kwargs): self.config = wx.Config.Get(False) self.config.SetRecordDefaults() + x=self.config.ReadInt("MainFrame/pos/x",30) # JJM : recover previous layout + y=self.config.ReadInt("MainFrame/pos/y",30) + w=self.config.ReadInt("MainFrame/size/w",1024) + h=self.config.ReadInt("MainFrame/size/h",512) + self.SetPosition(wx.Point(x,y)) + self.SetSize(wx.Rect(x,y,w,h)) + + self.statusbar = self.CreateStatusBar(1) + self.statusbar.SetStatusText('Bienvenue dans la désambiguïsation !') + def savedDefault(name, fore, back): forecolor = self.config.Read("colors/{}/fore".format(name.lower()), fore) backcolor = self.config.Read("colors/{}/back".format(name.lower()), back) @@ -1708,8 +2472,9 @@ def savedDefault(name, fore, back): recent = wx.Menu() menuOpen = filemenu.Append(wx.ID_OPEN, "O&pen", " Open text file") self.Bind(wx.EVT_MENU, self.OnMenuOpen, menuOpen) - filemenu.Append(wx.ID_ANY, "Open &recent", recent) - self.filehistory = wx.FileHistory(maxFiles=9, idBase=wx.ID_FILE1) + #filemenu.Append(wx.ID_ANY, "Open &recent", recent) + filemenu.AppendSubMenu(recent,"Open &recent") + self.filehistory = wx.FileHistory(maxFiles=20, idBase=wx.ID_FILE1) self.filehistory.Load(self.config) 
self.filehistory.UseMenu(recent) self.filehistory.AddFilesToMenu() @@ -1749,9 +2514,28 @@ def savedDefault(name, fore, back): self.Bind(wx.EVT_MENU, self.OnSelectColors, menuColors) menuLocaldict = settingsmenu.Append(wx.ID_ANY, "Set &Localdict", "Set Localdict") self.Bind(wx.EVT_MENU, self.OnSetLocaldict, menuLocaldict) + + self.menuLangBamananGV = wx.MenuItem(settingsmenu, 1, '&Bamanan gloss validation', kind = wx.ITEM_CHECK) # ADDED JJM + settingsmenu.Append(self.menuLangBamananGV) + bamananGV=True # Default + self.menuLangBamananGV.Check(check = bamananGV) # IMPROVE : if True, maninkaGV should be False (or not)!!! & vice-versa - see attempt in DoOpen + self.Bind(wx.EVT_MENU, self.OnSetLangBamananGV, self.menuLangBamananGV) + + self.menuLangManinkaGV = wx.MenuItem(settingsmenu, 2, '&Maninka/N\'ko gloss validation', kind = wx.ITEM_CHECK) # ADDED JJM 24/03/2024 + settingsmenu.Append(self.menuLangManinkaGV) + maninkaGV=False # Default + self.menuLangManinkaGV.Check(check = maninkaGV) # IMPROVE: should be checked if text.script=N'Ko (see metadict) + self.Bind(wx.EVT_MENU, self.OnSetLangManinkaGV, self.menuLangManinkaGV) + + menuCopy2SpacerToggle= wx.MenuItem(settingsmenu, 3, '&Copy2 Spacer is tab/space toggle', kind = wx.ITEM_CHECK) # ADDED JJM 04/06/2024 + global copy2spacer + copy2spacer="\t" + settingsmenu.Append(menuCopy2SpacerToggle) + self.Bind(wx.EVT_MENU, self.OnSetCopy2Spacer, menuCopy2SpacerToggle) + menuBar.Append(settingsmenu, "&Settings") self.SetMenuBar(menuBar) - + debugmenu = wx.Menu() menuInspector = debugmenu.Append(wx.ID_ANY, "Widget I&nspector", "Widget Inspector") self.Bind(wx.EVT_MENU, self.OnWidgetInspector, menuInspector) @@ -1778,9 +2562,25 @@ def savedDefault(name, fore, back): def SetLocaldict(self, dictfile): """load localdict from a file or create empty one if the file does not exist""" if os.path.exists(dictfile): - self.localdict = daba.formats.DictReader(dictfile).get() - else: + self.oldlocaldict = 
daba.formats.DictReader(dictfile,keepmrph=True).get() + # create new local dict - keepmrph important for disamb localdict1 self.localdict = daba.formats.DabaDict() + # populate new localdict with notones items + for x, y in sorted(self.oldlocaldict.items()): + xnotone=re.sub(r'[\u0301\u0300\u0302\u030c]','',x) # high, low, decreasing, increasing tone diacritics + xnotone=xnotone.lower() + if xnotone in self.localdict: + for ygloss in y: + if ygloss not in self.localdict[xnotone]: + self.localdict[xnotone].insert(0, ygloss) + else: + self.localdict[xnotone]=y[0] + for ygloss in y: + if ygloss not in self.localdict[xnotone]: + self.localdict[xnotone].insert(0, ygloss) + del self.oldlocaldict + else: + self.localdict = daba.formats.DabaDict() # will be populated OnSave def InitValues(self): """set main attributes""" @@ -1797,8 +2597,12 @@ def InitUI(self): self.notebook = wx.Notebook(self) self.filepanel = FilePanel(self.notebook) self.sentpanel = SentPanel(self.notebook, vertical=self.config.ReadBool("display/vertical")) + self.metapanel = MetaPanel(self.notebook) + self.dictpanel = DictPanel(self.notebook) self.notebook.AddPage(self.sentpanel, "Disambiguate") self.notebook.AddPage(self.filepanel, "Source") + self.notebook.AddPage(self.metapanel, "Metas") + self.notebook.AddPage(self.dictpanel, "Localdict") self.Sizer.Add(self.notebook, 1, wx.EXPAND) self.Layout() self.Bind(wx.EVT_TEXT_ENTER, self.OnButtonSearch, self.sentpanel.searchbutton) @@ -1807,6 +2611,7 @@ def InitUI(self): self.Bind(wx.EVT_BUTTON, self.OnFindNext, self.sentpanel.findnextbutton) self.Bind(wx.EVT_SPINCTRL, self.OnGotoSentence, self.sentpanel.sentnumbutton) self.Bind(wx.EVT_TEXT_ENTER, self.OnGotoSentence, self.sentpanel.sentnumbutton) + self.Bind(wx.EVT_BUTTON, self.OnNextAmbig, self.sentpanel.nextambigbutton) def CleanUI(self): """clear interface""" @@ -1825,6 +2630,7 @@ def UpdateUI(self): if snum is not None: self.filepanel.ShowFile(s.senttoken for s in self.processor.glosses) 
self.ShowSent(snum) + self.metapanel.ShowMetas(self.processor.metadata) self.Layout() self.Thaw() @@ -1862,14 +2668,38 @@ def OnSetLocaldict(self, e): if not self.fileopened: self.NoFileError(e) else: - dlg = wx.FileDialog(self, "Choose localdict file", self.dirname, "localdict.txt", "*.*", wx.FD_OPEN) + dlg = wx.FileDialog(self, "Choose localdict file", self.dirname, "localdict1.txt", "*.*", wx.FD_OPEN) if dlg.ShowModal() == wx.ID_OK: dictfile = dlg.GetPath() self.SetLocaldict(dictfile) - if not dictfile == '/'.join([self.dirname, "localdict.txt"]): + if not dictfile == '/'.join([self.dirname, "localdict1.txt"]): self.config.Write('/'.join(['localdict', self.infile]), dictfile) dlg.Destroy() + def OnSetLangBamananGV(self, e): # this does not check / uncheck in Settings ??? + """ let user toggle Bamanan Gloss validation on/off """ + global bamananGV, maninkaGV # why not just use: self.bamananGV ? + bamananGV= not bamananGV + self.menuLangBamananGV.Check(check = bamananGV) + self.menuLangManinkaGV.Check(check = maninkaGV) + #print("OnSetLangBamananGV - bamananGV: ",bamananGV," maninkaGV: ",maninkaGV) + + def OnSetLangManinkaGV(self, e): # JJM 24/04/2024 # this does not check / uncheck in Settings ??? + """ let user toggle Bamanan Gloss validation on/off """ + global bamananGV, maninkaGV # why not just use: self.maninkaGV ? 
+ maninkaGV=not maninkaGV + self.menuLangBamananGV.Check(check = bamananGV) + self.menuLangManinkaGV.Check(check = maninkaGV) + # print("OnSetLangManinkaGV - bamananGV: ",bamananGV," maninkaGV: ",maninkaGV) + + def OnSetCopy2Spacer(self,e) : # JJM 04/06/2024 + # use space or tab when spacing ligne 2 elements (lx, ps, gloss) to copy + global copy2spacer + if copy2spacer==" ": + copy2spacer="\t" + else: + copy2spacer=" " + def OnWidgetInspector(self, e): """show widget inspector""" import wx.lib.inspection @@ -1891,6 +2721,8 @@ def ShowSent(self, snum): if self.undolist[snum]: self.menuUndoTokens.Enable(True) self.SaveFilePos(snum) + #print("ShowSent snum=",snum, "len(glosses)=",len(self.processor.glosses)) + if snum>=len(self.processor.glosses): snum=0 self.sentpanel.ShowSent(self.processor.glosses[snum]) def OnTokenSplit(self, evt): @@ -1898,15 +2730,41 @@ def OnTokenSplit(self, evt): snum, toknum = evt.index sent = self.processor.glosses[snum] savedtoken = sent.glosslist[toknum] + # print("savedtoken.token",savedtoken.token) + # print("vars(savedtoken)",vars(savedtoken)) + savedforms=[] # added JJM 14/10/2025 to build split tokens from saved morphemes if possible + # print("OnTokenSplit - selected= ") + # for g in savedtoken.glosslist: + # print(" g.gloss",g.gloss) # also tried gloss(g) and gloss(savedtoken.glosslist) + # which gloss is selected? I have not found how to... 
+ # if word is ambiguous it would be better to start with the selected gloss instead of gloss 0 (first gloss in list) + for g in savedtoken.glosslist[0].morphemes: + #print("g.form",g.form) + savedforms.append(re.sub(r'[\u0301\u0300\u0302\u030c]','',g.form)) + savedformstuple=tuple(savedforms) edit = TokenEdit('split', toknum, toknum+len(evt.result), [savedtoken]) self.undolist[snum].append(edit) del sent.selectlist[toknum] del sent.glosslist[toknum] - shift = 0 - for token in evt.result: - sent.selectlist.insert(toknum+shift, []) - sent.glosslist.insert(toknum+shift, daba.formats.WordToken([Gloss(token, (), '', ())], token, '-1')) - shift = shift+1 + # print("savedformstuple",savedformstuple) + # print("evt.result",evt.result) + if len(savedtoken.glosslist)==1 and savedformstuple==evt.result: + # print("égalité des tuples") + shift=0 + for g in savedtoken.glosslist[0].morphemes: + # print("shift,g.form",shift,g.form) + sent.selectlist.insert(toknum+shift, []) + token=evt.result[shift] + sent.glosslist.insert(toknum+shift, daba.formats.WordToken([Gloss(g.form, g.ps, g.gloss, g.morphemes)], token, '-1')) + # equivalent to: sent.glosslist.insert(toknum+shift, daba.formats.WordToken([g], token, '-1')) + shift = shift+1 + else: + shift = 0 + for token in evt.result: + # print("shift,token",shift,token) + sent.selectlist.insert(toknum+shift, []) + sent.glosslist.insert(toknum+shift, daba.formats.WordToken([Gloss(token, (), '', ())], token, '-1')) + shift = shift+1 self.processor.dirty = True wx.CallAfter(self.ShowSent, snum) @@ -1917,19 +2775,38 @@ def OnTokenJoin(self, evt): second = evt.second sent = self.processor.glosses[snum] savedtokens = [2][first:second+1] + # print("OnTokenJoin - savedtokens",savedtokens) edit = TokenEdit('join', first, second, savedtokens) self.undolist[snum].append(edit) firsttoken = sent.glosslist[first] nexttoken = sent.glosslist[second] + # print("OnTokenJoin - firsttoken nexttoken",firsttoken, nexttoken) + # 
print("vars(firsttoken):",vars(firsttoken)) + # print("firsttoken.glosslist[0]",firsttoken.glosslist[0]) + # print("firsttoken.glosslist[0].ps",firsttoken.glosslist[0].ps) # FIXME: will break on non-word tokens - newform = firsttoken.token + nexttoken.token - newtoken = daba.formats.WordToken([Gloss(newform, (), '', ())], newform, '-1') + #newform = firsttoken.token + nexttoken.token + if len(firsttoken.glosslist)==1 and len(nexttoken.glosslist)==1 : + newform = firsttoken.glosslist[0].form + nexttoken.token + newtokenform = firsttoken.token + nexttoken.token + #original code: newtoken = daba.formats.WordToken([Gloss(newform, (), '', ())], newform, '-1') + # new code: JJM 13/10/2025 + pstuple=(firsttoken.glosslist[0].ps+nexttoken.glosslist[0].ps) + uniqueps = tuple(j for i, j in enumerate(pstuple) if pstuple.index(j) == i) + #not needed? joinedgloss='.'.join((firsttoken.glosslist[0].gloss,nexttoken.glosslist[0].gloss)) + newtoken = daba.formats.WordToken([Gloss(newform, uniqueps, '', (firsttoken.glosslist[0],nexttoken.glosslist[0]))], newtokenform, '-1') + else: + newform = firsttoken.token + nexttoken.token + newtoken = daba.formats.WordToken([Gloss(newform, (), '', ())], newform, '-1') sent.selectlist[first] = [] del sent.selectlist[second] sent.glosslist[first] = newtoken del sent.glosslist[second] self.processor.dirty = True + #NOT WORKING thumbpos=self.sentpanel.annotlist.sb.GetThumbPosition() wx.CallAfter(self.ShowSent, snum) + #NOT WORKING self.sentpanel.annotlist.sb.SetThumbPosition(thumbpos) + #NOT WORKING print("positionné à thumbpos", thumbpos) def OnTokenEdit(self, evt): """edit tokens in the processor glosses data, update UI""" @@ -1964,6 +2841,7 @@ def OnSentenceJoin(self, evt): def OnSentenceSplit(self, evt): """split sentences in the processor glosses data, update UI""" sent = self.processor.glosses[evt.snum] + # print("OnSentenceSplit: sent=",sent) firstsent, nextsent = sent.split(evt.tnum, evt.charpos) self.processor.glosses[evt.snum] = firstsent 
self.processor.glosses.insert(evt.snum+1, nextsent) @@ -1991,8 +2869,27 @@ def OnGlossEdited(self, evt): def OnLocaldictLookup(self, evt): """lookup a gloss in localdict, show available matches""" + global formlookup try: - savedglosses = self.localdict[evt.gloss.form] + + formlookup=evt.gloss.form + + + # JJM : localdict now indexed without tones & lowercase + formlookup=formlookup.lower() # all indexed in localdict with no tones and lowercase + + if self.GetTopLevelParent().processor.metadata["text:script"]=="Ancien orthographe malien": + # form is normally parsed in New Orthography - this only handles exception (where parse failed, notably Proper names) + formlookup=formlookup.replace("èe","ɛɛ") + formlookup=formlookup.replace("òo","ɔɔ") + formlookup=formlookup.replace("è","ɛ") + formlookup=formlookup.replace("ò","ɔ") + + formlookup=re.sub(r'[\u0301\u0300\u0302\u030c]','',formlookup) # 4 bambara tones + # print("OnLocaldictLookup - using formlookup: ",formlookup) + + savedglosses = self.localdict[formlookup] + dlg = evt.dlg wx.CallAfter(dlg.ShowLocaldictVariants, savedglosses) except (KeyError): @@ -2000,16 +2897,79 @@ def OnLocaldictLookup(self, evt): def OnLocaldictSave(self, evt): """save a word into localdict""" + global formlookup + # print("OnLocaldictSave - formlookup=",formlookup) gloss = evt.gloss # we do not save words with empty glosses into localdict - if not gloss.gloss: - return - if gloss.form in self.localdict: + # original code for this whole section + """if gloss.form in self.localdict: if gloss not in self.localdict[gloss.form]: self.localdict[gloss.form].insert(0, gloss) + print("OnLocalditSave - Added:",gloss, "to entry:",gloss.form) else: self.localdict[gloss.form] = gloss - + print("OnLocalditSave - Created:",gloss, "to entry:",gloss.form) + """ + #if not gloss.gloss: JJM changed this, in case there are morphemes + if not gloss.gloss and not gloss.morphemes: + return + + if gloss.gloss==None: + # rebuild : do not allow None as value for 
\ge + gloss=Gloss(gloss.form, gloss.ps, '', gloss.morphemes) + + # new localdict JJM : is indexed with notones form + x=gloss.form + #print("gloss,...", gloss, gloss.form, gloss.ps, gloss.gloss, gloss.morphemes) + #print("Gloss",Gloss(gloss.form, gloss.ps, gloss.gloss, gloss.morphemes)) + xnotone=re.sub(r'[\u0301\u0300\u0302\u030c]','',x) # high, low, decreasing, increasing tone diacritics + xnotone=xnotone.lower() # allow proper names to be available even on lower case text + # there is a side effect with Capitalized entries also created in lowercase on first "Create" ??? + if xnotone in self.localdict and len(gloss.ps)>0 : + + if gloss not in self.localdict[xnotone] : + + self.localdict[xnotone].insert(0, gloss) + print("OnLocaldictSave - Added to localdict[",xnotone,"] =",gloss) + + #else: + # print("OnLocaldictSave - Skipped localdict[",xnotone,"] =", gloss, ": already there, do nothing") + else: + self.localdict[xnotone] = gloss + print("OnLocaldictSave - Created localdict[",xnotone,"] =",gloss) + + if xnotone!=formlookup and len(gloss.ps)>0 : + # this will add an entry if token!=form so that future erroneous token may yield correct form lookup. + print(xnotone,"<>",formlookup," : essayer de l'indexer?") + if formlookup not in self.localdict: + print(formlookup,"not in self.localdict") + self.localdict[formlookup] = gloss + print("¹OnLocaldictSave - this session only! - also created localdict[",formlookup,"] =",gloss) + +# 04/10/2025 - toujours pas la bonne solution? + elif gloss not in self.localdict[formlookup]: + print(gloss,"not in localdict[",formlookup,"]") + self.localdict[formlookup] = gloss + print("²OnLocaldictSave - this session only! 
- added localdict["+formlookup+"] =",gloss) + + print("récap de localdict["+formlookup+"] =") + for g in self.localdict[formlookup]: + print("-",str(g)) +# jan 2025: +# if formlookup not in self.localdict: +# self.localdict[formlookup] = gloss +# print("¹OnLocaldictSave - also created localdict[",formlookup,"] =",gloss) +# elif xnotone not in self.localdict: +# self.localdict[xnotone] = gloss +# print("²OnLocaldictSave - also created localdict[",xnotone,"] =",gloss) +# else: +# print(formlookup,"ou",xnotone,"déjà dans localdict") +# try: +# self.localdict[xnotone] = gloss +# print('on indexe quand même') +# except: +# print('ajout impossible') + def OnUndoTokens(self, e): """undo token split/join operations""" snum = self.sentpanel.snum @@ -2071,6 +3031,35 @@ def OnFindNext(self, e): match = self.searcher.findNext() self.ShowSearchResult(match) + def OnNextAmbig(self, e): + snum = self.sentpanel.sentnumbutton.GetValue() # start at current sentence + #print("OnNextAmbig - starting at sentence # ",snum) + ns=0 + ambig=False + for s in self.processor.glosses: + ns+=1 + if ns>snum: # only check sentences after the current one + #print(" OnNextAmbig - checking sentence # ",ns) + for g in s.glosslist: + if g.type=="w": + if len(g.value[2])!=1: # >1 : ambigu 0 : inconnu + #print(" ",g) + #print(" ",g.value[2]) + ambig=True + break + else: # only one gloss but problems + if len(g.gloss.ps)==0 and g.gloss.gloss=='': # word in red (no gparser candidate) + #print(" ",g) + ambig=True + break + elif g.gloss.gloss=='INCOGN': + #print(" ",g) + ambig=True + break + if ambig: break + if ambig: self.ShowSent(ns-1) + + def OnGotoSentence(self, e): """show sentence with a given number""" self.sentpanel.OnSaveResults(e) @@ -2090,7 +3079,19 @@ def GetFilePos(self, filename): def OnClose(self, e): """save and cleanup UI on file close""" + if self.fileopened: + # JJM save window position and size + x,y=self.Position + w,h=self.Size + #print ("x,y - w,h :",x,y,w,h) + + 
self.config.WriteInt("MainFrame/pos/x",x) + self.config.WriteInt("MainFrame/pos/y",y) + self.config.WriteInt("MainFrame/size/w",w) + self.config.WriteInt("MainFrame/size/h",h) + self.config.Flush() # permanently writes + if self.processor.dirty: self.OnSave(e) if self.logger: @@ -2121,7 +3122,8 @@ def OnMenuOpen(self, e): if self.fileopened: self.FileOpenedError(e) else: - dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "*.*", wx.FD_OPEN) + #dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "*.*", wx.FD_OPEN) + dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "disamb files|*.pars.html;*.repl.html;*.dis.html", wx.FD_OPEN|wx.FD_FILE_MUST_EXIST) if dlg.ShowModal() == wx.ID_OK: self.DoOpen(dlg.GetPath()) dlg.Destroy() @@ -2145,23 +3147,81 @@ def DoOpen(self, filename): self.filename = os.path.basename(self.infile) logfile = os.path.extsep.join([get_basename(self.infile), 'log']) self.logger = EditLogger(os.path.join(self.dirname, logfile)) - self.dictfile = self.config.Read("/".join(["localdict", self.infile]), os.path.join(self.dirname, "localdict.txt")) + self.dictfile = self.config.Read("/".join(["localdict", self.infile]), os.path.join(self.dirname, "localdict1.txt")) self.SetLocaldict(self.dictfile) self.processor.read_file(self.infile) + self.nambigs_before,self.totalwords_before=self.nambigs() + pcdisamb=0 + if self.totalwords_before>0 : pcdisamb=int(100*self.nambigs_before/self.totalwords_before) + self.statusbar.SetStatusText(str(self.nambigs_before)+' mots ambigus restants/'+str(self.totalwords_before)+" mots au total soit "+str(pcdisamb)+"%") + self.InitUI() self.SetTitle(self.filename) self.filepanel.ShowFile(s.senttoken for s in self.processor.glosses) + self.metapanel.ShowMetas(self.processor.metadata) + self.dictpanel.ShowDict(self.dictfile) + snum = self.GetFilePos(self.infile) self.ShowSent(snum) self.fileopened = True self.Layout() + #this does not do anything useful + global bamananGV,maninkaGV + 
metadict=self.processor.metadata + if "text:script" in metadict: + textscript=metadict["text:script"] + if textscript in ["Ancien orthographe malien","Nouvel orthographe malien"]: + bamananGV=True + maninkaGV=False + elif textscript in["N’Ko","Nouveau orthographe guinéen", "Ancien orthographe guinéen"]: + bamananGV=False + maninkaGV=True + self.menuLangBamananGV.Check(check = bamananGV) + self.menuLangManinkaGV.Check(check = maninkaGV) + + def nambigs(self): # JJM compute ambiguous words left + na=0 + nw=0 + for s in self.processor.glosses: + for g in s.glosslist: + if g.type=="w": + nw += 1 + if len(g.value[2])>1: na += 1 + return na,nw + def SaveFiles(self): """save annotated data, localdict and config values""" - if self.localdict: - daba.formats.DictWriter(self.localdict, self.dictfile, lang='default', name='localdict', ver='0').write() + + prevsent=self.processor.metadata["_auto:sentences"] # JJM + prevwords=self.processor.metadata["_auto:words"] + # JJM : if there are splits/joins these meta will be updated next in processor.write (FileParser) self.processor.write(self.outfile) self.config.Flush() + + # save localdict AFTER annotated data: i case of problem with localdict, disamb is saved with higher priority + if self.localdict: + """print("SaveFiles, before DictWriter") + for x, y in sorted(self.localdict.items()): + print("localdict[",x,"]=",y) + """ + daba.formats.DictWriter(self.localdict, self.dictfile, lang='default', name='localdict', ver='0').write() + self.dictpanel.ShowDict(self.dictfile) # needs to be updated only if localdict is updated + + + nambigs_after,totalwords_after=self.nambigs() + + if self.processor.metadata["_auto:sentences"]!=prevsent or self.processor.metadata["_auto:words"]!=prevwords: + + self.filepanel.ShowFile(s.senttoken for s in self.processor.glosses) # also needs update when sentence fully disambed + self.metapanel.ShowMetas(self.processor.metadata) # may also need to be updated on words split/joins + + if nambigs_after != 
self.nambigs_before or totalwords_after != self.totalwords_before : + self.filepanel.ShowFile(s.senttoken for s in self.processor.glosses) # update as sentence is disambed + self.nambigs_before=nambigs_after + self.totalwords_before=totalwords_after + pcdisamb=int(100*nambigs_after/totalwords_after) + self.statusbar.SetStatusText(str(self.nambigs_before)+' mots ambigus restants/'+str(totalwords_after)+" mots au total soit "+str(pcdisamb)+"%") def OnSave(self, e): """save files""" @@ -2178,7 +3238,9 @@ def OnSaveAs(self, e): if not self.fileopened: self.NoFileError(e) else: - xfilename = ''.join(['.'.join([get_basename(self.infile), 'dis']), os.path.extsep, 'html']) + basefilename=get_basename(self.infile) + if basefilename.endswith(".old"): basefilename=basefilename[:-4] + xfilename = ''.join(['.'.join([basefilename, 'dis']), os.path.extsep, 'html']) dlg = wx.FileDialog(self, "Choose a file", os.path.dirname(self.infile), xfilename, "*.html", wx.FD_SAVE) if dlg.ShowModal() == wx.ID_OK: @@ -2187,13 +3249,23 @@ def OnSaveAs(self, e): self.outfile = ''.join([self.outfile, os.path.extsep, 'html']) self.SaveFiles() self.filehistory.AddFileToHistory(self.outfile) + # added JJM + self.filehistory.Save(self.config) + self.dirname = os.path.dirname(self.outfile) + self.config.Write("state/curdir", self.dirname) + self.config.Flush() + self.filename = os.path.basename(self.outfile) + self.SetTitle(self.filename) + # end added JJM dlg.Destroy() def main(): app = wx.App() + frame = MainFrame(None, title="Daba disambiguation interface (GUI)") frame.Show() + app.MainLoop() diff --git a/daba/gparser.py b/daba/gparser.py old mode 100644 new mode 100755 index 678edf5..649dc27 --- a/daba/gparser.py +++ b/daba/gparser.py @@ -21,7 +21,7 @@ from contextlib import contextmanager import daba.mparser -import daba.formats +# import daba.formats # already loaded in mparser! 
from daba.plugins import OrthographyConverter def get_outdir(fname): @@ -38,6 +38,14 @@ def get_outdir(fname): def get_outfile(fname): basename = os.path.basename(fname) + # <<<< +JJM + parsfile='.'.join([os.path.splitext(basename)[0], 'pars.html']) + if os.path.exists(parsfile): + print("get_outfile / EXISTE DÉJÀ / ALREADY EXISTS: ",parsfile) + os.remove(parsfile) + print(" fichier précédent SUPPRIMÉ / previous file DELETED/removed") + # NO SELF HERE self.statusbar.SetStatusText(" fichier pars précédent supprimé:"+parsfile) + # >>>> +JJM return '.'.join([os.path.splitext(basename)[0], 'pars']) @@ -56,6 +64,7 @@ class DictionaryItem(wx.Panel): def __init__(self, parent, dic, b_id, *args, **kwargs): wx.Panel.__init__(self, parent, *args, **kwargs) hbox = wx.BoxSizer(wx.HORIZONTAL) + print("dic :",dic) hbox.Add(wx.StaticText(self, -1, dic.description),0) rbutton = wx.Button(self, b_id, "Remove") self.Bind(wx.EVT_BUTTON, parent.OnRemove, rbutton) @@ -63,15 +72,19 @@ def __init__(self, parent, dic, b_id, *args, **kwargs): self.SetSizer(hbox) self.Layout() - -class DictionaryLister(wx.Panel): +import wx.lib.scrolledpanel as scrolled +class DictionaryLister(scrolled.ScrolledPanel): def __init__(self, parent, dictloader, *args, **kwargs): - wx.Panel.__init__(self, parent, *args, **kwargs) + super(DictionaryLister, self).__init__(parent, + style = wx.TAB_TRAVERSAL|wx.SUNKEN_BORDER) + self.SetupScrolling() + #wx.Panel.__init__(self, parent, *args, **kwargs) self.buttons = {} self.children = {} self.parent = parent self.dictloader = dictloader dictbox = wx.StaticBox(self, -1, "Available Dictionaries") + dictbox.SetBackgroundColour((236, 211, 211, 255)) # was (60, 25, 25, 25) self.dsizer = wx.StaticBoxSizer(dictbox, wx.VERTICAL) b_id = 0 for dic in self.dictloader.dictionary.dictlist: @@ -121,6 +134,7 @@ def __init__(self, parent, grammarloader, *args, **kwargs): self.grammarloader = grammarloader grambox = wx.StaticBox(self, -1, "Available Grammar") + 
grambox.SetBackgroundColour((236, 211, 211, 255)) # was (80, 40, 40, 30) self.gsizer = wx.StaticBoxSizer(grambox, wx.VERTICAL) self.gramlist = wx.StaticText(self, -1, '\n'.join(self.grammarloader.gramlist)) self.gsizer.Add(self.gramlist, 0, wx.TOP|wx.LEFT, 10) @@ -142,15 +156,21 @@ def OnLoad(self, evt): self.gramlist.Show(True) self.Layout() -class ConverterLister(wx.Panel): +class ConverterLister(scrolled.ScrolledPanel): def __init__(self, parent, *args, **kwargs): - wx.Panel.__init__(self, parent, *args, **kwargs) + super(ConverterLister, self).__init__(parent, + style = wx.TAB_TRAVERSAL|wx.SUNKEN_BORDER) + self.SetupScrolling() + #wx.Panel.__init__(self, parent, *args, **kwargs) #FIXME: make default plugins configurable from config file self.selection = ('apostrophe',) daba.mparser.load_plugins() converterbox = wx.StaticBox(self, -1, "Available Orthographic Converters") + converterbox.SetBackgroundColour((236, 211, 211, 255)) # was (70, 30, 30, 30) self.csizer = wx.StaticBoxSizer(converterbox, wx.VERTICAL) + print("OrthographyConverter.converters",OrthographyConverter.converters) self.converterlist = wx.CheckListBox(self, wx.ID_ANY, choices=OrthographyConverter.converters) + self.converterlist.SetBackgroundColour((236, 211, 211, 255)) # was (60, 25, 25, 25) self.converterlist.SetCheckedStrings(self.selection) self.Bind(wx.EVT_CHECKLISTBOX, self.OnSelection, self.converterlist) self.csizer.Add(self.converterlist, 0, wx.TOP|wx.LEFT, 10) @@ -168,6 +188,7 @@ def __init__(self, parent, *args, **kwargs): self.tkz = daba.mparser.Tokenizer() self.tokenizers = self.tkz.methods tokenizerbox = wx.StaticBox(self, wx.ID_ANY, "Available Tokenizers") + tokenizerbox.SetBackgroundColour((236, 211, 211, 255)) # was (60, 25, 25, 30) self.tsizer = wx.StaticBoxSizer(tokenizerbox, wx.VERTICAL) self.tokenizerlist = wx.RadioBox(self, wx.ID_ANY, choices=self.tokenizers) self.tokenizerlist.SetSelection(self.tokenizerlist.FindString(self.selection)) @@ -207,6 +228,11 @@ class 
MainFrame(wx.Frame): def __init__(self, parent, *args, **kwargs): wx.Frame.__init__(self, parent, *args, **kwargs) self.InitValues() + + wx.Config.Set(wx.Config("gparser", style=wx.CONFIG_USE_LOCAL_FILE)) + self.config = wx.Config.Get(False) + self.config.SetRecordDefaults() + # setup Resources self.dirname = os.curdir self.dl = daba.mparser.DictLoader() @@ -214,6 +240,9 @@ def __init__(self, parent, *args, **kwargs): self.resourcepanel = ResourcePanel(self, self.dl, self.gr) self.filepanel = FilePanel(self) + self.statusbar = self.CreateStatusBar(1) # JJM + self.statusbar.SetStatusText('Bienvenue dans le parseur (gparser) !') + filemenu= wx.Menu() menuOpen = filemenu.Append(wx.ID_OPEN,"O&pen"," Open text file") self.Bind(wx.EVT_MENU, self.OnOpen, menuOpen) @@ -236,6 +265,14 @@ def __init__(self, parent, *args, **kwargs): self.SetAutoLayout(True) self.Fit() + x=self.config.ReadInt("MainFrame/pos/x",30) # JJM : recover previous layout + y=self.config.ReadInt("MainFrame/pos/y",30) + w=self.config.ReadInt("MainFrame/size/w",512) + h=self.config.ReadInt("MainFrame/size/h",756) + print("config ReadInt x,y,w,h",x,y,w,h) + self.SetPosition(wx.Point(x,y)) + self.SetSize(wx.Rect(x,y,w,h)) + def InitValues(self): self.infile = None self.outfile = None @@ -245,12 +282,17 @@ def InitValues(self): def OnParse(self,e): @contextmanager def wait_for_parser(): + print("OnParse / self.resourcepanel.toklist.tkz.methods:",self.resourcepanel.toklist.tkz.methods) # JJM + #for n, tok in enumerate(self.resourcepanel.toklist.tkz): # JJM + # print("OnParse / self.resourcepanel.toklist / n, tok :",n,tok) + print("OnParse / self.resourcepanel.convlist.selection:",self.resourcepanel.convlist.selection) # JJM self.processor = daba.mparser.Processor(self.dl, self.gr, tokenizer=self.resourcepanel.toklist.tkz, converters=self.resourcepanel.convlist.selection) yield self.processor.parse(self.io.para) dlg = wx.MessageDialog(self, 'Please wait: parsing in progress', 'Please wait', wx.OK) + 
self.statusbar.SetStatusText("parsing en cours... (mparser) ") dlg.ShowModal() if not self.parsed: @@ -258,9 +300,13 @@ def wait_for_parser(): self.parsed = True dlg.Destroy() self.FinishedParsing(e) + myparsfile=self.outfile.replace(self.dirname,"") + self.statusbar.SetStatusText("parsing terminé / finished :"+myparsfile) else: #FIXME: proper error message or better avoid this case! print("File already parsed!") + myparsfile=self.outfile.replace(self.dirname,"") + self.statusbar.SetStatusText("votre fichier est déjà parsé / already parsed :"+myparsfile+" -> PLEASE file/close") def NoFileError(self,e): dlg = wx.MessageDialog(self, 'Error: no file opened!', 'No file opened', wx.OK) @@ -273,25 +319,93 @@ def FinishedParsing(self,e): dlg.Destroy() def OnExit(self,e): + # JJM save window position and size + x,y=self.Position + w,h=self.Size + self.config.WriteInt("MainFrame/pos/x",x) + self.config.WriteInt("MainFrame/pos/y",y) + self.config.WriteInt("MainFrame/size/w",w) + self.config.WriteInt("MainFrame/size/h",h) + self.config.Flush() # permanently writes + print("config WriteInt x,y,w,h",x,y,w,h) + self.Close(True) def OnOpen(self, e): """ Open a file""" - dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "*.*", wx.FD_OPEN) + dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "gparser files (txt/html)|*.html;*.txt", wx.FD_OPEN|wx.FD_FILE_MUST_EXIST) + # caution: it would be nice to filter out localdict.txt and pars.html, dis.html files if dlg.ShowModal() == wx.ID_OK: self.infile = dlg.GetPath() - self.dirname = os.path.dirname(self.infile) - try: - self.io.read(self.infile) - self.parsed = False - self.filepanel.control.SetValue('\n\n'.join(self.io.para)) - except ValueError as e: - fileerror = wx.MessageDialog(self, "Unknown file type", "Unknown file type", wx.OK) + if self.infile.endswith(".pars.html") or self.infile.endswith(".dis.html") or self.infile.endswith(".repl.html"): + fileerror = wx.MessageDialog(self, "incompatible file type", 
self.infile, wx.OK) fileerror.ShowModal() fileerror.Destroy() + else: + self.dirname = os.path.dirname(self.infile) + + try: + self.io.read(self.infile) + self.SetTitle("gparser: "+self.infile) + self.parsed = False + self.filepanel.control.SetValue('\n\n'.join(self.io.para)) + #print("self.io.metadata['text:script']:",self.io.metadata['text:script']) # JJM + try: + myscript=self.io.metadata['text:script'] + except KeyError: + myscript='***NO SCRIPT***' + dlg = wx.MessageDialog(self, 'Please choose from the Available orthographic converters (or none if "Nouvel orthographe malien", only "apostrophe")', 'The file has no meta information for text:script', wx.OK) + dlg.ShowModal() + print("OnOpen / metas / script:",myscript) + myscriptcode='' + mytokcode='' + if myscript=='Nouvel orthographe malien': + mytokcode="bamana" + elif myscript=='Ancien orthographe malien': + myscriptcode='bamlatinold' + mytokcode="bamana" + elif myscript=='N’Ko' or myscript=="N’Ko": + myscriptcode='nko' + mytokcode="nko" + if myscriptcode=='': + myselection=('apostrophe',) + else: + myselection=('apostrophe',myscriptcode) + if mytokcode=='': + mytokmethods=['default'] + else: + mytokmethods=[mytokcode] + print("OnOpen / myselection:",myselection) + print("OnOpen / mytokmethods:",mytokmethods) + self.resourcepanel.convlist.selection=myselection + self.resourcepanel.toklist.tkz.methods=mytokmethods + # update interface accordingly + try: + self.resourcepanel.toklist.tokenizerlist.SetSelection(self.resourcepanel.toklist.tokenizerlist.FindString(mytokcode)) + except: + print("mytokcode:",mytokcode," radio box index:",self.resourcepanel.toklist.tokenizerlist.FindString(mytokcode)) + self.resourcepanel.convlist.converterlist.SetCheckedStrings(myselection) + + + myfile=self.infile.replace(self.dirname,"") + self.statusbar.SetStatusText("fichier ouvert:"+myfile+" text:script: "+myscript) + except ValueError as e: + fileerror = wx.MessageDialog(self, "Unknown file type", "Unknown file type", wx.OK) + 
fileerror.ShowModal() + fileerror.Destroy() dlg.Destroy() def OnClose(self,e): + # JJM save window position and size + x,y=self.Position + w,h=self.Size + self.config.WriteInt("MainFrame/pos/x",x) + self.config.WriteInt("MainFrame/pos/y",y) + self.config.WriteInt("MainFrame/size/w",w) + self.config.WriteInt("MainFrame/size/h",h) + self.config.Flush() # permanently writes + print("config WriteInt x,y,w,h",x,y,w,h) + self.filepanel.control.Clear() self.InitValues() diff --git a/daba/grammar.py b/daba/grammar.py old mode 100644 new mode 100755 index ca6e122..8ec0dbf --- a/daba/grammar.py +++ b/daba/grammar.py @@ -6,11 +6,14 @@ from funcparserlib.parser import * from funcparserlib.lexer import make_tokenizer, Token, LexerError +# JJM added 'ap' 04/10/2024' PSLIST = [ 'mrph', + 'mrph.nko', 'n.prop', 'n', 'adj', + 'ap', 'num', 'v', 'ptcp', @@ -30,7 +33,7 @@ 'prep', 'n.top', 'conv.n', - 'PUNCT', + 'PUNCT' ] @@ -86,7 +89,8 @@ def unwrap_re(tupl): maketuple = lambda t: tuple(t.split('/')) if t else () denonetuple = lambda t: t or () filternone = lambda s: [i for i in s if i] -despace = lambda s: [i for i in s if s is not ' '] +#despace = lambda s: [i for i in s if s is not ' '] +despace = lambda s: [i for i in s if s != ' '] def flatten_list(l): for el in l: diff --git a/daba/meta.py b/daba/meta.py old mode 100644 new mode 100755 index 15fe5ea..5be7550 --- a/daba/meta.py +++ b/daba/meta.py @@ -113,12 +113,12 @@ class GUIBuilder(object): def __init__(self): self.widgets = { 'text': (wx.TextCtrl, None, None), - 'long_text': (wx.TextCtrl, None, {'style': wx.TE_MULTILINE}), + 'long_text': (wx.TextCtrl, None, {'style': wx.TE_MULTILINE, 'size':(300,100)}), 'int': (wx.lib.intctrl.IntCtrl, None, None), 'closed_list': (wx.Choice, 'choices', None), 'open_list': (wx.ComboBox, 'choices', None), 'checklist': (wx.CheckListBox, 'choices', None), - 'date': (wx.adv.DatePickerCtrl, None, None), + 'date': (wx.adv.DatePickerCtrl, None, {'style': wx.adv.DP_DROPDOWN|wx.adv.DP_SHOWCENTURY}), 
'datetext': (wx.lib.masked.Ctrl, None, {'autoformat': 'EUDATEDDMMYYYY.'}), } operate = namedtuple('Operate', 'get set') @@ -139,7 +139,7 @@ def parse_date(str): lambda w,t: wx.ComboBox.SetValue(w, str(t))), 'checklist': operate(lambda t: ';'.join(wx.CheckListBox.GetCheckedStrings(t)), lambda w,t: wx.CheckListBox.SetCheckedStrings(w, t.split(';'))), - 'date': operate(lambda t: wx.adv.DatePickerCtrl.GetValue(t).FormatDate(), + 'date': operate(lambda t: wx.adv.DatePickerCtrl.GetValue(t).Format("%d.%m.%Y"), lambda w,t: wx.adv.DatePickerCtrl.SetValue(w, parse_date(t))), 'datetext': operate(wx.lib.masked.TextCtrl.GetValue, lambda w,t: wx.lib.masked.BaseMaskedTextCtrl.SetValue(w, str(t))), @@ -250,8 +250,13 @@ def _strip_secname(self, mkey): return mkey[len(prefix):] return mkey - def _decode_row(self, row): - utf = dict((self._strip_secname(k),v) for k,v in row.items()) + def _decode_row(self, row): + try: + utf = dict((self._strip_secname(k),v) for k,v in row.items()) + except: + print("-row : ",row) # added JJM 24/12/2024 + sys.exit("***decode_row error***") + if self.idcolumn not in utf.keys(): key = self._add_uuid(utf) return utf @@ -311,12 +316,16 @@ def append(self, mdict): dbentry = self._normalize_row(mdict) self._map[key] = dbentry self._strmap[self._make_keystring(mdict)] = key - self.write() + # print("mdict? : ",mdict) # JJM 27/12/2024 + # mdict? 
: {'name': 'Fournier, Georges', 'spelling': '', 'sex': 'm', 'birth_year': '', 'dialect': 'France', 'native_lang': 'Français', 'addon': 'nyòninsan 1976 (avec Andrée Audibert)', 'uuid': 'cc1eced6-e66e-4e9a-9b6d-8b209c56b699'} + if mdict['name']!='': # JJM 27/12/2024 - do not update authors unnecessarily + self.write() return dbentry def update(self, key, mdict): self._map[key] = self._normalize_row(mdict) - self.write() + if mdict['name']!='': # JJM 27/12/2024 - do not update authors unnecessarily + self.write() def getEntryByUUID(self, uuid): return self._map[uuid] @@ -403,6 +412,7 @@ def addPanel(self, evt=None): else: panel = DataPanel(self, config=self.config, section=self.section) self.sizer.Add(panel, 1, wx.EXPAND, 0) + panel.SetBackgroundColour((236, 211, 211, 255)) # was (96, 63, 63, 30) - (90, 50, 50, 30) (236, 211, 211, 255) (130, 80, 80, 30) self.panels.append(panel) self.Layout() @@ -452,6 +462,7 @@ def onItemSelected(self, values): dbentry = self.db.getEntryByKey(self.selector.GetValue()) self.setCurrentPanelData(dbentry.items()) except KeyError: + # print("KeyError :", self.selector.GetValue()) # JJM 04/01/2024 pass def saveDBEntries(self): @@ -465,6 +476,88 @@ def saveDBEntries(self): panel.setPanelData(dbentry.items()) self.selector.AutoComplete(choices=self.db.getList()) +class ResumePanel(wx.ScrolledWindow): + def __init__(self, parent, *args, **kwargs): + wx.ScrolledWindow.__init__(self, parent, *args, **kwargs) + self.SetScrollRate(20, 20) + self.parent = parent + self.isMetaShown=False + self.Sizer = wx.BoxSizer(wx.VERTICAL) + + def ShowMetas(self, metadata): + + if self.isMetaShown: + self.Sizer.Clear(delete_windows=True) # that's the key element missing : delete_windows=True + self.Sizer.Remove(self.Sizer) # probably does not do a thing... 
+ + metas={} + items=[] + authors=False + metatxt="" + for item,y in metadata._data.items(): + #print("item,y",item,y) + #metatxt+=item+" "+str(y)+"\n" + for sitem,svalue in y.items(): + #print("sitem,svalue:",sitem,str(svalue)) + if item=="author": + if sitem=="name": sitem="_name" # force sorting as 1st + elif item in ["source","text"] : + if sitem=="title": sitem="_title" + + thisvalue=', '.join(svalue) + + if item in items: + if item=="author" and authors: + authindex=0 + for v in svalue: + authindex+=1 + metas[item][authindex][sitem]=v + else: + metas[item][sitem]=thisvalue + else: + items.append(item) + if item=="author": + if len(svalue)>1: + authors=True + authindex=0 + for v in svalue: + authindex+=1 + if authindex==1: + metas[item]={authindex: {sitem:v}} + else : metas[item][authindex]={sitem:v} + else: + metas[item]={sitem:thisvalue} + else: + metas[item]={sitem:thisvalue} + + #print("metas:",metas) + # print sorted metas + metatxt="Métadonnées\n" + for x,y in sorted(metas.items()): + metatxt+=x+"\n" + for w,z in sorted(y.items()): + if x=="author" and authors: + metatxt+="\t"+str(w)+"\n" + for wn,zn in sorted(z.items()): + metatxt+="\t\t"+wn.strip("_")+" :\t "+zn+"\n" + else: + metatxt+="\t"+w.strip("_")+" :\t "+str(z)+"\n" + metatxt=metatxt.replace("\n","\n ") + metatxt=metatxt.replace(";"," ; ") + + self.st= wx.TextCtrl(self,style=wx.TE_MULTILINE|wx.TE_DONTWRAP) + self.st.SetValue(metatxt) + #font = wx.Font(12, wx.FONTFAMILY_MODERN, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL) + #self.SetFont(font) + self.metafont = self.GetFont() + self.metafont.SetPointSize(self.metafont.GetPointSize() + 2) + self.st.SetFont(self.metafont) + self.Sizer.Add(self.st, 1, wx.EXPAND) + + self.SetSizer(self.Sizer) + self.Layout() + self.isMetaShown=True + class MetaNotebook(wx.Notebook): """Notebook widget holding MetaPanels""" @@ -481,6 +574,8 @@ def __init__(self, parent, *args, **kwargs): self.parent = parent self.control = wx.TextCtrl(self, style=wx.TE_MULTILINE | 
wx.TE_READONLY) + # self.control = wx.TextCtrl(self, style=wx.TE_MULTILINE) # needs more work to enable text edits + Sizer = wx.BoxSizer(wx.VERTICAL) Sizer.Add(self.control, 1, wx.EXPAND) self.SetSizer(Sizer) @@ -499,6 +594,25 @@ def __init__(self, parent, config=None, encoding='utf-8', *args, **kwargs): self.metapanels = {} self.encoding = encoding + self.myconfig = wx.Config.Get() # JJM 26/12/2024 + global confpath + confpath = self.myconfig.Read("meta/confpath") + print("_init_confpath=",confpath) + if confpath!="": + self.config = MetaConfig(confpath) + # self.SetTitle("["+confpath+"]") # JJM 27/12/2024 + #wx.Config.Set(wx.Config("gparser", style=wx.CONFIG_USE_LOCAL_FILE)) + #self.config = wx.Config.Get(False) + #self.config.SetRecordDefaults() + + x=self.myconfig.ReadInt("MainFrame/pos/x",30) # JJM : recover previous layout + y=self.myconfig.ReadInt("MainFrame/pos/y",30) + w=self.myconfig.ReadInt("MainFrame/size/w",512) + h=self.myconfig.ReadInt("MainFrame/size/h",256) + #print("x,y,w,h",x,y,w,h) + self.SetPosition(wx.Point(x,y)) + self.SetSize(wx.Rect(x,y,w,h)) + filemenu = wx.Menu() menuOpen = filemenu.Append(wx.ID_OPEN, "O&pen", " Open text file") self.Bind(wx.EVT_MENU, self.OnOpen, menuOpen) @@ -522,6 +636,8 @@ def __init__(self, parent, config=None, encoding='utf-8', *args, **kwargs): configbutton.Bind(wx.EVT_FILEPICKER_CHANGED, self.OnConfigSelected) configbutton.SetTextCtrlProportion(2) configbutton.SetTextCtrlGrowable(True) + if confpath!="": + configbutton.SetPath(confpath) retainbutton = wx.ToggleButton(self, -1, 'Retain values for the next file') retainbutton.Bind(wx.EVT_TOGGLEBUTTON, self.OnRetainToggled) self.Sizer = wx.BoxSizer(wx.VERTICAL) @@ -536,6 +652,7 @@ def make_splitter(self): splitter.SetMinimumPaneSize(100) self.filepanel = FilePanel(splitter) self.notebook = MetaNotebook(splitter) + self.notebook.SetBackgroundColour((236, 211, 211, 255)) # was (96, 63, 63, 30) - (236, 211, 211, 255) (90, 50, 50, 30) 
splitter.SplitVertically(self.filepanel, self.notebook) return splitter @@ -548,8 +665,13 @@ def init_values(self): def draw_metapanels(self): for secname in self.config.sections(): metapanel = MetaPanel(self.notebook, config=self.config, section=secname) + # metapanel.Bind(wx.EVT_KILL_FOCUS, self.update_metadata) # does not work, only Save fixes metadata for Résumé self.metapanels[secname] = metapanel self.notebook.AddPage(metapanel, self.config.getSectionTitle(secname)) + # add résumé + self.resumepanel = ResumePanel(self.notebook) + # self.resumepanel.Bind(wx.EVT_SET_FOCUS, self.resumepanel.ShowMetas(self.metadata)) + self.notebook.AddPage(self.resumepanel, "[Résumé]") self.Layout() def clear_metapanels(self): @@ -569,19 +691,31 @@ def parse_file(self, ifile): def update_interface(self): for secname, secdata in self.metadata.sections(): self.metapanels[secname].setSectionData(secdata) + # update résumé + self.resumepanel.ShowMetas(self.metadata) def update_metadata(self): # collect all metadata given for secname, mp in self.metapanels.items(): self.metadata.setSection(secname, mp.getSectionData()) + # update résumé + self.resumepanel.ShowMetas(self.metadata) def write_xmldata(self): self.update_metadata() + metadatadict=self.metadata.toPlain() # text-script REQUIRED VALUE JJM 29/12/2024 + if metadatadict['text:script']=='': + # wx.MessageBox('text:script is not set, please check in the Texte panel', 'value required', wx.OK | wx.ICON_INFORMATION) + wx.MessageBox('Le Type d\'écriture (text:script) doit être précisé, vérifier dans l\'onglet Texte ', 'value required', wx.OK | wx.ICON_INFORMATION) + return tempout = tempfile.NamedTemporaryFile(delete=False) - self.io.write(tempout.name, metadata=self.metadata.toPlain()) + # self.io.write(tempout.name, metadata=self.metadata.toPlain()) + self.io.write(tempout.name, metadata=metadatadict) + tempout.close() outfile = os.path.join(self.dirname, self.filename) shutil.copyfile(tempout.name, outfile) + #print("tempout.name: 
",tempout.name) os.unlink(tempout.name) def OnRetainToggled(self, e): @@ -591,7 +725,14 @@ def OnConfigSelected(self, e): if self.filename: self.FileOpenedError(e) else: + global confpath confpath = e.GetPath() + + # self.SetTitle("["+confpath+"]") # JJM 26/12/2024 + myconfig = wx.Config.Get() + myconfig.Write("meta/confpath",confpath) # JJM 26/12/2024 + myconfig.Flush() + self.config = MetaConfig(confpath) if len(self.metapanels) > 0: self.clear_metapanels() @@ -615,7 +756,7 @@ def OnOpen(self,e): if not self.config: self.NoFileError(e) return False - dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "*.*", wx.FD_OPEN) + dlg = wx.FileDialog(self, "Choose a file", self.dirname, "", "meta files (txt/html)|*.html;*.txt", wx.FD_OPEN) if dlg.ShowModal() == wx.ID_OK: self.infile = dlg.GetPath() self.filename = os.path.basename(self.infile) @@ -627,6 +768,9 @@ def OnOpen(self,e): self.draw_metapanels() self.update_interface() self.filepanel.control.SetValue(self.txt) + # global confpath + # self.SetTitle("["+confpath+"] - "+self.filename) # added JJM 26/12/2024 + self.SetTitle("Meta : "+self.filename) # added JJM 30/12/2024 dlg.Destroy() def OnSave(self,e): @@ -658,6 +802,14 @@ def OnSaveAs(self,e): def OnClose(self,e): self.OnSave(e) self.init_values() + # JJM save window position and size + x,y=self.Position + w,h=self.Size + self.myconfig.WriteInt("MainFrame/pos/x",x) + self.myconfig.WriteInt("MainFrame/pos/y",y) + self.myconfig.WriteInt("MainFrame/size/w",w) + self.myconfig.WriteInt("MainFrame/size/h",h) + self.myconfig.Flush() # permanently writes if self.cleanup: self.clear_metapanels() self.draw_metapanels() diff --git a/daba/mparser.py b/daba/mparser.py index 0602edc..1c0d5a0 100644 --- a/daba/mparser.py +++ b/daba/mparser.py @@ -28,6 +28,8 @@ from daba.plugins.tokenizer import TokenizerData from daba.orthography import tones_match, detone +import re # JJM used in removing tones + class Tokenizer(object): def __init__(self): self._data = 
TokenizerData() @@ -278,6 +280,9 @@ def convert_orthography(self, word): for plugin in self.converters: converted = [] for w in wlist: + # print("w:'"+w+"'") # if next line starts with " we have here '\n"' ??? + # if '\n' in w : w=w.replace('\n','') + # this should NEVER happen given the definition of 'Par' in tokenizer.py!!! for result in plugin.convert(w): converted.append(result) wlist = converted @@ -297,6 +302,11 @@ def filter_parsed(self, results, forms): return stage, filtered def parse(self, txt): + + def iscapitalized(s): + if len(s) ==1 : return s.isupper() + else: return (s[0].isupper() and s[1:].islower()) + self.parsed = [] for para in txt: par = [] @@ -305,32 +315,335 @@ def parse(self, txt): st = (sttoken, []) par.append(st) annot = st[1] + wordindex=0 + # print("sent:",sent) for token in sent: if token.type in ['Comment', 'Tag']: annot.append(daba.formats.PlainToken((token.type, token.value))) elif token.type in ['Punct', 'SentPunct', 'Nonword']: + # annot.append(daba.formats.PlainToken(('c', token.value))) if self.converters: + # print("token value:",token.value) ctoken = self.convert_orthography(token.value)[0] else: ctoken = token.value annot.append(daba.formats.PlainToken(('c', ctoken))) + # print('PUNCT token.value=',token.value) + if token.value in [":","«",'"','“',"-","("] : + wordindex=0 # need to reset, next word in Capital like at start of sentence - for instance after : or « + # print("(in) wordindex=",wordindex,"token.value=",token.value) + elif token.type in ['Cardinal']: gloss = Gloss(token.value, ('num',), 'CARDINAL', ()) annot.append(daba.formats.WordToken([gloss], token.value, 'tokenizer')) elif token.type in ['Word']: + wordindex+=1 if self.converters: wlist = self.convert_orthography(token.value) + # print("token.value, wlist", token.value, wlist) converts = [] for w in filter(None, wlist): converts.append( - self.parser.lemmatize(w.lower()) + #self.parser.lemmatize(w.lower()) + self.parser.lemmatize(w) ) try: stage, glosslist = 
self.filter_parsed(converts, list(filter(None, wlist))) except ValueError: print("WARNING: invalid orthographic conversion result, skippig token:", token.type, token.value, converts) else: - stage, glosslist = self.parser.lemmatize(token.value.lower()) + #stage, glosslist = self.parser.lemmatize(token.value.lower()) + stage, glosslist = self.parser.lemmatize(token.value) + # but that's not enough: + # if stage, glosslist looks like: (-1, [Gloss(form='hawa', ps=(), gloss='', morphemes=())]) + # we need to try again : if 1st letter is in lower case, try uppercasing it + # else if it is in uppercase, try lowercasing it. + + # for now restricted to results with glosslist limited to one + # it would be nice to also tackle longer glosslist with emply morphemes gloss, + # where only derivation morphemes are identifiable, e.g. : + # propername Keyita, distracted by possible suffix -ta + # [Gloss(form='keyita', ps=(), gloss='', morphemes=()), Gloss(form='keyita', ps=('ptcp',), gloss='', morphemes=(Gloss(form='keyi', ps=('v',), gloss='', morphemes=()), Gloss(form='ta', ps=('mrph',), gloss='PTCP.POT', morphemes=())))] + # propername Mayiga:, distracted by possible prefix ma- + # [Gloss(form='mayiga', ps=(), gloss='', morphemes=()), Gloss(form='mayiga', ps=('v',), gloss='', morphemes=(Gloss(form='mà', ps=('mrph',), gloss='SUPER', morphemes=()), Gloss(form='yiga', ps=('v',), gloss='', morphemes=())))] + + if len(glosslist)==1 or glosslist[0].gloss=="": + gloss=glosslist[0] + if ( len(gloss.ps)==0 and gloss.gloss=="") \ + or (len(gloss.ps)==1 and gloss.ps[0]=="") \ + or (len(gloss.ps)==1 and gloss.ps[0]=="n.prop") : + #Last test + #1) n.prop:NOM is only the default for capitalized token (see bamana.gram.txt) + # but I removed this rtestriction: and gloss.gloss=="NOM" because: + #2) other proper names are inherently misleading and should be presented with alternative lowercase solution + # like Sira (also sira: road, path) or Misi (also misi: cow) and many more + # not doing so 
produces ridiculous solutions - JJM 07/02/2025 + + # 01/04/2025 traiter des cas comme + # cikɛ-bolofarakuntigi:n: [cikɛ-bolofarakun:n: tìgi:n:maître] + # en éliminant le ou les - + + trythis="" + + if self.converters: + wlist = self.convert_orthography(token.value) + trythis=wlist[0] + + if "-" in trythis: + wlist.append(trythis.replace("-","")) + + if trythis.islower(): + converts = [] + for w in filter(None, wlist): + w=w.capitalize() + converts.append( + self.parser.lemmatize(w) + ) + elif iscapitalized(trythis) : + converts = [] + for w in filter(None, wlist): + w=w.lower() + converts.append( + self.parser.lemmatize(w) + ) + else: # word is all caps or a mixture + converts = [] + for w in filter(None, wlist): + w=w.lower() + converts.append( + self.parser.lemmatize(w) + ) + for w in filter(None, wlist): + w=w.lower().capitalize() + converts.append( + self.parser.lemmatize(w) + ) + try: + # print("wlist=",wlist) + stage2, glosslist2 = self.filter_parsed(converts, list(filter(None, wlist))) + except ValueError: + print("WARNING4: invalid orthographic conversion result, skipping token:", token.type, token.value, converts) + + else: + # print("not self.converters token.value=",token.value) + trythis2="" + if token.value.islower(): + trythis=token.value.capitalize() + elif iscapitalized(token.value): + trythis=token.value.lower() + else: + trythis=token.value.lower() + trythis2=token.value.lower().capitalize() + try: + stage2,glosslist2=self.parser.lemmatize(trythis) + except ValueError: + print("WARNING5: invalid orthographic conversion result, skipping token:", token.type, token.value, converts) + + if trythis2 != "": + try: + stage3,glosslist3=self.parser.lemmatize(trythis2) + glosslist2=glosslist2+glosslist3 + except ValueError: + print("WARNING6 ",trythis2," lemmatize error ") + + if trythis != "": + #gl2="" + #for g in glosslist2: + # gl2 +=str(g)+", " + #print("glosslist2","["+gl2[:-2]+"]") + + if len(glosslist2)>0 and glosslist2!=glosslist: + for g in 
glosslist2: + if len(g.ps)>0 and g.ps[0] != "": + nggok=0 + if g.gloss=="" : + if len(g.morphemes)>0 : + for gg in g.morphemes: + if len(gg.ps)==1 and gg.ps[0]!="mrph" and gg.gloss!="": nggok+=1 + else: + nggok=1 + + if nggok>0: + if g in glosslist: continue + else: glosslist.append(g) + + # do some cleanup on resulting list + glosslist2=glosslist + glosslist=[] + for g in glosslist2: + if len(g.ps)>0 and g.ps[0] != "": + nggok=0 + if g.gloss=="" : + if len(g.morphemes)>0 : + for gg in g.morphemes: + if len(gg.ps)==1 and gg.ps[0]!="mrph" and gg.gloss!="": nggok+=1 + else: + nggok=1 + + if nggok>0: + if g in glosslist: continue + else: glosslist.append(g) + if len(glosslist) == 0 : + glosslist=glosslist2 # no interesting results found! + + + # MORE CLEANUP: + # if len glosslist>1 éliminate X:n.prop:NOM artificial entries + if len(glosslist)>1: + glosslist2=glosslist + glosslist=[] + for g in glosslist2: + if len(g.ps)>0: + if not (g.ps[0]=='n.prop' and g.gloss=='NOM'): + glosslist.append(g) + if len(glosslist) == 0 : + glosslist=glosslist2 # no interesting results found! 
+ + #gl1="" + #for g in glosslist: + # gl1 +=str(g)+", " + #print("solution found?:","["+gl1[:-2]+"]") + + # needs testing here if it is the first Word in sentence (added wordindex) + # do "trythis" (try lower case - but also try uppercase if it's old bambara all in lowercase) + # and append to already found glosslist + # example Masalabolo may have identified Masala as TOP + bolo "branch" + # but it can also be masalabolo "text" + # print("(word) wordindex=",wordindex,"token=",token.value) + if wordindex==1: + # caution: this is not beginning of sentence but is restarted at punctuations, ie: also after a comma + + # assess initial situation, how many catitalized and lowercase results + gl1 ="" + ncapitalized=0 + nlowercase=0 + # print("glosslist",glosslist) + for g in glosslist: + if g.form[0].islower(): nlowercase+=1 + else: ncapitalized+=1 + gl1 +=str(g)+", " + #print("standard list for :",token.value,"glosslist:","["+gl1 [:-2]+"]") + #if nlowercase==0:print("all capitalized") + #elif ncapitalized==0:print("all lowercase") + #else: print("mixed list lower and capitalized") + + trythis="" + + testthis=token.value[0] + if self.converters: + wlist = self.convert_orthography(token.value) + testthis = wlist[0] + if "-" in testthis: + wlist.append(testthis.replace("-","")) + + if testthis.islower(): # was gloss.form + if ncapitalized == 0: # not capitalized result in standard approach + trythis=token.value.capitalize() # no worry if converters (not used there), trythis just needs to be <> " + if self.converters: + # already done : wlist = self.convert_orthography(token.value) + converts = [] + for w in filter(None, wlist): + w=w.capitalize() + converts.append( + self.parser.lemmatize(w) + ) + try: + stage2, glosslist2 = self.filter_parsed(converts, list(filter(None, wlist))) + except ValueError: + print("WARNING2: invalid orthographic conversion result, skippig token:", token.type, token.value, converts) + else: + stage2,glosslist2=self.parser.lemmatize(trythis) + + 
#gl2="" + #for g in glosslist2: gl2+=str(g)+", " + #print("islower trythis:",trythis,"glosslist2:","["+gl2[:-2]+"]") + + else: # Word was capitalized + if nlowercase == 0 : # no lowercase result in standard approach + trythis=token.value.lower() # no worry if converters (not used there), trythis just needs to be <> " + if self.converters: + # already done: wlist = self.convert_orthography(token.value) + converts = [] + for w in filter(None, wlist): + w=w.lower() + converts.append( + self.parser.lemmatize(w) + ) + try: + stage2, glosslist2 = self.filter_parsed(converts, list(filter(None, wlist))) + except ValueError: + print("WARNING3: invalid orthographic conversion result, skippig token:", token.type, token.value, converts) + else: + stage2,glosslist2=self.parser.lemmatize(trythis) + + #gl2="" + #for g in glosslist2: gl2+=str(g)+", " + #print("not islower trythis:",trythis,"glosslist2:","["+gl2[:-2]+"]") + + if trythis!="": + + if glosslist2!=glosslist: + for g in glosslist2: + if len(g.ps)>0 : + nggok=0 + if g.gloss=="" : + if len(g.morphemes)>0 : + for gg in g.morphemes: + if len(gg.ps)==1 and gg.ps[0]!="mrph" and gg.gloss!="": nggok+=1 + else: + nggok=1 + + if nggok>0: + if g in glosslist: continue + else: glosslist.append(g) + + # nettoyage des doublons + # faut-il plus de nettoyage (gloses inutiles) ??? + + glosslist2=glosslist + glosslist=[] + for g in glosslist2: + if g not in glosslist: + glosslist.append(g) + + if len(glosslist)==0: + glosslist=glosslist2 + + # MORE CLEANUP: + # if len glosslist>1 éliminate X:n.prop:NOM artificial entries + if len(glosslist)>1: + glosslist2=glosslist + glosslist=[] + for g in glosslist2: + if len(g.ps)>0: + if not (g.ps[0]=='n.prop' and g.gloss=='NOM'): + glosslist.append(g) + if len(glosslist) == 0 : + glosslist=glosslist2 # no interesting results found! 
+ + + + #gl1 ="" + #for g in glosslist: gl1 +=str(g)+", " + #print("Resulting glosslist:","["+gl1 [:-2]+"]") + + #else: print("no change") + + # 29/09/2025: move forms that are different from original at the end of the list + glistsame=[] + glistdiff=[] + if len(glosslist)>1 : + refform=re.sub(r'[\u0300\u0301\u0302\030c]','',token.value.lower()) + # refform=re.sub(r'[\u0300\u0301\u0302\030c]','',wlist[0]) + for g in glosslist: + thisform=re.sub(r'[\u0300\u0301\u0302\030c]','',g.form) + #print("refform, thisform",refform,thisform) + if thisform==refform: + glistsame.append(g) + else: + glistdiff.append(g) + glosslist=glistsame+glistdiff + if self.normalize_orthography and self.converters: if len(wlist) == 1: @@ -346,6 +659,10 @@ def parse(self, txt): else: annot.append(daba.formats.WordToken(glosslist, token.value, str(stage))) + + + + self.parsed.append(par) return self.parsed diff --git a/daba/newmorph.py b/daba/newmorph.py old mode 100644 new mode 100755 index 890dd80..0581f20 --- a/daba/newmorph.py +++ b/daba/newmorph.py @@ -215,7 +215,8 @@ def parse(self, pattern, gloss): result = pattern.apply(gloss) if result: if () in [m.ps for m in result.morphemes]: - ms = filter(lambda i: i.ps is not (), result.morphemes) + # ms = filter(lambda i: i.ps is not (), result.morphemes) + ms = filter(lambda i: i.ps != (), result.morphemes) result = result._replace(morphemes=ms) return [result] else: @@ -232,9 +233,16 @@ def decompose(self, pattern, gloss): if parts < 2: return self.parse(pattern, gloss) else: - if gloss.morphemes: - #FIXME: use only first non-glossed morpheme as possible stem - stemgloss, stempos = [(m,pos) for pos,m in enumerate(gloss.morphemes) if not m.gloss][0] + if gloss.morphemes : + xxx=[(m,pos) for pos,m in enumerate(gloss.morphemes) if not m.gloss] + if len(xxx)>0: + #FIXME: use only first non-glossed morpheme as possible stem + #print("newmorph 237 m,pos :",[(m,pos) for pos,m in enumerate(gloss.morphemes) if not m.gloss]) + # stemgloss, stempos = 
[(m,pos) for pos,m in enumerate(gloss.morphemes) if not m.gloss][0] + stemgloss, stempos =xxx[0] + else: + stemgloss = gloss + stempos = -1 else: stemgloss = gloss stempos = -1 diff --git a/daba/plugins/bamlatinold.py b/daba/plugins/bamlatinold.py index f5a5f2c..135f860 100644 --- a/daba/plugins/bamlatinold.py +++ b/daba/plugins/bamlatinold.py @@ -12,25 +12,30 @@ def __init__(self, *args, **kwargs): self.title = 'bamlatinold' self.desc = 'Convertor from old latin Bambara orthography (ambiguous)' + # some transforms could be valid for bamlatinNEW: u'ng':[u'ng',u'ŋ'], u'ny':[u'ny',u'ɲ'], u'ua':['wa'] + def convert(self, token): """ Main conversion method """ - conversion_table = {u'è':[u'ɛ'], u'ò':[u'ɔ'], u'èe':[u'ɛɛ'], u'òo':[u'ɔɔ'], u'ng':[u'ng',u'ŋ'], u'ny':[u'ny',u'ɲ']} - + conversion_table = {u'è':[u'ɛ'], u'ò':[u'ɔ'], u'èe':[u'ɛɛ'], u'òo':[u'ɔɔ'], u'ng':[u'ng',u'ŋ'], u'ny':[u'ny',u'ɲ'], u'ua':['wa']} + def graphemes_old(word): # split word into maximal length graphemes (old orthography) specs = [ ('NG', (r'ng', re.I | re.U)), ('NY', (r'ny', re.I | re.U)), + ('UA', (r'ua', re.I | re.U)), ('EE', (r'è[eè]', re.I | re.U)), ('OO', (r'ò[oò]', re.I | re.U)), ('NL', (r'[\n]+', re.U)), ('QUOT', (r'["]', re.U)), ('ANY', (r'.', re.U)), ] + tok = funcparserlib.lexer.make_tokenizer(specs) r = [x.value for x in tok(unicodedata.normalize('NFKC', word)) if x.type != 'NL'] + return r def multiply_list(amblist): diff --git a/daba/plugins/dukure.py b/daba/plugins/dukure.py new file mode 100644 index 0000000..def4194 --- /dev/null +++ b/daba/plugins/dukure.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# -*- coding: utf8 -*- + +from daba.plugins import OrthographyConverter +import funcparserlib.lexer +import re +import unicodedata + + +class DukuretoNew(OrthographyConverter): + def __init__(self, *args, **kwargs): + self.title = 'dukure' + self.desc = 'Convertor from Mamadu Dukurɛ (Fakan) orthography' + + def convert(self, token): + """ + Main conversion method + """ + 
conversion_table = {r"^['‘]":[u''], u'-':[u'-', u''], u'ʃ':[u's', u'sh'], + u'\u0300':[''], u'à':[u'a'], u'è':[u'e'], u'ì':[u'i'], u'ò':[u'o'], u'ù':[u'u'], u'ὲ':[u'ɛ'], + u'\u041B':[u'Ɲ'], u'\u043B':[u'ɲ'], u'ɑ':['a'], u'ε':['ɛ'], + u'ng':[u'ng',u'ŋ'], u'ny':[u'ny',u'ɲ'], + u'bl':[u'bil',u'bul'], u'tl':[u'til', u'tul']} + # apostrophe initiale=article défini: supprimer + # critique : le parser l'a déjà détachée comme ponctuation + # ça n'est pas toujours vrai : 'kɔnɔ. en fin de phrase ??? + # le tiret intermédiaire est fréquemment utilisé mais met en difficulté gparser : convertir en deux solutions + # lettre ʃ : convertir: 2 solutions sh ou s + # ton bas : supprimer + # MAIS ils sont le plus souvent traités par des monolithes àè...ὲ ! d'où la 2ème ligne + # critique : le parser a déjà divisé les monolithes + # Accidents fréquents + # - Л cyrillique : convertir en Ɲ - minuscule л ajoutée - incertitude ne marche pas sur le dernier mot d'une phrase? + # - ɑ cyrillique : convertir en a + # - ε grec : convertir en ɛ + # ng, ny gardés "au cas où" + # les majuscules aux verbes, marques prédicatives, copules sont traitées dans un dictionnaire avec \ve + # contractions: (Bla pour bìla): traitées aussi en dictionnaire ? 
+ + + def graphemes_old(word): + # split word into maximal length graphemes (old orthography) + specs = [ + ('NG', (r'ng', re.I | re.U)), + ('NY', (r'ny', re.I | re.U)), + ('BL', (r'bl', re.I | re.U)), + ('TL', (r'tl', re.I | re.U)), + ('ANY', (r'.', re.U)), + ] + + tok = funcparserlib.lexer.make_tokenizer(specs) + r = [x.value for x in tok(unicodedata.normalize('NFKC', word))] + + return r + + def multiply_list(amblist): + # given list of lists, returns list of all possible concatenations + # taking a single element from each list + def multiply_list_aux(l, amblist): + if len(amblist)>0: + m = [ l[k]+[amblist[0][i]] for k in range(len(l)) for i in range(len(amblist[0]))] + return multiply_list_aux(m, amblist[1:]) + else: + return l + return multiply_list_aux([[]], amblist) + + def convertg(grapheme): + # convert a single grapheme into a list of corresponding graphemes in new orthography + try: + # !!HACK: converts graphemes to lowercase!! + return conversion_table[grapheme.lower()] + except KeyError: + return [grapheme] + + def convertw(word): + # given single word in old orthography returns + # list of all possible translations to new orthography + graphemes = [convertg(g) for g in graphemes_old(word)] + return [''.join(w) for w in multiply_list(graphemes)] + + x=convertw(token) + # print("DukuretoNew: ",x) + return x diff --git a/daba/plugins/nko.py b/daba/plugins/nko.py index b34b0ad..4a877c8 100644 --- a/daba/plugins/nko.py +++ b/daba/plugins/nko.py @@ -17,6 +17,7 @@ def convert(self, token): """ w = token + # if token.type != 'Word': # w = w.replace('\u060c', ',') # w = w.replace('\u200f', '') @@ -28,14 +29,15 @@ def convert(self, token): if debug: print("NKO", w, ) - ### FOREIGN sounds with diacritics: + ### FOREIGN sounds with diacritics: + # 07ed=short rising tone (dot above) 07f3=double dot above 07eb= short high tone (bar above) w = w.replace('\u07d6\u07ed', r"z") w = w.replace('\u07db\u07ed', r"S") ### SH w = w.replace('\u07dc\u07ed', r"g") w = 
w.replace('\u07dd\u07ed', r"v") w = w.replace('\u07d8\u07ed', r"D") ### D. w = w.replace('\u07e4\u07ed', r"Q") ### H. - w = w.replace('\u07d7\u07ed', r"J") ### C. + w = w.replace('\u07d7\u07ed', r"J") ### C. CHA+short rising tone w = w.replace('\u07de\u07ed', r"x") ### K. w = w.replace('\u07d5\u07ed', r"T") ### T. @@ -46,21 +48,22 @@ def convert(self, token): w = w.replace('\u07db\u07eb', r"C") ### S= w = w.replace('\u07de\u07eb', r"q") ### K= - w = w.replace('\u07f3', "\u0308") - w = w.replace('\u07f6', r"o") - w = w.replace('\u07cb\u0623', r"{") - w = w.replace('\u07cb\u0625', r"}") + w = w.replace('\u07f3', "\u0308") # Double dot above -> ̈ Combining Diaeresis (U+0308) + + w = w.replace('\u07f6', r"o") # ߶ N'Ko OO dennen + w = w.replace('\u07cb\u0623', r"{") # ߋ N'ko EE + أ Arabic Letter Alef With Hamza Above (U+0623) + w = w.replace('\u07cb\u0625', r"}") # ߋ N'ko EE + إ Arabic Letter Alef With Hamza Below (U+0625) ### VOWELS: w = w.replace('\u07ca', r"a") w = w.replace('\u07cb', r"e") w = w.replace('\u07cc', r"i") - w = w.replace('\u07cd', r"H") + w = w.replace('\u07cd', r"H") # ߍ w = w.replace('\u07ce', r"u") w = w.replace('\u07cf', r"o") w = w.replace('\u07d0', r"O") ### SYLLABIC N - w = w.replace('\u07d2', r"N") + w = w.replace('\u07d2', r"N") # ߒ ### CONSONANTS: w = w.replace('\u07d3', r"b") @@ -76,30 +79,45 @@ def convert(self, token): w = w.replace('\u07dd', r"f") w = w.replace('\u07de', r"k") w = w.replace('\u07df', r"l") - w = w.replace('\u07e0', r"n") # Na woloso + w = w.replace('\u07e0', r"n") # ߠ Na woloso w = w.replace('\u07e1', r"m") - w = w.replace('\u07e2', r"Y") # Nya + w = w.replace('\u07e2', r"Y") # ߢ Nya w = w.replace('\u07e3', r"n") w = w.replace('\u07e4', r"h") w = w.replace('\u07e5', r"w") w = w.replace('\u07e6', r"y") - w = w.replace('\u07e7', r"y") # Nya woloso + w = w.replace('\u07e7', r"y") # ߧ Nya woloso ### APOSTROPHES: - w = w.replace('\u07f4', r"’") - w = w.replace('\u07f5', r"‘") - + #w = w.replace('\u07f4', r"’") + #w = 
w.replace('\u07f5', r"‘") + w = w.replace('\u07f4', r"'") # MODIFIED JJM 09/10/2024 + w = w.replace('\u07f5', r"'") # MODIFIED JJM 09/10/2024 + ### DOUBLE PUNCTUATIONS # MODIFIED JJM 18/10/2024 + w = w.replace('ߵߵ', r'"') + #w = w.replace('<<', r'«') # this is handled in mparser sent_splitter + #w = w.replace('>>', r'»') ### PUNCTUATION: w = w.replace('\u060c', r",") # Arabic comma w = w.replace('\u061f', r"?") # Arabic question mark - w = w.replace('؛', r";") - w = w.replace('\u07fa', r"-") - w = w.replace('\u066a', r"%") - w = w.replace('\u200f', '') # right-to-left mark - w = w.replace('\u07f9', r"!") - w = w.replace('\u07f8', "\u00b7") # strange ·_ + w = w.replace('؛', r";") # U+061B ؛ ARABIC SEMICOLON + w = w.replace('\u07fa', r"-") # N'Ko LAJANYALAN + w = w.replace('\u066a', r"%") # Arabic Percent sign ٪ + w = w.replace('\u200f', '') # right-to-left mark + w = w.replace('\u07f9', r"!") # N'Ko EXLAMATION MARK + w = w.replace(' \u07d1 ', r" ߸ ") # erroneous use of N'ko dagbasinna as N'Ko comma (always between spaces) - JJM 19/nov/2024 + w = w.replace('\u07f8', ",") # N'Ko COMMA ߸ + w = w.replace('\u202e', " ") # RLO (Right-Left Override) + # not translated : ref https://www.unicode.org/charts/PDF/U07C0.pdf + # \07F7 NKO SYMBOL GBAKURUNEN ߷ + # \07FE NKO DOROME SIGN ߾߾ + # \07FF NKO TAMAN SIGN ߿ + # \07FD NKO DANTAYALAN ߽ + # bad idea? w = w.replace('(', "¤(¤") # rotate parenthesis JJM 11/12/2024 + # bad idea? w = w.replace(')', "(") + # bad idea? w = w.replace("¤(¤",")") ### MARKING HIGH TONE: w = re.sub('(a|e|H|i|o|O|u|N)(b|p|t|j|c|d|r|R|s|G|f|k|l|n|m|Y|h|w|y|z|g|S|v|F|D|Q|J|A|T|Z|C|x|q|-)', "\\1\u0301\\2", w) @@ -175,7 +193,17 @@ def convert(self, token): ### REMOVE flowting low tone from "i" and "n" pronouns: w = re.sub('\b([iN])\u0301`', "\\1\u0301", w) - ### NUMERALS: + ### if N'Ko Text already contains LATIN NUMBERS (happened in Solomana Kante Kurukanfuwa) JJM nov 2024 + # REVERSE NUMBERS : 31 in N'Ko is 13 in latin ! 
+ def reverse(m): + digits=m.groups()[0] + digitsr = digits[len(digits)::-1] + return digitsr + w = re.sub(r'([0-9][0-9]+)',reverse,w) + # otherwise, the reversal comes with the replacement of RTL characters with LTR characters + # for numbers just as well as for words + + ### NUMERALS: w = w.replace('\u07c0', r"0") w = w.replace('\u07c1', r"1") w = w.replace('\u07c2', r"2") @@ -186,7 +214,12 @@ def convert(self, token): w = w.replace('\u07c7', r"7") w = w.replace('\u07c8', r"8") w = w.replace('\u07c9', r"9") - w = re.sub('(\d)\u07f2', r"\1nan", w) + w = re.sub('r(\d)\u07f2', r"\1nan", w) + w = re.sub('r(\d) \u07f2', r"\1nan", w) # I notice it is considered separate JJM 28/10/2024 + w = re.sub('r\u07f2(\d)', r"\1nan", w) # is it not the reverse? JJM 28/10/2024B + w = re.sub('r\u07f2 (\d)', r"\1nan", w) # is it not the reverse? JJM 28/10/2024B + #NB : none of these 4 approaches work ? + ### NASALIZATION MARK: w = w.replace('\u07f2', r"n") @@ -199,14 +232,14 @@ def convert(self, token): w = w.replace('R', r"rr") w = w.replace('G', r"gb") w = w.replace('S', r"sh") # sh - w = w.replace('D', "d\u0323") ### D. - w = w.replace('Q', "\u0127") ### H. - w = w.replace('J', "\u0292") ### C. = zh - w = w.replace('A', "\u0295") ### A" - w = w.replace('F', "\u03b8") ### S" - w = w.replace('T', "t\u0323") ### J" - w = w.replace('Z', "z\u0323") ### J" - w = w.replace('C', "s\u0323") ### S= + w = w.replace('D', "d\u0323") ### D. u0323=Combining dot below + w = w.replace('Q', "\u0127") ### H. Latin Small Letter H With Stroke (U+0127) + w = w.replace('J', "\u0292") ### C. = zh - why not j ??? 
+ w = w.replace('A', "\u0295") ### A" Latin Letter Pharyngeal Voiced Fricative (U+0295) + w = w.replace('F', "\u03b8") ### S" Greek Small Letter Theta (U+03B8) + w = w.replace('T', "t\u0323") ### J" u0323=Combining dot below + w = w.replace('Z', "z\u0323") ### J" u0323=Combining dot below + w = w.replace('C', "s\u0323") ### S= u0323=Combining dot below w = re.sub('[‘]', r"`", w) w = re.sub('[’]', r"'", w) w = w.replace('_', '') @@ -217,7 +250,7 @@ def convert(self, token): if debug: print("LAT", w,) w = self.normalize_tones(w) - + if debug: print("TNL", w) return [w] diff --git a/daba/plugins/thoyer.py b/daba/plugins/thoyer.py new file mode 100644 index 0000000..7f9d374 --- /dev/null +++ b/daba/plugins/thoyer.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# -*- coding: utf8 -*- + +from daba.plugins import OrthographyConverter +import funcparserlib.lexer +import re +import unicodedata + + +class ThoyertoNew(OrthographyConverter): + def __init__(self, *args, **kwargs): + self.title = 'thoyer' + self.desc = 'Convertor from Annik Thoyer transcriptions 1978' + + def convert(self, token): + """ + Main conversion method + """ + conversion_table = {u'èè':[u'ɛɛ'], u'òò':[u'ɔɔ'], u'èe':[u'ɛɛ'], u'òo':[u'ɔɔ'], u'è':[u'ɛ'], u'ò':[u'ɔ'], + u'ng':[u'ng',u'ŋ'], u'ny':[u'ny',u'ɲ'], + u'sy':[u'sh', u's'], u'y':[u'y', u'j'], u'gh':[u'g'], u'gb':[u'g'], u'gw':[u'g',u'j',u'gw']} + + def graphemes_old(word): + # split word into maximal length graphemes (old orthography) + specs = [ + ('NG', (r'ng', re.I | re.U)), + ('NY', (r'ny', re.I | re.U)), + ('EE', (r'è[eè]', re.I | re.U)), + ('OO', (r'ò[oò]', re.I | re.U)), + ('GH', (r'gh', re.I | re.U)), + ('GB', (r'gb', re.I | re.U)), + ('GW', (r'gw', re.I | re.U)), + ('SY', (r'sy', re.I | re.U)), + ('ANY', (r'.', re.U)), + ] + + tok = funcparserlib.lexer.make_tokenizer(specs) + r = [x.value for x in tok(unicodedata.normalize('NFKC', word))] + + return r + + def multiply_list(amblist): + # given list of lists, returns list of all possible 
concatenations + # taking a single element from each list + def multiply_list_aux(l, amblist): + if len(amblist)>0: + m = [ l[k]+[amblist[0][i]] for k in range(len(l)) for i in range(len(amblist[0]))] + return multiply_list_aux(m, amblist[1:]) + else: + return l + return multiply_list_aux([[]], amblist) + + def convertg(grapheme): + # convert a single grapheme into a list of corresponding graphemes in new orthography + try: + # !!HACK: converts graphemes to lowercase!! + return conversion_table[grapheme.lower()] + except KeyError: + return [grapheme] + + def convertw(word): + # given single word in old orthography returns + # list of all possible translations to new orthography + graphemes = [convertg(g) for g in graphemes_old(word)] + return [''.join(w) for w in multiply_list(graphemes)] + + return convertw(token) diff --git a/daba/plugins/ytcisse.py b/daba/plugins/ytcisse.py new file mode 100644 index 0000000..e1880fb --- /dev/null +++ b/daba/plugins/ytcisse.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# -*- coding: utf8 -*- + +from daba.plugins import OrthographyConverter +import funcparserlib.lexer +import re +import unicodedata + + +class YTCissetoNew(OrthographyConverter): + def __init__(self, *args, **kwargs): + self.title = 'ytcisse' + self.desc = 'Convertor from Youssouf Tata Cissé transcriptions of Wa Kamissoko 1976' + + def convert(self, token): + """ + Main conversion method + """ + conversion_table = {u'èè':[u'ɛɛ'], u'òò':[u'ɔɔ'], u'èe':[u'ɛɛ'], u'òo':[u'ɔɔ'], u'è':[u'ɛ'], u'ò':[u'ɔ'], + u'ng':[u'ng',u'ŋ'], u'nw':[u'ng',u'nw'], u'ny':[u'ny',u'ɲ'], u'dy':[u'j',u'dy'], u'ty':[u'c',u'ty'], u't':[u't',u'c'], u'k':[u'k',u'g'], + u'sh':[u's',u'sh'], u'sy':[u'sh', u's'], u'y':[u'y', u'j'], u'gh':[u'g'], u'gb':[u'g'], u'gw':[u'g',u'j',u'gw'], + u'aa':[u'aa',u'a'], u'ee':[u'ee',u'e'], u'ii':[u'ii',u'i'], u'oo':[u'oo',u'o'], u'uu':[u'uu',u'u'], u'ɛɛ':[u'ɛɛ',u'ɛ'], u'ɔɔ':[u'ɔɔ',u'ɔ']} + + def graphemes_old(word): + # split word into maximal length graphemes (old 
orthography) + specs = [ + ('NG', (r'ng', re.I | re.U)), + ('NY', (r'ny', re.I | re.U)), + ('NW', (r'nw', re.I | re.U)), + ('DY', (r'dy', re.I | re.U)), + ('TY', (r'ty', re.I | re.U)), + ('EE', (r'è[eè]', re.I | re.U)), + ('OO', (r'ò[oò]', re.I | re.U)), + ('GH', (r'gh', re.I | re.U)), + ('GB', (r'gb', re.I | re.U)), + ('GW', (r'gw', re.I | re.U)), + ('SH', (r'sh', re.I | re.U)), + ('SY', (r'sy', re.I | re.U)), + ('A2', (r'aa', re.I | re.U)), + ('E2', (r'ee', re.I | re.U)), + ('I2', (r'ii', re.I | re.U)), + ('O2', (r'oo', re.I | re.U)), + ('U2', (r'uu', re.I | re.U)), + ('Ɛ2', (r'ɛɛ', re.I | re.U)), + ('Ɔ2', (r'ɔɔ', re.I | re.U)), + ('ANY', (r'.', re.U)), + ] + + tok = funcparserlib.lexer.make_tokenizer(specs) + r = [x.value for x in tok(unicodedata.normalize('NFKC', word))] + + return r + + def multiply_list(amblist): + # given list of lists, returns list of all possible concatenations + # taking a single element from each list + def multiply_list_aux(l, amblist): + if len(amblist)>0: + m = [ l[k]+[amblist[0][i]] for k in range(len(l)) for i in range(len(amblist[0]))] + return multiply_list_aux(m, amblist[1:]) + else: + return l + return multiply_list_aux([[]], amblist) + + def convertg(grapheme): + # convert a single grapheme into a list of corresponding graphemes in new orthography + try: + # !!HACK: converts graphemes to lowercase!! 
+ return conversion_table[grapheme.lower()] + except KeyError: + return [grapheme] + + def convertw(word): + # given single word in old orthography returns + # list of all possible translations to new orthography + graphemes = [convertg(g) for g in graphemes_old(word)] + return [''.join(w) for w in multiply_list(graphemes)] + + return convertw(token) diff --git a/daba/wordparser.py b/daba/wordparser.py index dfa96df..77cf98f 100644 --- a/daba/wordparser.py +++ b/daba/wordparser.py @@ -2,16 +2,187 @@ import sys from daba.mparser import DictLoader, GrammarLoader, Processor from pprint import pprint +# colours in terminal : https://stackoverflow.com/a/2616912 +import re +import signal +#from Tkinter import Tk + +def handler(signum, frame): + print("use Shift+Ctrl+C to copy - just use (blank entry)/return to exit") + #Tk().clipboard_get() # copy anyway ? + +signal.signal(signal.SIGINT, handler) + +splitted=re.compile(r"́|̀|̌|̂|̧|̈") +mapping = { 'à':'à','á':'á', 'â':'â', 'é':'é', 'ê':'ê', 'è':'è', 'ë':'ë', 'ì':'ì', 'í':'í', 'î':'î', 'ï':'ï', 'ò':'ò', 'ó':'ó', 'ô':'ô', 'û':'û', 'ù':'ù', 'ú':'ú', 'ç':'ç'} +# tbc : more missing ? ... + +def tomonolith(mystring) : + if splitted.search(mystring) : + for k, v in mapping.items(): + mystring = mystring.replace(k, v) + return mystring + +def mmlist(mrphx): # can handle multiple level mm + mrphx=mrphx.replace("[","[ ") + mrphx=mrphx.replace("]"," ]") + mrphelem=mrphx.split(" ") + mmprefix="\\mm" + level=0 + mms="" + for elem in mrphelem: + if elem=="[": + level+=1 + mmprefix=mmprefix+"m" + elif elem=="]": + level-=1 + mmprefix=mmprefix[:-1] + else: + if ":" in elem: + mmlx,mmps,mmgloss=elem.split(":",2) + mms+=mmprefix+" "+mmlx+":"+mmps+":"+tomonolith(mmgloss)+"\n" + else: + mms+=mmprefix+" "+elem # ??? what happened ??? 
+ return mms + +#print('test mmlist: \n', mmlist('[mɔ̀ɔba:n:adulte [mɔ̀ɔ:n:homme ba:mrph:AUGM] kɔ̀dɔ:adj:vieux]')) +# to complement single level in glosslist from parser, +# load mmc glosses that exist in last build of the language dictionaries (export.sh): +# bamadaba-mmc.txt, malidaba-mmc.txt or jula-mmc.txt +# must be renamed as "mmc.txt" and placed in the current directory (where a gparser "run" subdir exists) +glossdict={} +has_mmc=False +try: + mmcfile=open("mmc.txt",'r',encoding="utf-8") + print("helper file mmc.txt found") + has_mmc=True +except: + print("no mmc helper file") +if has_mmc: + mmcall=mmcfile.read() + mmcfile.close() + mmclist=mmcall.split("\n") + for mmc in mmclist: + lx,ps,glmm=mmc.split(":",2) + if " " in glmm: + gl,mm=glmm.split(" ",1) + mmindex=lx+":"+ps+":"+gl + glossdict[mmindex]=mm + print("mmc available",len(glossdict)) + +def recursemm(y): + mrph="[" + for z in y.morphemes: + mpsfull="" + for z1 in z.ps: + mpsfull+=z1+"/" + if mpsfull!="" : mpsfull=mpsfull[:-1] + thismrph=z.form+":"+mpsfull+":"+tomonolith(z.gloss) + if thismrph!="-:mrph:-" : + if has_mmc: + if thismrph in glossdict: + thismrph=thismrph+" "+glossdict[thismrph] + mrph+=thismrph+" " + if z.morphemes: mrph+=recursemm(z) + mrph+="] " + return mrph + + +def glossprint(glosslist): + for x in glosslist: + #print("x:",x) + mrph="" + # checking for multilevel gloss: no found here 14/04/2024 + # fixed in formats1 30/09/2024 + # could simplify with mrph=str(glosslist) but I want the gloses in monolith + # print("glossprint: x.morphemes=", x.morphemes) + for y in x.morphemes : + mpsfull="" + for y1 in y.ps: + mpsfull+=y1+"/" + if mpsfull!="" : mpsfull=mpsfull[:-1] + thismrph=y.form+":"+mpsfull+":"+tomonolith(y.gloss) + if thismrph!="-:mrph:-" : + if has_mmc: + if thismrph in glossdict: + thismrph=thismrph+" "+glossdict[thismrph] + mrph+=thismrph+" " + if y.morphemes: mrph+=recursemm(y) + """ + if y.morphemes: + mrph+="[" + for z in y.morphemes: + mpsfull="" + for z1 in z.ps: + 
mpsfull+=z1+"/" + if mpsfull!="" : mpsfull=mpsfull[:-1] + thismrph=z.form+":"+mpsfull+":"+tomonolith(z.gloss) + if thismrph!="-:mrph:-" : + if has_mmc: + if thismrph in glossdict: + thismrph=thismrph+" "+glossdict[thismrph] + mrph+=thismrph+" " + mrph+="] " + """ + if mrph!="": + mrph=mrph.replace(' ]',']') + mrph=mrph[:-1] + mrphprint="" + if mrph!="": mrphprint="["+mrph+"]" + psfull="" + for y in x.ps : psfull+=y+"/" + if psfull!="": psfull=psfull[:-1] + print(x.form+":"+psfull+":"+tomonolith(x.gloss), mrphprint) + if mrph!="": + print(mmlist(mrph)) + # can only be single level : levels not returned in x.morphemes??? ;-( def main(): dl = DictLoader() gr = GrammarLoader() pp = Processor(dl, gr) while True: - word = input('Enter word:') - result = pp.parser.lemmatize(word, debug=True) - print('Final result::') - pprint(result) + word = input('\033[42;30;1mEnter word:\033[0m ') + if word=="" : sys.exit() + stage,glosslist = pp.parser.lemmatize(word, debug=True) + print('\033[1mFinal result:\033[0m') + print(stage,glosslist) + # gloss=glosslist[0] + # print("\ngloss.ps",gloss.ps, len(gloss.ps)) + # print("\ngloss.gloss",gloss.gloss) + glossprint(glosslist) + + gloss=glosslist[0] + if len(glosslist)==1: + #if (len(gloss.ps)==0 and gloss.gloss=="") or (gloss.ps[0]=="n.prop" and (gloss.gloss=="NOM" or gloss.gloss=="TOP")): + glossps="" + if len(gloss.ps)>0 : glossps=gloss.ps[0] + if (len(gloss.ps)==0 and gloss.gloss=="") or (glossps=="n.prop"): + if word[0].isupper(): + #word=word[0].lower()+word[1:] + word=word.lower() + stage2,glosslist2 = pp.parser.lemmatize(word) + print('\033[1mAlternative result lowercase:\033[0m') + #print(stage,glosslist) + glossprint(glosslist2) + else: + #word=word[0].upper()+word[1:] + word=word.capitalize() + stage2,glosslist2 = pp.parser.lemmatize(word) + print('\033[1mAlternative result uppercase:\033[0m') + #print(stage,glosslist) + glossprint(glosslist2) + + print ("\033[1mpotential global result\033[0m") # (from mparser - modified - 
check differences) + if len(glosslist2)>1 : + #or not ((len(glosslist2.ps)==0 and glosslist2.gloss=="") + #or (len(glosslist2.ps)==1 and glosslist2.ps[0]=="n.prop" and glosslist2.gloss=="NOM")): + if (len(gloss.ps)==0 and gloss.gloss=="") or (len(gloss.ps)==1 and gloss.ps[0]=="n.prop" and gloss.gloss=="NOM") : + glosslist=glosslist2 + else: + glosslist=glosslist+glosslist2 + glossprint(glosslist) + if __name__ == '__main__': main() diff --git a/docs/DABA-gdisamb-documentation.odt b/docs/DABA-gdisamb-documentation.odt new file mode 100644 index 0000000..937a048 Binary files /dev/null and b/docs/DABA-gdisamb-documentation.odt differ diff --git a/docs/DABA-gparser-documentation.odt b/docs/DABA-gparser-documentation.odt new file mode 100644 index 0000000..369b94d Binary files /dev/null and b/docs/DABA-gparser-documentation.odt differ diff --git a/docs/DABA-meta-documentation.odt b/docs/DABA-meta-documentation.odt new file mode 100644 index 0000000..d828fe1 Binary files /dev/null and b/docs/DABA-meta-documentation.odt differ diff --git a/docs/samples/bamana.gram.txt b/docs/samples/bamana.gram.txt index 7de06d1..66d1372 100644 --- a/docs/samples/bamana.gram.txt +++ b/docs/samples/bamana.gram.txt @@ -1,5 +1,5 @@ # Bamana (Bambara) morphotactic patterns - +# version 20.05.2019 - 21.05 07:35 - 22.11.2019 # macro definitions macro @smth-nasal@ .*([nmɲŋ]..?n?|n|[aoeuiɔɛ][́̀̌]?n<) macro @nasal-syl@ .*([nmɲŋ]..?n?|n) @@ -12,116 +12,391 @@ macro @syl@ [^aoeuiɔɛ]*(?P[aoeuiɔɛ])(?P=V)?(n(?=([^aoeuiɔɛ]|$)))? macro @syl1@ [^aoeuiɔɛ]*(?P[aoeuiɔɛ])(?P=V1)?(n(?=([^aoeuiɔɛ]|$)))? macro @syl2@ [^aoeuiɔɛ]*(?P[aoeuiɔɛ])(?P=V2)?(n(?=([^aoeuiɔɛ]|$)))? 
macro @bam-cons@ [bcdfghjklmɲŋprstyz] +macro @proper@ [A-ZƐƆƝŊ][a-zɛɔɲŋ́̀̌̂]+ +macro @worddigits@ [a-zA-ZɛɔɲŋƐƆƝŊ\-]{4,} # processing instructions plan for token: -stage 0 add parallel parse inflection -stage 0 add parallel parse common_derivation -stage 0 add parallel parse participles +stage 0 add parallel parse num1price +stage 0 add parallel parse inflection_PFVINTR_PROG_PL +stage 0 add parallel parse participles_PTCPRES +stage 0 add parallel parse n_v_derivation_ABSTR_GENT_LOC_AGPRM_LI stage 0 apply lookup return if parsed -stage 1 add sequential parse n_v_derivation +stage 1 add parallel parse common_derivation_DIM_AUGM +stage 1 add parallel parse prefixes +stage 1 add parallel parse participles stage 1 add parallel parse n_derivation stage 1 add parallel parse v_vq_derivation stage 1 add parallel parse vq_derivation +stage 1 add parallel parse specialcompo1 stage 1 add parallel parse num_derivation stage 1 apply lookup return if parsed -stage 2 add parallel parse reduplication +stage 2 add parallel parse rare_plural_PL2 +stage 2 add parallel parse inflection_MNT12_OPT2_DIR +stage 2 add parallel decompose pos_composition stage 2 apply lookup return if parsed -stage 3 add parallel decompose pos_composition +stage 3 add parallel parse reduplication +stage 3 add parallel parse specialcompo2 +stage 3 apply lookup +return if parsed +stage 4 add parallel decompose pos_composition3 +stage 4 apply lookup +return if parsed +stage 5 add parallel decompose pos_composition4 +return if parsed +stage 6 add parallel parse alphadigits +stage 6 apply lookup return if parsed -stage 4 apply firstmatch parse nonbamana +stage 7 apply firstmatch parse propername +stage 7 apply firstmatch parse nonbamana return if parsed #for sentence +section num1price +# -la/-na PRICE +pattern :num: [ {@nasal-v@|na}:: ] | :num: [ :num: :mrph:PRIX] +pattern :num: [ {@nonnasal-v@|la}:: ] | :num: [ :num: :mrph:PRIX] -section inflection -# verbal inflection -# -la/-na PROG -pattern :v: [ 
{@nasal-v@|na}:: ] | :v: [:v: :mrph:PROG] -pattern :v: [ {@nonnasal-v@|la}:: ] | :v: [:v: :mrph:PROG] -# moved up from v_vq_derivation because of na/la ambiguity -pattern :n: [ {@smth-nasal@|na}:: ] | :n: [ :v: :mrph:AG.PRM] -pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :v: :mrph:AG.PRM] + +section inflection_PFVINTR_PROG_PL +# verbal inflection (by order of frequency likelyhood) # -ra/-la/-na PFV.INTR pattern :v: [ {@nasal-syl@|n[a']}:: ] | :v: [:v: :mrph:PFV.INTR] pattern :v: [ {@glide-syl@|l[a']}:: ] | :v: [:v: :mrph:PFV.INTR] pattern :v: [ {@nonnasalglide-syl@|r[a']}:: ] | :v: [:v: :mrph:PFV.INTR] + +# -la/-na PROG +pattern :v: [ {@smth-nasal@|na}:: ] | :v: [:v: :mrph:PROG] +pattern :v: [ {@nonnasal-v@|la}:: ] | :v: [:v: :mrph:PROG] + # nominal inflection # -w PL pattern :n/adj/dtm/prn/ptcp/n.prop/num: [ {|w}:: ] | :n/adj/dtm/prn/ptcp/n.prop/num: [:n/adj/dtm/prn/ptcp/n.prop/num: :mrph:PL] -# participles -section participles -pattern :v/ptcp: [ {|bali}:: ] | :ptcp: [ :v: :mrph:PTCP.PRIV] -pattern :v/ptcp: [ {|ta}:: ] | :ptcp: [ :v: :mrph:PTCP.POT] -pattern :v/ptcp: [ {|tɔ}:: ] | :ptcp: [ :v: :mrph:CONV.PROG] +section inflection_MNT12_OPT2_DIR +# produce nouns but moved here because of na/la ambiguity + +# -la/-na MNT1 +pattern :n: [ {@smth-nasal@|na}:: ] | :n: [ :v: :mrph:MNT1] +pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :v: :mrph:MNT1] +# up from n_derivation (otherwise it's ignored for N) +pattern :n: [ {@smth-nasal@|na}:: ] | :n: [ :n: :mrph:MNT1] +pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :n: :mrph:MNT1] + +# -ra/-la/-na OPT2 +pattern :v: [ {@nasal-syl@|n[a']}:: ] | :v: [:v: :mrph:OPT2] +pattern :v: [ {@glide-syl@|l[a']}:: ] | :v: [:v: :mrph:OPT2] +pattern :v: [ {@nonnasalglide-syl@|r[a']}:: ] | :v: [:v: :mrph:OPT2] + +# -ma DIR +pattern :v: [ {|ma}:: ] | :v: [:v: :mrph:DIR] + +# v to nouns +pattern :n: [ {@smth-nasal@|nata}:: ] | :n: [ :v: :mrph:MNT2] +pattern :n: [ {@nonnasal-v@|lata}:: ] | :n: [ :v: :mrph:MNT2] +# n to nouns +pattern :n: [ 
{@smth-nasal@|nata}:: ] | :n: [ :n: :mrph:MNT2] +pattern :n: [ {@nonnasal-v@|lata}:: ] | :n: [ :n: :mrph:MNT2] + +# prefixes +section prefixes +pattern :v: [ {la|}:: ] | :v: [lá:mrph:CAUS :v:] +pattern :v: [ {na|}:: ] | :v: [ná:mrph:CAUS :v:] +pattern :v/ptcp: [ {la||len}:: ] | :ptcp: [ lá:mrph:CAUS :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {na||len}:: ] | :ptcp: [ ná:mrph:CAUS :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {la||nen}:: ] | :ptcp: [ lá:mrph:CAUS :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {na||nen}:: ] | :ptcp: [ ná:mrph:CAUS :v: :mrph:PTCP.RES] +# tonal +pattern :v: [ {lá|}:: ] | :v: [:mrph:CAUS :v:] +pattern :v: [ {ná|}:: ] | :v: [:mrph:CAUS :v:] +pattern :v/ptcp: [ {lá||len}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {ná||len}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {lá||nen}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {ná||nen}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES] + +pattern :v/ptcp: [ {la||ta}:: ] | :ptcp: [ lá:mrph:CAUS :v: :mrph:PTCP.POT ] +pattern :v/ptcp: [ {la||tɔ}:: ] | :ptcp: [ lá:mrph:CAUS :v: :mrph:CONV.PROG ] +pattern :v/ptcp: [ {la||bali}:: ] | :ptcp: [ lá:mrph:CAUS :v: :mrph:PTCP.PRIV ] +pattern :v/ptcp: [ {lá||ta}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.POT ] +pattern :v/ptcp: [ {lá||tɔ}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:CONV.PROG ] +pattern :v/ptcp: [ {lá||bali}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.PRIV ] + +pattern :v/ptcp: [ {na||ta}:: ] | :ptcp: [ ná:mrph:CAUS :v: :mrph:PTCP.POT ] +pattern :v/ptcp: [ {na||tɔ}:: ] | :ptcp: [ ná:mrph:CAUS :v: :mrph:CONV.PROG ] +pattern :v/ptcp: [ {na||bali}:: ] | :ptcp: [ ná:mrph:CAUS :v: :mrph:PTCP.PRIV ] +pattern :v/ptcp: [ {ná||ta}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.POT ] +pattern :v/ptcp: [ {ná||tɔ}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:CONV.PROG ] +pattern :v/ptcp: [ {ná||bali}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.PRIV ] + +pattern :v/n: [ {la||li}:: ] | :n: [ lá:mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ 
{la||ni}:: ] | :n: [ lá:mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ {lá||li}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ {lá||ni}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ {na||li}:: ] | :n: [ ná:mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ {na||ni}:: ] | :n: [ ná:mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ {ná||li}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ ] +pattern :v/n: [ {ná||ni}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ ] + +pattern :v: [ {ma|}:: ] | :v: [mà:mrph:SUPER :v:] +pattern :v: [ {mà|}:: ] | :v: [:mrph:SUPER :v:] +pattern :v/ptcp: [ {ma||len}:: ] | :ptcp: [ mà:mrph:SUPER :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {mà||len}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {ma||nen}:: ] | :ptcp: [ mà:mrph:SUPER :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {mà||nen}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.RES] + +pattern :v/ptcp: [ {ma||ta}:: ] | :ptcp: [ mà:mrph:SUPER :v: :mrph:PTCP.POT ] +pattern :v/ptcp: [ {ma||tɔ}:: ] | :ptcp: [ mà:mrph:SUPER :v: :mrph:CONV.PROG ] +pattern :v/ptcp: [ {ma||bali}:: ] | :ptcp: [ mà:mrph:SUPER :v: :mrph:PTCP.PRIV ] +pattern :v/ptcp: [ {mà||ta}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.POT ] +pattern :v/ptcp: [ {mà||tɔ}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:CONV.PROG ] +pattern :v/ptcp: [ {mà||bali}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.PRIV ] + +pattern :v/n: [ {ma||li}:: ] | :n: [ mà:mrph:SUPER :v: :mrph:NMLZ ] +pattern :v/n: [ {ma||ni}:: ] | :n: [ mà:mrph:SUPER :v: :mrph:NMLZ ] +pattern :v/n: [ {mà||li}:: ] | :n: [ :mrph:SUPER :v: :mrph:NMLZ ] +pattern :v/n: [ {mà||ni}:: ] | :n: [ :mrph:SUPER :v: :mrph:NMLZ ] + +pattern :v: [ {rɔ|}:: ] | :v: [rɔ́:mrph:IN :v:] +pattern :v: [ {sɔ|}:: ] | :v: [sɔ̀:mrph:EN :v:] +pattern :v: [ {rɔ́|}:: ] | :v: [:mrph:IN :v:] +pattern :v: [ {sɔ̀|}:: ] | :v: [:mrph:EN :v:] + +pattern :n: [{tɔgɔ|la|}:: ] | :n: [tɔ́gɔ:n:nom lá:pp:à :n:] + +section participles_PTCPRES pattern :v/ptcp: [ {@smth-nasal@|nen}:: ] | :ptcp: [ :v: :mrph:PTCP.RES] pattern :v/ptcp: [ 
{@nonnasal-v@|len}:: ] | :ptcp: [ :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {@smth-nasal@|'n}:: ] | :ptcp: [ :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {@nonnasal-v@|'n}:: ] | :ptcp: [ :v: :mrph:PTCP.RES] +# other participles +section participles +pattern :v/ptcp: [ {|bali}:: ] | :ptcp: [ :v: :mrph:PTCP.NEG] +pattern :v/ptcp: [ {|ta}:: ] | :ptcp: [ :v: :mrph:PTCP.POT] +pattern :v/ptcp: [ {|baga|tɔ}:: ] | :ptcp: [ :v: :mrph:AG.OCC :mrph:CONV] +pattern :v/ptcp: [ {|baa|tɔ}:: ] | :ptcp: [ :v: :mrph:AG.OCC :mrph:CONV] +pattern :v/ptcp: [ {|baga|tɔ}:: ] | :ptcp: [ :v: :mrph:AG.OCC :mrph:ST] +pattern :v/ptcp: [ {|baa|tɔ}:: ] | :ptcp: [ :v: :mrph:AG.OCC :mrph:ST] +pattern :v/ptcp: [ {|tɔ}:: ] | :ptcp: [ :v: :mrph:CONV] +pattern :v/ptcp: [ {|tɔ|la}:: ] | :ptcp: [ :v: :mrph:CONV :mrph:PROG] + +# moved up from n_derivation - otherwise never apply +pattern :adj/n: [ {|baga|tɔ}:: ] | :n: [ :n: :mrph:AG.OCC :mrph:ST] +pattern :adj/n: [ {|baa|tɔ}:: ] | :n: [ :n: :mrph:AG.OCC :mrph:ST] +pattern :adj/n: [ {|tɔ}:: ] | :n: [ :n: :mrph:ST] +pattern :adj/n: [ {|baga|tɔ}:: ] | :adj: [ :n: :mrph:AG.OCC :mrph:ST] +pattern :adj/n: [ {|baa|tɔ}:: ] | :adj: [ :n: :mrph:AG.OCC :mrph:ST] +pattern :adj/n: [ {|tɔ}:: ] | :adj: [ :n: :mrph:ST] +# re above : adj/n does not produce adj ? 
+ +# frequently AUGMented participles pattern :v/ptcp: [ {@smth-nasal@|nen|ba}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:AUGM] pattern :v/ptcp: [ {@nonnasal-v@|len|ba}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:AUGM] +pattern :v/ptcp: [ {@smth-nasal@|nen|nin}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:DIM] +pattern :v/ptcp: [ {@nonnasal-v@|len|nin}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:DIM] +# to handle -baliya ex: basigi.bali.ya +pattern :v/ptcp/n/adj: [ {|bali|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.NEG :mrph:ABSTR] +pattern :v/ptcp/n/adj: [ {|ta|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.POT :mrph:ABSTR] +pattern :v/ptcp/n/adj: [ {|tɔ|ya}:: ] | :n: [:v/ptcp: :mrph:CONV :mrph:ABSTR] +pattern :v/ptcp/n/adj: [ {@smth-nasal@|nen|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.RES :mrph:ABSTR] +pattern :v/ptcp/n/adj: [ {@nonnasal-v@|len|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.RES :mrph:ABSTR] # derivative forms we need to consider even if we have them in dictionary -section common_derivation +section common_derivation_DIM_AUGM pattern :ptcp/n/adj: [ {|nin}:: ] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:DIM] -pattern :n/adj/ptcp/v: [ {|ya}:: ] | :n: [:n/adj/ptcp/v: :mrph:ABSTR] pattern :ptcp/n/adj: [ {|ba}:: ] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:AUGM] -# to handle -baliya ex: basigi.bali.ya -pattern :v/ptcp/n/adj: [ {|bali|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.PRIV :mrph:ABSTR] -# common nominal/verbal derivation (locatives) -section n_v_derivation -pattern :n/n.prop: [ {|ka}:: ] | :n/n.prop: [:n/n.prop: :mrph:GENT] -pattern :n/n.prop: [ {@nasal-v@|na}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] -pattern :n/n.prop: [ {@nonnasal-v@|la}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] +# common nominal/verbal derivation (locatives) - rarer + +section n_v_derivation_ABSTR_GENT_LOC_AGPRM_LI +pattern :n/adj/ptcp/v: [ {|ya}:: ] | :n: [:n/adj/ptcp/v: :mrph:ABSTR] +pattern :n/adj/ptcp/v: [ {|ya}:: ] | :v: [:n/adj/ptcp/v: :mrph:ABSTR] +#pattern :n/n.prop: [ {|ka}:: ] | :n/n.prop: [:n/n.prop: :mrph:GENT] +#restricted to n.prop 14/3/17 
+pattern :n.prop: [ {|ka}:: ] | :n: [:n.prop: :mrph:GENT] +pattern :n.prop: [ {|la|ka}:: ] | :n: [:n.prop: :mrph:LOC :mrph:GENT] + + +# -la/-na LOC +pattern :n.prop: [ {@nonnasal-v@|la}:: ] | :n: [:n.prop: :mrph:LOC] +pattern :n.prop: [ {@smth-nasal@|na}:: ] | :n: [:n.prop: :mrph:LOC] +# CAUTION::: LOC la/na will supersede AG.PRM if placed after AG.PRM la/na which converts v to n !!! +# pattern :n: [ {@smth-nasal@|na}:: ] | :n: [:n: :mrph:LOC] +# pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [:n: :mrph:LOC] +# can the gloss TOP be forced ? | :n/n.prop:TOP [:n/n.prop: :mrph:LOC] +# Can the n.prop stay with its initial capital ? +# Can n be restricted to a list of body parts, ex: sin -> sin.na "sein LOC" - different from sin.na "diriger PFV.INTR" +# try: +#fails: pattern :n: [ {sin|na}:: ] | :n: [sín:n:sein :mrph:LOC] +#fails: pattern :n: [ {nun|na}:: ] | :n: [nún:n:nez :mrph:LOC] +#fails: pattern :n: [ {(nun|sin)|na}:: ] | :n: [:n: :mrph:LOC] +#the above also fails anyway: pattern :n: [ {@smth-nasal@|na}:: ] | :n: [:n: :mrph:LOC] + +# -la/-na AG.PRM +pattern :n: [ {@smth-nasal@|na}:: ] | :n: [ :v: :mrph:AG.PRM] +pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :v: :mrph:AG.PRM] +pattern :n: [ {|kɛ|la}:: ] | :n: [ :n: kɛ́:v:faire :mrph:AG.PRM] + +# moved up from v_vq_derivation - otherwise never apply for suli, sali +pattern :n: [ {@smth-nasal@|ni}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nonnasal-v@|li}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {|kɛ|li}:: ] | :n: [ :n: kɛ́:v:faire :mrph:NMLZ] # nominal derivation section n_derivation + +pattern :n: [ {@smth-nasal@|nama}:: ] | :adj: [ :n: :mrph:STAT] +pattern :n: [ {@nonnasal-v@|lama}:: ] | :adj: [ :n: :mrph:STAT] +pattern :n: [ {@smth-nasal@|naman}:: ] | :adj: [ :n: nama:mrph:STAT] +pattern :n: [ {@nonnasal-v@|laman}:: ] | :adj: [ :n: lama:mrph:STAT] pattern :n: [ {|ma}:: ] | :n: [ :n: 
:mrph:COM] -pattern :adj/n: [ {|ntan}:: ] | :adj/n: [ :n: :mrph:PRIV] -pattern :adj/n: [ {|bagatɔ}:: ] | :adj/n: [ :n: :mrph:ST] -pattern :adj/n: [ {|baatɔ}:: ] | :adj/n: [ :n: :mrph:ST] -pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ :prn:RECP :n: ] +pattern :n: [ {|ma}:: ] | :adj: [ :n: :mrph:COM] + +pattern :n: [ {|ma}:: ] | :n: [ :n: :mrph:RECP.PRN] + +pattern :adj/n: [ {|ntan}:: ] | :n: [ :n: :mrph:PRIV] +pattern :adj/n: [ {|ntan}:: ] | :adj: [ :n: :mrph:PRIV] + +pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ ɲɔ́gɔn:prn:RECP :n: ] pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :n: :prn:RECP] # verbal/vq derivation section v_vq_derivation -pattern :n: [ {@smth-nasal@|nan}:: ] | :n: [ :v: :mrph:INSTR] pattern :n: [ {@nonnasal-v@|lan}:: ] | :n: [ :v: :mrph:INSTR] -pattern :n: [ {@smth-nasal@|ni}:: ] | :n: [ :v: :mrph:NMLZ] -pattern :n: [ {@nonnasal-v@|li}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@smth-nasal@|nan}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {|kɛ|lan}:: ] | :n: [ :n: kɛ́:v:faire :mrph:INSTR] + +pattern :num: [ {|nan}:: ] | :adj: [ :num: :mrph:ORD] + pattern :n: [ {|baga}:: ] | :n: [ :v: :mrph:AG.OCC] pattern :n: [ {|baa}:: ] | :n: [ :v: :mrph:AG.OCC] + +pattern :n: [ {|kɛ|baga}:: ] | :n: [ :n: kɛ́:v:faire :mrph:AG.OCC] +pattern :n: [ {|kɛ|baa}:: ] | :n: [ :n: kɛ́:v:faire :mrph:AG.OCC] + +pattern :n: [ {@nonnasal-v@|li|baga}:: ] | :n: [ :v: :mrph:NMLZ :mrph:AG.OCC] +pattern :n: [ {@nonnasal-v@|li|baa}:: ] | :n: [ :v: :mrph:NMLZ :mrph:AG.OCC] +pattern :n: [ {@smth-nasal@|ni|baga}:: ] | :n: [ :v: :mrph:NMLZ :mrph:AG.OCC] +pattern :n: [ {@smth-nasal@|ni|baa}:: ] | :n: [ :v: :mrph:NMLZ :mrph:AG.OCC] + pattern :n: [ {|baga|nci}:: ] | :n: [ :v: :mrph:AG.OCC :mrph:AG.EX] pattern :n: [ {|baa|nci}:: ] | :n: [ :v: :mrph:AG.OCC :mrph:AG.EX] +pattern :n: [ {|ɲɔgɔn}:: ] | :n: [ :v: ɲɔ́gɔn:prn:RECP] +pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :v: :prn:RECP] +pattern :n: [ {ɲɔgɔn|}:: ] | :v: [ ɲɔ́gɔn:prn:RECP :v: ] + -# attempt to handle -likɛ, -likɛla, others like -liwari... 
+section rare_plural_PL2 +#frequent in griotic songs / not dtm/prn : creates ambiguity for ò.lu !!! found only dɔlu - but not dalilu +pattern :n: [ {@nonnasal-syl@|lu}:: ] | :n: [:n: lú:mrph:PL2] +pattern :n: [ {@nasal-syl@|nu}:: ] | :n: [:n: nú:mrph:PL2] + +section specialcompo1 +# attempt to handle -likɛ, -likɛla, others like -likɛcogo... pattern :v: [ {@nonnasal-v@|li|kɛ}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire] -pattern :n: [ {@nonnasal-v@|li|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] pattern :v: [ {@smth-nasal@|ni|kɛ}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire] -pattern :n: [ {@smth-nasal@|ni|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] -pattern :n: [ {@nonnasal-v@|li|wari}:: ] | :n: [ :v: :mrph:NMLZ :n:] -pattern :n: [ {@smth-nasal@|ni|wari}:: ] | :n: [ :v: :mrph:NMLZ :n:] +pattern :n: [ {@nonnasal-v@|li|yɔrɔ}:: ] | :n: [ :v: :mrph:NMLZ :n:] +pattern :n: [ {@smth-nasal@|ni|yɔrɔ}:: ] | :n: [ :v: :mrph:NMLZ :n:] +pattern :n: [ {@nonnasal-v@|li|cogo}:: ] | :n: [ :v: :mrph:NMLZ :n:] +pattern :n: [ {@smth-nasal@|ni|cogo}:: ] | :n: [ :v: :mrph:NMLZ :n:] pattern :n: [ {@nonnasal-v@|li|fɛn}:: ] | :n: [ :v: :mrph:NMLZ fɛ́n:n:chose] pattern :n: [ {@smth-nasal@|ni|fɛn}:: ] | :n: [ :v: :mrph:NMLZ fɛ́n:n:chose] pattern :n: [ {@nonnasal-v@|li|ko}:: ] | :n: [ :v: :mrph:NMLZ kó:n:affaire] pattern :n: [ {@smth-nasal@|ni|ko}:: ] | :n: [ :v: :mrph:NMLZ kó:n:affaire] +pattern :n: [ {@nonnasal-v@|li|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] +pattern :n: [ {@smth-nasal@|ni|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] +pattern :n: [ {@nonnasal-v@|li|kɛ|baga}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.OCC] +pattern :n: [ {@smth-nasal@|ni|kɛ|baga}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.OCC] +pattern :n: [ {@nonnasal-v@|li|kɛ|baa}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.OCC] +pattern :n: [ {@smth-nasal@|ni|kɛ|baa}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.OCC] +pattern :n: [ 
{@nonnasal-v@|li|kɛ|cogo}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :n:] +pattern :n: [ {@smth-nasal@|ni|kɛ|cogo}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :n:] +pattern :n: [ {@nonnasal-v@|li|kɛ|yɔrɔ}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :n:] +pattern :n: [ {@smth-nasal@|ni|kɛ|yɔrɔ}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :n:] +pattern :n: [ {@nonnasal-v@|li|kɛ|fɛn}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire fɛ́n:n:chose] +pattern :n: [ {@smth-nasal@|ni|kɛ|fɛn}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire fɛ́n:n:chose] -# need to handle -ba AUGM inside ex: ko.jugu.ba.kɛ.la +pattern :n: [ {|tuma}:: ] | :n: [ :v: tùma:n:moment] +pattern :n: [ {|kɛ|tuma}:: ] | :n: [ :n: kɛ́:v:faire tùma:n:moment] +pattern :n: [ {|waati}:: ] | :n: [ :v: wáati:n:moment] +pattern :n: [ {|kɛ|waati}:: ] | :n: [ :n: kɛ́:v:faire wáati:n:moment] +pattern :n: [ {|yɔrɔ}:: ] | :n: [ :v: yɔ́rɔ:n:lieu] +pattern :n: [ {|kɛ|yɔrɔ}:: ] | :n: [ :n: kɛ́:v:faire yɔ́rɔ:n:lieu] +pattern :n: [ {|baara}:: ] | :n: [ :v: báara:n:travail] +pattern :n: [ {|kɛ|baara}:: ] | :n: [ :n: kɛ́:v:faire báara:n:travail] +pattern :n: [ {|cogo}:: ] | :n: [ :v: cógo:n:manière] +pattern :n: [ {|kɛ|cogo}:: ] | :n: [ :n: kɛ́:v:faire cógo:n:manière] +pattern :n: [ {|kun}:: ] | :n: [ :v: kùn:n:tête] +pattern :n: [ {|don}:: ] | :n: [ :v: dón:n:jour] +pattern :n: [ {|fɛn}:: ] | :n: [ :v: fɛ́n:n:chose] +pattern :n: [ {|sira}:: ] | :n: [ :v: síra:n:chemin] + +pattern :n: [ {|jamana}:: ] | :n: [ :n: jàmana:n:pays] +pattern :n.prop: [ {|jamana}:: ] | :n.prop: [ :n.prop: jàmana:n:pays] +pattern :n: [ {|tigi}:: ] | :n: [ :n: tìgi:n:maître] +pattern :n: [ {|ko}:: ] | :n: [ :n: kó:n:affaire] +pattern :n: [ {|cɛ}:: ] | :n: [ :n: cɛ̀:n:mâle] +pattern :n: [ {|mɔgɔ}:: ] | :n: [ :n: mɔ̀gɔ:n:homme] +pattern :n: [ {|maa}:: ] | :n: [ :n: màa:n:homme] +pattern :n: [ {|den}:: ] | :n: [ :n: dén:n:enfant] +pattern :n: [ {|sira}:: ] | :n: [ :n: síra:n:chemin] +pattern :n: [ {@nonnasal-v@|li|sira}:: ] | :n: [ :v: li:mrph:NMLZ síra:n:chemin] +pattern :n: [ 
{@smth-nasal@|ni|sira}:: ] | :n: [ :v: ni:mrph:NMLZ síra:n:chemin] +pattern :n: [ {|fɛn}:: ] | :n: [ :n: fɛ́n:n:chose] +pattern :n: [ {|baara}:: ] | :n: [ :n: báara:n:travail] +pattern :n: [ {|bana}:: ] | :n: [ :n: bàna:n:maladie] +pattern :n: [ {@nonnasal-v@|la|bana}:: ] | :n: [ :n: lá:pp:à bàna:n:maladie] +pattern :n: [ {@smth-nasal@|na|bana}:: ] | :n: [ :n: ná:pp:à bàna:n:maladie] +pattern :n: [ {|wari}:: ] | :n: [ :n: wári:n:argent] +pattern :n: [ {|kɛ|wari}:: ] | :n: [ :n: kɛ́:v:faire wári:n:argent] +pattern :n: [ {|kɛ|tɔn}:: ] | :n: [ :n: kɛ́:v:faire tɔ́n:n:société] +pattern :n: [ {|kɛ|so}:: ] | :n: [ :n: kɛ́:v:faire só:n:maison] + +pattern :n: [ {|muso}:: ] | :n: [ :n: mùso:adj:féminin] + +pattern :n: [ {|kɛ}:: ] | :n: [ :n: kɛ:adj:mâle] +pattern :n: [ {|kɛ}:: ] | :n/v: [ :n: kɛ́:v:faire] + +# dɔn.kɛ.tɔ not only ST +pattern :ptcp: [ {|kɛ|tɔ}:: ] | :ptcp: [ :n: kɛ́:v:faire :mrph:CONV] +pattern :ptcp: [ {|kɛ|tɔ|la}:: ] | :ptcp: [ :n: kɛ́:v:faire :mrph:CONV :mrph:PROG] +pattern :ptcp: [ {|kɛ|len}:: ] | :ptcp: [ :n: kɛ́:v:faire :mrph:PTCP.RES] + +section specialcompo2 + +# this is a temporary trick +# I would not advise to return before other "normal" composition +# but can't do otherwise. Caution with most... 
+ +# 4 lines commented out after Kirill's parser update 7/3/2017 +#pattern :n: [ {@nonnasal-v@|li|.+}:: ] | :n: [ :v: :mrph:NMLZ :n:] +#pattern :n: [ {@smth-nasal-v@|ni|.+}:: ] | :n: [ :v: :mrph:NMLZ :n:] +#pattern :n: [ {@nonnasal-v@|li|.+}:: ] | :v: [ :v: :mrph:NMLZ :v:] +#pattern :n: [ {@smth-nasal-v@|ni|.+}:: ] | :v: [ :v: :mrph:NMLZ :v:] + +pattern :n: [ {@nonnasal-v@|li|kɛ|.+}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :n:] +pattern :n: [ {@smth-nasal-v@|ni|kɛ|.+}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :n:] +pattern :n: [ {@nonnasal-v@|li|kɛ|.+}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire :v:] +pattern :n: [ {@smth-nasal-v@|ni|kɛ|.+}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire :v:] + +pattern :n: [ {|ɲɔgɔn|.+}:: ] | :n: [ :v: ɲɔ́gɔn:prn:RECP :n:] +pattern :n: [ {|ɲɔgɔn|.+}:: ] | :v: [ :v: ɲɔ́gɔn:prn:RECP :v:] + +# composés nominaux avec yɛrɛ comme yɛrɛ.dɔn yɛrɛ.labila +pattern :n: [ {yɛrɛ|}:: ] | :n: [ yɛ̀rɛ̂:dtm:même :v: ] +# ou comme yɛrɛ.faga.su +pattern :n: [ {yɛrɛ|}:: ] | :n: [ yɛ̀rɛ̂:dtm:même :v: :n: ] +# composés verbaux aussi +pattern :v: [ {yɛrɛ|}:: ] | :v: [ yɛ̀rɛ̂:dtm:même :v: ] -pattern :n: [ {|ɲɔgɔn}:: ] | :n: [ :v: :prn:RECP] -pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :v: :prn:RECP] -pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ :prn:RECP :v: ] # vq derivation section vq_derivation @@ -130,72 +405,235 @@ pattern :adj: [ {|man}:: ] | :adj: [ :vq: :mrph:ADJ] # numeral derivation section num_derivation -pattern :num: [ {@nasal-v@|na}:: ] | :num: [ :num: :mrph:PRICE] -pattern :num: [ {@nonnasal-v@|la}:: ] | :num: [ :num: :mrph:PRICE] -pattern :num: [ {@nasal-v@|nan}:: ] | :num: [ :num: :mrph:ORD] -pattern :num: [ {[0-9]+|nan}:: ] | :num: [ :num: :mrph:ORD] -pattern :num: [ {@nonnasal-v@|lan}:: ] | :num: [ :num: :mrph:ORD] +# moved up to inflection pattern :num: [ {@nasal-v@|na}:: ] | :num: [ :num: :mrph:PRIX] +# pattern :num: [ {@nonnasal-v@|la}:: ] | :num: [ :num: :mrph:PRIX] +# moved up to v/vq derivation pattern :num: [ {|nan}:: ] | :adj: [ :num: :mrph:ORD] +# new fails 
pattern :num: [ {[0-9]+|nan}:: ] | :adj:ORDINAL [ :num:CARDINAL :mrph:ORD] +# CARDINAL => index out of range error - not in the list of allowed keywords +pattern :num: [ {[0-9]+|nan}:: ] | :adj:ORDINAL [ :num: :mrph:ORD] +# old : pattern :num: [ {[0-9]+|nan}:: ] | :num: [ :num: :mrph:ORD] + -## reduplication +# Jɛkabaara uses -n instead of -nan +pattern :num: [ {[0-9]+|n}:: ] | :adj:ORDINAL [ :num: :mrph:ORD] +# not sure capitals will work here : NO, DANGER : NAN in capitals is checked as keyword, and it does not exist +# pattern :num: [ {[0-9]+|NAN}:: ] | :adj:ORDINAL [ :num: :mrph:ORD] + +## reduplication THIS DOES NOT WORK ????? section reduplication +# added n : yaalayaala +pattern :n: [ {(?P.+)|(?P=stem)}:: ] | :n: [ :n: :n: ] pattern :v: [ {(?P.+)|(?P=stem)}:: ] | :v: [ :v: :v: ] +pattern :v/ptcp: [ {(?P.+)|(?P=stem)|len}:: ] | :ptcp: [ :v: :v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {(?P.+)|(?P=stem)|nen}:: ] | :ptcp: [ :v: :v: :mrph:PTCP.RES] +pattern :v/n: [ {(?P.+)|(?P=stem)|li}:: ] | :n: [ :v: :v: :mrph:NMLZ] +pattern :v/n: [ {(?P.+)|(?P=stem)|ni}:: ] | :n: [ :v: :v: :mrph:NMLZ] + pattern :adj: [ {(?P.+)|(?P=stem)}:: ] | :adj: [ :adj: :adj: ] +pattern :adv: [ {(?P.+)|(?P=stem)}:: ] | :adv: [ :adv: :adv: ] pattern :num: [ {(?P.+)|(?P=stem)}:: ] | :num: [ :num: :num: ] -pattern :v: [ {(?P.+)|-|(?P=stem)}:: ] | :v: [ :v: :: :v: ] -pattern :adj: [ {(?P.+)|-|(?P=stem)}:: ] | :adj: [ :adj: :: :adj: ] -pattern :num: [ {(?P.+)|-|(?P=stem)}:: ] | :num: [ :num: :: :num: ] +pattern :num/adj: [ {(?P.+)|(?P=stem)|nan}:: ] | :adj: [ :num: :num: :mrph:ORD] + +# MIDDLE HYPHEN dulon-dulon kari-kari,... +# but : kelen-kelenin does not work (syntax? hyphen?) +# same with hyphens to be dropped +# middle hyphen is preferably verbs, how to prioritize ? +# added n : yaala-yaala - how to force :v1:= :v2: ? 
+# COMMENTED OUT 26/03/2024 - often breaks system eg cawu-cawu +# "ntgloss.py", line 85, in morphmatch / if len(self.morphemes) < len(other.morphemes): /TypeError: object of type 'filter' has no len()" +# pattern :v: [ {(?P.+)|-|(?P=stem)}:: ] | :v: [ :v: :: :v: ] +# pattern :n: [ {(?P.+)|-|(?P=stem)}:: ] | :n: [ :n: :: :n: ] +# pattern :adj: [ {(?P.+)|-|(?P=stem)}:: ] | :adj: [ :adj: :: :adj: ] +# pattern :adv: [ {(?P.+)|-|(?P=stem)}:: ] | :adv: [ :adv: :: :adv: ] +# pattern :num: [ {(?P.+)|-|(?P=stem)}:: ] | :num: [ :num: :: :num: ] +#pattern :n.prop: [ {(?P.+)|-|(?P=stem)}:: ] | :n.prop: [ :n.prop: :: :n.prop: ] + +# hyphen inside, E.G. road Yanfolila-Kalana +# pattern :n.prop: [ {(.+)|-|(.+)}:: ] | :n.prop: [ :n.prop: :: :n.prop: ] + + +# added triplicates pattern :v: [ {(?P.+)|(?P=stem)|(?P=stem)}:: ] | :v: [ :v: :v: :v: ] pattern :adj: [ {(?P.+)|(?P=stem)|(?P=stem)}:: ] | :adj: [ :adj: :adj: :adj: ] +pattern :adv: [ {(?P.+)|(?P=stem)|(?P=stem)}:: ] | :adv: [ :adv: :adv: :adv: ] +#pattern :v: [ {(?P.+)|-|(?P=stem)|-|(?P=stem)}:: ] | :v: [ :v: :: :v: :: :v: ] +#pattern :adj: [ {(?P.+)|-|(?P=stem)|-|(?P=stem)}:: ] | :adj: [ :adj: :: :adj: :: :adj: ] +#pattern :adv: [ {(?P.+)|-|(?P=stem)|-|(?P=stem)}:: ] | :adv: [ :adv: :: :adv: :: :adv: ] + ## composition # general part-of-speech composition patterns # + section pos_composition +# special case of middle pp or middle pm +# V kà V = N with kà:pm:INF +# N ká N = N with ká:pp:POSS # was conj previously +# ta-ka-nɛnɛ +pattern :n: [ :v: ka:pm: :v: ] | :n: [ :v: kà:pm:INF :v: ] +# jugu-ma-dogo - COMMENTED OUT - TOO MANY STRANGE RESULTS +# pattern :n: [ :n: ma:pm: :v: ] | :n: [ :n: má:pm:PFV.NEG :v: ] +# Ala-ka-bon +pattern :n: [ :n: ka:pm: :vq: ] | :n: [ :n: ká:pm:QUAL.AFF :vq: ] +# Ɲani-man-jugu +pattern :n: [ :n: man:pm: :vq: ] | :n: [ :n: mán:pm:QUAL.NEG :vq: ] +# cɛ-ka-bon +pattern :n: [ :n: ka:pp: :n: ] | :n: [ :n: ká:pp:POSS :n: ] +# cɛ-la-taa +pattern :n: [ :n: la:pp: :n: ] | :n: [ :n: lá:pp:à :n: ] +pattern :n: 
[ :n: na:pp: :n: ] | :n: [ :n: ná:pp:à :n: ] +# ni-kan-dugu +pattern :n: [ :n: kan:pp: :n: ] | :n: [ :n: kàn:pp:sur :n: ] +# avec fɛ +pattern :n: [ :n: fɛ:pp: :n: ] | :n: [ :n: fɛ̀:pp:par :n: ] +# avec kɔ́nɔ : du-kɔnɔ-ko +pattern :n: [ :n: kɔnɔ:pp: :n: ] | :n: [ :n: kɔ́nɔ:pp:dans :n: ] +# avec ni:conj:et sónnifàna ou són-ni-fàna +pattern :n: [ :n: ni:conj: :n: ] | :n: [ :n: :conj:et :n: ] +pattern :n: [ :n: ní:conj: :n: ] | :n: [ :n: ni:conj:et :n: ] + # two-words composites + # n.prop + n = n ex: Irisi.jamana -pattern :n: [ :n.prop: :n: ] | :n: [ :n.prop: :n: ] -# v/n + n = n +pattern :n.prop: [ :n.prop: :n: ] | :n.prop: [ :n.prop: :n: ] +# and reverse ex Ba.Joliba, Ba.Borama # THIS SHOULD BE RESTRICTED TO ba:n:, fa:n: ... +# pattern :n: [ :n: :n.prop:] | :n: [ :n: :n.prop: ] +pattern :n.prop: [ :n: :n.prop:] | :n.prop: [ :n: :n.prop: ] +# nprop + adj = nprop ex: Sumayila.bilen, Adama.ncinin +pattern :n.prop: [ :n.prop: :vq/adj: ] | :n.prop: [ :n.prop: :vq/adj: ] +# GENT 12032024 +pattern :n.prop: [ :n.prop: ka:mrph: ] | :n.prop: [ :n.prop: :mrph:GENT ] + +# n + n = n ex: fari.kolo +pattern :n: [ :n: :n: ] | :n: [ :n: :n: ] +# v + n = n ex: sinsin.bere +pattern :n: [ :v: :n: ] | :n: [ :v: :n: ] +# v/n + n = n ex: sinsin.bere (?) pattern :n: [ :n/v: :n: ] | :n: [ :n/v: :n: ] -# n + v = n/v +# n + v = n/v ex: juru.don (v) dan.sigi (v) kɔ.segin (v) pattern :n/v: [ :n: :v: ] | :n/v: [ :n: :v: ] -# n + adj/num = n -pattern :n: [ :n: :adj/num: ] | :n: [ :n: :adj/num: ] -# dtm + v = n +# n + n/adj/num/ptcp = n ex: bugu.jɛ da.fila soso.muso +pattern :n: [ :n: :n/adj/num/ptcp: ] | :n: [ :n: :n/adj/num/ptcp: ] +# dtm + v = n ex: yɛrɛ.dulon (see other yɛrɛ-) pattern :n: [ :dtm: :v: ] | :n: [ :dtm: :v: ] +# num + v = n ex: kelen.sigi +pattern :n: [ :num: :v: ] | :n: [ :num: :v: ] # pp + n = n ex. 
kɔkan.maliden, kɔnɔ.mɔgɔ pattern :n: [ :pp: :n: ] | :n: [ :pp: :n: ] -# three-words composites -# n + adj/pp/num + n = n -> added num : san.duuru.baara -pattern :n: [ :n: :adj/pp/num: :n: ] | :n: [ :n: :adj/pp/num: :n: ] -# dtm/n + v + n = n -pattern :n: [ :dtm/n: :v: :n: ] | :n: [ :dtm/n: :v: :n: ] -# n + pp + v = n/v -pattern :n/v: [ :n: :pp: :v: ] | :n/v: [ :n: :pp: :v: ] -# n+n+n = n yiriwali.nafolo.ko ? -pattern :n: [ :n: :n: :n: ] | :n: [ :n: :n: :n: ] -# ??? : mrph not taken into acount (not in dic?) +#section pos_composition3mrph +# # v+mrph+n ex: gansi.li.walan pattern :n: [ :v: li:mrph: :n: ] | :n: [ :v: :mrph:NMLZ :n: ] -pattern :n: [ :n: ba:mrph: :n: ] | :n: [ :n: :mrph:AUGM :n: ] +pattern :n: [ :v: ni:mrph: :n: ] | :n: [ :v: :mrph:NMLZ :n: ] # v+mrph+v ex: kɔlɔsi.li.kɛ pattern :v: [ :v: li:mrph: :v: ] | :v: [ :v: :mrph:NMLZ :v: ] -pattern :n: [ :v: ka:pm: :v: ] | :n: [ :v: kà:pm:INF :v: ] +pattern :v: [ :v: ni:mrph: :v: ] | :v: [ :v: :mrph:NMLZ :v: ] + +# n + mrph + n = n : wulu.ba.kunkolo wulu.nin.sen fura.lan.cogo +pattern :n: [ :n: ba:mrph: :n: ] | :n: [ :n: :mrph:AUGM :n: ] +pattern :n: [ :n: nin:mrph: :n: ] | :n: [ :n: :mrph:DIM :n: ] + +# v + mrph + n = n : gansi.baa.wari +pattern :n: [ :v: lan:mrph: :n: ] | :n: [ :v: :mrph:INSTR :n: ] +pattern :n: [ :v: nan:mrph: :n: ] | :n: [ :v: :mrph:INSTR :n: ] +pattern :n: [ :v: baa:mrph: :n: ] | :n: [ :v: :mrph:AG.OCC :n: ] +pattern :n: [ :v: baga:mrph: :n: ] | :n: [ :v: :mrph:AG.OCC :n: ] + +# n + mrph + v = v : wulu.ba.kunkolo wulu.nin.sen fura.lan.cogo +pattern :n: [ :n: ba:mrph: :v: ] | :v: [ :n: :mrph:AUGM :v: ] +pattern :n: [ :n: nin:mrph: :v: ] | :v: [ :n: :mrph:DIM :v: ] + +# v + mrph + v = v : gansi.baa.wari +pattern :v: [ :v: lan:mrph: :v: ] | :v: [ :v: :mrph:INSTR :v: ] +pattern :v: [ :v: nan:mrph: :v: ] | :v: [ :v: :mrph:INSTR :v: ] +pattern :v: [ :v: baa:mrph: :v: ] | :v: [ :v: :mrph:AG.OCC :v: ] +pattern :v: [ :v: baga:mrph: :v: ] | :v: [ :v: :mrph:AG.OCC :v: ] + +# can we try for la/na 
AG.PRM ? +pattern :n: [ :v: la:mrph: :n: ] | :n: [ :v: :mrph:AG.PRM :n: ] +pattern :v: [ :v: la:mrph: :v: ] | :v: [ :v: :mrph:AG.PRM :v: ] +pattern :n: [ :v: na:mrph: :n: ] | :n: [ :v: :mrph:AG.PRM :n: ] +pattern :v: [ :v: na:mrph: :v: ] | :v: [ :v: :mrph:AG.PRM :v: ] +# with ma:COM +pattern :n: [ :n: ma:mrph: :n: ] | :n: [ :n: :mrph:COM :n: ] +# nin et ba avec adj final : sɔ̀gɔsɔgɔ.nin.gbɛ +pattern :n: [ :n: nin:mrph: :adj: ] | :n: [ :n: :mrph:DIM :adj: ] +pattern :n: [ :n: nin:mrph: :n/adj: ] | :n: [ :n: :mrph:DIM :n/adj: ] +pattern :n: [ :n: nin:mrph: :vq/adj: ] | :n: [ :n: :mrph:DIM :vq/adj: ] +pattern :n: [ :n: ba:mrph: :adj: ] | :n: [ :n: :mrph:AUGM :adj: ] +pattern :n: [ :n: ba:mrph: :n/adj: ] | :n: [ :n: :mrph:AUGM :n/adj: ] +pattern :n: [ :n: ba:mrph: :vq/adj: ] | :n: [ :n: :mrph:AUGM :vq/adj: ] + +# test mrph as last ? for n.prop+adj+ba ex: Sumayila.kɔrɔ.ba DOES NOT WORK MAY 2017 +#pattern :n.prop: [ :n.prop: :vq/adj: ba:mrph: ] | :n.prop: [ :n.prop: :vq/adj: :mrph:AUGM ] +#pattern :n.prop: [ :n.prop: :vq/adj: nin:mrph: ] | :n.prop: [ :n.prop: :vq/adj: :mrph:DIM ] + +section pos_composition3 + +# three-words composites +# n + adj/pp/num + n = n ex: bin.kɛnɛ.sa -> added num : san.duuru.baara +pattern :n: [ :n: :adj/pp/num: :n: ] | :n: [ :n: :adj/pp/num: :n: ] +# n + adj + v = n ex: fini.ɲuman.don +pattern :n: [ :n: :adj: :v: ] | :n: [ :n: :adj: :v: ] +# dtm/n + v + n = n ex : yɛrɛ.faga.su bo.dun.duga +pattern :n: [ :dtm/n: :v: :n: ] | :n: [ :dtm/n: :v: :n: ] +# n + pp + v = n/v ex: sen.na.bɔ (v) da.ma.da (v) nɔ.na.bila (v) bana.kɔ.taa (n) +pattern :n/v: [ :n: :pp: :v: ] | :n/v: [ :n: :pp: :v: ] +# n + pp + n = n ex: bolo.kɔnɔ.mɔgɔ tu.kɔnɔ.jamana +pattern :n: [ :n: :pp: :n: ] | :n: [ :n: :pp: :n: ] +# n + n + n = n yiriwali.nafolo.ko ? 
+pattern :n: [ :n: :n: :n: ] | :n: [ :n: :n: :n: ] +# n + n + adj = n wari.tigi.kɔrɔ misi.nɔnɔ.kɛnɛ +pattern :n: [ :n: :n: :vq/adj: ] | :n: [ :n: :n: :vq/adj: ] + +# conglomérés +# v + n/pers/prn + pp = n ex: bɔ.n.kɔnɔ jɔn.mɔgɔ.la +pattern :n: [ :v: :n/pers/prn: :pp: ] | :n: [ :v: :n/pers/prn: :pp: ] + +section pos_composition4 # four-words composites +# lots of possibilities! # n + pp + n + n = n pattern :n: [ :n: :pp: :n: :n: ] | :n: [ :n: :pp: :n: :n: ] # n + pp + adj + n = n Ex: dugu.jukɔrɔ.nafoloma.fɛn pattern :n: [ :n: :pp: :adj: :n: ] | :n: [ :n: :pp: :adj: :n: ] # n+v+v+n : kunnafoni.falen.falen.ko pattern :n: [ :n: :v: :v: :n: ] | :n: [ :n: :v: :v: :n: ] +# n+v+n+n ex: balo.sɔrɔ.yɔrɔ.ko +pattern :n: [ :n: :v: :n: :n: ] | :n: [ :n: :v: :n: :n: ] +# add n n n n and with num ??? +section propername +pattern @proper@:: | :n.prop:NOM # orthographically unlikely to be bamana words # for NEW orthography, with no tones marked + +section alphadigits +# still may be bamana : some forms ending with digits +pattern :num: [ {dɔrɔmɛ|[0-9]+}:: ] | :num: [ dɔ́rɔmɛ:n:cinq.francs.CFA :num:CARDINAL] +pattern :num: [ {d|[0-9]+}:: ] | :num: [ dɔ́rɔmɛ:n:cinq.francs.CFA :num:CARDINAL] +pattern :num: [ {m|[0-9]+}:: ] | :num: [ mɛ́tɛrɛ:n:mètre :num:CARDINAL] +pattern :num: [ {w|[0-9]+}:: ] | :num: [ wáati:n:moment :num:CARDINAL] +# waati au sens d'heure : w5 s30= waati 5 sanga 30 (sanga : minute) +pattern :num: [ {kg|[0-9]+}:: ] | :num: [ kílo:n:kilogramme :num:CARDINAL] +pattern :num: [ {km|[0-9]+}:: ] | :num: [ kílomɛtɛrɛ:n:kilomètre :num:CARDINAL] +# note : les occurrences de km kɛmɛmɛtɛrɛ (hectomètre) sont plutôt écrites séparées que collées. 
+pattern :n: [ {COP|[0-9]+}:: ] | :n.prop: [ COP:n:prop :num:CARDINAL] +pattern :n: [ {CAN|[0-9]+}:: ] | :n.prop: [ CAN:n.prop:ABR :num:CARDINAL] +pattern :n: [ {san|[0-9]+}:: ] | :n: [ sàn:n:année :num:CARDINAL] +# note: peut-être soit une date: san 2025, soit un nombre d'années: san 5 + +# some rare occurrences with maa/mɔgɔ, tile, san, siɲɛ... +pattern :n/n.prop: [ {@worddigits@|[0-9]+}:: ] | :n: [ :n/n.prop: :num:CARDINAL] + +# other forms may introduce footnotes and should be split (can't do that in bamana.gram), EG Ala²=Ala2 -> Ala (2) + section nonbamana # forms ending with nonfinal consonants pattern .+@bam-cons@:: | ::EMPR diff --git a/docs/samples/maninka.nko.gram.txt b/docs/samples/maninka.nko.gram.txt index a537cb1..517ad88 100644 --- a/docs/samples/maninka.nko.gram.txt +++ b/docs/samples/maninka.nko.gram.txt @@ -1,17 +1,19 @@ # Maninka morphotactic patterns # macro definitions -macro @nasal-v@ .*[aoeuiɔɛ]n -macro @nonnasal-v@ .*[aoeuiɔɛ][^n]? +macro @nasal-v@ .*[aoeuiɔɛ][́̀̌]?n +macro @nonnasal-v@ .*[aoeuiɔɛ][́̀̌]?[^n]? 
# processing instructions plan for token: stage 0 add sequential parse common_inflection +# stage 0 add sequential parse dtmprn_nko_inflection # NO RESULT : mɛ́n`:dtm/prn: stage 0 add parallel parse v_inflection stage 0 apply lookup return if parsed stage 1 add sequential parse common_derivation +return if parsed stage 1 add parallel parse v_derivation stage 1 add parallel parse v_prefixes stage 1 add parallel parse n_derivation @@ -29,116 +31,290 @@ return if parsed # nominal/common inflection section common_inflection # -` ART -pattern :n/adj/ptcp/n.prop: [ {|`}:: ] | :n/adj/ptcp/n.prop: [:n/adj/ptcp/n.prop: :mrph:ART] +#pattern :n/adj/ptcp/n.prop/dtm/prn: [ {|`}:: ] | :n/adj/ptcp/n.prop/dtm/prn: [:n/adj/ptcp/n.prop/dtm/prn: :mrph:ART] +#pattern :n/adj/ptcp/dtm/prn/v/pers/prt/num/n.prop: [ {|`}:: ] | :n/adj/ptcp/dtm/prn/v/pers/prt/num/n.prop: [:n/adj/ptcp/dtm/prn/v/pers/prt/num/n.prop: :mrph:ART] # soupçons sur n.prop: -> testé sans=rien de probant - mis en dernier. +pattern :n/adj/ptcp/v/pers/prt/num/n.prop: [ {|`}:: ] | :n/adj/ptcp/v/pers/prt/num/n.prop: [:n/adj/ptcp/v/pers/prt/num/n.prop: :mrph:ART] +# soupçons sur n.prop: -> testé sans=rien de probant - mis en dernier. 4/6/2024 enlevé dtm/prn: vt dans malidaba ? 
rien lookup ne trouve que les v pour mɛ́n` - ajouté 'vt' dans variantfields (formats/DictReader) +# ART v -> n : nominalisation fréquente des verbes +pattern :v: [ {|`}:: ] | :n: [:v: :mrph:ART] + +# section dtmprn_nko_inflection +# pattern :dtm/prn: [ {|`}:: ] | :dtm/prn: [:dtm/prn:] section v_inflection # verbal inflection # -la/-na INF pattern :v: [ {@nonnasal-v@|là}:: ] | :v: [:v: :mrph:INF.LA] pattern :v: [ {@nonnasal-v@|lá}:: ] | :v: [:v: :mrph:INF.LA] +pattern :v: [ {@nonnasal-v@|la}:: ] | :v: [:v: :mrph:INF.LA] pattern :v: [ {@nasal-v@|nà}:: ] | :v: [:v: :mrph:INF.LA] pattern :v: [ {@nasal-v@|ná}:: ] | :v: [:v: :mrph:INF.LA] +pattern :v: [ {@nasal-v@|na}:: ] | :v: [:v: :mrph:INF.LA] # -san IMMED pattern :v: [ {|sàn}:: ] | :v: [:v: :mrph:IMMED] pattern :v: [ {|sán}:: ] | :v: [:v: :mrph:IMMED] +pattern :v: [ {|san}:: ] | :v: [:v: :mrph:IMMED] + # participles # -nin/-nɛn PTCP.RES pattern :v/ptcp: [ {|nìn}:: ] | :ptcp: [:v: :mrph:PTCP.RES] pattern :v/ptcp: [ {|nín}:: ] | :ptcp: [:v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {|nin}:: ] | :ptcp: [:v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {|nin|`}:: ] | :ptcp: [:v: :mrph:PTCP.RES :mrph:ART] + pattern :v/ptcp: [ {|nɛ́n}:: ] | :ptcp: [:v: :mrph:PTCP.RES] pattern :v/ptcp: [ {|nɛ̀n}:: ] | :ptcp: [:v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {|nɛn}:: ] | :ptcp: [:v: :mrph:PTCP.RES] +pattern :v/ptcp: [ {|nɛn|`}:: ] | :ptcp: [:v: :mrph:PTCP.RES :mrph:ART] + +pattern :v/ptcp: [ {|nɛn|ba}:: ] | :ptcp: [:v: :mrph:PTCP.RES :mrph:AUGM] +pattern :v/ptcp: [ {|nɛn|ba|`}:: ] | :ptcp: [:v: :mrph:PTCP.RES :mrph:AUGM :mrph:ART] + +# à cause des confusions -nɛn PTCP.RES et -nɛn DIM, le paragraphe qui suit a été remonté dans cette section +pattern :n/adj/ptcp: [ {|nìn}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nín}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nin}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nin|`}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM 
:mrph:ART] +# ajouté : -nɛn DIM +pattern :n/adj/ptcp: [ {|nɛ̀n}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nɛ́n}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nɛn}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nɛn|`}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM :mrph:ART] + # -ta PTCP.POT pattern :v/ptcp: [ {|tà}:: ] | :ptcp: [:v: :mrph:PTCP.POT] pattern :v/ptcp: [ {|tá}:: ] | :ptcp: [:v: :mrph:PTCP.POT] -# -tɔ PTCP.PROG -pattern :v/ptcp: [ {|tɔ̀}:: ] | :ptcp: [:v: :mrph:PTCP.PROG] -pattern :v/ptcp: [ {|tɔ́}:: ] | :ptcp: [:v: :mrph:PTCP.PROG] +pattern :v/ptcp: [ {|ta}:: ] | :ptcp: [:v: :mrph:PTCP.POT] +pattern :v/ptcp: [ {|ta|`}:: ] | :ptcp: [:v: :mrph:PTCP.POT :mrph:ART] +# -tɔ PTCP.PROG -> changed to CONV.PROG +pattern :v/ptcp: [ {|tɔ̀}:: ] | :ptcp: [:v: :mrph:CONV.PROG] +pattern :v/ptcp: [ {|tɔ́}:: ] | :ptcp: [:v: :mrph:CONV.PROG] +pattern :v/ptcp: [ {|tɔ}:: ] | :ptcp: [:v: :mrph:CONV.PROG] +pattern :v/ptcp: [ {|tɔ|`}:: ] | :ptcp: [:v: :mrph:CONV.PROG :mrph:ART] +# -tɔla CONV.PROG+PROG ajouté le 22/3/2024 +pattern :v/ptcp: [ {|tɔ|la}:: ] | :ptcp: [:v: :mrph:CONV.PROG la:mrph:PROG] +pattern :v/ptcp: [ {|tɔ|la|`}:: ] | :ptcp: [:v: :mrph:CONV.PROG la:mrph:PROG :mrph:ART] + # -bali PTCP.NEG pattern :v/ptcp: [ {|bàlì}:: ] | :ptcp: [:v: :mrph:PTCP.NEG] pattern :v/ptcp: [ {|bálí}:: ] | :ptcp: [:v: :mrph:PTCP.NEG] +pattern :v/ptcp: [ {|bali}:: ] | :ptcp: [:v: :mrph:PTCP.NEG] +pattern :v/ptcp: [ {|bali|`}:: ] | :ptcp: [:v: :mrph:PTCP.NEG :mrph:ART] + +# -baliya +pattern :v/ptcp: [ {|bali|ya}:: ] | :n: [:v: :mrph:PTCP.NEG :mrph:ABSTR] +pattern :v/ptcp: [ {|bali|ya|`}:: ] | :n: [:v: :mrph:PTCP.NEG :mrph:ABSTR :mrph:ART] ## DERIVATION section common_derivation +# les lignes avec articles tonal sont nécessaires - mais cela ne parait pas logique (faute de documentation, on s'adapte) + # -ya/-ɲa ABSTR -pattern :n/v: [ {|yà}:: ] | :n: [:n/adj/ptcp: :mrph:ABSTR] -pattern :n/v: [ {|yá}:: ] | :n: 
[:n/adj/ptcp: :mrph:ABSTR] -pattern :n/v: [ {|ɲà}:: ] | :n: [:n/adj/ptcp: :mrph:ABSTR] -pattern :n/v: [ {|ɲá}:: ] | :n: [:n/adj/ptcp: :mrph:ABSTR] -# -nin DIM -pattern :n/adj/ptcp: [ {|nìn}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] -pattern :n/adj/ptcp: [ {|nín}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +#pattern :n/v: [ {|yà}:: ] | :n: [:n/adj/ptcp: :mrph:ABSTR] # n/adj/ptcp ??? +pattern :n/v: [ {|yà}:: ] | :n: [:n/v: :mrph:ABSTR] +pattern :n/v: [ {|yá}:: ] | :n: [:n/v: :mrph:ABSTR] +pattern :n/v: [ {|ya}:: ] | :n: [:n/v: :mrph:ABSTR] +pattern :n/v: [ {|ɲà}:: ] | :n: [:n/v: :mrph:ABSTR] +pattern :n/v: [ {|ɲá}:: ] | :n: [:n/v: :mrph:ABSTR] +pattern :n/v: [ {|ɲa}:: ] | :n: [:n/v: :mrph:ABSTR] +pattern :n/v: [ {|ɲa|`}:: ] | :n: [:n/v: :mrph:ABSTR :mrph:ART] + +# -nin DIM +pattern :n/adj/ptcp: [ {|nìn}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nín}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nin}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|nin|`}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM :mrph:ART] + +# -ren DIM +pattern :n/adj/ptcp: [ {|ren}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM ] +pattern :n/adj/ptcp: [ {|ren|`}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:DIM :mrph:ART] + # -ba AUGM -pattern :n/adj/ptcp: [ {|bà}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:AUG ] -pattern :n/adj/ptcp: [ {|bá}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:AUG ] +pattern :n/adj/ptcp: [ {|bà}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:AUGM ] +pattern :n/adj/ptcp: [ {|bá}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:AUGM ] +pattern :n/adj/ptcp: [ {|ba}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:AUGM ] +pattern :n/adj/ptcp: [ {|ba|`}:: ] | :n/adj/ptcp: [ :n/adj/ptcp: :mrph:AUGM :mrph:ART] + # -nte AG.EX pattern :n: [ {|ntè}:: ] | :n: [ :n/adj/v: :mrph:AG.EX ] pattern :n: [ {|nté}:: ] | :n: [ :n/adj/v: :mrph:AG.EX ] +pattern :n: [ {|nte}:: ] | :n: [ :n/adj/v: :mrph:AG.EX ] +pattern :n: [ {|nte|`}:: ] | :n: [ 
:n/adj/v: :mrph:AG.EX :mrph:ART] + # ‑ɲɔɔn RECP pattern :n: [ {|ɲɔ̀ɔn}:: ] | :n: [ :v/n: :mrph:RECP] pattern :n: [ {|ɲɔ́ɔn}:: ] | :n: [ :v/n: :mrph:RECP] +pattern :n: [ {|ɲɔɔn}:: ] | :n: [ :v/n: :mrph:RECP] +pattern :n: [ {|ɲɔɔn|`}:: ] | :n: [ :v/n: :mrph:RECP :mrph:ART] +# -lama STAT ajouté le 22/3/2024 +pattern :n: [ {@nonnasal-v@|lama}:: ] | :adj: [ :n: :mrph:STAT] +pattern :n: [ {@nonnasal-v@|lama|`}:: ] | :adj: [ :n: :mrph:STAT :mrph:ART] +pattern :n: [ {@nasal-v@|nama}:: ] | :adj: [ :n: :mrph:STAT] +pattern :n: [ {@nasal-v@|nama|`}:: ] | :adj: [ :n: :mrph:STAT :mrph:ART] +pattern :n: [ {@nonnasal-v@|rɔma}:: ] | :adj: [ :n: :mrph:STAT] +pattern :n: [ {@nonnasal-v@|rɔma|`}:: ] | :adj: [ :n: :mrph:STAT :mrph:ART] + +# -ma RECP.PRN ajouté le 22/3/2024 +pattern :n: [ {|ma}:: ] | :n: [ :n: :mrph:RECP.PRN] +pattern :n: [ {|ma|`}:: ] | :n: [ :n: :mrph:RECP.PRN :mrph:ART] + +# -ta PRICE ajouté le 22/3/2024 +pattern :num/n: [ {|ta}:: ] | :n: [ :num: :mrph:PRICE] +pattern :num/n: [ {|ta|`}:: ] | :n: [ :num: :mrph:PRICE :mrph:ART] +pattern :num/n: [ {|la}:: ] | :n: [ :num: :mrph:PRICE] +pattern :num/n: [ {|la|`}:: ] | :n: [ :num: :mrph:PRICE :mrph:ART] # verbal derivation section v_derivation # -baa AG.OCC pattern :n/adj: [ {|bàa}:: ] | :n/adj: [ :v: :mrph:AG.OCC ] pattern :n/adj: [ {|báa}:: ] | :n/adj: [ :v: :mrph:AG.OCC ] +pattern :n/adj: [ {|baa}:: ] | :n/adj: [ :v: :mrph:AG.OCC ] +pattern :n/adj: [ {|baa|`}:: ] | :n/adj: [ :v: :mrph:AG.OCC :mrph:ART] # -la/-na AG.PRM pattern :n: [ {@nonnasal-v@|là}:: ] | :n: [ :v: :mrph:AG.PRM ] pattern :n: [ {@nonnasal-v@|lá}:: ] | :n: [ :v: :mrph:AG.PRM ] +pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :v: :mrph:AG.PRM ] +pattern :n: [ {@nonnasal-v@|la|`}:: ] | :n: [ :v: :mrph:AG.PRM :mrph:ART] pattern :n: [ {@nasal-v@|nà}:: ] | :n: [ :v: :mrph:AG.PRM ] pattern :n: [ {@nasal-v@|ná}:: ] | :n: [ :v: :mrph:AG.PRM ] +pattern :n: [ {@nasal-v@|na}:: ] | :n: [ :v: :mrph:AG.PRM ] +pattern :n: [ {@nasal-v@|na|`}:: ] | :n: [ :v: :mrph:AG.PRM 
:mrph:ART] # -lan/-nan/-ran INSTR pattern :n: [ {@nonnasal-v@|làn}:: ] | :n: [ :v: :mrph:INSTR] pattern :n: [ {@nonnasal-v@|lán}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nonnasal-v@|lan}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nonnasal-v@|lan|`}:: ] | :n: [ :v: :mrph:INSTR :mrph:ART] pattern :n: [ {@nasal-v@|nàn}:: ] | :n: [ :v: :mrph:INSTR] pattern :n: [ {@nasal-v@|nán}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nasal-v@|nan}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nasal-v@|nan|`}:: ] | :n: [ :v: :mrph:INSTR :mrph:ART] +pattern :n: [ {@nonnasal-v@|ràn}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nonnasal-v@|rán}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nonnasal-v@|ran}:: ] | :n: [ :v: :mrph:INSTR] +pattern :n: [ {@nonnasal-v@|ran|`}:: ] | :n: [ :v: :mrph:INSTR :mrph:ART] # -li/-nin NMLZ -pattern :n: [ {@nonnasal-syl@|lì}:: ] | :n: [ :v: :mrph:NMLZ] -pattern :n: [ {@nonnasal-syl@|lí}:: ] | :n: [ :v: :mrph:NMLZ] -pattern :n: [ {@nasal-syl@|nìn}:: ] | :n: [ :v: :mrph:NMLZ] -pattern :n: [ {@nasal-syl@|nín}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nonnasal-v@|lì}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nonnasal-v@|lí}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nonnasal-v@|li}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nonnasal-v@|li|`}:: ] | :n: [ :v: :mrph:NMLZ :mrph:ART] +pattern :n: [ {@nasal-v@|nìn}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nasal-v@|nín}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nasal-v@|nin}:: ] | :n: [ :v: :mrph:NMLZ] +pattern :n: [ {@nasal-v@|nin|`}:: ] | :n: [ :v: :mrph:NMLZ :mrph:ART] section v_prefixes # lá-/ná- CAUS -#pattern :v: [ {la|}:: ] | :v: [ :mrph:CAUS :v: ] -#pattern :v: [ {na|}:: ] | :v: [ :mrph:CAUS :v: ] -# mà- SUPER -#pattern :v: [ {ma|}:: ] | :v: [ :mrph:SUPER :v: ] +pattern :v: [ {lá|}:: ] | :v: [ :mrph:CAUS :v: ] +pattern :v: [ {ná|}:: ] | :v: [ :mrph:CAUS :v: ] +pattern :v/ptcp: [ {lá||nin|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ 
{ná||nin|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {lá||nɛn|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {ná||nɛn|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.RES :mrph:ART] + +pattern :v/ptcp: [ {lá||ta|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.POT :mrph:ART] +pattern :v/ptcp: [ {lá||tɔ|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:CONV.PROG :mrph:ART] +pattern :v/ptcp: [ {lá||bali|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.PRIV :mrph:ART] + +pattern :v/n: [ {lá||li|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ :mrph:ART] +pattern :v/n: [ {lá||nin|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ :mrph:ART] +pattern :v/n: [ {ná||la|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:AG.PRM :mrph:ART] +pattern :v/n: [ {ná||lan|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:INSTR :mrph:ART] + +pattern :v/ptcp: [ {ná||ta|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.POT :mrph:ART] +pattern :v/ptcp: [ {ná||tɔ|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:CONV.PROG :mrph:ART] +pattern :v/ptcp: [ {ná||bali|`}:: ] | :ptcp: [ :mrph:CAUS :v: :mrph:PTCP.PRIV :mrph:ART] + +pattern :v/n: [ {ná||li|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ :mrph:ART] +pattern :v/n: [ {ná||nin|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:NMLZ :mrph:ART] +pattern :v/n: [ {ná||na|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:AG.PRM :mrph:ART] +pattern :v/n: [ {ná||nan|`}:: ] | :n: [ :mrph:CAUS :v: :mrph:INSTR :mrph:ART] + +# má- SUPER +pattern :v: [ {má|}:: ] | :v: [ :mrph:SUPER :v: ] +pattern :v/ptcp: [ {má||nin|`}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {má||nɛn|`}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.RES :mrph:ART] + +pattern :v/ptcp: [ {má||ta|`}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.POT :mrph:ART] +pattern :v/ptcp: [ {má||tɔ|`}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:CONV.PROG :mrph:ART] +pattern :v/ptcp: [ {má||bali|`}:: ] | :ptcp: [ :mrph:SUPER :v: :mrph:PTCP.PRIV :mrph:ART] + +pattern :v/n: [ {má||li|`}:: ] | :n: [ :mrph:SUPER :v: :mrph:NMLZ 
:mrph:ART] +pattern :v/n: [ {má||nin|`}:: ] | :n: [ :mrph:SUPER :v: :mrph:NMLZ :mrph:ART] + # dɔ́ ~ rɔ́-/nɔ́ IN -#pattern :v: [ {dɔ|}:: ] | :v: [ :mrph:IN :v: ] -#pattern :v: [ {rɔ|}:: ] | :v: [ :mrph:IN :v: ] -#pattern :v: [ {nɔ|}:: ] | :v: [ :mrph:IN :v: ] +pattern :v: [ {dɔ́|}:: ] | :v: [ :mrph:IN :v: ] +pattern :v: [ {rɔ́|}:: ] | :v: [ :mrph:IN :v: ] +pattern :v: [ {nɔ́|}:: ] | :v: [ :mrph:IN :v: ] +pattern :v/ptcp: [ {dɔ́||nin|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {rɔ́||nin|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {nɔ́||nin|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {dɔ́||nɛn|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {rɔ́||nɛn|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.RES :mrph:ART] +pattern :v/ptcp: [ {nɔ́||nɛn|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.RES :mrph:ART] + +pattern :v/ptcp: [ {dɔ́||ta|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.POT :mrph:ART] +pattern :v/ptcp: [ {dɔ́||tɔ|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:CONV.PROG :mrph:ART] +pattern :v/ptcp: [ {dɔ́||bali|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.PRIV :mrph:ART] + +pattern :v/n: [ {dɔ́||li|`}:: ] | :n: [ :mrph:IN :v: :mrph:NMLZ :mrph:ART] +pattern :v/n: [ {dɔ́||nin|`}:: ] | :n: [ :mrph:IN :v: :mrph:NMLZ :mrph:ART] + +pattern :v/ptcp: [ {rɔ́||ta|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.POT :mrph:ART] +pattern :v/ptcp: [ {rɔ́||tɔ|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:CONV.PROG :mrph:ART] +pattern :v/ptcp: [ {rɔ́||bali|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.PRIV :mrph:ART] +pattern :v/ptcp: [ {nɔ́||ta|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.POT :mrph:ART] +pattern :v/ptcp: [ {nɔ́||tɔ|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:CONV.PROG :mrph:ART] +pattern :v/ptcp: [ {nɔ́||bali|`}:: ] | :ptcp: [ :mrph:IN :v: :mrph:PTCP.PRIV :mrph:ART] section n_derivation # -laka LOC.GENT pattern :n/n.prop: [ {@nonnasal-v@|là|kà}:: ] | :n/n.prop: [
:n/n.prop: :mrph:LOC :mrph:GENT ] pattern :n/n.prop: [ {@nonnasal-v@|lá|ká}:: ] | :n/n.prop: [ :n/n.prop: :mrph:LOC :mrph:GENT ] -pattern :n/n.prop: [ {@nonnasal-v@|là|ká}:: ] | :n/n.prop: [ :n/n.prop: :mrph:LOC :mrph:GENT ] +pattern :n/n.prop: [ {@nonnasal-v@|là|ká}:: ] | :n/n.prop: [ :n/n.prop: :mrph:LOC :mrph:GENT ] +pattern :n/n.prop: [ {@nonnasal-v@|la|ka}:: ] | :n/n.prop: [ :n/n.prop: :mrph:LOC :mrph:GENT ] +pattern :n/n.prop: [ {@nonnasal-v@|la|ka|`}:: ] | :n/n.prop: [ :n/n.prop: :mrph:LOC :mrph:GENT :mrph:ART] # -ka GENT pattern :n/n.prop: [ {|kà}:: ] | :n/n.prop: [ :n/n.prop: :mrph:GENT ] pattern :n/n.prop: [ {|ká}:: ] | :n/n.prop: [ :n/n.prop: :mrph:GENT ] +pattern :n/n.prop: [ {|ka}:: ] | :n/n.prop: [ :n/n.prop: :mrph:GENT ] +pattern :n/n.prop: [ {|ka|`}:: ] | :n/n.prop: [ :n/n.prop: :mrph:GENT :mrph:ART] # -la/-na LOC pattern :n/n.prop: [ {@nonnasal-v@|là}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] pattern :n/n.prop: [ {@nonnasal-v@|lá}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] +pattern :n/n.prop: [ {@nonnasal-v@|la}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] +pattern :n/n.prop: [ {@nonnasal-v@|la|`}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC :mrph:ART] pattern :n/n.prop: [ {@nasal-v@|nà}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] pattern :n/n.prop: [ {@nasal-v@|ná}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] +pattern :n/n.prop: [ {@nasal-v@|na}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] +pattern :n/n.prop: [ {@nasal-v@|na|`}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC :mrph:ART] # -ntan PRIV pattern :adj/n: [ {|ntàn}:: ] | :adj/n: [ :n: :mrph:PRIV] pattern :adj/n: [ {|ntán}:: ] | :adj/n: [ :n: :mrph:PRIV] +pattern :adj/n: [ {|ntan}:: ] | :adj/n: [ :n: :mrph:PRIV] +pattern :adj/n: [ {|ntan|`}:: ] | :adj/n: [ :n: :mrph:PRIV :mrph:ART] # -ma COM pattern :n: [ {|mà}:: ] | :adj/n: [ :n: :mrph:COM] pattern :n: [ {|má}:: ] | :adj/n: [ :n: :mrph:COM] +pattern :n: [ {|ma}:: ] | :adj/n: [ :n: :mrph:COM] +pattern :n: [ {|ma|`}:: ] | :adj/n: [ :n: :mrph:COM :mrph:ART] # -nan ORD pattern 
:num: [ {|nàn}:: ] | :adj: [ :num: :mrph:ORD ] pattern :num: [ {|nán}:: ] | :adj: [ :num: :mrph:ORD ] +pattern :num: [ {|nan}:: ] | :adj: [ :num: :mrph:ORD ] +pattern :num: [ {|nan|`}:: ] | :adj: [ :num: :mrph:ORD :mrph:ART] # -ta FOC.ADJ pattern :adj: [ {|tà}:: ] | :adj: [ :adj: :mrph:FOC.ADJ ] pattern :adj: [ {|tá}:: ] | :adj: [ :adj: :mrph:FOC.ADJ ] +pattern :adj: [ {|ta}:: ] | :adj: [ :adj: :mrph:FOC.ADJ ] +pattern :adj: [ {|ta|`}:: ] | :adj: [ :adj: :mrph:FOC.ADJ :mrph:ART] # -tɔ ST pattern :n/adj: [ {|tɔ́}:: ] | :n/adj: [ :n: :mrph:ST ] pattern :n/adj: [ {|tɔ̀}:: ] | :n/adj: [ :n: :mrph:ST ] +pattern :n/adj: [ {|tɔ}:: ] | :n/adj: [ :n: :mrph:ST ] +pattern :n/adj: [ {|tɔ|`}:: ] | :n/adj: [ :n: :mrph:ST :mrph:ART] # vq derivation @@ -146,11 +322,20 @@ section vq_derivation # -man ADJ pattern :vq: [ {|màn}:: ] | :adj: [ :vq: :mrph:ADJ ] pattern :vq: [ {|mán}:: ] | :adj: [ :vq: :mrph:ADJ ] +pattern :vq: [ {|man}:: ] | :adj: [ :vq: :mrph:ADJ ] +pattern :vq: [ {|man|`}:: ] | :adj: [ :vq: :mrph:ADJ :mrph:ART] # -ya/-ɲa DEQU pattern :vq: [ {@nonnasal-v@|yà}:: ] | :n/v: [ :vq: :mrph:DEQU] pattern :vq: [ {@nonnasal-v@|yá}:: ] | :n/v: [ :vq: :mrph:DEQU] +pattern :vq: [ {@nonnasal-v@|ya}:: ] | :n/v: [ :vq: :mrph:DEQU] +pattern :vq: [ {@nonnasal-v@|ya|`}:: ] | :n: [ :vq: :mrph:DEQU :mrph:ART] pattern :vq: [ {@nasal-v@|ɲà}:: ] | :n/v: [ :vq: :mrph:DEQU] pattern :vq: [ {@nasal-v@|ɲá}:: ] | :n/v: [ :vq: :mrph:DEQU] +pattern :vq: [ {@nasal-v@|ɲa}:: ] | :n/v: [ :vq: :mrph:DEQU] +pattern :vq: [ {@nasal-v@|ɲa|`}:: ] | :n: [ :vq: :mrph:DEQU :mrph:ART] +# cela arrive aussi quand même : +pattern :vq: [ {@nasal-v@|ya}:: ] | :n/v: [ :vq: :mrph:DEQU] +pattern :vq: [ {@nasal-v@|ya|`}:: ] | :n: [ :vq: :mrph:DEQU :mrph:ART] ## reduplication section reduplication @@ -169,13 +354,25 @@ pattern :adj: [ {(?P.+)|(?P=stem)|(?P=stem)}:: # two-word composites section pos_composition # v/n + n = n -pattern :n: [ :n/v: :n: ] | :n: [ :n/v: :n: ] +# BUGS(misses yìdako) pattern :n: [ :n/v: :n: ] | 
:n: [ :n/v: :n: ] +# NOT NEEDED -> pattern :n: [ :n/v: {|`}:n: ] | :n: [ :n/v: :n: :mrph:ART] +pattern :n/v: [ :v: :n: ] | :n: [ :v: :n: ] +pattern :n: [ :n: :n: ] | :n: [ :n: :n: ] + # n + v = n/v pattern :n/v: [ :n: :v: ] | :n/v: [ :n: :v: ] + # n + adj/num = n pattern :n: [ :n: :adj/num: ] | :n: [ :n: :adj/num: ] +# pattern :n/adj: [ :n: :adj: ] | :n: [ :n: :adj: ] +# pattern :n/num: [ :n: :num: ] | :n: [ :n: :num: ] + +# prn + v ɲɔ́ɔn.yé +pattern :n/v: [ :prn: :v: ] | :n/v: [ :prn: :v: ] + # dtm + v = n pattern :n: [ :dtm: :v: ] | :n: [ :dtm: :v: ] + # three-word composites # n + adj/pp + n = n pattern :n: [ :n: :adj/pp: :n: ] | :n: [ :n: :adj/pp: :n: ] diff --git a/docs/samples/meta.xml b/docs/samples/meta.xml index 7bea67d..b9f5847 100644 --- a/docs/samples/meta.xml +++ b/docs/samples/meta.xml @@ -1,4 +1,4 @@ - +
@@ -10,6 +10,7 @@ + @@ -20,10 +21,10 @@ - +
-
+
@@ -33,7 +34,7 @@ - + @@ -96,7 +97,7 @@
- + @@ -104,7 +105,7 @@ - + @@ -112,7 +113,7 @@ - + @@ -121,16 +122,16 @@ - + - - - - - - - - + + + + + + + + @@ -147,7 +148,7 @@ - + @@ -179,6 +180,7 @@ + @@ -186,15 +188,17 @@ + + - + @@ -208,24 +212,29 @@ - + - + + + + +