diff --git a/differential_tone_coding.py b/differential_tone_coding.py
index 2ba78ee..67eacd2 100755
--- a/differential_tone_coding.py
+++ b/differential_tone_coding.py
@@ -6,21 +6,316 @@ import Levenshtein
 from syllables import syllabify, vowels
 import re
+import itertools
+import csv
+import codecs
 
 # Installation of prerequisites
 # sudo pip install python-Levenshtein
 
 # Constant lists
 markers_tone = [unichr(0x0300),unichr(0x0301),unichr(0x0302),unichr(0x030c)]
-token_seperator = u'_'
 code_seperator = u'_'
 mode_indicators = u'+-'
 mode_names = [u"insert",u"delete"]
-markers_to_be_ignored = u"[].-" + code_seperator
-markers_to_be_replaced = {u"’":u"'"}
+markers_to_be_ignored = u"." # u"[].-" + code_seperator
+markers_to_be_replaced = dict() # {u"’":u"'"}
 
-# todo : decomposition en opérations - opérands
-def code_dispatcher(code) :
+def apply_filter_to_base_element(x, sets, sel_en, decomposition_en, show_approx_err = False) :
+
+    if isinstance(x, tuple) :
+        return (x[0], filter(x[0], x[1].decode("utf-8"), sets, sel_en, decomposition_en, show_approx_err).encode("utf-8"))
+    else :
+        return [apply_filter_to_base_element(element, sets, sel_en, decomposition_en, show_approx_err) for element in x]
+
+
+def filter(token, tag, sets, sel_en, decomposition_en, show_approx_err = False) :
+
+    subcodes = code_dispatcher(tag, sel_en, decomposition_en)
+    code2 = code_seperator.encode("utf-8").join([subcode for p, subcode in enumerate(subcodes) if (p in sets and subcode)])
+    ret = code_resort(code2)
+
+    if ret != tag and set(sets) == {0,1,2,3} and show_approx_err :
+        # an approximation price to pay :
+        # our decomposition of the CRF does come at a cost, namely the loss of the order in which
+        # characters (of different types) are inserted at the same position of the original token ;
+        # although successively inserting a non-diacritic character and a diacritic at the same
+        # position of a token is extremely rare, it can happen, and thus produce an observable
+        # error if we uncomment the lines of code below
+        # for example,
+        # tag_gold = +_0_ɔ_+_0_̀_+_0_n_-_0_a
+        # tag_reconstitued = +_0_ɔ_+_0_n_+_0_̀_-_0_a
+        # as a reminder, this error is counted in the evaluation, and curiously no considerable
+        # degradation of the accuracy has been observed ; this may be explained by the fact that
+        # a successive insertion is often hard to learn in itself (a hypothesis to be verified),
+        # so a loss of order does not make this case quantitatively more catastrophic
+        print "Case of modeling error inherent to decomposition hypothesis : \n", ret, tag
+
+    return ret
+
+
+def verify(x) :
+    # sel_en / decomposition_en follow the command-line defaults (--filtering off, --no_decomposition off)
+    apply_filter_to_base_element(x, [0,1,2,3], sel_en = False, decomposition_en = True, show_approx_err = True)
+
+
+def split2 (str_in, seperator) :
+
+    buf = ''
+    ret = []
+    for c in str_in :
+        if c != seperator :
+            buf += c
+        else :
+            if buf :
+                ret.append(buf)
+                buf = ''
+            else :
+                buf += c
+    if buf :
+        ret.append(buf)
+    return ret
+
+def marginal_tone(taggers, tnum, tokens, tag, token, chunk_mode, sel_en, decomposition_en) :
+
+    enc = encoder_tones()
+    codes, syllabes = enc.differential_encode(token, tag.decode('utf-8'), chunk_mode)
+
+    k = 0
+    snums = []
+    for i, t in enumerate(tokens) :
+        for j, syllabe in enumerate(t) :
+            if i == tnum :
+                snums.append(k)
+            k += 1
+
+    if len(syllabes) != len(snums) :
+        print "Bug 3 !"
+ exit() + + prob_tot = 1 + for p, tagger in enumerate(taggers) : + if not taggers[p]._model_file : + continue + for i in range(len(syllabes)) : + subcode = code_dispatcher(codes[i], sel_en, decomposition_en)[p] + try : + prob = taggers[p]._tagger.marginal(subcode.encode('utf-8'), snums[i]) + except : + prob = 0.0 + prob_tot *= prob + + return prob_tot + +def accuray2 (dataset1, dataset2, is_tone_mode = False) : + cnt_sucess = 0 + cnt_fail = 0 + if not is_tone_mode : + for sent1, sent2 in zip(dataset1, dataset2) : + for token1, token2 in zip(sent1, sent2) : + if token1 == token2 : + cnt_sucess += 1 + else : + cnt_fail += 1 + + else : + for sent1, sent2 in zip(dataset1, dataset2) : + for token1, token2 in zip(sent1, sent2) : + is_identical = True + for syllabe1, syllabe2 in zip(token1, token2) : + if syllabe1 != syllabe2 : is_identical = False ; break; + if is_identical : + cnt_sucess += 1 + else : + cnt_fail += 1 + + cnt_tot = cnt_sucess + cnt_fail + if not cnt_tot : return 0.0 + else : return cnt_sucess / float(cnt_tot) + +def get_sub_tone_code_of_sentence (sentence, phase, sel_en, decomposition_en) : + labels = list() + for i, token in enumerate(sentence) : + label = list() + for j, syllabe_code in enumerate(token) : + syllabe, code = syllabe_code + subcode = code_dispatcher(code.decode('utf-8'), sel_en, decomposition_en)[phase].encode('utf-8') + label.append(subcode) + labels.append(label) + return labels + +def accumulate_tone_code_of_dataset (dataset_acc, dataset) : + for p, sent in enumerate(dataset_acc) : + for i, token in enumerate(sent) : + for j, syllabe_tag_acc in enumerate(token) : + syllabe_acc, tag_acc = syllabe_tag_acc + syllabe, tag = dataset[p][i][j] + if tag_acc and tag : + tag_acc += code_seperator.encode('utf-8') + tag + else : + tag_acc += tag + dataset_acc[p][i][j] = \ + tuple([syllabe, code_resort(tag_acc.decode('utf-8')).encode('utf-8')]) + + return dataset_acc + +def reshape_tokens_as_sentnece(tokens, sentnece) : + + ret = list() + n = 0 + for i, token in enumerate(sentnece) : + tmp = list() + for j, syllabe in enumerate(token) : + tmp.append(tokens[n]) + n += 1 + ret.append(tmp) + + return ret + +def make_tokens_from_sentence(sent, is_tone_mode = False) : + if is_tone_mode : + tokens = list() + labels = list() + for token in sent : + tokens.append(unzip(token)[0]) + labels.append(unzip(token)[1]) + else : + tokens = unzip(sent)[0] + labels = unzip(sent)[1] + + return [tokens, labels] + +def make_features_from_tokens(tokens, phase = 0, is_tone_mode = False) : + if is_tone_mode : + features_syllabe = list() + for i, token in enumerate(tokens) : + feature = list() + for j, syllabe_code in enumerate(token) : + feature.append(get_features_customised_tone(tokens, i, j, phase)) + features_syllabe.append(feature) + features = list(itertools.chain(*features_syllabe)) + else : + features = list() + for i in range(len(tokens)) : + features.append(get_features_customised(tokens, i)) + return features + +def inspector_tokens(gold_tokens, predicted_tokens) : + for x,y in zip(gold_tokens, predicted_tokens) : + try : + if x[1] != y[1] : + print x[0],":",x[1].decode('utf-8'),"->",y[1].decode('utf-8') # ,"(",len(x[1]), len(y[1]),")" + else : + print "*",x[0],":",x[1].decode('utf-8'),"->",y[1].decode('utf-8') # ,"(",len(x[1]), len(y[1]),")" + except : + print type(x[0]),":",type(x[1]),"->",type(y[1]) + +def unzip(input) : + return [list(li) for li in zip(*input)] + +def csv_export(filename, gold_set, test_set, is_tone_mode = False): + + if not is_tone_mode : + csvfile = 
codecs.open(filename, 'wb') + writer = csv.writer(csvfile) + writer.writerow(["Token", "Golden", "Predicted", "Same"]) + for gold_sent, test_sent in zip(gold_set, test_set) : + for gold_token, test_token in zip(gold_sent, test_sent) : + token = gold_token[0] + gold_code = gold_token[1] + test_code = test_token[-1] + # print token, gold_code, test_code + sameCodes = (gold_code == test_code) + + if not repr(token.encode('utf-8')) : + sameCodes = u'' + row = [\ + (token.encode('utf-8')), \ + gold_code, \ + test_code, \ + sameCodes] + writer.writerow(row) + csvfile.close() + else : + csvfile = codecs.open(filename, 'wb') + writer = csv.writer(csvfile) + writer.writerow(["Token", \ + "Golden Form","Predicted Form", \ + "Golden code", "Predicted code", "Same"]) + enc = encoder_tones() + for gold_sent, test_sent in zip(gold_set, test_set) : + for gold_token, test_token in zip(gold_sent, test_sent) : + gold_code = '' + test_code = '' + gold_form = '' + test_form = '' + token = '' + for gold_syllabe, test_syllabe in zip(gold_token, test_token) : + token += gold_syllabe[0] + ' ' + if gold_syllabe[1] : + gold_code += gold_syllabe[1] + ' ' + else : + gold_code += 'NULL' + ' ' + if test_syllabe[1] : + test_code += test_syllabe[1] + ' ' + else : + test_code += 'NULL' + ' ' + gold_form += enc.differential_decode(gold_syllabe[0], gold_syllabe[1].decode('utf-8')) + ' ' + test_form += enc.differential_decode(gold_syllabe[0], test_syllabe[1].decode('utf-8')) + ' ' + sameCodes = (gold_code == test_code) + sameForms = (gold_form == test_form) + sameCodes = (gold_code == test_code) + sameForms = (gold_form == test_form) + if not repr(token.encode('utf-8')) : + sameCodes = u'' + row = [\ + (token.encode('utf-8')), \ + repr(gold_form.encode('utf-8')), \ + repr(test_form.encode('utf-8')), \ + repr(gold_code, spaces=True), \ + repr(test_code, spaces=True), \ + sameCodes] + writer.writerow(row) + csvfile.close() + +def sampling(allsents, p, ratio = 1) : + train_set, eval_set = [], [] + for i, sent in enumerate(allsents[0 : : int(1/float(ratio))]) : + p_approx = float(len(train_set) + 1) / float(len(eval_set) + len(train_set) + 1) + if p_approx <= p : + train_set.append(sent) + else: + eval_set.append(sent) + return [train_set, eval_set] + +def get_duration(t1_secs, t2_secs) : + secs = abs(t1_secs - t2_secs) + days = secs // 86400 + hours = secs // 3600 - days * 24 + minutes = secs // 60 - hours * 60 - days * 60 * 24 + secondes = int(secs) % 60 + return '{:>02.0f}:{:>02.0f}:{:>02.0f}:{:>02d}'.format(days, hours, minutes, secondes) + +def is_a_good_code(code) : + + if not code : return True + + code2 = code + + # +_2__ is good, because -> + 2 _ + if code2[-1] == code_seperator.decode('utf-8') or code2[-1] == code_seperator : + try : + if code2[-1] != code2[-2] : + return False + except IndexError: + return False + + # code3 = code2.split(code_seperator.decode('utf-8')) + code3 = split2(code2,code_seperator.decode('utf-8')) + if len(code3) % 3 != 0 : + return False + else : + return True + +def code_dispatcher(code, sel_en, decomposition_en) : lst = [] for i in mode_indicators : @@ -28,152 +323,206 @@ def code_dispatcher(code) : lst.append("") if not code : return lst - if code[-1] == code_seperator : code = code[: -1] - code_segments = code.split(code_seperator) - for i in range(0, len(code_segments), 3) : - m, p, c = code_segments[i : i + 3] - lst[mode_indicators.index(m) + len(mode_indicators) * int(c in markers_tone)] += \ + if not is_a_good_code(code) : print "(dispatcher) input code incorrect !" 
; print code ; exit() + #if code[-1] == code_seperator : code = code[: -1] + # code_segments = code.split(code_seperator) + code_segments = split2(code,code_seperator) + + # Filtering + def indexing(op) : + m,p,c = op + return 2 * int(p) + mode_indicators.index(m) + + ops = [code_segments[i : i + 3] for i in range(0, len(code_segments), 3)] + + if sel_en : + ops = sorted(ops, key=lambda op : indexing(op)) + ops2 = list() + i_pre = -1 + for op in ops : + i = indexing(op) + if i > i_pre : + ops2.append(op) + i_pre = i + else : + ops2 = ops + + for op in ops2 : + m,p,c = op + if decomposition_en : + phase = mode_indicators.index(m) + len(mode_indicators) * int(c in markers_tone) + else : + phase = len(mode_indicators) * int(c in markers_tone) + lst[phase] += \ u"{}{}{}{}{}{}".format(m, code_seperator, p, code_seperator, c, code_seperator) - return lst + lst2 = list() + for element in lst : + try : + if element[-1] == code_seperator or element[-1] == code_seperator.decode('utf-8') : + lst2.append(element[:-1]) + else : + lst2.append(element) + except : + lst2.append(element) + + for code in lst2 : + if not is_a_good_code(code): + print "(dispatcher) output code incorrect !" + print code + + return lst2 def code_resort(code) : + ret = [] if not code : return code - if code[-1] == code_seperator : code = code[: -1] - code_segments = code.split(code_seperator) + if not is_a_good_code(code) : print "(resort) input code incorrect !" ; exit() + #if code[-1] == code_seperator : code = code[: -1] + #code_segments = code.split(code_seperator) + code_segments = split2(code,code_seperator) for i in range(0, len(code_segments), 3) : - m, p, c = code_segments[i : i + 3] + try : + m, p, c = code_segments[i : i + 3] + except : + print code + print code_segments; + print "Bug 1 !" 
+ exit() + ret.append(u"{}{}{}{}{}{}".format(m, code_seperator, p, code_seperator, c, code_seperator)) - ret = sorted(ret, key=lambda x : int(mode_indicators.index(m))+2*int(x.split(code_seperator)[1])) + ret = sorted(ret, key=lambda x : int(mode_indicators.index(split2(x, code_seperator)[0])) + 2 * int(split2(x, code_seperator)[1])) ret = ''.join(ret) if ret : ret = ret[:-1] + if not is_a_good_code(ret) : print ("(resort) ouptut code incorrect !") ; exit() + return ret -def _get_features_customised_for_tones(tokens, idx): +def get_features_customised(tokens, idx): feature_list = [] if not tokens: return feature_list - try : - token = tokens[idx] - except IndexError : - raise - - # positon du syllabe actuel et préfixe et suffixe du même mot - lst = [] - for i in range(idx, len(tokens) + 1, 1) : - try : - if tokens[i] == token_seperator : - lst.append(i) - if len(lst) >= 2 : - break - except IndexError : - lst.append(i) - break - - try : - feature_list.append("SYLLABE_ID1_" + str(lst[0] - idx)) - except : - pass - - try : - feature_list.append("SUFFIXE_ACTUEL_" + tokens(lst[0] - 1)) - except : - pass - - lst2 = [] - for i in range(idx, -2, -1) : - try : - if tokens[i] == token_seperator : - lst2.append(i) - if len(lst2) >= 2 : - break - except IndexError : - lst2.append(i) - break - - try : - feature_list.append("SYLLABE_ID2_" + str(idx - lst2[0])) - except : - pass - - try : - feature_list.append("PREFIXE_ACTUEL_" + tokens(lst2[0] + 1)) - except : - pass - - # préfixe et suffixe du mots précédent et suivant dans la même phrase - try : - prefixe_du_mot_suivant = tokens[lst[0] + 1] - feature_list.append("PREFIXE_SUIVANT_" + prefixe_du_mot_suivant) - except IndexError : - pass - try : - suffixe_du_mot_precedent = tokens[lst2[0] - 1] - feature_list.append("SUFFIXE_PRECEDENT_" + suffixe_du_mot_precedent) - except IndexError: - pass - - try : - suffixe_du_mot_suivant = tokens[lst[1] - 1] - feature_list.append("SUFFIXE_SUIVANT_" + suffixe_du_mot_suivant) - except IndexError : - pass - try : - prefixe_du_mot_precedent = tokens[lst2[1] + 1] - feature_list.append("PREFIXE_PRECEDENT_" + prefixe_du_mot_precedent) - except IndexError : - pass + token = tokens[idx] # Capitalization if token[0].isupper(): - feature_list.append('CAPITALIZATION') + feature_list.append(u'CAPITALIZATION') # Number if re.search(r'\d', token) is not None: - feature_list.append('IL_Y_A_UN_CHIFFRE') + feature_list.append(u'IL_Y_A_UN_CHIFFRE') # Punctuation - punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"]) + punc_cat = set([u"Pc", u"Pd", u"Ps", u"Pe", u"Pi", u"Pf", u"Po"]) if all (unicodedata.category(x) in punc_cat for x in token): - feature_list.append('PONCTUATION_PURE') + feature_list.append(u'PONCTUATION_PURE') # Voyelles - voyelles = "" + voyelles = u"" for c in token : if c.lower() in vowels: voyelles += c - feature_list.append('VOYELLES_'+ voyelles) + feature_list.append(u'VOYELLES_'+ voyelles) # Syllabes précédent et suivant try : - feature_list.append('SYLLABE_PRECEDENT_' + token[idx - 1]) + feature_list.append(u'TOKEN_PRECEDENT_' + token[idx - 1]) except IndexError : pass try : - feature_list.append('SYLLABE_SUIVANT_' + token[idx + 1]) + feature_list.append(u'TOKEN_SUIVANT_' + token[idx + 1]) except IndexError : pass - feature_list.append('SYLLABE_ACTUEL_' + (token)) + feature_list.append(u'TOKEN_ACTUEL_' + (token)) # Suffix & prefix up to length 3 if len(token) > 1: - feature_list.append('SUF_' + token[-1:]) - feature_list.append('PRE_' + token[:1]) + feature_list.append(u'SUF_' + token[-1:]) + 
feature_list.append(u'PRE_' + token[:1])
     if len(token) > 2:
-        feature_list.append('SUF_' + token[-2:])
-        feature_list.append('PRE_' + token[:2])
+        feature_list.append(u'SUF_' + token[-2:])
+        feature_list.append(u'PRE_' + token[:2])
     if len(token) > 3:
-        feature_list.append('SUF_' + token[-3:])
-        feature_list.append('PRE_' + token[:3])
+        feature_list.append(u'SUF_' + token[-3:])
+        feature_list.append(u'PRE_' + token[:3])
+
+    return feature_list
+
+def get_features_customised_tone(tokens, i, j, phase) :
+
+    feature_list = []
+
+    if not tokens:
+        return feature_list
+
+    try :
+        syllabes = tokens[i]
+        syllabe = syllabes[j]
+    except IndexError :
+        raise
+
+    # phases
+    feature_list.append(u'PHASE_ID_' + str(phase))
+
+    # Positions
+    feature_list.append(u'SYLLABE_ID_POSITIF_' + str(j))
+    feature_list.append(u'SYLLABE_ID_NEGATIF_' + str(len(syllabes) - j - 1))
+    feature_list.append(u'TOKEN_ID_POSITIF_' + str(i))
+    feature_list.append(u'TOKEN_ID_NEGATIF_' + str(len(tokens) - i - 1))
+
+    # character strings at the level of the current word form
+    feature_list.append(u'SYLLABE_ACTUELLE_' + syllabe)
+    feature_list.append(u'PREFIXE_ACTUEL_' + syllabes[0])
+    feature_list.append(u'SUFFIXE_ACTUEL_' + syllabes[-1])
+    try : feature_list.append(u'SYLLABE_QUI_PRECEDE_' + syllabes[j - 1])
+    except : pass
+    try : feature_list.append(u'SYLLABE_QUI_SUIT_' + syllabes[j + 1])
+    except : pass
+
+    # character strings at the level of the preceding and the following word forms
+    try : feature_list.append(u'PREFIXE_DU_TOKEN_QUI_PRECEDE_' + tokens[i-1][0])
+    except : pass
+    try : feature_list.append(u'SUFFIXE_DU_TOKEN_QUI_PRECEDE_' + tokens[i-1][-1])
+    except : pass
+    try : feature_list.append(u'PREFIXE_DU_TOKEN_QUI_SUIT_' + tokens[i+1][0])
+    except : pass
+    try : feature_list.append(u'SUFFIXE_DU_TOKEN_QUI_SUIT_' + tokens[i+1][-1])
+    except : pass
+
+    # character strings at the sentence level
+    feature_list.append(u'TOKEN_ACTUEL_' + ''.join(syllabes))
+    try : feature_list.append(u'TOKEN_QUI_PRECEDE_' + ''.join(tokens[i - 1]))
+    except : pass
+    try : feature_list.append(u'TOKEN_QUI_SUIT_' + ''.join(tokens[i + 1]))
+    except : pass
+
+    # Capitalization
+    if syllabe[0].isupper():
+        feature_list.append(u'CAPITALIZATION')
+
+    # Number
+    if re.search(r'\d', syllabe) is not None:
+        feature_list.append(u'IL_Y_A_UN_CHIFFRE')
+
+    # Punctuation
+    punc_cat = set([u"Pc", u"Pd", u"Ps", u"Pe", u"Pi", u"Pf", u"Po"])
+    if all (unicodedata.category(x) in punc_cat for x in syllabe):
+        feature_list.append(u'PONCTUATION_PURE')
+
+    # Vowels
+    voyelles = u""
+    for c in syllabe :
+        if c.lower() in vowels:
+            voyelles += c
+    feature_list.append(u'VOYELLES_'+ voyelles)
 
     return feature_list
 
@@ -197,11 +546,28 @@ def rm_sep(str_in, seprator_in = code_seperator, replacing = u''):
     except :
         raise
 
-def chunking (token) :
+def chunking (token, mode) :
 
     chunks = []
-    for chunk in syllabify(token)[0]:
-        chunks.append(unicodedata.normalize('NFD', chunk))
+
+    if mode == 0 :
+        # no segmenter
+        chunks.append(token)
+    elif mode < 0 :
+        # syllabification
+        for chunk in syllabify(token)[0]:
+            chunks.append(unicodedata.normalize('NFD', chunk))
+    # fixed-interval segmentation
+    else :
+        token2 = unicodedata.normalize('NFD', token)
+        seg = ""
+        for c in token2 :
+            seg += c
+            if len(seg) == mode :
+                chunks.append(seg)
+                seg = ""
+        if seg :
+            chunks.append(seg)
 
     return chunks
 
@@ -297,7 +663,6 @@ def entropy (cnt, unit = 'shannon') :
             ent -= p * math.log(p, base[unit])
     return ent
 
-
 def sprint_cnt(cnt, prefix = "", num = -1, min = -1) :
     lst = cnt.most_common()
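The three chunking modes introduced in chunking() above can be exercised directly. A minimal sketch (Python 2, like the rest of the code base), assuming the patched differential_tone_coding module is on the path and that syllabify segments 'bamana' as shown:

    from differential_tone_coding import chunking

    print chunking(u'bamana', -1)   # mode < 0 : syllabification, e.g. [u'ba', u'ma', u'na']
    print chunking(u'bamana', 0)    # mode == 0 : no segmentation -> [u'bamana']
    print chunking(u'bamana', 3)    # mode > 0 : fixed-width chunks -> [u'bam', u'ana']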
@@ -319,6 +684,7 @@ def __init__(self) :
         self.form_non_tonal = Counter()
         self.form_tonal = Counter()
         self.code = Counter()
+        self.code2 = Counter()
         self.segment_code = Counter()
         self.dict_code = defaultdict()
         self.dict_form_tonal= defaultdict()
@@ -336,7 +702,9 @@ def __str__(self) :
         ret += u"Entropies globales\n"
         ret += u"\tE(Token) = {:<6.2f} \n".format(entropy(self.form_non_tonal))
         ret += u"\tE(Forme tonale) = {:<6.2f} \n".format(entropy(self.form_tonal))
-        ret += u"\tE(Code produit) = {:<6.2f} \n".format(entropy(self.code))
+        ret += u"\tE(Code produit) = {:<6.2f} \n".format(entropy(self.code2))
+        ret += u"\tr_E(Code produit) = {:<6.2f} \n".format(entropy(self.form_tonal)/entropy(self.code2))
+
         ret += u"Entropies par token (en moyenne)\n"
         ret += u"\tE(Forme tonale) = {:<6.2f} \n".\
             format(entropy2(self.dict_form_tonal, cnty = self.form_tonal, cntx = self.form_non_tonal))
@@ -392,24 +760,7 @@ def insert(self) :
         self.stat.dst_insert[caracter_dst] += 1
         self.stat.segment_code[repr(segment)] += 1
 
-    """
-    def replace(self) :
-        mode_id = mode_names.index("replace")
-        [mp_code, chunk_id] = mode_position_encoder(self.src,self.p_src, mode_id, self.chunks)
-        segment = mp_code + code_seperator
-        caracter_src = self.src[self.p_src]
-        caracter_dst = self.dst[self.p_dst]
-        segment += caracter_dst + code_seperator
-        self.ret[chunk_id] += segment
-
-        self.stat.cnt_ops += 1
-        self.stat.mode["replace"] += 1
-        self.stat.src_replace[caracter_src] += 1
-        self.stat.dst_replace[caracter_dst] += 1
-        self.stat.segment_code[repr(segment)] += 1
-    """
-
-    def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
+    def differential_encode (self, form_non_tonal, form_tonal, chunk_mode) :
 
         self.p_src = -1
         self.p_dst = -1
@@ -417,12 +768,9 @@ def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
 
         self.src = reshaping(form_non_tonal, False)
         if not self.src :
-            if seperator:
-                return [u"", [token_seperator]]
-            else :
-                return [u"", []]
+            return [[u""], [form_non_tonal]]
 
-        self.chunks = chunking(self.src)
+        self.chunks = chunking(self.src, chunk_mode)
         self.ret = [u"" for i in range(len(self.chunks))]
 
         self.dst = reshaping(form_tonal, False)
@@ -446,18 +794,20 @@ def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
 
         # enlèvement du séparateur du code à la fin du chunk
         tmp = []
-        for ret2 in self.ret :
+        for ret2 in self.ret :
             try :
-                if ret2[-1] == code_seperator :
-                    ret2 = ret2[:-1]
+                if ret2[-1] == code_seperator :
+                    ret2 = ret2[:-1]
             except IndexError:
                 pass
-            tmp.append(ret2)
-        self.ret = tmp
+            tmp.append(ret2)
+        self.ret = tmp
 
         self.stat.num += 1
         repr_code = repr(u"".join(self.ret))
         self.stat.code[repr_code] += 1
+        for chunk_code in self.ret :
+            self.stat.code2[chunk_code] += 1
         self.stat.dict_code.setdefault(self.src, []).append(repr_code)
 
         # internal auto-check
@@ -468,9 +818,11 @@ def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
             if form1 != form2 :
                 self.stat.err_cnt += 1
 
-        if seperator :
-            self.ret.append(u'')
-            self.chunks.append(token_seperator)
+        for code in self.ret :
+            if not is_a_good_code(code) :
+                print "(encode) output code incorrect !";
+                print code ;
+                exit()
 
         return [self.ret, self.chunks]
 
@@ -482,9 +834,11 @@ def differential_decode (self, chunk, code) :
 
         chunk = reshaping(chunk, False)
         if len(code.strip()) == 0 : return chunk
+        if not is_a_good_code(code) : print "(decode) input code incorrect !"
; print chunk ; print code ; exit() - if code[-1] == code_seperator : code = code[: -1] - code_segments = code.split(code_seperator) + # if code[-1] == code_seperator : code = code[: -1] + # code_segments = code.split(code_seperator) + code_segments = split2(code,code_seperator) if len(code_segments) % 3 != 0 : print code ; print (code_segments) ; print ("input code incorrect !"); exit(1) p_offset = 0 @@ -492,7 +846,7 @@ def differential_decode (self, chunk, code) : try : m, p, c = code_segments[i:i+3] except : - print (u"Bug in differential_decode : {}".format(code)) + print (u"Bug 2 : {}".format(code)) exit(1) p_eff = int(p) + p_offset @@ -515,6 +869,7 @@ def differential_decode (self, chunk, code) : def main () : + """ forms_non_tonal = [u'tò',u'yerehré',u'ò',u'e', u'òhehòhe', u'òhòh',u'ohoh',u'ehe', u'tò',u'hééh',u'heeh',u'hèé', u'narè'] forms_tonal = [u'tɔ',u'yɛrɛ̂hre',u'o',u'é', u'ohéhohé', u'ohoh',u'òhòh',u'ebe',u'tɔ',u'heeh',u'hééh',u'héè', u'nàrɛ'] @@ -523,11 +878,13 @@ def main () : for form_non_tonal, form_tonal in zip(forms_non_tonal, forms_tonal) : print u"Source {}".format(reshaping(form_non_tonal, False)) print u"Destination {}".format(reshaping(form_tonal, False)) - [codes, chunks] = enc.differential_encode (form_non_tonal, form_tonal) + [codes, chunks] = enc.differential_encode (form_non_tonal, form_tonal, chunk_mode) i = 0 for chunk, code in zip(chunks, codes) : sys.stdout.write(u"Syllabe_{} '{}' - '{}' -> '{}'\n".format(i, enc.differential_decode(chunk, code), chunk, repr(code))); - sys.stdout.write(u"Syllabe_{} '{}' - '{}' -> '{}'\n".format(i, enc.differential_decode(chunk, code_resort(''.join(code_dispatcher(code)))), chunk, repr(code_resort(''.join(code_dispatcher(code)))))); + sys.stdout.write(u"Syllabe_{} '{}' - '{}' -> '{}'\n".\ + format(i, enc.differential_decode(\ + chunk, code_resort(''.join(code_dispatcher(code)))), chunk, repr(code_resort(''.join(code_dispatcher(code)))))); pass print "" @@ -537,5 +894,6 @@ def main () : print form1, form2 enc.report() + """ if __name__ == "__main__" : main() diff --git a/disambiguation.py b/disambiguation.py index a15bd26..4c5da25 100644 --- a/disambiguation.py +++ b/disambiguation.py @@ -3,106 +3,28 @@ # Auteur : Elvis Mboning, Stagiaire 2016, INALCO # Auteur : Damien Nouvel, MCF, INALCO +# Auteur : Luigi (Yu-Cheng) Liu, Stagiaire 2017, INALCO # Le principale rôle de ce script est de créer des modèles de données pour l'apprentissage automatique avec CRFTagger. # Le CRF implémenté provient du module tag de NLTK inspiré de CRFSuite (http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf). # Trois modèles sont possibles : les POS, les tons, les gloses -# todo: -# * petit rapport sur les distributions de caractères et leurs natures dans le corpus -## * enregistrement et téléverser -# * models produits /models/pos_exactitude_0p92.mod -# * models produits /models/tone_exactitude_0p91.mod -# * avec un fihier in et un fichier out -# -# des RDV. 
prévus -# mercredi 17 mai à 14 : 30 - -import sys, re, codecs, glob, time, os -import argparse + +import sys, re, codecs, glob, time, os, collections, argparse, itertools import formats, grammar -import collections +from gdisamb import FileParser from ntgloss import Gloss from nltk.tag.crf import CRFTagger -from gdisamb import FileParser -from differential_tone_coding import encoder_tones, repr, token_seperator, _get_features_customised_for_tones, code_dispatcher, code_resort, mode_indicators -import unicodedata -import pycrfsuite -import csv import nltk.tag.util -import itertools -from nltk.metrics.scores import accuracy -import zipfile +import pycrfsuite +from differential_tone_coding import apply_filter_to_base_element, get_features_customised, get_duration, sampling, csv_export, unzip, encoder_tones, mode_indicators, marginal_tone, accuray2, get_sub_tone_code_of_sentence, accumulate_tone_code_of_dataset, reshape_tokens_as_sentnece, make_tokens_from_sentence, make_features_from_tokens +import unicodedata +import zipfile, ntpath -import codecs, sys +import codecs, sys, fnmatch sys.stdin = codecs.getreader('utf8')(sys.stdin) sys.stdout = codecs.getwriter('utf8')(sys.stdout) -def unzip(input) : - return [list(li) for li in zip(*input)] - -# dataset : list((str,str)) -def getTag(dataset) : - ret = [] - buf = str() - for data in dataset : - if data[0] != token_seperator : - buf += data[1] - else : - ret.append(buf) - buf = str() - if buf : - ret.append(buf) - return ret - -def csv_export(enc, filename, gold_tokens, test_tokens): - - try : - csvfile = codecs.open(filename, 'wb') - writer = csv.writer(csvfile) - writer.writerow(["Token", "Golden Form", "Predicted Form","Golden code", "Predicted code", "Same"]) - for g, t in zip(gold_tokens, test_tokens) : - token = g[0] - golden_code = g[-1] - predicted_code = t[-1] - golden_form = enc.differential_decode(token, golden_code.decode('utf-8')) - predicted_form = enc.differential_decode(token, predicted_code.decode('utf-8')) - sameCodes = (golden_code == predicted_code) - sameForms = (golden_form == predicted_form) - - if not repr(token.encode('utf-8')) : - sameCodes = u'' - row = [\ - repr(token.encode('utf-8')), \ - repr(golden_form.encode('utf-8')), \ - repr(predicted_form.encode('utf-8')), \ - repr(golden_code, spaces=True), \ - repr(predicted_code, spaces=True), \ - sameCodes] - - writer.writerow(row) - csvfile.close() - except : - raise - print "unable to dump result in CSV file to create !" 
- -def sampling(allsents, p, ratio = 1) : - train_set, eval_set = [], [] - for i, sent in enumerate(allsents[0 : : int(1/float(ratio))]) : - p_approx = float(len(train_set) + 1) / float(len(eval_set) + len(train_set) + 1) - if p_approx <= p : - train_set.append(sent) - else: - eval_set.append(sent) - return [train_set, eval_set] - -def get_duration(t1_secs, t2_secs) : - secs = abs(t1_secs - t2_secs) - days = secs // 86400 - hours = secs // 3600 - days * 24 - minutes = secs // 60 - hours * 60 - days * 60 * 24 - secondes = int(secs) % 60 - return '{:>02.0f}:{:>02.0f}:{:>02.0f}:{:>02d}'.format(days, hours, minutes, secondes) def main(): @@ -111,187 +33,265 @@ def main(): aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None) aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true') aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true') - # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true') - aparser.add_argument('-e', '--evalsize', help='Percent of training data with respect to training and test one (default 10)', default=10) + aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true') + aparser.add_argument('-e', '--evalsize', help='Percent of training data with respect to training and test one (default 10)', default=10, type=float) + aparser.add_argument('-c', '--chunkmode', help='Chunking mode specification which is effective only for tone (default -1)', default=-1, type=int) aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order', default=None) aparser.add_argument('--select', help = 'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.', action='store_true') + + aparser.add_argument('--filtering', help = 'Experimental option', action='store_true') + aparser.add_argument('--no_decomposition', help = 'Experimental option', action='store_true') + aparser.add_argument('--diacritic_only', help = 'Experimental option', action='store_true') + aparser.add_argument('--non_diacritic_only', help = 'Experimental option', action='store_true') + aparser.add_argument('--no_coding', help = 'Experimental option', action='store_true') + aparser.add_argument('-i', '--infile' , help='Input file (.html)' , default=sys.stdin) aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout) - aparser.add_argument('-s', '--store', help='Store tagged raw data in file (.csv) for further research purpose', default=None) + aparser.add_argument('-s', '--store', help='Store evaluation resault in file (.csv) for further research purpose', default=None) args = aparser.parse_args() if args.verbose : - print args - - if args.learn and (args.pos or args.tone or args.gloss): - - if not (args.pos or args.tone or args.gloss) : - print 'Choose pos, tone, gloss or combination of them' + print 'Arguments received by script' + dico = vars(args) + for key,val in dico.items(): + typeName = type(val).__name__ + sys.stdout.write("\t{} = {} ".format(key, val)) + if val : + sys.stdout.write("({})".format(typeName)) + print "" + + if not (args.pos or args.tone or args.gloss) : + print 'Choose pos, tone, gloss' + aparser.print_help() exit(0) + if args.learn : print 'Make list of 
files' + + """ files1 = glob.iglob("../corbama/*/*.dis.html") files2 = glob.iglob("../corbama/*.dis.html") allfiles = "" for file1, file2 in zip(files1, files2): allfiles += file1+','+file2+',' + """ + allfiles = [] + for root, dirnames, filenames in os.walk('../corbama'): + for filename in fnmatch.filter(filenames, '*.dis.html'): + allfiles.append(os.path.join(root, filename)) + allsents = [] - # pour le débogage - allfiles = '../corbama/sisoko-daa_ka_kore.dis.html' + # pour le débogage rapide + # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html' - if args.tone : - try : - enc = encoder_tones() - except : - enc = None - print ("Error : unable to initialize the tone encoder !") - - print 'Open files and find features / supervision tags' - for infile in allfiles.split(','): - if(len(infile)) : - print '-', infile - sent = [] + print 'Making observation data from disambiggated corpus of which' + for infile in allfiles: + if infile : + print '\t', infile html_parser = FileParser() html_parser.read_file(infile) - for snum, sentence in enumerate(html_parser.glosses) : - for tnum, token in enumerate(sentence[2]) : - tag = '' - if token.type == 'w' or token.type == 'c': - tags = '' - if args.pos: - for ps in token.gloss.ps : tags += ps.encode('utf-8') - sent.append((token.token, tags)) - elif args.tone: - # Pourquoi ne pas apprendre la forme tonale contenant une barre veticale ? - # Parce que dans l'ensemble des corpus désambiguïsés, son occurrence est - # au dessous de 10, ce cas de figure semble trop peu fréquent pour apporter - # une réélle amélioration dans la modélisation de tonalisation. Néanmoins, - # dans la conception du cadre logiciel, rien n'interdit de l'inclure dans - # les données d'entraînement et d'en observer le apport - if '|' not in token.gloss.form : - [codes, chunks] = enc.differential_encode(token.token, token.gloss.form) - for chunk, code in zip(chunks, codes) : - try : sent.append((chunk, code.encode('utf-8'))) - except LookupError: pass - """ - elif args.gloss: - tags += token.gloss.gloss.encode('utf-8') - sent.append((token.token, tags)) - """ + sent = [] + for sentence in html_parser.glosses : + for token in sentence[2] : + if token.type == 'w' or \ + token.type == 'c': + if args.pos and not args.tone and not args.gloss : + # sent : list(str,str) + tags = '' + for ps in token.gloss.ps : + tags += ps + sent.append((token.token, tags.encode('utf-8'))) + elif args.tone and not args.pos and not args.gloss : + # sent : list(str,str) + form = token.gloss.form.split('|') + tags = form[0] + sent.append((token.token, tags.encode('utf-8'))) + elif args.gloss and not args.tone and not args.pos : + # sent : list(str,str) + tags = token.gloss.gloss + sent.append((token.token, tags.encode('utf-8'))) + else : + print ('Error : multi-modal learning is not yet be supported !') + exit() if len(sent) > 1: allsents.append(sent) sent = [] - if args.verbose and args.tone : - enc.report() - - # Constitution des ensmebles d'entraînement de d'évaluation + if args.tone and not args.no_coding : + print 'Token segmentation and tonal informaiotn compression' + enc = encoder_tones() + allsents2 = allsents + allsents = [] + for sent in allsents2 : + sent2 = [] + for token_tags in sent : + token, tags = token_tags + [codes, syllabes] = enc.differential_encode(token, tags.decode('utf-8'), args.chunkmode) + token2 = [(syllabe, code.encode('utf-8')) for syllabe, code in zip(syllabes, codes)] + sent2.append(token2) + allsents.append(sent2) + + if args.verbose : + enc.report() + + R = 1 # 1 pour la 
totalité des corpus p = (1 - args.evalsize / 100.0) - train_set, eval_set = sampling(allsents, p) - print 'Split the data in train (', len(train_set),' sentences) / test (', len(eval_set),' sentences)' + train_set, eval_set = sampling(allsents, p, R) + print 'Split the data in \t train (', len(train_set),' sentences) / test (', len(eval_set),' sentences)' - print 'Building classifier (CRF/NLTK)' + print 'Building classifier (pyCRFsuite)' # Initialization t1 = time.time() - if args.tone : - num_phases = len([False, True]) * len(mode_indicators) + if args.tone and not args.no_coding : + num_phases = 2 * len(mode_indicators) myzip = zipfile.ZipFile(args.learn + '.zip', 'w') else : num_phases = 1 - # Training + # A. Entrâinement des modèles for phase in range(num_phases) : + # A.1. Initialiser un nouveau modèle CRF tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10}) trainer = pycrfsuite.Trainer(verbose = tagger._verbose) trainer.set_params(tagger._training_options) - if num_phases > 1 : - model_name = args.learn + '.' + str(phase) - else: - model_name = args.learn - - # train_set : list(list((str,list(str)))) - for sent in train_set: - tokens = unzip(sent)[0] - labels = unzip(sent)[1] - if num_phases > 1 : - for lab in labels : - pass - labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels] - features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))] + model_name = args.learn + if args.tone and not args.no_coding : + if args.diacritic_only and (phase == 0 or phase == 1) : + continue + if args.non_diacritic_only and (phase == 2 or phase == 3) : + continue + elif args.no_decomposition and phase % len(mode_indicators) != 0 : + continue + model_name += '.' + str(phase) + + # A.2. Mettre à plat les structures de données pour préparer l'entrâinement contextuel + for sent in train_set : + if args.tone and not args.no_coding : + [tokens, labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding) + features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding) + labels = get_sub_tone_code_of_sentence(sent, phase, sel_en = args.filtering, decomposition_en = not args.no_decomposition) + labels = list(itertools.chain(*labels)) + else : + [tokens, labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding) + features = make_features_from_tokens(tokens, 0, args.tone and not args.no_coding) + trainer.append(features, labels) trainer.train(model = model_name) - if num_phases > 1 : + + if args.tone and not args.no_coding : myzip.write(model_name) os.remove(model_name) - if num_phases > 1 : + + if args.tone and not args.no_coding : myzip.close() print "... done in", get_duration(t1_secs = t1, t2_secs = time.time()) - # Evaluation + # B. Evaluation print 'Evaluating classifier' - # gold_set, predicted_set : list(list((str, str))) - # input_set, output_gold_set : list(list(str)) gold_set = eval_set - input_set = [unzip(sent)[0] for sent in gold_set] - predicted_set = [list() for sent in gold_set] - if num_phases > 1 : + + if args.tone and not args.no_coding : myzip = zipfile.ZipFile(args.learn + '.zip', 'r') - for phase in range(num_phases) : + predicted_set_acc = list() + for phase in range(num_phases) : + + # B.1. 
Charger le modèle CRF pour une des quatre phases d'annoation tonale + tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10}) + trainer = pycrfsuite.Trainer(verbose = tagger._verbose) + trainer.set_params(tagger._training_options) + model_basename = '' + for m in myzip.namelist() : + if m.endswith(str(phase)): + model_basename = m + break + if not model_basename : + continue + if args.diacritic_only and (phase == 0 or phase == 1) : + continue + if args.non_diacritic_only and (phase == 2 or phase == 3): + continue + elif args.no_decomposition and phase % len(mode_indicators) != 0 : + continue + + myzip.extract(model_basename) + tagger.set_model_file(model_basename) + os.remove(model_basename) + + # B.2 Annotation automatique syllabe par syllabe pour une phrase + predicted_set = list() + for p, sent in enumerate(gold_set) : + + [tokens, gold_labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding) + features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding) + labels = tagger._tagger.tag(features) + labels = reshape_tokens_as_sentnece(labels, sent) + + predicted_tokens = list() + for i, token in enumerate(sent) : + predicted_tokens.append(map(list, zip(tokens[i], labels[i]))) + predicted_set.append(predicted_tokens) + + # B.3 Accumuler en ordonner l'annotation syllabique + if not predicted_set_acc : + predicted_set_acc = \ + [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set] + + predicted_set_acc = accumulate_tone_code_of_dataset (predicted_set_acc, predicted_set) + + predicted_set = predicted_set_acc + + + else : + # B.1. Charger le modèle CRF pour l'annoation tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10}) trainer = pycrfsuite.Trainer(verbose = tagger._verbose) trainer.set_params(tagger._training_options) - if num_phases > 1: - model_name = args.learn + '.' + str(phase) - myzip.extract(model_name) - else : - model_name = args.learn + model_name = args.learn tagger.set_model_file(model_name) - for i, sent in enumerate(input_set) : - features = [_get_features_customised_for_tones(sent,j) for j in range(len(sent))] + + # B.2. 
Annotation automatique token par token + predicted_set = list() + for sent in gold_set : + [tokens, gold_labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding) + features = make_features_from_tokens(tokens, 0, args.tone and not args.no_coding) labels = tagger._tagger.tag(features) - if num_phases > 1 : - labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels] - tagged_sent = list(zip(sent, labels)) - if not predicted_set[i] : - predicted_set[i] = tagged_sent - else : - sent_acc, labels_acc = unzip(predicted_set[i]) - labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)] - predicted_set[i] = list(zip(sent_acc, labels_acc)) - if num_phases > 1 : - os.remove(model_name) - myzip.close() - - # gold_tokens, predicted_tokens : list((str,str)) - predicted_tokens = list(itertools.chain(*predicted_set)) - if num_phases > 1 : - predicted_tokens = [ tuple([pair[0], code_resort(pair[1].decode('utf-8')).encode('utf-8')]) for pair in predicted_tokens] - gold_tokens = list(itertools.chain(*gold_set)) - # gold_tokens_eval, predicted_tokens_eval : list(str) - if args.tone : - gold_tokens_eval = getTag(gold_tokens) - predicted_tokens_eval = getTag(predicted_tokens) - else : - gold_tokens_eval = gold_tokens - predicted_tokens_eval = predicted_tokens + predicted_set.append(zip(tokens, labels)) - if args.store and args.tone : - stored_filename = args.store - csv_export(enc, stored_filename, gold_tokens, predicted_tokens) - print "Exactitude : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval)) + if args.tone and not args.no_coding : + # on ajuste l'évaluation dans les cas d'apprentissage partiel + # en nous proposant de filtrer les caractères ignorés par l'apprentissage + # sinon, nous obtiendrons un résultat pénalisé + # en voulant comparer une forme prédite partiellement à la forme tonale intégrale d'un même token + if args.diacritic_only : + gold_set = apply_filter_to_base_element(gold_set, [2,3], sel_en = args.filtering, decomposition_en = not args.no_decomposition) + elif args.non_diacritic_only : + gold_set = apply_filter_to_base_element(gold_set, [0,1], sel_en = args.filtering, decomposition_en = not args.no_decomposition) + elif args.filtering : + gold_set = apply_filter_to_base_element(gold_set, [0,1,2,3], sel_en = args.filtering, decomposition_en = not args.no_decomposition) + + """ + if args.verbose : + verify(gold_set) + """ + + print "Accuracy : {:>5.3f}".format(accuray2(gold_set, predicted_set, args.tone and not args.no_coding)) + + if args.store : + stored_filename = args.store + csv_export(stored_filename, gold_set, predicted_set, args.tone and not args.no_coding) if args.verbose and args.store : print ("Tagged result is exported in {}".format(args.store)) elif args.disambiguate and args.infile and args.outfile : - # Lecture de texte en .HTML + html_parser = FileParser() tagger = CRFTagger() @@ -307,35 +307,92 @@ def main(): print "Error : unable to open the input file {} !".format(args.infile) exit(1) - # Exportation du résultat de désambiguïsation en .HTML for snum, sentence in enumerate(html_parser.glosses) : tokens = [token.token for token in sentence[2]] - features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))] + features = [get_features_customised(tokens, i) for i in range(len(tokens))] tagger._tagger.set(features) for tnum, token in enumerate(sentence[2]) : options = list() if token.value and len(token.value) > 2: for nopt, option in enumerate(token.value[2]) 
:
                         try: tag = option.ps[0]
-                        except IndexError : tag = ''
-                        prob = tagger._tagger.marginal(tag, tnum)
+                        except : tag = ''
+                        try:
+                            prob = tagger._tagger.marginal(tag, tnum)
+                        except :
+                            prob = 0.0
                         options.append((prob, option))
-                reordered_probs, reordered_options = unzip(sorted(options, reverse = True))
+                reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
                 if args.select :
                     prob_max = reordered_probs[0]
                     reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+
                 html_parser.glosses[snum][1][tnum] = reordered_options
 
-        elif args.tone :
-            pass
+        elif args.tone and not args.no_coding :
+            try :
+                html_parser.read_file(args.infile)
+            except IOError:
+                print "Error : unable to open the input file {} !".format(args.infile)
+                exit(1)
+            try :
+                myzip = zipfile.ZipFile(args.disambiguate, 'r')
+            except IOError:
+                print "Error : unable to open the model file {} !".format((args.disambiguate + '.zip'))
+                exit(1)
 
-        try : html_parser.write(args.outfile)
-        except IOError: print "Error : unable to create the output file {}".format(args.outfile)
+            num_phases = 2 * len(mode_indicators)
+            taggers = []
+            enc = encoder_tones()
+            for phase in range(num_phases) :
+                taggers.append(CRFTagger())
+                model_basename = ''
+                for m in myzip.namelist() :
+                    if m.endswith(str(phase)):
+                        model_basename = m
+                        break
+                if not model_basename :
+                    continue
+                if args.diacritic_only and (phase == 0 or phase == 1) :
+                    continue
+                if args.non_diacritic_only and (phase == 2 or phase == 3):
+                    continue
+                elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+                    continue
+                myzip.extract(model_basename)
+                taggers[phase].set_model_file(model_basename)
+                os.remove(model_basename)
+            myzip.close()
 
-        else :
-            aparser.print_help()
+            for snum, sentence in enumerate(html_parser.glosses) :
+                tokens = [enc.differential_encode(token.token, token.token, args.chunkmode)[1] for token in sentence[2]]
+                for phase in range(num_phases) :
+                    features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+                    if taggers[phase]._model_file :
+                        taggers[phase]._tagger.set(features)
+                for tnum, token in enumerate(sentence[2]) :
+                    options = list()
+                    if token.value and len(token.value) > 2:
+                        for nopt, option in enumerate(token.value[2]) :
+                            try: tag = option.form.encode('utf-8')
+                            except : tag = ''
+                            # def marginal_tone(taggers, tnum, tokens, tag, token, chunk_mode, sel_en, decomposition_en)
+                            prob = marginal_tone(taggers, tnum, tokens, tag, token.token, chunk_mode = args.chunkmode, sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+                            options.append((prob, option))
+                        reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
+                        if args.select :
+                            prob_max = reordered_probs[0]
+                            reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+                        html_parser.glosses[snum][1][tnum] = reordered_options
+            try :
+                html_parser.write(args.outfile)
+                print "Disambiguated result for {} is saved in {}".format(args.infile,args.outfile)
+            except IOError: print "Error : unable to create the output file {} !".format(args.outfile)
+
+        else :
+            aparser.print_help()
 
     exit(0)
diff --git a/doc/samples/bamana.gram.txt b/doc/samples/bamana.gram.txt
index 7de06d1..d0afc90 100644
--- a/doc/samples/bamana.gram.txt
+++ b/doc/samples/bamana.gram.txt
@@ -41,111 +41,111 @@ return if parsed
 
 section inflection
 
 # verbal inflection
 # -la/-na PROG
-pattern :v: [ {@nasal-v@|na}:: ] | 
:v: [:v: :mrph:PROG] -pattern :v: [ {@nonnasal-v@|la}:: ] | :v: [:v: :mrph:PROG] +pattern :v: [{@nasal-v@|na}::] | :v: [:v: :mrph:PROG] +pattern :v: [{@nonnasal-v@|la}::] | :v: [:v: :mrph:PROG] # moved up from v_vq_derivation because of na/la ambiguity -pattern :n: [ {@smth-nasal@|na}:: ] | :n: [ :v: :mrph:AG.PRM] -pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :v: :mrph:AG.PRM] +pattern :n: [{@smth-nasal@|na}::] | :n: [:v: :mrph:AG.PRM] +pattern :n: [{@nonnasal-v@|la}::] | :n: [:v: :mrph:AG.PRM] # -ra/-la/-na PFV.INTR -pattern :v: [ {@nasal-syl@|n[a']}:: ] | :v: [:v: :mrph:PFV.INTR] -pattern :v: [ {@glide-syl@|l[a']}:: ] | :v: [:v: :mrph:PFV.INTR] -pattern :v: [ {@nonnasalglide-syl@|r[a']}:: ] | :v: [:v: :mrph:PFV.INTR] +pattern :v: [{@nasal-syl@|n[a']}::] | :v: [:v: :mrph:PFV.INTR] +pattern :v: [{@glide-syl@|l[a']}::] | :v: [:v: :mrph:PFV.INTR] +pattern :v: [{@nonnasalglide-syl@|r[a']}::] | :v: [:v: :mrph:PFV.INTR] # nominal inflection # -w PL -pattern :n/adj/dtm/prn/ptcp/n.prop/num: [ {|w}:: ] | :n/adj/dtm/prn/ptcp/n.prop/num: [:n/adj/dtm/prn/ptcp/n.prop/num: :mrph:PL] +pattern :n/adj/dtm/prn/ptcp/n.prop/num: [{|w}::] | :n/adj/dtm/prn/ptcp/n.prop/num: [:n/adj/dtm/prn/ptcp/n.prop/num: :mrph:PL] # participles section participles -pattern :v/ptcp: [ {|bali}:: ] | :ptcp: [ :v: :mrph:PTCP.PRIV] -pattern :v/ptcp: [ {|ta}:: ] | :ptcp: [ :v: :mrph:PTCP.POT] -pattern :v/ptcp: [ {|tɔ}:: ] | :ptcp: [ :v: :mrph:CONV.PROG] -pattern :v/ptcp: [ {@smth-nasal@|nen}:: ] | :ptcp: [ :v: :mrph:PTCP.RES] -pattern :v/ptcp: [ {@nonnasal-v@|len}:: ] | :ptcp: [ :v: :mrph:PTCP.RES] +pattern :v/ptcp: [{|bali}::] | :ptcp: [:v: :mrph:PTCP.PRIV] +pattern :v/ptcp: [{|ta}::] | :ptcp: [:v: :mrph:PTCP.POT] +pattern :v/ptcp: [{|tɔ}::] | :ptcp: [:v: :mrph:CONV.PROG] +pattern :v/ptcp: [{@smth-nasal@|nen}::] | :ptcp: [:v: :mrph:PTCP.RES] +pattern :v/ptcp: [{@nonnasal-v@|len}::] | :ptcp: [:v: :mrph:PTCP.RES] -pattern :v/ptcp: [ {@smth-nasal@|nen|ba}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:AUGM] -pattern :v/ptcp: [ {@nonnasal-v@|len|ba}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:AUGM] +pattern :v/ptcp: [{@smth-nasal@|nen|ba}::] | :ptcp: [:v: :mrph:PTCP.RES :mrph:AUGM] +pattern :v/ptcp: [{@nonnasal-v@|len|ba}::] | :ptcp: [:v: :mrph:PTCP.RES :mrph:AUGM] # derivative forms we need to consider even if we have them in dictionary section common_derivation -pattern :ptcp/n/adj: [ {|nin}:: ] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:DIM] -pattern :n/adj/ptcp/v: [ {|ya}:: ] | :n: [:n/adj/ptcp/v: :mrph:ABSTR] -pattern :ptcp/n/adj: [ {|ba}:: ] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:AUGM] +pattern :ptcp/n/adj: [{|nin}::] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:DIM] +pattern :n/adj/ptcp/v: [{|ya}::] | :n: [:n/adj/ptcp/v: :mrph:ABSTR] +pattern :ptcp/n/adj: [{|ba}::] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:AUGM] # to handle -baliya ex: basigi.bali.ya -pattern :v/ptcp/n/adj: [ {|bali|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.PRIV :mrph:ABSTR] +pattern :v/ptcp/n/adj: [{|bali|ya}::] | :n: [:v/ptcp: :mrph:PTCP.PRIV :mrph:ABSTR] # common nominal/verbal derivation (locatives) section n_v_derivation -pattern :n/n.prop: [ {|ka}:: ] | :n/n.prop: [:n/n.prop: :mrph:GENT] -pattern :n/n.prop: [ {@nasal-v@|na}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] -pattern :n/n.prop: [ {@nonnasal-v@|la}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC] +pattern :n/n.prop: [{|ka}::] | :n/n.prop: [:n/n.prop: :mrph:GENT] +pattern :n/n.prop: [{@nasal-v@|na}::] | :n/n.prop: [:n/n.prop: :mrph:LOC] +pattern :n/n.prop: [{@nonnasal-v@|la}::] | :n/n.prop: [:n/n.prop: :mrph:LOC] # nominal derivation section n_derivation 
-pattern :n: [ {@smth-nasal@|nama}:: ] | :n: [ :n: :mrph:STAT] -pattern :n: [ {@nonnasal-v@|lama}:: ] | :n: [ :n: :mrph:STAT] -pattern :n: [ {|ma}:: ] | :n: [ :n: :mrph:COM] -pattern :adj/n: [ {|ntan}:: ] | :adj/n: [ :n: :mrph:PRIV] -pattern :adj/n: [ {|bagatɔ}:: ] | :adj/n: [ :n: :mrph:ST] -pattern :adj/n: [ {|baatɔ}:: ] | :adj/n: [ :n: :mrph:ST] -pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ :prn:RECP :n: ] -pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :n: :prn:RECP] +pattern :n: [{@smth-nasal@|nama}::] | :n: [:n: :mrph:STAT] +pattern :n: [{@nonnasal-v@|lama}::] | :n: [:n: :mrph:STAT] +pattern :n: [{|ma}::] | :n: [:n: :mrph:COM] +pattern :adj/n: [{|ntan}::] | :adj/n: [:n: :mrph:PRIV] +pattern :adj/n: [{|bagatɔ}::] | :adj/n: [:n: :mrph:ST] +pattern :adj/n: [{|baatɔ}::] | :adj/n: [:n: :mrph:ST] +pattern :n: [{ɲɔgɔn|}::] | :n: [:prn:RECP :n:] +pattern :n: [{|ɲwaa?n}::] | :n: [:n: :prn:RECP] # verbal/vq derivation section v_vq_derivation -pattern :n: [ {@smth-nasal@|nan}:: ] | :n: [ :v: :mrph:INSTR] -pattern :n: [ {@nonnasal-v@|lan}:: ] | :n: [ :v: :mrph:INSTR] -pattern :n: [ {@smth-nasal@|ni}:: ] | :n: [ :v: :mrph:NMLZ] -pattern :n: [ {@nonnasal-v@|li}:: ] | :n: [ :v: :mrph:NMLZ] -pattern :n: [ {|baga}:: ] | :n: [ :v: :mrph:AG.OCC] -pattern :n: [ {|baa}:: ] | :n: [ :v: :mrph:AG.OCC] -pattern :n: [ {|baga|nci}:: ] | :n: [ :v: :mrph:AG.OCC :mrph:AG.EX] -pattern :n: [ {|baa|nci}:: ] | :n: [ :v: :mrph:AG.OCC :mrph:AG.EX] +pattern :n: [{@smth-nasal@|nan}::] | :n: [:v: :mrph:INSTR] +pattern :n: [{@nonnasal-v@|lan}::] | :n: [:v: :mrph:INSTR] +pattern :n: [{@smth-nasal@|ni}::] | :n: [:v: :mrph:NMLZ] +pattern :n: [{@nonnasal-v@|li}::] | :n: [:v: :mrph:NMLZ] +pattern :n: [{|baga}::] | :n: [:v: :mrph:AG.OCC] +pattern :n: [{|baa}::] | :n: [:v: :mrph:AG.OCC] +pattern :n: [{|baga|nci}::] | :n: [:v: :mrph:AG.OCC :mrph:AG.EX] +pattern :n: [{|baa|nci}::] | :n: [:v: :mrph:AG.OCC :mrph:AG.EX] # attempt to handle -likɛ, -likɛla, others like -liwari... 
-pattern :v: [ {@nonnasal-v@|li|kɛ}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire] -pattern :n: [ {@nonnasal-v@|li|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] -pattern :v: [ {@smth-nasal@|ni|kɛ}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire] -pattern :n: [ {@smth-nasal@|ni|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] -pattern :n: [ {@nonnasal-v@|li|wari}:: ] | :n: [ :v: :mrph:NMLZ :n:] -pattern :n: [ {@smth-nasal@|ni|wari}:: ] | :n: [ :v: :mrph:NMLZ :n:] -pattern :n: [ {@nonnasal-v@|li|fɛn}:: ] | :n: [ :v: :mrph:NMLZ fɛ́n:n:chose] -pattern :n: [ {@smth-nasal@|ni|fɛn}:: ] | :n: [ :v: :mrph:NMLZ fɛ́n:n:chose] -pattern :n: [ {@nonnasal-v@|li|ko}:: ] | :n: [ :v: :mrph:NMLZ kó:n:affaire] -pattern :n: [ {@smth-nasal@|ni|ko}:: ] | :n: [ :v: :mrph:NMLZ kó:n:affaire] +pattern :v: [{@nonnasal-v@|li|kɛ}::] | :v: [:v: :mrph:NMLZ kɛ́:v:faire] +pattern :n: [{@nonnasal-v@|li|kɛ|la}::] | :n: [:v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] +pattern :v: [{@smth-nasal@|ni|kɛ}::] | :v: [:v: :mrph:NMLZ kɛ́:v:faire] +pattern :n: [{@smth-nasal@|ni|kɛ|la}::] | :n: [:v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM] +pattern :n: [{@nonnasal-v@|li|wari}::] | :n: [:v: :mrph:NMLZ :n:] +pattern :n: [{@smth-nasal@|ni|wari}::] | :n: [:v: :mrph:NMLZ :n:] +pattern :n: [{@nonnasal-v@|li|fɛn}::] | :n: [:v: :mrph:NMLZ fɛ́n:n:chose] +pattern :n: [{@smth-nasal@|ni|fɛn}::] | :n: [:v: :mrph:NMLZ fɛ́n:n:chose] +pattern :n: [{@nonnasal-v@|li|ko}::] | :n: [:v: :mrph:NMLZ kó:n:affaire] +pattern :n: [{@smth-nasal@|ni|ko}::] | :n: [:v: :mrph:NMLZ kó:n:affaire] # need to handle -ba AUGM inside ex: ko.jugu.ba.kɛ.la -pattern :n: [ {|ɲɔgɔn}:: ] | :n: [ :v: :prn:RECP] -pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :v: :prn:RECP] -pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ :prn:RECP :v: ] +pattern :n: [{|ɲɔgɔn}::] | :n: [:v: :prn:RECP] +pattern :n: [{|ɲwaa?n}::] | :n: [:v: :prn:RECP] +pattern :n: [{ɲɔgɔn|}::] | :n: [:prn:RECP :v:] # vq derivation section vq_derivation -pattern :vq: [ {|ya}:: ] | :n/v: [ :vq: :mrph:DEQU] -pattern :adj: [ {|man}:: ] | :adj: [ :vq: :mrph:ADJ] +pattern :vq: [{|ya}::] | :n/v: [:vq: :mrph:DEQU] +pattern :adj: [{|man}::] | :adj: [:vq: :mrph:ADJ] # numeral derivation section num_derivation -pattern :num: [ {@nasal-v@|na}:: ] | :num: [ :num: :mrph:PRICE] -pattern :num: [ {@nonnasal-v@|la}:: ] | :num: [ :num: :mrph:PRICE] -pattern :num: [ {@nasal-v@|nan}:: ] | :num: [ :num: :mrph:ORD] -pattern :num: [ {[0-9]+|nan}:: ] | :num: [ :num: :mrph:ORD] -pattern :num: [ {@nonnasal-v@|lan}:: ] | :num: [ :num: :mrph:ORD] +pattern :num: [{@nasal-v@|na}::] | :num: [:num: :mrph:PRICE] +pattern :num: [{@nonnasal-v@|la}::] | :num: [:num: :mrph:PRICE] +pattern :num: [{@nasal-v@|nan}::] | :num: [:num: :mrph:ORD] +pattern :num: [{[0-9]+|nan}::] | :num: [:num: :mrph:ORD] +pattern :num: [{@nonnasal-v@|lan}::] | :num: [:num: :mrph:ORD] ## reduplication section reduplication -pattern :v: [ {(?P.+)|(?P=stem)}:: ] | :v: [ :v: :v: ] -pattern :adj: [ {(?P.+)|(?P=stem)}:: ] | :adj: [ :adj: :adj: ] -pattern :num: [ {(?P.+)|(?P=stem)}:: ] | :num: [ :num: :num: ] -pattern :v: [ {(?P.+)|-|(?P=stem)}:: ] | :v: [ :v: :: :v: ] -pattern :adj: [ {(?P.+)|-|(?P=stem)}:: ] | :adj: [ :adj: :: :adj: ] -pattern :num: [ {(?P.+)|-|(?P=stem)}:: ] | :num: [ :num: :: :num: ] -pattern :v: [ {(?P.+)|(?P=stem)|(?P=stem)}:: ] | :v: [ :v: :v: :v: ] -pattern :adj: [ {(?P.+)|(?P=stem)|(?P=stem)}:: ] | :adj: [ :adj: :adj: :adj: ] +pattern :v: [{(?P.+)|(?P=stem)}::] | :v: [:v: :v:] +pattern :adj: [{(?P.+)|(?P=stem)}::] | :adj: [:adj: :adj:] +pattern :num: [{(?P.+)|(?P=stem)}::] | :num: 
diff --git a/exp_accuracy_vs_evalsize.sh b/exp_accuracy_vs_evalsize.sh
new file mode 100644
index 0000000..b333a5a
--- /dev/null
+++ b/exp_accuracy_vs_evalsize.sh
@@ -0,0 +1,29 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_evalsize_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="--filtering --diacritic_only"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for evalsize in 10 20 30 40 50 60 70 80 90
+do
+VAR_OPTS="-e $evalsize -s "$NOM"_evalsize_"$evalsize".csv"
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
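Each pass of this loop appends to the same $NOM.log, so the accuracy figures for the different evaluation-set sizes have to be mined back out before plotting. A rough post-processing sketch; it assumes (from the filter above) that the kept status lines containing "accuracy" end with the value of interest:

    import re, sys

    fp_pat = re.compile(r"[-+]?[0-9]+\.?[0-9]*")

    for line in open(sys.argv[1]):       # the generated $NOM.log
        if "accuracy" in line.lower():
            nums = fp_pat.findall(line)
            if nums:
                print nums[-1]           # last number on each accuracy line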
diff --git a/exp_accuracy_vs_segmentation_type.sh b/exp_accuracy_vs_segmentation_type.sh
new file mode 100644
index 0000000..3ff4a72
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --filtering --diacritic_only"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_decomposition_A.sh b/exp_accuracy_vs_segmentation_type_no_decomposition_A.sh
new file mode 100644
index 0000000..e8e4322
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_decomposition_A.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_decomposition_A_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --filtering --diacritic_only --no_decomposition"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_decomposition_B.sh b/exp_accuracy_vs_segmentation_type_no_decomposition_B.sh
new file mode 100644
index 0000000..53355e4
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_decomposition_B.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_decomposition_B_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --filtering --diacritic_only --no_decomposition"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_filter.sh b/exp_accuracy_vs_segmentation_type_no_filter.sh
new file mode 100644
index 0000000..4be8c9c
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_filter.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_filter_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --diacritic_only"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_filter_no_decomposition.sh b/exp_accuracy_vs_segmentation_type_no_filter_no_decomposition.sh
new file mode 100644
index 0000000..75b87b7
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_filter_no_decomposition.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_filter_no_decomposition_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --diacritic_only --no_decomposition"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/fouille_erreurs.py b/fouille_erreurs.py
new file mode 100644
index 0000000..34dec82
--- /dev/null
+++ b/fouille_erreurs.py
@@ -0,0 +1,166 @@
+#coding=utf-8
+
+import re, argparse, sys, codecs
+from collections import Counter
+markers_tone=[unichr(0x0301),unichr(0x0300),unichr(0x0302),unichr(0x030c)]
+
+def stat_from_cnt (cnt) :
+
+    tot = sum(cnt.values())
+    for k in ['E_c','E_p','E_pc','E_noise','E_silence'] :
+        print k,str(round(cnt[k] / float(tot) * 100, 2))+"%"
+
+def rm_deletion (str_in) :
+
+    str_out = ''
+    record = True
+    for c in str_in :
+        if record :
+            if c != '-':
+                str_out += c
+            else :
+                record = False
+        else :
+            if c == '+':
+                str_out += c
+                record = True
+
+    return str_out
+
+def line2entries (str_in, min_len = 6, max_len = 6) :
+
+    regex = re.compile('[\n\r]')
+    str_in = regex.sub('', str_in)
+    entries = str_in.strip().split(',')
+    if len(entries) < min_len or len(entries) > max_len : entries = None
+    return entries
+
+def align(dico, lst1, lst2, position_to_match = -1) :
+
+    if lst2 == [u"NULL"] :
+        return dico
+    if not lst1 :
+        return dico
+
+    for i, x in enumerate(lst1) :
+        for j, y in enumerate(lst2) :
+            if not dico : cond = True
+            else : cond = (i not in dico.keys()) and (j not in dico.values())
+            if cond :
+                if position_to_match < 0 :
+                    dico[i] = j
+                    break
+                elif x[position_to_match] == y[position_to_match]:
+                    dico[i] = j
+                    break
+    return dico
+
+def make_aligned_result(match, ops1, ops2) :
+
+    dst = []
+    for i, x in enumerate(ops1) :
+        if i in match.keys() :
+            dst.append(ops2[match[i]])
+        else :
+            dst.append('')
+
+    return dst
+
+def stat (str1, str2, cnt, cnt2) :
+
+    # split each entry into its edit operations
+    ops1 = [x for x in str1.split('+') if x]
+    ops2 = [x for x in str2.split('+') if x]
+
+    # greedy alignment, not very efficient
+    match = dict()
+    match = align(match, ops1, ops2, 0)
+    match = align(match, ops1, ops2, 1)
+    match = align(match, ops1, ops2)
+    dst = make_aligned_result(match, ops1, ops2)
+    src = ops1
+
+    # statistics : each op is (p, c) = (position, character)
+    for op, op2 in zip(src, dst) :
+        if op[1] not in markers_tone :
+            continue
+        if not op2 : # silence
+            tag = '3_err_silence'
+            tag2 = ''
+        elif op == op2 : # perfect
+            tag = '4_good'
+            tag2 = op[1]
+        elif op[0] == op2[0] : # E_c - E_p
+            tag = '0_err_c'
+            tag2 = op[1] + u'___' + op2[1]
+        elif op[1] == op2[1] : # E_p - E_c
+            tag = '1_err_p'
+            tag2 = ''
+        else : # E_p inter E_c
+            tag = '2_err_others'
+            tag2 = ''
+
+        cnt[tag] += 1
+        if tag2 : cnt2[tag2] += 1
+
+    return [cnt,cnt2]
+
+def main(infile) :
+
+    cnt = Counter()
+    cnt2 = Counter()
+    with codecs.open(infile,'r', encoding='utf-8') as f :
+        for line in f:
+
+            cols = line2entries(line)
+            if cols :
+                token, gold_form, test_form, gold_code, test_code, cmp = cols
+                gold_code_segments = gold_code.split()
+                test_code_segments = test_code.split()
+                for gold_code_segment, test_code_segment in zip(gold_code_segments, test_code_segments) :
+                    gold_code_segment = rm_deletion(gold_code_segment)
+                    test_code_segment = rm_deletion(test_code_segment)
+                    if gold_code_segment != "NULL" and gold_code_segment :
+                        cnt,cnt2 = stat(gold_code_segment, test_code_segment, cnt,cnt2)
+    return cnt,cnt2
+
+def print_cnt (cnt, mode) :
+
+    tot = float(sum(cnt.values()))
+
+    if mode == 0:
+        for k in sorted(cnt.keys()) :
+            print u"{:16s} = {:05.4f}".format(k, cnt[k] / tot)
+    else :
+        # horizontal label
+        sys.stdout.write(u"{:5s} ".format(''))
+        for k2 in markers_tone :
+            sys.stdout.write(u"{:>5s} ".format(k2))
+        print ""
+
+        for k1 in markers_tone :
+            for i,k2 in enumerate(markers_tone) :
+                tag = k1 + u'___' + k2
+                if k1 == k2 and k1 in cnt.keys() : val = cnt[k1] / tot
+                elif k1 != k2 and tag in cnt.keys() : val = cnt[tag] / tot
+                else : val = 0
+                # vertical label
+                if not i : sys.stdout.write(u"{:>5s} & ".format(k1))
+                # matrix content
+                if i == len(markers_tone) - 1 : c = '\\\\'
+                else : c = '&'
+                sys.stdout.write(u"{:5.4f} {:1s} ".format(val,c))
+            print ""
+
+if __name__ == "__main__" :
+
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument('infile' , help='Input file (.csv)' , default=sys.stdin)
+    args = aparser.parse_args()
+    cnt,cnt2 = main(args.infile)
+    print args.infile
+    print_cnt(cnt,0)
+    print ""
+    print_cnt(cnt2,1)
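A toy run of the error-mining core above. The "+<position><character>" layout of an edit operation is an assumption read off the way stat() indexes op[0] and op[1]; the combining accents are the markers_tone entries:

    # -*- coding: utf-8 -*-
    from collections import Counter
    from fouille_erreurs import rm_deletion, stat

    gold = u"+0\u0301+1n"   # gold code : insert an acute accent at 0, then "n" at 1
    test = u"+0\u0300+1n"   # predicted code : a grave accent instead of the acute

    cnt, cnt2 = stat(rm_deletion(gold), rm_deletion(test), Counter(), Counter())
    print cnt    # Counter({'0_err_c': 1}) : right position, wrong tone marker
    print cnt2   # one acute___grave confusion, feeding the matrix of print_cnt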
diff --git a/kill_all_exps.sh b/kill_all_exps.sh
new file mode 100644
index 0000000..bce7615
--- /dev/null
+++ b/kill_all_exps.sh
@@ -0,0 +1,7 @@
+#! /bin/sh
+
+killall tail
+killall gawk
+killall Python
+killall python
+killall bash
diff --git a/launch_all_exps.sh b/launch_all_exps.sh
new file mode 100644
index 0000000..54a520b
--- /dev/null
+++ b/launch_all_exps.sh
@@ -0,0 +1,10 @@
+#! /bin/sh
+
+#set -vx
+
+for f in exp*.sh ; do
+    bash "$f" &
+done
+
+sleep 5
+tail -f *.log
diff --git a/models/model_pos_exactitude_92p3 b/models/model_pos_exactitude_92p3
new file mode 100644
index 0000000..3ac9217
Binary files /dev/null and b/models/model_pos_exactitude_92p3 differ
diff --git a/models/model_tone_exactitude_92p1.zip b/models/model_tone_exactitude_92p1.zip
new file mode 100644
index 0000000..df74bfd
Binary files /dev/null and b/models/model_tone_exactitude_92p1.zip differ
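The tone model is committed as a .zip, presumably to keep the binary small in the repository (an assumption); it has to be unpacked before a tagger can load it. A minimal stdlib sketch:

    import zipfile

    # unpacked model lands in models/, next to model_pos_exactitude_92p3
    with zipfile.ZipFile("models/model_tone_exactitude_92p1.zip") as z:
        z.extractall("models/")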