diff --git a/differential_tone_coding.py b/differential_tone_coding.py
index 2ba78ee..67eacd2 100755
--- a/differential_tone_coding.py
+++ b/differential_tone_coding.py
@@ -6,21 +6,316 @@
import Levenshtein
from syllables import syllabify, vowels
import re
+import itertools
+import csv
+import codecs
# Installation of prerequisites
# sudo pip install python-Levenshtein
# Constant lists
markers_tone = [unichr(0x0300),unichr(0x0301),unichr(0x0302),unichr(0x030c)]
-token_seperator = u'_'
code_seperator = u'_'
mode_indicators = u'+-'
mode_names = [u"insert",u"delete"]
-markers_to_be_ignored = u"[].-" + code_seperator
-markers_to_be_replaced = {u"’":u"'"}
+markers_to_be_ignored = u"." # u"[].-" + code_seperator
+markers_to_be_replaced = dict() # {u"’":u"'"}
-# todo : decomposition en opérations - opérands
-def code_dispatcher(code) :
+def apply_filter_to_base_element(x, sets, sel_en, decomposition_en, show_approx_err = False) :
+
+ if isinstance(x, tuple) :
+ return (x[0], filter(x[0], x[1].decode("utf-8"), sets, sel_en, decomposition_en, show_approx_err).encode("utf-8"))
+ else :
+ return [apply_filter_to_base_element(element, sets, sel_en, decomposition_en, show_approx_err) for element in x]
+
+
+def filter(token, tag, sets, sel_en, decomposition_en, show_approx_err = False) :
+
+ subcodes = code_dispatcher(tag, sel_en, decomposition_en)
+ code2 = code_seperator.encode("utf-8").join([subcode for p, subcode in enumerate(subcodes) if (p in sets and subcode)])
+ ret = code_resort(code2)
+
+ if ret != tag and set(sets) == {0,1,2,3} and show_approx_err :
+ # An approximation price to pay :
+ # our decomposition of the CRF does lose the relative order of characters
+ # (of different types) inserted at the same position of the original token.
+ # Although successively inserting a non-diacritic character and a diacritic
+ # at the same position of a token is very rare, it can happen, and it then
+ # produces an observable error if we uncomment the lines of code below.
+ # For example,
+ # tag_gold = +_0_ɔ_+_0_̀_+_0_n_-_0_a
+ # tag_reconstitued = +_0_ɔ_+_0_n_+_0_̀_-_0_a
+ # As a reminder, this error is counted in the evaluation, and curiously no
+ # considerable degradation of accuracy was observed. This may be explained by
+ # the fact that a successive insertion is often hard to learn in itself (a
+ # hypothesis still to verify), so losing the order does not make this case
+ # quantitatively more catastrophic.
+ print "Case of modeling error inherent to decomposition hypothesis : \n", ret, tag
+
+ return ret
+
+
+def verify(x, sel_en, decomposition_en) :
+ apply_filter_to_base_element(x, [0,1,2,3], sel_en, decomposition_en, show_approx_err=True)
+
+
+def split2 (str_in, seperator) :
+
+ buf = ''
+ ret = []
+ for c in str_in :
+ if c != seperator :
+ buf += c
+ else :
+ if buf :
+ ret.append(buf)
+ buf = ''
+ else :
+ buf += c
+ if buf :
+ ret.append(buf)
+ return ret
+
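+# A minimal sketch of how split2 differs from unicode.split (illustrative
+# values only) : a separator that follows an empty buffer is kept as a
+# literal character, so escaped separators survive as a segment.
+#   split2(u'+_2__', u'_') -> [u'+', u'2', u'_']
+#   u'+_2__'.split(u'_')   -> [u'+', u'2', u'', u'']
+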
+def marginal_tone(taggers, tnum, tokens, tag, token, chunk_mode, sel_en, decomposition_en) :
+
+ enc = encoder_tones()
+ codes, syllabes = enc.differential_encode(token, tag.decode('utf-8'), chunk_mode)
+
+ k = 0
+ snums = []
+ for i, t in enumerate(tokens) :
+ for j, syllabe in enumerate(t) :
+ if i == tnum :
+ snums.append(k)
+ k += 1
+
+ if len(syllabes) != len(snums) :
+ print "Bug 3 : syllabe/position count mismatch in marginal_tone !"
+ exit()
+
+ prob_tot = 1
+ for p, tagger in enumerate(taggers) :
+ if not taggers[p]._model_file :
+ continue
+ for i in range(len(syllabes)) :
+ subcode = code_dispatcher(codes[i], sel_en, decomposition_en)[p]
+ try :
+ prob = taggers[p]._tagger.marginal(subcode.encode('utf-8'), snums[i])
+ except :
+ prob = 0.0
+ prob_tot *= prob
+
+ return prob_tot
+
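+# Note on marginal_tone above : the returned value is the product of the
+# per-phase CRF marginals over all syllables, which implicitly assumes the
+# four phases are independent ; a zero marginal in any loaded phase (or a
+# tagging exception) zeroes the whole product.
+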
+def accuray2 (dataset1, dataset2, is_tone_mode = False) :
+ cnt_success = 0
+ cnt_fail = 0
+ if not is_tone_mode :
+ for sent1, sent2 in zip(dataset1, dataset2) :
+ for token1, token2 in zip(sent1, sent2) :
+ if token1 == token2 :
+ cnt_success += 1
+ else :
+ cnt_fail += 1
+
+ else :
+ for sent1, sent2 in zip(dataset1, dataset2) :
+ for token1, token2 in zip(sent1, sent2) :
+ is_identical = True
+ for syllabe1, syllabe2 in zip(token1, token2) :
+ if syllabe1 != syllabe2 : is_identical = False ; break
+ if is_identical :
+ cnt_success += 1
+ else :
+ cnt_fail += 1
+
+ cnt_tot = cnt_success + cnt_fail
+ if not cnt_tot : return 0.0
+ else : return cnt_success / float(cnt_tot)
+
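+# A minimal usage sketch of accuray2 (illustrative values only) : each
+# dataset is a list of sentences of (token, tag) pairs in token mode.
+#   accuray2([[(u'ba', u'X')]], [[(u'ba', u'X')]]) -> 1.0
+#   accuray2([[(u'ba', u'X')]], [[(u'ba', u'Y')]]) -> 0.0
+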
+def get_sub_tone_code_of_sentence (sentence, phase, sel_en, decomposition_en) :
+ labels = list()
+ for i, token in enumerate(sentence) :
+ label = list()
+ for j, syllabe_code in enumerate(token) :
+ syllabe, code = syllabe_code
+ subcode = code_dispatcher(code.decode('utf-8'), sel_en, decomposition_en)[phase].encode('utf-8')
+ label.append(subcode)
+ labels.append(label)
+ return labels
+
+def accumulate_tone_code_of_dataset (dataset_acc, dataset) :
+ for p, sent in enumerate(dataset_acc) :
+ for i, token in enumerate(sent) :
+ for j, syllabe_tag_acc in enumerate(token) :
+ syllabe_acc, tag_acc = syllabe_tag_acc
+ syllabe, tag = dataset[p][i][j]
+ if tag_acc and tag :
+ tag_acc += code_seperator.encode('utf-8') + tag
+ else :
+ tag_acc += tag
+ dataset_acc[p][i][j] = \
+ tuple([syllabe, code_resort(tag_acc.decode('utf-8')).encode('utf-8')])
+
+ return dataset_acc
+
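+# Note on accumulate_tone_code_of_dataset above : it merges the per-phase
+# predicted sub-codes back into one code per syllable ; code_resort then
+# restores the canonical operation order expected by differential_decode.
+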
+def reshape_tokens_as_sentnece(tokens, sentence) :
+
+ ret = list()
+ n = 0
+ for i, token in enumerate(sentence) :
+ tmp = list()
+ for j, syllabe in enumerate(token) :
+ tmp.append(tokens[n])
+ n += 1
+ ret.append(tmp)
+
+ return ret
+
+def make_tokens_from_sentence(sent, is_tone_mode = False) :
+ if is_tone_mode :
+ tokens = list()
+ labels = list()
+ for token in sent :
+ tokens.append(unzip(token)[0])
+ labels.append(unzip(token)[1])
+ else :
+ tokens = unzip(sent)[0]
+ labels = unzip(sent)[1]
+
+ return [tokens, labels]
+
+def make_features_from_tokens(tokens, phase = 0, is_tone_mode = False) :
+ if is_tone_mode :
+ features_syllabe = list()
+ for i, token in enumerate(tokens) :
+ feature = list()
+ for j, syllabe_code in enumerate(token) :
+ feature.append(get_features_customised_tone(tokens, i, j, phase))
+ features_syllabe.append(feature)
+ features = list(itertools.chain(*features_syllabe))
+ else :
+ features = list()
+ for i in range(len(tokens)) :
+ features.append(get_features_customised(tokens, i))
+ return features
+
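+# Note on make_features_from_tokens above : in tone mode the per-token
+# feature lists are flattened with itertools.chain, so the CRF receives one
+# observation per syllable over the whole sentence, matching the flattened
+# label sequence produced by get_sub_tone_code_of_sentence.
+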
+def inspector_tokens(gold_tokens, predicted_tokens) :
+ for x,y in zip(gold_tokens, predicted_tokens) :
+ try :
+ if x[1] != y[1] :
+ print x[0],":",x[1].decode('utf-8'),"->",y[1].decode('utf-8') # ,"(",len(x[1]), len(y[1]),")"
+ else :
+ print "*",x[0],":",x[1].decode('utf-8'),"->",y[1].decode('utf-8') # ,"(",len(x[1]), len(y[1]),")"
+ except :
+ print type(x[0]),":",type(x[1]),"->",type(y[1])
+
+def unzip(input) :
+ return [list(li) for li in zip(*input)]
+
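+# A minimal sketch of unzip (illustrative values only) :
+#   unzip([(u'ba', u'X'), (u'na', u'Y')]) -> [[u'ba', u'na'], [u'X', u'Y']]
+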
+def csv_export(filename, gold_set, test_set, is_tone_mode = False):
+
+ if not is_tone_mode :
+ csvfile = codecs.open(filename, 'wb')
+ writer = csv.writer(csvfile)
+ writer.writerow(["Token", "Golden", "Predicted", "Same"])
+ for gold_sent, test_sent in zip(gold_set, test_set) :
+ for gold_token, test_token in zip(gold_sent, test_sent) :
+ token = gold_token[0]
+ gold_code = gold_token[1]
+ test_code = test_token[-1]
+ # print token, gold_code, test_code
+ sameCodes = (gold_code == test_code)
+
+ if not repr(token.encode('utf-8')) :
+ sameCodes = u''
+ row = [\
+ (token.encode('utf-8')), \
+ gold_code, \
+ test_code, \
+ sameCodes]
+ writer.writerow(row)
+ csvfile.close()
+ else :
+ csvfile = codecs.open(filename, 'wb')
+ writer = csv.writer(csvfile)
+ writer.writerow(["Token", \
+ "Golden Form","Predicted Form", \
+ "Golden code", "Predicted code", "Same"])
+ enc = encoder_tones()
+ for gold_sent, test_sent in zip(gold_set, test_set) :
+ for gold_token, test_token in zip(gold_sent, test_sent) :
+ gold_code = ''
+ test_code = ''
+ gold_form = ''
+ test_form = ''
+ token = ''
+ for gold_syllabe, test_syllabe in zip(gold_token, test_token) :
+ token += gold_syllabe[0] + ' '
+ if gold_syllabe[1] :
+ gold_code += gold_syllabe[1] + ' '
+ else :
+ gold_code += 'NULL' + ' '
+ if test_syllabe[1] :
+ test_code += test_syllabe[1] + ' '
+ else :
+ test_code += 'NULL' + ' '
+ gold_form += enc.differential_decode(gold_syllabe[0], gold_syllabe[1].decode('utf-8')) + ' '
+ test_form += enc.differential_decode(gold_syllabe[0], test_syllabe[1].decode('utf-8')) + ' '
+ sameCodes = (gold_code == test_code)
+ sameForms = (gold_form == test_form)
+ if not repr(token.encode('utf-8')) :
+ sameCodes = u''
+ row = [\
+ (token.encode('utf-8')), \
+ repr(gold_form.encode('utf-8')), \
+ repr(test_form.encode('utf-8')), \
+ repr(gold_code, spaces=True), \
+ repr(test_code, spaces=True), \
+ sameCodes]
+ writer.writerow(row)
+ csvfile.close()
+
+def sampling(allsents, p, ratio = 1) :
+ train_set, eval_set = [], []
+ for i, sent in enumerate(allsents[0 : : int(1/float(ratio))]) :
+ p_approx = float(len(train_set) + 1) / float(len(eval_set) + len(train_set) + 1)
+ if p_approx <= p :
+ train_set.append(sent)
+ else:
+ eval_set.append(sent)
+ return [train_set, eval_set]
+
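+# A minimal sketch of sampling (illustrative values only) : with p = 0.9 and
+# ratio = 1, about 90% of the sentences land in train_set ; ratio = 0.5 first
+# subsamples every 2nd sentence (step = int(1/ratio)) before splitting.
+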
+def get_duration(t1_secs, t2_secs) :
+ secs = abs(t1_secs - t2_secs)
+ days = secs // 86400
+ hours = secs // 3600 - days * 24
+ minutes = secs // 60 - hours * 60 - days * 60 * 24
+ secondes = int(secs) % 60
+ return '{:>02.0f}:{:>02.0f}:{:>02.0f}:{:>02d}'.format(days, hours, minutes, secondes)
+
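+# Example for get_duration (days:hours:minutes:seconds) :
+#   get_duration(0, 90061) -> '01:01:01:01'
+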
+def is_a_good_code(code) :
+
+ if not code : return True
+
+ code2 = code
+
+ # +_2__ is good, because -> + 2 _
+ if code2[-1] == code_seperator.decode('utf-8') or code2[-1] == code_seperator :
+ try :
+ if code2[-1] != code2[-2] :
+ return False
+ except IndexError:
+ return False
+
+ # code3 = code2.split(code_seperator.decode('utf-8'))
+ code3 = split2(code2,code_seperator.decode('utf-8'))
+ if len(code3) % 3 != 0 :
+ return False
+ else :
+ return True
+
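+# Illustrative checks for is_a_good_code (a sketch) : a good code is a
+# sequence of (mode, position, character) triples joined by code_seperator.
+#   is_a_good_code(u'+_0_a') -> True   # one complete triple
+#   is_a_good_code(u'+_0_')  -> False  # dangling separator
+#   is_a_good_code(u'+_2__') -> True   # split2 reads it as + / 2 / _
+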
+def code_dispatcher(code, sel_en, decomposition_en) :
lst = []
for i in mode_indicators :
@@ -28,152 +323,206 @@ def code_dispatcher(code) :
lst.append("")
if not code : return lst
- if code[-1] == code_seperator : code = code[: -1]
- code_segments = code.split(code_seperator)
- for i in range(0, len(code_segments), 3) :
- m, p, c = code_segments[i : i + 3]
- lst[mode_indicators.index(m) + len(mode_indicators) * int(c in markers_tone)] += \
+ if not is_a_good_code(code) : print "(dispatcher) input code incorrect !" ; print code ; exit()
+ #if code[-1] == code_seperator : code = code[: -1]
+ # code_segments = code.split(code_seperator)
+ code_segments = split2(code,code_seperator)
+
+ # Filtering
+ def indexing(op) :
+ m,p,c = op
+ return 2 * int(p) + mode_indicators.index(m)
+
+ ops = [code_segments[i : i + 3] for i in range(0, len(code_segments), 3)]
+
+ if sel_en :
+ ops = sorted(ops, key=lambda op : indexing(op))
+ ops2 = list()
+ i_pre = -1
+ for op in ops :
+ i = indexing(op)
+ if i > i_pre :
+ ops2.append(op)
+ i_pre = i
+ else :
+ ops2 = ops
+
+ for op in ops2 :
+ m,p,c = op
+ if decomposition_en :
+ phase = mode_indicators.index(m) + len(mode_indicators) * int(c in markers_tone)
+ else :
+ phase = len(mode_indicators) * int(c in markers_tone)
+ lst[phase] += \
u"{}{}{}{}{}{}".format(m, code_seperator, p, code_seperator, c, code_seperator)
- return lst
+ lst2 = list()
+ for element in lst :
+ try :
+ if element[-1] == code_seperator or element[-1] == code_seperator.decode('utf-8') :
+ lst2.append(element[:-1])
+ else :
+ lst2.append(element)
+ except :
+ lst2.append(element)
+
+ for code in lst2 :
+ if not is_a_good_code(code):
+ print "(dispatcher) output code incorrect !"
+ print code
+
+ return lst2
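+
+# Illustrative dispatch (a sketch ; unichr(0x0300) is the combining grave
+# accent from markers_tone). With decomposition enabled the four phases are :
+# 0 insert non-diacritic, 1 delete non-diacritic, 2 insert diacritic,
+# 3 delete diacritic.
+#   code_dispatcher(u'+_0_n_+_0_' + unichr(0x0300), sel_en=False, decomposition_en=True)
+#     -> [u'+_0_n', u'', u'+_0_' + unichr(0x0300), u'']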
def code_resort(code) :
+
ret = []
if not code : return code
- if code[-1] == code_seperator : code = code[: -1]
- code_segments = code.split(code_seperator)
+ if not is_a_good_code(code) : print "(resort) input code incorrect !" ; exit()
+ #if code[-1] == code_seperator : code = code[: -1]
+ #code_segments = code.split(code_seperator)
+ code_segments = split2(code,code_seperator)
for i in range(0, len(code_segments), 3) :
- m, p, c = code_segments[i : i + 3]
+ try :
+ m, p, c = code_segments[i : i + 3]
+ except :
+ print code
+ print code_segments
+ print "Bug 1 : malformed code segment in code_resort !"
+ exit()
+
ret.append(u"{}{}{}{}{}{}".format(m, code_seperator, p, code_seperator, c, code_seperator))
- ret = sorted(ret, key=lambda x : int(mode_indicators.index(m))+2*int(x.split(code_seperator)[1]))
+ ret = sorted(ret, key=lambda x : int(mode_indicators.index(split2(x, code_seperator)[0])) + 2 * int(split2(x, code_seperator)[1]))
ret = ''.join(ret)
if ret : ret = ret[:-1]
+ if not is_a_good_code(ret) : print ("(resort) output code incorrect !") ; exit()
+
return ret
-def _get_features_customised_for_tones(tokens, idx):
+def get_features_customised(tokens, idx):
feature_list = []
if not tokens:
return feature_list
- try :
- token = tokens[idx]
- except IndexError :
- raise
-
- # positon du syllabe actuel et préfixe et suffixe du même mot
- lst = []
- for i in range(idx, len(tokens) + 1, 1) :
- try :
- if tokens[i] == token_seperator :
- lst.append(i)
- if len(lst) >= 2 :
- break
- except IndexError :
- lst.append(i)
- break
-
- try :
- feature_list.append("SYLLABE_ID1_" + str(lst[0] - idx))
- except :
- pass
-
- try :
- feature_list.append("SUFFIXE_ACTUEL_" + tokens(lst[0] - 1))
- except :
- pass
-
- lst2 = []
- for i in range(idx, -2, -1) :
- try :
- if tokens[i] == token_seperator :
- lst2.append(i)
- if len(lst2) >= 2 :
- break
- except IndexError :
- lst2.append(i)
- break
-
- try :
- feature_list.append("SYLLABE_ID2_" + str(idx - lst2[0]))
- except :
- pass
-
- try :
- feature_list.append("PREFIXE_ACTUEL_" + tokens(lst2[0] + 1))
- except :
- pass
-
- # préfixe et suffixe du mots précédent et suivant dans la même phrase
- try :
- prefixe_du_mot_suivant = tokens[lst[0] + 1]
- feature_list.append("PREFIXE_SUIVANT_" + prefixe_du_mot_suivant)
- except IndexError :
- pass
- try :
- suffixe_du_mot_precedent = tokens[lst2[0] - 1]
- feature_list.append("SUFFIXE_PRECEDENT_" + suffixe_du_mot_precedent)
- except IndexError:
- pass
-
- try :
- suffixe_du_mot_suivant = tokens[lst[1] - 1]
- feature_list.append("SUFFIXE_SUIVANT_" + suffixe_du_mot_suivant)
- except IndexError :
- pass
- try :
- prefixe_du_mot_precedent = tokens[lst2[1] + 1]
- feature_list.append("PREFIXE_PRECEDENT_" + prefixe_du_mot_precedent)
- except IndexError :
- pass
+ token = tokens[idx]
# Capitalization
if token[0].isupper():
- feature_list.append('CAPITALIZATION')
+ feature_list.append(u'CAPITALIZATION')
# Number
if re.search(r'\d', token) is not None:
- feature_list.append('IL_Y_A_UN_CHIFFRE')
+ feature_list.append(u'IL_Y_A_UN_CHIFFRE')
# Punctuation
- punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
+ punc_cat = set([u"Pc", u"Pd", u"Ps", u"Pe", u"Pi", u"Pf", u"Po"])
if all (unicodedata.category(x) in punc_cat for x in token):
- feature_list.append('PONCTUATION_PURE')
+ feature_list.append(u'PONCTUATION_PURE')
# Voyelles
- voyelles = ""
+ voyelles = u""
for c in token :
if c.lower() in vowels:
voyelles += c
- feature_list.append('VOYELLES_'+ voyelles)
+ feature_list.append(u'VOYELLES_'+ voyelles)
# Syllabes précédent et suivant
try :
- feature_list.append('SYLLABE_PRECEDENT_' + token[idx - 1])
+ feature_list.append(u'TOKEN_PRECEDENT_' + token[idx - 1])
except IndexError :
pass
try :
- feature_list.append('SYLLABE_SUIVANT_' + token[idx + 1])
+ feature_list.append(u'TOKEN_SUIVANT_' + token[idx + 1])
except IndexError :
pass
- feature_list.append('SYLLABE_ACTUEL_' + (token))
+ feature_list.append(u'TOKEN_ACTUEL_' + (token))
# Suffix & prefix up to length 3
if len(token) > 1:
- feature_list.append('SUF_' + token[-1:])
- feature_list.append('PRE_' + token[:1])
+ feature_list.append(u'SUF_' + token[-1:])
+ feature_list.append(u'PRE_' + token[:1])
if len(token) > 2:
- feature_list.append('SUF_' + token[-2:])
- feature_list.append('PRE_' + token[:2])
+ feature_list.append(u'SUF_' + token[-2:])
+ feature_list.append(u'PRE_' + token[:2])
if len(token) > 3:
- feature_list.append('SUF_' + token[-3:])
- feature_list.append('PRE_' + token[:3])
+ feature_list.append(u'SUF_' + token[-3:])
+ feature_list.append(u'PRE_' + token[:3])
+
+ return feature_list
+
+def get_features_customised_tone(tokens, i, j, phase) :
+
+ feature_list = []
+
+ if not tokens:
+ return feature_list
+
+ try :
+ syllabes = tokens[i]
+ syllabe = syllabes[j]
+ except IndexError :
+ raise
+
+ # phases
+ feature_list.append(u'PHASE_ID_' + str(phase))
+
+ # Positions
+ feature_list.append(u'SYLLABE_ID_POSITIF_' + str(j))
+ feature_list.append(u'SYLLABE_ID_NEGATIF_' + str(len(syllabes) - j - 1))
+ feature_list.append(u'TOKEN_ID_POSITIF_' + str(i))
+ feature_list.append(u'TOKEN_ID_NEGATIF_' + str(len(tokens) - i - 1))
+
+ # Character strings at the level of the current word
+ feature_list.append(u'SYLLABE_ACTUELLE_' + syllabe)
+ feature_list.append(u'PREFIXE_ACTUEL_' + syllabes[0])
+ feature_list.append(u'SUFFIXE_ACTUEL_' + syllabes[-1])
+ try : feature_list.append(u'SYLLABE_QUI_PRECEDE_' + syllabes[j - 1])
+ except : pass
+ try : feature_list.append(u'SYLLABE_QUI_SUIT_' + syllabes[j + 1])
+ except : pass
+
+ # Character strings from the preceding word and the following one
+ try : feature_list.append(u'PREFIXE_DU_TOKEN_QUI_PRECEDE_' + tokens[i-1][0])
+ except : pass
+ try : feature_list.append(u'SUFFIXE_DU_TOKEN_QUI_PRECEDE_' + tokens[i-1][-1])
+ except : pass
+ try : feature_list.append(u'PREFIXE_DU_TOKEN_QUI_SUIT_' + tokens[i+1][0])
+ except : pass
+ try : feature_list.append(u'SUFFIXE_DU_TOKEN_QUI_SUIT_' + tokens[i+1][-1])
+ except : pass
+
+ # Character strings at the sentence level
+ feature_list.append(u'TOKEN_ACTUEL_' + ''.join(syllabes))
+ try : feature_list.append(u'TOKEN_QUI_PRECEDE_' + ''.join(tokens[i - 1]))
+ except : pass
+ try : feature_list.append(u'TOKEN_QUI_SUIT_' + ''.join(tokens[i + 1]))
+ except : pass
+
+ # Capitalization
+ if syllabe[0].isupper():
+ feature_list.append(u'CAPITALIZATION')
+
+ # Number
+ if re.search(r'\d', syllabe) is not None:
+ feature_list.append(u'IL_Y_A_UN_CHIFFRE')
+
+ # Punctuation
+ punc_cat = set([u"Pc", u"Pd", u"Ps", u"Pe", u"Pi", u"Pf", u"Po"])
+ if all (unicodedata.category(x) in punc_cat for x in syllabe):
+ feature_list.append(u'PONCTUATION_PURE')
+
+ # Voyelles
+ voyelles = u""
+ for c in syllabe :
+ if c.lower() in vowels:
+ voyelles += c
+ feature_list.append(u'VOYELLES_'+ voyelles)
return feature_list
@@ -197,11 +546,28 @@ def rm_sep(str_in, seprator_in = code_seperator, replacing = u''):
except :
raise
-def chunking (token) :
+def chunking (token, mode) :
chunks = []
- for chunk in syllabify(token)[0]:
- chunks.append(unicodedata.normalize('NFD', chunk))
+
+ if mode == 0 :
+ # no segmentation
+ chunks.append(token)
+ elif mode < 0 :
+ # syllabification
+ for chunk in syllabify(token)[0]:
+ chunks.append(unicodedata.normalize('NFD', chunk))
+ # fixed-length segmentation
+ else :
+ token2 = unicodedata.normalize('NFD', token)
+ seg = ""
+ for c in token2 :
+ seg += c
+ if len(seg) == mode :
+ chunks.append(seg)
+ seg = ""
+ if seg :
+ chunks.append(seg)
return chunks
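+
+# Illustrative chunk modes (a sketch ; the mode < 0 output depends on the
+# syllabify() implementation of the syllables module) :
+#   chunking(u'muso', 0)  -> [u'muso']        # whole token
+#   chunking(u'muso', 2)  -> [u'mu', u'so']   # fixed-length segments
+#   chunking(u'muso', -1) -> NFD-normalized syllables from syllabify()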
@@ -297,7 +663,6 @@ def entropy (cnt, unit = 'shannon') :
ent -= p * math.log(p, base[unit])
return ent
-
def sprint_cnt(cnt, prefix = "", num = -1, min = -1) :
lst = cnt.most_common()
@@ -319,6 +684,7 @@ def __init__(self) :
self.form_non_tonal = Counter()
self.form_tonal = Counter()
self.code = Counter()
+ self.code2 = Counter()
self.segment_code = Counter()
self.dict_code = defaultdict()
self.dict_form_tonal= defaultdict()
@@ -336,7 +702,9 @@ def __str__(self) :
ret += u"Entropies globales\n"
ret += u"\tE(Token) = {:<6.2f} \n".format(entropy(self.form_non_tonal))
ret += u"\tE(Forme tonale) = {:<6.2f} \n".format(entropy(self.form_tonal))
- ret += u"\tE(Code produit) = {:<6.2f} \n".format(entropy(self.code))
+ ret += u"\tE(Code produit) = {:<6.2f} \n".format(entropy(self.code2))
+ ret += u"\tr_E(Code produit) = {:<6.2f} \n".format(entropy(self.form_tonal)/entropy(self.code2))
+
ret += u"Entropies par token (en moyenne)\n"
ret += u"\tE(Forme tonale) = {:<6.2f} \n".\
format(entropy2(self.dict_form_tonal, cnty = self.form_tonal, cntx = self.form_non_tonal))
@@ -392,24 +760,7 @@ def insert(self) :
self.stat.dst_insert[caracter_dst] += 1
self.stat.segment_code[repr(segment)] += 1
- """
- def replace(self) :
- mode_id = mode_names.index("replace")
- [mp_code, chunk_id] = mode_position_encoder(self.src,self.p_src, mode_id, self.chunks)
- segment = mp_code + code_seperator
- caracter_src = self.src[self.p_src]
- caracter_dst = self.dst[self.p_dst]
- segment += caracter_dst + code_seperator
- self.ret[chunk_id] += segment
-
- self.stat.cnt_ops += 1
- self.stat.mode["replace"] += 1
- self.stat.src_replace[caracter_src] += 1
- self.stat.dst_replace[caracter_dst] += 1
- self.stat.segment_code[repr(segment)] += 1
- """
-
- def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
+ def differential_encode (self, form_non_tonal, form_tonal, chunk_mode) :
self.p_src = -1
self.p_dst = -1
@@ -417,12 +768,9 @@ def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
self.src = reshaping(form_non_tonal, False)
if not self.src :
- if seperator:
- return [u"", [token_seperator]]
- else :
- return [u"", []]
+ return [[u""], [form_non_tonal]]
- self.chunks = chunking(self.src)
+ self.chunks = chunking(self.src, chunk_mode)
self.ret = [u"" for i in range(len(self.chunks))]
self.dst = reshaping(form_tonal, False)
@@ -446,18 +794,20 @@ def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
# enlèvement du séparateur du code à la fin du chunk
tmp = []
- for ret2 in self.ret :
+ for ret2 in self.ret :
try :
- if ret2[-1] == code_seperator :
- ret2 = ret2[:-1]
+ if ret2[-1] == code_seperator :
+ ret2 = ret2[:-1]
except IndexError:
pass
- tmp.append(ret2)
- self.ret = tmp
+ tmp.append(ret2)
+ self.ret = tmp
self.stat.num += 1
repr_code = repr(u"".join(self.ret))
self.stat.code[repr_code] += 1
+ for chunk_code in self.ret :
+ self.stat.code2[chunk_code] += 1
self.stat.dict_code.setdefault(self.src, []).append(repr_code)
# internal auto-check
@@ -468,9 +818,11 @@ def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
if form1 != form2 :
self.stat.err_cnt += 1
- if seperator :
- self.ret.append(u'')
- self.chunks.append(token_seperator)
+ for code in self.ret :
+ if not is_a_good_code(code) :
+ print "(encode) output code incorrect !"
+ print code
+ exit()
return [self.ret, self.chunks]
@@ -482,9 +834,11 @@ def differential_decode (self, chunk, code) :
chunk = reshaping(chunk, False)
if len(code.strip()) == 0 : return chunk
+ if not is_a_good_code(code) : print "(decode) input code incorrect !" ; print chunk ; print code ; exit()
- if code[-1] == code_seperator : code = code[: -1]
- code_segments = code.split(code_seperator)
+ # if code[-1] == code_seperator : code = code[: -1]
+ # code_segments = code.split(code_seperator)
+ code_segments = split2(code,code_seperator)
if len(code_segments) % 3 != 0 : print code ; print (code_segments) ; print ("input code incorrect !"); exit(1)
p_offset = 0
@@ -492,7 +846,7 @@ def differential_decode (self, chunk, code) :
try :
m, p, c = code_segments[i:i+3]
except :
- print (u"Bug in differential_decode : {}".format(code))
+ print (u"Bug 2 : {}".format(code))
exit(1)
p_eff = int(p) + p_offset
@@ -515,6 +869,7 @@ def differential_decode (self, chunk, code) :
def main () :
+ """
forms_non_tonal = [u'tò',u'yerehré',u'ò',u'e', u'òhehòhe', u'òhòh',u'ohoh',u'ehe', u'tò',u'hééh',u'heeh',u'hèé', u'narè']
forms_tonal = [u'tɔ',u'yɛrɛ̂hre',u'o',u'é', u'ohéhohé', u'ohoh',u'òhòh',u'ebe',u'tɔ',u'heeh',u'hééh',u'héè', u'nàrɛ']
@@ -523,11 +878,13 @@ def main () :
for form_non_tonal, form_tonal in zip(forms_non_tonal, forms_tonal) :
print u"Source {}".format(reshaping(form_non_tonal, False))
print u"Destination {}".format(reshaping(form_tonal, False))
- [codes, chunks] = enc.differential_encode (form_non_tonal, form_tonal)
+ [codes, chunks] = enc.differential_encode (form_non_tonal, form_tonal, chunk_mode)
i = 0
for chunk, code in zip(chunks, codes) :
sys.stdout.write(u"Syllabe_{} '{}' - '{}' -> '{}'\n".format(i, enc.differential_decode(chunk, code), chunk, repr(code)));
- sys.stdout.write(u"Syllabe_{} '{}' - '{}' -> '{}'\n".format(i, enc.differential_decode(chunk, code_resort(''.join(code_dispatcher(code)))), chunk, repr(code_resort(''.join(code_dispatcher(code))))));
+ sys.stdout.write(u"Syllabe_{} '{}' - '{}' -> '{}'\n".\
+ format(i, enc.differential_decode(\
+ chunk, code_resort(''.join(code_dispatcher(code)))), chunk, repr(code_resort(''.join(code_dispatcher(code))))));
pass
print ""
@@ -537,5 +894,6 @@ def main () :
print form1, form2
enc.report()
+ """
if __name__ == "__main__" : main()
diff --git a/disambiguation.py b/disambiguation.py
index a15bd26..4c5da25 100644
--- a/disambiguation.py
+++ b/disambiguation.py
@@ -3,106 +3,28 @@
# Auteur : Elvis Mboning, Stagiaire 2016, INALCO
# Auteur : Damien Nouvel, MCF, INALCO
+# Auteur : Luigi (Yu-Cheng) Liu, Stagiaire 2017, INALCO
# Le principale rôle de ce script est de créer des modèles de données pour l'apprentissage automatique avec CRFTagger.
# Le CRF implémenté provient du module tag de NLTK inspiré de CRFSuite (http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf).
# Trois modèles sont possibles : les POS, les tons, les gloses
-# todo:
-# * petit rapport sur les distributions de caractères et leurs natures dans le corpus
-## * enregistrement et téléverser
-# * models produits /models/pos_exactitude_0p92.mod
-# * models produits /models/tone_exactitude_0p91.mod
-# * avec un fihier in et un fichier out
-#
-# des RDV. prévus
-# mercredi 17 mai à 14 : 30
-
-import sys, re, codecs, glob, time, os
-import argparse
+
+import sys, re, codecs, glob, time, os, collections, argparse, itertools
import formats, grammar
-import collections
+from gdisamb import FileParser
from ntgloss import Gloss
from nltk.tag.crf import CRFTagger
-from gdisamb import FileParser
-from differential_tone_coding import encoder_tones, repr, token_seperator, _get_features_customised_for_tones, code_dispatcher, code_resort, mode_indicators
-import unicodedata
-import pycrfsuite
-import csv
import nltk.tag.util
-import itertools
-from nltk.metrics.scores import accuracy
-import zipfile
+import pycrfsuite
+from differential_tone_coding import (
+ apply_filter_to_base_element, get_features_customised, get_duration,
+ sampling, csv_export, unzip, encoder_tones, mode_indicators,
+ marginal_tone, accuray2, get_sub_tone_code_of_sentence,
+ accumulate_tone_code_of_dataset, reshape_tokens_as_sentnece,
+ make_tokens_from_sentence, make_features_from_tokens)
+import unicodedata
+import zipfile, ntpath
-import codecs, sys
+import codecs, sys, fnmatch
sys.stdin = codecs.getreader('utf8')(sys.stdin)
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
-def unzip(input) :
- return [list(li) for li in zip(*input)]
-
-# dataset : list((str,str))
-def getTag(dataset) :
- ret = []
- buf = str()
- for data in dataset :
- if data[0] != token_seperator :
- buf += data[1]
- else :
- ret.append(buf)
- buf = str()
- if buf :
- ret.append(buf)
- return ret
-
-def csv_export(enc, filename, gold_tokens, test_tokens):
-
- try :
- csvfile = codecs.open(filename, 'wb')
- writer = csv.writer(csvfile)
- writer.writerow(["Token", "Golden Form", "Predicted Form","Golden code", "Predicted code", "Same"])
- for g, t in zip(gold_tokens, test_tokens) :
- token = g[0]
- golden_code = g[-1]
- predicted_code = t[-1]
- golden_form = enc.differential_decode(token, golden_code.decode('utf-8'))
- predicted_form = enc.differential_decode(token, predicted_code.decode('utf-8'))
- sameCodes = (golden_code == predicted_code)
- sameForms = (golden_form == predicted_form)
-
- if not repr(token.encode('utf-8')) :
- sameCodes = u''
- row = [\
- repr(token.encode('utf-8')), \
- repr(golden_form.encode('utf-8')), \
- repr(predicted_form.encode('utf-8')), \
- repr(golden_code, spaces=True), \
- repr(predicted_code, spaces=True), \
- sameCodes]
-
- writer.writerow(row)
- csvfile.close()
- except :
- raise
- print "unable to dump result in CSV file to create !"
-
-def sampling(allsents, p, ratio = 1) :
- train_set, eval_set = [], []
- for i, sent in enumerate(allsents[0 : : int(1/float(ratio))]) :
- p_approx = float(len(train_set) + 1) / float(len(eval_set) + len(train_set) + 1)
- if p_approx <= p :
- train_set.append(sent)
- else:
- eval_set.append(sent)
- return [train_set, eval_set]
-
-def get_duration(t1_secs, t2_secs) :
- secs = abs(t1_secs - t2_secs)
- days = secs // 86400
- hours = secs // 3600 - days * 24
- minutes = secs // 60 - hours * 60 - days * 60 * 24
- secondes = int(secs) % 60
- return '{:>02.0f}:{:>02.0f}:{:>02.0f}:{:>02d}'.format(days, hours, minutes, secondes)
def main():
@@ -111,187 +33,265 @@ def main():
aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
- # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
- aparser.add_argument('-e', '--evalsize', help='Percent of training data with respect to training and test one (default 10)', default=10)
+ aparser.add_argument('-g', '--gloss', help='Prediction for glosses', default=False, action='store_true')
+ aparser.add_argument('-e', '--evalsize', help='Percentage of the data reserved for evaluation, the rest being used for training (default 10)', default=10, type=float)
+ aparser.add_argument('-c', '--chunkmode', help='Chunking mode, effective only for tone : -1 = syllabification, 0 = whole token, n > 0 = segments of n characters (default -1)', default=-1, type=int)
aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order', default=None)
aparser.add_argument('--select', help = 'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.', action='store_true')
+
+ aparser.add_argument('--filtering', help = 'Experimental : keep at most one edit operation per position and mode when dispatching tone codes', action='store_true')
+ aparser.add_argument('--no_decomposition', help = 'Experimental : do not split insertions and deletions into separate phases', action='store_true')
+ aparser.add_argument('--diacritic_only', help = 'Experimental : learn and predict diacritic operations only (phases 2 and 3)', action='store_true')
+ aparser.add_argument('--non_diacritic_only', help = 'Experimental : learn and predict non-diacritic operations only (phases 0 and 1)', action='store_true')
+ aparser.add_argument('--no_coding', help = 'Experimental : learn the tonal form of each token directly, without differential coding', action='store_true')
+
aparser.add_argument('-i', '--infile' , help='Input file (.html)' , default=sys.stdin)
aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
- aparser.add_argument('-s', '--store', help='Store tagged raw data in file (.csv) for further research purpose', default=None)
+ aparser.add_argument('-s', '--store', help='Store evaluation result in file (.csv) for further research purpose', default=None)
args = aparser.parse_args()
if args.verbose :
- print args
-
- if args.learn and (args.pos or args.tone or args.gloss):
-
- if not (args.pos or args.tone or args.gloss) :
- print 'Choose pos, tone, gloss or combination of them'
+ print 'Arguments received by script'
+ dico = vars(args)
+ for key,val in dico.items():
+ typeName = type(val).__name__
+ sys.stdout.write("\t{} = {} ".format(key, val))
+ if val :
+ sys.stdout.write("({})".format(typeName))
+ print ""
+
+ if not (args.pos or args.tone or args.gloss) :
+ print 'Choose pos, tone or gloss'
+ aparser.print_help()
exit(0)
+ if args.learn :
print 'Make list of files'
+
+ """
files1 = glob.iglob("../corbama/*/*.dis.html")
files2 = glob.iglob("../corbama/*.dis.html")
allfiles = ""
for file1, file2 in zip(files1, files2):
allfiles += file1+','+file2+','
+ """
+ allfiles = []
+ for root, dirnames, filenames in os.walk('../corbama'):
+ for filename in fnmatch.filter(filenames, '*.dis.html'):
+ allfiles.append(os.path.join(root, filename))
+
allsents = []
- # pour le débogage
- allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'
+ # for quick debugging
+ # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'
- if args.tone :
- try :
- enc = encoder_tones()
- except :
- enc = None
- print ("Error : unable to initialize the tone encoder !")
-
- print 'Open files and find features / supervision tags'
- for infile in allfiles.split(','):
- if(len(infile)) :
- print '-', infile
- sent = []
+ print 'Making observation data from the disambiguated corpus files :'
+ for infile in allfiles:
+ if infile :
+ print '\t', infile
html_parser = FileParser()
html_parser.read_file(infile)
- for snum, sentence in enumerate(html_parser.glosses) :
- for tnum, token in enumerate(sentence[2]) :
- tag = ''
- if token.type == 'w' or token.type == 'c':
- tags = ''
- if args.pos:
- for ps in token.gloss.ps : tags += ps.encode('utf-8')
- sent.append((token.token, tags))
- elif args.tone:
- # Pourquoi ne pas apprendre la forme tonale contenant une barre veticale ?
- # Parce que dans l'ensemble des corpus désambiguïsés, son occurrence est
- # au dessous de 10, ce cas de figure semble trop peu fréquent pour apporter
- # une réélle amélioration dans la modélisation de tonalisation. Néanmoins,
- # dans la conception du cadre logiciel, rien n'interdit de l'inclure dans
- # les données d'entraînement et d'en observer le apport
- if '|' not in token.gloss.form :
- [codes, chunks] = enc.differential_encode(token.token, token.gloss.form)
- for chunk, code in zip(chunks, codes) :
- try : sent.append((chunk, code.encode('utf-8')))
- except LookupError: pass
- """
- elif args.gloss:
- tags += token.gloss.gloss.encode('utf-8')
- sent.append((token.token, tags))
- """
+ sent = []
+ for sentence in html_parser.glosses :
+ for token in sentence[2] :
+ if token.type == 'w' or \
+ token.type == 'c':
+ if args.pos and not args.tone and not args.gloss :
+ # sent : list(str,str)
+ tags = ''
+ for ps in token.gloss.ps :
+ tags += ps
+ sent.append((token.token, tags.encode('utf-8')))
+ elif args.tone and not args.pos and not args.gloss :
+ # sent : list(str,str)
+ form = token.gloss.form.split('|')
+ tags = form[0]
+ sent.append((token.token, tags.encode('utf-8')))
+ elif args.gloss and not args.tone and not args.pos :
+ # sent : list(str,str)
+ tags = token.gloss.gloss
+ sent.append((token.token, tags.encode('utf-8')))
+ else :
+ print ('Error : multi-modal learning is not yet supported !')
+ exit()
if len(sent) > 1:
allsents.append(sent)
sent = []
- if args.verbose and args.tone :
- enc.report()
-
- # Constitution des ensmebles d'entraînement de d'évaluation
+ if args.tone and not args.no_coding :
+ print 'Token segmentation and tonal information compression'
+ enc = encoder_tones()
+ allsents2 = allsents
+ allsents = []
+ for sent in allsents2 :
+ sent2 = []
+ for token_tags in sent :
+ token, tags = token_tags
+ [codes, syllabes] = enc.differential_encode(token, tags.decode('utf-8'), args.chunkmode)
+ token2 = [(syllabe, code.encode('utf-8')) for syllabe, code in zip(syllabes, codes)]
+ sent2.append(token2)
+ allsents.append(sent2)
+
+ if args.verbose :
+ enc.report()
+
+ R = 1 # 1 to use the whole corpus
p = (1 - args.evalsize / 100.0)
- train_set, eval_set = sampling(allsents, p)
- print 'Split the data in train (', len(train_set),' sentences) / test (', len(eval_set),' sentences)'
+ train_set, eval_set = sampling(allsents, p, R)
+ print 'Split the data in \t train (', len(train_set),' sentences) / test (', len(eval_set),' sentences)'
- print 'Building classifier (CRF/NLTK)'
+ print 'Building classifier (pyCRFsuite)'
# Initialization
t1 = time.time()
- if args.tone :
- num_phases = len([False, True]) * len(mode_indicators)
+ if args.tone and not args.no_coding :
+ num_phases = 2 * len(mode_indicators)
myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
else :
num_phases = 1
- # Training
+ # A. Training of the models
for phase in range(num_phases) :
+ # A.1. Initialize a new CRF model
tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
trainer.set_params(tagger._training_options)
- if num_phases > 1 :
- model_name = args.learn + '.' + str(phase)
- else:
- model_name = args.learn
-
- # train_set : list(list((str,list(str))))
- for sent in train_set:
- tokens = unzip(sent)[0]
- labels = unzip(sent)[1]
- if num_phases > 1 :
- for lab in labels :
- pass
- labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
- features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
+ model_name = args.learn
+ if args.tone and not args.no_coding :
+ if args.diacritic_only and (phase == 0 or phase == 1) :
+ continue
+ if args.non_diacritic_only and (phase == 2 or phase == 3) :
+ continue
+ elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+ continue
+ model_name += '.' + str(phase)
+
+ # A.2. Flatten the data structures to prepare the contextual training
+ for sent in train_set :
+ if args.tone and not args.no_coding :
+ [tokens, labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+ features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+ labels = get_sub_tone_code_of_sentence(sent, phase, sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+ labels = list(itertools.chain(*labels))
+ else :
+ [tokens, labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+ features = make_features_from_tokens(tokens, 0, args.tone and not args.no_coding)
+
trainer.append(features, labels)
trainer.train(model = model_name)
- if num_phases > 1 :
+
+ if args.tone and not args.no_coding :
myzip.write(model_name)
os.remove(model_name)
- if num_phases > 1 :
+
+ if args.tone and not args.no_coding :
myzip.close()
print "... done in", get_duration(t1_secs = t1, t2_secs = time.time())
- # Evaluation
+ # B. Evaluation
print 'Evaluating classifier'
- # gold_set, predicted_set : list(list((str, str)))
- # input_set, output_gold_set : list(list(str))
gold_set = eval_set
- input_set = [unzip(sent)[0] for sent in gold_set]
- predicted_set = [list() for sent in gold_set]
- if num_phases > 1 :
+
+ if args.tone and not args.no_coding :
myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
- for phase in range(num_phases) :
+ predicted_set_acc = list()
+ for phase in range(num_phases) :
+
+ # B.1. Load the CRF model for one of the four tonal annotation phases
+ tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
+ trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
+ trainer.set_params(tagger._training_options)
+ model_basename = ''
+ for m in myzip.namelist() :
+ if m.endswith(str(phase)):
+ model_basename = m
+ break
+ if not model_basename :
+ continue
+ if args.diacritic_only and (phase == 0 or phase == 1) :
+ continue
+ if args.non_diacritic_only and (phase == 2 or phase == 3):
+ continue
+ elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+ continue
+
+ myzip.extract(model_basename)
+ tagger.set_model_file(model_basename)
+ os.remove(model_basename)
+
+ # B.2. Automatic syllable-by-syllable annotation of each sentence
+ predicted_set = list()
+ for p, sent in enumerate(gold_set) :
+
+ [tokens, gold_labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+ features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+ labels = tagger._tagger.tag(features)
+ labels = reshape_tokens_as_sentnece(labels, sent)
+
+ predicted_tokens = list()
+ for i, token in enumerate(sent) :
+ predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
+ predicted_set.append(predicted_tokens)
+
+ # B.3. Accumulate and reorder the syllable-level annotations
+ if not predicted_set_acc :
+ predicted_set_acc = \
+ [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set]
+
+ predicted_set_acc = accumulate_tone_code_of_dataset (predicted_set_acc, predicted_set)
+
+ predicted_set = predicted_set_acc
+
+
+ else :
+ # B.1. Load the CRF model for annotation
tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
trainer.set_params(tagger._training_options)
- if num_phases > 1:
- model_name = args.learn + '.' + str(phase)
- myzip.extract(model_name)
- else :
- model_name = args.learn
+ model_name = args.learn
tagger.set_model_file(model_name)
- for i, sent in enumerate(input_set) :
- features = [_get_features_customised_for_tones(sent,j) for j in range(len(sent))]
+
+ # B.2. Automatic token-by-token annotation
+ predicted_set = list()
+ for sent in gold_set :
+ [tokens, gold_labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+ features = make_features_from_tokens(tokens, 0, args.tone and not args.no_coding)
labels = tagger._tagger.tag(features)
- if num_phases > 1 :
- labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
- tagged_sent = list(zip(sent, labels))
- if not predicted_set[i] :
- predicted_set[i] = tagged_sent
- else :
- sent_acc, labels_acc = unzip(predicted_set[i])
- labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)]
- predicted_set[i] = list(zip(sent_acc, labels_acc))
- if num_phases > 1 :
- os.remove(model_name)
- myzip.close()
-
- # gold_tokens, predicted_tokens : list((str,str))
- predicted_tokens = list(itertools.chain(*predicted_set))
- if num_phases > 1 :
- predicted_tokens = [ tuple([pair[0], code_resort(pair[1].decode('utf-8')).encode('utf-8')]) for pair in predicted_tokens]
- gold_tokens = list(itertools.chain(*gold_set))
- # gold_tokens_eval, predicted_tokens_eval : list(str)
- if args.tone :
- gold_tokens_eval = getTag(gold_tokens)
- predicted_tokens_eval = getTag(predicted_tokens)
- else :
- gold_tokens_eval = gold_tokens
- predicted_tokens_eval = predicted_tokens
+ predicted_set.append(zip(tokens, labels))
- if args.store and args.tone :
- stored_filename = args.store
- csv_export(enc, stored_filename, gold_tokens, predicted_tokens)
- print "Exactitude : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval))
+ if args.tone and not args.no_coding :
+ # adjust the evaluation for the partial-learning cases :
+ # we filter out of the gold data the characters that were ignored during
+ # learning, otherwise the result would be penalized by comparing a partially
+ # predicted form against the full tonal form of the same token
+ if args.diacritic_only :
+ gold_set = apply_filter_to_base_element(gold_set, [2,3], sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+ elif args.non_diacritic_only :
+ gold_set = apply_filter_to_base_element(gold_set, [0,1], sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+ elif args.filtering :
+ gold_set = apply_filter_to_base_element(gold_set, [0,1,2,3], sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+
+ """
+ if args.verbose :
+ verify(gold_set, args.filtering, not args.no_decomposition)
+ """
+
+ print "Accuracy : {:>5.3f}".format(accuray2(gold_set, predicted_set, args.tone and not args.no_coding))
+
+ if args.store :
+ stored_filename = args.store
+ csv_export(stored_filename, gold_set, predicted_set, args.tone and not args.no_coding)
if args.verbose and args.store :
print ("Tagged result is exported in {}".format(args.store))
elif args.disambiguate and args.infile and args.outfile :
- # Lecture de texte en .HTML
+
html_parser = FileParser()
tagger = CRFTagger()
@@ -307,35 +307,92 @@ def main():
print "Error : unable to open the input file {} !".format(args.infile)
exit(1)
- # Exportation du résultat de désambiguïsation en .HTML
for snum, sentence in enumerate(html_parser.glosses) :
tokens = [token.token for token in sentence[2]]
- features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
+ features = [get_features_customised(tokens, i) for i in range(len(tokens))]
tagger._tagger.set(features)
for tnum, token in enumerate(sentence[2]) :
options = list()
if token.value and len(token.value) > 2:
for nopt, option in enumerate(token.value[2]) :
try: tag = option.ps[0]
- except IndexError : tag = ''
- prob = tagger._tagger.marginal(tag, tnum)
+ except : tag = ''
+ try:
+ prob = tagger._tagger.marginal(tag, tnum)
+ except :
+ prob = 0.0
options.append((prob, option))
- reordered_probs, reordered_options = unzip(sorted(options, reverse = True))
+ reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
if args.select :
prob_max = reordered_probs[0]
reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+
html_parser.glosses[snum][1][tnum] = reordered_options
- elif args.tone :
- pass
+ elif args.tone and not args.no_coding :
+ try :
+ html_parser.read_file(args.infile)
+ except IOError:
+ print "Error : unable to open the input file {} !".format(args.infile)
+ exit(1)
+ try :
+ myzip = zipfile.ZipFile(args.disambiguate, 'r')
+ except IOError:
+ print "Error : unable to open the model file {} !".format((args.disambiguate + '.zip'))
+ exit(1)
- try : html_parser.write(args.outfile)
- except IOError: print "Error : unable to create the output file {}".format(args.outfile)
+ num_phases = 2 * len(mode_indicators)
+ taggers = []
+ enc = encoder_tones()
+ for phase in range(num_phases) :
+ taggers.append(CRFTagger())
+ model_basename = ''
+ for m in myzip.namelist() :
+ if m.endswith(str(phase)):
+ model_basename = m
+ break
+ if not model_basename :
+ continue
+ if args.diacritic_only and (phase == 0 or phase == 1) :
+ continue
+ if args.non_diacritic_only and (phase == 2 or phase == 3):
+ continue
+ elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+ continue
+ myzip.extract(model_basename)
+ taggers[phase].set_model_file(model_basename)
+ os.remove(model_basename)
+ myzip.close()
- else :
- aparser.print_help()
+ for snum, sentence in enumerate(html_parser.glosses) :
+ tokens = [enc.differential_encode(token.token, token.token, args.chunkmode)[1] for token in sentence[2]]
+ for phase in range(num_phases) :
+ features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+ if taggers[phase]._model_file :
+ taggers[phase]._tagger.set(features)
+ for tnum, token in enumerate(sentence[2]) :
+ options = list()
+ if token.value and len(token.value) > 2:
+ for nopt, option in enumerate(token.value[2]) :
+ try: tag = option.form.encode('utf-8')
+ except : tag = ''
+ # def marginal_tone(taggers, tnum, tokens, tag, token, chunk_mode, sel_en, decomposition_en)
+ prob = marginal_tone(taggers, tnum, tokens, tag, token.token, chunk_mode = args.chunkmode, sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+ options.append((prob, option))
+ reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
+ if args.select :
+ prob_max = reordered_probs[0]
+ reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+ html_parser.glosses[snum][1][tnum] = reordered_options
+ try :
+ html_parser.write(args.outfile)
+ print "Disambiggated resulat for {} is saved in {}".format(args.infile,args.outfile)
+ except IOError: print "Error : unable to create the output file {} !".format(args.outfile)
+
+ else :
+ aparser.print_help()
exit(0)
diff --git a/doc/samples/bamana.gram.txt b/doc/samples/bamana.gram.txt
index 7de06d1..d0afc90 100644
--- a/doc/samples/bamana.gram.txt
+++ b/doc/samples/bamana.gram.txt
@@ -41,111 +41,111 @@ return if parsed
section inflection
# verbal inflection
# -la/-na PROG
-pattern :v: [ {@nasal-v@|na}:: ] | :v: [:v: :mrph:PROG]
-pattern :v: [ {@nonnasal-v@|la}:: ] | :v: [:v: :mrph:PROG]
+pattern :v: [{@nasal-v@|na}::] | :v: [:v: :mrph:PROG]
+pattern :v: [{@nonnasal-v@|la}::] | :v: [:v: :mrph:PROG]
# moved up from v_vq_derivation because of na/la ambiguity
-pattern :n: [ {@smth-nasal@|na}:: ] | :n: [ :v: :mrph:AG.PRM]
-pattern :n: [ {@nonnasal-v@|la}:: ] | :n: [ :v: :mrph:AG.PRM]
+pattern :n: [{@smth-nasal@|na}::] | :n: [:v: :mrph:AG.PRM]
+pattern :n: [{@nonnasal-v@|la}::] | :n: [:v: :mrph:AG.PRM]
# -ra/-la/-na PFV.INTR
-pattern :v: [ {@nasal-syl@|n[a']}:: ] | :v: [:v: :mrph:PFV.INTR]
-pattern :v: [ {@glide-syl@|l[a']}:: ] | :v: [:v: :mrph:PFV.INTR]
-pattern :v: [ {@nonnasalglide-syl@|r[a']}:: ] | :v: [:v: :mrph:PFV.INTR]
+pattern :v: [{@nasal-syl@|n[a']}::] | :v: [:v: :mrph:PFV.INTR]
+pattern :v: [{@glide-syl@|l[a']}::] | :v: [:v: :mrph:PFV.INTR]
+pattern :v: [{@nonnasalglide-syl@|r[a']}::] | :v: [:v: :mrph:PFV.INTR]
# nominal inflection
# -w PL
-pattern :n/adj/dtm/prn/ptcp/n.prop/num: [ {|w}:: ] | :n/adj/dtm/prn/ptcp/n.prop/num: [:n/adj/dtm/prn/ptcp/n.prop/num: :mrph:PL]
+pattern :n/adj/dtm/prn/ptcp/n.prop/num: [{|w}::] | :n/adj/dtm/prn/ptcp/n.prop/num: [:n/adj/dtm/prn/ptcp/n.prop/num: :mrph:PL]
# participles
section participles
-pattern :v/ptcp: [ {|bali}:: ] | :ptcp: [ :v: :mrph:PTCP.PRIV]
-pattern :v/ptcp: [ {|ta}:: ] | :ptcp: [ :v: :mrph:PTCP.POT]
-pattern :v/ptcp: [ {|tɔ}:: ] | :ptcp: [ :v: :mrph:CONV.PROG]
-pattern :v/ptcp: [ {@smth-nasal@|nen}:: ] | :ptcp: [ :v: :mrph:PTCP.RES]
-pattern :v/ptcp: [ {@nonnasal-v@|len}:: ] | :ptcp: [ :v: :mrph:PTCP.RES]
+pattern :v/ptcp: [{|bali}::] | :ptcp: [:v: :mrph:PTCP.PRIV]
+pattern :v/ptcp: [{|ta}::] | :ptcp: [:v: :mrph:PTCP.POT]
+pattern :v/ptcp: [{|tɔ}::] | :ptcp: [:v: :mrph:CONV.PROG]
+pattern :v/ptcp: [{@smth-nasal@|nen}::] | :ptcp: [:v: :mrph:PTCP.RES]
+pattern :v/ptcp: [{@nonnasal-v@|len}::] | :ptcp: [:v: :mrph:PTCP.RES]
-pattern :v/ptcp: [ {@smth-nasal@|nen|ba}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:AUGM]
-pattern :v/ptcp: [ {@nonnasal-v@|len|ba}:: ] | :ptcp: [ :v: :mrph:PTCP.RES :mrph:AUGM]
+pattern :v/ptcp: [{@smth-nasal@|nen|ba}::] | :ptcp: [:v: :mrph:PTCP.RES :mrph:AUGM]
+pattern :v/ptcp: [{@nonnasal-v@|len|ba}::] | :ptcp: [:v: :mrph:PTCP.RES :mrph:AUGM]
# derivative forms we need to consider even if we have them in dictionary
section common_derivation
-pattern :ptcp/n/adj: [ {|nin}:: ] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:DIM]
-pattern :n/adj/ptcp/v: [ {|ya}:: ] | :n: [:n/adj/ptcp/v: :mrph:ABSTR]
-pattern :ptcp/n/adj: [ {|ba}:: ] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:AUGM]
+pattern :ptcp/n/adj: [{|nin}::] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:DIM]
+pattern :n/adj/ptcp/v: [{|ya}::] | :n: [:n/adj/ptcp/v: :mrph:ABSTR]
+pattern :ptcp/n/adj: [{|ba}::] | :ptcp/n/adj: [:ptcp/n/adj: :mrph:AUGM]
# to handle -baliya ex: basigi.bali.ya
-pattern :v/ptcp/n/adj: [ {|bali|ya}:: ] | :n: [:v/ptcp: :mrph:PTCP.PRIV :mrph:ABSTR]
+pattern :v/ptcp/n/adj: [{|bali|ya}::] | :n: [:v/ptcp: :mrph:PTCP.PRIV :mrph:ABSTR]
# common nominal/verbal derivation (locatives)
section n_v_derivation
-pattern :n/n.prop: [ {|ka}:: ] | :n/n.prop: [:n/n.prop: :mrph:GENT]
-pattern :n/n.prop: [ {@nasal-v@|na}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC]
-pattern :n/n.prop: [ {@nonnasal-v@|la}:: ] | :n/n.prop: [:n/n.prop: :mrph:LOC]
+pattern :n/n.prop: [{|ka}::] | :n/n.prop: [:n/n.prop: :mrph:GENT]
+pattern :n/n.prop: [{@nasal-v@|na}::] | :n/n.prop: [:n/n.prop: :mrph:LOC]
+pattern :n/n.prop: [{@nonnasal-v@|la}::] | :n/n.prop: [:n/n.prop: :mrph:LOC]
# nominal derivation
section n_derivation
-pattern :n: [ {@smth-nasal@|nama}:: ] | :n: [ :n: :mrph:STAT]
-pattern :n: [ {@nonnasal-v@|lama}:: ] | :n: [ :n: :mrph:STAT]
-pattern :n: [ {|ma}:: ] | :n: [ :n: :mrph:COM]
-pattern :adj/n: [ {|ntan}:: ] | :adj/n: [ :n: :mrph:PRIV]
-pattern :adj/n: [ {|bagatɔ}:: ] | :adj/n: [ :n: :mrph:ST]
-pattern :adj/n: [ {|baatɔ}:: ] | :adj/n: [ :n: :mrph:ST]
-pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ :prn:RECP :n: ]
-pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :n: :prn:RECP]
+pattern :n: [{@smth-nasal@|nama}::] | :n: [:n: :mrph:STAT]
+pattern :n: [{@nonnasal-v@|lama}::] | :n: [:n: :mrph:STAT]
+pattern :n: [{|ma}::] | :n: [:n: :mrph:COM]
+pattern :adj/n: [{|ntan}::] | :adj/n: [:n: :mrph:PRIV]
+pattern :adj/n: [{|bagatɔ}::] | :adj/n: [:n: :mrph:ST]
+pattern :adj/n: [{|baatɔ}::] | :adj/n: [:n: :mrph:ST]
+pattern :n: [{ɲɔgɔn|}::] | :n: [:prn:RECP :n:]
+pattern :n: [{|ɲwaa?n}::] | :n: [:n: :prn:RECP]
# verbal/vq derivation
section v_vq_derivation
-pattern :n: [ {@smth-nasal@|nan}:: ] | :n: [ :v: :mrph:INSTR]
-pattern :n: [ {@nonnasal-v@|lan}:: ] | :n: [ :v: :mrph:INSTR]
-pattern :n: [ {@smth-nasal@|ni}:: ] | :n: [ :v: :mrph:NMLZ]
-pattern :n: [ {@nonnasal-v@|li}:: ] | :n: [ :v: :mrph:NMLZ]
-pattern :n: [ {|baga}:: ] | :n: [ :v: :mrph:AG.OCC]
-pattern :n: [ {|baa}:: ] | :n: [ :v: :mrph:AG.OCC]
-pattern :n: [ {|baga|nci}:: ] | :n: [ :v: :mrph:AG.OCC :mrph:AG.EX]
-pattern :n: [ {|baa|nci}:: ] | :n: [ :v: :mrph:AG.OCC :mrph:AG.EX]
+pattern :n: [{@smth-nasal@|nan}::] | :n: [:v: :mrph:INSTR]
+pattern :n: [{@nonnasal-v@|lan}::] | :n: [:v: :mrph:INSTR]
+pattern :n: [{@smth-nasal@|ni}::] | :n: [:v: :mrph:NMLZ]
+pattern :n: [{@nonnasal-v@|li}::] | :n: [:v: :mrph:NMLZ]
+pattern :n: [{|baga}::] | :n: [:v: :mrph:AG.OCC]
+pattern :n: [{|baa}::] | :n: [:v: :mrph:AG.OCC]
+pattern :n: [{|baga|nci}::] | :n: [:v: :mrph:AG.OCC :mrph:AG.EX]
+pattern :n: [{|baa|nci}::] | :n: [:v: :mrph:AG.OCC :mrph:AG.EX]
# attempt to handle -likɛ, -likɛla, others like -liwari...
-pattern :v: [ {@nonnasal-v@|li|kɛ}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire]
-pattern :n: [ {@nonnasal-v@|li|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM]
-pattern :v: [ {@smth-nasal@|ni|kɛ}:: ] | :v: [ :v: :mrph:NMLZ kɛ́:v:faire]
-pattern :n: [ {@smth-nasal@|ni|kɛ|la}:: ] | :n: [ :v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM]
-pattern :n: [ {@nonnasal-v@|li|wari}:: ] | :n: [ :v: :mrph:NMLZ :n:]
-pattern :n: [ {@smth-nasal@|ni|wari}:: ] | :n: [ :v: :mrph:NMLZ :n:]
-pattern :n: [ {@nonnasal-v@|li|fɛn}:: ] | :n: [ :v: :mrph:NMLZ fɛ́n:n:chose]
-pattern :n: [ {@smth-nasal@|ni|fɛn}:: ] | :n: [ :v: :mrph:NMLZ fɛ́n:n:chose]
-pattern :n: [ {@nonnasal-v@|li|ko}:: ] | :n: [ :v: :mrph:NMLZ kó:n:affaire]
-pattern :n: [ {@smth-nasal@|ni|ko}:: ] | :n: [ :v: :mrph:NMLZ kó:n:affaire]
+pattern :v: [{@nonnasal-v@|li|kɛ}::] | :v: [:v: :mrph:NMLZ kɛ́:v:faire]
+pattern :n: [{@nonnasal-v@|li|kɛ|la}::] | :n: [:v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM]
+pattern :v: [{@smth-nasal@|ni|kɛ}::] | :v: [:v: :mrph:NMLZ kɛ́:v:faire]
+pattern :n: [{@smth-nasal@|ni|kɛ|la}::] | :n: [:v: :mrph:NMLZ kɛ́:v:faire :mrph:AG.PRM]
+pattern :n: [{@nonnasal-v@|li|wari}::] | :n: [:v: :mrph:NMLZ :n:]
+pattern :n: [{@smth-nasal@|ni|wari}::] | :n: [:v: :mrph:NMLZ :n:]
+pattern :n: [{@nonnasal-v@|li|fɛn}::] | :n: [:v: :mrph:NMLZ fɛ́n:n:chose]
+pattern :n: [{@smth-nasal@|ni|fɛn}::] | :n: [:v: :mrph:NMLZ fɛ́n:n:chose]
+pattern :n: [{@nonnasal-v@|li|ko}::] | :n: [:v: :mrph:NMLZ kó:n:affaire]
+pattern :n: [{@smth-nasal@|ni|ko}::] | :n: [:v: :mrph:NMLZ kó:n:affaire]
# need to handle -ba AUGM inside ex: ko.jugu.ba.kɛ.la
-pattern :n: [ {|ɲɔgɔn}:: ] | :n: [ :v: :prn:RECP]
-pattern :n: [ {|ɲwaa?n}:: ] | :n: [ :v: :prn:RECP]
-pattern :n: [ {ɲɔgɔn|}:: ] | :n: [ :prn:RECP :v: ]
+pattern :n: [{|ɲɔgɔn}::] | :n: [:v: :prn:RECP]
+pattern :n: [{|ɲwaa?n}::] | :n: [:v: :prn:RECP]
+pattern :n: [{ɲɔgɔn|}::] | :n: [:prn:RECP :v:]
# vq derivation
section vq_derivation
-pattern :vq: [ {|ya}:: ] | :n/v: [ :vq: :mrph:DEQU]
-pattern :adj: [ {|man}:: ] | :adj: [ :vq: :mrph:ADJ]
+pattern :vq: [{|ya}::] | :n/v: [:vq: :mrph:DEQU]
+pattern :adj: [{|man}::] | :adj: [:vq: :mrph:ADJ]
# numeral derivation
section num_derivation
-pattern :num: [ {@nasal-v@|na}:: ] | :num: [ :num: :mrph:PRICE]
-pattern :num: [ {@nonnasal-v@|la}:: ] | :num: [ :num: :mrph:PRICE]
-pattern :num: [ {@nasal-v@|nan}:: ] | :num: [ :num: :mrph:ORD]
-pattern :num: [ {[0-9]+|nan}:: ] | :num: [ :num: :mrph:ORD]
-pattern :num: [ {@nonnasal-v@|lan}:: ] | :num: [ :num: :mrph:ORD]
+pattern :num: [{@nasal-v@|na}::] | :num: [:num: :mrph:PRICE]
+pattern :num: [{@nonnasal-v@|la}::] | :num: [:num: :mrph:PRICE]
+pattern :num: [{@nasal-v@|nan}::] | :num: [:num: :mrph:ORD]
+pattern :num: [{[0-9]+|nan}::] | :num: [:num: :mrph:ORD]
+pattern :num: [{@nonnasal-v@|lan}::] | :num: [:num: :mrph:ORD]
## reduplication
section reduplication
-pattern :v: [ {(?P<stem>.+)|(?P=stem)}:: ] | :v: [ :v: :v: ]
-pattern :adj: [ {(?P<stem>.+)|(?P=stem)}:: ] | :adj: [ :adj: :adj: ]
-pattern :num: [ {(?P<stem>.+)|(?P=stem)}:: ] | :num: [ :num: :num: ]
-pattern :v: [ {(?P<stem>.+)|-|(?P=stem)}:: ] | :v: [ :v: :: :v: ]
-pattern :adj: [ {(?P<stem>.+)|-|(?P=stem)}:: ] | :adj: [ :adj: :: :adj: ]
-pattern :num: [ {(?P<stem>.+)|-|(?P=stem)}:: ] | :num: [ :num: :: :num: ]
-pattern :v: [ {(?P<stem>.+)|(?P=stem)|(?P=stem)}:: ] | :v: [ :v: :v: :v: ]
-pattern :adj: [ {(?P<stem>.+)|(?P=stem)|(?P=stem)}:: ] | :adj: [ :adj: :adj: :adj: ]
+pattern :v: [{(?P<stem>.+)|(?P=stem)}::] | :v: [:v: :v:]
+pattern :adj: [{(?P<stem>.+)|(?P=stem)}::] | :adj: [:adj: :adj:]
+pattern :num: [{(?P<stem>.+)|(?P=stem)}::] | :num: [:num: :num:]
+pattern :v: [{(?P<stem>.+)|-|(?P=stem)}::] | :v: [:v: :: :v:]
+pattern :adj: [{(?P<stem>.+)|-|(?P=stem)}::] | :adj: [:adj: :: :adj:]
+pattern :num: [{(?P<stem>.+)|-|(?P=stem)}::] | :num: [:num: :: :num:]
+pattern :v: [{(?P<stem>.+)|(?P=stem)|(?P=stem)}::] | :v: [:v: :v: :v:]
+pattern :adj: [{(?P<stem>.+)|(?P=stem)|(?P=stem)}::] | :adj: [:adj: :adj: :adj:]
## composition
# general part-of-speech composition patterns
@@ -154,44 +154,44 @@ section pos_composition
# two-words composites
# n.prop + n = n ex: Irisi.jamana
-pattern :n: [ :n.prop: :n: ] | :n: [ :n.prop: :n: ]
+pattern :n: [:n.prop: :n:] | :n: [:n.prop: :n:]
# v/n + n = n
-pattern :n: [ :n/v: :n: ] | :n: [ :n/v: :n: ]
+pattern :n: [:n/v: :n:] | :n: [:n/v: :n:]
# n + v = n/v
-pattern :n/v: [ :n: :v: ] | :n/v: [ :n: :v: ]
+pattern :n/v: [:n: :v:] | :n/v: [:n: :v:]
# n + adj/num = n
-pattern :n: [ :n: :adj/num: ] | :n: [ :n: :adj/num: ]
+pattern :n: [:n: :adj/num:] | :n: [:n: :adj/num:]
# dtm + v = n
-pattern :n: [ :dtm: :v: ] | :n: [ :dtm: :v: ]
+pattern :n: [:dtm: :v:] | :n: [:dtm: :v:]
# pp + n = n ex. kɔkan.maliden, kɔnɔ.mɔgɔ
-pattern :n: [ :pp: :n: ] | :n: [ :pp: :n: ]
+pattern :n: [:pp: :n:] | :n: [:pp: :n:]
# three-words composites
# n + adj/pp/num + n = n -> added num : san.duuru.baara
-pattern :n: [ :n: :adj/pp/num: :n: ] | :n: [ :n: :adj/pp/num: :n: ]
+pattern :n: [:n: :adj/pp/num: :n:] | :n: [:n: :adj/pp/num: :n:]
# dtm/n + v + n = n
-pattern :n: [ :dtm/n: :v: :n: ] | :n: [ :dtm/n: :v: :n: ]
+pattern :n: [:dtm/n: :v: :n:] | :n: [:dtm/n: :v: :n:]
# n + pp + v = n/v
-pattern :n/v: [ :n: :pp: :v: ] | :n/v: [ :n: :pp: :v: ]
+pattern :n/v: [:n: :pp: :v:] | :n/v: [:n: :pp: :v:]
# n+n+n = n yiriwali.nafolo.ko ?
-pattern :n: [ :n: :n: :n: ] | :n: [ :n: :n: :n: ]
+pattern :n: [:n: :n: :n:] | :n: [:n: :n: :n:]
# ??? : mrph not taken into account (not in the dictionary?)
# v+mrph+n ex: gansi.li.walan
-pattern :n: [ :v: li:mrph: :n: ] | :n: [ :v: :mrph:NMLZ :n: ]
-pattern :n: [ :n: ba:mrph: :n: ] | :n: [ :n: :mrph:AUGM :n: ]
+pattern :n: [:v: li:mrph: :n:] | :n: [:v: :mrph:NMLZ :n:]
+pattern :n: [:n: ba:mrph: :n:] | :n: [:n: :mrph:AUGM :n:]
# v+mrph+v ex: kɔlɔsi.li.kɛ
-pattern :v: [ :v: li:mrph: :v: ] | :v: [ :v: :mrph:NMLZ :v: ]
-pattern :n: [ :v: ka:pm: :v: ] | :n: [ :v: kà:pm:INF :v: ]
+pattern :v: [:v: li:mrph: :v:] | :v: [:v: :mrph:NMLZ :v:]
+pattern :n: [:v: ka:pm: :v:] | :n: [:v: kà:pm:INF :v:]
# four-words composites
# n + pp + n + n = n
-pattern :n: [ :n: :pp: :n: :n: ] | :n: [ :n: :pp: :n: :n: ]
+pattern :n: [:n: :pp: :n: :n:] | :n: [:n: :pp: :n: :n:]
# n + pp + adj + n = n Ex: dugu.jukɔrɔ.nafoloma.fɛn
-pattern :n: [ :n: :pp: :adj: :n: ] | :n: [ :n: :pp: :adj: :n: ]
+pattern :n: [:n: :pp: :adj: :n:] | :n: [:n: :pp: :adj: :n:]
# n+v+v+n : kunnafoni.falen.falen.ko
-pattern :n: [ :n: :v: :v: :n: ] | :n: [ :n: :v: :v: :n: ]
+pattern :n: [:n: :v: :v: :n:] | :n: [:n: :v: :v: :n:]
# orthographically unlikely to be bamana words
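
Note on the reduplication rules above: they appear to embed Python-style regular expressions, where (?P<stem>.+) captures the stem as a named group and (?P=stem) backreferences it, so a rule only fires when the repeated parts of the form are identical. A minimal sketch of the same mechanics in plain re (the form falen.falen is borrowed from the comments above; the "." separator is illustrative):

import re

# hypothetical reduplicated form, morphemes joined by "."
form = u"falen.falen"
m = re.match(u"(?P<stem>.+)\\.(?P=stem)$", form)
if m :
    print m.group("stem")  # -> falen
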
diff --git a/exp_accuracy_no_coding.sh b/exp_accuracy_no_coding.sh
new file mode 100644
index 0000000..b1dd7a7
--- /dev/null
+++ b/exp_accuracy_no_coding.sh
@@ -0,0 +1,27 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_no_coding_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --no_coding"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+VAR_OPTS="-s "$NOM"_w_"$w".csv"
+
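+# stdbuf (GNU coreutils) forces line-buffered output from the pipeline;
+# gstdbuf is the g-prefixed equivalent installed by Homebrew coreutils on macOS.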
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
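
For readability: the gawk filter keeps two kinds of lines from the disambiguation.py output. Lines matching $KEYWORD2 (option echoes, accuracy and timing summaries) are echoed whole; for trainer progress lines matching $KEYWORD, only the trailing floating-point value captured by $FP_PAT is printed. A minimal Python sketch of that numeric extraction (the sample log line is hypothetical):

import re

FP_PAT  = r"[-+]?[0-9]+\.?[0-9]*"
KEYWORD = r"Seconds required for this iteration: |Error norm|Iteration #"

line = "Error norm: 1234.56"  # hypothetical trainer log line
m = re.match(r".*(%s)[^.0-9+-]*(%s)" % (KEYWORD, FP_PAT), line)
if m :
    print m.group(2)  # -> 1234.56
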
diff --git a/exp_accuracy_vs_evalsize.sh b/exp_accuracy_vs_evalsize.sh
new file mode 100644
index 0000000..b333a5a
--- /dev/null
+++ b/exp_accuracy_vs_evalsize.sh
@@ -0,0 +1,29 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_evalsize_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="--filtering --diacritic_only"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for evalsize in 10 20 30 40 50 60 70 80 90
+do
+VAR_OPTS="-e $evalsize -s "$NOM"_evalsize_"$evalsize".csv"
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type.sh b/exp_accuracy_vs_segmentation_type.sh
new file mode 100644
index 0000000..3ff4a72
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --filtering --diacritic_only"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_decomposition_A.sh b/exp_accuracy_vs_segmentation_type_no_decomposition_A.sh
new file mode 100644
index 0000000..e8e4322
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_decomposition_A.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_decomposition_A_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --filtering --diacritic_only --no_decomposition"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_decomposition_B.sh b/exp_accuracy_vs_segmentation_type_no_decomposition_B.sh
new file mode 100644
index 0000000..53355e4
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_decomposition_B.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_decomposition_B_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --filtering --diacritic_only --no_decomposition"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_filter.sh b/exp_accuracy_vs_segmentation_type_no_filter.sh
new file mode 100644
index 0000000..4be8c9c
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_filter.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_filter_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --diacritic_only"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/exp_accuracy_vs_segmentation_type_no_filter_no_decomposition.sh b/exp_accuracy_vs_segmentation_type_no_filter_no_decomposition.sh
new file mode 100644
index 0000000..75b87b7
--- /dev/null
+++ b/exp_accuracy_vs_segmentation_type_no_filter_no_decomposition.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -vx
+
+GIT_VERSION="$(git rev-parse HEAD)"
+NOM=exp_accuracy_vs_segmentation_type_no_filter_no_decomposition_$(date +%d_%H_%M)_"$GIT_VERSION"
+
+BASIC_OPTIONS="-v -t -l $NOM"
+SUPP_OPTIONS="-e 50 --diacritic_only --no_decomposition"
+
+KEYWORD="Seconds required for this iteration: |Error norm|Iteration #"
+KEYWORD2="[^_]diacritic_only|chunkmode|filtering|no_coding|no_decomposition|r_E|accuracy|done|eval|total"
+FP_PAT="[-+]?[0-9]+\.?[0-9]*"
+
+touch "$NOM.log"
+
+for w in -1 1 2 3 4 5 6 0
+do
+VAR_OPTS="-c $w -s "$NOM"_w_"$w".csv"
+
+if hash stdbuf 2>/dev/null; then
+stdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+else
+gstdbuf -oL python disambiguation.py $VAR_OPTS $SUPP_OPTIONS $BASIC_OPTIONS \
+| gawk "BEGIN{IGNORECASE=1} /.*($KEYWORD2).*/ {print \$0} match(\$0, /.*($KEYWORD)[^.0-9+-]*($FP_PAT)/, ary) {print ary[2]}" \
+>> "$NOM.log"
+fi
+done
diff --git a/fouille_erreurs.py b/fouille_erreurs.py
new file mode 100644
index 0000000..34dec82
--- /dev/null
+++ b/fouille_erreurs.py
@@ -0,0 +1,166 @@
+#coding=utf-8
+
+import re, argparse, sys, codecs
+from collections import Counter
+markers_tone=[unichr(0x0301),unichr(0x0300),unichr(0x0302),unichr(0x030c)]
+
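+# stat_from_cnt: print each error class as a percentage of the total.
+# NB: not called anywhere in this file, and the keys it expects ('E_c', 'E_p', ...)
+# differ from the tags produced by stat() below; it looks like a leftover helper.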
+def stat_from_cnt (cnt) :
+
+ tot = sum(cnt.values())
+ for k in ['E_c','E_p','E_pc','E_noise','E_silence'] :
+ print k,str(round(cnt[k] / float(tot) * 100, 2))+"%"
+
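+# rm_deletion: strip deletion operations from a differential code string,
+# i.e. drop everything from a '-' marker up to (but excluding) the next '+',
+# keeping only the insertion operations.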
+def rm_deletion (str_in) :
+
+ str_out = ''
+ record = True
+ for c in str_in :
+ if record :
+ if c != '-':
+ str_out += c
+ else :
+ record = False
+ else :
+ if c == '+':
+ str_out += c
+ record = True
+
+ return str_out
+
+
+def line2entries (str_in, min_len = 6, max_len = 6) :
+
+ regex = re.compile('[\n\r]')
+ str_in = regex.sub('', str_in)
+ entries = str_in.strip().split(',')
+ if len(entries) < min_len or len(entries) > max_len : entries = None
+ return entries
+
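+# align: greedily pair edit operations from lst1 (gold) with operations from lst2
+# (predicted), optionally requiring the characters at position_to_match to agree;
+# each operation is paired at most once, and pairs already in dico are kept.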
+def align(dico, lst1, lst2, position_to_match = -1) :
+
+ if lst2 == [u"NULL"] :
+ return dico
+ if not lst1 :
+ return dico
+
+ for i, x in enumerate(lst1) :
+ for j, y in enumerate(lst2) :
+ if not dico : cond = True
+ else : cond = (i not in dico.keys()) and (j not in dico.values())
+ if cond :
+ if position_to_match < 0 :
+ dico[i] = j
+ break
+ elif x[position_to_match] == y[position_to_match]:
+ dico[i] = j
+ break
+ return dico
+
+def make_aligned_result(match,ops1,ops2) :
+
+ dst = []
+ for i, x in enumerate(ops1) :
+ if i in match.keys() :
+ dst.append(ops2[match[i]])
+ else :
+ dst.append('')
+
+ return dst
+
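+# stat: classify every gold tone operation against its aligned prediction:
+# no aligned op (silence), exact match, same position but wrong tone mark (E_c),
+# same tone mark but wrong position (E_p), or both wrong.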
+def stat (str1, str2, cnt, cnt2) :
+
+    # split each entry into its edit operations
+ ops1 = [x for x in str1.split('+') if x]
+ ops2 = [x for x in str2.split('+') if x]
+
+    # greedy alignment of gold vs. predicted operations (not very efficient)
+ match = dict()
+ match = align(match, ops1, ops2, 0)
+ match = align(match, ops1, ops2, 1)
+ match = align(match, ops1, ops2)
+ dst = make_aligned_result(match, ops1, ops2)
+ src = ops1
+
+ # stat : op = p,c
+ for op, op2 in zip(src, dst) :
+ if op[1] not in markers_tone :
+ continue
+ if not op2 : # silence
+ tag = '3_err_silence'
+ tag2 = ''
+ elif op == op2 : # perfect
+ tag = '4_good'
+ tag2 = op[1]
+ elif op[0] == op2[0] : # E_c - E_p
+ tag = '0_err_c'
+ tag2 = op[1] + u'___' + op2[1]
+ elif op[1] == op2[1] : # E_p - E_c
+ tag = '1_err_p'
+ tag2 = ''
+ else : # E_p inter E_c
+ tag = '2_err_others'
+ tag2 = ''
+
+ cnt[tag] += 1
+ if tag2 : cnt2[tag2] += 1
+
+ return [cnt,cnt2]
+
+def main(infile) :
+
+ cnt = Counter()
+ cnt2 = Counter()
+ with codecs.open(infile,'r', encoding='utf-8') as f :
+ for line in f:
+
+ cols = line2entries(line)
+ if cols :
+                token, gold_form, test_form, gold_code, test_code, _cmp = cols
+ gold_code_segments = gold_code.split()
+ test_code_segments = test_code.split()
+ for gold_code_segment, test_code_segment in zip(gold_code_segments, test_code_segments) :
+ gold_code_segment = rm_deletion(gold_code_segment)
+ test_code_segment = rm_deletion(test_code_segment)
+ if gold_code_segment != "NULL" and gold_code_segment :
+ cnt,cnt2 = stat(gold_code_segment, test_code_segment, cnt,cnt2)
+ return cnt,cnt2
+
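+# print_cnt: mode 0 prints one "tag = relative frequency" line per key; mode 1
+# prints a LaTeX-ready confusion matrix over tone marks (rows = gold, columns =
+# predicted), with '&' column separators and LaTeX row breaks at line ends.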
+def print_cnt (cnt, mode) :
+
+ tot = float(sum(cnt.values()))
+
+
+ if mode == 0:
+ for k in sorted(cnt.keys()) :
+ print u"{:16s} = {:05.4f}".format(k, cnt[k] / tot)
+ else :
+ # horizontal label
+ sys.stdout.write(u"{:5s} ".format(''))
+ for k2 in markers_tone :
+ sys.stdout.write(u"{:>5s} ".format(k2))
+ print ""
+
+ for k1 in markers_tone :
+ for i,k2 in enumerate(markers_tone) :
+ tag = k1 + u'___' + k2
+ if k1 == k2 and k1 in cnt.keys() : val = cnt[k1] / tot
+ elif k1 != k2 and tag in cnt.keys() : val = cnt[tag] / tot
+ else : val = 0
+ # vertical label
+ if not i : sys.stdout.write(u"{:>5s} & ".format(k1))
+ # matrix content
+ if i == len(markers_tone) - 1 : c = '\\\\'
+ else : c = '&'
+ sys.stdout.write(u"{:5.4f} {:1s} ".format(val,c))
+ print ""
+
+if __name__ == "__main__" :
+
+ aparser = argparse.ArgumentParser()
+    aparser.add_argument('infile', help='Input file (.csv)')
+ args = aparser.parse_args()
+ cnt,cnt2 = main(args.infile)
+ print args.infile
+ print_cnt(cnt,0)
+ print ""
+ print_cnt(cnt2,1)
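
A quick sanity check of the taxonomy above, assuming the operation format that stat() implies, i.e. '+' followed by a syllable index and a combining tone mark (inferred from the code, not from documentation): one operation with the right position but the wrong accent, and one exact match.

from collections import Counter
from fouille_erreurs import stat

gold = u'+0\u0301+1\u0300'  # acute on syllable 0, grave on syllable 1
test = u'+0\u0300+1\u0300'  # grave predicted on both syllables
cnt, cnt2 = stat(gold, test, Counter(), Counter())
print cnt   # -> Counter({'0_err_c': 1, '4_good': 1})
print cnt2  # -> Counter({u'\u0301___\u0300': 1, u'\u0300': 1})
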
diff --git a/kill_all_exps.sh b/kill_all_exps.sh
new file mode 100644
index 0000000..bce7615
--- /dev/null
+++ b/kill_all_exps.sh
@@ -0,0 +1,7 @@
+#! /bin/sh
+
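+# Blunt cleanup: killall matches every process with these names for the current
+# user ("killall bash" may also hit unrelated bash sessions), so only run this
+# on a dedicated experiment machine.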
+killall tail
+killall gawk
+killall Python
+killall python
+killall bash
diff --git a/launch_all_exps.sh b/launch_all_exps.sh
new file mode 100644
index 0000000..54a520b
--- /dev/null
+++ b/launch_all_exps.sh
@@ -0,0 +1,10 @@
+#! /bin/sh
+
+#set -vx
+
+for f in exp*.sh ; do
+ bash "$f" &
+done
+
+sleep 5
+tail -f *.log
diff --git a/models/model_pos_exactitude_92p3 b/models/model_pos_exactitude_92p3
new file mode 100644
index 0000000..3ac9217
Binary files /dev/null and b/models/model_pos_exactitude_92p3 differ
diff --git a/models/model_tone_exactitude_92p1.zip b/models/model_tone_exactitude_92p1.zip
new file mode 100644
index 0000000..df74bfd
Binary files /dev/null and b/models/model_tone_exactitude_92p1.zip differ