From 779577d365714fc48e5010c8ab28a36e5a2e4a6c Mon Sep 17 00:00:00 2001
From: nlpAr
Date: Mon, 24 Jul 2017 11:01:46 +0200
Subject: [PATCH 1/2] Adding gloss disambiguation by n-grams
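
A minimal sketch of the intended data flow, for review purposes. The tokens
and glosses below are toy placeholders (the real corpus is Bambara), and the
order of the candidate lists inside the dictionaries may vary:

    from dict_creation import n_grams_freq, multi_freq, annot_pars

    train = [[('i', 'PRON'), ('run', 'V')],
             [('i', 'PRON'), ('run', 'N')],
             [('i', 'PRON'), ('run', 'V')]]
    d1, d2, d3 = n_grams_freq(train)
    # d1 == {('i', u'PRON'): 3, ('run', u'V'): 2, ('run', u'N'): 1}
    # d2 pairs a token n-gram with a gloss n-gram, e.g.
    #   (('i', 'run'), (u'PRON', u'V')) -> 2
    d1, d2, d3 = multi_freq(d1), multi_freq(d2), multi_freq(d3)
    # d1 == {'i': [[u'PRON', 3]], 'run': [[u'V', 2], [u'N', 1]]}
    print annot_pars(['i', 'run'], d1, d2, d3)
    # -> [('i', u'PRON'), ('run', u'V')]
    # annot_pars backs off from trigrams to bigrams to unigrams
    # and tags digit-only tokens as CARDINAL.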
---
 dict_creation.py | 107 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 dict_creation.py

diff --git a/dict_creation.py b/dict_creation.py
new file mode 100644
index 0000000..e5d9dd6
--- /dev/null
+++ b/dict_creation.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# from dict_creation import n_grams_freq, multi_freq, annot_pars
+
+from collections import defaultdict
+
+def n_grams_freq(train_set):
+    """
+    Input
+        train_set : list of sentences ; [ [(token, gloss), ...], ... ]
+    Return
+        d1, d2, d3 : three n-gram dictionaries ; { (n-gram) : frequency }
+    """
+    d1 = dict() # Unigrams dict
+    d2 = dict() # Bigrams dict
+    d3 = dict() # Trigrams dict
+    for phrase in train_set:
+        p = list()
+        g = list()
+        for token, gloss in phrase:
+            p.append(token)
+            g.append(gloss.decode('utf-8'))
+        for y, token in enumerate(p):
+            d1[(token, g[y])] = d1.get((token, g[y]), 0) + 1
+            try:
+                d2[(token, p[y+1]), (g[y], g[y+1])] = d2.get(((token, p[y+1]), (g[y], g[y+1])), 0) + 1
+            except IndexError:
+                pass
+            try:
+                d3[(token, p[y+1], p[y+2]), (g[y], g[y+1], g[y+2])] = d3.get(((token, p[y+1], p[y+2]), (g[y], g[y+1], g[y+2])), 0) + 1
+            except IndexError:
+                pass
+    return d1, d2, d3
+
+#############################################################################################
+
+def multi_freq(dico):
+    """
+    Input
+        dico : a dictionary ; { (token, gloss) : frequency }
+    Return
+        dic : a dictionary ; { token : [ [gloss, freq], ... ] }
+    """
+    dic = defaultdict(list)
+    for c in dico:
+        dic[c[0]].append([c[1], dico[c]])
+    return dic
+
+#############################################################################################
+
+def annot_pars(sent, d1, d2, d3):
+    """
+    Input
+        sent : a list of tokens (one sentence)
+        d1, d2, d3 : n-gram dictionaries made by n_grams_freq then multi_freq
+    Return
+        a list of tuples [ (token, annotated_gloss) ]
+    """
+    new_sent = [u''] * len(sent)
+    # Trigram pass
+    for i, token in enumerate(sent):
+        try:
+            maxi = 0
+            win = [u''] * 3
+            for val in d3[(token, sent[i+1], sent[i+2])]:
+                if val[1] > maxi:
+                    maxi = val[1]
+                    win[0] = val[0][0]
+                    win[1] = val[0][1]
+                    win[2] = val[0][2]
+                    new_sent[i] = win[0]
+                    new_sent[i+1] = win[1]
+                    new_sent[i+2] = win[2]
+        except IndexError:
+            pass
+
+    # Bigram pass, only over positions the trigram pass left empty
+    for i, token in enumerate(sent):
+        try:
+            if new_sent[i] == u'' and new_sent[i+1] == u'':
+                maxi = 0
+                win = [u''] * 2
+                for val in d2[(token, sent[i+1])]:
+                    if val[1] > maxi:
+                        maxi = val[1]
+                        win[0] = val[0][0]
+                        win[1] = val[0][1]
+                        new_sent[i] = win[0]
+                        new_sent[i+1] = win[1]
+        except IndexError:
+            pass
+
+    # Unigram pass, as a last resort
+    for i, token in enumerate(sent):
+        if token.isdigit():
+            new_sent[i] = u"CARDINAL"
+        else:
+            if new_sent[i] == u'':
+                maxi = 0
+                win = [u'']
+                for val in d1[token]:
+                    if val[1] > maxi:
+                        maxi = val[1]
+                        win[0] = val[0]
+                        new_sent[i] = win[0]
+
+    return zip(sent, new_sent)
+
+

From e109465e3e4b5542b12d060fafcca133e7674592 Mon Sep 17 00:00:00 2001
From: nlpAr
Date: Mon, 24 Jul 2017 11:07:09 +0200
Subject: [PATCH 2/2] Add dict_creation-based gloss disambiguation by n-grams
 to disambiguation.py
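
Hypothetical usage, for reviewers (model and file names are placeholders;
training walks ../corbama for *.dis.html files):

    # train the n-gram gloss model: the three dictionaries are pickled to F
    python disambiguation.py -G -l gloss_ngrams.mod

    # disambiguate one file with the saved model, keeping only the best gloss
    python disambiguation.py -G -d gloss_ngrams.mod --select -i infile.dis.html -o outfile.dis.html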
---
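Note: the -G model file is a plain cPickle dump of the tuple
(dic_uni, dic_bi, dic_tri) built by dict_creation. A minimal round-trip,
assuming a previously trained model saved as gloss_ngrams.mod (path and
probe token are hypothetical):

    import cPickle as pickle
    with open('gloss_ngrams.mod', 'rb') as f:
        d1, d2, d3 = pickle.load(f)
    # d1 maps a token to its [gloss, frequency] candidates;
    # d2 and d3 do the same for token bigrams and trigrams
    print d1.get(u'ye', [])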
 disambiguation.py | 833 ++++++++++++++++++++++++++++------------------
 1 file changed, 513 insertions(+), 320 deletions(-)

diff --git a/disambiguation.py b/disambiguation.py
index a15bd26..649f6a5 100644
--- a/disambiguation.py
+++ b/disambiguation.py
@@ -3,341 +3,534 @@
 # Auteur : Elvis Mboning, Stagiaire 2016, INALCO
 # Auteur : Damien Nouvel, MCF, INALCO
+# Auteur : Luigi (Yu-Cheng) Liu, Stagiaire 2017, INALCO
+# Auteur : Arthur Provenier, Stagiaire 2017, INALCO (gloss disambiguation)

 # Le principale rôle de ce script est de créer des modèles de données pour l'apprentissage automatique avec CRFTagger.
 # Le CRF implémenté provient du module tag de NLTK inspiré de CRFSuite (http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf).
 # Trois modèles sont possibles : les POS, les tons, les gloses

-# todo:
-# * petit rapport sur les distributions de caractères et leurs natures dans le corpus
-## * enregistrement et téléverser
-# * models produits /models/pos_exactitude_0p92.mod
-# * models produits /models/tone_exactitude_0p91.mod
-# * avec un fihier in et un fichier out
-#
-# des RDV. prévus
-# mercredi 17 mai à 14 : 30
-
-import sys, re, codecs, glob, time, os
-import argparse
+# Glosses can be predicted with two different models :
+# one using CRF (option -g) ; training is not time-efficient and accuracy is uncertain
+# one using n-grams (option -G) ; faster and more accurate than CRF (~ 10 s, accuracy : 0.91)
+# The generated n-gram dictionaries are pickled during training and can be loaded later for disambiguation (option -d)
+
+import sys, re, codecs, glob, time, os, collections, argparse, itertools
 import formats, grammar
-import collections
+from gdisamb import FileParser
 from ntgloss import Gloss
 from nltk.tag.crf import CRFTagger
-from gdisamb import FileParser
-from differential_tone_coding import encoder_tones, repr, token_seperator, _get_features_customised_for_tones, code_dispatcher, code_resort, mode_indicators
-import unicodedata
-import pycrfsuite
-import csv
 import nltk.tag.util
-import itertools
-from nltk.metrics.scores import accuracy
-import zipfile
+import pycrfsuite
+from differential_tone_coding import apply_filter_to_base_element, get_features_customised, get_duration, sampling, csv_export, unzip, encoder_tones, mode_indicators, marginal_tone, accuray2, get_sub_tone_code_of_sentence, accumulate_tone_code_of_dataset, reshape_tokens_as_sentnece, make_tokens_from_sentence, make_features_from_tokens
+import unicodedata
+import zipfile, ntpath
-import codecs, sys
+import fnmatch

 sys.stdin = codecs.getreader('utf8')(sys.stdin)
 sys.stdout = codecs.getwriter('utf8')(sys.stdout)

-def unzip(input) :
-	return [list(li) for li in zip(*input)]
-
-# dataset : list((str,str))
-def getTag(dataset) :
-	ret = []
-	buf = str()
-	for data in dataset :
-		if data[0] != token_seperator :
-			buf += data[1]
-		else :
-			ret.append(buf)
-			buf = str()
-	if buf :
-		ret.append(buf)
-	return ret
-
-def csv_export(enc, filename, gold_tokens, test_tokens):
-
-	try :
-		csvfile = codecs.open(filename, 'wb')
-		writer = csv.writer(csvfile)
-		writer.writerow(["Token", "Golden Form", "Predicted Form","Golden code", "Predicted code", "Same"])
-		for g, t in zip(gold_tokens, test_tokens) :
-			token = g[0]
-			golden_code = g[-1]
-			predicted_code = t[-1]
-			golden_form = enc.differential_decode(token, golden_code.decode('utf-8'))
-			predicted_form = enc.differential_decode(token, predicted_code.decode('utf-8'))
-			sameCodes = (golden_code == predicted_code)
-			sameForms = (golden_form == predicted_form)
-
-			if not repr(token.encode('utf-8')) :
-				sameCodes = u''
-			row = [\
-				repr(token.encode('utf-8')), \
-				repr(golden_form.encode('utf-8')), \
-				repr(predicted_form.encode('utf-8')), \
-				repr(golden_code, spaces=True), \
-				repr(predicted_code, spaces=True), \
-				sameCodes]
-
-			writer.writerow(row)
-		csvfile.close()
-	except :
-		raise
-		print "unable to dump result in CSV file to create !"
-
-def sampling(allsents, p, ratio = 1) :
-	train_set, eval_set = [], []
-	for i, sent in enumerate(allsents[0 : : int(1/float(ratio))]) :
-		p_approx = float(len(train_set) + 1) / float(len(eval_set) + len(train_set) + 1)
-		if p_approx <= p :
-			train_set.append(sent)
-		else:
-			eval_set.append(sent)
-	return [train_set, eval_set]
-
-def get_duration(t1_secs, t2_secs) :
-	secs = abs(t1_secs - t2_secs)
-	days = secs // 86400
-	hours = secs // 3600 - days * 24
-	minutes = secs // 60 - hours * 60 - days * 60 * 24
-	secondes = int(secs) % 60
-	return '{:>02.0f}:{:>02.0f}:{:>02.0f}:{:>02d}'.format(days, hours, minutes, secondes)
+from dict_creation import n_grams_freq, multi_freq, annot_pars
+import cPickle as pickle
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+

 def main():
-	aparser = argparse.ArgumentParser(description='Daba disambiguator')
-	aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
-	aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
-	aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
-	aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
-	# aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
-	aparser.add_argument('-e', '--evalsize', help='Percent of training data with respect to training and test one (default 10)', default=10)
-	aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order', default=None)
-	aparser.add_argument('--select', help = 'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.', action='store_true')
-	aparser.add_argument('-i', '--infile' , help='Input file (.html)' , default=sys.stdin)
-	aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
-	aparser.add_argument('-s', '--store', help='Store tagged raw data in file (.csv) for further research purpose', default=None)
-
-	args = aparser.parse_args()
-	if args.verbose :
-		print args
-
-	if args.learn and (args.pos or args.tone or args.gloss):
-
-		if not (args.pos or args.tone or args.gloss) :
-			print 'Choose pos, tone, gloss or combination of them'
-			exit(0)
-
-		print 'Make list of files'
-		files1 = glob.iglob("../corbama/*/*.dis.html")
-		files2 = glob.iglob("../corbama/*.dis.html")
-
-		allfiles = ""
-		for file1, file2 in zip(files1, files2):
-			allfiles += file1+','+file2+','
-		allsents = []
-
-		# pour le débogage
-		allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'
-
-		if args.tone :
-			try :
-				enc = encoder_tones()
-			except :
-				enc = None
-				print ("Error : unable to initialize the tone encoder !")
-
-		print 'Open files and find features / supervision tags'
-		for infile in allfiles.split(','):
-			if(len(infile)) :
-				print '-', infile
-				sent = []
-
-				html_parser = FileParser()
-				html_parser.read_file(infile)
-
-				for snum, sentence in enumerate(html_parser.glosses) :
-					for tnum, token in enumerate(sentence[2]) :
-						tag = ''
-						if token.type == 'w' or token.type == 'c':
-							tags = ''
-							if args.pos:
-								for ps in token.gloss.ps : tags += ps.encode('utf-8')
-								sent.append((token.token, tags))
-							elif args.tone:
-								# Pourquoi ne pas apprendre la forme tonale contenant une barre veticale ?
-								# Parce que dans l'ensemble des corpus désambiguïsés, son occurrence est
-								# au dessous de 10, ce cas de figure semble trop peu fréquent pour apporter
-								# une réélle amélioration dans la modélisation de tonalisation. Néanmoins,
-								# dans la conception du cadre logiciel, rien n'interdit de l'inclure dans
-								# les données d'entraînement et d'en observer le apport
-								if '|' not in token.gloss.form :
-									[codes, chunks] = enc.differential_encode(token.token, token.gloss.form)
-									for chunk, code in zip(chunks, codes) :
-										try : sent.append((chunk, code.encode('utf-8')))
-										except LookupError: pass
-							"""
-							elif args.gloss:
-								tags += token.gloss.gloss.encode('utf-8')
-								sent.append((token.token, tags))
-							"""
-
-				if len(sent) > 1:
-					allsents.append(sent)
-					sent = []
-
-		if args.verbose and args.tone :
-			enc.report()
-
-		# Constitution des ensmebles d'entraînement de d'évaluation
-		p = (1 - args.evalsize / 100.0)
-		train_set, eval_set = sampling(allsents, p)
-		print 'Split the data in train (', len(train_set),' sentences) / test (', len(eval_set),' sentences)'
-
-		print 'Building classifier (CRF/NLTK)'
-		# Initialization
-		t1 = time.time()
-		if args.tone :
-			num_phases = len([False, True]) * len(mode_indicators)
-			myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
-		else :
-			num_phases = 1
-
-		# Training
-		for phase in range(num_phases) :
-			tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
-			trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
-			trainer.set_params(tagger._training_options)
-			if num_phases > 1 :
-				model_name = args.learn + '.' + str(phase)
-			else:
-				model_name = args.learn
-
-			# train_set : list(list((str,list(str))))
-			for sent in train_set:
-				tokens = unzip(sent)[0]
-				labels = unzip(sent)[1]
-				if num_phases > 1 :
-					for lab in labels :
-						pass
-					labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
-				features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
-				trainer.append(features, labels)
-			trainer.train(model = model_name)
-			if num_phases > 1 :
-				myzip.write(model_name)
-				os.remove(model_name)
-		if num_phases > 1 :
-			myzip.close()
-
-		print "... done in", get_duration(t1_secs = t1, t2_secs = time.time())
-
-		# Evaluation
-		print 'Evaluating classifier'
-		# gold_set, predicted_set : list(list((str, str)))
-		# input_set, output_gold_set : list(list(str))
-		gold_set = eval_set
-		input_set = [unzip(sent)[0] for sent in gold_set]
-		predicted_set = [list() for sent in gold_set]
-		if num_phases > 1 :
-			myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
-		for phase in range(num_phases) :
-			tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
-			trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
-			trainer.set_params(tagger._training_options)
-			if num_phases > 1:
-				model_name = args.learn + '.' + str(phase)
-				myzip.extract(model_name)
-			else :
-				model_name = args.learn
-			tagger.set_model_file(model_name)
-			for i, sent in enumerate(input_set) :
-				features = [_get_features_customised_for_tones(sent,j) for j in range(len(sent))]
-				labels = tagger._tagger.tag(features)
-				if num_phases > 1 :
-					labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
-				tagged_sent = list(zip(sent, labels))
-				if not predicted_set[i] :
-					predicted_set[i] = tagged_sent
-				else :
-					sent_acc, labels_acc = unzip(predicted_set[i])
-					labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)]
-					predicted_set[i] = list(zip(sent_acc, labels_acc))
-			if num_phases > 1 :
-				os.remove(model_name)
-		myzip.close()
-
-		# gold_tokens, predicted_tokens : list((str,str))
-		predicted_tokens = list(itertools.chain(*predicted_set))
-		if num_phases > 1 :
-			predicted_tokens = [ tuple([pair[0], code_resort(pair[1].decode('utf-8')).encode('utf-8')]) for pair in predicted_tokens]
-		gold_tokens = list(itertools.chain(*gold_set))
-		# gold_tokens_eval, predicted_tokens_eval : list(str)
-		if args.tone :
-			gold_tokens_eval = getTag(gold_tokens)
-			predicted_tokens_eval = getTag(predicted_tokens)
-		else :
-			gold_tokens_eval = gold_tokens
-			predicted_tokens_eval = predicted_tokens
-
-		if args.store and args.tone :
-			stored_filename = args.store
-			csv_export(enc, stored_filename, gold_tokens, predicted_tokens)
-
-		print "Exactitude : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval))
-
-		if args.verbose and args.store :
-			print ("Tagged result is exported in {}".format(args.store))
-
-	elif args.disambiguate and args.infile and args.outfile :
-		# Lecture de texte en .HTML
-		html_parser = FileParser()
-		tagger = CRFTagger()
-
-		if args.pos :
-			try :
-				tagger.set_model_file(args.disambiguate)
-			except IOError:
-				print "Error : unable to open the model {} !".format(args.infile)
-				exit(1)
-			try :
-				html_parser.read_file(args.infile)
-			except IOError:
-				print "Error : unable to open the input file {} !".format(args.infile)
-				exit(1)
-
-			# Exportation du résultat de désambiguïsation en .HTML
-			for snum, sentence in enumerate(html_parser.glosses) :
-				tokens = [token.token for token in sentence[2]]
-				features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
-				tagger._tagger.set(features)
-				for tnum, token in enumerate(sentence[2]) :
-					options = list()
-					if token.value and len(token.value) > 2:
-						for nopt, option in enumerate(token.value[2]) :
-							try: tag = option.ps[0]
-							except IndexError : tag = ''
-							prob = tagger._tagger.marginal(tag, tnum)
-							options.append((prob, option))
-						reordered_probs, reordered_options = unzip(sorted(options, reverse = True))
-						if args.select :
-							prob_max = reordered_probs[0]
-							reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
-						html_parser.glosses[snum][1][tnum] = reordered_options
-
-		elif args.tone :
-			pass
-
-		try : html_parser.write(args.outfile)
-		except IOError: print "Error : unable to create the output file {}".format(args.outfile)
-
-	else :
-		aparser.print_help()
-
-
-
-	exit(0)
+    aparser = argparse.ArgumentParser(description='Daba disambiguator')
+    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
+    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
+    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
+    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
+    aparser.add_argument('-g', '--gloss', help='Prediction for glosses using CRFsuite', default=False, action='store_true')
+    aparser.add_argument('-G', '--Gloss', help='Prediction for glosses using n-grams', default=False, action='store_true')
+    aparser.add_argument('-e', '--evalsize', help='Percentage of the data reserved for evaluation (default 10)', default=10, type=float)
+    aparser.add_argument('-c', '--chunkmode', help='Chunking mode specification, effective only for tone (default -1)', default=-1, type=int)
+    aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data; each gloss list is reordered by decreasing probability', default=None)
+    aparser.add_argument('--select', help = 'Only effective together with -d : keep only the most likely gloss in each list.', action='store_true')
+
+    aparser.add_argument('--filtering', help = 'Experimental option', action='store_true')
+    aparser.add_argument('--no_decomposition', help = 'Experimental option', action='store_true')
+    aparser.add_argument('--diacritic_only', help = 'Experimental option', action='store_true')
+    aparser.add_argument('--non_diacritic_only', help = 'Experimental option', action='store_true')
+    aparser.add_argument('--no_coding', help = 'Experimental option', action='store_true')
+
+    aparser.add_argument('-i', '--infile' , help='Input file (.html)' , default=sys.stdin)
+    aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
+    aparser.add_argument('-s', '--store', help='Store evaluation result in file (.csv) for further research purpose', default=None)
+
+    args = aparser.parse_args()
+    if args.verbose :
+        print 'Arguments received by script'
+        dico = vars(args)
+        for key, val in dico.items():
+            typeName = type(val).__name__
+            sys.stdout.write("\t{} = {} ".format(key, val))
+            if val :
+                sys.stdout.write("({})".format(typeName))
+            print ""
+
+    if not (args.pos or args.tone or args.gloss or args.Gloss) :
+        print 'Choose pos, tone, gloss or Gloss'
+        aparser.print_help()
+        exit(0)
+
+    if args.learn :
+        print 'Make list of files'
+
+        allfiles = []
+        for root, dirnames, filenames in os.walk('../corbama'):
+            for filename in fnmatch.filter(filenames, '*.dis.html'):
+                allfiles.append(os.path.join(root, filename))
+
+        allsents = []
+
+        # for quick debugging
+        #allfiles = list()
+        #allfiles.append(u'../corbama/musokonoma_ka_banaw.dis.html')
+        #allfiles.append(u'../corbama/sisoko-daa_ka_kore.dis.html')
+
+        print 'Making observation data from the disambiguated corpus files'
+        for infile in allfiles:
+            if infile :
+                print '\t', infile
+
+                html_parser = FileParser()
+                html_parser.read_file(infile)
+
+                sent = []
+                for sentence in html_parser.glosses :
+                    for token in sentence[2] :
+                        if token.type == 'w' or token.type == 'c':
+                            if args.pos and not args.tone and not args.gloss :
+                                # sent : list((str, str))
+                                tags = ''
+                                for ps in token.gloss.ps :
+                                    tags += ps
+                                sent.append((token.token, tags.encode('utf-8')))
+                            elif args.tone and not args.pos and not args.gloss :
+                                # sent : list((str, str))
+                                form = token.gloss.form.split('|')
+                                tags = form[0]
+                                sent.append((token.token, tags.encode('utf-8')))
+                            elif (args.gloss or args.Gloss) and not args.tone and not args.pos :
+                                # sent : list((str, str))
+                                tags = token.gloss.gloss
+                                sent.append((token.token, tags.encode('utf-8')))
+                            else :
+                                print ('Error : multi-modal learning is not yet supported !')
+                                exit()
+
+                    if len(sent) > 1:
+                        allsents.append(sent)
+                        sent = []
+
+        if args.tone and not args.no_coding :
+            print 'Token segmentation and tonal information compression'
+            enc = encoder_tones()
+            allsents2 = allsents
+            allsents = []
+            for sent in allsents2 :
+                sent2 = []
+                for token_tags in sent :
+                    token, tags = token_tags
+                    [codes, syllabes] = enc.differential_encode(token, tags.decode('utf-8'), args.chunkmode)
+                    token2 = [(syllabe, code.encode('utf-8')) for syllabe, code in zip(syllabes, codes)]
+                    sent2.append(token2)
+                allsents.append(sent2)
+
+            if args.verbose :
+                enc.report()
+
+        R = 1 # 1 to use the whole corpus
+        p = (1 - args.evalsize / 100.0)
+        train_set, eval_set = sampling(allsents, p, R)
+        print 'Split the data in train (', len(train_set), ' sentences) / test (', len(eval_set), ' sentences)'
+
+        # Initialization
+        t1 = time.time()
+
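+        # The -G gloss model consists of the three n-gram frequency
+        # dictionaries built by dict_creation (unigram, bigram, trigram),
+        # pickled together into a single file.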
+        if args.Gloss:
+            print "Making dictionaries"
+            dic_unigram, dic_bigram, dic_trigram = n_grams_freq(train_set)
+
+            if len(dic_unigram) == 0:
+                print "Error : the unigram dictionary is empty!"
+                exit(1)
+            if len(dic_bigram) == 0:
+                print "Error : the bigram dictionary is empty!"
+                exit(1)
+            if len(dic_trigram) == 0:
+                print "Error : the trigram dictionary is empty!"
+                exit(1)
+
+            dic_uni = multi_freq(dic_unigram)
+            dic_bi = multi_freq(dic_bigram)
+            dic_tri = multi_freq(dic_trigram)
+            print "Dictionaries created"
+
+            with open(args.learn, "wb") as F_OUT:
+                pickle.dump((dic_uni, dic_bi, dic_tri), F_OUT)
+
+        else:
+            print 'Building classifier (pyCRFsuite)'
+            if args.tone and not args.no_coding :
+                num_phases = 2 * len(mode_indicators)
+                myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
+            else :
+                num_phases = 1
+
+            # A. Train the models
+            for phase in range(num_phases) :
+                # A.1. Initialize a new CRF model
+                tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
+                trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
+                trainer.set_params(tagger._training_options)
+                model_name = args.learn
+                if args.tone and not args.no_coding :
+                    if args.diacritic_only and (phase == 0 or phase == 1) :
+                        continue
+                    if args.non_diacritic_only and (phase == 2 or phase == 3) :
+                        continue
+                    elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+                        continue
+                    model_name += '.' + str(phase)
+
+                # A.2. Flatten the data structures to prepare the contextual training
+                for sent in train_set :
+                    if args.tone and not args.no_coding :
+                        [tokens, labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+                        features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+                        labels = get_sub_tone_code_of_sentence(sent, phase, sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+                        labels = list(itertools.chain(*labels))
+                    else :
+                        [tokens, labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+                        features = make_features_from_tokens(tokens, 0, args.tone and not args.no_coding)
+
+                    trainer.append(features, labels)
+                trainer.train(model = model_name)
+
+                if args.tone and not args.no_coding :
+                    myzip.write(model_name)
+                    os.remove(model_name)
+
+            if args.tone and not args.no_coding :
+                myzip.close()
+
+        print "... done in", get_duration(t1_secs = t1, t2_secs = time.time())
+
+        # B. Evaluation
+        print 'Evaluating classifier'
+        gold_set = eval_set
+
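+        # n-gram gloss prediction needs no CRF model : each evaluation
+        # sentence is annotated directly with annot_pars.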
+        if args.Gloss:
+            predicted_set = list()
+            for sentence in eval_set:
+                sent = [token[0] for token in sentence]
+                predicted_set.append(annot_pars(sent, dic_uni, dic_bi, dic_tri))
+
+        else:
+            if args.tone and not args.no_coding :
+                myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
+                predicted_set_acc = list()
+                for phase in range(num_phases) :
+
+                    # B.1. Load the CRF model of one of the four tonal annotation phases
+                    tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
+                    trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
+                    trainer.set_params(tagger._training_options)
+                    model_basename = ''
+                    for m in myzip.namelist() :
+                        if m.endswith(str(phase)):
+                            model_basename = m
+                            break
+                    if not model_basename :
+                        continue
+                    if args.diacritic_only and (phase == 0 or phase == 1) :
+                        continue
+                    if args.non_diacritic_only and (phase == 2 or phase == 3):
+                        continue
+                    elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+                        continue
+
+                    myzip.extract(model_basename)
+                    tagger.set_model_file(model_basename)
+                    os.remove(model_basename)
+
+                    # B.2. Automatic syllable-by-syllable annotation of each sentence
+                    predicted_set = list()
+                    for p, sent in enumerate(gold_set) :
+
+                        [tokens, gold_labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+                        features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+                        labels = tagger._tagger.tag(features)
+                        labels = reshape_tokens_as_sentnece(labels, sent)
+
+                        predicted_tokens = list()
+                        for i, token in enumerate(sent) :
+                            predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
+                        predicted_set.append(predicted_tokens)
+
+                    # B.3. Accumulate and order the syllabic annotation
+                    if not predicted_set_acc :
+                        predicted_set_acc = \
+                            [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set]
+
+                    predicted_set_acc = accumulate_tone_code_of_dataset(predicted_set_acc, predicted_set)
+
+                predicted_set = predicted_set_acc
+
+            else :
+                # B.1. Load the CRF model for the annotation
+                tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
+                trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
+                trainer.set_params(tagger._training_options)
+                model_name = args.learn
+                tagger.set_model_file(model_name)
+
+                # B.2. Automatic token-by-token annotation
+                predicted_set = list()
+                for sent in gold_set :
+                    [tokens, gold_labels] = make_tokens_from_sentence(sent, args.tone and not args.no_coding)
+                    features = make_features_from_tokens(tokens, 0, args.tone and not args.no_coding)
+                    labels = tagger._tagger.tag(features)
+                    predicted_set.append(zip(tokens, labels))
+
+        if args.tone and not args.no_coding :
+            # adjust the evaluation for partial learning : filter out the
+            # characters that were ignored during training, otherwise the
+            # result would be penalized by comparing a partially predicted
+            # form with the full tonal form of the same token
+            if args.diacritic_only :
+                gold_set = apply_filter_to_base_element(gold_set, [2,3], sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+            elif args.non_diacritic_only :
+                gold_set = apply_filter_to_base_element(gold_set, [0,1], sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+            elif args.filtering :
+                gold_set = apply_filter_to_base_element(gold_set, [0,1,2,3], sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+
+        print "Accuracy : {:>5.3f}".format(accuray2(gold_set, predicted_set, args.tone and not args.no_coding))
+
+        if args.store :
+            stored_filename = args.store
+            csv_export(stored_filename, gold_set, predicted_set, args.tone and not args.no_coding)
+
+        if args.verbose and args.store :
+            print ("Tagged result is exported in {}".format(args.store))
+
+    elif args.disambiguate and args.infile and args.outfile :
+
+        html_parser = FileParser()
+        tagger = CRFTagger()
+
+        if args.pos :
+            try :
+                tagger.set_model_file(args.disambiguate)
+            except IOError:
+                print "Error : unable to open the model {} !".format(args.disambiguate)
+                exit(1)
+            try :
+                html_parser.read_file(args.infile)
+            except IOError:
+                print "Error : unable to open the input file {} !".format(args.infile)
+                exit(1)
+
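+            # rank the candidate analyses of each ambiguous token by the
+            # CRF marginal probability of their POS tag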
+            for snum, sentence in enumerate(html_parser.glosses) :
+                tokens = [token.token for token in sentence[2]]
+                features = [get_features_customised(tokens, i) for i in range(len(tokens))]
+                tagger._tagger.set(features)
+                for tnum, token in enumerate(sentence[2]) :
+                    options = list()
+                    if token.value and len(token.value) > 2:
+                        for nopt, option in enumerate(token.value[2]) :
+                            try: tag = option.ps[0]
+                            except IndexError : tag = ''
+                            try:
+                                prob = tagger._tagger.marginal(tag, tnum)
+                            except :
+                                prob = 0.0
+                            options.append((prob, option))
+                        reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
+                        if args.select :
+                            prob_max = reordered_probs[0]
+                            reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+
+                        html_parser.glosses[snum][1][tnum] = reordered_options
+
+        elif args.tone and not args.no_coding :
+            try :
+                html_parser.read_file(args.infile)
+            except IOError:
+                print "Error : unable to open the input file {} !".format(args.infile)
+                exit(1)
+            try :
+                if args.disambiguate.endswith('.zip'):
+                    myzip = zipfile.ZipFile(args.disambiguate, 'r')
+                else:
+                    myzip = zipfile.ZipFile(args.disambiguate + '.zip', 'r')
+            except IOError:
+                print "Error : unable to open the model file {} !".format(args.disambiguate + '.zip')
+                exit(1)
+
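+            # the tone model is a zip archive holding one CRF model per
+            # phase (up to 2 * len(mode_indicators) = 4) ; phases skipped
+            # by a partial training are simply absent and are ignored here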
+            num_phases = 2 * len(mode_indicators)
+            taggers = []
+            enc = encoder_tones()
+            for phase in range(num_phases) :
+                taggers.append(CRFTagger())
+                model_basename = ''
+                for m in myzip.namelist() :
+                    if m.endswith(str(phase)):
+                        model_basename = m
+                        break
+                if not model_basename :
+                    continue
+                if args.diacritic_only and (phase == 0 or phase == 1) :
+                    continue
+                if args.non_diacritic_only and (phase == 2 or phase == 3):
+                    continue
+                elif args.no_decomposition and phase % len(mode_indicators) != 0 :
+                    continue
+
+                myzip.extract(model_basename)
+                taggers[phase].set_model_file(model_basename)
+                os.remove(model_basename)
+            myzip.close()
+
+            for snum, sentence in enumerate(html_parser.glosses) :
+                tokens = [enc.differential_encode(token.token, token.token, args.chunkmode)[1] for token in sentence[2]]
+                for phase in range(num_phases) :
+                    features = make_features_from_tokens(tokens, phase, args.tone and not args.no_coding)
+                    if taggers[phase]._model_file :
+                        taggers[phase]._tagger.set(features)
+                for tnum, token in enumerate(sentence[2]) :
+                    options = list()
+                    if token.value and len(token.value) > 2:
+                        for nopt, option in enumerate(token.value[2]) :
+                            try: tag = option.form.encode('utf-8')
+                            except : tag = ''
+                            prob = marginal_tone(taggers, tnum, tokens, tag, token.token, args.chunkmode, sel_en = args.filtering, decomposition_en = not args.no_decomposition)
+                            options.append((prob, option))
+
+                        reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
+                        if args.select :
+                            prob_max = reordered_probs[0]
+                            reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+                        html_parser.glosses[snum][1][tnum] = reordered_options
+
+        elif args.Gloss:
+            try :
+                print u"Loading model : {}".format(args.disambiguate)
+                with open(args.disambiguate, "rb") as F_IN:
+                    d1, d2, d3 = pickle.load(F_IN)
+            except IOError:
+                print u"Error : unable to open the model {} !".format(args.disambiguate)
+                exit(1)
+            try :
+                print u"Reading file : {}".format(args.infile)
+                html_parser.read_file(args.infile)
+            except IOError:
+                print u"Error : unable to open the input file {} !".format(args.infile)
+                exit(1)
+
+            for snum, sentence in enumerate(html_parser.glosses) :
+                sent = list() # one sentence at a time
+                for tnum, token in enumerate(sentence[2]):
+                    if token.type == 'w' or token.type == 'c':
+                        sent.append(token.token)
+
+                # sent_annot = [(token, gloss), (token, gloss), ...]
+                sent_annot = annot_pars(sent, d1, d2, d3)
+
+                cpt = 0
+                for tnum, token in enumerate(sentence[2]):
+                    if token.type == 'w' or token.type == 'c':
+                        options = list()
+                        if token.value and len(token.value) > 2:
+                            for nopt, option in enumerate(token.value[2]) :
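+                                # binary score : 1.0 when the candidate gloss
+                                # matches the n-gram prediction, 0.0 otherwise,
+                                # so sorting puts the predicted gloss first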
+                                if option.gloss == sent_annot[cpt][1]:
+                                    prob = 1.0
+                                else:
+                                    prob = 0.0
+                                options.append((prob, option))
+                            reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
+
+                            if args.select :
+                                prob_max = reordered_probs[0]
+                                reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+
+                            html_parser.glosses[snum][1][tnum] = reordered_options
+                        cpt += 1
+            print u"File disambiguated"
+
+        elif args.gloss:
+            try :
+                print u"Loading model : {}".format(args.disambiguate)
+                tagger.set_model_file(args.disambiguate)
+            except IOError:
+                print u"Error : unable to open the model {} !".format(args.disambiguate)
+                exit(1)
+            try :
+                print u"Reading file : {}".format(args.infile)
+                html_parser.read_file(args.infile)
+            except IOError:
+                print u"Error : unable to open the input file {} !".format(args.infile)
+                exit(1)
+
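+            # same ranking as for POS, but the CRF label here is the gloss string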
+            for snum, sentence in enumerate(html_parser.glosses) :
+                tokens = [token.token for token in sentence[2]]
+                features = [get_features_customised(tokens, i) for i in range(len(tokens))]
+                tagger._tagger.set(features)
+                for tnum, token in enumerate(sentence[2]) :
+                    options = list()
+                    if token.value and len(token.value) > 2:
+                        for nopt, option in enumerate(token.value[2]) :
+                            try:
+                                tag = option.gloss
+                            except :
+                                tag = ''
+                            try:
+                                prob = tagger._tagger.marginal(tag, tnum)
+                            except :
+                                prob = 0.0
+                            options.append((prob, option))
+                        reordered_probs, reordered_options = unzip(sorted(options, key = lambda x : x[0], reverse = True))
+                        if args.select :
+                            prob_max = reordered_probs[0]
+                            reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
+
+                        html_parser.glosses[snum][1][tnum] = reordered_options
+
+        try :
+            html_parser.write(args.outfile)
+            print u"Disambiguated result for {} is saved in {}".format(args.infile, args.outfile)
+        except IOError:
+            print u"Error : unable to create the output file {} !".format(args.outfile)
+
+    else :
+        aparser.print_help()
+
+    exit(0)

 if __name__ == '__main__':
-	main()
+    main()