From 15c668720f8278eb3479abc029ea6a47b68d85a3 Mon Sep 17 00:00:00 2001 From: Jitesh Malipeddi Date: Mon, 7 Oct 2024 17:54:13 +0530 Subject: [PATCH 01/16] updated files to work with python3 --- .gitignore | 5 + accuracy.py | 47 ++-- annotate_wiki_file.py | 43 ++-- calc_statistics.py | 110 ++++----- check_annotated_wiki_file.py | 142 ++++-------- chen_cities_converter.py | 75 +++--- chen_elements_convertor.py | 75 +++--- choi_convertor.py | 65 +++--- choiloader.py | 62 ++--- clean_wiki_dataset.py | 41 ++-- configgenerator.py | 8 +- convert_seperator.py | 34 +-- evaluate.py | 12 +- gpu2cpu.py | 21 +- graphseg_gen.sh | 17 +- graphseg_timer.py | 49 ++-- models/from_presentation.py | 30 +-- models/max_sentence_embedding.py | 34 +-- models/naive.py | 27 +-- models/single_lstm.py | 40 ++-- run.py | 220 +++++++----------- run_web_server.py | 27 ++- seg_comparsion.py | 130 +++++------ test_accuracy.py | 102 ++++---- test_accuracy_choi.py | 129 +++++------ tests.py | 168 ++++++-------- text_manipulation.py | 97 ++++---- times_profiler.py | 33 +-- utils.py | 136 +++-------- wiki_extractor.py | 8 +- wiki_loader.py | 33 +-- wiki_processor.py | 384 ++++++++++++------------------- wiki_utils.py | 33 ++- 33 files changed, 1016 insertions(+), 1421 deletions(-) diff --git a/.gitignore b/.gitignore index 7bbc71c..de6246e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +.DS_Store +config.json +data/ +runs/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/accuracy.py b/accuracy.py index 39758f6..c8581fe 100644 --- a/accuracy.py +++ b/accuracy.py @@ -1,39 +1,37 @@ import segeval as seg import numpy as np - def softmax(x): max_each_row = np.max(x, axis=1, keepdims=True) exps = np.exp(x - max_each_row) sums = np.sum(exps, axis=1, keepdims=True) return exps / sums - class Accuracy: def __init__(self, threshold=0.3): self.pk_to_weight = [] self.windiff_to_weight = [] self.threshold = threshold - def update(self, h, gold, sentences_length = None): + def update(self, h, gold, sentences_length=None): h_boundaries = self.get_seg_boundaries(h, sentences_length) gold_boundaries = self.get_seg_boundaries(gold, sentences_length) pk, count_pk = self.pk(h_boundaries, gold_boundaries) - windiff, count_wd = -1, 400;# self.win_diff(h_boundaries, gold_boundaries) + windiff, count_wd = -1, 400 # Placeholder for windiff calculation if pk != -1: self.pk_to_weight.append((pk, count_pk)) else: - print ('pk error') + print('pk error') if windiff != -1: self.windiff_to_weight.append((windiff, count_wd)) - def get_seg_boundaries(self, classifications, sentences_length = None): + def get_seg_boundaries(self, classifications, sentences_length=None): """ - :param list of tuples, each tuple is a sentence and its class (1 if it the sentence starts a segment, 0 otherwise). - e.g: [(this is, 0), (a segment, 1) , (and another one, 1) - :return: boundaries of segmentation to use for pk method. For given example the function will return (4, 3) + :param classifications: list of tuples, each tuple is a sentence and its class (1 if the sentence starts a segment, 0 otherwise). + :param sentences_length: list of sentence lengths (optional) + :return: boundaries of segmentation for pk method. 
""" curr_seg_length = 0 boundaries = [] @@ -41,7 +39,7 @@ def get_seg_boundaries(self, classifications, sentences_length = None): is_split_point = bool(classifications[i]) add_to_current_segment = 1 if sentences_length is None else sentences_length[i] curr_seg_length += add_to_current_segment - if (is_split_point): + if is_split_point: boundaries.append(curr_seg_length) curr_seg_length = 0 @@ -49,10 +47,10 @@ def get_seg_boundaries(self, classifications, sentences_length = None): def pk(self, h, gold, window_size=-1): """ - :param gold: gold segmentation (item in the list contains the number of words in segment) - :param h: hypothesis segmentation (each item in the list contains the number of words in segment) - :param window_size: optional - :return: accuracy + :param h: hypothesis segmentation + :param gold: gold segmentation + :param window_size: optional window size + :return: pk accuracy """ if window_size != -1: false_seg_count, total_count = seg.pk(h, gold, window_size=window_size, return_parts=True) @@ -60,19 +58,18 @@ def pk(self, h, gold, window_size=-1): false_seg_count, total_count = seg.pk(h, gold, return_parts=True) if total_count == 0: - # TODO: Check when happens false_prob = -1 else: - false_prob = float(false_seg_count) / float(total_count) + false_prob = float(false_seg_count) / total_count return false_prob, total_count def win_diff(self, h, gold, window_size=-1): """ - :param gold: gold segmentation (item in the list contains the number of words in segment) - :param h: hypothesis segmentation (each item in the list contains the number of words in segment) - :param window_size: optional - :return: accuracy + :param h: hypothesis segmentation + :param gold: gold segmentation + :param window_size: optional window size + :return: win_diff accuracy """ if window_size != -1: false_seg_count, total_count = seg.window_diff(h, gold, window_size=window_size, return_parts=True) @@ -82,14 +79,12 @@ def win_diff(self, h, gold, window_size=-1): if total_count == 0: false_prob = -1 else: - false_prob = float(false_seg_count) / float(total_count) + false_prob = float(false_seg_count) / total_count return false_prob, total_count def calc_accuracy(self): - pk = sum([pw[0] * pw[1] for pw in self.pk_to_weight]) / sum([pw[1] for pw in self.pk_to_weight]) if len( - self.pk_to_weight) > 0 else -1.0 - windiff = sum([pw[0] * pw[1] for pw in self.windiff_to_weight]) / sum( - [pw[1] for pw in self.windiff_to_weight]) if len(self.windiff_to_weight) > 0 else -1.0 + pk = sum(pw[0] * pw[1] for pw in self.pk_to_weight) / sum(pw[1] for pw in self.pk_to_weight) if self.pk_to_weight else -1.0 + windiff = sum(pw[0] * pw[1] for pw in self.windiff_to_weight) / sum(pw[1] for pw in self.windiff_to_weight) if self.windiff_to_weight else -1.0 - return pk, windiff + return pk, windiff \ No newline at end of file diff --git a/annotate_wiki_file.py b/annotate_wiki_file.py index e67ee32..9994021 100644 --- a/annotate_wiki_file.py +++ b/annotate_wiki_file.py @@ -1,56 +1,47 @@ from argparse import ArgumentParser from wiki_loader import read_wiki_file import pandas as pd -from pathlib2 import Path +from pathlib import Path # Use pathlib, not pathlib2 import os - def get_files(path): - all_objects = Path(path).glob('**/*') + all_objects = Path(path).rglob('*') # Use rglob for '**/*' pattern files = (str(p) for p in all_objects if p.is_file()) return files def generate_segmentation_template(path, output_path): - writer = pd.ExcelWriter(output_path, engine='xlsxwriter') - sentences, _, _ = read_wiki_file(path, None, 
remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False) - df = pd.DataFrame({ 'Cut here': [0] * len(sentences),'Sentences': sentences}) - df = df[['Cut here','Sentences']] - - df.to_excel(writer, sheet_name='segment') - writer.save() - + with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: # Use context manager for ExcelWriter + sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False) + df = pd.DataFrame({'Cut here': [0] * len(sentences), 'Sentences': sentences}) + df = df[['Cut here', 'Sentences']] + df.to_excel(writer, sheet_name='segment') def generate_test_article(path, output_path): - sentences, _, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False, + sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False, high_granularity=False) article_text = "\n".join(sentences) - with open(output_path, "w") as f: - f.write(article_text.encode('utf-8')) - f.close() + with open(output_path, "w", encoding='utf-8') as f: # Use context manager and specify encoding + f.write(article_text) -def generate_folder(input_folder,output_folder): +def generate_folder(input_folder, output_folder, to_text): counter = 0 input_files = get_files(input_folder) for file in input_files: id = os.path.basename(file) - file_name = id + ".xlsx" if not args.toText else id + file_name = f"{id}.xlsx" if not to_text else id output_file = os.path.join(output_folder, file_name) - if (args.toText): + if to_text: generate_test_article(file, output_file) else: - generate_segmentation_template(file,output_file) + generate_segmentation_template(file, output_file) counter += 1 - print 'generates ' + str(counter) + ' files' - - + print(f'Generated {counter} files') if __name__ == '__main__': - parser = ArgumentParser() parser.add_argument('--path', help='input folder path', default='/home/michael/Downloads/migo/68943', type=str) parser.add_argument('--output_path', help='output folder path', default='blah.xlsx', type=str) - parser.add_argument('--toText', help='output to text files ?', action='store_true') + parser.add_argument('--toText', help='output to text files?', action='store_true') args = parser.parse_args() - generate_folder(args.path,args.output_path) - + generate_folder(args.path, args.output_path, args.toText) \ No newline at end of file diff --git a/calc_statistics.py b/calc_statistics.py index 031e314..af9ca24 100644 --- a/calc_statistics.py +++ b/calc_statistics.py @@ -1,8 +1,5 @@ -from __future__ import division - import torch from torch.utils.data import DataLoader -from torch.autograd import Variable import numpy as np from choiloader import ChoiDataset, collate_fn @@ -11,19 +8,17 @@ from utils import maybe_cuda import utils import sys -from pathlib2 import Path +from pathlib import Path # Use pathlib instead of pathlib2 from wiki_loader import WikipediaDataSet import accuracy logger = utils.setup_logger(__name__, 'train.log') - - def main(args): sys.path.append(str(Path(__file__).parent)) utils.read_config_file(args.config) - utils.config.update(args.__dict__) + utils.config.update(vars(args)) # Update config with args dictionary logger.debug('Running with config %s', utils.config) article_with_problems = 0 @@ -39,94 +34,85 @@ def main(args): min_num_sentences = 1000 max_num_sentences = 0 - dl = 
DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) docs_num_segments_vec = np.zeros(len(dl)) segments_num_sentences_vec = [] - print 'num of docs is ' + str(len(dl)) + print(f'Number of documents: {len(dl)}') with tqdm(desc='Testing', total=len(dl)) as pbar: - for i, (data, targets, paths) in enumerate(dl): - if (len(paths) == 0): + if len(paths) == 0: article_with_problems += 1 docs_num_segments_vec[i] = np.nan continue try: - - if ( ((i % 1000 ) == 0) & i > 0): - print i + if i % 1000 == 0 and i > 0: + print(i) if len(targets) > 0: - targets_var = Variable(maybe_cuda(torch.cat(targets, 0), None), requires_grad=False) - target_seg = targets_var.data.cpu().numpy() + targets_var = maybe_cuda(torch.cat(targets, 0), None) + target_seg = targets_var.cpu().numpy() target_seg = np.concatenate([target_seg, np.array([1])]) else: target_seg = np.ones(1) - num_sentences += (len(target_seg)) - doc_num_of_segment = (sum(target_seg)) - if (doc_num_of_segment < min_num_segment): - min_num_segment = doc_num_of_segment - if (doc_num_of_segment > max_num_segment): - max_num_segment = doc_num_of_segment + + num_sentences += len(target_seg) + doc_num_of_segment = sum(target_seg) + + min_num_segment = min(min_num_segment, doc_num_of_segment) + max_num_segment = max(max_num_segment, doc_num_of_segment) + num_segments += doc_num_of_segment num_documents += 1 docs_num_segments_vec[i] = doc_num_of_segment one_inds = np.where(target_seg == 1)[0] one_inds += 1 - one_inds = np.concatenate((np.zeros(1),one_inds)) - if (len(one_inds) == 1): + one_inds = np.concatenate(([0], one_inds)) + + if len(one_inds) == 1: sentences_in_segments = [len(target_seg)] else: sentences_in_segments = one_inds[1:] - one_inds[:-1] - segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec,sentences_in_segments)) + + segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec, sentences_in_segments)) current_min = np.min(sentences_in_segments) current_max = np.max(sentences_in_segments) - if (current_min < min_num_sentences): - min_num_sentences = current_min - if (current_max > max_num_sentences): - max_num_sentences = current_max - - + + min_num_sentences = min(min_num_sentences, current_min) + max_num_sentences = max(max_num_sentences, current_max) except Exception as e: - logger.info('Exception "%s" in batch %s', e, i) + logger.info(f'Exception "{e}" in batch {i}') logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) raise + print(f'Total sentences: {num_sentences}.') + print(f'Total segments: {num_segments}.') + print(f'Total documents: {num_documents}.') + print(f'Average segment size: {num_sentences / num_segments:.3f}.') + print(f'Min #segments in a document: {min_num_segment}.') + print(f'Max #segments in a document: {max_num_segment}.') + print(f'Min #sentences in a segment: {min_num_sentences}.') + print(f'Max #sentences in a segment: {max_num_sentences}.') + + print('\nNew computing method\n') + print(f'Number of documents: {len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum()}.') + print(f'Total segments: {np.nansum(docs_num_segments_vec)}.') + print(f'Total sentences: {np.sum(segments_num_sentences_vec)}.') + + print(f'Min #segments in a document: {np.nanmin(docs_num_segments_vec)}.') + print(f'Max #segments in a document: {np.nanmax(docs_num_segments_vec)}.') + print(f'Mean segments in a document: {np.nanmean(docs_num_segments_vec):.3f}.') + print(f'Standard deviation of segments in a document: 
{np.nanstd(docs_num_segments_vec):.3f}.') + print(f'\nMin #sentences in a segment: {np.min(segments_num_sentences_vec)}.') + print(f'Max #sentences in a segment: {np.max(segments_num_sentences_vec)}.') + print(f'Average segment size: {np.mean(segments_num_sentences_vec):.3f}.') + print(f'Standard deviation of segment size: {np.std(segments_num_sentences_vec):.3f}.') - print 'total sentences: {}.'.format(num_sentences) - print 'total segments: {}.'.format(num_segments) - print 'total documents: {}.'.format(num_documents) - print 'average segment size is: {:.3}.'.format(np.true_divide(num_sentences,num_segments)) - print 'min #segment in document: {}.'.format(min_num_segment) - print 'max #segment in document: {}.'.format(max_num_segment) - print 'min #sentence in segment: {}.'.format(min_num_sentences) - print 'max #sentence in segment: {}.'.format(max_num_sentences) - - - print '' - print 'new computing method' - print '' - print 'num of documents: {}.'.format(len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum()) - print 'total segments: {}.'.format(np.nansum(docs_num_segments_vec)) - print 'total sentences: {}.'.format(np.sum(segments_num_sentences_vec)) - print '' - print 'min #segment in document: {}.'.format(np.nanmin(docs_num_segments_vec)) - print 'max #segment in document: {}.'.format(np.nanmax(docs_num_segments_vec)) - print 'mean segments in document: {:.3}.'.format(np.nanmean(docs_num_segments_vec)) - print 'std segments in document: {:.3}.'.format(np.nanstd(docs_num_segments_vec)) - print '' - print 'min #sentence in segment: {}.'.format(np.min(segments_num_sentences_vec)) - print 'max #sentence in segment: {}.'.format(np.max(segments_num_sentences_vec)) - print 'average segment size is: {:.3}.'.format(np.mean(segments_num_sentences_vec)) - print 'std segment size is: {:.3}.'.format(np.std(segments_num_sentences_vec)) - - print '' - print 'article with problems {}'.format(article_with_problems) + print(f'\nArticles with problems: {article_with_problems}') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--config', help='Path to config.json', default='config.json') - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/check_annotated_wiki_file.py b/check_annotated_wiki_file.py index 7d1b00d..a67f168 100644 --- a/check_annotated_wiki_file.py +++ b/check_annotated_wiki_file.py @@ -6,165 +6,123 @@ import os from glob import glob - -graphseg_delimeter = "==========" - +graphseg_delimiter = "==========" def generate_segmentation_template(path, output_path): - writer = pd.ExcelWriter(output_path, engine='xlsxwriter') - sentences, _, _ = read_wiki_file(path, None, False) - - sentences = [' '.join(s) + '.' for s in sentences] - df = pd.DataFrame({'Sentences': sentences, 'Cut here': [0] * len(sentences)}) - df = df[['Sentences', 'Cut here']] - - df.to_excel(writer, sheet_name='segment') - writer.save() - + with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: # Use context manager + sentences, _, _ = read_wiki_file(path, None, False) + sentences = [' '.join(s) + '.' 
for s in sentences] + df = pd.DataFrame({'Sentences': sentences, 'Cut here': [0] * len(sentences)}) + df = df[['Sentences', 'Cut here']] + df.to_excel(writer, sheet_name='segment') def target_place_to_list(targets): - list_of_targets = [] - for i in range(targets[-1] + 1): - if i in targets: - list_of_targets.append(1) - else: - list_of_targets.append(0) - - list_of_targets[-1] = 1 + list_of_targets = [1 if i in targets else 0 for i in range(targets[-1] + 1)] + list_of_targets[-1] = 1 # Ensure the last sentence is marked as the end return list_of_targets - def get_graphseg_segments(file_path): - file = open(str(file_path), "r") - raw_content = file.read() - file.close() - sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] - sentences_length = [] + with open(str(file_path), "r", encoding='utf-8') as file: + raw_content = file.read() + + sentences = [s for s in raw_content.strip().split("\n") if s and s != "\n"] h = [] - t = [] for sentence in sentences: - if sentence == graphseg_delimeter: - if len(h) > 0: + if sentence == graphseg_delimiter: + if h: h[-1] = 1 else: h.append(0) - #words = extract_sentence_words(sentence) - #sentences_length.append(len(words)) - #t.append(0) - #h.append(0) - - #t[-1] = 1 # end of last segment - h[-1] = 1 # they already segment it correctly. + h[-1] = 1 # Correct segmentation for the last sentence return h - def get_xlsx_segments(xlsx_path): df = pd.read_excel(xlsx_path) outputs = df['Cut here'].values - outputs[-1] = 1 + outputs[-1] = 1 # Ensure the last sentence is marked as the end return outputs - def get_gold_segments(path): - sentences, targets, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False,high_granularity=False) - + sentences, targets, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False, high_granularity=False) return target_place_to_list(targets) - def get_sub_folders_for_graphseg(folder): - d = folder - folders = [os.path.join(d, o) for o in os.listdir(d) if os.path.isdir(os.path.join(d, o))] - print folders - return folders - - -def analyszie_folder(wiki_folder,xlsx_folder,isGraphseg, use_xlsx_sub_folders = False): + sub_folders = [os.path.join(folder, o) for o in os.listdir(folder) if os.path.isdir(os.path.join(folder, o))] + print(sub_folders) + return sub_folders +def analyze_folder(wiki_folder, xlsx_folder, is_graphseg, use_xlsx_sub_folders=False): acc = accuracy.Accuracy() input_files = get_files(wiki_folder) if use_xlsx_sub_folders: - annotated_files_folders= [] - for f in os.listdir(xlsx_folder): - sub_folder_path = xlsx_folder + f - if os.path.isdir(sub_folder_path): - annotated_files_folders.append(sub_folder_path) + annotated_files_folders = [os.path.join(xlsx_folder, f) for f in os.listdir(xlsx_folder) if os.path.isdir(os.path.join(xlsx_folder, f))] else: annotated_files_folders = [xlsx_folder] - - - for file in input_files: id = os.path.basename(file) - file_name = id + ".xlsx" if not isGraphseg else id - xlsx_file_paths = [os.path.join(xlsx_folder,file_name) for xlsx_folder in annotated_files_folders] - print str(xlsx_file_paths) - print str(file) + file_name = f"{id}.xlsx" if not is_graphseg else id + xlsx_file_paths = [os.path.join(folder, file_name) for folder in annotated_files_folders] + print(xlsx_file_paths) + print(file) for xlsx_file_path in xlsx_file_paths: if os.path.isfile(xlsx_file_path): - if (isGraphseg): 
+ if is_graphseg: tested_segments = get_graphseg_segments(xlsx_file_path) else: - tested_segments = get_xlsx_segments(xlsx_file_path ) + tested_segments = get_xlsx_segments(xlsx_file_path) else: tested_segments = None gold_segments = get_gold_segments(file) - if (tested_segments is not None) and (len(tested_segments) != len(gold_segments)): - print "(len(tested_segments) != len(gold_segments))" - print "stop run" - return 1000,1000 - if tested_segments is not None : - acc.update(tested_segments,gold_segments) + if tested_segments is not None and len(tested_segments) != len(gold_segments): + print("(len(tested_segments) != len(gold_segments))") + print("Stopping run") + return 1000, 1000 + if tested_segments is not None: + acc.update(tested_segments, gold_segments) - #Print results: + # Print results calculated_pk, calculated_windiff = acc.calc_accuracy() print('Finished testing.') - print ('Pk: {:.4}.'.format(calculated_pk)) - print ('') - - return calculated_pk,calculated_windiff - + print(f'Pk: {calculated_pk:.4f}.') + print() -def result_to_file(pk_list,wd_list,path_list,result_file_path): - writer = pd.ExcelWriter(result_file_path, engine='xlsxwriter') - - df = pd.DataFrame({ 'pk': pk_list,'wd': wd_list,'folders': path_list}) - df = df[['pk','wd','folders']] - - df.to_excel(writer, sheet_name='annotated_result') - writer.save() + return calculated_pk, calculated_windiff +def result_to_file(pk_list, wd_list, path_list, result_file_path): + with pd.ExcelWriter(result_file_path, engine='xlsxwriter') as writer: # Use context manager + df = pd.DataFrame({'pk': pk_list, 'wd': wd_list, 'folders': path_list}) + df.to_excel(writer, sheet_name='annotated_result') if __name__ == '__main__': - parser = ArgumentParser() parser.add_argument('--path', help='wiki folder, truth', type=str) - parser.add_argument('--xlsx_path', help='folder with xlsx files', type=str) - parser.add_argument('--graphseg', help='to calc graphseg pk', action='store_true') + parser.add_argument('--xlsx_path', help='folder with xlsx files', type=str) + parser.add_argument('--graphseg', help='to calculate graphseg pk', action='store_true') args = parser.parse_args() pk_list = [] wd_list = [] path_list = [] - if (args.graphseg): + if args.graphseg: graphseg_folders = get_sub_folders_for_graphseg(args.xlsx_path) for folder in graphseg_folders: - pk,wd = analyszie_folder(args.path,folder,args.graphseg) + pk, wd = analyze_folder(args.path, folder, args.graphseg) pk_list.append(pk) wd_list.append(wd) path_list.append(folder) else: - pk, wd = analyszie_folder(args.path, args.xlsx_path, args.graphseg, use_xlsx_sub_folders=True) + pk, wd = analyze_folder(args.path, args.xlsx_path, args.graphseg, use_xlsx_sub_folders=True) pk_list.append(pk) wd_list.append(wd) path_list.append(args.xlsx_path) - #writing result to file - result_to_file(pk_list,wd_list,path_list,os.path.join(args.xlsx_path,"result_pk.xlsx") ) + # Write result to file + result_to_file(pk_list, wd_list, path_list, os.path.join(args.xlsx_path, "result_pk.xlsx")) \ No newline at end of file diff --git a/chen_cities_converter.py b/chen_cities_converter.py index 3f1fbc2..1a13670 100644 --- a/chen_cities_converter.py +++ b/chen_cities_converter.py @@ -1,81 +1,72 @@ import utils -from pathlib2 import Path +from pathlib import Path from argparse import ArgumentParser import os import wiki_utils - - - - def main(args): utils.read_config_file(args.config) - utils.config.update(args.__dict__) - + utils.config.update(vars(args)) # Update config with args as a dictionary file_path = 
args.input output_folder_path = args.output special_delim_sign_path = args.sign - file = open(str(special_delim_sign_path), "r") - special_delim_sign = file.read().encode('utf-8').split("\n")[0] - file.close() - - file = open(str(file_path ), "r") - raw_content = file.read() - file.close() - - result_file_path = None + # Open and read the special delimiter sign file + with open(special_delim_sign_path, "r", encoding='utf-8') as file: + special_delim_sign = file.read().split("\n")[0] + # Open and read the input file + with open(file_path, "r", encoding='utf-8') as file: + raw_content = file.read() - sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] + sentences = [s for s in raw_content.strip().split("\n") if s] last_doc_id = 0 last_topic = "" + result_file_path = None for sentence in sentences: - first_comma_index = sentence.index(',') - second_comma_index = sentence[first_comma_index + 1 :].index(',') - current_doc_id = sentence[0:first_comma_index] + second_comma_index = sentence[first_comma_index + 1:].index(',') + first_comma_index + 1 + current_doc_id = sentence[:first_comma_index] sign_index = sentence.index(special_delim_sign) - start_sentence_index = sign_index + 1 + start_sentence_index = sign_index + 1 actual_sentence = sentence[start_sentence_index:] - current_topic = sentence[first_comma_index + second_comma_index + 2:sign_index] - + current_topic = sentence[second_comma_index + 1:sign_index] - if (current_doc_id != last_doc_id): + # Handle new document id and create new file for it + if current_doc_id != last_doc_id: last_doc_id = current_doc_id - print 'new file index' - print last_doc_id - if (result_file_path != None): + print('New file index:', last_doc_id) + if result_file_path: result_file.close() - result_file_path = os.path.join(output_folder_path ,str(current_doc_id) + ".text") - result_file = open(str(result_file_path), "w") + result_file_path = os.path.join(output_folder_path, f"{current_doc_id}.text") + result_file = open(result_file_path, "w", encoding='utf-8') last_topic = "" - - - if (current_topic != last_topic): + # Write new topic to file if changed + if current_topic != last_topic: last_topic = current_topic - level = 1 if (current_topic == "TOP-LEVEL SEGMENT") else 2 - result_file.write((wiki_utils.get_segment_seperator(level, current_topic) + ".").encode('utf-8')) - result_file.write("\n".encode('utf-8')) + level = 1 if current_topic == "TOP-LEVEL SEGMENT" else 2 + result_file.write(wiki_utils.get_segment_seperator(level, current_topic) + ".\n") + + if '\n' in sentence: + print('Backslash in sentence') - if ('\n' in sentence): - print 'back slash in sentnece' - result_file.write(actual_sentence.encode('utf-8')) - #result_file.write(".".encode('utf-8')) - result_file.write("\n".encode('utf-8')) + # Write actual sentence to file + result_file.write(actual_sentence + "\n") + if result_file_path: + result_file.close() if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--config', help='Path to config.json', default='config.json') parser.add_argument('--input', help='Chen text file', required=True) - parser.add_argument('--output', help='folder for converted files', required=True) - parser.add_argument('--sign', help='folder for converted files', required=True) + parser.add_argument('--output', help='Folder for converted files', required=True) + parser.add_argument('--sign', help='File containing special delimiter sign', required=True) main(parser.parse_args()) \ No newline at end of file 
diff --git a/chen_elements_convertor.py b/chen_elements_convertor.py index 74c37a9..c70c79f 100644 --- a/chen_elements_convertor.py +++ b/chen_elements_convertor.py @@ -3,76 +3,71 @@ import os import wiki_utils - def main(args): utils.read_config_file(args.config) - utils.config.update(args.__dict__) + utils.config.update(vars(args)) # Use vars(args) for dictionary-like access file_path = args.input segments_path = args.segment output_folder_path = args.output + # Read the segments content file + with open(segments_path, "r", encoding='utf-8') as file: + segments_content = file.read() - file = open(str(segments_path), "r") - segments_content = file.read() - file.close() - - file = open(str(file_path ), "r") - raw_content = file.read() - file.close() + # Read the input file + with open(file_path, "r", encoding='utf-8') as file: + raw_content = file.read() - sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] - segments = [s for s in segments_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] + sentences = [s for s in raw_content.strip().split("\n") if s] + segments = [s for s in segments_content.strip().split("\n") if s] - result_file_path = None + if len(sentences) != len(segments): + print("len(sentences) != len(segments)") + return last_doc_id = 0 last_topic = "" + result_file_path = None - if (len(sentences) != len(segments)): - print "len(sentences) != len(segments)" - return - - for i in range(len(sentences)) : - + for i in range(len(sentences)): sentence = sentences[i] - segment = segments[i].encode('utf-8').split("\r")[0] + segment = segments[i].split("\r")[0] first_comma_index = segment.index(',') - second_comma_index = segment[first_comma_index + 1 :].index(',') - current_doc_id = segment[0:first_comma_index] - current_topic = segment[first_comma_index + second_comma_index + 2:] + second_comma_index = segment[first_comma_index + 1:].index(',') + first_comma_index + 1 + current_doc_id = segment[:first_comma_index] + current_topic = segment[second_comma_index + 1:] - if (current_doc_id != last_doc_id): + # Handle new document id and create a new file for it + if current_doc_id != last_doc_id: last_doc_id = current_doc_id - print 'new file index' - print last_doc_id - if (result_file_path != None): + print('New file index:', last_doc_id) + if result_file_path: result_file.close() - result_file_path = os.path.join(output_folder_path ,str(current_doc_id) + ".text") - result_file = open(str(result_file_path), "w") + + result_file_path = os.path.join(output_folder_path, f"{current_doc_id}.text") + result_file = open(result_file_path, "w", encoding='utf-8') last_topic = "" - if (current_topic != last_topic): + # Write new topic to the file if changed + if current_topic != last_topic: last_topic = current_topic - level = 1 if (current_topic == "TOP-LEVEL SEGMENT") else 2 - result_file.write((wiki_utils.get_segment_seperator(level ,current_topic) +".").encode('utf-8')) - result_file.write("\n".encode('utf-8')) + level = 1 if current_topic == "TOP-LEVEL SEGMENT" else 2 + result_file.write(wiki_utils.get_segment_seperator(level, current_topic) + ".\n") - actual_sentence = sentence - result_file.write(actual_sentence.encode('utf-8')) - if ('\n' in sentence): - print 'back slash in sentnece' - #result_file.write(".".encode('utf-8')) - result_file.write("\n".encode('utf-8')) + # Write the actual sentence to the file + result_file.write(sentence + "\n") + if result_file_path: + result_file.close() if __name__ == '__main__': 
parser = ArgumentParser() parser.add_argument('--config', help='Path to config.json', default='config.json') parser.add_argument('--input', help='Chen text file', required=True) - parser.add_argument('--segment', help='regina segmentation file', required=True) - parser.add_argument('--output', help='folder for converted files', required=True) + parser.add_argument('--segment', help='Regina segmentation file', required=True) + parser.add_argument('--output', help='Folder for converted files', required=True) main(parser.parse_args()) \ No newline at end of file diff --git a/choi_convertor.py b/choi_convertor.py index f6a7f8c..e7c2103 100644 --- a/choi_convertor.py +++ b/choi_convertor.py @@ -1,63 +1,52 @@ import os -from pathlib2 import Path +from pathlib import Path from argparse import ArgumentParser -from shutil import move +from shutil import move - - -def removeEmptyFolders(path, removeRoot=True): +def remove_empty_folders(path, remove_root=True): if not os.path.isdir(path): return - # remove empty subfolders - files = os.listdir(path) - for f in files: + # Remove empty subfolders + for f in os.listdir(path): fullpath = os.path.join(path, f) if os.path.isdir(fullpath): - removeEmptyFolders(fullpath) + remove_empty_folders(fullpath) - # if folder empty, delete it - files = os.listdir(path) - if len(files) == 0 and removeRoot: - #print "Removing empty folder:", path + # If folder is empty, delete it + if not os.listdir(path) and remove_root: os.rmdir(path) - - -def convert_choi_to_bySegLength(path): - folders = [o for o in os.listdir(path) if os.path.isdir(os.path.join(path, o))] +def convert_choi_to_by_seg_length(path): + folders = [o for o in os.listdir(path) if os.path.isdir(os.path.join(path, o))] for folder in folders: full_folder_path = os.path.join(path, folder) - seg_folders = [o for o in os.listdir(full_folder_path ) if os.path.isdir(os.path.join(full_folder_path , o))] + seg_folders = [o for o in os.listdir(full_folder_path) if os.path.isdir(os.path.join(full_folder_path, o))] + for seg_folder in seg_folders: - full_seg_folder_path = os.path.join(full_folder_path ,seg_folder ) - convertedPathList = full_seg_folder_path.split(os.sep) + full_seg_folder_path = os.path.join(full_folder_path, seg_folder) + converted_path_list = full_seg_folder_path.split(os.sep) + converted_path = os.path.sep.join(converted_path_list[:-2] + [converted_path_list[-1], converted_path_list[-2]]) + if not os.path.exists(converted_path): + os.makedirs(converted_path) - convertedPath = os.path.sep.join(convertedPathList[:-2] + [convertedPathList[-1]] + [convertedPathList[-2]]) - if not os.path.exists(convertedPath): - os.makedirs(convertedPath) - all_objects = Path(full_seg_folder_path).glob('**/*') + all_objects = Path(full_seg_folder_path).rglob('*') # Use rglob for recursive search files = (str(p) for p in all_objects if p.is_file()) - for file in files: - target = os.path.join(convertedPath ,os.path.basename(file) ) - move(file,target) - print "Removing empty folder: ", full_seg_folder_path - removeEmptyFolders(full_seg_folder_path) - - - -def main (args): + for file in files: + target = os.path.join(converted_path, os.path.basename(file)) + move(file, target) - convert_choi_to_bySegLength(args.input) + print(f"Removing empty folder: {full_seg_folder_path}") + remove_empty_folders(full_seg_folder_path) - print ('done') +def main(args): + convert_choi_to_by_seg_length(args.input) + print('done') if __name__ == '__main__': - parser = ArgumentParser() parser.add_argument('--input', help='Path to choi 
dataset', required=True) - main(parser.parse_args()) - + main(parser.parse_args()) \ No newline at end of file diff --git a/choiloader.py b/choiloader.py index dfd1904..2085fdc 100644 --- a/choiloader.py +++ b/choiloader.py @@ -1,6 +1,3 @@ -from __future__ import print_function -from pathlib2 import Path - import torch from torch.utils.data import Dataset import numpy as np @@ -8,13 +5,12 @@ from text_manipulation import split_sentences, word_model, extract_sentence_words import utils import math - +from pathlib import Path # Use pathlib, which is built-in with Python 3 logger = utils.setup_logger(__name__, 'train.log') - def get_choi_files(path): - all_objects = Path(path).glob('**/*.ref') + all_objects = Path(path).rglob('*.ref') # Use rglob for recursive file search files = [str(p) for p in all_objects if p.is_file()] return files @@ -24,7 +20,7 @@ def collate_fn(batch): paths = [] window_size = 1 - before_sentence_count = int(math.ceil(float(window_size - 1) /2)) + before_sentence_count = int(math.ceil((window_size - 1) / 2.0)) # Python 3 division after_sentence_count = window_size - before_sentence_count - 1 for data, targets, path in batch: @@ -32,8 +28,8 @@ def collate_fn(batch): max_index = len(data) tensored_data = [] for curr_sentence_index in range(0, len(data)): - from_index = max([0, curr_sentence_index - before_sentence_count]) - to_index = min([curr_sentence_index + after_sentence_count + 1, max_index]) + from_index = max(0, curr_sentence_index - before_sentence_count) + to_index = min(curr_sentence_index + after_sentence_count + 1, max_index) sentences_window = [word for sentence in data[from_index:to_index] for word in sentence] tensored_data.append(torch.FloatTensor(np.concatenate(sentences_window))) tensored_targets = torch.zeros(len(data)).long() @@ -43,77 +39,69 @@ def collate_fn(batch): batched_targets.append(tensored_targets) paths.append(path) except Exception as e: - logger.info('Exception "%s" in file: "%s"', e, path) + logger.info(f'Exception "{e}" in file: "{path}"') logger.debug('Exception!', exc_info=True) continue return batched_data, batched_targets, paths def clean_paragraph(paragraph): - cleaned_paragraph= paragraph.replace("'' ", " ").replace(" 's", "'s").replace("``", "").strip('\n') + cleaned_paragraph = paragraph.replace("'' ", " ").replace(" 's", "'s").replace("``", "").strip('\n') return cleaned_paragraph -def read_choi_file(path, word2vec, train, return_w2v_tensors = True,manifesto=False): - seperator = '========' if manifesto else '==========' - with Path(path).open('r') as f: +def read_choi_file(path, word2vec, train, return_w2v_tensors=True, manifesto=False): + separator = '========' if manifesto else '==========' + with open(path, 'r', encoding='utf-8') as f: raw_text = f.read() - paragraphs = [clean_paragraph(p) for p in raw_text.strip().split(seperator) - if len(p) > 5 and p != "\n"] + + paragraphs = [clean_paragraph(p) for p in raw_text.strip().split(separator) if len(p) > 5 and p != "\n"] if train: random.shuffle(paragraphs) targets = [] new_text = [] - lastparagraphsentenceidx = 0 + last_paragraph_sentence_idx = 0 for paragraph in paragraphs: - if manifesto: - sentences = split_sentences(paragraph,0) - else: - sentences = [s for s in paragraph.split('\n') if len(s.split()) > 0] - + sentences = split_sentences(paragraph, 0) if manifesto else [s for s in paragraph.split('\n') if s.split()] if sentences: - sentences_count =0 - # This is the number of sentences in the paragraph and where we need to split. 
+ sentence_count = 0 for sentence in sentences: words = extract_sentence_words(sentence) - if (len(words) == 0): + if len(words) == 0: continue - sentences_count +=1 + sentence_count += 1 if return_w2v_tensors: new_text.append([word_model(w, word2vec) for w in words]) else: new_text.append(words) - lastparagraphsentenceidx += sentences_count - targets.append(lastparagraphsentenceidx - 1) + last_paragraph_sentence_idx += sentence_count + targets.append(last_paragraph_sentence_idx - 1) return new_text, targets, path - -# Returns a list of batch_size that contains a list of sentences, where each word is encoded using word2vec. class ChoiDataset(Dataset): - def __init__(self, root, word2vec, train=False, folder=False,manifesto=False, folders_paths = None): + def __init__(self, root, word2vec, train=False, folder=False, manifesto=False, folders_paths=None): self.manifesto = manifesto if folders_paths is not None: self.textfiles = [] for f in folders_paths: self.textfiles.extend(list(f.glob('*.ref'))) - elif (folder): + elif folder: self.textfiles = get_choi_files(root) else: - self.textfiles = list(Path(root).glob('**/*.ref')) + self.textfiles = list(Path(root).rglob('*.ref')) if len(self.textfiles) == 0: - raise RuntimeError('Found 0 images in subfolders of: {}'.format(root)) + raise RuntimeError(f'Found 0 files in subfolders of: {root}') self.train = train self.root = root self.word2vec = word2vec def __getitem__(self, index): path = self.textfiles[index] - - return read_choi_file(path, self.word2vec, self.train,manifesto=self.manifesto) + return read_choi_file(path, self.word2vec, self.train, manifesto=self.manifesto) def __len__(self): - return len(self.textfiles) + return len(self.textfiles) \ No newline at end of file diff --git a/clean_wiki_dataset.py b/clean_wiki_dataset.py index a2eaf12..29f4718 100644 --- a/clean_wiki_dataset.py +++ b/clean_wiki_dataset.py @@ -1,45 +1,44 @@ -from pathlib2 import Path +from pathlib import Path import wiki_processor from argparse import ArgumentParser def remove_malicious_files(dataset_path): + # Read the malicious file IDs from the file with open('malicious_wiki_files', 'r') as f: malicious_file_ids = f.read().splitlines() - test_path = Path(dataset_path).joinpath(Path('test')) - train_path = Path(dataset_path).joinpath(Path('train')) - dev_path = Path(dataset_path).joinpath(Path('dev')) + # Define paths for test, train, and dev datasets + test_path = Path(dataset_path).joinpath('test') + train_path = Path(dataset_path).joinpath('train') + dev_path = Path(dataset_path).joinpath('dev') deleted_file_count = 0 - for id in malicious_file_ids: - file_path_suffix = Path(wiki_processor.get_file_path(id)).joinpath(id) + # Iterate over the malicious file IDs and delete the corresponding files + for file_id in malicious_file_ids: + file_path_suffix = Path(wiki_processor.get_file_path(file_id)).joinpath(file_id) + if test_path.joinpath(file_path_suffix).exists(): - test_path.joinpath(file_path_suffix).remove() + test_path.joinpath(file_path_suffix).unlink() # Use .unlink() to delete a file deleted_file_count += 1 elif train_path.joinpath(file_path_suffix).exists(): - train_path.joinpath(file_path_suffix).remove() + train_path.joinpath(file_path_suffix).unlink() deleted_file_count += 1 elif dev_path.joinpath(file_path_suffix).exists(): - dev_path.joinpath(file_path_suffix).remove() - deleted_file_count +=1 + dev_path.joinpath(file_path_suffix).unlink() + deleted_file_count += 1 else: - raise Exception('meliciious file is not included in dataset: ' + str(id)) - - 
print ('Deleted ' + str (deleted_file_count) + ' files. Malicious file count: ' + str(len(malicious_file_ids))) + raise Exception(f'Malicious file is not included in the dataset: {file_id}') -def main(arg): - remove_malicious_files(arg.path) + print(f'Deleted {deleted_file_count} files. Malicious file count: {len(malicious_file_ids)}') +def main(args): + remove_malicious_files(args.path) if __name__ == '__main__': parser = ArgumentParser() - parser.add_argument('--path', help='Path to dataset') - - main(parser.parse_args()) - - - + parser.add_argument('--path', help='Path to dataset', required=True) + main(parser.parse_args()) \ No newline at end of file diff --git a/configgenerator.py b/configgenerator.py index b4ce994..ae1c95d 100644 --- a/configgenerator.py +++ b/configgenerator.py @@ -1,10 +1,12 @@ import json +# Define the configuration data jsondata = { - "word2vecfile": "/home/omri/datasets/word2vec/GoogleNews-vectors-negative300.bin", + "word2vecfile": "/Users/jitesh/Downloads/text-segmentation/data/word2vec/GoogleNews-vectors-negative300.bin", "choidataset": "/home/omri/code/text-segmentation-2017/data/choi", - "wikidataset": "/home/omri/datasets/wikipedia/process_dump_r", + "wikidataset": "/home/omri/datasets/wikipedia/process_dump_r" } +# Write the data to config.json with open('config.json', 'w') as f: - json.dump(jsondata, f) + json.dump(jsondata, f, indent=4) # Added indent for better readability \ No newline at end of file diff --git a/convert_seperator.py b/convert_seperator.py index cb11cc0..a6609f8 100644 --- a/convert_seperator.py +++ b/convert_seperator.py @@ -1,27 +1,31 @@ -from pathlib2 import Path +from pathlib import Path import os -#root ='/home/adir/Projects/text-segmentation-2017/data/choi/' -root = '/home/adir/Projects/text-segmentation-2017/data/choi/1/3-5' -output ='/home/adir/Projects/text-segmentation-2017/data/part_choi/' -delimeter = '==========' +# Define paths and settings +root = '/home/adir/Projects/text-segmentation-2017/data/choi/1/3-5' +output = '/home/adir/Projects/text-segmentation-2017/data/part_choi/' +delimiter = '==========' truth = '********************************************' -textfiles = list(Path(root).glob('**/*.ref')) - +# Get all .ref files recursively from the root directory +textfiles = list(Path(root).rglob('*.ref')) counter = 0 +# Iterate over all text files for file in textfiles: counter += 1 - with file.open('r') as f: + with file.open('r', encoding='utf-8') as f: raw_text = f.read() - new_text = raw_text.replace('==========',truth) - f.close() - new_file_path = os.path.join(output,str(counter) + "_" + os.path.basename(str(file))) - with open(new_file_path, "w") as f: - f.write(new_text) - f.close() -print 'done' + # Replace the old delimiter with the new "truth" separator + new_text = raw_text.replace(delimiter, truth) + + # Create a new file path for the modified content + new_file_path = os.path.join(output, f"{counter}_{file.name}") + + # Write the new content to the new file + with open(new_file_path, "w", encoding='utf-8') as f: + f.write(new_text) +print('done') \ No newline at end of file diff --git a/evaluate.py b/evaluate.py index 8d9693e..ea515a0 100644 --- a/evaluate.py +++ b/evaluate.py @@ -1,6 +1,5 @@ import torch import numpy as np -from torch.autograd import Variable from choiloader import word_model import utils import text_manipulation @@ -9,30 +8,27 @@ def load_model(model_path=None, is_cuda=None): if model_path is None: model_path = utils.config['model'] - with open(model_path, 'r') as f: + # Open and load the 
model
+    with open(model_path, 'rb') as f:  # Use 'rb' for reading binary models
         model = torch.load(f)
     model.eval()
+
     if is_cuda is None:
         is_cuda = utils.config['cuda']
 
     return utils.maybe_cuda(model, is_cuda)
 
-
 def prepare_tensor(sentences):
     tensored_data = []
     for sentence in sentences:
         if len(sentence) > 0:
             tensored_data.append(utils.maybe_cuda(torch.FloatTensor(np.concatenate(sentence))))
-
     return tensored_data
-
-
 def text_to_word2vec(sentences, word2vec):
     new_text = []
     for sentence in sentences:
         words = text_manipulation.extract_sentence_words(sentence)
         new_text.append([word_model(w, word2vec) for w in words])
-
-    return new_text
+    return new_text
\ No newline at end of file
diff --git a/gpu2cpu.py b/gpu2cpu.py
index 2156bb9..51da4a5 100644
--- a/gpu2cpu.py
+++ b/gpu2cpu.py
@@ -1,29 +1,30 @@
 import torch
 from argparse import ArgumentParser
-from pathlib2 import Path
-
+from pathlib import Path
 
 def main(args):
     input_path = Path(args.input)
+
+    # Load the model from the input file (in binary mode)
     with input_path.open('rb') as f:
-        model = torch.load(f)
+        model = torch.load(f, map_location=torch.device('cpu'))  # Ensure loading to CPU
 
-    model = model.cpu()
+    model = model.cpu()  # Ensure the model is on CPU
 
-    if args.output is not None:
+    # Determine the output path
+    if args.output:
         output_path = Path(args.output)
     else:
         output_path = input_path.parent / (input_path.stem + '_cpu' + input_path.suffix)
 
+    # Save the CPU model to the output file
     with output_path.open('wb') as f:
         torch.save(model, f)
 
-
-
 if __name__ == '__main__':
     parser = ArgumentParser()
-    parser.add_argument('-i', '--input', help='Path to original .t7 file')
-    parser.add_argument('-o', '--output', help='Output path')
+    parser.add_argument('-i', '--input', help='Path to original model file', required=True)
+    parser.add_argument('-o', '--output', help='Output path for the CPU model')
     args = parser.parse_args()
 
-    main(args)
+    main(args)
\ No newline at end of file
diff --git a/graphseg_gen.sh b/graphseg_gen.sh
index d206fd9..d6bff99 100644
--- a/graphseg_gen.sh
+++ b/graphseg_gen.sh
@@ -1,5 +1,18 @@
 #!/bin/bash
-for i in 0.2 0.25 0.3 0.35 0.4 0.45 0.5
+
+# Check if the minimum segment size is provided
+if [ -z "$1" ]; then
+  echo "Usage: $0 <min_segment>"
+  exit 1
+fi
+
+# Define the range of threshold values
+for i in 0.2 0.25 0.3 0.35 0.4 0.45 0.5
 do
-  python graphseg_timer.py --input ~/Downloads/wiki_dev_100_np_seperators --output ~/Downloads/wiki_dev_100_np_seperators_output --jar graphseg.jar --threshold $i --min_segment $1
+  # Run the Python script with the corresponding threshold and minimum segment size
+  python graphseg_timer.py --input ~/Downloads/wiki_dev_100_np_seperators \
+    --output ~/Downloads/wiki_dev_100_np_seperators_output \
+    --jar graphseg.jar \
+    --threshold $i \
+    --min_segment $1
 done
\ No newline at end of file
diff --git a/graphseg_timer.py b/graphseg_timer.py
index c5cefce..4a2debd 100644
--- a/graphseg_timer.py
+++ b/graphseg_timer.py
@@ -4,51 +4,44 @@
 import utils
 from argparse import ArgumentParser
 
-
 def main(input, output, jar_path, threshold, min_segment):
-    # java -jar graphseg.jar /home/seg-input /home/seg-output 0.25 3
-
-
-    # for min_segment in range(1, 11):
-    #     for tresh in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
-    #                   0.95]:
-
-    output_folder = os.path.join(output,
-                                 'graphseg_output_{}_{}'.format(min_segment, threshold))
+    # Create an output folder based on the threshold and min_segment
+    output_folder = os.path.join(output, 
f'graphseg_output_{min_segment}_{threshold}') + # Ensure the output folder exists if not os.path.exists(output_folder): os.makedirs(output_folder) + + # Setup logger logger = utils.setup_logger(__name__, os.path.join(output_folder, 'graphseg_timer.log'), delete_old=True) - beginning_comd = ['java', '-jar', jar_path, input] - params = [str(threshold), str(min_segment)] - cmd = beginning_comd + [output_folder] + params - print cmd + # Prepare the command + cmd = ['java', '-jar', jar_path, input, output_folder, str(threshold), str(min_segment)] + print(cmd) + + # Measure execution time start = timer() - # os.system(cmd) - subprocess.call(cmd) + subprocess.call(cmd) # Use subprocess to execute the command end = timer() - print 'it tooks seconds:' - print end - start - logger.info('running on parmas: ' + str(params[0]) + " , " + str(params[1])) - logger.info('it tooks seconds:') - logger.info(end - start) - logger.info('\n') - - print ('done') + # Log the results + logger.info(f'Running with params: threshold={threshold}, min_segment={min_segment}') + logger.info(f'Execution time (seconds): {end - start}') + + print(f'Execution time (seconds): {end - start}') + print('Done') if __name__ == '__main__': parser = ArgumentParser() - parser.add_argument('--input', help='input folder path', + parser.add_argument('--input', help='Input folder path', default='/home/adir/Projects/data/wikipedia/wiki4_no_seperators', type=str) - parser.add_argument('--output', help='output folder path', + parser.add_argument('--output', help='Output folder path', default='/home/adir/Projects/data/wikipedia/wiki4_output_graphseg/', type=str) - parser.add_argument('--jar', help='graphseg jar path path', + parser.add_argument('--jar', help='Graphseg jar file path', default='/home/adir/Projects/graphseg/binary/graphseg.jar', type=str) parser.add_argument('--threshold', type=float, required=True) parser.add_argument('--min_segment', type=int, required=True) args = parser.parse_args() - main(args.input, args.output, args.jar, args.threshold, args.min_segment) + main(args.input, args.output, args.jar, args.threshold, args.min_segment) \ No newline at end of file diff --git a/models/from_presentation.py b/models/from_presentation.py index 3750b19..39e138d 100644 --- a/models/from_presentation.py +++ b/models/from_presentation.py @@ -1,25 +1,19 @@ -from __future__ import print_function -from __future__ import division - import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler - logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) - +# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. 
def zero_state(module, batch_size): # * 2 is for the two directions - return Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))), \ - Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))) - + return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ + maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) class SentenceEncodingRNN(nn.Module): def __init__(self, input_size=300, hidden=128, num_layers=2): @@ -43,7 +37,6 @@ def forward(self, x): return reshaped - class Model(nn.Module): def __init__(self, sentence_encoder, hidden=128, num_layers=2): super(Model, self).__init__() @@ -65,19 +58,17 @@ def __init__(self, sentence_encoder, hidden=128, num_layers=2): self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): s_length = s.size()[0] - v = Variable(maybe_cuda(s.unsqueeze(0).unsqueeze(0))) + v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - def pad_document(self, d, max_document_length): d_length = d.size()[0] v = d.unsqueeze(0).unsqueeze(0) - padded = F.pad(v, (0, 0,0, max_document_length - d_length )) # (1, 1, max_length, 300) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -103,18 +94,18 @@ def forward(self, batch): padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) profiler.set() # 1 encoded_sentences = self.sentence_encoder(packed_tensor) profiler.set() # 2 - unsort_order = Variable(maybe_cuda(torch.LongTensor(unsort(sort_order)))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) index = 0 encoded_documents = [] for sentences_count in sentences_per_doc: end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index : end_index, :]) + encoded_documents.append(unsorted_encodings[index: end_index, :]) index = end_index doc_sizes = [doc.size()[0] for doc in encoded_documents] @@ -124,7 +115,7 @@ def forward(self, batch): ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) profiler.set() # 3 sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) profiler.set() # 4 @@ -142,9 +133,8 @@ def forward(self, batch): profiler.finish(profilerLogger) # 5 return x - def create(): sentence_encoder = SentenceEncodingRNN(input_size=300, hidden=256, num_layers=2) - return Model(sentence_encoder, hidden=256, num_layers=2) + return Model(sentence_encoder, hidden=256, num_layers=2) \ No newline at end of file diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 6d168b6..847d053 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -1,25 
+1,19 @@ -from __future__ import print_function -from __future__ import division - import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler - logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) - +# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. def zero_state(module, batch_size): # * 2 is for the two directions - return Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))), \ - Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))) - + return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ + maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) class SentenceEncodingRNN(nn.Module): def __init__(self, input_size, hidden, num_layers): @@ -38,15 +32,14 @@ def forward(self, x): batch_size = x.batch_sizes[0] s = zero_state(self, batch_size) packed_output, _ = self.lstm(x, s) - padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) + padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) - maxes = Variable(maybe_cuda(torch.zeros(batch_size, padded_output.size(2)))) + maxes = maybe_cuda(torch.zeros(batch_size, padded_output.size(2))) for i in range(batch_size): maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] return maxes - class Model(nn.Module): def __init__(self, sentence_encoder, hidden=128, num_layers=2): super(Model, self).__init__() @@ -68,19 +61,17 @@ def __init__(self, sentence_encoder, hidden=128, num_layers=2): self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): s_length = s.size()[0] - v = Variable(maybe_cuda(s.unsqueeze(0).unsqueeze(0))) + v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - def pad_document(self, d, max_document_length): d_length = d.size()[0] v = d.unsqueeze(0).unsqueeze(0) - padded = F.pad(v, (0, 0,0, max_document_length - d_length )) # (1, 1, max_length, 300) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -104,16 +95,16 @@ def forward(self, batch): padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) encoded_sentences = self.sentence_encoder(packed_tensor) - unsort_order = Variable(maybe_cuda(torch.LongTensor(unsort(sort_order)))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) index = 0 encoded_documents = [] for sentences_count in sentences_per_doc: end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index : end_index, :]) + encoded_documents.append(unsorted_encodings[index: end_index, :]) index = end_index doc_sizes = [doc.size()[0] for doc in 
encoded_documents] @@ -123,7 +114,7 @@ def forward(self, batch): ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) @@ -137,9 +128,8 @@ def forward(self, batch): x = self.h2s(sentence_outputs) return x - def create(): sentence_encoder = SentenceEncodingRNN(input_size=300, hidden=256, num_layers=2) - return Model(sentence_encoder, hidden=256, num_layers=2) + return Model(sentence_encoder, hidden=256, num_layers=2) \ No newline at end of file diff --git a/models/naive.py b/models/naive.py index f8c8820..a5ed174 100644 --- a/models/naive.py +++ b/models/naive.py @@ -1,12 +1,9 @@ import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from utils import maybe_cuda, unsort import numpy as np - - class Naive(nn.Module): def __init__(self, segment_average_size): super(Naive, self).__init__() @@ -14,30 +11,26 @@ def __init__(self, segment_average_size): self.segment_average_size = segment_average_size self.criterion = nn.CrossEntropyLoss() - - - def create_random_output(self,size): - - cut_probability = float (1) / self.segment_average_size + def create_random_output(self, size): + cut_probability = float(1) / self.segment_average_size cuts = np.random.choice([0, 1], size=(size,), p=[1-cut_probability, cut_probability]) - ret = torch.zeros(size,2) + ret = torch.zeros(size, 2) + + for i in range(ret.size(0)): + ret[i, 1] = cuts[i] + ret[i, 0] = 1 - cuts[i] - for i in range(ret.size()[0]): - ret[i,1] = cuts[i] - ret[i,0] = 1 - cuts[i] return ret def forward(self, x): - batch_segmentations = [] for document in x: num_sentences = len(document) doc_segmentation = self.create_random_output(num_sentences - 1) batch_segmentations.append(doc_segmentation) - batch_output = torch.cat(batch_segmentations,0) - return Variable(batch_output) - + batch_output = torch.cat(batch_segmentations, 0) + return batch_output # No need for Variable wrapper def create(): - return Naive(13) + return Naive(13) \ No newline at end of file diff --git a/models/single_lstm.py b/models/single_lstm.py index 5560141..72920bf 100644 --- a/models/single_lstm.py +++ b/models/single_lstm.py @@ -1,25 +1,19 @@ -from __future__ import print_function -from __future__ import division - import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler - logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) - +# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. 
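# Illustrative aside, not part of the original patch: a minimal sketch, assuming a
# PyTorch >= 0.4 runtime, of the Variable-to-plain-tensor migration applied throughout
# these model files. Tensors now carry autograd state themselves, so the removed
# Variable(...) wrapper is redundant; the helper name below is hypothetical.
import torch

def make_lstm_state(num_layers, batch_size, hidden, bidirectional=True):
    # One (h_0, c_0) pair sized (num_layers * num_directions, batch, hidden); no Variable wrapper needed.
    directions = 2 if bidirectional else 1
    h_0 = torch.zeros(num_layers * directions, batch_size, hidden)
    c_0 = torch.zeros(num_layers * directions, batch_size, hidden)
    return h_0, c_0

# Gradients are tracked only when requested explicitly, e.g. torch.zeros(4, 4, requires_grad=True).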
def zero_state(module, batch_size): # * 2 is for the two directions - return Variable(maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden))), \ - Variable(maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden))) - + return maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)), \ + maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)) class SentenceEncodingRNN(nn.Module): def __init__(self, input_size, hidden, num_layers): @@ -43,7 +37,6 @@ def forward(self, x): return reshaped - class Model(nn.Module): def __init__(self, sentence_encoder, hidden, num_layers): super(Model, self).__init__() @@ -65,19 +58,17 @@ def __init__(self, sentence_encoder, hidden, num_layers): self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): - s_length = s.size()[0] - v = Variable(maybe_cuda(s.unsqueeze(0).unsqueeze(0))) + s_length = s.size(0) + v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - def pad_document(self, d, max_document_length): - d_length = d.size()[0] + d_length = d.size(0) v = d.unsqueeze(0).unsqueeze(0) - padded = F.pad(v, (0, 0,0, max_document_length - d_length )) # (1, 1, max_length, 300) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -92,10 +83,10 @@ def forward(self, batch): all_batch_sentences.extend(document) sentences_per_doc.append(len(document)) - lengths = [s.size()[0] for s in all_batch_sentences] + lengths = [s.size(0) for s in all_batch_sentences] sort_order = np.argsort(lengths)[::-1] sorted_sentences = [all_batch_sentences[i] for i in sort_order] - sorted_lengths = [s.size()[0] for s in sorted_sentences] + sorted_lengths = [s.size(0) for s in sorted_sentences] max_length = max(lengths) logger.debug('Num sentences: %s, max sentence length: %s', @@ -103,28 +94,28 @@ def forward(self, batch): padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) profiler.set() # 1 encoded_sentences = self.sentence_encoder(packed_tensor) profiler.set() # 2 - unsort_order = Variable(maybe_cuda(torch.LongTensor(unsort(sort_order)))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) index = 0 encoded_documents = [] for sentences_count in sentences_per_doc: end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index : end_index, :]) + encoded_documents.append(unsorted_encodings[index: end_index, :]) index = end_index - doc_sizes = [doc.size()[0] for doc in encoded_documents] + doc_sizes = [doc.size(0) for doc in encoded_documents] max_doc_size = np.max(doc_sizes) ordered_document_idx = np.argsort(doc_sizes)[::-1] ordered_doc_sizes = sorted(doc_sizes)[::-1] ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, 
enforce_sorted=False) profiler.set() # 3 sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) profiler.set() # 4 @@ -142,9 +133,8 @@ def forward(self, batch): profiler.finish(profilerLogger) # 5 return x - def create(): sentence_encoder = SentenceEncodingRNN(input_size=300, hidden=256, num_layers=4) - return Model(sentence_encoder, hidden=256, num_layers=4) + return Model(sentence_encoder, hidden=256, num_layers=4) \ No newline at end of file diff --git a/run.py b/run.py index 9465ba7..b042079 100644 --- a/run.py +++ b/run.py @@ -12,7 +12,7 @@ from tensorboard_logger import configure, log_value import os import sys -from pathlib2 import Path +from pathlib import Path from wiki_loader import WikipediaDataSet import accuracy import numpy as np @@ -22,20 +22,17 @@ preds_stats = utils.predictions_analysis() - def softmax(x): max_each_row = np.max(x, axis=1, keepdims=True) exps = np.exp(x - max_each_row) sums = np.sum(exps, axis=1, keepdims=True) return exps / sums - def import_model(model_name): module = __import__('models.' + model_name, fromlist=['models']) return module.create() - -class Accuracies(object): +class Accuracies: def __init__(self): self.thresholds = np.arange(0, 1, 0.05) self.accuracies = {k: accuracy.Accuracy() for k in self.thresholds} @@ -47,7 +44,7 @@ def update(self, output_np, targets_np): to_idx = int(current_idx + document_sentence_count) for threshold in self.thresholds: - output = ((output_np[current_idx: to_idx, :])[:, 1] > threshold) + output = (output_np[current_idx: to_idx, 1] > threshold) h = np.append(output, [1]) tt = np.append(t, [1]) @@ -68,123 +65,91 @@ def calc_accuracy(self): return min_pk, min_epoch_windiff, min_threshold - def train(model, args, epoch, dataset, logger, optimizer): model.train() - total_loss = float(0) + total_loss = 0.0 # Changed to float value with tqdm(desc='Training', total=len(dataset)) as pbar: for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - - pbar.update() - model.zero_grad() - output = model(data) - target_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - loss = model.criterion(output, target_var) - loss.backward() - - optimizer.step() - total_loss += loss.data[0] - # logger.debug('Batch %s - Train error %7.4f', i, loss.data[0]) - pbar.set_description('Training, loss={:.4}'.format(loss.data[0])) - # except Exception as e: - # logger.info('Exception "%s" in batch %s', e, i) - # logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) - # pass - - total_loss = total_loss / len(dataset) - logger.debug('Training Epoch: {}, Loss: {:.4}.'.format(epoch + 1, total_loss)) - log_value('Training Loss', total_loss, epoch + 1) + if i == args.stop_after: + break + + pbar.update() + model.zero_grad() + output = model(data) + target_var = maybe_cuda(torch.cat(target, 0), args.cuda) + loss = model.criterion(output, target_var) + loss.backward() + optimizer.step() + total_loss += loss.item() # Replaced deprecated .data[0] with .item() + + pbar.set_description(f'Training, loss={loss.item():.4}') + + total_loss /= len(dataset) + logger.debug(f'Training Epoch: {epoch + 1}, Loss: {total_loss:.4}') + log_value('Training Loss', total_loss, epoch + 1) def validate(model, args, epoch, dataset, logger): model.eval() - with tqdm(desc='Validatinging', total=len(dataset)) as pbar: + with tqdm(desc='Validating', total=len(dataset)) as pbar: acc = Accuracies() for i, (data, target, paths) in 
enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) - acc.update(output_softmax.data.cpu().numpy(), target) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) - - # except Exception as e: - # # logger.info('Exception "%s" in batch %s', e, i) - # logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) - # pass + acc.update(output_softmax.cpu().numpy(), target) epoch_pk, epoch_windiff, threshold = acc.calc_accuracy() - logger.info('Validating Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . '.format(epoch + 1, - preds_stats.get_accuracy(), - epoch_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.info(f'Validating Epoch: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {epoch_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return epoch_pk, threshold - def test(model, args, epoch, dataset, logger, threshold): model.eval() with tqdm(desc='Testing', total=len(dataset)) as pbar: acc = accuracy.Accuracy() for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) - - current_idx = 0 - - for k, t in enumerate(target): - document_sentence_count = len(t) - to_idx = int(current_idx + document_sentence_count) - - output = ((output_softmax.data.cpu().numpy()[current_idx: to_idx, :])[:, 1] > threshold) - h = np.append(output, [1]) - tt = np.append(t, [1]) - - acc.update(h, tt) - - current_idx = to_idx - - # acc.update(output_softmax.data.cpu().numpy(), target) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) + + current_idx = 0 + for k, t in enumerate(target): + document_sentence_count = len(t) + to_idx = int(current_idx + document_sentence_count) + + output = (output_softmax.cpu().numpy()[current_idx:to_idx, 1] > threshold) + h = np.append(output, [1]) + tt = np.append(t, [1]) - # - # except Exception as e: - # # logger.info('Exception "%s" in batch %s', e, i) - # logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) + acc.update(h, tt) + current_idx = to_idx epoch_pk, epoch_windiff = acc.calc_accuracy() - logger.debug('Testing Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . 
'.format(epoch + 1, - preds_stats.get_accuracy(), - epoch_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.debug(f'Testing Epoch: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {epoch_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return epoch_pk - def main(args): sys.path.append(str(Path(__file__).parent)) @@ -194,30 +159,20 @@ def main(args): logger = utils.setup_logger(__name__, os.path.join(args.checkpoint_dir, 'train.log')) utils.read_config_file(args.config) - utils.config.update(args.__dict__) - logger.debug('Running with config %s', utils.config) + utils.config.update(vars(args)) # Updated to use vars(args) + logger.debug(f'Running with config {utils.config}') configure(os.path.join('runs', args.expname)) - if not args.test: - word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) - else: - word2vec = None + word2vec = None if args.test else gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) if not args.infer: - if args.wiki: - dataset_path = Path(utils.config['wikidataset']) - train_dataset = WikipediaDataSet(dataset_path / 'train', word2vec=word2vec, - high_granularity=args.high_granularity) - dev_dataset = WikipediaDataSet(dataset_path / 'dev', word2vec=word2vec, high_granularity=args.high_granularity) - test_dataset = WikipediaDataSet(dataset_path / 'test', word2vec=word2vec, - high_granularity=args.high_granularity) - - else: - dataset_path = utils.config['choidataset'] - train_dataset = ChoiDataset(dataset_path, word2vec) - dev_dataset = ChoiDataset(dataset_path, word2vec) - test_dataset = ChoiDataset(dataset_path, word2vec) + dataset_class = WikipediaDataSet if args.wiki else ChoiDataset + dataset_path = Path(utils.config['wikidataset']) if args.wiki else Path(utils.config['choidataset']) + + train_dataset = dataset_class(dataset_path / 'train', word2vec, high_granularity=args.high_granularity) + dev_dataset = dataset_class(dataset_path / 'dev', word2vec, high_granularity=args.high_granularity) + test_dataset = dataset_class(dataset_path / 'test', word2vec, high_granularity=args.high_granularity) train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, num_workers=args.num_workers) @@ -226,60 +181,45 @@ def main(args): test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers) - assert bool(args.model) ^ bool(args.load_from) # exactly one of them must be set - - if args.model: - model = import_model(args.model) - elif args.load_from: - with open(args.load_from, 'rb') as f: - model = torch.load(f) - - model.train() + model = import_model(args.model) if args.model else torch.load(open(args.load_from, 'rb')) model = maybe_cuda(model) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + if not args.infer: best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) - with (checkpoint_path / 'model{:03d}.t7'.format(j)).open('wb') as f: - torch.save(model, f) + torch.save(model, open(checkpoint_path / f'model{j:03d}.t7', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) if val_pk < best_val_pk: test_pk = test(model, args, j, test_dl, logger, threshold) - logger.debug( - colored( - 'Current best model from epoch {} with p_k {} and threshold {}'.format(j, test_pk, threshold), - 'green')) + logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} 
and threshold {threshold}', 'green')) best_val_pk = val_pk - with (checkpoint_path / 'best_model.t7'.format(j)).open('wb') as f: - torch.save(model, f) + torch.save(model, open(checkpoint_path / 'best_model.t7', 'wb')) else: - test_dataset = WikipediaDataSet(args.infer, word2vec=word2vec, - high_granularity=args.high_granularity) - test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, - num_workers=args.num_workers) - print test(model, args, 0, test_dl, logger, 0.4) - + test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), + batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers) + print(test(model, args, 0, test_dl, logger, 0.4)) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) - parser.add_argument('--test_bs', help='Batch size', type=int, default=5) + parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) parser.add_argument('--model', help='Model to run - will import and run') parser.add_argument('--load_from', help='Location of a .t7 model file to load. Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') parser.add_argument('--checkpoint_dir', help='Checkpoint directory', default='checkpoints') - parser.add_argument('--stop_after', help='Number of batches to stop after', default=None, type=int) + parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--wiki', help='Use wikipedia as dataset?', action='store_true') + parser.add_argument('--wiki', help='Use Wikipedia as dataset?', action='store_true') parser.add_argument('--num_workers', help='How many workers to use for data loading', type=int, default=0) - parser.add_argument('--high_granularity', help='Use high granularity for wikipedia dataset segmentation', action='store_true') - parser.add_argument('--infer', help='inference_dir', type=str) + parser.add_argument('--high_granularity', help='Use high granularity for Wikipedia dataset segmentation', action='store_true') + parser.add_argument('--infer', help='Inference directory', type=str) - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/run_web_server.py b/run_web_server.py index 5bd70db..e98c53d 100644 --- a/run_web_server.py +++ b/run_web_server.py @@ -1,18 +1,23 @@ from argparse import ArgumentParser from utils import config, read_config_file +from webapp import app -parser = ArgumentParser() -parser.add_argument('--cuda', help='Is cuda?', action='store_true') -parser.add_argument('--model', help='Model file path', required=True) -parser.add_argument('--config', help='Path to config.json', default='config.json') -parser.add_argument('--test', help='Use fake word2vec', action='store_true') -parser.add_argument('--port', type=int, help='List to this port') +def main(args): + # Read configuration from the config file + read_config_file(args.config) + 
config.update(vars(args)) # Use vars(args) to convert argparse.Namespace to a dictionary -args = parser.parse_args() + # Run the web server + app.run(debug=True, port=args.port) -read_config_file(args.config) -config.update(args.__dict__) +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--cuda', help='Is cuda?', action='store_true') + parser.add_argument('--model', help='Model file path', required=True) + parser.add_argument('--config', help='Path to config.json', default='config.json') + parser.add_argument('--test', help='Use fake word2vec', action='store_true') + parser.add_argument('--port', type=int, help='Port to listen on', default=5000) -from webapp import app -app.run(debug=True, port=args.port) + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/seg_comparsion.py b/seg_comparsion.py index b5fe8e3..7142126 100644 --- a/seg_comparsion.py +++ b/seg_comparsion.py @@ -9,43 +9,41 @@ from choiloader import ChoiDataset, collate_fn, read_choi_file from torch.utils.data import DataLoader from test_accuracy import softmax -from wiki_loader import clean_section,split_sentences,section_delimiter,extract_sentence_words +from wiki_loader import clean_section, split_sentences, section_delimiter, extract_sentence_words import os import sys - preds_stats = utils.predictions_analysis() paragraphs_delimiter = "==" def main(args): - utils.read_config_file(args.config) - if not args.test: word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) else: word2vec = None + # Load model with open(args.model, 'rb') as f: model = torch.load(f) model = maybe_cuda(model) model.eval() - data_path = args.folder - if (args.wiki): - dataset = WikipediaDataSet(args.folder,word2vec,folder=True) - delimeter = section_delimiter - - elif args.choi: #not in use - dataset = ChoiDataset(args.folder, word2vec,is_cache_path=True) - delimeter = paragraphs_delimiter + # Set dataset and delimiter based on the input type + if args.wiki: + dataset = WikipediaDataSet(args.folder, word2vec, folder=True) + delimiter = section_delimiter + elif args.choi: # Not in use but kept for reference + dataset = ChoiDataset(args.folder, word2vec, is_cache_path=True) + delimiter = paragraphs_delimiter else: - print 'required dataset type' + print('Dataset type is required') return - dl = DataLoader(dataset,batch_size=1, collate_fn=collate_fn, shuffle=False) + dl = DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) + # Process each document in the dataset for i, (data, targets, paths) in enumerate(dl): doc_path = str(paths[0]) output = model(data) @@ -56,67 +54,63 @@ def main(args): target_seg = targets_var.data.cpu().numpy() preds_stats.add(output_seg, target_seg) + # Create the output folder if it doesn't exist if not os.path.exists(args.output_folder): os.makedirs(args.output_folder) - result_file_path = os.path.join(args.output_folder,os.path.basename(doc_path)) - result_file = open(str(result_file_path ),"w") - - file = open(str(doc_path), "r") - raw_content = file.read() - file.close() - sections = [clean_section(s) for s in raw_content.decode('utf-8').strip().split(delimeter) if len(s) > 0 and s != "\n"] - - sum_sentences = 0 - total_num_sentences = 0 - bad_sentences = 0 - - for section in sections: - sentences = split_sentences(section) - if sentences: - total_num_sentences += len(sentences) - for i in range(0,len(sentences)): - sentence = sentences[i] - words = extract_sentence_words(sentence) - sentence = " 
".join(words) - - result_file.write(sentence.encode('utf-8')) - - sys.stdout.flush() - result_file.write("\n".encode('utf-8')) - if (len(target_seg) == sum_sentences): ## last sentence - continue - if (target_seg[sum_sentences]): - result_file.write(delimeter.encode('utf-8')) - sys.stdout.flush() - result_file.write("\n".encode('utf-8')) - if (output_seg[sum_sentences]): - result_file.write("*******Our_Segmentation********".encode('utf-8')) - result_file.write("\n".encode('utf-8')) - sum_sentences += 1 - result_file.close() - - if ((total_num_sentences - bad_sentences) != (len(target_seg) + 1)): ## +1 last sentence segment doesn't counted - print 'Pick another article' - print 'len(targets) + 1= ' + str(len(target_seg) + 1) - print 'total_num_sentences - bad_sentences= ' + str(total_num_sentences - bad_sentences) - else : - print 'finish comparsion' - print 'result at ' + str(result_file_path ) - print ('F1: {:.4}.'.format(preds_stats.get_f1())) - print ('Accuracy: {:.4}.'.format(preds_stats.get_accuracy())) - + # Write the result file + result_file_path = os.path.join(args.output_folder, os.path.basename(doc_path)) + with open(result_file_path, "w", encoding='utf-8') as result_file: + with open(doc_path, "r", encoding='utf-8') as file: + raw_content = file.read() + + sections = [clean_section(s) for s in raw_content.strip().split(delimiter) if len(s) > 0 and s != "\n"] + + sum_sentences = 0 + total_num_sentences = 0 + bad_sentences = 0 + + for section in sections: + sentences = split_sentences(section) + if sentences: + total_num_sentences += len(sentences) + for i, sentence in enumerate(sentences): + words = extract_sentence_words(sentence) + sentence = " ".join(words) + + result_file.write(sentence + "\n") + + if len(target_seg) == sum_sentences: # Last sentence + continue + + if target_seg[sum_sentences]: # True segmentation + result_file.write(delimiter + "\n") + + if output_seg[sum_sentences]: # Model segmentation + result_file.write("*******Our_Segmentation********\n") + + sum_sentences += 1 + + if (total_num_sentences - bad_sentences) != (len(target_seg) + 1): # +1 for last sentence + print('Pick another article') + print(f'len(targets) + 1 = {len(target_seg) + 1}') + print(f'total_num_sentences - bad_sentences = {total_num_sentences - bad_sentences}') + else: + print('Finished comparison') + print(f'Result at {result_file_path}') + print(f'F1: {preds_stats.get_f1():.4}.') + print(f'Accuracy: {preds_stats.get_accuracy():.4}.') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? 
(e.g., fake word2vec)', action='store_true') parser.add_argument('--model', help='Model to run - will import and run', required=True) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--folder', help='folder with files to test on', required=True) - parser.add_argument('--output_folder', help='folder for result', required=True) - parser.add_argument('--wiki', help='if its wiki article', action='store_true') - parser.add_argument('--manifesto', help='if its manifesto article', action='store_true') - parser.add_argument('--choi', help='if its choi article', action='store_true') + parser.add_argument('--folder', help='Folder with files to test on', required=True) + parser.add_argument('--output_folder', help='Folder for results', required=True) + parser.add_argument('--wiki', help='If the dataset is from Wikipedia', action='store_true') + parser.add_argument('--manifesto', help='If the dataset is from Manifesto', action='store_true') + parser.add_argument('--choi', help='If the dataset is from Choi', action='store_true') - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/test_accuracy.py b/test_accuracy.py index 6763796..7f9b861 100644 --- a/test_accuracy.py +++ b/test_accuracy.py @@ -1,10 +1,7 @@ -from __future__ import division - import torch from torch.utils.data import DataLoader from torch.autograd import Variable import numpy as np - from choiloader import ChoiDataset, collate_fn from tqdm import tqdm from argparse import ArgumentParser @@ -13,7 +10,7 @@ import utils import os import sys -from pathlib2 import Path +from pathlib import Path from wiki_loader import WikipediaDataSet import accuracy from models import naive @@ -29,13 +26,12 @@ def softmax(x): sums = np.sum(exps, axis=1, keepdims=True) return exps / sums -def getSegmentsFolders(path): - +def get_segments_folders(path): ret_folders = [] folders = [o for o in os.listdir(path) if os.path.isdir(os.path.join(path, o))] for folder in folders: - if folder.__contains__("-"): - ret_folders.append(os.path.join(path,folder)) + if '-' in folder: + ret_folders.append(os.path.join(path, folder)) return ret_folders @@ -45,10 +41,10 @@ def main(args): sys.path.append(str(Path(__file__).parent)) utils.read_config_file(args.config) - utils.config.update(args.__dict__) + utils.config.update(vars(args)) # Use vars for better argument handling logger.debug('Running with config %s', utils.config) - print ('Running with threshold: ' + str(args.seg_threshold)) + print(f'Running with threshold: {args.seg_threshold}') preds_stats = utils.predictions_analysis() if not args.test: @@ -57,55 +53,54 @@ def main(args): word2vec = None word2vec_done = timer() - print 'Loading word2vec ellapsed: ' + str(word2vec_done - start) + ' seconds' + print(f'Loading word2vec elapsed: {word2vec_done - start} seconds') dirname = 'test' + # Determine dataset based on wiki flag if args.wiki: dataset_folders = [Path(utils.config['wikidataset']) / dirname] - if (args.wiki_folder): - dataset_folders = [] - dataset_folders.append(args.wiki_folder) - print 'running on wikipedia' + if args.wiki_folder: + dataset_folders = [args.wiki_folder] + print('Running on Wikipedia') else: - if (args.bySegLength): - dataset_folders = getSegmentsFolders(utils.config['choidataset']) - print 'run on choi by segments length' - else : + if args.bySegLength: + dataset_folders = get_segments_folders(utils.config['choidataset']) + print('Running on Choi by segments length') + else: dataset_folders 
= [utils.config['choidataset']] - print 'running on Choi' - + print('Running on Choi') + # Load the model with open(args.model, 'rb') as f: model = torch.load(f) model = maybe_cuda(model) model.eval() - if (args.naive): + if args.naive: model = naive.create() for dataset_path in dataset_folders: + if args.bySegLength: + print('Segment is', os.path.basename(dataset_path), ":") - if (args.bySegLength): - print 'Segment is ',os.path.basename(dataset_path), " :" - + # Load dataset if args.wiki: - if (args.wiki_folder): + if args.wiki_folder: dataset = WikipediaDataSet(dataset_path, word2vec, folder=True, high_granularity=False) - else : + else: dataset = WikipediaDataSet(dataset_path, word2vec, high_granularity=False) else: - dataset = ChoiDataset(dataset_path , word2vec) + dataset = ChoiDataset(dataset_path, word2vec) dl = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=False) - - + # Testing loop with tqdm(desc='Testing', total=len(dl)) as pbar: total_accurate = 0 total_count = 0 total_loss = 0 - acc = accuracy.Accuracy() + acc = accuracy.Accuracy() for i, (data, targets, paths) in enumerate(dl): if i == args.stop_after: @@ -113,16 +108,16 @@ def main(args): pbar.update() output = model(data) - targets_var = Variable(maybe_cuda(torch.cat(targets, 0), args.cuda), requires_grad=False) + targets_var = maybe_cuda(torch.cat(targets, 0), args.cuda) batch_loss = 0 - output_prob = softmax(output.data.cpu().numpy()) + output_prob = softmax(output.cpu().numpy()) output_seg = output_prob[:, 1] > args.seg_threshold - target_seg = targets_var.data.cpu().numpy() + target_seg = targets_var.cpu().numpy() batch_accurate = (output_seg == target_seg).sum() total_accurate += batch_accurate total_count += len(target_seg) total_loss += batch_loss - preds_stats.add(output_seg,target_seg) + preds_stats.add(output_seg, target_seg) current_target_idx = 0 for k, t in enumerate(targets): @@ -131,18 +126,18 @@ def main(args): to_idx = int(current_target_idx + document_sentence_count) h = output_seg[current_target_idx: to_idx] - # hypothesis and targets are missing classification of last sentence, and therefore we will add - # 1 for both + # Add classification for the last sentence h = np.append(h, [1]) t = np.append(t.cpu().numpy(), [1]) - acc.update(h,t, sentences_length=sentences_length) + acc.update(h, t, sentences_length=sentences_length) current_target_idx = to_idx logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i, batch_loss, batch_accurate / len(target_seg)) pbar.set_description('Testing, Accuracy={:.4}'.format(batch_accurate / len(target_seg))) + # Logging results average_loss = total_loss / len(dl) average_accuracy = total_accurate / total_count calculated_pk, _ = acc.calc_accuracy() @@ -150,29 +145,26 @@ def main(args): logger.info('Finished testing.') logger.info('Average loss: %s', average_loss) logger.info('Average accuracy: %s', average_accuracy) - logger.info('Pk: {:.4}.'.format(calculated_pk)) - logger.info('F1: {:.4}.'.format(preds_stats.get_f1())) - + logger.info(f'Pk: {calculated_pk:.4}.') + logger.info(f'F1: {preds_stats.get_f1():.4}.') end = timer() - print ('Seconds to execute to whole flow: ' + str(end - start)) - + print(f'Seconds to execute the whole flow: {end - start}') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? 
(e.g., fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) - parser.add_argument('--model', help='Model to run - will import and run', required=True) - parser.add_argument('--stop_after', help='Number of batches to stop after', default=None, type=int) + parser.add_argument('--model', help='Model to run', required=True) + parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--wiki', help='Use wikipedia as dataset?', action='store_true') - parser.add_argument('--bySegLength', help='calc pk on choi by segments length?', action='store_true') - parser.add_argument('--wiki_folder', help='path to folder which contains wiki documents') - parser.add_argument('--naive', help='use naive model', action='store_true') - parser.add_argument('--seg_threshold', help='Threshold for binary classificetion', type=float, default=0.4) - parser.add_argument('--calc_word', help='Whether to calc P_K by word', action='store_true') - - - main(parser.parse_args()) + parser.add_argument('--wiki', help='Use Wikipedia as dataset?', action='store_true') + parser.add_argument('--bySegLength', help='Calculate pk on Choi by segments length?', action='store_true') + parser.add_argument('--wiki_folder', help='Path to folder containing wiki documents') + parser.add_argument('--naive', help='Use naive model', action='store_true') + parser.add_argument('--seg_threshold', help='Threshold for binary classification', type=float, default=0.4) + parser.add_argument('--calc_word', help='Calculate P_K by word', action='store_true') + + main(parser.parse_args()) \ No newline at end of file diff --git a/test_accuracy_choi.py b/test_accuracy_choi.py index 97fff4b..84ef0fa 100644 --- a/test_accuracy_choi.py +++ b/test_accuracy_choi.py @@ -2,7 +2,6 @@ from torch.utils.data import DataLoader from torch.autograd import Variable import torch.nn.functional as F - from choiloader import ChoiDataset, collate_fn from tqdm import tqdm from argparse import ArgumentParser @@ -12,7 +11,7 @@ from tensorboard_logger import configure import os import sys -from pathlib2 import Path +from pathlib import Path import accuracy import numpy as np from termcolor import colored @@ -21,32 +20,29 @@ preds_stats = utils.predictions_analysis() - def softmax(x): max_each_row = np.max(x, axis=1, keepdims=True) exps = np.exp(x - max_each_row) sums = np.sum(exps, axis=1, keepdims=True) return exps / sums - def import_model(model_name): module = __import__('models.' 
+ model_name, fromlist=['models']) return module.create() - -class Accuracies(object): +class Accuracies: def __init__(self): self.thresholds = np.arange(0, 1, 0.05) self.accuracies = {k: accuracy.Accuracy() for k in self.thresholds} def update(self, output_np, targets_np): current_idx = 0 - for k, t in enumerate(targets_np): + for t in targets_np: document_sentence_count = len(t) to_idx = int(current_idx + document_sentence_count) for threshold in self.thresholds: - output = ((output_np[current_idx: to_idx, :])[:, 1] > threshold) + output = (output_np[current_idx: to_idx, 1] > threshold) h = np.append(output, [1]) tt = np.append(t, [1]) @@ -69,84 +65,72 @@ def calc_accuracy(self): def validate(model, args, epoch, dataset, logger): model.eval() - with tqdm(desc='Validatinging', total=len(dataset)) as pbar: + with tqdm(desc='Validating', total=len(dataset)) as pbar: acc = Accuracies() for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) - acc.update(output_softmax.data.cpu().numpy(), target) + acc.update(output_softmax.cpu().numpy(), target) epoch_pk, epoch_windiff, threshold = acc.calc_accuracy() - logger.info('Validating Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . 
'.format(epoch + 1, - preds_stats.get_accuracy(), - epoch_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.info(f'Validating Epoch: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {epoch_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return epoch_pk, threshold - def test(model, args, epoch, dataset, logger, test_threshold, test_acc): model.eval() with tqdm(desc='Testing', total=len(dataset)) as pbar: for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) - - current_idx = 0 - - for k, t in enumerate(target): - document_sentence_count = len(t) - to_idx = int(current_idx + document_sentence_count) - - output = ((output_softmax.data.cpu().numpy()[current_idx: to_idx, :])[:, 1] > test_threshold) - h = np.append(output, [1]) - tt = np.append(t, [1]) - - test_acc.update(h, tt) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) + + current_idx = 0 + for t in target: + document_sentence_count = len(t) + to_idx = int(current_idx + document_sentence_count) + + output = (output_softmax.cpu().numpy()[current_idx: to_idx, 1] > test_threshold) + h = np.append(output, [1]) + tt = np.append(t, [1]) - current_idx = to_idx + test_acc.update(h, tt) + current_idx = to_idx test_pk, epoch_windiff = test_acc.calc_accuracy() - logger.debug('Testing validation section: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . 
'.format(epoch + 1, - preds_stats.get_accuracy(), - test_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.debug(f'Testing validation section: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {test_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return test_pk - def main(args): sys.path.append(str(Path(__file__).parent)) - logger = utils.setup_logger(__name__, 'cross_validate_choi.log') + logger = utils.setup_logger(__name__, 'cross_validate_choi.log') utils.read_config_file(args.config) - utils.config.update(args.__dict__) - logger.debug('Running with config %s', utils.config) + utils.config.update(vars(args)) # Updated to use vars(args) for cleaner conversion + logger.debug(f'Running with config {utils.config}') configure(os.path.join('runs', args.expname)) @@ -155,7 +139,6 @@ def main(args): else: word2vec = None - dataset_path = Path(args.flat_choi) with open(args.load_from, 'rb') as f: @@ -166,10 +149,10 @@ def main(args): test_accuracy = accuracy.Accuracy() for j in range(5): - validate_folder_numbers = range(5) + validate_folder_numbers = list(range(5)) validate_folder_numbers.remove(j) validate_folder_names = [dataset_path.joinpath(str(num)) for num in validate_folder_numbers] - dev_dataset = ChoiDataset(dataset_path , word2vec, folder=True, folders_paths=validate_folder_names) + dev_dataset = ChoiDataset(dataset_path, word2vec, folder=True, folders_paths=validate_folder_names) test_dataset = ChoiDataset(dataset_path, word2vec, folder=True, folders_paths=[dataset_path.joinpath(str(j))]) dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, @@ -179,28 +162,24 @@ def main(args): _, threshold = validate(model, args, j, dev_dl, logger) test_pk = test(model, args, j, test_dl, logger, threshold, test_accuracy) - logger.debug(colored('Cross validation section {} with p_k {} and threshold {}'.format(j, test_pk, threshold),'green')) + logger.debug(colored(f'Cross validation section {j} with p_k {test_pk} and threshold {threshold}', 'green')) cross_validation_pk, _ = test_accuracy.calc_accuracy() - print ('Final cross validaiton Pk is: ' + str(cross_validation_pk)) - logger.debug( - colored('Final cross validaiton Pk is: {}'.format(cross_validation_pk), 'green')) - - + print(f'Final cross validation Pk is: {cross_validation_pk}') + logger.debug(colored(f'Final cross validation Pk is: {cross_validation_pk}', 'green')) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? (e.g., fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) - parser.add_argument('--test_bs', help='Batch size', type=int, default=5) - parser.add_argument('--load_from', help='Location of a .t7 model file to load. 
Training will continue') + parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) + parser.add_argument('--load_from', help='Location of a .t7 model file to load') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') - parser.add_argument('--stop_after', help='Number of batches to stop after', default=None, type=int) + parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--window_size', help='Window size to encode setence', type=int, default=1) - parser.add_argument('--num_workers', help='How many workers to use for data loading', type=int, default=0) + parser.add_argument('--window_size', help='Window size to encode sentence', type=int, default=1) + parser.add_argument('--num_workers', help='Number of workers for data loading', type=int, default=0) parser.add_argument('--flat_choi', help='Path to flat choi dataset') - - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/tests.py b/tests.py index 6fc40f2..5fe5dce 100644 --- a/tests.py +++ b/tests.py @@ -1,75 +1,59 @@ -from __future__ import print_function - -from unittest import TestCase -from utils import unsort import unittest +from utils import unsort import accuracy import numpy as np import text_manipulation -class LoaderTests(TestCase): - def testReallyTrivial(self): - assert 1 + 1 == 2 +class LoaderTests(unittest.TestCase): + def test_really_trivial(self): + self.assertEqual(1 + 1, 2) class PkTests(unittest.TestCase): def test_get_boundaries(self): - sentences_class = [] - sentences_class.append(("first sen.", 1)) - sentences_class.append(("sec sen.", 1)) - sentences_class.append(("third sen.", 0)) - sentences_class.append(("forth sen.", 1)) - sentences_class.append(("fifth sen.", 0)) - sentences_class.append(("sixth sen.", 0)) - sentences_class.append(("seventh sen.", 1)) - + sentences_class = [ + ("first sen.", 1), + ("sec sen.", 1), + ("third sen.", 0), + ("forth sen.", 1), + ("fifth sen.", 0), + ("sixth sen.", 0), + ("seventh sen.", 1) + ] expected = [2, 2, 4, 6] result = accuracy.get_seg_boundaries(sentences_class) - - for i, num in enumerate(result): - self.assertTrue(num == expected[i]) + self.assertEqual(result, expected) def test_get_boundaries2(self): - sentences_class = [] - sentences_class.append(("first sen is 5 words.", 0)) - sentences_class.append(("sec sen.", 0)) - sentences_class.append(("third sen is a very very very long sentence.", 1)) - sentences_class.append(("the forth one is single segment.", 1)) - - + sentences_class = [ + ("first sen is 5 words.", 0), + ("sec sen.", 0), + ("third sen is a very very very long sentence.", 1), + ("the forth one is a single segment.", 1) + ] expected = [16, 6] result = accuracy.get_seg_boundaries(sentences_class) - - for i, num in enumerate(result): - self.assertTrue(num == expected[i]) - - def test_pk_perefct_seg(self): - sentences_class = [] - sentences_class.append(("first sen is 5 words.", 0)) - sentences_class.append(("sec sen.", 0)) - sentences_class.append(("third sen is a very very very long sentence.", 1)) - sentences_class.append(("the forth one is single segment.", 1)) - + self.assertEqual(result, expected) + + def test_pk_perfect_seg(self): + sentences_class = [ + ("first sen is 5 words.", 0), + ("sec sen.", 0), + ("third sen is a very very very long sentence.", 1), + ("the forth one is a single segment.", 1) + ] gold = 
accuracy.get_seg_boundaries(sentences_class) h = accuracy.get_seg_boundaries(sentences_class) - # with specified window size - for window_size in range(1, 15, 1): + for window_size in range(1, 15): acc = accuracy.pk(gold, h, window_size=window_size) - self.assertEquals(acc, 1) + self.assertEqual(acc, 1) - # with default window size acc = accuracy.pk(gold, h) - self.assertEquals(acc, 1) + self.assertEqual(acc, 1) def test_pk_false_neg(self): - h = [] - h.append(("5 words sentence of data.", 0)) - h.append(("2 sentences same seg.", 1)) - - gold = [] - gold.append(("5 words sentence of data.", 1)) - gold.append(("2 sentences same seg.", 1)) - + h = [("5 words sentence of data.", 0), ("2 sentences same seg.", 1)] + gold = [("5 words sentence of data.", 1), ("2 sentences same seg.", 1)] gold = accuracy.get_seg_boundaries(gold) h = accuracy.get_seg_boundaries(h) @@ -77,75 +61,73 @@ def test_pk_false_neg(self): window_size = 3 comparison_count = 6 - # with default window size acc = accuracy.pk(gold, h) - self.assertEquals(acc, window_size / comparison_count) + self.assertEqual(acc, window_size / comparison_count) window_size = 4 acc = accuracy.pk(gold, h) - self.assertEquals(acc, window_size / comparison_count) + self.assertEqual(acc, window_size / comparison_count) def test_windiff(self): - h = [] - h.append(("5 words sentence of data.", 0)) - h.append(("short.", 1)) - h.append(("extra segmented sen.", 1)) - h.append(("last and very very very very very long sen.", 1)) - - - gold = [] - gold.append(("5 words sentence of data.", 1)) - gold.append(("short.", 1)) - gold.append(("extra segmented sen.", 0)) - gold.append(("last and very very very very very long sen.", 1)) - + h = [ + ("5 words sentence of data.", 0), + ("short.", 1), + ("extra segmented sen.", 1), + ("last and very very very very very long sen.", 1) + ] + + gold = [ + ("5 words sentence of data.", 1), + ("short.", 1), + ("extra segmented sen.", 0), + ("last and very very very very very long sen.", 1) + ] gold = accuracy.get_seg_boundaries(gold) h = accuracy.get_seg_boundaries(h) window_size = 3 + acc = accuracy.win_diff(gold, h, window_size=window_size) + self.assertEqual(float(acc), 0.6) - acc = accuracy.win_diff(gold, h, window_size = window_size) - self.assertEquals(float(acc), 0.6) - window_size = 5 - expected = float(1)- float(8) / 13 + expected = 1 - 8 / 13 acc = accuracy.win_diff(gold, h, window_size=window_size) - self.assertEquals("{0:.5f}".format(float(acc)), "{0:.5f}".format(expected)) - + self.assertAlmostEqual(float(acc), expected, places=5) -class UnsortTests(TestCase): +class UnsortTests(unittest.TestCase): def test_unsort(self): x = np.random.randint(0, 100, 10) sort_order = np.argsort(x) unsort_order = unsort(sort_order) - assert np.all(x[sort_order][unsort_order] == x) - - -class SentenceTokenizerTests(TestCase): - def test_a_little(self): - a = text_manipulation.split_sentences(u"Hello, Mr. Trump, how do you do? What? Where? I don't i.e e.g Russia.") - assert a == [u'Hello, Mr. Trump, how do you do?', - u'What?', - u'Where?', - u"I don't i.e e.g Russia."] + np.testing.assert_array_equal(x[sort_order][unsort_order], x) + +class SentenceTokenizerTests(unittest.TestCase): + def test_split_sentences(self): + text = u"Hello, Mr. Trump, how do you do? What? Where? I don't i.e e.g Russia." + expected = [ + u'Hello, Mr. Trump, how do you do?', + u'What?', + u'Where?', + u"I don't i.e e.g Russia." 
+ ] + result = text_manipulation.split_sentences(text) + self.assertEqual(result, expected) def test_linebreaks(self): text = u'''Line one. Still line one. Line two. Can I span two lines?''' - a = text_manipulation.split_sentences(text) - print(a) - assert a == [u'Line one.', - u'Still line one.', - u'Line two.', - u'Can I span\n two lines?'] - - - - + expected = [ + u'Line one.', + u'Still line one.', + u'Line two.', + u'Can I span\n two lines?' + ] + result = text_manipulation.split_sentences(text) + self.assertEqual(result, expected) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/text_manipulation.py b/text_manipulation.py index 238a3d9..50b738f 100644 --- a/text_manipulation.py +++ b/text_manipulation.py @@ -1,15 +1,16 @@ import nltk.data -import exceptions import numpy as np from nltk.tokenize import RegexpTokenizer import wiki_utils import wiki_thresholds import utils +import logging +# Initialize global variables sentence_tokenizer = None words_tokenizer = None -missing_stop_words = set(['of', 'a', 'and', 'to']) -logger = utils.setup_logger(__name__, 'text_manipulation.log', True ) +missing_stop_words = {'of', 'a', 'and', 'to'} +logger = utils.setup_logger(__name__, 'text_manipulation.log', delete_old=True) def get_punkt(): @@ -19,7 +20,7 @@ def get_punkt(): try: tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') - except exceptions.LookupError: + except LookupError: nltk.download('punkt') tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') @@ -28,90 +29,85 @@ def get_punkt(): def get_words_tokenizer(): global words_tokenizer - if words_tokenizer: return words_tokenizer words_tokenizer = RegexpTokenizer(r'\w+') return words_tokenizer - - def split_sentence_with_list(sentence): - list_pattern = "\n" + wiki_utils.get_list_token() + "." - if sentence.endswith( list_pattern ): - #splited_sentence = [str for str in sentence.encode('utf-8').split("\n" + wiki_utils.get_list_token() + ".") if len(str) > 0] - splited_sentence = [str for str in sentence.split("\n" + wiki_utils.get_list_token() + ".") if - len(str) > 0] - splited_sentence.append(wiki_utils.get_list_token() + ".") - return splited_sentence + if sentence.endswith(list_pattern): + split_sentence = [s for s in sentence.split(list_pattern) if len(s) > 0] + split_sentence.append(wiki_utils.get_list_token() + ".") + return split_sentence else: return [sentence] -def split_sentece_colon_new_line(sentence): - - splited_sentence = sentence.split(":\n") - if (len(splited_sentence) == 1): - return splited_sentence +def split_sentence_colon_new_line(sentence): + split_sentence = sentence.split(":\n") + if len(split_sentence) == 1: + return split_sentence + new_sentences = [] - # -1 . 
not to add ":" to last sentence - for i in range(len(splited_sentence) - 1): - if (len(splited_sentence[i]) > 0): - new_sentences.append(splited_sentence[i] + ":") - if (len(splited_sentence[-1]) > 0): - new_sentences.append(splited_sentence[-1]) + for i in range(len(split_sentence) - 1): + if len(split_sentence[i]) > 0: + new_sentences.append(split_sentence[i] + ":") + + if len(split_sentence[-1]) > 0: + new_sentences.append(split_sentence[-1]) + return new_sentences -def split_long_sentences_with_backslash_n(max_words_in_sentence,sentences, doc_id): +def split_long_sentences_with_backslash_n(max_words_in_sentence, sentences, doc_id): new_sentences = [] for sentence in sentences: sentence_words = extract_sentence_words(sentence) if len(sentence_words) > max_words_in_sentence: - splitted_sentences = sentence.split('\n') - if len(splitted_sentences) > 1: - logger.info("Sentence with backslash was splitted. Doc Id: " + str(doc_id) +" Sentence: " + sentence) - new_sentences.extend(splitted_sentences ) + split_sentences = sentence.split('\n') + if len(split_sentences) > 1: + logger.info(f"Sentence with backslash was split. Doc Id: {doc_id} Sentence: {sentence}") + new_sentences.extend(split_sentences) else: if "\n" in sentence: - logger.info("No split for sentence with backslash n. Doc Id: " + str(doc_id) +" Sentence: " + sentence) + logger.info(f"No split for sentence with backslash n. Doc Id: {doc_id} Sentence: {sentence}") new_sentences.append(sentence) return new_sentences def split_sentences(text, doc_id): sentences = get_punkt().tokenize(text) - senteces_list_fix = [] + sentences_list_fixed = [] for sentence in sentences: - seplited_list_sentence = split_sentence_with_list(sentence) - senteces_list_fix.extend(seplited_list_sentence) + split_list_sentence = split_sentence_with_list(sentence) + sentences_list_fixed.extend(split_list_sentence) - sentence_colon_fix = [] - for sentence in senteces_list_fix: - splitted_colon_sentence = split_sentece_colon_new_line(sentence) - sentence_colon_fix.extend(splitted_colon_sentence) - - sentences_without_backslash_n = split_long_sentences_with_backslash_n(wiki_thresholds.max_words_in_sentence_with_backslash_n, sentence_colon_fix, doc_id) - - ret_sentences = [] - for sentence in sentences_without_backslash_n: - ret_sentences.append(sentence.replace('\n',' ')) + sentences_colon_fixed = [] + for sentence in sentences_list_fixed: + split_colon_sentence = split_sentence_colon_new_line(sentence) + sentences_colon_fixed.extend(split_colon_sentence) + sentences_no_backslash_n = split_long_sentences_with_backslash_n( + wiki_thresholds.max_words_in_sentence_with_backslash_n, + sentences_colon_fixed, + doc_id + ) + ret_sentences = [sentence.replace('\n', ' ') for sentence in sentences_no_backslash_n] return ret_sentences -def extract_sentence_words(sentence, remove_missing_emb_words = False,remove_special_tokens = False): - if (remove_special_tokens): +def extract_sentence_words(sentence, remove_missing_emb_words=False, remove_special_tokens=False): + if remove_special_tokens: for token in wiki_utils.get_special_tokens(): - # Can't do on sentence words because tokenizer delete '***' of tokens. 
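# Special tokens are stripped from the raw sentence string here, before tokenization,
# because the \w+ word tokenizer discards their '***' delimiters and would otherwise
# leave stray fragments in the resulting word list.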
sentence = sentence.replace(token, "") + tokenizer = get_words_tokenizer() sentence_words = tokenizer.tokenize(sentence) + if remove_missing_emb_words: sentence_words = [w for w in sentence_words if w not in missing_stop_words] return sentence_words - def word_model(word, model): if model is None: return np.random.randn(1, 300) @@ -119,6 +115,5 @@ def word_model(word, model): if word in model: return model[word].reshape(1, 300) else: - #print ('Word missing w2v: ' + word) - return model['UNK'].reshape(1, 300) - + # If word not in model, return 'UNK' embedding + return model['UNK'].reshape(1, 300) \ No newline at end of file diff --git a/times_profiler.py b/times_profiler.py index 6875a50..b79d73c 100644 --- a/times_profiler.py +++ b/times_profiler.py @@ -1,36 +1,41 @@ from timeit import default_timer as timer -class profiler(): - +class profiler: segments = [] start = 0 end = 0 - @staticmethod - def set (): - + def set(): + """ + Mark the end of a segment and start the timer for the next segment. + """ profiler.end = timer() profiler.segments.append(profiler.end - profiler.start) profiler.start = timer() - return - @staticmethod def init(): + """ + Initialize the profiler by starting the timer. + """ profiler.start = timer() - return - @staticmethod def finish(profilerLog): + """ + Finish profiling and log the results to the provided logger. + + Args: + profilerLog: A logger object to which profiling results will be logged. + """ profiler.end = timer() profiler.segments.append(profiler.end - profiler.start) - str2log = "" - for i in range(len(profiler.segments)): - str2log += str(i) +"-"+str(i+1)+" = " + "{:.2f}".format(profiler.segments[i]) + " " + + # Format the results for logging + str2log = " ".join([f"{i}-{i+1} = {segment:.2f}" for i, segment in enumerate(profiler.segments)]) profilerLog.debug(str2log) - profiler.segments = [] - return + # Clear the segments after logging + profiler.segments = [] \ No newline at end of file diff --git a/utils.py b/utils.py index 351f58c..cce93b1 100644 --- a/utils.py +++ b/utils.py @@ -3,37 +3,29 @@ import sys import numpy as np import random -from pathlib2 import Path +from pathlib import Path # Updated to use pathlib (pathlib2 is not needed in Python 3) from shutil import copy - - config = {} - def read_config_file(path='config.json'): global config - with open(path, 'r') as f: config.update(json.load(f)) - def maybe_cuda(x, is_cuda=None): global config - if is_cuda is None and 'cuda' in config: is_cuda = config['cuda'] - if is_cuda: return x.cuda() return x - -def setup_logger(logger_name, filename, delete_old = False): +def setup_logger(logger_name, filename, delete_old=False): logger = logging.getLogger(logger_name) logger.setLevel(logging.DEBUG) stderr_handler = logging.StreamHandler(sys.stderr) - file_handler = logging.FileHandler(filename, mode='w') if delete_old else logging.FileHandler(filename) + file_handler = logging.FileHandler(filename, mode='w' if delete_old else 'a') file_handler.setLevel(logging.DEBUG) stderr_handler.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') @@ -43,134 +35,80 @@ def setup_logger(logger_name, filename, delete_old = False): logger.addHandler(file_handler) return logger - def unsort(sort_order): result = [-1] * len(sort_order) - for i, index in enumerate(sort_order): result[index] = i - return result -class f1(object): - - def __init__(self,ner_size): +class F1: + def __init__(self, ner_size): self.ner_size = ner_size - self.tp = np.array([0] * (ner_size +1)) - 
self.fp = np.array([0] * (ner_size +1)) - self.fn = np.array([0] * (ner_size +1)) - - def add(self,preds,targets,length): - tp = self.tp - fp = self.fp - fn = self.fn - ner_size = self.ner_size - - prediction = np.argmax(preds, 2) + self.tp = np.zeros(ner_size + 1) + self.fp = np.zeros(ner_size + 1) + self.fn = np.zeros(ner_size + 1) + def add(self, preds, targets, length): + prediction = np.argmax(preds, axis=2) for i in range(len(targets)): for j in range(length[i]): if targets[i, j] == prediction[i, j]: - tp[targets[i, j]] += 1 + self.tp[targets[i, j]] += 1 else: - fp[targets[i, j]] += 1 - fn[prediction[i, j]] += 1 + self.fp[targets[i, j]] += 1 + self.fn[prediction[i, j]] += 1 - unnamed_entity = ner_size - 1 - for i in range(ner_size): + unnamed_entity = self.ner_size - 1 + for i in range(self.ner_size): if i != unnamed_entity: - tp[ner_size] += tp[i] - fp[ner_size] += fp[i] - fn[ner_size] += fn[i] - + self.tp[self.ner_size] += self.tp[i] + self.fp[self.ner_size] += self.fp[i] + self.fn[self.ner_size] += self.fn[i] def score(self): - tp = self.tp - fp = self.fp - fn = self.fn - ner_size = self.ner_size - - precision = [] - recall = [] - fscore = [] - for i in range(ner_size + 1): - precision.append(tp[i] * 1.0 / (tp[i] + fp[i])) - recall.append(tp[i] * 1.0 / (tp[i] + fn[i])) - fscore.append(2.0 * precision[i] * recall[i] / (precision[i] + recall[i])) + precision = np.divide(self.tp, self.tp + self.fp, out=np.zeros_like(self.tp), where=self.tp + self.fp != 0) + recall = np.divide(self.tp, self.tp + self.fn, out=np.zeros_like(self.tp), where=self.tp + self.fn != 0) + fscore = 2 * precision * recall / (precision + recall + 1e-8) # Avoid division by zero print(fscore) + return fscore[self.ner_size] - return fscore[ner_size] - - -class predictions_analysis(object): - +class predictions_analysis: def __init__(self): self.tp = 0 self.tn = 0 self.fp = 0 self.fn = 0 - - def add(self,predicions, targets): - self.tp += ((predicions == targets) & (1 == predicions)).sum() - self.tn += ((predicions == targets) & (0 == predicions)).sum() - self.fp += ((predicions != targets) & (1 == predicions)).sum() - self.fn += ((predicions != targets) & (0 == predicions)).sum() - + def add(self, predictions, targets): + self.tp += ((predictions == targets) & (predictions == 1)).sum() + self.tn += ((predictions == targets) & (predictions == 0)).sum() + self.fp += ((predictions != targets) & (predictions == 1)).sum() + self.fn += ((predictions != targets) & (predictions == 0)).sum() def calc_recall(self): - if self.tp == 0 and self.fn == 0: - return -1 - - return np.true_divide(self.tp, self.tp + self.fn) + return np.divide(self.tp, self.tp + self.fn) if self.tp + self.fn != 0 else -1 def calc_precision(self): - if self.tp == 0 and self.fp == 0: - return -1 - - return np.true_divide(self.tp,self.tp + self.fp) - - - + return np.divide(self.tp, self.tp + self.fp) if self.tp + self.fp != 0 else -1 def get_f1(self): - if (self.tp + self.fp == 0): - return 0.0 - if (self.tp + self.fn == 0): + if self.tp + self.fp == 0 or self.tp + self.fn == 0: return 0.0 precision = self.calc_precision() recall = self.calc_recall() - if (not ((precision + recall) == 0)): - f1 = 2*(precision*recall) / (precision + recall) - else: - f1 = 0.0 - - return f1 + return 2 * precision * recall / (precision + recall + 1e-8) if precision + recall != 0 else 0.0 def get_accuracy(self): - total = self.tp + self.tn + self.fp + self.fn - if (total == 0) : - return 0.0 - else: - return np.true_divide(self.tp + self.tn, total) - + return 
np.divide(self.tp + self.tn, total) if total != 0 else 0.0 def reset(self): - self.tp = 0 - self.tn = 0 - self.fn = 0 - self.fp = 0 - + self.tp = self.tn = self.fp = self.fn = 0 -def get_random_files(count, input_folder, output_folder, specific_section = True): +def get_random_files(count, input_folder, output_folder, specific_section=True): files = Path(input_folder).glob('*/*/*/*') if specific_section else Path(input_folder).glob('*/*/*/*/*') - file_paths = [] - for f in files: - file_paths.append(f) - + file_paths = list(files) random_paths = random.sample(file_paths, count) - for random_path in random_paths: output_path = Path(output_folder).joinpath(random_path.name) - copy(str(random_path), str (output_path)) \ No newline at end of file + copy(random_path, output_path) \ No newline at end of file diff --git a/wiki_extractor.py b/wiki_extractor.py index 3921987..fe7814b 100644 --- a/wiki_extractor.py +++ b/wiki_extractor.py @@ -329,10 +329,10 @@ def subst(self, params, extractor, depth=0): return ''.join([tpl.subst(params, extractor, depth) for tpl in self]) def __str__(self): - return ''.join([unicode(x) for x in self]) + return ''.join([str(x) for x in self]) -class TemplateText(unicode): +class TemplateText(str): """Fixed text of template""" def subst(self, params, extractor, depth): @@ -1361,7 +1361,7 @@ def sharp_expr(expr): expr = re.sub('mod', '%', expr) expr = re.sub('\bdiv\b', '/', expr) expr = re.sub('\bround\b', '|ROUND|', expr) - return unicode(eval(expr)) + return str(eval(expr)) except: return '' @@ -2282,7 +2282,7 @@ def compact(text): def handle_unicode(entity): numeric_code = int(entity[2:-1]) if numeric_code >= 0x10000: return '' - return unichr(numeric_code) + return chr(numeric_code) # ------------------------------------------------------------------------------ diff --git a/wiki_loader.py b/wiki_loader.py index b15fb27..009b901 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -1,43 +1,36 @@ from torch.utils.data import Dataset -from text_manipulation import word_model -from text_manipulation import extract_sentence_words +from text_manipulation import word_model, extract_sentence_words from pathlib2 import Path import re import wiki_utils import os - import utils logger = utils.setup_logger(__name__, 'train.log') section_delimiter = "========" - def get_files(path): all_objects = Path(path).glob('**/*') files = [str(p) for p in all_objects if p.is_file()] return files - def get_cache_path(wiki_folder): cache_file_path = wiki_folder / 'paths_cache' return cache_file_path - def cache_wiki_filenames(wiki_folder): files = Path(wiki_folder).glob('*/*/*/*') cache_file_path = get_cache_path(wiki_folder) with cache_file_path.open('w+') as f: for file in files: - f.write(unicode(file) + u'\n') - + f.write(str(file) + u'\n') def clean_section(section): cleaned_section = section.strip('\n') return cleaned_section - def get_scections_from_text(txt, high_granularity=True): sections_to_keep_pattern = wiki_utils.get_seperator_foramt() if high_granularity else wiki_utils.get_seperator_foramt( (1, 2)) @@ -50,27 +43,24 @@ def get_scections_from_text(txt, high_granularity=True): sentences = [s for s in txt.strip().split("\n") if len(s) > 0 and s != "\n"] txt = '\n'.join(sentences).strip('\n') - all_sections = re.split(sections_to_keep_pattern, txt) non_empty_sections = [s for s in all_sections if len(s) > 0] return non_empty_sections - def get_sections(path, high_granularity=True): file = open(str(path), "r") raw_content = file.read() file.close() - clean_txt = 
raw_content.decode('utf-8').strip() + clean_txt = raw_content.strip() sections = [clean_section(s) for s in get_scections_from_text(clean_txt, high_granularity)] return sections - def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=False, remove_special_tokens=False, - return_as_sentences=False, high_granularity=True,only_letters = False): + return_as_sentences=False, high_granularity=True, only_letters=False): data = [] targets = [] all_sections = get_sections(path, high_granularity) @@ -89,10 +79,9 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals if 1 <= len(sentence_words): data.append([word_model(word, word2vec) for word in sentence_words]) else: - #raise ValueError('Sentence in wikipedia file is empty') logger.info('Sentence in wikipedia file is empty') else: # for the annotation. keep sentence as is. - if (only_letters): + if only_letters: sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence) data.append(sentence) else: @@ -102,20 +91,21 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals return data, targets, path - class WikipediaDataSet(Dataset): def __init__(self, root, word2vec, train=True, manifesto=False, folder=False, high_granularity=False): - - if (manifesto): + if manifesto: self.textfiles = list(Path(root).glob('*')) else: - if (folder): + if folder: self.textfiles = get_files(root) else: root_path = Path(root) cache_path = get_cache_path(root_path) if not cache_path.exists(): + print('Creating cache....') cache_wiki_filenames(root_path) + else: + print(f'Cache exists at {cache_path}') self.textfiles = cache_path.read_text().splitlines() if len(self.textfiles) == 0: @@ -127,9 +117,8 @@ def __init__(self, root, word2vec, train=True, manifesto=False, folder=False, hi def __getitem__(self, index): path = self.textfiles[index] - return read_wiki_file(Path(path), self.word2vec, ignore_list=True, remove_special_tokens=True, high_granularity=self.high_granularity) def __len__(self): - return len(self.textfiles) + return len(self.textfiles) \ No newline at end of file diff --git a/wiki_processor.py b/wiki_processor.py index fadadba..ba661c8 100644 --- a/wiki_processor.py +++ b/wiki_processor.py @@ -3,114 +3,87 @@ import subprocess import re from pathlib2 import Path -from random import shuffle,seed,uniform +from random import shuffle, seed, uniform import math -from shutil import move +from shutil import move import utils import wiki_utils import text_manipulation import wiki_thresholds import json - -logger = utils.setup_logger(__name__, 'processor_log.log', True ) +logger = utils.setup_logger(__name__, 'processor_log.log', True) doc_split_delimiter = "" id_parts = 7 -# minimal number of sentences in document (used to filter non informal documents such as https://en.wikipedia.org/wiki?curid=32283 seed(1234) -wikipedia_namespaces = ['Category', 'File', 'Ru', 'Wikipedia', 'Talk', 'User', 'MediaWiki', 'Template', 'Help', 'Portal', 'Book', 'Draft', - 'Education Program', 'TimedText', 'Module', 'Gadget', 'Gadget definition', 'Media', 'Special'] - -disambigutaiton_pattern = '(disambiguation)' - +wikipedia_namespaces = ['Category', 'File', 'Ru', 'Wikipedia', 'Talk', 'User', 'MediaWiki', 'Template', 'Help', 'Portal', + 'Book', 'Draft', 'Education Program', 'TimedText', 'Module', 'Gadget', 'Gadget definition', + 'Media', 'Special'] -global num_sentneces_for_avg -global sum_sentneces_for_avg -num_sentneces_for_avg = 0 -sum_sentneces_for_avg = 0 +disambiguation_pattern = '(disambiguation)' +global 
num_sentences_for_avg +global sum_sentences_for_avg +num_sentences_for_avg = 0 +sum_sentences_for_avg = 0 -def count_str_occurrences(str,findStr): - - return len(str.split(findStr)) - 1 +def count_str_occurrences(text, findStr): + return len(text.split(findStr)) - 1 def get_file_path(id): - chopped_id = [] - id_str = str(id) - padding_count = id_parts - len(id_str) - while padding_count > 0: - id_str = "0" + id_str - padding_count-= 1 - - for i in range(0,3): - chopped_id.append(id_str[:2]) - id_str = id_str[2:] - - path = "" - for sub_path in chopped_id: - path =os.path.join(path, sub_path) - return path + id_str = str(id).zfill(id_parts) + return os.path.join(id_str[:2], id_str[2:4], id_str[4:6]) def process_header(header): id_match = re.search(r'', header) title = title_match.groups()[0] - not_valid = title.isdigit() or any(title.startswith(prefix + ':' or prefix + ' talk:' ) for prefix in wikipedia_namespaces) or title.endswith(disambigutaiton_pattern) + not_valid = title.isdigit() or any(title.startswith(prefix + ':' or prefix + ' talk:') + for prefix in wikipedia_namespaces) or title.endswith(disambiguation_pattern) return id, not not_valid def get_sections(content): lines = content.split('\n') section = "" - # sections include headers - sections = [] - sections.append(wiki_utils.get_segment_seperator(1,"preface.")) + sections = [wiki_utils.get_segment_separator(1, "preface.")] for line in lines: - if (wiki_utils.is_seperator_line(line)): + if wiki_utils.is_separator_line(line): if len(section) > 0: sections.append(section) section = "" sections.append(line) - else: - section += line - section += '\n' + section += line + '\n' if len(section) > 0: sections.append(section) return sections - - def process_section(section, id): - global num_sentneces_for_avg - global sum_sentneces_for_avg + global num_sentences_for_avg, sum_sentences_for_avg sentences = text_manipulation.split_sentences(section, id) section_sentences = [] - num_lists = 0 - num_sentences = 0 - num_formulas = 0 - num_codes = 0 + num_lists, num_sentences, num_formulas, num_codes = 0, 0, 0, 0 last_sentence_was_list = False + for sentence in sentences: is_list_sentence = wiki_utils.get_list_token() + "." 
== sentence.encode('utf-8') if '\n' in sentence: - logger.info("DocId: " + str(id) + " back slash in sentence: " + sentence) - if (wiki_utils.get_list_token() in sentence) and (wiki_utils.get_list_token() + ".") != sentence.encode('utf-8'): - # TODO: delete this if section, since it is not suupposed to happen any more - but still happen + logger.info(f"DocId: {id} backslash in sentence: {sentence}") + if wiki_utils.get_list_token() in sentence and (wiki_utils.get_list_token() + ".") != sentence.encode('utf-8'): num_lists += 1 last_sentence_was_list = True - logger.info("DocId: " + str(id) + " Special case 1: " + sentence) + logger.info(f"DocId: {id} Special case 1: {sentence}") continue elif is_list_sentence: - if (last_sentence_was_list): + if last_sentence_was_list: continue last_sentence_was_list = True num_lists += 1 @@ -118,84 +91,75 @@ def process_section(section, id): last_sentence_was_list = False sentence_words = text_manipulation.extract_sentence_words(sentence) if len(sentence_words) < wiki_thresholds.min_words_in_sentence: - # ignore this sentence continue - sum_sentneces_for_avg += len(sentence_words) - num_sentneces_for_avg += 1 - + sum_sentences_for_avg += len(sentence_words) + num_sentences_for_avg += 1 num_formulas += count_str_occurrences(sentence, wiki_utils.get_formula_token()) - num_codes += count_str_occurrences(sentence, wiki_utils.get_codesnipet_token()) + num_codes += count_str_occurrences(sentence, wiki_utils.get_codesnippet_token()) num_sentences += 1 section_sentences.append(sentence) - valid_section = True error_message = None - if (num_sentences < wiki_thresholds.min_sentence_in_section): + + if num_sentences < wiki_thresholds.min_sentence_in_section: valid_section = False - error_message = "sentences count in section is too low" + error_message = "Sentences count in section is too low" - if (num_sentences > 0): - lists_perentage = float(num_lists) / float(num_sentences) - if lists_perentage >= wiki_thresholds.max_list_in_section_percentage: + if num_sentences > 0: + lists_percentage = float(num_lists) / float(num_sentences) + if lists_percentage >= wiki_thresholds.max_list_in_section_percentage: valid_section = False - error_message = "list percentage in section is too high: " + str(lists_perentage) + error_message = f"List percentage in section is too high: {lists_percentage}" - section_text = ''.join(section_sentences) + section_text = ''.join(section_sentences) if len(re.findall('[a-zA-Z]', section_text)) < wiki_thresholds.min_section_char_count: valid_section = False - error_message = "char count in section is too low" + error_message = "Char count in section is too low" if num_formulas >= wiki_thresholds.max_section_formulas_count: valid_section = False - error_message = "number of formulas in section is too high: " + str(num_formulas) + error_message = f"Number of formulas in section is too high: {num_formulas}" - if num_codes >= wiki_thresholds.max_section_code_snipet_count: + if num_codes >= wiki_thresholds.max_section_code_snippet_count: valid_section = False - error_message = "number of code snippets in section is too high: " + str(num_codes) - + error_message = f"Number of code snippets in section is too high: {num_codes}" return valid_section, section_sentences, error_message def is_valid_article(valid_section_count, section_count): if valid_section_count < wiki_thresholds.min_valid_section_count: - return False, "Valid section count is too low: " + str(valid_section_count) + return False, f"Valid section count is too low: {valid_section_count}" - 
valid_section_percentage = float(valid_section_count) / float (section_count) + valid_section_percentage = float(valid_section_count) / float(section_count) if valid_section_percentage < wiki_thresholds.min_valid_section_percentage: - return False, "Valid section percentage is too low: " + str(valid_section_percentage) - - - return True,"" - + return False, f"Valid section percentage is too low: {valid_section_percentage}" + return True, "" def max_level_in_article(content): - max_lavel = -1 + max_level = -1 for line in content: - if (wiki_utils.is_seperator_line(line)): + if wiki_utils.is_separator_line(line): current_level = wiki_utils.get_segment_level(line) - if current_level > max_lavel: - max_lavel = current_level - return max_lavel - + if current_level > max_level: + max_level = current_level + return max_level def delete_empty_segment_headers(content): num_of_deletions = 0 max_level = max_level_in_article(content) - for handle_level in range(max_level,0,-1): + for handle_level in range(max_level, 0, -1): last_section_level = -1 last_section_header = True - for i in range(len(content) -1 , -1 , -1): + for i in range(len(content) - 1, -1, -1): section = content[i] - if (wiki_utils.is_seperator_line(section)): + if wiki_utils.is_separator_line(section): section_level = wiki_utils.get_segment_level(section) - if (section_level == handle_level): - - # empty section if last seciont was also a header - is_empty = last_section_header - if (is_empty & (last_section_level <= section_level)): + if section_level == handle_level: + is_empty = last_section_header + if is_empty and last_section_level <= section_level: del content[i] num_of_deletions += 1 last_section_level = section_level @@ -205,87 +169,57 @@ def delete_empty_segment_headers(content): return content, num_of_deletions - def vec_to_text(sections_with_headers): - adjusted_content = "" - for section in sections_with_headers: - adjusted_content += section + '\n' - return adjusted_content - + return '\n'.join(sections_with_headers) def process_content(content, id): sections_with_headers = get_sections(content) - adjueted_content_text = "" article_lines = [] section_count = 0 valid_section_count = 0 - for i in range(len(sections_with_headers)): - section = sections_with_headers[i] - if wiki_utils.is_seperator_line(section): + + for section in sections_with_headers: + if wiki_utils.is_separator_line(section): article_lines.append(section) else: is_valid_section, section_sentences, message = process_section(section, id) section_count += 1 - if (is_valid_section): + if is_valid_section: valid_section_count += 1 article_lines.extend(section_sentences) else: - logger.info('Invalid section in article id: ' + id + - ' Reason: ' + message + ' Content: ' + vec_to_text(section_sentences).strip('\n') ) + logger.info(f'Invalid section in article id: {id} Reason: {message} Content: {vec_to_text(section_sentences).strip()}') - is_valid,reason = is_valid_article(valid_section_count, section_count ) + is_valid, reason = is_valid_article(valid_section_count, section_count) if is_valid: - article_content,_ = delete_empty_segment_headers(article_lines) - adjueted_content_text = vec_to_text(article_content) - - - return is_valid, adjueted_content_text,reason - - -# old process content, for comparsion -# def process_content(content): -# -# # keep only scetions with minimal number of characters -# sections = [s.strip('\n') for s in content.strip().split(section_delimiter) if -# len(re.findall('[a-zA-Z]', s)) > min_section_length] -# -# # article must have 
at least 3 sections, to avoid articles with only one section which is summaization. E.g: -# # https://en.wikipedia.org/wiki?curid=821470 -# sections_count = len(sections) -# if sections_count < min_article_sections_count or sections_count >= max_article_sections_count: -# return content, False, 'Sections count is: ' + str(sections_count) -# -# # remove first section since it usually the summary of the whole article -# adjueted_content = ('\n' + section_delimiter + '\n').join(sections[1:]) -# -# return adjueted_content, True, "" - + article_content, _ = delete_empty_segment_headers(article_lines) + adjusted_content_text = vec_to_text(article_content) + else: + adjusted_content_text = "" + return is_valid, adjusted_content_text, reason def process_article(article): - non_empty_lines = [l for l in article.strip().split("\n") if l != ""] + non_empty_lines = [l for l in article.strip().split("\n") if l != ""] header = non_empty_lines[0] id, is_valid_header = process_header(header) if not is_valid_header: - logger.info('Invalid header in doc id: ' + str(id)+ ' header: ' + header) + logger.info(f'Invalid header in doc id: {id} header: {header}') return "", id, False content = "\n".join(non_empty_lines[2:]) - is_valid_content, processed_content , debug = process_content(content, id) - if not(is_valid_content): - logger.info('Invalid article in doc id: ' + str(id) + '. ' + debug +'\n\n') + is_valid_content, processed_content, debug = process_content(content, id) + if not is_valid_content: + logger.info(f'Invalid article in doc id: {id}. {debug}\n\n') else: - logger.info('Valid article , id: ' + str(id) +'\n\n') + logger.info(f'Valid article , id: {id}\n\n') return processed_content, id, is_valid_content - -def process_wiki_file(path, output_folder,train_ratio,test_ratio, forbidden_train_ids): - train_size = 0 - dev_size = 0 - test_size = 0 +def process_wiki_file(path, output_folder, train_ratio, test_ratio, forbidden_train_ids): + train_size, dev_size, test_size = 0, 0, 0 with open(path, "r") as file: raw_content = file.read() @@ -295,184 +229,159 @@ def process_wiki_file(path, output_folder,train_ratio,test_ratio, forbidden_trai for article in articles: processed_article, id, is_valid = process_article(article) - processed_articles_count+=1 + processed_articles_count += 1 if not is_valid: - continue; + continue random_num = uniform(0, 1) - if (random_num > train_ratio and random_num <= train_ratio + test_ratio) or int(id) in forbidden_train_ids: + if (random_num > train_ratio and random_num <= train_ratio + test_ratio) or int(id) in forbidden_train_ids: partition = "test" test_size += 1 - elif (random_num > train_ratio + test_ratio): + elif random_num > train_ratio + test_ratio: partition = "dev" dev_size += 1 else: partition = "train" train_size += 1 - output_sub_folder = os.path.join(output_folder,partition, get_file_path(id)) + output_sub_folder = os.path.join(output_folder, partition, get_file_path(id)) if not os.path.exists(output_sub_folder): os.makedirs(output_sub_folder) output_file_path = os.path.join(output_sub_folder, str(id)) with open(output_file_path, "w") as output_file: - output_file.write(processed_article.encode('utf-8'), ) - created_articles_count+=1 - - return created_articles_count, processed_articles_count, train_size,dev_size,test_size + output_file.write(processed_article.encode('utf-8')) + created_articles_count += 1 + return created_articles_count, processed_articles_count, train_size, dev_size, test_size def get_forbidden_train_ids(): - # Return ids of article which must 
be in test set (and not train/dev) with open('wikicities_article_names_to_ids') as f: wiki_cities = json.load(f) with open('wikielements_article_names_to_ids') as f: wiki_elements = json.load(f) - forbidden_train_ids = [] - for k,v in wiki_cities.iteritems(): - forbidden_train_ids.append(int(v)) - for k,v in wiki_elements.iteritems(): - forbidden_train_ids.append(int(v)) - - unique_ids = set(forbidden_train_ids) - - return unique_ids; - - + forbidden_train_ids = [int(v) for d in (wiki_cities, wiki_elements) for v in d.values()] + return set(forbidden_train_ids) def get_wiki_files(path): all_objects = Path(path).glob('**/*') - files = (str(p) for p in all_objects if p.is_file()) - return files + return (str(p) for p in all_objects if p.is_file()) - -def process_wiki_folder(input_folder, output_folder,train_ratio,test_ratio): - total_train_size = 0 - total_dev_size = 0 - total_test_size = 0 - folders = [o for o in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, o))] - total_created_articles = 0 - total_processed_articles = 0 +def process_wiki_folder(input_folder, output_folder, train_ratio, test_ratio): + total_train_size, total_dev_size, total_test_size = 0, 0, 0 + folders = [o for o in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, o))] + total_created_articles, total_processed_articles = 0, 0 previous_debug = 0 forbidden_train_ids = get_forbidden_train_ids() + for folder in folders: full_folder_path = os.path.join(input_folder, folder) if not os.path.exists(output_folder): os.makedirs(output_folder) files = get_wiki_files(full_folder_path) for file in files: - created_articles, processed_articles, train_size, dev_size, test_size = process_wiki_file(file, output_folder, float(train_ratio), float(test_ratio), forbidden_train_ids) + created_articles, processed_articles, train_size, dev_size, test_size = process_wiki_file( + file, output_folder, float(train_ratio), float(test_ratio), forbidden_train_ids + ) total_train_size += train_size total_dev_size += dev_size total_test_size += test_size total_created_articles += created_articles total_processed_articles += processed_articles - if (total_created_articles - previous_debug > 2500): + if total_created_articles - previous_debug > 2500: previous_debug = total_created_articles - print ('Created ' + str(total_created_articles) + ' wiki articles, out of ' + str(total_processed_articles) + ' processed articles') - total_samples = total_train_size + total_dev_size + total_test_size - print 'total_samples = ', str(total_samples) - print "#train = ",total_train_size,"ratio: ","{:.2f}".format(total_train_size / float(total_samples)) - print "#dev = ", total_dev_size,"ratio: ","{:.2f}".format(total_dev_size/ float(total_samples)) - print "#test = ", total_test_size,"ratio: ","{:.2f}".format(total_test_size / float(total_samples)) + print(f'Created {total_created_articles} wiki articles, out of {total_processed_articles} processed articles') + total_samples = total_train_size + total_dev_size + total_test_size + print(f'total_samples = {total_samples}') + print(f"#train = {total_train_size}, ratio: {total_train_size / float(total_samples):.2f}") + print(f"#dev = {total_dev_size}, ratio: {total_dev_size / float(total_samples):.2f}") + print(f"#test = {total_test_size}, ratio: {total_test_size / float(total_samples):.2f}") -def move_wiki_file(src, folder, partition): - # get relative path to inputFolder +def move_wiki_file(src, folder, partition): file = os.path.relpath(src, folder) - - # extract file path in 
train folder - dstFile = os.path.join(folder, partition, file) - dstdir = os.path.dirname(dstFile) + dst_file = os.path.join(folder, partition, file) + dstdir = os.path.dirname(dst_file) if not os.path.exists(dstdir): os.makedirs(dstdir) - move(src, dstFile) - + move(src, dst_file) -def removeEmptyFolders(path, removeRoot=True): +def remove_empty_folders(path, remove_root=True): if not os.path.isdir(path): return - # remove empty subfolders files = os.listdir(path) for f in files: fullpath = os.path.join(path, f) if os.path.isdir(fullpath): - removeEmptyFolders(fullpath) + remove_empty_folders(fullpath) - # if folder empty, delete it files = os.listdir(path) - if len(files) == 0 and removeRoot: - #print "Removing empty folder:", path + if len(files) == 0 and remove_root: os.rmdir(path) - - -def trainTestDev(destFolder, train_size, test_size): +def train_test_dev(dest_folder, train_size, test_size): train_size_ratio = float(train_size) test_size_ratio = float(test_size) dev_size_ratio = 1 - train_size_ratio - test_size_ratio - print (destFolder,train_size,test_size) + print(dest_folder, train_size, test_size) - allFiles = [] - if not os.path.exists(destFolder): - print ("Output folder does not exist") + all_files = [] + if not os.path.exists(dest_folder): + print("Output folder does not exist") return - folders = [o for o in os.listdir(destFolder) if os.path.isdir(os.path.join(destFolder, o))] + folders = [o for o in os.listdir(dest_folder) if os.path.isdir(os.path.join(dest_folder, o))] for folder in folders: - full_folder_path = os.path.join(destFolder, folder) + full_folder_path = os.path.join(dest_folder, folder) files = get_wiki_files(full_folder_path) - allFiles.extend(files) + all_files.extend(files) + shuffle(all_files) - shuffle(allFiles) + train_size = int(math.floor(len(all_files) * train_size_ratio)) + dev_size = int(math.floor(len(all_files) * dev_size_ratio)) - trainSize = int(math.floor(len(allFiles) * train_size_ratio)) - devSize = int(math.floor(len(allFiles) * dev_size_ratio)) - for i in range(0,trainSize): - move_wiki_file(allFiles[i], destFolder, partition="train") + for i in range(train_size): + move_wiki_file(all_files[i], dest_folder, partition="train") - if devSize > 0: - for i in range(trainSize, trainSize + devSize): - move_wiki_file(allFiles[i], destFolder, partition="dev") + if dev_size > 0: + for i in range(train_size, train_size + dev_size): + move_wiki_file(all_files[i], dest_folder, partition="dev") - for i in range(trainSize + devSize,len(allFiles)): - move_wiki_file(allFiles[i], destFolder, partition="test") - print ("#train = ",trainSize) - print ("#dev = ", devSize) - print ("#test = ", len(allFiles) - trainSize -devSize) + for i in range(train_size + dev_size, len(all_files)): + move_wiki_file(all_files[i], dest_folder, partition="test") - removeEmptyFolders(destFolder) + print(f"#train = {train_size}") + print(f"#dev = {dev_size}") + print(f"#test = {len(all_files) - train_size - dev_size}") + remove_empty_folders(dest_folder) -def main (args): - global num_sentneces_for_avg - global sum_sentneces_for_avg +def main(args): + global num_sentences_for_avg, sum_sentences_for_avg if not os.path.exists(args.temp): os.makedirs(args.temp) - # execute extraction of wikipedia dump - cmd = ['python', str(Path(__file__).parent / 'wiki_extractor.py'), '-s', '-o', args.temp, '--article_count', str(args.article_count),'--lists'] - print cmd + + cmd = ['python', str(Path(__file__).parent / 'wiki_extractor.py'), '-s', '-o', args.temp, '--article_count', 
str(args.article_count), '--lists'] + print(cmd) if args.processes: cmd += ['--processes', args.processes] cmd += [args.input] - if not args.no_extractor: + if not args.no_extractor: subprocess.call(cmd) - print ("Finisehd extractor") - - + print("Finished extractor") if not os.path.exists(args.output): os.makedirs(args.output) - # create file per each wiki value from the extracted dump - process_wiki_folder(args.temp, args.output,args.train, args.test) + + process_wiki_folder(args.temp, args.output, args.train, args.test) - print ("Number of processed sentences: " + str(num_sentneces_for_avg)) - print "avg len sentence = " + str(sum_sentneces_for_avg / float(num_sentneces_for_avg)) - print ('done') + print(f"Number of processed sentences: {num_sentences_for_avg}") + print(f"avg len sentence = {sum_sentences_for_avg / float(num_sentences_for_avg)}") + print('done') if __name__ == '__main__': parser = ArgumentParser() @@ -483,6 +392,5 @@ def main (args): parser.add_argument('--output', help='output folder', required=True) parser.add_argument('--train', help='train size ratio', required=True) parser.add_argument('--test', help='test size ratio', required=True) - parser.add_argument("--article_count", help = 'max number of wikipedia articles to extract', default=1000000) - main(parser.parse_args()) - + parser.add_argument('--article_count', help='max number of wikipedia articles to extract', default=1000000) + main(parser.parse_args()) \ No newline at end of file diff --git a/wiki_utils.py b/wiki_utils.py index 32f7d21..8d8bcc8 100644 --- a/wiki_utils.py +++ b/wiki_utils.py @@ -1,22 +1,22 @@ segment_seperator = "========" -def get_segment_seperator(level,name): - return segment_seperator + "," + str(level) + "," +name +def get_segment_seperator(level, name): + return segment_seperator + "," + str(level) + "," + name -def get_seperator_foramt(levels = None): - level_format = '\d' if levels == None else '['+ str(levels[0]) + '-' + str(levels[1]) + ']' - seperator_fromat = segment_seperator + ',' + level_format + ",.*?\." - return seperator_fromat +def get_seperator_foramt(levels=None): + level_format = '\d' if levels is None else '[' + str(levels[0]) + '-' + str(levels[1]) + ']' + separator_format = segment_seperator + ',' + level_format + ",.*?\\." 
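+    # e.g. with the default levels this pattern matches separator lines such as
+    # "========,1,preface."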
+ return separator_format def is_seperator_line(line): return line.startswith(segment_seperator) -def get_segment_level(seperator_line): - return int (seperator_line.split(',')[1]) +def get_segment_level(separator_line): + return int(separator_line.split(',')[1]) -def get_segment_name(seperator_line): - return seperator_line.split(',')[2] +def get_segment_name(separator_line): + return separator_line.split(',')[2] def get_list_token(): return "***LIST***" @@ -28,10 +28,9 @@ def get_codesnipet_token(): return "***codice***" def get_special_tokens(): - special_tokens = [] - special_tokens.append(get_list_token()) - special_tokens.append(get_formula_token()) - special_tokens.append(get_codesnipet_token()) - return special_tokens - - + special_tokens = [ + get_list_token(), + get_formula_token(), + get_codesnipet_token() + ] + return special_tokens \ No newline at end of file From 803025f104a5f7211c7434cd2a35adfff4d8480a Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Mon, 7 Oct 2024 19:27:43 +0530 Subject: [PATCH 02/16] removed logs if sentence is empty --- .gitignore | 1 - config.json | 5 +++++ wiki_loader.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 config.json diff --git a/.gitignore b/.gitignore index de6246e..477c669 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .DS_Store -config.json data/ runs/ diff --git a/config.json b/config.json new file mode 100644 index 0000000..64c1e2a --- /dev/null +++ b/config.json @@ -0,0 +1,5 @@ +{ + "word2vecfile": "/Users/jitesh/Downloads/text-segmentation/data/word2vec/GoogleNews-vectors-negative300.bin", + "choidataset": "/home/omri/code/text-segmentation-2017/data/choi", + "wikidataset": "/Users/jitesh/Downloads/text-segmentation/data/wiki 727" +} \ No newline at end of file diff --git a/wiki_loader.py b/wiki_loader.py index 009b901..c1f74c7 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -79,7 +79,8 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals if 1 <= len(sentence_words): data.append([word_model(word, word2vec) for word in sentence_words]) else: - logger.info('Sentence in wikipedia file is empty') + # logger.info('Sentence in wikipedia file is empty') + continue else: # for the annotation. keep sentence as is. 
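+                    # annotation mode: optionally keep only letters, digits and spaces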
if only_letters: sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence) From 2ce61b243aaad7f80aefa12f09b6632e912fe1e1 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Tue, 8 Oct 2024 19:48:58 +0530 Subject: [PATCH 03/16] code changes for simplified folder structure --- wiki_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wiki_loader.py b/wiki_loader.py index c1f74c7..6483cd2 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -20,12 +20,12 @@ def get_cache_path(wiki_folder): return cache_file_path def cache_wiki_filenames(wiki_folder): - files = Path(wiki_folder).glob('*/*/*/*') + files = str(Path(wiki_folder)) cache_file_path = get_cache_path(wiki_folder) with cache_file_path.open('w+') as f: - for file in files: - f.write(str(file) + u'\n') + for file in os.listdir(files): + f.write(os.path.join(files,file) + u'\n') def clean_section(section): cleaned_section = section.strip('\n') @@ -94,6 +94,7 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals class WikipediaDataSet(Dataset): def __init__(self, root, word2vec, train=True, manifesto=False, folder=False, high_granularity=False): + if manifesto: self.textfiles = list(Path(root).glob('*')) else: From 117820ced0cbe2318dc98117f1007f21b4199873 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Thu, 10 Oct 2024 11:58:24 +0530 Subject: [PATCH 04/16] added pin memory flag with argparse --- .gitignore | 1 + run.py | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 477c669..9644866 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .DS_Store +.vscode/ data/ runs/ diff --git a/run.py b/run.py index b042079..7dceb50 100644 --- a/run.py +++ b/run.py @@ -1,6 +1,5 @@ import torch from torch.utils.data import DataLoader -from torch.autograd import Variable import torch.nn.functional as F from choiloader import ChoiDataset, collate_fn @@ -175,11 +174,11 @@ def main(args): test_dataset = dataset_class(dataset_path / 'test', word2vec, high_granularity=args.high_granularity) train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, - num_workers=args.num_workers) + num_workers=args.num_workers,pin_memory=args.pin_memory) dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, - num_workers=args.num_workers) + num_workers=args.num_workers,pin_memory=args.pin_memory) test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, - num_workers=args.num_workers) + num_workers=args.num_workers,pin_memory=args.pin_memory) model = import_model(args.model) if args.model else torch.load(open(args.load_from, 'rb')) model = maybe_cuda(model) @@ -190,23 +189,24 @@ def main(args): best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) - torch.save(model, open(checkpoint_path / f'model{j:03d}.t7', 'wb')) + torch.save(model, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) if val_pk < best_val_pk: test_pk = test(model, args, j, test_dl, logger, threshold) logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) best_val_pk = val_pk - torch.save(model, open(checkpoint_path / 'best_model.t7', 'wb')) + torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) else: test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), - 
batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers) + batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) print(test(model, args, 0, test_dl, logger, 0.4)) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') + parser.add_argument('--pin_memory', help='Pin Memory?', action='store_true') parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) From c6fffb019b21d32ec2fc34594facba60fc38f82b Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Fri, 11 Oct 2024 22:38:21 +0530 Subject: [PATCH 05/16] moved part of model code to collate_fn --- .gitignore | 1 + choiloader.py | 39 +++++++++++++++++++++- models/max_sentence_embedding.py | 25 ++------------- run.py | 55 +++++++++++++++++++++++++------- 4 files changed, 86 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 9644866..d8a238c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ .vscode/ data/ runs/ +checkpoints/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/choiloader.py b/choiloader.py index 2085fdc..ce6aad9 100644 --- a/choiloader.py +++ b/choiloader.py @@ -6,6 +6,8 @@ import utils import math from pathlib import Path # Use pathlib, which is built-in with Python 3 +from torch.nn.utils.rnn import pack_padded_sequence +import torch.nn.functional as F logger = utils.setup_logger(__name__, 'train.log') @@ -14,6 +16,37 @@ def get_choi_files(path): files = [str(p) for p in all_objects if p.is_file()] return files + +def custom_pad(s, max_length): + s_length = s.size()[0] + v = utils.maybe_cuda(s.unsqueeze(0).unsqueeze(0)) + padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) + shape = padded.size() + return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) + +def pack_tensor(batch): + + sentences_per_doc = [] + all_batch_sentences = [] + for document in batch: + all_batch_sentences.extend(document) + sentences_per_doc.append(len(document)) + + lengths = [s.size()[0] for s in all_batch_sentences] + sort_order = np.argsort(lengths)[::-1] + sorted_sentences = [all_batch_sentences[i] for i in sort_order] + sorted_lengths = [s.size()[0] for s in sorted_sentences] + + max_length = max(lengths) + logger.debug('Num sentences: %s, max sentence length: %s', + sum(sentences_per_doc), max_length) + + padded_sentences = [custom_pad(s, max_length) for s in sorted_sentences] + big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) + return packed_tensor,sentences_per_doc,sort_order + + def collate_fn(batch): batched_data = [] batched_targets = [] @@ -43,7 +76,11 @@ def collate_fn(batch): logger.debug('Exception!', exc_info=True) continue - return batched_data, batched_targets, paths + packed_data,sentences_per_doc,sort_order = pack_tensor(batched_data) + + data = (packed_data,sentences_per_doc,sort_order,len(batch)) + + return (data,batched_targets,paths) def clean_paragraph(paragraph): cleaned_paragraph = paragraph.replace("'' ", " ").replace(" 's", "'s").replace("``", "").strip('\n') diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 847d053..8c57bc5 100644 --- 
a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -74,28 +74,9 @@ def pad_document(self, d, max_document_length): padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - - def forward(self, batch): - batch_size = len(batch) - - sentences_per_doc = [] - all_batch_sentences = [] - for document in batch: - all_batch_sentences.extend(document) - sentences_per_doc.append(len(document)) - - lengths = [s.size()[0] for s in all_batch_sentences] - sort_order = np.argsort(lengths)[::-1] - sorted_sentences = [all_batch_sentences[i] for i in sort_order] - sorted_lengths = [s.size()[0] for s in sorted_sentences] - - max_length = max(lengths) - logger.debug('Num sentences: %s, max sentence length: %s', - sum(sentences_per_doc), max_length) - - padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] - big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) + + def forward(self, data): + packed_tensor, sentences_per_doc, sort_order,batch_size = data encoded_sentences = self.sentence_encoder(packed_tensor) unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) diff --git a/run.py b/run.py index 7dceb50..3e783ed 100644 --- a/run.py +++ b/run.py @@ -1,5 +1,5 @@ import torch -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Subset import torch.nn.functional as F from choiloader import ChoiDataset, collate_fn @@ -9,6 +9,7 @@ import gensim import utils from tensorboard_logger import configure, log_value +import time import os import sys from pathlib import Path @@ -17,7 +18,7 @@ import numpy as np from termcolor import colored -torch.multiprocessing.set_sharing_strategy('file_system') +# torch.multiprocessing.set_sharing_strategy('file_system') preds_stats = utils.predictions_analysis() @@ -64,6 +65,22 @@ def calc_accuracy(self): return min_pk, min_epoch_windiff, min_threshold +def tensor_size_in_bytes(tensor): + return tensor.numel() * tensor.element_size() + +def compute_batch_size(data): + total_size=0 + + for element in data: + num_sentences = len(element) + + for sentence in element: + total_size += tensor_size_in_bytes(sentence) + + return total_size / (1024**2) + + + def train(model, args, epoch, dataset, logger, optimizer): model.train() total_loss = 0.0 # Changed to float value @@ -74,6 +91,7 @@ def train(model, args, epoch, dataset, logger, optimizer): pbar.update() model.zero_grad() + # data_size = compute_batch_size(data) output = model(data) target_var = maybe_cuda(torch.cat(target, 0), args.cuda) loss = model.criterion(output, target_var) @@ -86,7 +104,7 @@ def train(model, args, epoch, dataset, logger, optimizer): total_loss /= len(dataset) logger.debug(f'Training Epoch: {epoch + 1}, Loss: {total_loss:.4}') - log_value('Training Loss', total_loss, epoch + 1) + # log_value('Training Loss', total_loss, epoch + 1) def validate(model, args, epoch, dataset, logger): model.eval() @@ -104,7 +122,7 @@ def validate(model, args, epoch, dataset, logger): target_seg = targets_var.cpu().numpy() preds_stats.add(output_seg, target_seg) - acc.update(output_softmax.cpu().numpy(), target) + acc.update(output_softmax.detach().cpu().numpy(), target) epoch_pk, epoch_windiff, threshold = acc.calc_accuracy() @@ -161,7 +179,9 @@ def main(args): 
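+    # vars(args) folds the parsed CLI flags into the shared utils.config dict, which is
+    # how helpers such as maybe_cuda() pick up the --cuda setting.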
utils.config.update(vars(args)) # Updated to use vars(args) logger.debug(f'Running with config {utils.config}') - configure(os.path.join('runs', args.expname)) + + # log_dir = os.path.join('runs', args.expname, str(time.time())) + # configure(log_dir) word2vec = None if args.test else gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) @@ -173,6 +193,11 @@ def main(args): dev_dataset = dataset_class(dataset_path / 'dev', word2vec, high_granularity=args.high_granularity) test_dataset = dataset_class(dataset_path / 'test', word2vec, high_granularity=args.high_granularity) + if args.subset: + train_dataset = Subset(train_dataset,range(1000)) + dev_dataset = Subset(dev_dataset,range(1000)) + test_dataset = Subset(test_dataset,range(1000)) + train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, num_workers=args.num_workers,pin_memory=args.pin_memory) dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, @@ -185,6 +210,11 @@ def main(args): optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + if args.benchmark: + for j in range(args.epochs): + train(model, args, j, train_dl, logger, optimizer) + return + if not args.infer: best_val_pk = 1.0 for j in range(args.epochs): @@ -192,11 +222,12 @@ def main(args): torch.save(model, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) - if val_pk < best_val_pk: - test_pk = test(model, args, j, test_dl, logger, threshold) - logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) - best_val_pk = val_pk - torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) + print(f'Current best model from epoch {j} with p_k {val_pk} and threshold {threshold}') + # if val_pk < best_val_pk: + # test_pk = test(model, args, j, test_dl, logger, threshold) + # logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) + # best_val_pk = val_pk + # torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) else: test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), @@ -207,10 +238,12 @@ def main(args): parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') parser.add_argument('--pin_memory', help='Pin Memory?', action='store_true') + parser.add_argument('--subset', help='Use a sample of 1000 rows', action='store_true') + parser.add_argument('--benchmark', help='Use PyTorch profiler', action='store_true') parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) - parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) + parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=1) parser.add_argument('--model', help='Model to run - will import and run') parser.add_argument('--load_from', help='Location of a .t7 model file to load. 
Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') From 0ef4d15c317f244bdadd30cb6b96e84e3695e0ae Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 11:20:09 +0530 Subject: [PATCH 06/16] removed loop from forward method of base LSTM model --- choiloader.py | 4 +- models/from_presentation.py | 11 +--- models/max_sentence_embedding.py | 109 ++++++++++++++----------------- models/single_lstm.py | 11 +--- run.py | 5 +- 5 files changed, 58 insertions(+), 82 deletions(-) diff --git a/choiloader.py b/choiloader.py index ce6aad9..94dfb5f 100644 --- a/choiloader.py +++ b/choiloader.py @@ -19,7 +19,7 @@ def get_choi_files(path): def custom_pad(s, max_length): s_length = s.size()[0] - v = utils.maybe_cuda(s.unsqueeze(0).unsqueeze(0)) + v = s.unsqueeze(0).unsqueeze(0) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -78,7 +78,7 @@ def collate_fn(batch): packed_data,sentences_per_doc,sort_order = pack_tensor(batched_data) - data = (packed_data,sentences_per_doc,sort_order,len(batch)) + data = (packed_data,sentences_per_doc,sort_order) return (data,batched_targets,paths) diff --git a/models/from_presentation.py b/models/from_presentation.py index 39e138d..31acece 100644 --- a/models/from_presentation.py +++ b/models/from_presentation.py @@ -9,12 +9,6 @@ logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) -# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. -def zero_state(module, batch_size): - # * 2 is for the two directions - return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ - maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) - class SentenceEncodingRNN(nn.Module): def __init__(self, input_size=300, hidden=128, num_layers=2): super(SentenceEncodingRNN, self).__init__() @@ -30,8 +24,7 @@ def __init__(self, input_size=300, hidden=128, num_layers=2): def forward(self, x): batch_size = x.batch_sizes[0] - s = zero_state(self, batch_size) - _, (hidden, _) = self.lstm(x, s) # (4, batch_size, 128) + _, (hidden, _) = self.lstm(x) # (4, batch_size, 128) transposed = hidden.transpose(0, 1) # (batch_size, 4, 128) reshaped = transposed.contiguous().view(batch_size, -1) @@ -117,7 +110,7 @@ def forward(self, batch): docs_tensor = torch.cat(padded_docs, 1) packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) profiler.set() # 3 - sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) profiler.set() # 4 padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 8c57bc5..a8c4256 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -9,64 +9,36 @@ logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) -# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. 
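+# zero_state() is removed below: nn.LSTM already defaults to zero-filled (h_0, c_0)
+# when no initial hidden state is passed, so the explicit zero tensors are redundant.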
-def zero_state(module, batch_size): - # * 2 is for the two directions - return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ - maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) - -class SentenceEncodingRNN(nn.Module): - def __init__(self, input_size, hidden, num_layers): - super(SentenceEncodingRNN, self).__init__() - self.num_layers = num_layers - self.hidden = hidden - self.input_size = input_size - - self.lstm = nn.LSTM(input_size=self.input_size, - hidden_size=self.hidden, - num_layers=self.num_layers, - dropout=0, - bidirectional=True) - - def forward(self, x): - batch_size = x.batch_sizes[0] - s = zero_state(self, batch_size) - packed_output, _ = self.lstm(x, s) - padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) - - maxes = maybe_cuda(torch.zeros(batch_size, padded_output.size(2))) - for i in range(batch_size): - maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] - - return maxes class Model(nn.Module): - def __init__(self, sentence_encoder, hidden=128, num_layers=2): + def __init__(self, input_size, hidden=128, num_layers=2): super(Model, self).__init__() - self.sentence_encoder = sentence_encoder - - self.sentence_lstm = nn.LSTM(input_size=sentence_encoder.hidden * 2, - hidden_size=hidden, - num_layers=num_layers, - batch_first=True, - dropout=0, - bidirectional=True) - - # We have two labels - self.h2s = nn.Linear(hidden * 2, 2) - - self.num_layers = num_layers + self.input_size = input_size self.hidden = hidden + self.num_layers = num_layers + + + self.sentence_encoder = nn.LSTM( + input_size=self.input_size, + hidden_size=self.hidden, + num_layers=self.num_layers, + dropout=0, + bidirectional=True + ) + + self.sentence_lstm = nn.LSTM( + input_size=self.hidden * 2, + hidden_size=hidden, + num_layers=num_layers, + batch_first=True, + dropout=0, + bidirectional=True + ) + self.h2s = nn.Linear(hidden * 2, 2) self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): - s_length = s.size()[0] - v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) - padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) - shape = padded.size() - return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) def pad_document(self, d, max_document_length): d_length = d.size()[0] @@ -75,9 +47,32 @@ def pad_document(self, d, max_document_length): shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) + + def forward_sentence_encoding(self, x): + # num_sequences = x.batch_sizes[0] + packed_output, _ = self.sentence_encoder(x) + padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) + + # maxes = maybe_cuda(torch.zeros(num_sequences, padded_output.size(2))) + # for i in range(num_sequences): + # maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] + + # Create a mask based on lengths + mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) + mask = maybe_cuda(mask) + + # Mask padded values by setting them to a very negative value (so they don't affect the max computation) + padded_output = padded_output.masked_fill(~mask.unsqueeze(2), float('-inf')) + + # Apply max pooling over the first dimension (time dimension) for each batch + maxes, _ = torch.max(padded_output, dim=0) + + return maxes + def forward(self, data): - packed_tensor, sentences_per_doc, sort_order,batch_size = data - encoded_sentences = self.sentence_encoder(packed_tensor) + packed_tensor, 
sentences_per_doc, sort_order = data + packed_tensor = maybe_cuda(packed_tensor) + encoded_sentences = self.forward_sentence_encoding(packed_tensor) unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) @@ -96,7 +91,7 @@ def forward(self, data): padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) - sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) doc_outputs = [] @@ -107,10 +102,4 @@ def forward(self, data): sentence_outputs = torch.cat(unsorted_doc_outputs, 0) x = self.h2s(sentence_outputs) - return x - -def create(): - sentence_encoder = SentenceEncodingRNN(input_size=300, - hidden=256, - num_layers=2) - return Model(sentence_encoder, hidden=256, num_layers=2) \ No newline at end of file + return x \ No newline at end of file diff --git a/models/single_lstm.py b/models/single_lstm.py index 72920bf..63f94b2 100644 --- a/models/single_lstm.py +++ b/models/single_lstm.py @@ -9,12 +9,6 @@ logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) -# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. -def zero_state(module, batch_size): - # * 2 is for the two directions - return maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)), \ - maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)) - class SentenceEncodingRNN(nn.Module): def __init__(self, input_size, hidden, num_layers): super(SentenceEncodingRNN, self).__init__() @@ -30,8 +24,7 @@ def __init__(self, input_size, hidden, num_layers): def forward(self, x): batch_size = x.batch_sizes[0] - s = zero_state(self, batch_size) - _, (hidden, _) = self.lstm(x, s) # (4, batch_size, 128) + _, (hidden, _) = self.lstm(x) # (4, batch_size, 128) transposed = hidden.transpose(0, 1) # (batch_size, 4, 128) reshaped = transposed.contiguous().view(batch_size, -1) @@ -117,7 +110,7 @@ def forward(self, batch): docs_tensor = torch.cat(padded_docs, 1) packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) profiler.set() # 3 - sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) profiler.set() # 4 padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) diff --git a/run.py b/run.py index 3e783ed..8a3cb52 100644 --- a/run.py +++ b/run.py @@ -17,6 +17,7 @@ import accuracy import numpy as np from termcolor import colored +from models.max_sentence_embedding import Model # torch.multiprocessing.set_sharing_strategy('file_system') @@ -205,7 +206,7 @@ def main(args): test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) - model = import_model(args.model) if args.model else torch.load(open(args.load_from, 'rb')) + model = Model(input_size=300, hidden=256, num_layers=2) model = maybe_cuda(model) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) @@ -244,7 +245,7 @@ def main(args): parser.add_argument('--bs', help='Batch size', 
type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=1) - parser.add_argument('--model', help='Model to run - will import and run') + parser.add_argument('--model', help='Model to run - will import and run',default='max_sentence_embedding') parser.add_argument('--load_from', help='Location of a .t7 model file to load. Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') parser.add_argument('--checkpoint_dir', help='Checkpoint directory', default='checkpoints') From badbfb0d8829f388e30acc01c85ca2aa88c0f895 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 11:53:34 +0530 Subject: [PATCH 07/16] removed loop from forward method of segmentation LSTM model --- models/max_sentence_embedding.py | 60 +++++++++++++++----------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index a8c4256..a2e18e7 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler @@ -53,10 +53,6 @@ def forward_sentence_encoding(self, x): packed_output, _ = self.sentence_encoder(x) padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) - # maxes = maybe_cuda(torch.zeros(num_sequences, padded_output.size(2))) - # for i in range(num_sequences): - # maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] - # Create a mask based on lengths mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) mask = maybe_cuda(mask) @@ -69,37 +65,39 @@ def forward_sentence_encoding(self, x): return maxes + + def forward_helper(self, sentences_per_doc, unsorted_encodings): + + # Step 3: Efficiently split the unsorted_encodings into separate documents using tensor operations + sentences_per_doc = maybe_cuda(torch.LongTensor(sentences_per_doc)) + encoded_documents = torch.split(unsorted_encodings, sentences_per_doc.tolist()) + + # Step 4: Calculate maximum document size and pad documents in one go + padded_docs = pad_sequence(encoded_documents, batch_first=True) + + # Step 5: Pack the padded documents for LSTM processing + packed_docs = pack_padded_sequence(padded_docs, sentences_per_doc, batch_first=True, enforce_sorted=False) + + # Step 6: Pass through document-level LSTM + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) + + # Step 7: Unpack the LSTM output + padded_x, _ = pad_packed_sequence(sentence_lstm_output, batch_first=True) + + # Step 8: Select the final hidden states (excluding last prediction) without using a loop + doc_outputs = [padded_x[i, :doc_len-1, :] for i, doc_len in enumerate(sentences_per_doc.tolist())] + + # Step 9: Concatenate the outputs into one tensor + sentence_outputs = torch.cat(doc_outputs, dim=0) + + return sentence_outputs + def forward(self, data): packed_tensor, sentences_per_doc, sort_order = data packed_tensor = maybe_cuda(packed_tensor) encoded_sentences = self.forward_sentence_encoding(packed_tensor) unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) 
unsorted_encodings = encoded_sentences.index_select(0, unsort_order) - - index = 0 - encoded_documents = [] - for sentences_count in sentences_per_doc: - end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index: end_index, :]) - index = end_index - - doc_sizes = [doc.size()[0] for doc in encoded_documents] - max_doc_size = np.max(doc_sizes) - ordered_document_idx = np.argsort(doc_sizes)[::-1] - ordered_doc_sizes = sorted(doc_sizes)[::-1] - ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] - padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] - docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) - sentence_lstm_output, _ = self.sentence_lstm(packed_docs) - padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) - - doc_outputs = [] - for i, doc_len in enumerate(ordered_doc_sizes): - doc_outputs.append(padded_x[0:doc_len - 1, i, :]) # -1 to remove last prediction - - unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)] - sentence_outputs = torch.cat(unsorted_doc_outputs, 0) - + sentence_outputs = self.forward_helper(sentences_per_doc, unsorted_encodings) x = self.h2s(sentence_outputs) return x \ No newline at end of file From 05471290785d98f853c9132c6d06a437cca82992 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 12:05:43 +0530 Subject: [PATCH 08/16] minor bug fix --- models/max_sentence_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index a2e18e7..d31d506 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -69,7 +69,7 @@ def forward_sentence_encoding(self, x): def forward_helper(self, sentences_per_doc, unsorted_encodings): # Step 3: Efficiently split the unsorted_encodings into separate documents using tensor operations - sentences_per_doc = maybe_cuda(torch.LongTensor(sentences_per_doc)) + sentences_per_doc = torch.LongTensor(sentences_per_doc) encoded_documents = torch.split(unsorted_encodings, sentences_per_doc.tolist()) # Step 4: Calculate maximum document size and pad documents in one go From 9039a3aff08600e208691bd32bb80c0f8eb7c6a0 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 14:42:50 +0530 Subject: [PATCH 09/16] saving optim and model state --- run.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/run.py b/run.py index 8a3cb52..a5dabca 100644 --- a/run.py +++ b/run.py @@ -153,7 +153,7 @@ def test(model, args, epoch, dataset, logger, threshold): document_sentence_count = len(t) to_idx = int(current_idx + document_sentence_count) - output = (output_softmax.cpu().numpy()[current_idx:to_idx, 1] > threshold) + output = (output_softmax.detach().cpu().numpy()[current_idx:to_idx, 1] > threshold) h = np.append(output, [1]) tt = np.append(t, [1]) @@ -180,10 +180,6 @@ def main(args): utils.config.update(vars(args)) # Updated to use vars(args) logger.debug(f'Running with config {utils.config}') - - # log_dir = os.path.join('runs', args.expname, str(time.time())) - # configure(log_dir) - word2vec = None if args.test else gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) if not args.infer: @@ -220,15 +216,21 @@ def main(args): best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) - 
torch.save(model, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) + torch.save({ + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict() + }, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) print(f'Current best model from epoch {j} with p_k {val_pk} and threshold {threshold}') - # if val_pk < best_val_pk: - # test_pk = test(model, args, j, test_dl, logger, threshold) - # logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) - # best_val_pk = val_pk - # torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) + if val_pk < best_val_pk: + test_pk = test(model, args, j, test_dl, logger, threshold) + logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) + best_val_pk = val_pk + torch.save({ + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict() + }, open(checkpoint_path / f'best_model.pt', 'wb')) else: test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), @@ -244,7 +246,7 @@ def main(args): parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) - parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=1) + parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) parser.add_argument('--model', help='Model to run - will import and run',default='max_sentence_embedding') parser.add_argument('--load_from', help='Location of a .t7 model file to load. 
Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') From b322d285127e9d41cb6f928f891d022019a2ad40 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sun, 13 Oct 2024 13:06:47 +0530 Subject: [PATCH 10/16] added code to remove poorly formatted data --- wiki_loader.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/wiki_loader.py b/wiki_loader.py index 6483cd2..ed9736d 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -123,4 +123,14 @@ def __getitem__(self, index): high_granularity=self.high_granularity) def __len__(self): - return len(self.textfiles) \ No newline at end of file + return len(self.textfiles) + + +if __name__ == "__main__": + root = "/Users/jitesh/Downloads/text-segmentation/data/wiki 727/train" + for path in os.listdir(root): + if path.startswith('paths_'): + continue + all_sections = get_sections(os.path.join(root,path), high_granularity=False) + if len(all_sections) <= 1: + print(os.path.join(root,path)) From 154829e91baa09a7cb1d8057f611bf083538534e Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sun, 13 Oct 2024 13:10:41 +0530 Subject: [PATCH 11/16] removed .cuda() calls in Model calss --- models/max_sentence_embedding.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index d31d506..3231614 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -2,9 +2,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence -from utils import maybe_cuda, setup_logger, unsort -import numpy as np -from times_profiler import profiler +from utils import setup_logger, unsort logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) @@ -55,7 +53,7 @@ def forward_sentence_encoding(self, x): # Create a mask based on lengths mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) - mask = maybe_cuda(mask) + # mask = maybe_cuda(mask) # Mask padded values by setting them to a very negative value (so they don't affect the max computation) padded_output = padded_output.masked_fill(~mask.unsqueeze(2), float('-inf')) @@ -94,9 +92,10 @@ def forward_helper(self, sentences_per_doc, unsorted_encodings): def forward(self, data): packed_tensor, sentences_per_doc, sort_order = data - packed_tensor = maybe_cuda(packed_tensor) + # packed_tensor = maybe_cuda(packed_tensor) encoded_sentences = self.forward_sentence_encoding(packed_tensor) - unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) + unsort_order = torch.LongTensor(unsort(sort_order)) + # unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) sentence_outputs = self.forward_helper(sentences_per_doc, unsorted_encodings) x = self.h2s(sentence_outputs) From f2fbf76e5bf4a35907a312e5368fce14402961b6 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sun, 13 Oct 2024 23:56:58 +0530 Subject: [PATCH 12/16] support for checkpointing training progress --- .gitignore | 1 + models/max_sentence_embedding.py | 11 +++--- run.py | 60 +++++++++++++++++++++++++++----- 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index d8a238c..89aec3e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ data/ runs/ checkpoints/ +*.pt # Byte-compiled / optimized / DLL 
files __pycache__/ diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 3231614..d31d506 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -2,7 +2,9 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence -from utils import setup_logger, unsort +from utils import maybe_cuda, setup_logger, unsort +import numpy as np +from times_profiler import profiler logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) @@ -53,7 +55,7 @@ def forward_sentence_encoding(self, x): # Create a mask based on lengths mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) - # mask = maybe_cuda(mask) + mask = maybe_cuda(mask) # Mask padded values by setting them to a very negative value (so they don't affect the max computation) padded_output = padded_output.masked_fill(~mask.unsqueeze(2), float('-inf')) @@ -92,10 +94,9 @@ def forward_helper(self, sentences_per_doc, unsorted_encodings): def forward(self, data): packed_tensor, sentences_per_doc, sort_order = data - # packed_tensor = maybe_cuda(packed_tensor) + packed_tensor = maybe_cuda(packed_tensor) encoded_sentences = self.forward_sentence_encoding(packed_tensor) - unsort_order = torch.LongTensor(unsort(sort_order)) - # unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) sentence_outputs = self.forward_helper(sentences_per_doc, unsorted_encodings) x = self.h2s(sentence_outputs) diff --git a/run.py b/run.py index a5dabca..01f30eb 100644 --- a/run.py +++ b/run.py @@ -21,6 +21,8 @@ # torch.multiprocessing.set_sharing_strategy('file_system') +torch.manual_seed(42) + preds_stats = utils.predictions_analysis() def softmax(x): @@ -89,11 +91,17 @@ def train(model, args, epoch, dataset, logger, optimizer): for i, (data, target, paths) in enumerate(dataset): if i == args.stop_after: break - pbar.update() model.zero_grad() - # data_size = compute_batch_size(data) - output = model(data) + + try: + output = model(data) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + print(f"Paths: {paths}") + continue + target_var = maybe_cuda(torch.cat(target, 0), args.cuda) loss = model.criterion(output, target_var) loss.backward() @@ -115,8 +123,16 @@ def validate(model, args, epoch, dataset, logger): if i == args.stop_after: break pbar.update() - output = model(data) - output_softmax = F.softmax(output, dim=1) + + try: + output = model(data) + output_softmax = F.softmax(output, dim=1) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + print(f"Paths: {paths}") + continue + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) output_seg = output.argmax(dim=1).cpu().numpy() @@ -141,8 +157,16 @@ def test(model, args, epoch, dataset, logger, threshold): if i == args.stop_after: break pbar.update() - output = model(data) - output_softmax = F.softmax(output, dim=1) + + try: + output = model(data) + output_softmax = F.softmax(output, dim=1) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + print(f"Paths: {paths}") + continue + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) output_seg = 
output.argmax(dim=1).cpu().numpy() target_seg = targets_var.cpu().numpy() @@ -168,6 +192,21 @@ def test(model, args, epoch, dataset, logger, threshold): return epoch_pk + +def load_model_and_optimizer(checkpoint_path, is_cuda, model, optimizer): + + map_location = torch.device('cuda') if is_cuda else torch.device('cpu') + + checkpoint = torch.load(checkpoint_path, map_location=map_location) + + model.load_state_dict(checkpoint['model_state_dict']) + + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + print(f"Loaded model and optimizer state from {checkpoint_path}") + return model, optimizer + + def main(args): sys.path.append(str(Path(__file__).parent)) @@ -207,6 +246,9 @@ def main(args): optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + if args.load_from: + model, optimizer = load_model_and_optimizer(args.load_from, args.cuda, model, optimizer) + if args.benchmark: for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) @@ -225,7 +267,7 @@ def main(args): print(f'Current best model from epoch {j} with p_k {val_pk} and threshold {threshold}') if val_pk < best_val_pk: test_pk = test(model, args, j, test_dl, logger, threshold) - logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) + print(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}') best_val_pk = val_pk torch.save({ 'model_state_dict': model.state_dict(), @@ -248,7 +290,7 @@ def main(args): parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) parser.add_argument('--model', help='Model to run - will import and run',default='max_sentence_embedding') - parser.add_argument('--load_from', help='Location of a .t7 model file to load. Training will continue') + parser.add_argument('--load_from', help='Location of a .pt model file to load. 
Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') parser.add_argument('--checkpoint_dir', help='Checkpoint directory', default='checkpoints') parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) From ded3aa26a83b8c02c90dcd177a13b4cde2749d48 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 19 Oct 2024 17:34:33 +0530 Subject: [PATCH 13/16] exception handling during testing --- .gitignore | 2 ++ run.py | 2 +- test_accuracy.py | 20 ++++++++++++++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 89aec3e..bcd0d7e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ data/ runs/ checkpoints/ *.pt +inference/ +outputs/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/run.py b/run.py index 01f30eb..2c326dc 100644 --- a/run.py +++ b/run.py @@ -197,7 +197,7 @@ def load_model_and_optimizer(checkpoint_path, is_cuda, model, optimizer): map_location = torch.device('cuda') if is_cuda else torch.device('cpu') - checkpoint = torch.load(checkpoint_path, map_location=map_location) + checkpoint = torch.load(checkpoint_path, map_location=map_location, weights_only=True) model.load_state_dict(checkpoint['model_state_dict']) diff --git a/test_accuracy.py b/test_accuracy.py index 7f9b861..058b737 100644 --- a/test_accuracy.py +++ b/test_accuracy.py @@ -1,6 +1,5 @@ import torch from torch.utils.data import DataLoader -from torch.autograd import Variable import numpy as np from choiloader import ChoiDataset, collate_fn from tqdm import tqdm @@ -15,6 +14,7 @@ import accuracy from models import naive from timeit import default_timer as timer +from models.max_sentence_embedding import Model logger = utils.setup_logger(__name__, 'test_accuracy.log') @@ -71,9 +71,10 @@ def main(args): print('Running on Choi') # Load the model - with open(args.model, 'rb') as f: - model = torch.load(f) - + model = Model(input_size=300, hidden=256, num_layers=2) + map_location = torch.device('cuda') if args.cuda else torch.device('cpu') + checkpoint = torch.load(args.model, map_location=map_location, weights_only=True) + model.load_state_dict(checkpoint['model_state_dict']) model = maybe_cuda(model) model.eval() @@ -107,10 +108,17 @@ def main(args): break pbar.update() - output = model(data) + + try: + output = model(data) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + continue + targets_var = maybe_cuda(torch.cat(targets, 0), args.cuda) batch_loss = 0 - output_prob = softmax(output.cpu().numpy()) + output_prob = softmax(output.detach().cpu().numpy()) output_seg = output_prob[:, 1] > args.seg_threshold target_seg = targets_var.cpu().numpy() batch_accurate = (output_seg == target_seg).sum() From af475ba3168527b62facd6a0581c763a4b612014 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 19 Oct 2024 18:01:25 +0530 Subject: [PATCH 14/16] added requirements.txt --- requirements.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e1914bc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +gensim==4.3.3 +nltk==3.9.1 +numpy==1.26.4 +pandas==2.2.3 +pathlib2==2.3.7.post1 +segeval==2.0.11 +tensorboard-logger==0.1.0 +termcolor==2.5.0 +torch==2.4.1 +tqdm==4.66.5 +protobuf==3.20.1 \ No newline at end of file From 110b4e5e8bb3f09d0cedee2ee15d46f01bfc2dae Mon Sep 17 00:00:00 2001 From: 
jiteshm17 Date: Mon, 21 Oct 2024 11:46:39 +0530 Subject: [PATCH 15/16] support for multi GPU training --- run.py | 38 ++++++++++++++++++++++++++++++-------- test_accuracy.py | 5 ++++- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/run.py b/run.py index 2c326dc..41fe32c 100644 --- a/run.py +++ b/run.py @@ -1,7 +1,7 @@ import torch from torch.utils.data import DataLoader, Subset import torch.nn.functional as F - +from torch.nn import DataParallel from choiloader import ChoiDataset, collate_fn from tqdm import tqdm from argparse import ArgumentParser @@ -198,8 +198,11 @@ def load_model_and_optimizer(checkpoint_path, is_cuda, model, optimizer): map_location = torch.device('cuda') if is_cuda else torch.device('cpu') checkpoint = torch.load(checkpoint_path, map_location=map_location, weights_only=True) - - model.load_state_dict(checkpoint['model_state_dict']) + + if isinstance(model, torch.nn.DataParallel): + model.module.load_state_dict(checkpoint['model_state_dict']) + else: + model.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) @@ -234,16 +237,27 @@ def main(args): dev_dataset = Subset(dev_dataset,range(1000)) test_dataset = Subset(test_dataset,range(1000)) - train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, + train_batch_size,test_batch_size = args.bs, args.test_bs + + if torch.cuda.device_count() > 1: + num_gpus = torch.cuda.device_count() + print(f"Using {num_gpus} GPUs") + train_batch_size = args.bs * num_gpus + test_batch_size = args.test_bs * num_gpus + + train_dl = DataLoader(train_dataset, batch_size=train_batch_size, collate_fn=collate_fn, shuffle=True, num_workers=args.num_workers,pin_memory=args.pin_memory) - dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, + dev_dl = DataLoader(dev_dataset, batch_size=test_batch_size, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) - test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, + test_dl = DataLoader(test_dataset, batch_size=test_batch_size, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) model = Model(input_size=300, hidden=256, num_layers=2) model = maybe_cuda(model) + if torch.cuda.device_count() > 1 and not args.infer: + model = DataParallel(model) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) if args.load_from: @@ -258,8 +272,12 @@ def main(args): best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) + if isinstance(model, torch.nn.DataParallel): + model_state_dict = model.module.state_dict() + else: + model_state_dict = model.state_dict() torch.save({ - 'model_state_dict': model.state_dict(), + 'model_state_dict': model_state_dict, 'optimizer_state_dict': optimizer.state_dict() }, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) @@ -269,8 +287,12 @@ def main(args): test_pk = test(model, args, j, test_dl, logger, threshold) print(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}') best_val_pk = val_pk + if isinstance(model, torch.nn.DataParallel): + model_state_dict = model.module.state_dict() + else: + model_state_dict = model.state_dict() torch.save({ - 'model_state_dict': model.state_dict(), + 'model_state_dict': model_state_dict, 'optimizer_state_dict': optimizer.state_dict() }, open(checkpoint_path / f'best_model.pt', 'wb')) diff --git 
a/test_accuracy.py b/test_accuracy.py index 058b737..543b758 100644 --- a/test_accuracy.py +++ b/test_accuracy.py @@ -74,7 +74,10 @@ def main(args): model = Model(input_size=300, hidden=256, num_layers=2) map_location = torch.device('cuda') if args.cuda else torch.device('cpu') checkpoint = torch.load(args.model, map_location=map_location, weights_only=True) - model.load_state_dict(checkpoint['model_state_dict']) + if isinstance(model, torch.nn.DataParallel): + model.module.load_state_dict(checkpoint['model_state_dict']) + else: + model.load_state_dict(checkpoint['model_state_dict']) model = maybe_cuda(model) model.eval() From d4921b358c41b440b023b2af58a20771a7cea464 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Tue, 22 Oct 2024 07:46:26 +0530 Subject: [PATCH 16/16] flag for multi gpu training --- run.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/run.py b/run.py index 41fe32c..c7e3d76 100644 --- a/run.py +++ b/run.py @@ -99,7 +99,7 @@ def train(model, args, epoch, dataset, logger, optimizer): except Exception as e: print(f"Error while passing batch {i+1} to the model") print(f"Exception: {e}") - print(f"Paths: {paths}") + # print(f"Paths: {paths}") continue target_var = maybe_cuda(torch.cat(target, 0), args.cuda) @@ -130,7 +130,7 @@ def validate(model, args, epoch, dataset, logger): except Exception as e: print(f"Error while passing batch {i+1} to the model") print(f"Exception: {e}") - print(f"Paths: {paths}") + # print(f"Paths: {paths}") continue targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) @@ -164,7 +164,7 @@ def test(model, args, epoch, dataset, logger, threshold): except Exception as e: print(f"Error while passing batch {i+1} to the model") print(f"Exception: {e}") - print(f"Paths: {paths}") + # print(f"Paths: {paths}") continue targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) @@ -239,7 +239,7 @@ def main(args): train_batch_size,test_batch_size = args.bs, args.test_bs - if torch.cuda.device_count() > 1: + if args.multi_gpu and torch.cuda.device_count() > 1: num_gpus = torch.cuda.device_count() print(f"Using {num_gpus} GPUs") train_batch_size = args.bs * num_gpus @@ -255,7 +255,7 @@ def main(args): model = Model(input_size=300, hidden=256, num_layers=2) model = maybe_cuda(model) - if torch.cuda.device_count() > 1 and not args.infer: + if args.multi_gpu and torch.cuda.device_count() > 1 and not args.infer: model = DataParallel(model) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) @@ -304,6 +304,7 @@ def main(args): if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') + parser.add_argument('--multi_gpu', help='Use multiple GPUs', action='store_true') parser.add_argument('--pin_memory', help='Pin Memory?', action='store_true') parser.add_argument('--subset', help='Use a sample of 1000 rows', action='store_true') parser.add_argument('--benchmark', help='Use PyTorch profiler', action='store_true')
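For reference, a minimal sketch of how a checkpoint written by the updated run.py could be restored outside of training, assuming the Model class from models/max_sentence_embedding and the {'model_state_dict', 'optimizer_state_dict'} layout introduced in PATCH 09/16; the helper name load_for_inference is illustrative and not part of the series:

import torch
from models.max_sentence_embedding import Model

def load_for_inference(checkpoint_path, use_cuda=False):
    # Map the checkpoint onto CPU or GPU, mirroring load_model_and_optimizer in run.py.
    device = torch.device('cuda') if use_cuda and torch.cuda.is_available() else torch.device('cpu')
    model = Model(input_size=300, hidden=256, num_layers=2)
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
    state_dict = checkpoint['model_state_dict']
    # The series saves model.module.state_dict() when DataParallel is used, so keys
    # should already be unprefixed; stripping 'module.' here is only a safeguard.
    state_dict = {k[len('module.'):] if k.startswith('module.') else k: v
                  for k, v in state_dict.items()}
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model

In the training script itself, the same checkpoint file can be passed via --load_from to resume training, with --multi_gpu enabling DataParallel when more than one GPU is visible.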