From 15c668720f8278eb3479abc029ea6a47b68d85a3 Mon Sep 17 00:00:00 2001 From: Jitesh Malipeddi Date: Mon, 7 Oct 2024 17:54:13 +0530 Subject: [PATCH 01/16] updated files to work with python3 --- .gitignore | 5 + accuracy.py | 47 ++-- annotate_wiki_file.py | 43 ++-- calc_statistics.py | 110 ++++----- check_annotated_wiki_file.py | 142 ++++-------- chen_cities_converter.py | 75 +++--- chen_elements_convertor.py | 75 +++--- choi_convertor.py | 65 +++--- choiloader.py | 62 ++--- clean_wiki_dataset.py | 41 ++-- configgenerator.py | 8 +- convert_seperator.py | 34 +-- evaluate.py | 12 +- gpu2cpu.py | 21 +- graphseg_gen.sh | 17 +- graphseg_timer.py | 49 ++-- models/from_presentation.py | 30 +-- models/max_sentence_embedding.py | 34 +-- models/naive.py | 27 +-- models/single_lstm.py | 40 ++-- run.py | 220 +++++++----------- run_web_server.py | 27 ++- seg_comparsion.py | 130 +++++------ test_accuracy.py | 102 ++++---- test_accuracy_choi.py | 129 +++++------ tests.py | 168 ++++++-------- text_manipulation.py | 97 ++++---- times_profiler.py | 33 +-- utils.py | 136 +++-------- wiki_extractor.py | 8 +- wiki_loader.py | 33 +-- wiki_processor.py | 384 ++++++++++++------------------- wiki_utils.py | 33 ++- 33 files changed, 1016 insertions(+), 1421 deletions(-) diff --git a/.gitignore b/.gitignore index 7bbc71c..de6246e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +.DS_Store +config.json +data/ +runs/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/accuracy.py b/accuracy.py index 39758f6..c8581fe 100644 --- a/accuracy.py +++ b/accuracy.py @@ -1,39 +1,37 @@ import segeval as seg import numpy as np - def softmax(x): max_each_row = np.max(x, axis=1, keepdims=True) exps = np.exp(x - max_each_row) sums = np.sum(exps, axis=1, keepdims=True) return exps / sums - class Accuracy: def __init__(self, threshold=0.3): self.pk_to_weight = [] self.windiff_to_weight = [] self.threshold = threshold - def update(self, h, gold, sentences_length = None): + def update(self, h, gold, sentences_length=None): h_boundaries = self.get_seg_boundaries(h, sentences_length) gold_boundaries = self.get_seg_boundaries(gold, sentences_length) pk, count_pk = self.pk(h_boundaries, gold_boundaries) - windiff, count_wd = -1, 400;# self.win_diff(h_boundaries, gold_boundaries) + windiff, count_wd = -1, 400 # Placeholder for windiff calculation if pk != -1: self.pk_to_weight.append((pk, count_pk)) else: - print ('pk error') + print('pk error') if windiff != -1: self.windiff_to_weight.append((windiff, count_wd)) - def get_seg_boundaries(self, classifications, sentences_length = None): + def get_seg_boundaries(self, classifications, sentences_length=None): """ - :param list of tuples, each tuple is a sentence and its class (1 if it the sentence starts a segment, 0 otherwise). - e.g: [(this is, 0), (a segment, 1) , (and another one, 1) - :return: boundaries of segmentation to use for pk method. For given example the function will return (4, 3) + :param classifications: list of tuples, each tuple is a sentence and its class (1 if the sentence starts a segment, 0 otherwise). + :param sentences_length: list of sentence lengths (optional) + :return: boundaries of segmentation for pk method. 
""" curr_seg_length = 0 boundaries = [] @@ -41,7 +39,7 @@ def get_seg_boundaries(self, classifications, sentences_length = None): is_split_point = bool(classifications[i]) add_to_current_segment = 1 if sentences_length is None else sentences_length[i] curr_seg_length += add_to_current_segment - if (is_split_point): + if is_split_point: boundaries.append(curr_seg_length) curr_seg_length = 0 @@ -49,10 +47,10 @@ def get_seg_boundaries(self, classifications, sentences_length = None): def pk(self, h, gold, window_size=-1): """ - :param gold: gold segmentation (item in the list contains the number of words in segment) - :param h: hypothesis segmentation (each item in the list contains the number of words in segment) - :param window_size: optional - :return: accuracy + :param h: hypothesis segmentation + :param gold: gold segmentation + :param window_size: optional window size + :return: pk accuracy """ if window_size != -1: false_seg_count, total_count = seg.pk(h, gold, window_size=window_size, return_parts=True) @@ -60,19 +58,18 @@ def pk(self, h, gold, window_size=-1): false_seg_count, total_count = seg.pk(h, gold, return_parts=True) if total_count == 0: - # TODO: Check when happens false_prob = -1 else: - false_prob = float(false_seg_count) / float(total_count) + false_prob = float(false_seg_count) / total_count return false_prob, total_count def win_diff(self, h, gold, window_size=-1): """ - :param gold: gold segmentation (item in the list contains the number of words in segment) - :param h: hypothesis segmentation (each item in the list contains the number of words in segment) - :param window_size: optional - :return: accuracy + :param h: hypothesis segmentation + :param gold: gold segmentation + :param window_size: optional window size + :return: win_diff accuracy """ if window_size != -1: false_seg_count, total_count = seg.window_diff(h, gold, window_size=window_size, return_parts=True) @@ -82,14 +79,12 @@ def win_diff(self, h, gold, window_size=-1): if total_count == 0: false_prob = -1 else: - false_prob = float(false_seg_count) / float(total_count) + false_prob = float(false_seg_count) / total_count return false_prob, total_count def calc_accuracy(self): - pk = sum([pw[0] * pw[1] for pw in self.pk_to_weight]) / sum([pw[1] for pw in self.pk_to_weight]) if len( - self.pk_to_weight) > 0 else -1.0 - windiff = sum([pw[0] * pw[1] for pw in self.windiff_to_weight]) / sum( - [pw[1] for pw in self.windiff_to_weight]) if len(self.windiff_to_weight) > 0 else -1.0 + pk = sum(pw[0] * pw[1] for pw in self.pk_to_weight) / sum(pw[1] for pw in self.pk_to_weight) if self.pk_to_weight else -1.0 + windiff = sum(pw[0] * pw[1] for pw in self.windiff_to_weight) / sum(pw[1] for pw in self.windiff_to_weight) if self.windiff_to_weight else -1.0 - return pk, windiff + return pk, windiff \ No newline at end of file diff --git a/annotate_wiki_file.py b/annotate_wiki_file.py index e67ee32..9994021 100644 --- a/annotate_wiki_file.py +++ b/annotate_wiki_file.py @@ -1,56 +1,47 @@ from argparse import ArgumentParser from wiki_loader import read_wiki_file import pandas as pd -from pathlib2 import Path +from pathlib import Path # Use pathlib, not pathlib2 import os - def get_files(path): - all_objects = Path(path).glob('**/*') + all_objects = Path(path).rglob('*') # Use rglob for '**/*' pattern files = (str(p) for p in all_objects if p.is_file()) return files def generate_segmentation_template(path, output_path): - writer = pd.ExcelWriter(output_path, engine='xlsxwriter') - sentences, _, _ = read_wiki_file(path, None, 
remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False) - df = pd.DataFrame({ 'Cut here': [0] * len(sentences),'Sentences': sentences}) - df = df[['Cut here','Sentences']] - - df.to_excel(writer, sheet_name='segment') - writer.save() - + with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: # Use context manager for ExcelWriter + sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False) + df = pd.DataFrame({'Cut here': [0] * len(sentences), 'Sentences': sentences}) + df = df[['Cut here', 'Sentences']] + df.to_excel(writer, sheet_name='segment') def generate_test_article(path, output_path): - sentences, _, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False, + sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False, high_granularity=False) article_text = "\n".join(sentences) - with open(output_path, "w") as f: - f.write(article_text.encode('utf-8')) - f.close() + with open(output_path, "w", encoding='utf-8') as f: # Use context manager and specify encoding + f.write(article_text) -def generate_folder(input_folder,output_folder): +def generate_folder(input_folder, output_folder, to_text): counter = 0 input_files = get_files(input_folder) for file in input_files: id = os.path.basename(file) - file_name = id + ".xlsx" if not args.toText else id + file_name = f"{id}.xlsx" if not to_text else id output_file = os.path.join(output_folder, file_name) - if (args.toText): + if to_text: generate_test_article(file, output_file) else: - generate_segmentation_template(file,output_file) + generate_segmentation_template(file, output_file) counter += 1 - print 'generates ' + str(counter) + ' files' - - + print(f'Generated {counter} files') if __name__ == '__main__': - parser = ArgumentParser() parser.add_argument('--path', help='input folder path', default='/home/michael/Downloads/migo/68943', type=str) parser.add_argument('--output_path', help='output folder path', default='blah.xlsx', type=str) - parser.add_argument('--toText', help='output to text files ?', action='store_true') + parser.add_argument('--toText', help='output to text files?', action='store_true') args = parser.parse_args() - generate_folder(args.path,args.output_path) - + generate_folder(args.path, args.output_path, args.toText) \ No newline at end of file diff --git a/calc_statistics.py b/calc_statistics.py index 031e314..af9ca24 100644 --- a/calc_statistics.py +++ b/calc_statistics.py @@ -1,8 +1,5 @@ -from __future__ import division - import torch from torch.utils.data import DataLoader -from torch.autograd import Variable import numpy as np from choiloader import ChoiDataset, collate_fn @@ -11,19 +8,17 @@ from utils import maybe_cuda import utils import sys -from pathlib2 import Path +from pathlib import Path # Use pathlib instead of pathlib2 from wiki_loader import WikipediaDataSet import accuracy logger = utils.setup_logger(__name__, 'train.log') - - def main(args): sys.path.append(str(Path(__file__).parent)) utils.read_config_file(args.config) - utils.config.update(args.__dict__) + utils.config.update(vars(args)) # Update config with args dictionary logger.debug('Running with config %s', utils.config) article_with_problems = 0 @@ -39,94 +34,85 @@ def main(args): min_num_sentences = 1000 max_num_sentences = 0 - dl = 
DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) docs_num_segments_vec = np.zeros(len(dl)) segments_num_sentences_vec = [] - print 'num of docs is ' + str(len(dl)) + print(f'Number of documents: {len(dl)}') with tqdm(desc='Testing', total=len(dl)) as pbar: - for i, (data, targets, paths) in enumerate(dl): - if (len(paths) == 0): + if len(paths) == 0: article_with_problems += 1 docs_num_segments_vec[i] = np.nan continue try: - - if ( ((i % 1000 ) == 0) & i > 0): - print i + if i % 1000 == 0 and i > 0: + print(i) if len(targets) > 0: - targets_var = Variable(maybe_cuda(torch.cat(targets, 0), None), requires_grad=False) - target_seg = targets_var.data.cpu().numpy() + targets_var = maybe_cuda(torch.cat(targets, 0), None) + target_seg = targets_var.cpu().numpy() target_seg = np.concatenate([target_seg, np.array([1])]) else: target_seg = np.ones(1) - num_sentences += (len(target_seg)) - doc_num_of_segment = (sum(target_seg)) - if (doc_num_of_segment < min_num_segment): - min_num_segment = doc_num_of_segment - if (doc_num_of_segment > max_num_segment): - max_num_segment = doc_num_of_segment + + num_sentences += len(target_seg) + doc_num_of_segment = sum(target_seg) + + min_num_segment = min(min_num_segment, doc_num_of_segment) + max_num_segment = max(max_num_segment, doc_num_of_segment) + num_segments += doc_num_of_segment num_documents += 1 docs_num_segments_vec[i] = doc_num_of_segment one_inds = np.where(target_seg == 1)[0] one_inds += 1 - one_inds = np.concatenate((np.zeros(1),one_inds)) - if (len(one_inds) == 1): + one_inds = np.concatenate(([0], one_inds)) + + if len(one_inds) == 1: sentences_in_segments = [len(target_seg)] else: sentences_in_segments = one_inds[1:] - one_inds[:-1] - segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec,sentences_in_segments)) + + segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec, sentences_in_segments)) current_min = np.min(sentences_in_segments) current_max = np.max(sentences_in_segments) - if (current_min < min_num_sentences): - min_num_sentences = current_min - if (current_max > max_num_sentences): - max_num_sentences = current_max - - + + min_num_sentences = min(min_num_sentences, current_min) + max_num_sentences = max(max_num_sentences, current_max) except Exception as e: - logger.info('Exception "%s" in batch %s', e, i) + logger.info(f'Exception "{e}" in batch {i}') logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) raise + print(f'Total sentences: {num_sentences}.') + print(f'Total segments: {num_segments}.') + print(f'Total documents: {num_documents}.') + print(f'Average segment size: {num_sentences / num_segments:.3f}.') + print(f'Min #segments in a document: {min_num_segment}.') + print(f'Max #segments in a document: {max_num_segment}.') + print(f'Min #sentences in a segment: {min_num_sentences}.') + print(f'Max #sentences in a segment: {max_num_sentences}.') + + print('\nNew computing method\n') + print(f'Number of documents: {len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum()}.') + print(f'Total segments: {np.nansum(docs_num_segments_vec)}.') + print(f'Total sentences: {np.sum(segments_num_sentences_vec)}.') + + print(f'Min #segments in a document: {np.nanmin(docs_num_segments_vec)}.') + print(f'Max #segments in a document: {np.nanmax(docs_num_segments_vec)}.') + print(f'Mean segments in a document: {np.nanmean(docs_num_segments_vec):.3f}.') + print(f'Standard deviation of segments in a document: 
{np.nanstd(docs_num_segments_vec):.3f}.') + print(f'\nMin #sentences in a segment: {np.min(segments_num_sentences_vec)}.') + print(f'Max #sentences in a segment: {np.max(segments_num_sentences_vec)}.') + print(f'Average segment size: {np.mean(segments_num_sentences_vec):.3f}.') + print(f'Standard deviation of segment size: {np.std(segments_num_sentences_vec):.3f}.') - print 'total sentences: {}.'.format(num_sentences) - print 'total segments: {}.'.format(num_segments) - print 'total documents: {}.'.format(num_documents) - print 'average segment size is: {:.3}.'.format(np.true_divide(num_sentences,num_segments)) - print 'min #segment in document: {}.'.format(min_num_segment) - print 'max #segment in document: {}.'.format(max_num_segment) - print 'min #sentence in segment: {}.'.format(min_num_sentences) - print 'max #sentence in segment: {}.'.format(max_num_sentences) - - - print '' - print 'new computing method' - print '' - print 'num of documents: {}.'.format(len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum()) - print 'total segments: {}.'.format(np.nansum(docs_num_segments_vec)) - print 'total sentences: {}.'.format(np.sum(segments_num_sentences_vec)) - print '' - print 'min #segment in document: {}.'.format(np.nanmin(docs_num_segments_vec)) - print 'max #segment in document: {}.'.format(np.nanmax(docs_num_segments_vec)) - print 'mean segments in document: {:.3}.'.format(np.nanmean(docs_num_segments_vec)) - print 'std segments in document: {:.3}.'.format(np.nanstd(docs_num_segments_vec)) - print '' - print 'min #sentence in segment: {}.'.format(np.min(segments_num_sentences_vec)) - print 'max #sentence in segment: {}.'.format(np.max(segments_num_sentences_vec)) - print 'average segment size is: {:.3}.'.format(np.mean(segments_num_sentences_vec)) - print 'std segment size is: {:.3}.'.format(np.std(segments_num_sentences_vec)) - - print '' - print 'article with problems {}'.format(article_with_problems) + print(f'\nArticles with problems: {article_with_problems}') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--config', help='Path to config.json', default='config.json') - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/check_annotated_wiki_file.py b/check_annotated_wiki_file.py index 7d1b00d..a67f168 100644 --- a/check_annotated_wiki_file.py +++ b/check_annotated_wiki_file.py @@ -6,165 +6,123 @@ import os from glob import glob - -graphseg_delimeter = "==========" - +graphseg_delimiter = "==========" def generate_segmentation_template(path, output_path): - writer = pd.ExcelWriter(output_path, engine='xlsxwriter') - sentences, _, _ = read_wiki_file(path, None, False) - - sentences = [' '.join(s) + '.' for s in sentences] - df = pd.DataFrame({'Sentences': sentences, 'Cut here': [0] * len(sentences)}) - df = df[['Sentences', 'Cut here']] - - df.to_excel(writer, sheet_name='segment') - writer.save() - + with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: # Use context manager + sentences, _, _ = read_wiki_file(path, None, False) + sentences = [' '.join(s) + '.' 
for s in sentences] + df = pd.DataFrame({'Sentences': sentences, 'Cut here': [0] * len(sentences)}) + df = df[['Sentences', 'Cut here']] + df.to_excel(writer, sheet_name='segment') def target_place_to_list(targets): - list_of_targets = [] - for i in range(targets[-1] + 1): - if i in targets: - list_of_targets.append(1) - else: - list_of_targets.append(0) - - list_of_targets[-1] = 1 + list_of_targets = [1 if i in targets else 0 for i in range(targets[-1] + 1)] + list_of_targets[-1] = 1 # Ensure the last sentence is marked as the end return list_of_targets - def get_graphseg_segments(file_path): - file = open(str(file_path), "r") - raw_content = file.read() - file.close() - sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] - sentences_length = [] + with open(str(file_path), "r", encoding='utf-8') as file: + raw_content = file.read() + + sentences = [s for s in raw_content.strip().split("\n") if s and s != "\n"] h = [] - t = [] for sentence in sentences: - if sentence == graphseg_delimeter: - if len(h) > 0: + if sentence == graphseg_delimiter: + if h: h[-1] = 1 else: h.append(0) - #words = extract_sentence_words(sentence) - #sentences_length.append(len(words)) - #t.append(0) - #h.append(0) - - #t[-1] = 1 # end of last segment - h[-1] = 1 # they already segment it correctly. + h[-1] = 1 # Correct segmentation for the last sentence return h - def get_xlsx_segments(xlsx_path): df = pd.read_excel(xlsx_path) outputs = df['Cut here'].values - outputs[-1] = 1 + outputs[-1] = 1 # Ensure the last sentence is marked as the end return outputs - def get_gold_segments(path): - sentences, targets, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False,high_granularity=False) - + sentences, targets, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False, high_granularity=False) return target_place_to_list(targets) - def get_sub_folders_for_graphseg(folder): - d = folder - folders = [os.path.join(d, o) for o in os.listdir(d) if os.path.isdir(os.path.join(d, o))] - print folders - return folders - - -def analyszie_folder(wiki_folder,xlsx_folder,isGraphseg, use_xlsx_sub_folders = False): + sub_folders = [os.path.join(folder, o) for o in os.listdir(folder) if os.path.isdir(os.path.join(folder, o))] + print(sub_folders) + return sub_folders +def analyze_folder(wiki_folder, xlsx_folder, is_graphseg, use_xlsx_sub_folders=False): acc = accuracy.Accuracy() input_files = get_files(wiki_folder) if use_xlsx_sub_folders: - annotated_files_folders= [] - for f in os.listdir(xlsx_folder): - sub_folder_path = xlsx_folder + f - if os.path.isdir(sub_folder_path): - annotated_files_folders.append(sub_folder_path) + annotated_files_folders = [os.path.join(xlsx_folder, f) for f in os.listdir(xlsx_folder) if os.path.isdir(os.path.join(xlsx_folder, f))] else: annotated_files_folders = [xlsx_folder] - - - for file in input_files: id = os.path.basename(file) - file_name = id + ".xlsx" if not isGraphseg else id - xlsx_file_paths = [os.path.join(xlsx_folder,file_name) for xlsx_folder in annotated_files_folders] - print str(xlsx_file_paths) - print str(file) + file_name = f"{id}.xlsx" if not is_graphseg else id + xlsx_file_paths = [os.path.join(folder, file_name) for folder in annotated_files_folders] + print(xlsx_file_paths) + print(file) for xlsx_file_path in xlsx_file_paths: if os.path.isfile(xlsx_file_path): - if (isGraphseg): 
+ if is_graphseg: tested_segments = get_graphseg_segments(xlsx_file_path) else: - tested_segments = get_xlsx_segments(xlsx_file_path ) + tested_segments = get_xlsx_segments(xlsx_file_path) else: tested_segments = None gold_segments = get_gold_segments(file) - if (tested_segments is not None) and (len(tested_segments) != len(gold_segments)): - print "(len(tested_segments) != len(gold_segments))" - print "stop run" - return 1000,1000 - if tested_segments is not None : - acc.update(tested_segments,gold_segments) + if tested_segments is not None and len(tested_segments) != len(gold_segments): + print("(len(tested_segments) != len(gold_segments))") + print("Stopping run") + return 1000, 1000 + if tested_segments is not None: + acc.update(tested_segments, gold_segments) - #Print results: + # Print results calculated_pk, calculated_windiff = acc.calc_accuracy() print('Finished testing.') - print ('Pk: {:.4}.'.format(calculated_pk)) - print ('') - - return calculated_pk,calculated_windiff - + print(f'Pk: {calculated_pk:.4f}.') + print() -def result_to_file(pk_list,wd_list,path_list,result_file_path): - writer = pd.ExcelWriter(result_file_path, engine='xlsxwriter') - - df = pd.DataFrame({ 'pk': pk_list,'wd': wd_list,'folders': path_list}) - df = df[['pk','wd','folders']] - - df.to_excel(writer, sheet_name='annotated_result') - writer.save() + return calculated_pk, calculated_windiff +def result_to_file(pk_list, wd_list, path_list, result_file_path): + with pd.ExcelWriter(result_file_path, engine='xlsxwriter') as writer: # Use context manager + df = pd.DataFrame({'pk': pk_list, 'wd': wd_list, 'folders': path_list}) + df.to_excel(writer, sheet_name='annotated_result') if __name__ == '__main__': - parser = ArgumentParser() parser.add_argument('--path', help='wiki folder, truth', type=str) - parser.add_argument('--xlsx_path', help='folder with xlsx files', type=str) - parser.add_argument('--graphseg', help='to calc graphseg pk', action='store_true') + parser.add_argument('--xlsx_path', help='folder with xlsx files', type=str) + parser.add_argument('--graphseg', help='to calculate graphseg pk', action='store_true') args = parser.parse_args() pk_list = [] wd_list = [] path_list = [] - if (args.graphseg): + if args.graphseg: graphseg_folders = get_sub_folders_for_graphseg(args.xlsx_path) for folder in graphseg_folders: - pk,wd = analyszie_folder(args.path,folder,args.graphseg) + pk, wd = analyze_folder(args.path, folder, args.graphseg) pk_list.append(pk) wd_list.append(wd) path_list.append(folder) else: - pk, wd = analyszie_folder(args.path, args.xlsx_path, args.graphseg, use_xlsx_sub_folders=True) + pk, wd = analyze_folder(args.path, args.xlsx_path, args.graphseg, use_xlsx_sub_folders=True) pk_list.append(pk) wd_list.append(wd) path_list.append(args.xlsx_path) - #writing result to file - result_to_file(pk_list,wd_list,path_list,os.path.join(args.xlsx_path,"result_pk.xlsx") ) + # Write result to file + result_to_file(pk_list, wd_list, path_list, os.path.join(args.xlsx_path, "result_pk.xlsx")) \ No newline at end of file diff --git a/chen_cities_converter.py b/chen_cities_converter.py index 3f1fbc2..1a13670 100644 --- a/chen_cities_converter.py +++ b/chen_cities_converter.py @@ -1,81 +1,72 @@ import utils -from pathlib2 import Path +from pathlib import Path from argparse import ArgumentParser import os import wiki_utils - - - - def main(args): utils.read_config_file(args.config) - utils.config.update(args.__dict__) - + utils.config.update(vars(args)) # Update config with args as a dictionary file_path = 
args.input output_folder_path = args.output special_delim_sign_path = args.sign - file = open(str(special_delim_sign_path), "r") - special_delim_sign = file.read().encode('utf-8').split("\n")[0] - file.close() - - file = open(str(file_path ), "r") - raw_content = file.read() - file.close() - - result_file_path = None + # Open and read the special delimiter sign file + with open(special_delim_sign_path, "r", encoding='utf-8') as file: + special_delim_sign = file.read().split("\n")[0] + # Open and read the input file + with open(file_path, "r", encoding='utf-8') as file: + raw_content = file.read() - sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] + sentences = [s for s in raw_content.strip().split("\n") if s] last_doc_id = 0 last_topic = "" + result_file_path = None for sentence in sentences: - first_comma_index = sentence.index(',') - second_comma_index = sentence[first_comma_index + 1 :].index(',') - current_doc_id = sentence[0:first_comma_index] + second_comma_index = sentence[first_comma_index + 1:].index(',') + first_comma_index + 1 + current_doc_id = sentence[:first_comma_index] sign_index = sentence.index(special_delim_sign) - start_sentence_index = sign_index + 1 + start_sentence_index = sign_index + 1 actual_sentence = sentence[start_sentence_index:] - current_topic = sentence[first_comma_index + second_comma_index + 2:sign_index] - + current_topic = sentence[second_comma_index + 1:sign_index] - if (current_doc_id != last_doc_id): + # Handle new document id and create new file for it + if current_doc_id != last_doc_id: last_doc_id = current_doc_id - print 'new file index' - print last_doc_id - if (result_file_path != None): + print('New file index:', last_doc_id) + if result_file_path: result_file.close() - result_file_path = os.path.join(output_folder_path ,str(current_doc_id) + ".text") - result_file = open(str(result_file_path), "w") + result_file_path = os.path.join(output_folder_path, f"{current_doc_id}.text") + result_file = open(result_file_path, "w", encoding='utf-8') last_topic = "" - - - if (current_topic != last_topic): + # Write new topic to file if changed + if current_topic != last_topic: last_topic = current_topic - level = 1 if (current_topic == "TOP-LEVEL SEGMENT") else 2 - result_file.write((wiki_utils.get_segment_seperator(level, current_topic) + ".").encode('utf-8')) - result_file.write("\n".encode('utf-8')) + level = 1 if current_topic == "TOP-LEVEL SEGMENT" else 2 + result_file.write(wiki_utils.get_segment_seperator(level, current_topic) + ".\n") + + if '\n' in sentence: + print('Backslash in sentence') - if ('\n' in sentence): - print 'back slash in sentnece' - result_file.write(actual_sentence.encode('utf-8')) - #result_file.write(".".encode('utf-8')) - result_file.write("\n".encode('utf-8')) + # Write actual sentence to file + result_file.write(actual_sentence + "\n") + if result_file_path: + result_file.close() if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--config', help='Path to config.json', default='config.json') parser.add_argument('--input', help='Chen text file', required=True) - parser.add_argument('--output', help='folder for converted files', required=True) - parser.add_argument('--sign', help='folder for converted files', required=True) + parser.add_argument('--output', help='Folder for converted files', required=True) + parser.add_argument('--sign', help='File containing special delimiter sign', required=True) main(parser.parse_args()) \ No newline at end of file 
diff --git a/chen_elements_convertor.py b/chen_elements_convertor.py index 74c37a9..c70c79f 100644 --- a/chen_elements_convertor.py +++ b/chen_elements_convertor.py @@ -3,76 +3,71 @@ import os import wiki_utils - def main(args): utils.read_config_file(args.config) - utils.config.update(args.__dict__) + utils.config.update(vars(args)) # Use vars(args) for dictionary-like access file_path = args.input segments_path = args.segment output_folder_path = args.output + # Read the segments content file + with open(segments_path, "r", encoding='utf-8') as file: + segments_content = file.read() - file = open(str(segments_path), "r") - segments_content = file.read() - file.close() - - file = open(str(file_path ), "r") - raw_content = file.read() - file.close() + # Read the input file + with open(file_path, "r", encoding='utf-8') as file: + raw_content = file.read() - sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] - segments = [s for s in segments_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"] + sentences = [s for s in raw_content.strip().split("\n") if s] + segments = [s for s in segments_content.strip().split("\n") if s] - result_file_path = None + if len(sentences) != len(segments): + print("len(sentences) != len(segments)") + return last_doc_id = 0 last_topic = "" + result_file_path = None - if (len(sentences) != len(segments)): - print "len(sentences) != len(segments)" - return - - for i in range(len(sentences)) : - + for i in range(len(sentences)): sentence = sentences[i] - segment = segments[i].encode('utf-8').split("\r")[0] + segment = segments[i].split("\r")[0] first_comma_index = segment.index(',') - second_comma_index = segment[first_comma_index + 1 :].index(',') - current_doc_id = segment[0:first_comma_index] - current_topic = segment[first_comma_index + second_comma_index + 2:] + second_comma_index = segment[first_comma_index + 1:].index(',') + first_comma_index + 1 + current_doc_id = segment[:first_comma_index] + current_topic = segment[second_comma_index + 1:] - if (current_doc_id != last_doc_id): + # Handle new document id and create a new file for it + if current_doc_id != last_doc_id: last_doc_id = current_doc_id - print 'new file index' - print last_doc_id - if (result_file_path != None): + print('New file index:', last_doc_id) + if result_file_path: result_file.close() - result_file_path = os.path.join(output_folder_path ,str(current_doc_id) + ".text") - result_file = open(str(result_file_path), "w") + + result_file_path = os.path.join(output_folder_path, f"{current_doc_id}.text") + result_file = open(result_file_path, "w", encoding='utf-8') last_topic = "" - if (current_topic != last_topic): + # Write new topic to the file if changed + if current_topic != last_topic: last_topic = current_topic - level = 1 if (current_topic == "TOP-LEVEL SEGMENT") else 2 - result_file.write((wiki_utils.get_segment_seperator(level ,current_topic) +".").encode('utf-8')) - result_file.write("\n".encode('utf-8')) + level = 1 if current_topic == "TOP-LEVEL SEGMENT" else 2 + result_file.write(wiki_utils.get_segment_seperator(level, current_topic) + ".\n") - actual_sentence = sentence - result_file.write(actual_sentence.encode('utf-8')) - if ('\n' in sentence): - print 'back slash in sentnece' - #result_file.write(".".encode('utf-8')) - result_file.write("\n".encode('utf-8')) + # Write the actual sentence to the file + result_file.write(sentence + "\n") + if result_file_path: + result_file.close() if __name__ == '__main__': 
parser = ArgumentParser() parser.add_argument('--config', help='Path to config.json', default='config.json') parser.add_argument('--input', help='Chen text file', required=True) - parser.add_argument('--segment', help='regina segmentation file', required=True) - parser.add_argument('--output', help='folder for converted files', required=True) + parser.add_argument('--segment', help='Regina segmentation file', required=True) + parser.add_argument('--output', help='Folder for converted files', required=True) main(parser.parse_args()) \ No newline at end of file diff --git a/choi_convertor.py b/choi_convertor.py index f6a7f8c..e7c2103 100644 --- a/choi_convertor.py +++ b/choi_convertor.py @@ -1,63 +1,52 @@ import os -from pathlib2 import Path +from pathlib import Path from argparse import ArgumentParser -from shutil import move +from shutil import move - - -def removeEmptyFolders(path, removeRoot=True): +def remove_empty_folders(path, remove_root=True): if not os.path.isdir(path): return - # remove empty subfolders - files = os.listdir(path) - for f in files: + # Remove empty subfolders + for f in os.listdir(path): fullpath = os.path.join(path, f) if os.path.isdir(fullpath): - removeEmptyFolders(fullpath) + remove_empty_folders(fullpath) - # if folder empty, delete it - files = os.listdir(path) - if len(files) == 0 and removeRoot: - #print "Removing empty folder:", path + # If folder is empty, delete it + if not os.listdir(path) and remove_root: os.rmdir(path) - - -def convert_choi_to_bySegLength(path): - folders = [o for o in os.listdir(path) if os.path.isdir(os.path.join(path, o))] +def convert_choi_to_by_seg_length(path): + folders = [o for o in os.listdir(path) if os.path.isdir(os.path.join(path, o))] for folder in folders: full_folder_path = os.path.join(path, folder) - seg_folders = [o for o in os.listdir(full_folder_path ) if os.path.isdir(os.path.join(full_folder_path , o))] + seg_folders = [o for o in os.listdir(full_folder_path) if os.path.isdir(os.path.join(full_folder_path, o))] + for seg_folder in seg_folders: - full_seg_folder_path = os.path.join(full_folder_path ,seg_folder ) - convertedPathList = full_seg_folder_path.split(os.sep) + full_seg_folder_path = os.path.join(full_folder_path, seg_folder) + converted_path_list = full_seg_folder_path.split(os.sep) + converted_path = os.path.sep.join(converted_path_list[:-2] + [converted_path_list[-1], converted_path_list[-2]]) + if not os.path.exists(converted_path): + os.makedirs(converted_path) - convertedPath = os.path.sep.join(convertedPathList[:-2] + [convertedPathList[-1]] + [convertedPathList[-2]]) - if not os.path.exists(convertedPath): - os.makedirs(convertedPath) - all_objects = Path(full_seg_folder_path).glob('**/*') + all_objects = Path(full_seg_folder_path).rglob('*') # Use rglob for recursive search files = (str(p) for p in all_objects if p.is_file()) - for file in files: - target = os.path.join(convertedPath ,os.path.basename(file) ) - move(file,target) - print "Removing empty folder: ", full_seg_folder_path - removeEmptyFolders(full_seg_folder_path) - - - -def main (args): + for file in files: + target = os.path.join(converted_path, os.path.basename(file)) + move(file, target) - convert_choi_to_bySegLength(args.input) + print(f"Removing empty folder: {full_seg_folder_path}") + remove_empty_folders(full_seg_folder_path) - print ('done') +def main(args): + convert_choi_to_by_seg_length(args.input) + print('done') if __name__ == '__main__': - parser = ArgumentParser() parser.add_argument('--input', help='Path to choi 
dataset', required=True) - main(parser.parse_args()) - + main(parser.parse_args()) \ No newline at end of file diff --git a/choiloader.py b/choiloader.py index dfd1904..2085fdc 100644 --- a/choiloader.py +++ b/choiloader.py @@ -1,6 +1,3 @@ -from __future__ import print_function -from pathlib2 import Path - import torch from torch.utils.data import Dataset import numpy as np @@ -8,13 +5,12 @@ from text_manipulation import split_sentences, word_model, extract_sentence_words import utils import math - +from pathlib import Path # Use pathlib, which is built-in with Python 3 logger = utils.setup_logger(__name__, 'train.log') - def get_choi_files(path): - all_objects = Path(path).glob('**/*.ref') + all_objects = Path(path).rglob('*.ref') # Use rglob for recursive file search files = [str(p) for p in all_objects if p.is_file()] return files @@ -24,7 +20,7 @@ def collate_fn(batch): paths = [] window_size = 1 - before_sentence_count = int(math.ceil(float(window_size - 1) /2)) + before_sentence_count = int(math.ceil((window_size - 1) / 2.0)) # Python 3 division after_sentence_count = window_size - before_sentence_count - 1 for data, targets, path in batch: @@ -32,8 +28,8 @@ def collate_fn(batch): max_index = len(data) tensored_data = [] for curr_sentence_index in range(0, len(data)): - from_index = max([0, curr_sentence_index - before_sentence_count]) - to_index = min([curr_sentence_index + after_sentence_count + 1, max_index]) + from_index = max(0, curr_sentence_index - before_sentence_count) + to_index = min(curr_sentence_index + after_sentence_count + 1, max_index) sentences_window = [word for sentence in data[from_index:to_index] for word in sentence] tensored_data.append(torch.FloatTensor(np.concatenate(sentences_window))) tensored_targets = torch.zeros(len(data)).long() @@ -43,77 +39,69 @@ def collate_fn(batch): batched_targets.append(tensored_targets) paths.append(path) except Exception as e: - logger.info('Exception "%s" in file: "%s"', e, path) + logger.info(f'Exception "{e}" in file: "{path}"') logger.debug('Exception!', exc_info=True) continue return batched_data, batched_targets, paths def clean_paragraph(paragraph): - cleaned_paragraph= paragraph.replace("'' ", " ").replace(" 's", "'s").replace("``", "").strip('\n') + cleaned_paragraph = paragraph.replace("'' ", " ").replace(" 's", "'s").replace("``", "").strip('\n') return cleaned_paragraph -def read_choi_file(path, word2vec, train, return_w2v_tensors = True,manifesto=False): - seperator = '========' if manifesto else '==========' - with Path(path).open('r') as f: +def read_choi_file(path, word2vec, train, return_w2v_tensors=True, manifesto=False): + separator = '========' if manifesto else '==========' + with open(path, 'r', encoding='utf-8') as f: raw_text = f.read() - paragraphs = [clean_paragraph(p) for p in raw_text.strip().split(seperator) - if len(p) > 5 and p != "\n"] + + paragraphs = [clean_paragraph(p) for p in raw_text.strip().split(separator) if len(p) > 5 and p != "\n"] if train: random.shuffle(paragraphs) targets = [] new_text = [] - lastparagraphsentenceidx = 0 + last_paragraph_sentence_idx = 0 for paragraph in paragraphs: - if manifesto: - sentences = split_sentences(paragraph,0) - else: - sentences = [s for s in paragraph.split('\n') if len(s.split()) > 0] - + sentences = split_sentences(paragraph, 0) if manifesto else [s for s in paragraph.split('\n') if s.split()] if sentences: - sentences_count =0 - # This is the number of sentences in the paragraph and where we need to split. 
+ sentence_count = 0 for sentence in sentences: words = extract_sentence_words(sentence) - if (len(words) == 0): + if len(words) == 0: continue - sentences_count +=1 + sentence_count += 1 if return_w2v_tensors: new_text.append([word_model(w, word2vec) for w in words]) else: new_text.append(words) - lastparagraphsentenceidx += sentences_count - targets.append(lastparagraphsentenceidx - 1) + last_paragraph_sentence_idx += sentence_count + targets.append(last_paragraph_sentence_idx - 1) return new_text, targets, path - -# Returns a list of batch_size that contains a list of sentences, where each word is encoded using word2vec. class ChoiDataset(Dataset): - def __init__(self, root, word2vec, train=False, folder=False,manifesto=False, folders_paths = None): + def __init__(self, root, word2vec, train=False, folder=False, manifesto=False, folders_paths=None): self.manifesto = manifesto if folders_paths is not None: self.textfiles = [] for f in folders_paths: self.textfiles.extend(list(f.glob('*.ref'))) - elif (folder): + elif folder: self.textfiles = get_choi_files(root) else: - self.textfiles = list(Path(root).glob('**/*.ref')) + self.textfiles = list(Path(root).rglob('*.ref')) if len(self.textfiles) == 0: - raise RuntimeError('Found 0 images in subfolders of: {}'.format(root)) + raise RuntimeError(f'Found 0 files in subfolders of: {root}') self.train = train self.root = root self.word2vec = word2vec def __getitem__(self, index): path = self.textfiles[index] - - return read_choi_file(path, self.word2vec, self.train,manifesto=self.manifesto) + return read_choi_file(path, self.word2vec, self.train, manifesto=self.manifesto) def __len__(self): - return len(self.textfiles) + return len(self.textfiles) \ No newline at end of file diff --git a/clean_wiki_dataset.py b/clean_wiki_dataset.py index a2eaf12..29f4718 100644 --- a/clean_wiki_dataset.py +++ b/clean_wiki_dataset.py @@ -1,45 +1,44 @@ -from pathlib2 import Path +from pathlib import Path import wiki_processor from argparse import ArgumentParser def remove_malicious_files(dataset_path): + # Read the malicious file IDs from the file with open('malicious_wiki_files', 'r') as f: malicious_file_ids = f.read().splitlines() - test_path = Path(dataset_path).joinpath(Path('test')) - train_path = Path(dataset_path).joinpath(Path('train')) - dev_path = Path(dataset_path).joinpath(Path('dev')) + # Define paths for test, train, and dev datasets + test_path = Path(dataset_path).joinpath('test') + train_path = Path(dataset_path).joinpath('train') + dev_path = Path(dataset_path).joinpath('dev') deleted_file_count = 0 - for id in malicious_file_ids: - file_path_suffix = Path(wiki_processor.get_file_path(id)).joinpath(id) + # Iterate over the malicious file IDs and delete the corresponding files + for file_id in malicious_file_ids: + file_path_suffix = Path(wiki_processor.get_file_path(file_id)).joinpath(file_id) + if test_path.joinpath(file_path_suffix).exists(): - test_path.joinpath(file_path_suffix).remove() + test_path.joinpath(file_path_suffix).unlink() # Use .unlink() to delete a file deleted_file_count += 1 elif train_path.joinpath(file_path_suffix).exists(): - train_path.joinpath(file_path_suffix).remove() + train_path.joinpath(file_path_suffix).unlink() deleted_file_count += 1 elif dev_path.joinpath(file_path_suffix).exists(): - dev_path.joinpath(file_path_suffix).remove() - deleted_file_count +=1 + dev_path.joinpath(file_path_suffix).unlink() + deleted_file_count += 1 else: - raise Exception('meliciious file is not included in dataset: ' + str(id)) - - 
print ('Deleted ' + str (deleted_file_count) + ' files. Malicious file count: ' + str(len(malicious_file_ids))) + raise Exception(f'Malicious file is not included in the dataset: {file_id}') -def main(arg): - remove_malicious_files(arg.path) + print(f'Deleted {deleted_file_count} files. Malicious file count: {len(malicious_file_ids)}') +def main(args): + remove_malicious_files(args.path) if __name__ == '__main__': parser = ArgumentParser() - parser.add_argument('--path', help='Path to dataset') - - main(parser.parse_args()) - - - + parser.add_argument('--path', help='Path to dataset', required=True) + main(parser.parse_args()) \ No newline at end of file diff --git a/configgenerator.py b/configgenerator.py index b4ce994..ae1c95d 100644 --- a/configgenerator.py +++ b/configgenerator.py @@ -1,10 +1,12 @@ import json +# Define the configuration data jsondata = { - "word2vecfile": "/home/omri/datasets/word2vec/GoogleNews-vectors-negative300.bin", + "word2vecfile": "/Users/jitesh/Downloads/text-segmentation/data/word2vec/GoogleNews-vectors-negative300.bin", "choidataset": "/home/omri/code/text-segmentation-2017/data/choi", - "wikidataset": "/home/omri/datasets/wikipedia/process_dump_r", + "wikidataset": "/home/omri/datasets/wikipedia/process_dump_r" } +# Write the data to config.json with open('config.json', 'w') as f: - json.dump(jsondata, f) + json.dump(jsondata, f, indent=4) # Added indent for better readability \ No newline at end of file diff --git a/convert_seperator.py b/convert_seperator.py index cb11cc0..a6609f8 100644 --- a/convert_seperator.py +++ b/convert_seperator.py @@ -1,27 +1,31 @@ -from pathlib2 import Path +from pathlib import Path import os -#root ='/home/adir/Projects/text-segmentation-2017/data/choi/' -root = '/home/adir/Projects/text-segmentation-2017/data/choi/1/3-5' -output ='/home/adir/Projects/text-segmentation-2017/data/part_choi/' -delimeter = '==========' +# Define paths and settings +root = '/home/adir/Projects/text-segmentation-2017/data/choi/1/3-5' +output = '/home/adir/Projects/text-segmentation-2017/data/part_choi/' +delimiter = '==========' truth = '********************************************' -textfiles = list(Path(root).glob('**/*.ref')) - +# Get all .ref files recursively from the root directory +textfiles = list(Path(root).rglob('*.ref')) counter = 0 +# Iterate over all text files for file in textfiles: counter += 1 - with file.open('r') as f: + with file.open('r', encoding='utf-8') as f: raw_text = f.read() - new_text = raw_text.replace('==========',truth) - f.close() - new_file_path = os.path.join(output,str(counter) + "_" + os.path.basename(str(file))) - with open(new_file_path, "w") as f: - f.write(new_text) - f.close() -print 'done' + # Replace the old delimiter with the new "truth" separator + new_text = raw_text.replace(delimiter, truth) + + # Create a new file path for the modified content + new_file_path = os.path.join(output, f"{counter}_{file.name}") + + # Write the new content to the new file + with open(new_file_path, "w", encoding='utf-8') as f: + f.write(new_text) +print('done') \ No newline at end of file diff --git a/evaluate.py b/evaluate.py index 8d9693e..ea515a0 100644 --- a/evaluate.py +++ b/evaluate.py @@ -1,6 +1,5 @@ import torch import numpy as np -from torch.autograd import Variable from choiloader import word_model import utils import text_manipulation @@ -9,30 +8,27 @@ def load_model(model_path=None, is_cuda=None): if model_path is None: model_path = utils.config['model'] - with open(model_path, 'r') as f: + # Open and load the 
model
+    with open(model_path, 'rb') as f:  # Use 'rb' for reading binary models
         model = torch.load(f)
     model.eval()
+
     if is_cuda is None:
         is_cuda = utils.config['cuda']
 
     return utils.maybe_cuda(model, is_cuda)
 
-
 def prepare_tensor(sentences):
     tensored_data = []
     for sentence in sentences:
         if len(sentence) > 0:
             tensored_data.append(utils.maybe_cuda(torch.FloatTensor(np.concatenate(sentence))))
-
     return tensored_data
-
-
 def text_to_word2vec(sentences, word2vec):
     new_text = []
     for sentence in sentences:
         words = text_manipulation.extract_sentence_words(sentence)
         new_text.append([word_model(w, word2vec) for w in words])
-
-    return new_text
+    return new_text
\ No newline at end of file
diff --git a/gpu2cpu.py b/gpu2cpu.py
index 2156bb9..51da4a5 100644
--- a/gpu2cpu.py
+++ b/gpu2cpu.py
@@ -1,29 +1,30 @@
 import torch
 from argparse import ArgumentParser
-from pathlib2 import Path
-
+from pathlib import Path
 
 def main(args):
     input_path = Path(args.input)
+
+    # Load the model from the input file (in binary mode)
     with input_path.open('rb') as f:
-        model = torch.load(f)
+        model = torch.load(f, map_location=torch.device('cpu'))  # Ensure loading to CPU
 
-    model = model.cpu()
+    model = model.cpu()  # Ensure the model is on CPU
 
-    if args.output is not None:
+    # Determine the output path
+    if args.output:
         output_path = Path(args.output)
     else:
         output_path = input_path.parent / (input_path.stem + '_cpu' + input_path.suffix)
 
+    # Save the CPU model to the output file
     with output_path.open('wb') as f:
         torch.save(model, f)
 
-
-
 if __name__ == '__main__':
     parser = ArgumentParser()
-    parser.add_argument('-i', '--input', help='Path to original .t7 file')
-    parser.add_argument('-o', '--output', help='Output path')
+    parser.add_argument('-i', '--input', help='Path to original model file', required=True)
+    parser.add_argument('-o', '--output', help='Output path for the CPU model')
     args = parser.parse_args()
 
-    main(args)
+    main(args)
\ No newline at end of file
diff --git a/graphseg_gen.sh b/graphseg_gen.sh
index d206fd9..d6bff99 100644
--- a/graphseg_gen.sh
+++ b/graphseg_gen.sh
@@ -1,5 +1,18 @@
 #!/bin/bash
-for i in 0.2 0.25 0.3 0.35 0.4 0.45 0.5
+
+# Check if the minimum segment size is provided
+if [ -z "$1" ]; then
+  echo "Usage: $0 <min_segment>"
+  exit 1
+fi
+
+# Define the range of threshold values
+for i in 0.2 0.25 0.3 0.35 0.4 0.45 0.5
 do
-  python graphseg_timer.py --input ~/Downloads/wiki_dev_100_np_seperators --output ~/Downloads/wiki_dev_100_np_seperators_output --jar graphseg.jar --threshold $i --min_segment $1
+  # Run the Python script with the corresponding threshold and minimum segment size
+  python graphseg_timer.py --input ~/Downloads/wiki_dev_100_np_seperators \
+    --output ~/Downloads/wiki_dev_100_np_seperators_output \
+    --jar graphseg.jar \
+    --threshold $i \
+    --min_segment $1
 done
\ No newline at end of file
diff --git a/graphseg_timer.py b/graphseg_timer.py
index c5cefce..4a2debd 100644
--- a/graphseg_timer.py
+++ b/graphseg_timer.py
@@ -4,51 +4,44 @@
 import utils
 from argparse import ArgumentParser
 
-
 def main(input, output, jar_path, threshold, min_segment):
-    # java -jar graphseg.jar /home/seg-input /home/seg-output 0.25 3
-
-
-    # for min_segment in range(1, 11):
-    #     for tresh in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
-    #                   0.95]:
-
-    output_folder = os.path.join(output,
-                                 'graphseg_output_{}_{}'.format(min_segment, threshold))
+    # Create an output folder based on the threshold and min_segment
+    output_folder = os.path.join(output, 
f'graphseg_output_{min_segment}_{threshold}') + # Ensure the output folder exists if not os.path.exists(output_folder): os.makedirs(output_folder) + + # Setup logger logger = utils.setup_logger(__name__, os.path.join(output_folder, 'graphseg_timer.log'), delete_old=True) - beginning_comd = ['java', '-jar', jar_path, input] - params = [str(threshold), str(min_segment)] - cmd = beginning_comd + [output_folder] + params - print cmd + # Prepare the command + cmd = ['java', '-jar', jar_path, input, output_folder, str(threshold), str(min_segment)] + print(cmd) + + # Measure execution time start = timer() - # os.system(cmd) - subprocess.call(cmd) + subprocess.call(cmd) # Use subprocess to execute the command end = timer() - print 'it tooks seconds:' - print end - start - logger.info('running on parmas: ' + str(params[0]) + " , " + str(params[1])) - logger.info('it tooks seconds:') - logger.info(end - start) - logger.info('\n') - - print ('done') + # Log the results + logger.info(f'Running with params: threshold={threshold}, min_segment={min_segment}') + logger.info(f'Execution time (seconds): {end - start}') + + print(f'Execution time (seconds): {end - start}') + print('Done') if __name__ == '__main__': parser = ArgumentParser() - parser.add_argument('--input', help='input folder path', + parser.add_argument('--input', help='Input folder path', default='/home/adir/Projects/data/wikipedia/wiki4_no_seperators', type=str) - parser.add_argument('--output', help='output folder path', + parser.add_argument('--output', help='Output folder path', default='/home/adir/Projects/data/wikipedia/wiki4_output_graphseg/', type=str) - parser.add_argument('--jar', help='graphseg jar path path', + parser.add_argument('--jar', help='Graphseg jar file path', default='/home/adir/Projects/graphseg/binary/graphseg.jar', type=str) parser.add_argument('--threshold', type=float, required=True) parser.add_argument('--min_segment', type=int, required=True) args = parser.parse_args() - main(args.input, args.output, args.jar, args.threshold, args.min_segment) + main(args.input, args.output, args.jar, args.threshold, args.min_segment) \ No newline at end of file diff --git a/models/from_presentation.py b/models/from_presentation.py index 3750b19..39e138d 100644 --- a/models/from_presentation.py +++ b/models/from_presentation.py @@ -1,25 +1,19 @@ -from __future__ import print_function -from __future__ import division - import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler - logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) - +# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. 
def zero_state(module, batch_size): # * 2 is for the two directions - return Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))), \ - Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))) - + return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ + maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) class SentenceEncodingRNN(nn.Module): def __init__(self, input_size=300, hidden=128, num_layers=2): @@ -43,7 +37,6 @@ def forward(self, x): return reshaped - class Model(nn.Module): def __init__(self, sentence_encoder, hidden=128, num_layers=2): super(Model, self).__init__() @@ -65,19 +58,17 @@ def __init__(self, sentence_encoder, hidden=128, num_layers=2): self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): s_length = s.size()[0] - v = Variable(maybe_cuda(s.unsqueeze(0).unsqueeze(0))) + v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - def pad_document(self, d, max_document_length): d_length = d.size()[0] v = d.unsqueeze(0).unsqueeze(0) - padded = F.pad(v, (0, 0,0, max_document_length - d_length )) # (1, 1, max_length, 300) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -103,18 +94,18 @@ def forward(self, batch): padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) profiler.set() # 1 encoded_sentences = self.sentence_encoder(packed_tensor) profiler.set() # 2 - unsort_order = Variable(maybe_cuda(torch.LongTensor(unsort(sort_order)))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) index = 0 encoded_documents = [] for sentences_count in sentences_per_doc: end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index : end_index, :]) + encoded_documents.append(unsorted_encodings[index: end_index, :]) index = end_index doc_sizes = [doc.size()[0] for doc in encoded_documents] @@ -124,7 +115,7 @@ def forward(self, batch): ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) profiler.set() # 3 sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) profiler.set() # 4 @@ -142,9 +133,8 @@ def forward(self, batch): profiler.finish(profilerLogger) # 5 return x - def create(): sentence_encoder = SentenceEncodingRNN(input_size=300, hidden=256, num_layers=2) - return Model(sentence_encoder, hidden=256, num_layers=2) + return Model(sentence_encoder, hidden=256, num_layers=2) \ No newline at end of file diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 6d168b6..847d053 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -1,25 
+1,19 @@ -from __future__ import print_function -from __future__ import division - import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler - logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) - +# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. def zero_state(module, batch_size): # * 2 is for the two directions - return Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))), \ - Variable(maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))) - + return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ + maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) class SentenceEncodingRNN(nn.Module): def __init__(self, input_size, hidden, num_layers): @@ -38,15 +32,14 @@ def forward(self, x): batch_size = x.batch_sizes[0] s = zero_state(self, batch_size) packed_output, _ = self.lstm(x, s) - padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) + padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) - maxes = Variable(maybe_cuda(torch.zeros(batch_size, padded_output.size(2)))) + maxes = maybe_cuda(torch.zeros(batch_size, padded_output.size(2))) for i in range(batch_size): maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] return maxes - class Model(nn.Module): def __init__(self, sentence_encoder, hidden=128, num_layers=2): super(Model, self).__init__() @@ -68,19 +61,17 @@ def __init__(self, sentence_encoder, hidden=128, num_layers=2): self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): s_length = s.size()[0] - v = Variable(maybe_cuda(s.unsqueeze(0).unsqueeze(0))) + v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - def pad_document(self, d, max_document_length): d_length = d.size()[0] v = d.unsqueeze(0).unsqueeze(0) - padded = F.pad(v, (0, 0,0, max_document_length - d_length )) # (1, 1, max_length, 300) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -104,16 +95,16 @@ def forward(self, batch): padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) encoded_sentences = self.sentence_encoder(packed_tensor) - unsort_order = Variable(maybe_cuda(torch.LongTensor(unsort(sort_order)))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) index = 0 encoded_documents = [] for sentences_count in sentences_per_doc: end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index : end_index, :]) + encoded_documents.append(unsorted_encodings[index: end_index, :]) index = end_index doc_sizes = [doc.size()[0] for doc in 
encoded_documents] @@ -123,7 +114,7 @@ def forward(self, batch): ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) @@ -137,9 +128,8 @@ def forward(self, batch): x = self.h2s(sentence_outputs) return x - def create(): sentence_encoder = SentenceEncodingRNN(input_size=300, hidden=256, num_layers=2) - return Model(sentence_encoder, hidden=256, num_layers=2) + return Model(sentence_encoder, hidden=256, num_layers=2) \ No newline at end of file diff --git a/models/naive.py b/models/naive.py index f8c8820..a5ed174 100644 --- a/models/naive.py +++ b/models/naive.py @@ -1,12 +1,9 @@ import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from utils import maybe_cuda, unsort import numpy as np - - class Naive(nn.Module): def __init__(self, segment_average_size): super(Naive, self).__init__() @@ -14,30 +11,26 @@ def __init__(self, segment_average_size): self.segment_average_size = segment_average_size self.criterion = nn.CrossEntropyLoss() - - - def create_random_output(self,size): - - cut_probability = float (1) / self.segment_average_size + def create_random_output(self, size): + cut_probability = float(1) / self.segment_average_size cuts = np.random.choice([0, 1], size=(size,), p=[1-cut_probability, cut_probability]) - ret = torch.zeros(size,2) + ret = torch.zeros(size, 2) + + for i in range(ret.size(0)): + ret[i, 1] = cuts[i] + ret[i, 0] = 1 - cuts[i] - for i in range(ret.size()[0]): - ret[i,1] = cuts[i] - ret[i,0] = 1 - cuts[i] return ret def forward(self, x): - batch_segmentations = [] for document in x: num_sentences = len(document) doc_segmentation = self.create_random_output(num_sentences - 1) batch_segmentations.append(doc_segmentation) - batch_output = torch.cat(batch_segmentations,0) - return Variable(batch_output) - + batch_output = torch.cat(batch_segmentations, 0) + return batch_output # No need for Variable wrapper def create(): - return Naive(13) + return Naive(13) \ No newline at end of file diff --git a/models/single_lstm.py b/models/single_lstm.py index 5560141..72920bf 100644 --- a/models/single_lstm.py +++ b/models/single_lstm.py @@ -1,25 +1,19 @@ -from __future__ import print_function -from __future__ import division - import torch import torch.nn as nn -from torch.autograd import Variable import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler - logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) - +# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. 
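# Illustrative aside, not part of the original patch: a minimal sketch, assuming a
# PyTorch >= 0.4 runtime, of the Variable-to-plain-tensor migration applied throughout
# these model files. Tensors now carry autograd state themselves, so the removed
# Variable(...) wrapper is redundant; the helper name below is hypothetical.
import torch

def make_lstm_state(num_layers, batch_size, hidden, bidirectional=True):
    # One (h_0, c_0) pair sized (num_layers * num_directions, batch, hidden); no Variable wrapper needed.
    directions = 2 if bidirectional else 1
    h_0 = torch.zeros(num_layers * directions, batch_size, hidden)
    c_0 = torch.zeros(num_layers * directions, batch_size, hidden)
    return h_0, c_0

# Gradients are tracked only when requested explicitly, e.g. torch.zeros(4, 4, requires_grad=True).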
def zero_state(module, batch_size): # * 2 is for the two directions - return Variable(maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden))), \ - Variable(maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden))) - + return maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)), \ + maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)) class SentenceEncodingRNN(nn.Module): def __init__(self, input_size, hidden, num_layers): @@ -43,7 +37,6 @@ def forward(self, x): return reshaped - class Model(nn.Module): def __init__(self, sentence_encoder, hidden, num_layers): super(Model, self).__init__() @@ -65,19 +58,17 @@ def __init__(self, sentence_encoder, hidden, num_layers): self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): - s_length = s.size()[0] - v = Variable(maybe_cuda(s.unsqueeze(0).unsqueeze(0))) + s_length = s.size(0) + v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - def pad_document(self, d, max_document_length): - d_length = d.size()[0] + d_length = d.size(0) v = d.unsqueeze(0).unsqueeze(0) - padded = F.pad(v, (0, 0,0, max_document_length - d_length )) # (1, 1, max_length, 300) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -92,10 +83,10 @@ def forward(self, batch): all_batch_sentences.extend(document) sentences_per_doc.append(len(document)) - lengths = [s.size()[0] for s in all_batch_sentences] + lengths = [s.size(0) for s in all_batch_sentences] sort_order = np.argsort(lengths)[::-1] sorted_sentences = [all_batch_sentences[i] for i in sort_order] - sorted_lengths = [s.size()[0] for s in sorted_sentences] + sorted_lengths = [s.size(0) for s in sorted_sentences] max_length = max(lengths) logger.debug('Num sentences: %s, max sentence length: %s', @@ -103,28 +94,28 @@ def forward(self, batch): padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) profiler.set() # 1 encoded_sentences = self.sentence_encoder(packed_tensor) profiler.set() # 2 - unsort_order = Variable(maybe_cuda(torch.LongTensor(unsort(sort_order)))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) index = 0 encoded_documents = [] for sentences_count in sentences_per_doc: end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index : end_index, :]) + encoded_documents.append(unsorted_encodings[index: end_index, :]) index = end_index - doc_sizes = [doc.size()[0] for doc in encoded_documents] + doc_sizes = [doc.size(0) for doc in encoded_documents] max_doc_size = np.max(doc_sizes) ordered_document_idx = np.argsort(doc_sizes)[::-1] ordered_doc_sizes = sorted(doc_sizes)[::-1] ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, 
enforce_sorted=False) profiler.set() # 3 sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) profiler.set() # 4 @@ -142,9 +133,8 @@ def forward(self, batch): profiler.finish(profilerLogger) # 5 return x - def create(): sentence_encoder = SentenceEncodingRNN(input_size=300, hidden=256, num_layers=4) - return Model(sentence_encoder, hidden=256, num_layers=4) + return Model(sentence_encoder, hidden=256, num_layers=4) \ No newline at end of file diff --git a/run.py b/run.py index 9465ba7..b042079 100644 --- a/run.py +++ b/run.py @@ -12,7 +12,7 @@ from tensorboard_logger import configure, log_value import os import sys -from pathlib2 import Path +from pathlib import Path from wiki_loader import WikipediaDataSet import accuracy import numpy as np @@ -22,20 +22,17 @@ preds_stats = utils.predictions_analysis() - def softmax(x): max_each_row = np.max(x, axis=1, keepdims=True) exps = np.exp(x - max_each_row) sums = np.sum(exps, axis=1, keepdims=True) return exps / sums - def import_model(model_name): module = __import__('models.' + model_name, fromlist=['models']) return module.create() - -class Accuracies(object): +class Accuracies: def __init__(self): self.thresholds = np.arange(0, 1, 0.05) self.accuracies = {k: accuracy.Accuracy() for k in self.thresholds} @@ -47,7 +44,7 @@ def update(self, output_np, targets_np): to_idx = int(current_idx + document_sentence_count) for threshold in self.thresholds: - output = ((output_np[current_idx: to_idx, :])[:, 1] > threshold) + output = (output_np[current_idx: to_idx, 1] > threshold) h = np.append(output, [1]) tt = np.append(t, [1]) @@ -68,123 +65,91 @@ def calc_accuracy(self): return min_pk, min_epoch_windiff, min_threshold - def train(model, args, epoch, dataset, logger, optimizer): model.train() - total_loss = float(0) + total_loss = 0.0 # Changed to float value with tqdm(desc='Training', total=len(dataset)) as pbar: for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - - pbar.update() - model.zero_grad() - output = model(data) - target_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - loss = model.criterion(output, target_var) - loss.backward() - - optimizer.step() - total_loss += loss.data[0] - # logger.debug('Batch %s - Train error %7.4f', i, loss.data[0]) - pbar.set_description('Training, loss={:.4}'.format(loss.data[0])) - # except Exception as e: - # logger.info('Exception "%s" in batch %s', e, i) - # logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) - # pass - - total_loss = total_loss / len(dataset) - logger.debug('Training Epoch: {}, Loss: {:.4}.'.format(epoch + 1, total_loss)) - log_value('Training Loss', total_loss, epoch + 1) + if i == args.stop_after: + break + + pbar.update() + model.zero_grad() + output = model(data) + target_var = maybe_cuda(torch.cat(target, 0), args.cuda) + loss = model.criterion(output, target_var) + loss.backward() + optimizer.step() + total_loss += loss.item() # Replaced deprecated .data[0] with .item() + + pbar.set_description(f'Training, loss={loss.item():.4}') + + total_loss /= len(dataset) + logger.debug(f'Training Epoch: {epoch + 1}, Loss: {total_loss:.4}') + log_value('Training Loss', total_loss, epoch + 1) def validate(model, args, epoch, dataset, logger): model.eval() - with tqdm(desc='Validatinging', total=len(dataset)) as pbar: + with tqdm(desc='Validating', total=len(dataset)) as pbar: acc = Accuracies() for i, (data, target, paths) in 
enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) - acc.update(output_softmax.data.cpu().numpy(), target) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) - - # except Exception as e: - # # logger.info('Exception "%s" in batch %s', e, i) - # logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) - # pass + acc.update(output_softmax.cpu().numpy(), target) epoch_pk, epoch_windiff, threshold = acc.calc_accuracy() - logger.info('Validating Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . '.format(epoch + 1, - preds_stats.get_accuracy(), - epoch_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.info(f'Validating Epoch: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {epoch_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return epoch_pk, threshold - def test(model, args, epoch, dataset, logger, threshold): model.eval() with tqdm(desc='Testing', total=len(dataset)) as pbar: acc = accuracy.Accuracy() for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) - - current_idx = 0 - - for k, t in enumerate(target): - document_sentence_count = len(t) - to_idx = int(current_idx + document_sentence_count) - - output = ((output_softmax.data.cpu().numpy()[current_idx: to_idx, :])[:, 1] > threshold) - h = np.append(output, [1]) - tt = np.append(t, [1]) - - acc.update(h, tt) - - current_idx = to_idx - - # acc.update(output_softmax.data.cpu().numpy(), target) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) + + current_idx = 0 + for k, t in enumerate(target): + document_sentence_count = len(t) + to_idx = int(current_idx + document_sentence_count) + + output = (output_softmax.cpu().numpy()[current_idx:to_idx, 1] > threshold) + h = np.append(output, [1]) + tt = np.append(t, [1]) - # - # except Exception as e: - # # logger.info('Exception "%s" in batch %s', e, i) - # logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True) + acc.update(h, tt) + current_idx = to_idx epoch_pk, epoch_windiff = acc.calc_accuracy() - logger.debug('Testing Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . 
'.format(epoch + 1, - preds_stats.get_accuracy(), - epoch_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.debug(f'Testing Epoch: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {epoch_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return epoch_pk - def main(args): sys.path.append(str(Path(__file__).parent)) @@ -194,30 +159,20 @@ def main(args): logger = utils.setup_logger(__name__, os.path.join(args.checkpoint_dir, 'train.log')) utils.read_config_file(args.config) - utils.config.update(args.__dict__) - logger.debug('Running with config %s', utils.config) + utils.config.update(vars(args)) # Updated to use vars(args) + logger.debug(f'Running with config {utils.config}') configure(os.path.join('runs', args.expname)) - if not args.test: - word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) - else: - word2vec = None + word2vec = None if args.test else gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) if not args.infer: - if args.wiki: - dataset_path = Path(utils.config['wikidataset']) - train_dataset = WikipediaDataSet(dataset_path / 'train', word2vec=word2vec, - high_granularity=args.high_granularity) - dev_dataset = WikipediaDataSet(dataset_path / 'dev', word2vec=word2vec, high_granularity=args.high_granularity) - test_dataset = WikipediaDataSet(dataset_path / 'test', word2vec=word2vec, - high_granularity=args.high_granularity) - - else: - dataset_path = utils.config['choidataset'] - train_dataset = ChoiDataset(dataset_path, word2vec) - dev_dataset = ChoiDataset(dataset_path, word2vec) - test_dataset = ChoiDataset(dataset_path, word2vec) + dataset_class = WikipediaDataSet if args.wiki else ChoiDataset + dataset_path = Path(utils.config['wikidataset']) if args.wiki else Path(utils.config['choidataset']) + + train_dataset = dataset_class(dataset_path / 'train', word2vec, high_granularity=args.high_granularity) + dev_dataset = dataset_class(dataset_path / 'dev', word2vec, high_granularity=args.high_granularity) + test_dataset = dataset_class(dataset_path / 'test', word2vec, high_granularity=args.high_granularity) train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, num_workers=args.num_workers) @@ -226,60 +181,45 @@ def main(args): test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers) - assert bool(args.model) ^ bool(args.load_from) # exactly one of them must be set - - if args.model: - model = import_model(args.model) - elif args.load_from: - with open(args.load_from, 'rb') as f: - model = torch.load(f) - - model.train() + model = import_model(args.model) if args.model else torch.load(open(args.load_from, 'rb')) model = maybe_cuda(model) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + if not args.infer: best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) - with (checkpoint_path / 'model{:03d}.t7'.format(j)).open('wb') as f: - torch.save(model, f) + torch.save(model, open(checkpoint_path / f'model{j:03d}.t7', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) if val_pk < best_val_pk: test_pk = test(model, args, j, test_dl, logger, threshold) - logger.debug( - colored( - 'Current best model from epoch {} with p_k {} and threshold {}'.format(j, test_pk, threshold), - 'green')) + logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} 
and threshold {threshold}', 'green')) best_val_pk = val_pk - with (checkpoint_path / 'best_model.t7'.format(j)).open('wb') as f: - torch.save(model, f) + torch.save(model, open(checkpoint_path / 'best_model.t7', 'wb')) else: - test_dataset = WikipediaDataSet(args.infer, word2vec=word2vec, - high_granularity=args.high_granularity) - test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, - num_workers=args.num_workers) - print test(model, args, 0, test_dl, logger, 0.4) - + test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), + batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers) + print(test(model, args, 0, test_dl, logger, 0.4)) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) - parser.add_argument('--test_bs', help='Batch size', type=int, default=5) + parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) parser.add_argument('--model', help='Model to run - will import and run') parser.add_argument('--load_from', help='Location of a .t7 model file to load. Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') parser.add_argument('--checkpoint_dir', help='Checkpoint directory', default='checkpoints') - parser.add_argument('--stop_after', help='Number of batches to stop after', default=None, type=int) + parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--wiki', help='Use wikipedia as dataset?', action='store_true') + parser.add_argument('--wiki', help='Use Wikipedia as dataset?', action='store_true') parser.add_argument('--num_workers', help='How many workers to use for data loading', type=int, default=0) - parser.add_argument('--high_granularity', help='Use high granularity for wikipedia dataset segmentation', action='store_true') - parser.add_argument('--infer', help='inference_dir', type=str) + parser.add_argument('--high_granularity', help='Use high granularity for Wikipedia dataset segmentation', action='store_true') + parser.add_argument('--infer', help='Inference directory', type=str) - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/run_web_server.py b/run_web_server.py index 5bd70db..e98c53d 100644 --- a/run_web_server.py +++ b/run_web_server.py @@ -1,18 +1,23 @@ from argparse import ArgumentParser from utils import config, read_config_file +from webapp import app -parser = ArgumentParser() -parser.add_argument('--cuda', help='Is cuda?', action='store_true') -parser.add_argument('--model', help='Model file path', required=True) -parser.add_argument('--config', help='Path to config.json', default='config.json') -parser.add_argument('--test', help='Use fake word2vec', action='store_true') -parser.add_argument('--port', type=int, help='List to this port') +def main(args): + # Read configuration from the config file + read_config_file(args.config) + 
config.update(vars(args)) # Use vars(args) to convert argparse.Namespace to a dictionary -args = parser.parse_args() + # Run the web server + app.run(debug=True, port=args.port) -read_config_file(args.config) -config.update(args.__dict__) +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--cuda', help='Is cuda?', action='store_true') + parser.add_argument('--model', help='Model file path', required=True) + parser.add_argument('--config', help='Path to config.json', default='config.json') + parser.add_argument('--test', help='Use fake word2vec', action='store_true') + parser.add_argument('--port', type=int, help='Port to listen on', default=5000) -from webapp import app -app.run(debug=True, port=args.port) + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/seg_comparsion.py b/seg_comparsion.py index b5fe8e3..7142126 100644 --- a/seg_comparsion.py +++ b/seg_comparsion.py @@ -9,43 +9,41 @@ from choiloader import ChoiDataset, collate_fn, read_choi_file from torch.utils.data import DataLoader from test_accuracy import softmax -from wiki_loader import clean_section,split_sentences,section_delimiter,extract_sentence_words +from wiki_loader import clean_section, split_sentences, section_delimiter, extract_sentence_words import os import sys - preds_stats = utils.predictions_analysis() paragraphs_delimiter = "==" def main(args): - utils.read_config_file(args.config) - if not args.test: word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) else: word2vec = None + # Load model with open(args.model, 'rb') as f: model = torch.load(f) model = maybe_cuda(model) model.eval() - data_path = args.folder - if (args.wiki): - dataset = WikipediaDataSet(args.folder,word2vec,folder=True) - delimeter = section_delimiter - - elif args.choi: #not in use - dataset = ChoiDataset(args.folder, word2vec,is_cache_path=True) - delimeter = paragraphs_delimiter + # Set dataset and delimiter based on the input type + if args.wiki: + dataset = WikipediaDataSet(args.folder, word2vec, folder=True) + delimiter = section_delimiter + elif args.choi: # Not in use but kept for reference + dataset = ChoiDataset(args.folder, word2vec, is_cache_path=True) + delimiter = paragraphs_delimiter else: - print 'required dataset type' + print('Dataset type is required') return - dl = DataLoader(dataset,batch_size=1, collate_fn=collate_fn, shuffle=False) + dl = DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) + # Process each document in the dataset for i, (data, targets, paths) in enumerate(dl): doc_path = str(paths[0]) output = model(data) @@ -56,67 +54,63 @@ def main(args): target_seg = targets_var.data.cpu().numpy() preds_stats.add(output_seg, target_seg) + # Create the output folder if it doesn't exist if not os.path.exists(args.output_folder): os.makedirs(args.output_folder) - result_file_path = os.path.join(args.output_folder,os.path.basename(doc_path)) - result_file = open(str(result_file_path ),"w") - - file = open(str(doc_path), "r") - raw_content = file.read() - file.close() - sections = [clean_section(s) for s in raw_content.decode('utf-8').strip().split(delimeter) if len(s) > 0 and s != "\n"] - - sum_sentences = 0 - total_num_sentences = 0 - bad_sentences = 0 - - for section in sections: - sentences = split_sentences(section) - if sentences: - total_num_sentences += len(sentences) - for i in range(0,len(sentences)): - sentence = sentences[i] - words = extract_sentence_words(sentence) - sentence = " 
".join(words) - - result_file.write(sentence.encode('utf-8')) - - sys.stdout.flush() - result_file.write("\n".encode('utf-8')) - if (len(target_seg) == sum_sentences): ## last sentence - continue - if (target_seg[sum_sentences]): - result_file.write(delimeter.encode('utf-8')) - sys.stdout.flush() - result_file.write("\n".encode('utf-8')) - if (output_seg[sum_sentences]): - result_file.write("*******Our_Segmentation********".encode('utf-8')) - result_file.write("\n".encode('utf-8')) - sum_sentences += 1 - result_file.close() - - if ((total_num_sentences - bad_sentences) != (len(target_seg) + 1)): ## +1 last sentence segment doesn't counted - print 'Pick another article' - print 'len(targets) + 1= ' + str(len(target_seg) + 1) - print 'total_num_sentences - bad_sentences= ' + str(total_num_sentences - bad_sentences) - else : - print 'finish comparsion' - print 'result at ' + str(result_file_path ) - print ('F1: {:.4}.'.format(preds_stats.get_f1())) - print ('Accuracy: {:.4}.'.format(preds_stats.get_accuracy())) - + # Write the result file + result_file_path = os.path.join(args.output_folder, os.path.basename(doc_path)) + with open(result_file_path, "w", encoding='utf-8') as result_file: + with open(doc_path, "r", encoding='utf-8') as file: + raw_content = file.read() + + sections = [clean_section(s) for s in raw_content.strip().split(delimiter) if len(s) > 0 and s != "\n"] + + sum_sentences = 0 + total_num_sentences = 0 + bad_sentences = 0 + + for section in sections: + sentences = split_sentences(section) + if sentences: + total_num_sentences += len(sentences) + for i, sentence in enumerate(sentences): + words = extract_sentence_words(sentence) + sentence = " ".join(words) + + result_file.write(sentence + "\n") + + if len(target_seg) == sum_sentences: # Last sentence + continue + + if target_seg[sum_sentences]: # True segmentation + result_file.write(delimiter + "\n") + + if output_seg[sum_sentences]: # Model segmentation + result_file.write("*******Our_Segmentation********\n") + + sum_sentences += 1 + + if (total_num_sentences - bad_sentences) != (len(target_seg) + 1): # +1 for last sentence + print('Pick another article') + print(f'len(targets) + 1 = {len(target_seg) + 1}') + print(f'total_num_sentences - bad_sentences = {total_num_sentences - bad_sentences}') + else: + print('Finished comparison') + print(f'Result at {result_file_path}') + print(f'F1: {preds_stats.get_f1():.4}.') + print(f'Accuracy: {preds_stats.get_accuracy():.4}.') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? 
(e.g., fake word2vec)', action='store_true') parser.add_argument('--model', help='Model to run - will import and run', required=True) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--folder', help='folder with files to test on', required=True) - parser.add_argument('--output_folder', help='folder for result', required=True) - parser.add_argument('--wiki', help='if its wiki article', action='store_true') - parser.add_argument('--manifesto', help='if its manifesto article', action='store_true') - parser.add_argument('--choi', help='if its choi article', action='store_true') + parser.add_argument('--folder', help='Folder with files to test on', required=True) + parser.add_argument('--output_folder', help='Folder for results', required=True) + parser.add_argument('--wiki', help='If the dataset is from Wikipedia', action='store_true') + parser.add_argument('--manifesto', help='If the dataset is from Manifesto', action='store_true') + parser.add_argument('--choi', help='If the dataset is from Choi', action='store_true') - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/test_accuracy.py b/test_accuracy.py index 6763796..7f9b861 100644 --- a/test_accuracy.py +++ b/test_accuracy.py @@ -1,10 +1,7 @@ -from __future__ import division - import torch from torch.utils.data import DataLoader from torch.autograd import Variable import numpy as np - from choiloader import ChoiDataset, collate_fn from tqdm import tqdm from argparse import ArgumentParser @@ -13,7 +10,7 @@ import utils import os import sys -from pathlib2 import Path +from pathlib import Path from wiki_loader import WikipediaDataSet import accuracy from models import naive @@ -29,13 +26,12 @@ def softmax(x): sums = np.sum(exps, axis=1, keepdims=True) return exps / sums -def getSegmentsFolders(path): - +def get_segments_folders(path): ret_folders = [] folders = [o for o in os.listdir(path) if os.path.isdir(os.path.join(path, o))] for folder in folders: - if folder.__contains__("-"): - ret_folders.append(os.path.join(path,folder)) + if '-' in folder: + ret_folders.append(os.path.join(path, folder)) return ret_folders @@ -45,10 +41,10 @@ def main(args): sys.path.append(str(Path(__file__).parent)) utils.read_config_file(args.config) - utils.config.update(args.__dict__) + utils.config.update(vars(args)) # Use vars for better argument handling logger.debug('Running with config %s', utils.config) - print ('Running with threshold: ' + str(args.seg_threshold)) + print(f'Running with threshold: {args.seg_threshold}') preds_stats = utils.predictions_analysis() if not args.test: @@ -57,55 +53,54 @@ def main(args): word2vec = None word2vec_done = timer() - print 'Loading word2vec ellapsed: ' + str(word2vec_done - start) + ' seconds' + print(f'Loading word2vec elapsed: {word2vec_done - start} seconds') dirname = 'test' + # Determine dataset based on wiki flag if args.wiki: dataset_folders = [Path(utils.config['wikidataset']) / dirname] - if (args.wiki_folder): - dataset_folders = [] - dataset_folders.append(args.wiki_folder) - print 'running on wikipedia' + if args.wiki_folder: + dataset_folders = [args.wiki_folder] + print('Running on Wikipedia') else: - if (args.bySegLength): - dataset_folders = getSegmentsFolders(utils.config['choidataset']) - print 'run on choi by segments length' - else : + if args.bySegLength: + dataset_folders = get_segments_folders(utils.config['choidataset']) + print('Running on Choi by segments length') + else: dataset_folders 
= [utils.config['choidataset']] - print 'running on Choi' - + print('Running on Choi') + # Load the model with open(args.model, 'rb') as f: model = torch.load(f) model = maybe_cuda(model) model.eval() - if (args.naive): + if args.naive: model = naive.create() for dataset_path in dataset_folders: + if args.bySegLength: + print('Segment is', os.path.basename(dataset_path), ":") - if (args.bySegLength): - print 'Segment is ',os.path.basename(dataset_path), " :" - + # Load dataset if args.wiki: - if (args.wiki_folder): + if args.wiki_folder: dataset = WikipediaDataSet(dataset_path, word2vec, folder=True, high_granularity=False) - else : + else: dataset = WikipediaDataSet(dataset_path, word2vec, high_granularity=False) else: - dataset = ChoiDataset(dataset_path , word2vec) + dataset = ChoiDataset(dataset_path, word2vec) dl = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=False) - - + # Testing loop with tqdm(desc='Testing', total=len(dl)) as pbar: total_accurate = 0 total_count = 0 total_loss = 0 - acc = accuracy.Accuracy() + acc = accuracy.Accuracy() for i, (data, targets, paths) in enumerate(dl): if i == args.stop_after: @@ -113,16 +108,16 @@ def main(args): pbar.update() output = model(data) - targets_var = Variable(maybe_cuda(torch.cat(targets, 0), args.cuda), requires_grad=False) + targets_var = maybe_cuda(torch.cat(targets, 0), args.cuda) batch_loss = 0 - output_prob = softmax(output.data.cpu().numpy()) + output_prob = softmax(output.cpu().numpy()) output_seg = output_prob[:, 1] > args.seg_threshold - target_seg = targets_var.data.cpu().numpy() + target_seg = targets_var.cpu().numpy() batch_accurate = (output_seg == target_seg).sum() total_accurate += batch_accurate total_count += len(target_seg) total_loss += batch_loss - preds_stats.add(output_seg,target_seg) + preds_stats.add(output_seg, target_seg) current_target_idx = 0 for k, t in enumerate(targets): @@ -131,18 +126,18 @@ def main(args): to_idx = int(current_target_idx + document_sentence_count) h = output_seg[current_target_idx: to_idx] - # hypothesis and targets are missing classification of last sentence, and therefore we will add - # 1 for both + # Add classification for the last sentence h = np.append(h, [1]) t = np.append(t.cpu().numpy(), [1]) - acc.update(h,t, sentences_length=sentences_length) + acc.update(h, t, sentences_length=sentences_length) current_target_idx = to_idx logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i, batch_loss, batch_accurate / len(target_seg)) pbar.set_description('Testing, Accuracy={:.4}'.format(batch_accurate / len(target_seg))) + # Logging results average_loss = total_loss / len(dl) average_accuracy = total_accurate / total_count calculated_pk, _ = acc.calc_accuracy() @@ -150,29 +145,26 @@ def main(args): logger.info('Finished testing.') logger.info('Average loss: %s', average_loss) logger.info('Average accuracy: %s', average_accuracy) - logger.info('Pk: {:.4}.'.format(calculated_pk)) - logger.info('F1: {:.4}.'.format(preds_stats.get_f1())) - + logger.info(f'Pk: {calculated_pk:.4}.') + logger.info(f'F1: {preds_stats.get_f1():.4}.') end = timer() - print ('Seconds to execute to whole flow: ' + str(end - start)) - + print(f'Seconds to execute the whole flow: {end - start}') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? 
(e.g., fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) - parser.add_argument('--model', help='Model to run - will import and run', required=True) - parser.add_argument('--stop_after', help='Number of batches to stop after', default=None, type=int) + parser.add_argument('--model', help='Model to run', required=True) + parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--wiki', help='Use wikipedia as dataset?', action='store_true') - parser.add_argument('--bySegLength', help='calc pk on choi by segments length?', action='store_true') - parser.add_argument('--wiki_folder', help='path to folder which contains wiki documents') - parser.add_argument('--naive', help='use naive model', action='store_true') - parser.add_argument('--seg_threshold', help='Threshold for binary classificetion', type=float, default=0.4) - parser.add_argument('--calc_word', help='Whether to calc P_K by word', action='store_true') - - - main(parser.parse_args()) + parser.add_argument('--wiki', help='Use Wikipedia as dataset?', action='store_true') + parser.add_argument('--bySegLength', help='Calculate pk on Choi by segments length?', action='store_true') + parser.add_argument('--wiki_folder', help='Path to folder containing wiki documents') + parser.add_argument('--naive', help='Use naive model', action='store_true') + parser.add_argument('--seg_threshold', help='Threshold for binary classification', type=float, default=0.4) + parser.add_argument('--calc_word', help='Calculate P_K by word', action='store_true') + + main(parser.parse_args()) \ No newline at end of file diff --git a/test_accuracy_choi.py b/test_accuracy_choi.py index 97fff4b..84ef0fa 100644 --- a/test_accuracy_choi.py +++ b/test_accuracy_choi.py @@ -2,7 +2,6 @@ from torch.utils.data import DataLoader from torch.autograd import Variable import torch.nn.functional as F - from choiloader import ChoiDataset, collate_fn from tqdm import tqdm from argparse import ArgumentParser @@ -12,7 +11,7 @@ from tensorboard_logger import configure import os import sys -from pathlib2 import Path +from pathlib import Path import accuracy import numpy as np from termcolor import colored @@ -21,32 +20,29 @@ preds_stats = utils.predictions_analysis() - def softmax(x): max_each_row = np.max(x, axis=1, keepdims=True) exps = np.exp(x - max_each_row) sums = np.sum(exps, axis=1, keepdims=True) return exps / sums - def import_model(model_name): module = __import__('models.' 
+ model_name, fromlist=['models']) return module.create() - -class Accuracies(object): +class Accuracies: def __init__(self): self.thresholds = np.arange(0, 1, 0.05) self.accuracies = {k: accuracy.Accuracy() for k in self.thresholds} def update(self, output_np, targets_np): current_idx = 0 - for k, t in enumerate(targets_np): + for t in targets_np: document_sentence_count = len(t) to_idx = int(current_idx + document_sentence_count) for threshold in self.thresholds: - output = ((output_np[current_idx: to_idx, :])[:, 1] > threshold) + output = (output_np[current_idx: to_idx, 1] > threshold) h = np.append(output, [1]) tt = np.append(t, [1]) @@ -69,84 +65,72 @@ def calc_accuracy(self): def validate(model, args, epoch, dataset, logger): model.eval() - with tqdm(desc='Validatinging', total=len(dataset)) as pbar: + with tqdm(desc='Validating', total=len(dataset)) as pbar: acc = Accuracies() for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) - acc.update(output_softmax.data.cpu().numpy(), target) + acc.update(output_softmax.cpu().numpy(), target) epoch_pk, epoch_windiff, threshold = acc.calc_accuracy() - logger.info('Validating Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . 
'.format(epoch + 1, - preds_stats.get_accuracy(), - epoch_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.info(f'Validating Epoch: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {epoch_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return epoch_pk, threshold - def test(model, args, epoch, dataset, logger, test_threshold, test_acc): model.eval() with tqdm(desc='Testing', total=len(dataset)) as pbar: for i, (data, target, paths) in enumerate(dataset): - if True: - if i == args.stop_after: - break - pbar.update() - output = model(data) - output_softmax = F.softmax(output, 1) - targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) - output_seg = output.data.cpu().numpy().argmax(axis=1) - target_seg = targets_var.data.cpu().numpy() - preds_stats.add(output_seg, target_seg) - - current_idx = 0 - - for k, t in enumerate(target): - document_sentence_count = len(t) - to_idx = int(current_idx + document_sentence_count) - - output = ((output_softmax.data.cpu().numpy()[current_idx: to_idx, :])[:, 1] > test_threshold) - h = np.append(output, [1]) - tt = np.append(t, [1]) - - test_acc.update(h, tt) + if i == args.stop_after: + break + pbar.update() + output = model(data) + output_softmax = F.softmax(output, dim=1) + targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False) + output_seg = output.argmax(dim=1).cpu().numpy() + target_seg = targets_var.cpu().numpy() + preds_stats.add(output_seg, target_seg) + + current_idx = 0 + for t in target: + document_sentence_count = len(t) + to_idx = int(current_idx + document_sentence_count) + + output = (output_softmax.cpu().numpy()[current_idx: to_idx, 1] > test_threshold) + h = np.append(output, [1]) + tt = np.append(t, [1]) - current_idx = to_idx + test_acc.update(h, tt) + current_idx = to_idx test_pk, epoch_windiff = test_acc.calc_accuracy() - logger.debug('Testing validation section: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . 
'.format(epoch + 1, - preds_stats.get_accuracy(), - test_pk, - epoch_windiff, - preds_stats.get_f1())) + logger.debug(f'Testing validation section: {epoch + 1}, accuracy: {preds_stats.get_accuracy():.4}, ' + f'Pk: {test_pk:.4}, Windiff: {epoch_windiff:.4}, F1: {preds_stats.get_f1():.4}') preds_stats.reset() return test_pk - def main(args): sys.path.append(str(Path(__file__).parent)) - logger = utils.setup_logger(__name__, 'cross_validate_choi.log') + logger = utils.setup_logger(__name__, 'cross_validate_choi.log') utils.read_config_file(args.config) - utils.config.update(args.__dict__) - logger.debug('Running with config %s', utils.config) + utils.config.update(vars(args)) # Updated to use vars(args) for cleaner conversion + logger.debug(f'Running with config {utils.config}') configure(os.path.join('runs', args.expname)) @@ -155,7 +139,6 @@ def main(args): else: word2vec = None - dataset_path = Path(args.flat_choi) with open(args.load_from, 'rb') as f: @@ -166,10 +149,10 @@ def main(args): test_accuracy = accuracy.Accuracy() for j in range(5): - validate_folder_numbers = range(5) + validate_folder_numbers = list(range(5)) validate_folder_numbers.remove(j) validate_folder_names = [dataset_path.joinpath(str(num)) for num in validate_folder_numbers] - dev_dataset = ChoiDataset(dataset_path , word2vec, folder=True, folders_paths=validate_folder_names) + dev_dataset = ChoiDataset(dataset_path, word2vec, folder=True, folders_paths=validate_folder_names) test_dataset = ChoiDataset(dataset_path, word2vec, folder=True, folders_paths=[dataset_path.joinpath(str(j))]) dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, @@ -179,28 +162,24 @@ def main(args): _, threshold = validate(model, args, j, dev_dl, logger) test_pk = test(model, args, j, test_dl, logger, threshold, test_accuracy) - logger.debug(colored('Cross validation section {} with p_k {} and threshold {}'.format(j, test_pk, threshold),'green')) + logger.debug(colored(f'Cross validation section {j} with p_k {test_pk} and threshold {threshold}', 'green')) cross_validation_pk, _ = test_accuracy.calc_accuracy() - print ('Final cross validaiton Pk is: ' + str(cross_validation_pk)) - logger.debug( - colored('Final cross validaiton Pk is: {}'.format(cross_validation_pk), 'green')) - - + print(f'Final cross validation Pk is: {cross_validation_pk}') + logger.debug(colored(f'Final cross validation Pk is: {cross_validation_pk}', 'green')) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') - parser.add_argument('--test', help='Test mode? (e.g fake word2vec)', action='store_true') + parser.add_argument('--test', help='Test mode? (e.g., fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) - parser.add_argument('--test_bs', help='Batch size', type=int, default=5) - parser.add_argument('--load_from', help='Location of a .t7 model file to load. 
Training will continue') + parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) + parser.add_argument('--load_from', help='Location of a .t7 model file to load') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') - parser.add_argument('--stop_after', help='Number of batches to stop after', default=None, type=int) + parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) parser.add_argument('--config', help='Path to config.json', default='config.json') - parser.add_argument('--window_size', help='Window size to encode setence', type=int, default=1) - parser.add_argument('--num_workers', help='How many workers to use for data loading', type=int, default=0) + parser.add_argument('--window_size', help='Window size to encode sentence', type=int, default=1) + parser.add_argument('--num_workers', help='Number of workers for data loading', type=int, default=0) parser.add_argument('--flat_choi', help='Path to flat choi dataset') - - main(parser.parse_args()) + main(parser.parse_args()) \ No newline at end of file diff --git a/tests.py b/tests.py index 6fc40f2..5fe5dce 100644 --- a/tests.py +++ b/tests.py @@ -1,75 +1,59 @@ -from __future__ import print_function - -from unittest import TestCase -from utils import unsort import unittest +from utils import unsort import accuracy import numpy as np import text_manipulation -class LoaderTests(TestCase): - def testReallyTrivial(self): - assert 1 + 1 == 2 +class LoaderTests(unittest.TestCase): + def test_really_trivial(self): + self.assertEqual(1 + 1, 2) class PkTests(unittest.TestCase): def test_get_boundaries(self): - sentences_class = [] - sentences_class.append(("first sen.", 1)) - sentences_class.append(("sec sen.", 1)) - sentences_class.append(("third sen.", 0)) - sentences_class.append(("forth sen.", 1)) - sentences_class.append(("fifth sen.", 0)) - sentences_class.append(("sixth sen.", 0)) - sentences_class.append(("seventh sen.", 1)) - + sentences_class = [ + ("first sen.", 1), + ("sec sen.", 1), + ("third sen.", 0), + ("forth sen.", 1), + ("fifth sen.", 0), + ("sixth sen.", 0), + ("seventh sen.", 1) + ] expected = [2, 2, 4, 6] result = accuracy.get_seg_boundaries(sentences_class) - - for i, num in enumerate(result): - self.assertTrue(num == expected[i]) + self.assertEqual(result, expected) def test_get_boundaries2(self): - sentences_class = [] - sentences_class.append(("first sen is 5 words.", 0)) - sentences_class.append(("sec sen.", 0)) - sentences_class.append(("third sen is a very very very long sentence.", 1)) - sentences_class.append(("the forth one is single segment.", 1)) - - + sentences_class = [ + ("first sen is 5 words.", 0), + ("sec sen.", 0), + ("third sen is a very very very long sentence.", 1), + ("the forth one is a single segment.", 1) + ] expected = [16, 6] result = accuracy.get_seg_boundaries(sentences_class) - - for i, num in enumerate(result): - self.assertTrue(num == expected[i]) - - def test_pk_perefct_seg(self): - sentences_class = [] - sentences_class.append(("first sen is 5 words.", 0)) - sentences_class.append(("sec sen.", 0)) - sentences_class.append(("third sen is a very very very long sentence.", 1)) - sentences_class.append(("the forth one is single segment.", 1)) - + self.assertEqual(result, expected) + + def test_pk_perfect_seg(self): + sentences_class = [ + ("first sen is 5 words.", 0), + ("sec sen.", 0), + ("third sen is a very very very long sentence.", 1), + ("the forth one is a single segment.", 1) + ] gold = 
accuracy.get_seg_boundaries(sentences_class) h = accuracy.get_seg_boundaries(sentences_class) - # with specified window size - for window_size in range(1, 15, 1): + for window_size in range(1, 15): acc = accuracy.pk(gold, h, window_size=window_size) - self.assertEquals(acc, 1) + self.assertEqual(acc, 1) - # with default window size acc = accuracy.pk(gold, h) - self.assertEquals(acc, 1) + self.assertEqual(acc, 1) def test_pk_false_neg(self): - h = [] - h.append(("5 words sentence of data.", 0)) - h.append(("2 sentences same seg.", 1)) - - gold = [] - gold.append(("5 words sentence of data.", 1)) - gold.append(("2 sentences same seg.", 1)) - + h = [("5 words sentence of data.", 0), ("2 sentences same seg.", 1)] + gold = [("5 words sentence of data.", 1), ("2 sentences same seg.", 1)] gold = accuracy.get_seg_boundaries(gold) h = accuracy.get_seg_boundaries(h) @@ -77,75 +61,73 @@ def test_pk_false_neg(self): window_size = 3 comparison_count = 6 - # with default window size acc = accuracy.pk(gold, h) - self.assertEquals(acc, window_size / comparison_count) + self.assertEqual(acc, window_size / comparison_count) window_size = 4 acc = accuracy.pk(gold, h) - self.assertEquals(acc, window_size / comparison_count) + self.assertEqual(acc, window_size / comparison_count) def test_windiff(self): - h = [] - h.append(("5 words sentence of data.", 0)) - h.append(("short.", 1)) - h.append(("extra segmented sen.", 1)) - h.append(("last and very very very very very long sen.", 1)) - - - gold = [] - gold.append(("5 words sentence of data.", 1)) - gold.append(("short.", 1)) - gold.append(("extra segmented sen.", 0)) - gold.append(("last and very very very very very long sen.", 1)) - + h = [ + ("5 words sentence of data.", 0), + ("short.", 1), + ("extra segmented sen.", 1), + ("last and very very very very very long sen.", 1) + ] + + gold = [ + ("5 words sentence of data.", 1), + ("short.", 1), + ("extra segmented sen.", 0), + ("last and very very very very very long sen.", 1) + ] gold = accuracy.get_seg_boundaries(gold) h = accuracy.get_seg_boundaries(h) window_size = 3 + acc = accuracy.win_diff(gold, h, window_size=window_size) + self.assertEqual(float(acc), 0.6) - acc = accuracy.win_diff(gold, h, window_size = window_size) - self.assertEquals(float(acc), 0.6) - window_size = 5 - expected = float(1)- float(8) / 13 + expected = 1 - 8 / 13 acc = accuracy.win_diff(gold, h, window_size=window_size) - self.assertEquals("{0:.5f}".format(float(acc)), "{0:.5f}".format(expected)) - + self.assertAlmostEqual(float(acc), expected, places=5) -class UnsortTests(TestCase): +class UnsortTests(unittest.TestCase): def test_unsort(self): x = np.random.randint(0, 100, 10) sort_order = np.argsort(x) unsort_order = unsort(sort_order) - assert np.all(x[sort_order][unsort_order] == x) - - -class SentenceTokenizerTests(TestCase): - def test_a_little(self): - a = text_manipulation.split_sentences(u"Hello, Mr. Trump, how do you do? What? Where? I don't i.e e.g Russia.") - assert a == [u'Hello, Mr. Trump, how do you do?', - u'What?', - u'Where?', - u"I don't i.e e.g Russia."] + np.testing.assert_array_equal(x[sort_order][unsort_order], x) + +class SentenceTokenizerTests(unittest.TestCase): + def test_split_sentences(self): + text = u"Hello, Mr. Trump, how do you do? What? Where? I don't i.e e.g Russia." + expected = [ + u'Hello, Mr. Trump, how do you do?', + u'What?', + u'Where?', + u"I don't i.e e.g Russia." 
+ ] + result = text_manipulation.split_sentences(text) + self.assertEqual(result, expected) def test_linebreaks(self): text = u'''Line one. Still line one. Line two. Can I span two lines?''' - a = text_manipulation.split_sentences(text) - print(a) - assert a == [u'Line one.', - u'Still line one.', - u'Line two.', - u'Can I span\n two lines?'] - - - - + expected = [ + u'Line one.', + u'Still line one.', + u'Line two.', + u'Can I span\n two lines?' + ] + result = text_manipulation.split_sentences(text) + self.assertEqual(result, expected) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/text_manipulation.py b/text_manipulation.py index 238a3d9..50b738f 100644 --- a/text_manipulation.py +++ b/text_manipulation.py @@ -1,15 +1,16 @@ import nltk.data -import exceptions import numpy as np from nltk.tokenize import RegexpTokenizer import wiki_utils import wiki_thresholds import utils +import logging +# Initialize global variables sentence_tokenizer = None words_tokenizer = None -missing_stop_words = set(['of', 'a', 'and', 'to']) -logger = utils.setup_logger(__name__, 'text_manipulation.log', True ) +missing_stop_words = {'of', 'a', 'and', 'to'} +logger = utils.setup_logger(__name__, 'text_manipulation.log', delete_old=True) def get_punkt(): @@ -19,7 +20,7 @@ def get_punkt(): try: tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') - except exceptions.LookupError: + except LookupError: nltk.download('punkt') tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') @@ -28,90 +29,85 @@ def get_punkt(): def get_words_tokenizer(): global words_tokenizer - if words_tokenizer: return words_tokenizer words_tokenizer = RegexpTokenizer(r'\w+') return words_tokenizer - - def split_sentence_with_list(sentence): - list_pattern = "\n" + wiki_utils.get_list_token() + "." - if sentence.endswith( list_pattern ): - #splited_sentence = [str for str in sentence.encode('utf-8').split("\n" + wiki_utils.get_list_token() + ".") if len(str) > 0] - splited_sentence = [str for str in sentence.split("\n" + wiki_utils.get_list_token() + ".") if - len(str) > 0] - splited_sentence.append(wiki_utils.get_list_token() + ".") - return splited_sentence + if sentence.endswith(list_pattern): + split_sentence = [s for s in sentence.split(list_pattern) if len(s) > 0] + split_sentence.append(wiki_utils.get_list_token() + ".") + return split_sentence else: return [sentence] -def split_sentece_colon_new_line(sentence): - - splited_sentence = sentence.split(":\n") - if (len(splited_sentence) == 1): - return splited_sentence +def split_sentence_colon_new_line(sentence): + split_sentence = sentence.split(":\n") + if len(split_sentence) == 1: + return split_sentence + new_sentences = [] - # -1 . 
not to add ":" to last sentence - for i in range(len(splited_sentence) - 1): - if (len(splited_sentence[i]) > 0): - new_sentences.append(splited_sentence[i] + ":") - if (len(splited_sentence[-1]) > 0): - new_sentences.append(splited_sentence[-1]) + for i in range(len(split_sentence) - 1): + if len(split_sentence[i]) > 0: + new_sentences.append(split_sentence[i] + ":") + + if len(split_sentence[-1]) > 0: + new_sentences.append(split_sentence[-1]) + return new_sentences -def split_long_sentences_with_backslash_n(max_words_in_sentence,sentences, doc_id): +def split_long_sentences_with_backslash_n(max_words_in_sentence, sentences, doc_id): new_sentences = [] for sentence in sentences: sentence_words = extract_sentence_words(sentence) if len(sentence_words) > max_words_in_sentence: - splitted_sentences = sentence.split('\n') - if len(splitted_sentences) > 1: - logger.info("Sentence with backslash was splitted. Doc Id: " + str(doc_id) +" Sentence: " + sentence) - new_sentences.extend(splitted_sentences ) + split_sentences = sentence.split('\n') + if len(split_sentences) > 1: + logger.info(f"Sentence with backslash was split. Doc Id: {doc_id} Sentence: {sentence}") + new_sentences.extend(split_sentences) else: if "\n" in sentence: - logger.info("No split for sentence with backslash n. Doc Id: " + str(doc_id) +" Sentence: " + sentence) + logger.info(f"No split for sentence with backslash n. Doc Id: {doc_id} Sentence: {sentence}") new_sentences.append(sentence) return new_sentences def split_sentences(text, doc_id): sentences = get_punkt().tokenize(text) - senteces_list_fix = [] + sentences_list_fixed = [] for sentence in sentences: - seplited_list_sentence = split_sentence_with_list(sentence) - senteces_list_fix.extend(seplited_list_sentence) + split_list_sentence = split_sentence_with_list(sentence) + sentences_list_fixed.extend(split_list_sentence) - sentence_colon_fix = [] - for sentence in senteces_list_fix: - splitted_colon_sentence = split_sentece_colon_new_line(sentence) - sentence_colon_fix.extend(splitted_colon_sentence) - - sentences_without_backslash_n = split_long_sentences_with_backslash_n(wiki_thresholds.max_words_in_sentence_with_backslash_n, sentence_colon_fix, doc_id) - - ret_sentences = [] - for sentence in sentences_without_backslash_n: - ret_sentences.append(sentence.replace('\n',' ')) + sentences_colon_fixed = [] + for sentence in sentences_list_fixed: + split_colon_sentence = split_sentence_colon_new_line(sentence) + sentences_colon_fixed.extend(split_colon_sentence) + sentences_no_backslash_n = split_long_sentences_with_backslash_n( + wiki_thresholds.max_words_in_sentence_with_backslash_n, + sentences_colon_fixed, + doc_id + ) + ret_sentences = [sentence.replace('\n', ' ') for sentence in sentences_no_backslash_n] return ret_sentences -def extract_sentence_words(sentence, remove_missing_emb_words = False,remove_special_tokens = False): - if (remove_special_tokens): +def extract_sentence_words(sentence, remove_missing_emb_words=False, remove_special_tokens=False): + if remove_special_tokens: for token in wiki_utils.get_special_tokens(): - # Can't do on sentence words because tokenizer delete '***' of tokens. 
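# Special tokens are stripped from the raw sentence string here, before tokenization,
# because the \w+ word tokenizer discards their '***' delimiters and would otherwise
# leave stray fragments in the resulting word list.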
sentence = sentence.replace(token, "") + tokenizer = get_words_tokenizer() sentence_words = tokenizer.tokenize(sentence) + if remove_missing_emb_words: sentence_words = [w for w in sentence_words if w not in missing_stop_words] return sentence_words - def word_model(word, model): if model is None: return np.random.randn(1, 300) @@ -119,6 +115,5 @@ def word_model(word, model): if word in model: return model[word].reshape(1, 300) else: - #print ('Word missing w2v: ' + word) - return model['UNK'].reshape(1, 300) - + # If word not in model, return 'UNK' embedding + return model['UNK'].reshape(1, 300) \ No newline at end of file diff --git a/times_profiler.py b/times_profiler.py index 6875a50..b79d73c 100644 --- a/times_profiler.py +++ b/times_profiler.py @@ -1,36 +1,41 @@ from timeit import default_timer as timer -class profiler(): - +class profiler: segments = [] start = 0 end = 0 - @staticmethod - def set (): - + def set(): + """ + Mark the end of a segment and start the timer for the next segment. + """ profiler.end = timer() profiler.segments.append(profiler.end - profiler.start) profiler.start = timer() - return - @staticmethod def init(): + """ + Initialize the profiler by starting the timer. + """ profiler.start = timer() - return - @staticmethod def finish(profilerLog): + """ + Finish profiling and log the results to the provided logger. + + Args: + profilerLog: A logger object to which profiling results will be logged. + """ profiler.end = timer() profiler.segments.append(profiler.end - profiler.start) - str2log = "" - for i in range(len(profiler.segments)): - str2log += str(i) +"-"+str(i+1)+" = " + "{:.2f}".format(profiler.segments[i]) + " " + + # Format the results for logging + str2log = " ".join([f"{i}-{i+1} = {segment:.2f}" for i, segment in enumerate(profiler.segments)]) profilerLog.debug(str2log) - profiler.segments = [] - return + # Clear the segments after logging + profiler.segments = [] \ No newline at end of file diff --git a/utils.py b/utils.py index 351f58c..cce93b1 100644 --- a/utils.py +++ b/utils.py @@ -3,37 +3,29 @@ import sys import numpy as np import random -from pathlib2 import Path +from pathlib import Path # Updated to use pathlib (pathlib2 is not needed in Python 3) from shutil import copy - - config = {} - def read_config_file(path='config.json'): global config - with open(path, 'r') as f: config.update(json.load(f)) - def maybe_cuda(x, is_cuda=None): global config - if is_cuda is None and 'cuda' in config: is_cuda = config['cuda'] - if is_cuda: return x.cuda() return x - -def setup_logger(logger_name, filename, delete_old = False): +def setup_logger(logger_name, filename, delete_old=False): logger = logging.getLogger(logger_name) logger.setLevel(logging.DEBUG) stderr_handler = logging.StreamHandler(sys.stderr) - file_handler = logging.FileHandler(filename, mode='w') if delete_old else logging.FileHandler(filename) + file_handler = logging.FileHandler(filename, mode='w' if delete_old else 'a') file_handler.setLevel(logging.DEBUG) stderr_handler.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') @@ -43,134 +35,80 @@ def setup_logger(logger_name, filename, delete_old = False): logger.addHandler(file_handler) return logger - def unsort(sort_order): result = [-1] * len(sort_order) - for i, index in enumerate(sort_order): result[index] = i - return result -class f1(object): - - def __init__(self,ner_size): +class F1: + def __init__(self, ner_size): self.ner_size = ner_size - self.tp = np.array([0] * (ner_size +1)) - 
self.fp = np.array([0] * (ner_size +1)) - self.fn = np.array([0] * (ner_size +1)) - - def add(self,preds,targets,length): - tp = self.tp - fp = self.fp - fn = self.fn - ner_size = self.ner_size - - prediction = np.argmax(preds, 2) + self.tp = np.zeros(ner_size + 1) + self.fp = np.zeros(ner_size + 1) + self.fn = np.zeros(ner_size + 1) + def add(self, preds, targets, length): + prediction = np.argmax(preds, axis=2) for i in range(len(targets)): for j in range(length[i]): if targets[i, j] == prediction[i, j]: - tp[targets[i, j]] += 1 + self.tp[targets[i, j]] += 1 else: - fp[targets[i, j]] += 1 - fn[prediction[i, j]] += 1 + self.fp[targets[i, j]] += 1 + self.fn[prediction[i, j]] += 1 - unnamed_entity = ner_size - 1 - for i in range(ner_size): + unnamed_entity = self.ner_size - 1 + for i in range(self.ner_size): if i != unnamed_entity: - tp[ner_size] += tp[i] - fp[ner_size] += fp[i] - fn[ner_size] += fn[i] - + self.tp[self.ner_size] += self.tp[i] + self.fp[self.ner_size] += self.fp[i] + self.fn[self.ner_size] += self.fn[i] def score(self): - tp = self.tp - fp = self.fp - fn = self.fn - ner_size = self.ner_size - - precision = [] - recall = [] - fscore = [] - for i in range(ner_size + 1): - precision.append(tp[i] * 1.0 / (tp[i] + fp[i])) - recall.append(tp[i] * 1.0 / (tp[i] + fn[i])) - fscore.append(2.0 * precision[i] * recall[i] / (precision[i] + recall[i])) + precision = np.divide(self.tp, self.tp + self.fp, out=np.zeros_like(self.tp), where=self.tp + self.fp != 0) + recall = np.divide(self.tp, self.tp + self.fn, out=np.zeros_like(self.tp), where=self.tp + self.fn != 0) + fscore = 2 * precision * recall / (precision + recall + 1e-8) # Avoid division by zero print(fscore) + return fscore[self.ner_size] - return fscore[ner_size] - - -class predictions_analysis(object): - +class predictions_analysis: def __init__(self): self.tp = 0 self.tn = 0 self.fp = 0 self.fn = 0 - - def add(self,predicions, targets): - self.tp += ((predicions == targets) & (1 == predicions)).sum() - self.tn += ((predicions == targets) & (0 == predicions)).sum() - self.fp += ((predicions != targets) & (1 == predicions)).sum() - self.fn += ((predicions != targets) & (0 == predicions)).sum() - + def add(self, predictions, targets): + self.tp += ((predictions == targets) & (predictions == 1)).sum() + self.tn += ((predictions == targets) & (predictions == 0)).sum() + self.fp += ((predictions != targets) & (predictions == 1)).sum() + self.fn += ((predictions != targets) & (predictions == 0)).sum() def calc_recall(self): - if self.tp == 0 and self.fn == 0: - return -1 - - return np.true_divide(self.tp, self.tp + self.fn) + return np.divide(self.tp, self.tp + self.fn) if self.tp + self.fn != 0 else -1 def calc_precision(self): - if self.tp == 0 and self.fp == 0: - return -1 - - return np.true_divide(self.tp,self.tp + self.fp) - - - + return np.divide(self.tp, self.tp + self.fp) if self.tp + self.fp != 0 else -1 def get_f1(self): - if (self.tp + self.fp == 0): - return 0.0 - if (self.tp + self.fn == 0): + if self.tp + self.fp == 0 or self.tp + self.fn == 0: return 0.0 precision = self.calc_precision() recall = self.calc_recall() - if (not ((precision + recall) == 0)): - f1 = 2*(precision*recall) / (precision + recall) - else: - f1 = 0.0 - - return f1 + return 2 * precision * recall / (precision + recall + 1e-8) if precision + recall != 0 else 0.0 def get_accuracy(self): - total = self.tp + self.tn + self.fp + self.fn - if (total == 0) : - return 0.0 - else: - return np.true_divide(self.tp + self.tn, total) - + return 
np.divide(self.tp + self.tn, total) if total != 0 else 0.0 def reset(self): - self.tp = 0 - self.tn = 0 - self.fn = 0 - self.fp = 0 - + self.tp = self.tn = self.fp = self.fn = 0 -def get_random_files(count, input_folder, output_folder, specific_section = True): +def get_random_files(count, input_folder, output_folder, specific_section=True): files = Path(input_folder).glob('*/*/*/*') if specific_section else Path(input_folder).glob('*/*/*/*/*') - file_paths = [] - for f in files: - file_paths.append(f) - + file_paths = list(files) random_paths = random.sample(file_paths, count) - for random_path in random_paths: output_path = Path(output_folder).joinpath(random_path.name) - copy(str(random_path), str (output_path)) \ No newline at end of file + copy(random_path, output_path) \ No newline at end of file diff --git a/wiki_extractor.py b/wiki_extractor.py index 3921987..fe7814b 100644 --- a/wiki_extractor.py +++ b/wiki_extractor.py @@ -329,10 +329,10 @@ def subst(self, params, extractor, depth=0): return ''.join([tpl.subst(params, extractor, depth) for tpl in self]) def __str__(self): - return ''.join([unicode(x) for x in self]) + return ''.join([str(x) for x in self]) -class TemplateText(unicode): +class TemplateText(str): """Fixed text of template""" def subst(self, params, extractor, depth): @@ -1361,7 +1361,7 @@ def sharp_expr(expr): expr = re.sub('mod', '%', expr) expr = re.sub('\bdiv\b', '/', expr) expr = re.sub('\bround\b', '|ROUND|', expr) - return unicode(eval(expr)) + return str(eval(expr)) except: return '' @@ -2282,7 +2282,7 @@ def compact(text): def handle_unicode(entity): numeric_code = int(entity[2:-1]) if numeric_code >= 0x10000: return '' - return unichr(numeric_code) + return chr(numeric_code) # ------------------------------------------------------------------------------ diff --git a/wiki_loader.py b/wiki_loader.py index b15fb27..009b901 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -1,43 +1,36 @@ from torch.utils.data import Dataset -from text_manipulation import word_model -from text_manipulation import extract_sentence_words +from text_manipulation import word_model, extract_sentence_words from pathlib2 import Path import re import wiki_utils import os - import utils logger = utils.setup_logger(__name__, 'train.log') section_delimiter = "========" - def get_files(path): all_objects = Path(path).glob('**/*') files = [str(p) for p in all_objects if p.is_file()] return files - def get_cache_path(wiki_folder): cache_file_path = wiki_folder / 'paths_cache' return cache_file_path - def cache_wiki_filenames(wiki_folder): files = Path(wiki_folder).glob('*/*/*/*') cache_file_path = get_cache_path(wiki_folder) with cache_file_path.open('w+') as f: for file in files: - f.write(unicode(file) + u'\n') - + f.write(str(file) + u'\n') def clean_section(section): cleaned_section = section.strip('\n') return cleaned_section - def get_scections_from_text(txt, high_granularity=True): sections_to_keep_pattern = wiki_utils.get_seperator_foramt() if high_granularity else wiki_utils.get_seperator_foramt( (1, 2)) @@ -50,27 +43,24 @@ def get_scections_from_text(txt, high_granularity=True): sentences = [s for s in txt.strip().split("\n") if len(s) > 0 and s != "\n"] txt = '\n'.join(sentences).strip('\n') - all_sections = re.split(sections_to_keep_pattern, txt) non_empty_sections = [s for s in all_sections if len(s) > 0] return non_empty_sections - def get_sections(path, high_granularity=True): file = open(str(path), "r") raw_content = file.read() file.close() - clean_txt = 
raw_content.decode('utf-8').strip() + clean_txt = raw_content.strip() sections = [clean_section(s) for s in get_scections_from_text(clean_txt, high_granularity)] return sections - def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=False, remove_special_tokens=False, - return_as_sentences=False, high_granularity=True,only_letters = False): + return_as_sentences=False, high_granularity=True, only_letters=False): data = [] targets = [] all_sections = get_sections(path, high_granularity) @@ -89,10 +79,9 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals if 1 <= len(sentence_words): data.append([word_model(word, word2vec) for word in sentence_words]) else: - #raise ValueError('Sentence in wikipedia file is empty') logger.info('Sentence in wikipedia file is empty') else: # for the annotation. keep sentence as is. - if (only_letters): + if only_letters: sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence) data.append(sentence) else: @@ -102,20 +91,21 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals return data, targets, path - class WikipediaDataSet(Dataset): def __init__(self, root, word2vec, train=True, manifesto=False, folder=False, high_granularity=False): - - if (manifesto): + if manifesto: self.textfiles = list(Path(root).glob('*')) else: - if (folder): + if folder: self.textfiles = get_files(root) else: root_path = Path(root) cache_path = get_cache_path(root_path) if not cache_path.exists(): + print('Creating cache....') cache_wiki_filenames(root_path) + else: + print(f'Cache exists at {cache_path}') self.textfiles = cache_path.read_text().splitlines() if len(self.textfiles) == 0: @@ -127,9 +117,8 @@ def __init__(self, root, word2vec, train=True, manifesto=False, folder=False, hi def __getitem__(self, index): path = self.textfiles[index] - return read_wiki_file(Path(path), self.word2vec, ignore_list=True, remove_special_tokens=True, high_granularity=self.high_granularity) def __len__(self): - return len(self.textfiles) + return len(self.textfiles) \ No newline at end of file diff --git a/wiki_processor.py b/wiki_processor.py index fadadba..ba661c8 100644 --- a/wiki_processor.py +++ b/wiki_processor.py @@ -3,114 +3,87 @@ import subprocess import re from pathlib2 import Path -from random import shuffle,seed,uniform +from random import shuffle, seed, uniform import math -from shutil import move +from shutil import move import utils import wiki_utils import text_manipulation import wiki_thresholds import json - -logger = utils.setup_logger(__name__, 'processor_log.log', True ) +logger = utils.setup_logger(__name__, 'processor_log.log', True) doc_split_delimiter = "" id_parts = 7 -# minimal number of sentences in document (used to filter non informal documents such as https://en.wikipedia.org/wiki?curid=32283 seed(1234) -wikipedia_namespaces = ['Category', 'File', 'Ru', 'Wikipedia', 'Talk', 'User', 'MediaWiki', 'Template', 'Help', 'Portal', 'Book', 'Draft', - 'Education Program', 'TimedText', 'Module', 'Gadget', 'Gadget definition', 'Media', 'Special'] - -disambigutaiton_pattern = '(disambiguation)' - +wikipedia_namespaces = ['Category', 'File', 'Ru', 'Wikipedia', 'Talk', 'User', 'MediaWiki', 'Template', 'Help', 'Portal', + 'Book', 'Draft', 'Education Program', 'TimedText', 'Module', 'Gadget', 'Gadget definition', + 'Media', 'Special'] -global num_sentneces_for_avg -global sum_sentneces_for_avg -num_sentneces_for_avg = 0 -sum_sentneces_for_avg = 0 +disambiguation_pattern = '(disambiguation)' +global 
num_sentences_for_avg +global sum_sentences_for_avg +num_sentences_for_avg = 0 +sum_sentences_for_avg = 0 -def count_str_occurrences(str,findStr): - - return len(str.split(findStr)) - 1 +def count_str_occurrences(text, findStr): + return len(text.split(findStr)) - 1 def get_file_path(id): - chopped_id = [] - id_str = str(id) - padding_count = id_parts - len(id_str) - while padding_count > 0: - id_str = "0" + id_str - padding_count-= 1 - - for i in range(0,3): - chopped_id.append(id_str[:2]) - id_str = id_str[2:] - - path = "" - for sub_path in chopped_id: - path =os.path.join(path, sub_path) - return path + id_str = str(id).zfill(id_parts) + return os.path.join(id_str[:2], id_str[2:4], id_str[4:6]) def process_header(header): id_match = re.search(r'', header) title = title_match.groups()[0] - not_valid = title.isdigit() or any(title.startswith(prefix + ':' or prefix + ' talk:' ) for prefix in wikipedia_namespaces) or title.endswith(disambigutaiton_pattern) + not_valid = title.isdigit() or any(title.startswith(prefix + ':' or prefix + ' talk:') + for prefix in wikipedia_namespaces) or title.endswith(disambiguation_pattern) return id, not not_valid def get_sections(content): lines = content.split('\n') section = "" - # sections include headers - sections = [] - sections.append(wiki_utils.get_segment_seperator(1,"preface.")) + sections = [wiki_utils.get_segment_separator(1, "preface.")] for line in lines: - if (wiki_utils.is_seperator_line(line)): + if wiki_utils.is_separator_line(line): if len(section) > 0: sections.append(section) section = "" sections.append(line) - else: - section += line - section += '\n' + section += line + '\n' if len(section) > 0: sections.append(section) return sections - - def process_section(section, id): - global num_sentneces_for_avg - global sum_sentneces_for_avg + global num_sentences_for_avg, sum_sentences_for_avg sentences = text_manipulation.split_sentences(section, id) section_sentences = [] - num_lists = 0 - num_sentences = 0 - num_formulas = 0 - num_codes = 0 + num_lists, num_sentences, num_formulas, num_codes = 0, 0, 0, 0 last_sentence_was_list = False + for sentence in sentences: is_list_sentence = wiki_utils.get_list_token() + "." 
== sentence.encode('utf-8') if '\n' in sentence: - logger.info("DocId: " + str(id) + " back slash in sentence: " + sentence) - if (wiki_utils.get_list_token() in sentence) and (wiki_utils.get_list_token() + ".") != sentence.encode('utf-8'): - # TODO: delete this if section, since it is not suupposed to happen any more - but still happen + logger.info(f"DocId: {id} backslash in sentence: {sentence}") + if wiki_utils.get_list_token() in sentence and (wiki_utils.get_list_token() + ".") != sentence.encode('utf-8'): num_lists += 1 last_sentence_was_list = True - logger.info("DocId: " + str(id) + " Special case 1: " + sentence) + logger.info(f"DocId: {id} Special case 1: {sentence}") continue elif is_list_sentence: - if (last_sentence_was_list): + if last_sentence_was_list: continue last_sentence_was_list = True num_lists += 1 @@ -118,84 +91,75 @@ def process_section(section, id): last_sentence_was_list = False sentence_words = text_manipulation.extract_sentence_words(sentence) if len(sentence_words) < wiki_thresholds.min_words_in_sentence: - # ignore this sentence continue - sum_sentneces_for_avg += len(sentence_words) - num_sentneces_for_avg += 1 - + sum_sentences_for_avg += len(sentence_words) + num_sentences_for_avg += 1 num_formulas += count_str_occurrences(sentence, wiki_utils.get_formula_token()) - num_codes += count_str_occurrences(sentence, wiki_utils.get_codesnipet_token()) + num_codes += count_str_occurrences(sentence, wiki_utils.get_codesnippet_token()) num_sentences += 1 section_sentences.append(sentence) - valid_section = True error_message = None - if (num_sentences < wiki_thresholds.min_sentence_in_section): + + if num_sentences < wiki_thresholds.min_sentence_in_section: valid_section = False - error_message = "sentences count in section is too low" + error_message = "Sentences count in section is too low" - if (num_sentences > 0): - lists_perentage = float(num_lists) / float(num_sentences) - if lists_perentage >= wiki_thresholds.max_list_in_section_percentage: + if num_sentences > 0: + lists_percentage = float(num_lists) / float(num_sentences) + if lists_percentage >= wiki_thresholds.max_list_in_section_percentage: valid_section = False - error_message = "list percentage in section is too high: " + str(lists_perentage) + error_message = f"List percentage in section is too high: {lists_percentage}" - section_text = ''.join(section_sentences) + section_text = ''.join(section_sentences) if len(re.findall('[a-zA-Z]', section_text)) < wiki_thresholds.min_section_char_count: valid_section = False - error_message = "char count in section is too low" + error_message = "Char count in section is too low" if num_formulas >= wiki_thresholds.max_section_formulas_count: valid_section = False - error_message = "number of formulas in section is too high: " + str(num_formulas) + error_message = f"Number of formulas in section is too high: {num_formulas}" - if num_codes >= wiki_thresholds.max_section_code_snipet_count: + if num_codes >= wiki_thresholds.max_section_code_snippet_count: valid_section = False - error_message = "number of code snippets in section is too high: " + str(num_codes) - + error_message = f"Number of code snippets in section is too high: {num_codes}" return valid_section, section_sentences, error_message def is_valid_article(valid_section_count, section_count): if valid_section_count < wiki_thresholds.min_valid_section_count: - return False, "Valid section count is too low: " + str(valid_section_count) + return False, f"Valid section count is too low: {valid_section_count}" - 
valid_section_percentage = float(valid_section_count) / float (section_count) + valid_section_percentage = float(valid_section_count) / float(section_count) if valid_section_percentage < wiki_thresholds.min_valid_section_percentage: - return False, "Valid section percentage is too low: " + str(valid_section_percentage) - - - return True,"" - + return False, f"Valid section percentage is too low: {valid_section_percentage}" + return True, "" def max_level_in_article(content): - max_lavel = -1 + max_level = -1 for line in content: - if (wiki_utils.is_seperator_line(line)): + if wiki_utils.is_separator_line(line): current_level = wiki_utils.get_segment_level(line) - if current_level > max_lavel: - max_lavel = current_level - return max_lavel - + if current_level > max_level: + max_level = current_level + return max_level def delete_empty_segment_headers(content): num_of_deletions = 0 max_level = max_level_in_article(content) - for handle_level in range(max_level,0,-1): + for handle_level in range(max_level, 0, -1): last_section_level = -1 last_section_header = True - for i in range(len(content) -1 , -1 , -1): + for i in range(len(content) - 1, -1, -1): section = content[i] - if (wiki_utils.is_seperator_line(section)): + if wiki_utils.is_separator_line(section): section_level = wiki_utils.get_segment_level(section) - if (section_level == handle_level): - - # empty section if last seciont was also a header - is_empty = last_section_header - if (is_empty & (last_section_level <= section_level)): + if section_level == handle_level: + is_empty = last_section_header + if is_empty and last_section_level <= section_level: del content[i] num_of_deletions += 1 last_section_level = section_level @@ -205,87 +169,57 @@ def delete_empty_segment_headers(content): return content, num_of_deletions - def vec_to_text(sections_with_headers): - adjusted_content = "" - for section in sections_with_headers: - adjusted_content += section + '\n' - return adjusted_content - + return '\n'.join(sections_with_headers) def process_content(content, id): sections_with_headers = get_sections(content) - adjueted_content_text = "" article_lines = [] section_count = 0 valid_section_count = 0 - for i in range(len(sections_with_headers)): - section = sections_with_headers[i] - if wiki_utils.is_seperator_line(section): + + for section in sections_with_headers: + if wiki_utils.is_separator_line(section): article_lines.append(section) else: is_valid_section, section_sentences, message = process_section(section, id) section_count += 1 - if (is_valid_section): + if is_valid_section: valid_section_count += 1 article_lines.extend(section_sentences) else: - logger.info('Invalid section in article id: ' + id + - ' Reason: ' + message + ' Content: ' + vec_to_text(section_sentences).strip('\n') ) + logger.info(f'Invalid section in article id: {id} Reason: {message} Content: {vec_to_text(section_sentences).strip()}') - is_valid,reason = is_valid_article(valid_section_count, section_count ) + is_valid, reason = is_valid_article(valid_section_count, section_count) if is_valid: - article_content,_ = delete_empty_segment_headers(article_lines) - adjueted_content_text = vec_to_text(article_content) - - - return is_valid, adjueted_content_text,reason - - -# old process content, for comparsion -# def process_content(content): -# -# # keep only scetions with minimal number of characters -# sections = [s.strip('\n') for s in content.strip().split(section_delimiter) if -# len(re.findall('[a-zA-Z]', s)) > min_section_length] -# -# # article must have 
at least 3 sections, to avoid articles with only one section which is summaization. E.g: -# # https://en.wikipedia.org/wiki?curid=821470 -# sections_count = len(sections) -# if sections_count < min_article_sections_count or sections_count >= max_article_sections_count: -# return content, False, 'Sections count is: ' + str(sections_count) -# -# # remove first section since it usually the summary of the whole article -# adjueted_content = ('\n' + section_delimiter + '\n').join(sections[1:]) -# -# return adjueted_content, True, "" - + article_content, _ = delete_empty_segment_headers(article_lines) + adjusted_content_text = vec_to_text(article_content) + else: + adjusted_content_text = "" + return is_valid, adjusted_content_text, reason def process_article(article): - non_empty_lines = [l for l in article.strip().split("\n") if l != ""] + non_empty_lines = [l for l in article.strip().split("\n") if l != ""] header = non_empty_lines[0] id, is_valid_header = process_header(header) if not is_valid_header: - logger.info('Invalid header in doc id: ' + str(id)+ ' header: ' + header) + logger.info(f'Invalid header in doc id: {id} header: {header}') return "", id, False content = "\n".join(non_empty_lines[2:]) - is_valid_content, processed_content , debug = process_content(content, id) - if not(is_valid_content): - logger.info('Invalid article in doc id: ' + str(id) + '. ' + debug +'\n\n') + is_valid_content, processed_content, debug = process_content(content, id) + if not is_valid_content: + logger.info(f'Invalid article in doc id: {id}. {debug}\n\n') else: - logger.info('Valid article , id: ' + str(id) +'\n\n') + logger.info(f'Valid article , id: {id}\n\n') return processed_content, id, is_valid_content - -def process_wiki_file(path, output_folder,train_ratio,test_ratio, forbidden_train_ids): - train_size = 0 - dev_size = 0 - test_size = 0 +def process_wiki_file(path, output_folder, train_ratio, test_ratio, forbidden_train_ids): + train_size, dev_size, test_size = 0, 0, 0 with open(path, "r") as file: raw_content = file.read() @@ -295,184 +229,159 @@ def process_wiki_file(path, output_folder,train_ratio,test_ratio, forbidden_trai for article in articles: processed_article, id, is_valid = process_article(article) - processed_articles_count+=1 + processed_articles_count += 1 if not is_valid: - continue; + continue random_num = uniform(0, 1) - if (random_num > train_ratio and random_num <= train_ratio + test_ratio) or int(id) in forbidden_train_ids: + if (random_num > train_ratio and random_num <= train_ratio + test_ratio) or int(id) in forbidden_train_ids: partition = "test" test_size += 1 - elif (random_num > train_ratio + test_ratio): + elif random_num > train_ratio + test_ratio: partition = "dev" dev_size += 1 else: partition = "train" train_size += 1 - output_sub_folder = os.path.join(output_folder,partition, get_file_path(id)) + output_sub_folder = os.path.join(output_folder, partition, get_file_path(id)) if not os.path.exists(output_sub_folder): os.makedirs(output_sub_folder) output_file_path = os.path.join(output_sub_folder, str(id)) with open(output_file_path, "w") as output_file: - output_file.write(processed_article.encode('utf-8'), ) - created_articles_count+=1 - - return created_articles_count, processed_articles_count, train_size,dev_size,test_size + output_file.write(processed_article.encode('utf-8')) + created_articles_count += 1 + return created_articles_count, processed_articles_count, train_size, dev_size, test_size def get_forbidden_train_ids(): - # Return ids of article which must 
be in test set (and not train/dev) with open('wikicities_article_names_to_ids') as f: wiki_cities = json.load(f) with open('wikielements_article_names_to_ids') as f: wiki_elements = json.load(f) - forbidden_train_ids = [] - for k,v in wiki_cities.iteritems(): - forbidden_train_ids.append(int(v)) - for k,v in wiki_elements.iteritems(): - forbidden_train_ids.append(int(v)) - - unique_ids = set(forbidden_train_ids) - - return unique_ids; - - + forbidden_train_ids = [int(v) for d in (wiki_cities, wiki_elements) for v in d.values()] + return set(forbidden_train_ids) def get_wiki_files(path): all_objects = Path(path).glob('**/*') - files = (str(p) for p in all_objects if p.is_file()) - return files + return (str(p) for p in all_objects if p.is_file()) - -def process_wiki_folder(input_folder, output_folder,train_ratio,test_ratio): - total_train_size = 0 - total_dev_size = 0 - total_test_size = 0 - folders = [o for o in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, o))] - total_created_articles = 0 - total_processed_articles = 0 +def process_wiki_folder(input_folder, output_folder, train_ratio, test_ratio): + total_train_size, total_dev_size, total_test_size = 0, 0, 0 + folders = [o for o in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, o))] + total_created_articles, total_processed_articles = 0, 0 previous_debug = 0 forbidden_train_ids = get_forbidden_train_ids() + for folder in folders: full_folder_path = os.path.join(input_folder, folder) if not os.path.exists(output_folder): os.makedirs(output_folder) files = get_wiki_files(full_folder_path) for file in files: - created_articles, processed_articles, train_size, dev_size, test_size = process_wiki_file(file, output_folder, float(train_ratio), float(test_ratio), forbidden_train_ids) + created_articles, processed_articles, train_size, dev_size, test_size = process_wiki_file( + file, output_folder, float(train_ratio), float(test_ratio), forbidden_train_ids + ) total_train_size += train_size total_dev_size += dev_size total_test_size += test_size total_created_articles += created_articles total_processed_articles += processed_articles - if (total_created_articles - previous_debug > 2500): + if total_created_articles - previous_debug > 2500: previous_debug = total_created_articles - print ('Created ' + str(total_created_articles) + ' wiki articles, out of ' + str(total_processed_articles) + ' processed articles') - total_samples = total_train_size + total_dev_size + total_test_size - print 'total_samples = ', str(total_samples) - print "#train = ",total_train_size,"ratio: ","{:.2f}".format(total_train_size / float(total_samples)) - print "#dev = ", total_dev_size,"ratio: ","{:.2f}".format(total_dev_size/ float(total_samples)) - print "#test = ", total_test_size,"ratio: ","{:.2f}".format(total_test_size / float(total_samples)) + print(f'Created {total_created_articles} wiki articles, out of {total_processed_articles} processed articles') + total_samples = total_train_size + total_dev_size + total_test_size + print(f'total_samples = {total_samples}') + print(f"#train = {total_train_size}, ratio: {total_train_size / float(total_samples):.2f}") + print(f"#dev = {total_dev_size}, ratio: {total_dev_size / float(total_samples):.2f}") + print(f"#test = {total_test_size}, ratio: {total_test_size / float(total_samples):.2f}") -def move_wiki_file(src, folder, partition): - # get relative path to inputFolder +def move_wiki_file(src, folder, partition): file = os.path.relpath(src, folder) - - # extract file path in 
train folder - dstFile = os.path.join(folder, partition, file) - dstdir = os.path.dirname(dstFile) + dst_file = os.path.join(folder, partition, file) + dstdir = os.path.dirname(dst_file) if not os.path.exists(dstdir): os.makedirs(dstdir) - move(src, dstFile) - + move(src, dst_file) -def removeEmptyFolders(path, removeRoot=True): +def remove_empty_folders(path, remove_root=True): if not os.path.isdir(path): return - # remove empty subfolders files = os.listdir(path) for f in files: fullpath = os.path.join(path, f) if os.path.isdir(fullpath): - removeEmptyFolders(fullpath) + remove_empty_folders(fullpath) - # if folder empty, delete it files = os.listdir(path) - if len(files) == 0 and removeRoot: - #print "Removing empty folder:", path + if len(files) == 0 and remove_root: os.rmdir(path) - - -def trainTestDev(destFolder, train_size, test_size): +def train_test_dev(dest_folder, train_size, test_size): train_size_ratio = float(train_size) test_size_ratio = float(test_size) dev_size_ratio = 1 - train_size_ratio - test_size_ratio - print (destFolder,train_size,test_size) + print(dest_folder, train_size, test_size) - allFiles = [] - if not os.path.exists(destFolder): - print ("Output folder does not exist") + all_files = [] + if not os.path.exists(dest_folder): + print("Output folder does not exist") return - folders = [o for o in os.listdir(destFolder) if os.path.isdir(os.path.join(destFolder, o))] + folders = [o for o in os.listdir(dest_folder) if os.path.isdir(os.path.join(dest_folder, o))] for folder in folders: - full_folder_path = os.path.join(destFolder, folder) + full_folder_path = os.path.join(dest_folder, folder) files = get_wiki_files(full_folder_path) - allFiles.extend(files) + all_files.extend(files) + shuffle(all_files) - shuffle(allFiles) + train_size = int(math.floor(len(all_files) * train_size_ratio)) + dev_size = int(math.floor(len(all_files) * dev_size_ratio)) - trainSize = int(math.floor(len(allFiles) * train_size_ratio)) - devSize = int(math.floor(len(allFiles) * dev_size_ratio)) - for i in range(0,trainSize): - move_wiki_file(allFiles[i], destFolder, partition="train") + for i in range(train_size): + move_wiki_file(all_files[i], dest_folder, partition="train") - if devSize > 0: - for i in range(trainSize, trainSize + devSize): - move_wiki_file(allFiles[i], destFolder, partition="dev") + if dev_size > 0: + for i in range(train_size, train_size + dev_size): + move_wiki_file(all_files[i], dest_folder, partition="dev") - for i in range(trainSize + devSize,len(allFiles)): - move_wiki_file(allFiles[i], destFolder, partition="test") - print ("#train = ",trainSize) - print ("#dev = ", devSize) - print ("#test = ", len(allFiles) - trainSize -devSize) + for i in range(train_size + dev_size, len(all_files)): + move_wiki_file(all_files[i], dest_folder, partition="test") - removeEmptyFolders(destFolder) + print(f"#train = {train_size}") + print(f"#dev = {dev_size}") + print(f"#test = {len(all_files) - train_size - dev_size}") + remove_empty_folders(dest_folder) -def main (args): - global num_sentneces_for_avg - global sum_sentneces_for_avg +def main(args): + global num_sentences_for_avg, sum_sentences_for_avg if not os.path.exists(args.temp): os.makedirs(args.temp) - # execute extraction of wikipedia dump - cmd = ['python', str(Path(__file__).parent / 'wiki_extractor.py'), '-s', '-o', args.temp, '--article_count', str(args.article_count),'--lists'] - print cmd + + cmd = ['python', str(Path(__file__).parent / 'wiki_extractor.py'), '-s', '-o', args.temp, '--article_count', 
str(args.article_count), '--lists'] + print(cmd) if args.processes: cmd += ['--processes', args.processes] cmd += [args.input] - if not args.no_extractor: + if not args.no_extractor: subprocess.call(cmd) - print ("Finisehd extractor") - - + print("Finished extractor") if not os.path.exists(args.output): os.makedirs(args.output) - # create file per each wiki value from the extracted dump - process_wiki_folder(args.temp, args.output,args.train, args.test) + + process_wiki_folder(args.temp, args.output, args.train, args.test) - print ("Number of processed sentences: " + str(num_sentneces_for_avg)) - print "avg len sentence = " + str(sum_sentneces_for_avg / float(num_sentneces_for_avg)) - print ('done') + print(f"Number of processed sentences: {num_sentences_for_avg}") + print(f"avg len sentence = {sum_sentences_for_avg / float(num_sentences_for_avg)}") + print('done') if __name__ == '__main__': parser = ArgumentParser() @@ -483,6 +392,5 @@ def main (args): parser.add_argument('--output', help='output folder', required=True) parser.add_argument('--train', help='train size ratio', required=True) parser.add_argument('--test', help='test size ratio', required=True) - parser.add_argument("--article_count", help = 'max number of wikipedia articles to extract', default=1000000) - main(parser.parse_args()) - + parser.add_argument('--article_count', help='max number of wikipedia articles to extract', default=1000000) + main(parser.parse_args()) \ No newline at end of file diff --git a/wiki_utils.py b/wiki_utils.py index 32f7d21..8d8bcc8 100644 --- a/wiki_utils.py +++ b/wiki_utils.py @@ -1,22 +1,22 @@ segment_seperator = "========" -def get_segment_seperator(level,name): - return segment_seperator + "," + str(level) + "," +name +def get_segment_seperator(level, name): + return segment_seperator + "," + str(level) + "," + name -def get_seperator_foramt(levels = None): - level_format = '\d' if levels == None else '['+ str(levels[0]) + '-' + str(levels[1]) + ']' - seperator_fromat = segment_seperator + ',' + level_format + ",.*?\." - return seperator_fromat +def get_seperator_foramt(levels=None): + level_format = '\d' if levels is None else '[' + str(levels[0]) + '-' + str(levels[1]) + ']' + separator_format = segment_seperator + ',' + level_format + ",.*?\\." 
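+    # e.g. with the default levels this pattern matches separator lines such as
+    # "========,1,preface."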
+ return separator_format def is_seperator_line(line): return line.startswith(segment_seperator) -def get_segment_level(seperator_line): - return int (seperator_line.split(',')[1]) +def get_segment_level(separator_line): + return int(separator_line.split(',')[1]) -def get_segment_name(seperator_line): - return seperator_line.split(',')[2] +def get_segment_name(separator_line): + return separator_line.split(',')[2] def get_list_token(): return "***LIST***" @@ -28,10 +28,9 @@ def get_codesnipet_token(): return "***codice***" def get_special_tokens(): - special_tokens = [] - special_tokens.append(get_list_token()) - special_tokens.append(get_formula_token()) - special_tokens.append(get_codesnipet_token()) - return special_tokens - - + special_tokens = [ + get_list_token(), + get_formula_token(), + get_codesnipet_token() + ] + return special_tokens \ No newline at end of file From 803025f104a5f7211c7434cd2a35adfff4d8480a Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Mon, 7 Oct 2024 19:27:43 +0530 Subject: [PATCH 02/16] removed logs if sentence is empty --- .gitignore | 1 - config.json | 5 +++++ wiki_loader.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 config.json diff --git a/.gitignore b/.gitignore index de6246e..477c669 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .DS_Store -config.json data/ runs/ diff --git a/config.json b/config.json new file mode 100644 index 0000000..64c1e2a --- /dev/null +++ b/config.json @@ -0,0 +1,5 @@ +{ + "word2vecfile": "/Users/jitesh/Downloads/text-segmentation/data/word2vec/GoogleNews-vectors-negative300.bin", + "choidataset": "/home/omri/code/text-segmentation-2017/data/choi", + "wikidataset": "/Users/jitesh/Downloads/text-segmentation/data/wiki 727" +} \ No newline at end of file diff --git a/wiki_loader.py b/wiki_loader.py index 009b901..c1f74c7 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -79,7 +79,8 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals if 1 <= len(sentence_words): data.append([word_model(word, word2vec) for word in sentence_words]) else: - logger.info('Sentence in wikipedia file is empty') + # logger.info('Sentence in wikipedia file is empty') + continue else: # for the annotation. keep sentence as is. 
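+                    # annotation mode: optionally keep only letters, digits and spaces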
if only_letters: sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence) From 2ce61b243aaad7f80aefa12f09b6632e912fe1e1 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Tue, 8 Oct 2024 19:48:58 +0530 Subject: [PATCH 03/16] code changes for simplified folder structure --- wiki_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wiki_loader.py b/wiki_loader.py index c1f74c7..6483cd2 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -20,12 +20,12 @@ def get_cache_path(wiki_folder): return cache_file_path def cache_wiki_filenames(wiki_folder): - files = Path(wiki_folder).glob('*/*/*/*') + files = str(Path(wiki_folder)) cache_file_path = get_cache_path(wiki_folder) with cache_file_path.open('w+') as f: - for file in files: - f.write(str(file) + u'\n') + for file in os.listdir(files): + f.write(os.path.join(files,file) + u'\n') def clean_section(section): cleaned_section = section.strip('\n') @@ -94,6 +94,7 @@ def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=Fals class WikipediaDataSet(Dataset): def __init__(self, root, word2vec, train=True, manifesto=False, folder=False, high_granularity=False): + if manifesto: self.textfiles = list(Path(root).glob('*')) else: From 117820ced0cbe2318dc98117f1007f21b4199873 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Thu, 10 Oct 2024 11:58:24 +0530 Subject: [PATCH 04/16] added pin memory flag with argparse --- .gitignore | 1 + run.py | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 477c669..9644866 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .DS_Store +.vscode/ data/ runs/ diff --git a/run.py b/run.py index b042079..7dceb50 100644 --- a/run.py +++ b/run.py @@ -1,6 +1,5 @@ import torch from torch.utils.data import DataLoader -from torch.autograd import Variable import torch.nn.functional as F from choiloader import ChoiDataset, collate_fn @@ -175,11 +174,11 @@ def main(args): test_dataset = dataset_class(dataset_path / 'test', word2vec, high_granularity=args.high_granularity) train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, - num_workers=args.num_workers) + num_workers=args.num_workers,pin_memory=args.pin_memory) dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, - num_workers=args.num_workers) + num_workers=args.num_workers,pin_memory=args.pin_memory) test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, - num_workers=args.num_workers) + num_workers=args.num_workers,pin_memory=args.pin_memory) model = import_model(args.model) if args.model else torch.load(open(args.load_from, 'rb')) model = maybe_cuda(model) @@ -190,23 +189,24 @@ def main(args): best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) - torch.save(model, open(checkpoint_path / f'model{j:03d}.t7', 'wb')) + torch.save(model, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) if val_pk < best_val_pk: test_pk = test(model, args, j, test_dl, logger, threshold) logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) best_val_pk = val_pk - torch.save(model, open(checkpoint_path / 'best_model.t7', 'wb')) + torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) else: test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), - 
batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers) + batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) print(test(model, args, 0, test_dl, logger, 0.4)) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') + parser.add_argument('--pin_memory', help='Pin Memory?', action='store_true') parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) From c6fffb019b21d32ec2fc34594facba60fc38f82b Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Fri, 11 Oct 2024 22:38:21 +0530 Subject: [PATCH 05/16] moved part of model code to collate_fn --- .gitignore | 1 + choiloader.py | 39 +++++++++++++++++++++- models/max_sentence_embedding.py | 25 ++------------- run.py | 55 +++++++++++++++++++++++++------- 4 files changed, 86 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 9644866..d8a238c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ .vscode/ data/ runs/ +checkpoints/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/choiloader.py b/choiloader.py index 2085fdc..ce6aad9 100644 --- a/choiloader.py +++ b/choiloader.py @@ -6,6 +6,8 @@ import utils import math from pathlib import Path # Use pathlib, which is built-in with Python 3 +from torch.nn.utils.rnn import pack_padded_sequence +import torch.nn.functional as F logger = utils.setup_logger(__name__, 'train.log') @@ -14,6 +16,37 @@ def get_choi_files(path): files = [str(p) for p in all_objects if p.is_file()] return files + +def custom_pad(s, max_length): + s_length = s.size()[0] + v = utils.maybe_cuda(s.unsqueeze(0).unsqueeze(0)) + padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) + shape = padded.size() + return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) + +def pack_tensor(batch): + + sentences_per_doc = [] + all_batch_sentences = [] + for document in batch: + all_batch_sentences.extend(document) + sentences_per_doc.append(len(document)) + + lengths = [s.size()[0] for s in all_batch_sentences] + sort_order = np.argsort(lengths)[::-1] + sorted_sentences = [all_batch_sentences[i] for i in sort_order] + sorted_lengths = [s.size()[0] for s in sorted_sentences] + + max_length = max(lengths) + logger.debug('Num sentences: %s, max sentence length: %s', + sum(sentences_per_doc), max_length) + + padded_sentences = [custom_pad(s, max_length) for s in sorted_sentences] + big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) + packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) + return packed_tensor,sentences_per_doc,sort_order + + def collate_fn(batch): batched_data = [] batched_targets = [] @@ -43,7 +76,11 @@ def collate_fn(batch): logger.debug('Exception!', exc_info=True) continue - return batched_data, batched_targets, paths + packed_data,sentences_per_doc,sort_order = pack_tensor(batched_data) + + data = (packed_data,sentences_per_doc,sort_order,len(batch)) + + return (data,batched_targets,paths) def clean_paragraph(paragraph): cleaned_paragraph = paragraph.replace("'' ", " ").replace(" 's", "'s").replace("``", "").strip('\n') diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 847d053..8c57bc5 100644 --- 
a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -74,28 +74,9 @@ def pad_document(self, d, max_document_length): padded = F.pad(v, (0, 0, 0, max_document_length - d_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) - - def forward(self, batch): - batch_size = len(batch) - - sentences_per_doc = [] - all_batch_sentences = [] - for document in batch: - all_batch_sentences.extend(document) - sentences_per_doc.append(len(document)) - - lengths = [s.size()[0] for s in all_batch_sentences] - sort_order = np.argsort(lengths)[::-1] - sorted_sentences = [all_batch_sentences[i] for i in sort_order] - sorted_lengths = [s.size()[0] for s in sorted_sentences] - - max_length = max(lengths) - logger.debug('Num sentences: %s, max sentence length: %s', - sum(sentences_per_doc), max_length) - - padded_sentences = [self.pad(s, max_length) for s in sorted_sentences] - big_tensor = torch.cat(padded_sentences, 1) # (max_length, batch size, 300) - packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths, enforce_sorted=False) + + def forward(self, data): + packed_tensor, sentences_per_doc, sort_order,batch_size = data encoded_sentences = self.sentence_encoder(packed_tensor) unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) diff --git a/run.py b/run.py index 7dceb50..3e783ed 100644 --- a/run.py +++ b/run.py @@ -1,5 +1,5 @@ import torch -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Subset import torch.nn.functional as F from choiloader import ChoiDataset, collate_fn @@ -9,6 +9,7 @@ import gensim import utils from tensorboard_logger import configure, log_value +import time import os import sys from pathlib import Path @@ -17,7 +18,7 @@ import numpy as np from termcolor import colored -torch.multiprocessing.set_sharing_strategy('file_system') +# torch.multiprocessing.set_sharing_strategy('file_system') preds_stats = utils.predictions_analysis() @@ -64,6 +65,22 @@ def calc_accuracy(self): return min_pk, min_epoch_windiff, min_threshold +def tensor_size_in_bytes(tensor): + return tensor.numel() * tensor.element_size() + +def compute_batch_size(data): + total_size=0 + + for element in data: + num_sentences = len(element) + + for sentence in element: + total_size += tensor_size_in_bytes(sentence) + + return total_size / (1024**2) + + + def train(model, args, epoch, dataset, logger, optimizer): model.train() total_loss = 0.0 # Changed to float value @@ -74,6 +91,7 @@ def train(model, args, epoch, dataset, logger, optimizer): pbar.update() model.zero_grad() + # data_size = compute_batch_size(data) output = model(data) target_var = maybe_cuda(torch.cat(target, 0), args.cuda) loss = model.criterion(output, target_var) @@ -86,7 +104,7 @@ def train(model, args, epoch, dataset, logger, optimizer): total_loss /= len(dataset) logger.debug(f'Training Epoch: {epoch + 1}, Loss: {total_loss:.4}') - log_value('Training Loss', total_loss, epoch + 1) + # log_value('Training Loss', total_loss, epoch + 1) def validate(model, args, epoch, dataset, logger): model.eval() @@ -104,7 +122,7 @@ def validate(model, args, epoch, dataset, logger): target_seg = targets_var.cpu().numpy() preds_stats.add(output_seg, target_seg) - acc.update(output_softmax.cpu().numpy(), target) + acc.update(output_softmax.detach().cpu().numpy(), target) epoch_pk, epoch_windiff, threshold = acc.calc_accuracy() @@ -161,7 +179,9 @@ def main(args): 
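+    # vars(args) folds the parsed CLI flags into the shared utils.config dict, which is
+    # how helpers such as maybe_cuda() pick up the --cuda setting.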
utils.config.update(vars(args)) # Updated to use vars(args) logger.debug(f'Running with config {utils.config}') - configure(os.path.join('runs', args.expname)) + + # log_dir = os.path.join('runs', args.expname, str(time.time())) + # configure(log_dir) word2vec = None if args.test else gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) @@ -173,6 +193,11 @@ def main(args): dev_dataset = dataset_class(dataset_path / 'dev', word2vec, high_granularity=args.high_granularity) test_dataset = dataset_class(dataset_path / 'test', word2vec, high_granularity=args.high_granularity) + if args.subset: + train_dataset = Subset(train_dataset,range(1000)) + dev_dataset = Subset(dev_dataset,range(1000)) + test_dataset = Subset(test_dataset,range(1000)) + train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, num_workers=args.num_workers,pin_memory=args.pin_memory) dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, @@ -185,6 +210,11 @@ def main(args): optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + if args.benchmark: + for j in range(args.epochs): + train(model, args, j, train_dl, logger, optimizer) + return + if not args.infer: best_val_pk = 1.0 for j in range(args.epochs): @@ -192,11 +222,12 @@ def main(args): torch.save(model, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) - if val_pk < best_val_pk: - test_pk = test(model, args, j, test_dl, logger, threshold) - logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) - best_val_pk = val_pk - torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) + print(f'Current best model from epoch {j} with p_k {val_pk} and threshold {threshold}') + # if val_pk < best_val_pk: + # test_pk = test(model, args, j, test_dl, logger, threshold) + # logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) + # best_val_pk = val_pk + # torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) else: test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), @@ -207,10 +238,12 @@ def main(args): parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') parser.add_argument('--pin_memory', help='Pin Memory?', action='store_true') + parser.add_argument('--subset', help='Use a sample of 1000 rows', action='store_true') + parser.add_argument('--benchmark', help='Use PyTorch profiler', action='store_true') parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) - parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) + parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=1) parser.add_argument('--model', help='Model to run - will import and run') parser.add_argument('--load_from', help='Location of a .t7 model file to load. 
Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') From 0ef4d15c317f244bdadd30cb6b96e84e3695e0ae Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 11:20:09 +0530 Subject: [PATCH 06/16] removed loop from forward method of base LSTM model --- choiloader.py | 4 +- models/from_presentation.py | 11 +--- models/max_sentence_embedding.py | 109 ++++++++++++++----------------- models/single_lstm.py | 11 +--- run.py | 5 +- 5 files changed, 58 insertions(+), 82 deletions(-) diff --git a/choiloader.py b/choiloader.py index ce6aad9..94dfb5f 100644 --- a/choiloader.py +++ b/choiloader.py @@ -19,7 +19,7 @@ def get_choi_files(path): def custom_pad(s, max_length): s_length = s.size()[0] - v = utils.maybe_cuda(s.unsqueeze(0).unsqueeze(0)) + v = s.unsqueeze(0).unsqueeze(0) padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) @@ -78,7 +78,7 @@ def collate_fn(batch): packed_data,sentences_per_doc,sort_order = pack_tensor(batched_data) - data = (packed_data,sentences_per_doc,sort_order,len(batch)) + data = (packed_data,sentences_per_doc,sort_order) return (data,batched_targets,paths) diff --git a/models/from_presentation.py b/models/from_presentation.py index 39e138d..31acece 100644 --- a/models/from_presentation.py +++ b/models/from_presentation.py @@ -9,12 +9,6 @@ logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) -# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. -def zero_state(module, batch_size): - # * 2 is for the two directions - return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ - maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) - class SentenceEncodingRNN(nn.Module): def __init__(self, input_size=300, hidden=128, num_layers=2): super(SentenceEncodingRNN, self).__init__() @@ -30,8 +24,7 @@ def __init__(self, input_size=300, hidden=128, num_layers=2): def forward(self, x): batch_size = x.batch_sizes[0] - s = zero_state(self, batch_size) - _, (hidden, _) = self.lstm(x, s) # (4, batch_size, 128) + _, (hidden, _) = self.lstm(x) # (4, batch_size, 128) transposed = hidden.transpose(0, 1) # (batch_size, 4, 128) reshaped = transposed.contiguous().view(batch_size, -1) @@ -117,7 +110,7 @@ def forward(self, batch): docs_tensor = torch.cat(padded_docs, 1) packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) profiler.set() # 3 - sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) profiler.set() # 4 padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 8c57bc5..a8c4256 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -9,64 +9,36 @@ logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) -# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. 
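+# zero_state() is removed below: nn.LSTM already defaults to zero-filled (h_0, c_0)
+# when no initial hidden state is passed, so the explicit zero tensors are redundant.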
-def zero_state(module, batch_size): - # * 2 is for the two directions - return maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)), \ - maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden)) - -class SentenceEncodingRNN(nn.Module): - def __init__(self, input_size, hidden, num_layers): - super(SentenceEncodingRNN, self).__init__() - self.num_layers = num_layers - self.hidden = hidden - self.input_size = input_size - - self.lstm = nn.LSTM(input_size=self.input_size, - hidden_size=self.hidden, - num_layers=self.num_layers, - dropout=0, - bidirectional=True) - - def forward(self, x): - batch_size = x.batch_sizes[0] - s = zero_state(self, batch_size) - packed_output, _ = self.lstm(x, s) - padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) - - maxes = maybe_cuda(torch.zeros(batch_size, padded_output.size(2))) - for i in range(batch_size): - maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] - - return maxes class Model(nn.Module): - def __init__(self, sentence_encoder, hidden=128, num_layers=2): + def __init__(self, input_size, hidden=128, num_layers=2): super(Model, self).__init__() - self.sentence_encoder = sentence_encoder - - self.sentence_lstm = nn.LSTM(input_size=sentence_encoder.hidden * 2, - hidden_size=hidden, - num_layers=num_layers, - batch_first=True, - dropout=0, - bidirectional=True) - - # We have two labels - self.h2s = nn.Linear(hidden * 2, 2) - - self.num_layers = num_layers + self.input_size = input_size self.hidden = hidden + self.num_layers = num_layers + + + self.sentence_encoder = nn.LSTM( + input_size=self.input_size, + hidden_size=self.hidden, + num_layers=self.num_layers, + dropout=0, + bidirectional=True + ) + + self.sentence_lstm = nn.LSTM( + input_size=self.hidden * 2, + hidden_size=hidden, + num_layers=num_layers, + batch_first=True, + dropout=0, + bidirectional=True + ) + self.h2s = nn.Linear(hidden * 2, 2) self.criterion = nn.CrossEntropyLoss() - def pad(self, s, max_length): - s_length = s.size()[0] - v = maybe_cuda(s.unsqueeze(0).unsqueeze(0)) - padded = F.pad(v, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) - shape = padded.size() - return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) def pad_document(self, d, max_document_length): d_length = d.size()[0] @@ -75,9 +47,32 @@ def pad_document(self, d, max_document_length): shape = padded.size() return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) + + def forward_sentence_encoding(self, x): + # num_sequences = x.batch_sizes[0] + packed_output, _ = self.sentence_encoder(x) + padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) + + # maxes = maybe_cuda(torch.zeros(num_sequences, padded_output.size(2))) + # for i in range(num_sequences): + # maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] + + # Create a mask based on lengths + mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) + mask = maybe_cuda(mask) + + # Mask padded values by setting them to a very negative value (so they don't affect the max computation) + padded_output = padded_output.masked_fill(~mask.unsqueeze(2), float('-inf')) + + # Apply max pooling over the first dimension (time dimension) for each batch + maxes, _ = torch.max(padded_output, dim=0) + + return maxes + def forward(self, data): - packed_tensor, sentences_per_doc, sort_order,batch_size = data - encoded_sentences = self.sentence_encoder(packed_tensor) + packed_tensor, 
sentences_per_doc, sort_order = data + packed_tensor = maybe_cuda(packed_tensor) + encoded_sentences = self.forward_sentence_encoding(packed_tensor) unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) @@ -96,7 +91,7 @@ def forward(self, data): padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] docs_tensor = torch.cat(padded_docs, 1) packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) - sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) doc_outputs = [] @@ -107,10 +102,4 @@ def forward(self, data): sentence_outputs = torch.cat(unsorted_doc_outputs, 0) x = self.h2s(sentence_outputs) - return x - -def create(): - sentence_encoder = SentenceEncodingRNN(input_size=300, - hidden=256, - num_layers=2) - return Model(sentence_encoder, hidden=256, num_layers=2) \ No newline at end of file + return x \ No newline at end of file diff --git a/models/single_lstm.py b/models/single_lstm.py index 72920bf..63f94b2 100644 --- a/models/single_lstm.py +++ b/models/single_lstm.py @@ -9,12 +9,6 @@ logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) -# Removed Variable since it is deprecated in PyTorch. Tensors now automatically track gradients if required. -def zero_state(module, batch_size): - # * 2 is for the two directions - return maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)), \ - maybe_cuda(torch.zeros(module.num_layers, batch_size, module.hidden)) - class SentenceEncodingRNN(nn.Module): def __init__(self, input_size, hidden, num_layers): super(SentenceEncodingRNN, self).__init__() @@ -30,8 +24,7 @@ def __init__(self, input_size, hidden, num_layers): def forward(self, x): batch_size = x.batch_sizes[0] - s = zero_state(self, batch_size) - _, (hidden, _) = self.lstm(x, s) # (4, batch_size, 128) + _, (hidden, _) = self.lstm(x) # (4, batch_size, 128) transposed = hidden.transpose(0, 1) # (batch_size, 4, 128) reshaped = transposed.contiguous().view(batch_size, -1) @@ -117,7 +110,7 @@ def forward(self, batch): docs_tensor = torch.cat(padded_docs, 1) packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) profiler.set() # 3 - sentence_lstm_output, _ = self.sentence_lstm(packed_docs, zero_state(self, batch_size=batch_size)) + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) profiler.set() # 4 padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) diff --git a/run.py b/run.py index 3e783ed..8a3cb52 100644 --- a/run.py +++ b/run.py @@ -17,6 +17,7 @@ import accuracy import numpy as np from termcolor import colored +from models.max_sentence_embedding import Model # torch.multiprocessing.set_sharing_strategy('file_system') @@ -205,7 +206,7 @@ def main(args): test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) - model = import_model(args.model) if args.model else torch.load(open(args.load_from, 'rb')) + model = Model(input_size=300, hidden=256, num_layers=2) model = maybe_cuda(model) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) @@ -244,7 +245,7 @@ def main(args): parser.add_argument('--bs', help='Batch size', 
type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=1) - parser.add_argument('--model', help='Model to run - will import and run') + parser.add_argument('--model', help='Model to run - will import and run',default='max_sentence_embedding') parser.add_argument('--load_from', help='Location of a .t7 model file to load. Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') parser.add_argument('--checkpoint_dir', help='Checkpoint directory', default='checkpoints') From badbfb0d8829f388e30acc01c85ca2aa88c0f895 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 11:53:34 +0530 Subject: [PATCH 07/16] removed loop from forward method of segmentation LSTM model --- models/max_sentence_embedding.py | 60 +++++++++++++++----------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index a8c4256..a2e18e7 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence from utils import maybe_cuda, setup_logger, unsort import numpy as np from times_profiler import profiler @@ -53,10 +53,6 @@ def forward_sentence_encoding(self, x): packed_output, _ = self.sentence_encoder(x) padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256) - # maxes = maybe_cuda(torch.zeros(num_sequences, padded_output.size(2))) - # for i in range(num_sequences): - # maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] - # Create a mask based on lengths mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) mask = maybe_cuda(mask) @@ -69,37 +65,39 @@ def forward_sentence_encoding(self, x): return maxes + + def forward_helper(self, sentences_per_doc, unsorted_encodings): + + # Step 3: Efficiently split the unsorted_encodings into separate documents using tensor operations + sentences_per_doc = maybe_cuda(torch.LongTensor(sentences_per_doc)) + encoded_documents = torch.split(unsorted_encodings, sentences_per_doc.tolist()) + + # Step 4: Calculate maximum document size and pad documents in one go + padded_docs = pad_sequence(encoded_documents, batch_first=True) + + # Step 5: Pack the padded documents for LSTM processing + packed_docs = pack_padded_sequence(padded_docs, sentences_per_doc, batch_first=True, enforce_sorted=False) + + # Step 6: Pass through document-level LSTM + sentence_lstm_output, _ = self.sentence_lstm(packed_docs) + + # Step 7: Unpack the LSTM output + padded_x, _ = pad_packed_sequence(sentence_lstm_output, batch_first=True) + + # Step 8: Select the final hidden states (excluding last prediction) without using a loop + doc_outputs = [padded_x[i, :doc_len-1, :] for i, doc_len in enumerate(sentences_per_doc.tolist())] + + # Step 9: Concatenate the outputs into one tensor + sentence_outputs = torch.cat(doc_outputs, dim=0) + + return sentence_outputs + def forward(self, data): packed_tensor, sentences_per_doc, sort_order = data packed_tensor = maybe_cuda(packed_tensor) encoded_sentences = self.forward_sentence_encoding(packed_tensor) unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) 
unsorted_encodings = encoded_sentences.index_select(0, unsort_order) - - index = 0 - encoded_documents = [] - for sentences_count in sentences_per_doc: - end_index = index + sentences_count - encoded_documents.append(unsorted_encodings[index: end_index, :]) - index = end_index - - doc_sizes = [doc.size()[0] for doc in encoded_documents] - max_doc_size = np.max(doc_sizes) - ordered_document_idx = np.argsort(doc_sizes)[::-1] - ordered_doc_sizes = sorted(doc_sizes)[::-1] - ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] - padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] - docs_tensor = torch.cat(padded_docs, 1) - packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes, enforce_sorted=False) - sentence_lstm_output, _ = self.sentence_lstm(packed_docs) - padded_x, _ = pad_packed_sequence(sentence_lstm_output) # (max sentence len, batch, 256) - - doc_outputs = [] - for i, doc_len in enumerate(ordered_doc_sizes): - doc_outputs.append(padded_x[0:doc_len - 1, i, :]) # -1 to remove last prediction - - unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)] - sentence_outputs = torch.cat(unsorted_doc_outputs, 0) - + sentence_outputs = self.forward_helper(sentences_per_doc, unsorted_encodings) x = self.h2s(sentence_outputs) return x \ No newline at end of file From 05471290785d98f853c9132c6d06a437cca82992 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 12:05:43 +0530 Subject: [PATCH 08/16] minor bug fix --- models/max_sentence_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index a2e18e7..d31d506 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -69,7 +69,7 @@ def forward_sentence_encoding(self, x): def forward_helper(self, sentences_per_doc, unsorted_encodings): # Step 3: Efficiently split the unsorted_encodings into separate documents using tensor operations - sentences_per_doc = maybe_cuda(torch.LongTensor(sentences_per_doc)) + sentences_per_doc = torch.LongTensor(sentences_per_doc) encoded_documents = torch.split(unsorted_encodings, sentences_per_doc.tolist()) # Step 4: Calculate maximum document size and pad documents in one go From 9039a3aff08600e208691bd32bb80c0f8eb7c6a0 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 12 Oct 2024 14:42:50 +0530 Subject: [PATCH 09/16] saving optim and model state --- run.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/run.py b/run.py index 8a3cb52..a5dabca 100644 --- a/run.py +++ b/run.py @@ -153,7 +153,7 @@ def test(model, args, epoch, dataset, logger, threshold): document_sentence_count = len(t) to_idx = int(current_idx + document_sentence_count) - output = (output_softmax.cpu().numpy()[current_idx:to_idx, 1] > threshold) + output = (output_softmax.detach().cpu().numpy()[current_idx:to_idx, 1] > threshold) h = np.append(output, [1]) tt = np.append(t, [1]) @@ -180,10 +180,6 @@ def main(args): utils.config.update(vars(args)) # Updated to use vars(args) logger.debug(f'Running with config {utils.config}') - - # log_dir = os.path.join('runs', args.expname, str(time.time())) - # configure(log_dir) - word2vec = None if args.test else gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True) if not args.infer: @@ -220,15 +216,21 @@ def main(args): best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) - 
torch.save(model, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) + torch.save({ + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict() + }, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) val_pk, threshold = validate(model, args, j, dev_dl, logger) print(f'Current best model from epoch {j} with p_k {val_pk} and threshold {threshold}') - # if val_pk < best_val_pk: - # test_pk = test(model, args, j, test_dl, logger, threshold) - # logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) - # best_val_pk = val_pk - # torch.save(model, open(checkpoint_path / 'best_model.pt', 'wb')) + if val_pk < best_val_pk: + test_pk = test(model, args, j, test_dl, logger, threshold) + logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) + best_val_pk = val_pk + torch.save({ + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict() + }, open(checkpoint_path / f'best_model.pt', 'wb')) else: test_dl = DataLoader(WikipediaDataSet(args.infer, word2vec=word2vec, high_granularity=args.high_granularity), @@ -244,7 +246,7 @@ def main(args): parser.add_argument('--test', help='Test mode? (e.g. fake word2vec)', action='store_true') parser.add_argument('--bs', help='Batch size', type=int, default=8) parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) - parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=1) + parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) parser.add_argument('--model', help='Model to run - will import and run',default='max_sentence_embedding') parser.add_argument('--load_from', help='Location of a .t7 model file to load. 
Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') From b322d285127e9d41cb6f928f891d022019a2ad40 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sun, 13 Oct 2024 13:06:47 +0530 Subject: [PATCH 10/16] added code to remove poorly formatted data --- wiki_loader.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/wiki_loader.py b/wiki_loader.py index 6483cd2..ed9736d 100644 --- a/wiki_loader.py +++ b/wiki_loader.py @@ -123,4 +123,14 @@ def __getitem__(self, index): high_granularity=self.high_granularity) def __len__(self): - return len(self.textfiles) \ No newline at end of file + return len(self.textfiles) + + +if __name__ == "__main__": + root = "/Users/jitesh/Downloads/text-segmentation/data/wiki 727/train" + for path in os.listdir(root): + if path.startswith('paths_'): + continue + all_sections = get_sections(os.path.join(root,path), high_granularity=False) + if len(all_sections) <= 1: + print(os.path.join(root,path)) From 154829e91baa09a7cb1d8057f611bf083538534e Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sun, 13 Oct 2024 13:10:41 +0530 Subject: [PATCH 11/16] removed .cuda() calls in Model calss --- models/max_sentence_embedding.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index d31d506..3231614 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -2,9 +2,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence -from utils import maybe_cuda, setup_logger, unsort -import numpy as np -from times_profiler import profiler +from utils import setup_logger, unsort logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) @@ -55,7 +53,7 @@ def forward_sentence_encoding(self, x): # Create a mask based on lengths mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) - mask = maybe_cuda(mask) + # mask = maybe_cuda(mask) # Mask padded values by setting them to a very negative value (so they don't affect the max computation) padded_output = padded_output.masked_fill(~mask.unsqueeze(2), float('-inf')) @@ -94,9 +92,10 @@ def forward_helper(self, sentences_per_doc, unsorted_encodings): def forward(self, data): packed_tensor, sentences_per_doc, sort_order = data - packed_tensor = maybe_cuda(packed_tensor) + # packed_tensor = maybe_cuda(packed_tensor) encoded_sentences = self.forward_sentence_encoding(packed_tensor) - unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) + unsort_order = torch.LongTensor(unsort(sort_order)) + # unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) sentence_outputs = self.forward_helper(sentences_per_doc, unsorted_encodings) x = self.h2s(sentence_outputs) From f2fbf76e5bf4a35907a312e5368fce14402961b6 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sun, 13 Oct 2024 23:56:58 +0530 Subject: [PATCH 12/16] support for checkpointing training progress --- .gitignore | 1 + models/max_sentence_embedding.py | 11 +++--- run.py | 60 +++++++++++++++++++++++++++----- 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index d8a238c..89aec3e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ data/ runs/ checkpoints/ +*.pt # Byte-compiled / optimized / DLL 
files __pycache__/ diff --git a/models/max_sentence_embedding.py b/models/max_sentence_embedding.py index 3231614..d31d506 100644 --- a/models/max_sentence_embedding.py +++ b/models/max_sentence_embedding.py @@ -2,7 +2,9 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence -from utils import setup_logger, unsort +from utils import maybe_cuda, setup_logger, unsort +import numpy as np +from times_profiler import profiler logger = setup_logger(__name__, 'train.log') profilerLogger = setup_logger("profilerLogger", 'profiler.log', True) @@ -53,7 +55,7 @@ def forward_sentence_encoding(self, x): # Create a mask based on lengths mask = torch.arange(padded_output.size(0)).unsqueeze(1) < lengths.unsqueeze(0) - # mask = maybe_cuda(mask) + mask = maybe_cuda(mask) # Mask padded values by setting them to a very negative value (so they don't affect the max computation) padded_output = padded_output.masked_fill(~mask.unsqueeze(2), float('-inf')) @@ -92,10 +94,9 @@ def forward_helper(self, sentences_per_doc, unsorted_encodings): def forward(self, data): packed_tensor, sentences_per_doc, sort_order = data - # packed_tensor = maybe_cuda(packed_tensor) + packed_tensor = maybe_cuda(packed_tensor) encoded_sentences = self.forward_sentence_encoding(packed_tensor) - unsort_order = torch.LongTensor(unsort(sort_order)) - # unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) + unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order))) unsorted_encodings = encoded_sentences.index_select(0, unsort_order) sentence_outputs = self.forward_helper(sentences_per_doc, unsorted_encodings) x = self.h2s(sentence_outputs) diff --git a/run.py b/run.py index a5dabca..01f30eb 100644 --- a/run.py +++ b/run.py @@ -21,6 +21,8 @@ # torch.multiprocessing.set_sharing_strategy('file_system') +torch.manual_seed(42) + preds_stats = utils.predictions_analysis() def softmax(x): @@ -89,11 +91,17 @@ def train(model, args, epoch, dataset, logger, optimizer): for i, (data, target, paths) in enumerate(dataset): if i == args.stop_after: break - pbar.update() model.zero_grad() - # data_size = compute_batch_size(data) - output = model(data) + + try: + output = model(data) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + print(f"Paths: {paths}") + continue + target_var = maybe_cuda(torch.cat(target, 0), args.cuda) loss = model.criterion(output, target_var) loss.backward() @@ -115,8 +123,16 @@ def validate(model, args, epoch, dataset, logger): if i == args.stop_after: break pbar.update() - output = model(data) - output_softmax = F.softmax(output, dim=1) + + try: + output = model(data) + output_softmax = F.softmax(output, dim=1) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + print(f"Paths: {paths}") + continue + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) output_seg = output.argmax(dim=1).cpu().numpy() @@ -141,8 +157,16 @@ def test(model, args, epoch, dataset, logger, threshold): if i == args.stop_after: break pbar.update() - output = model(data) - output_softmax = F.softmax(output, dim=1) + + try: + output = model(data) + output_softmax = F.softmax(output, dim=1) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + print(f"Paths: {paths}") + continue + targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) output_seg = 
output.argmax(dim=1).cpu().numpy() target_seg = targets_var.cpu().numpy() @@ -168,6 +192,21 @@ def test(model, args, epoch, dataset, logger, threshold): return epoch_pk + +def load_model_and_optimizer(checkpoint_path, is_cuda, model, optimizer): + + map_location = torch.device('cuda') if is_cuda else torch.device('cpu') + + checkpoint = torch.load(checkpoint_path, map_location=map_location) + + model.load_state_dict(checkpoint['model_state_dict']) + + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + print(f"Loaded model and optimizer state from {checkpoint_path}") + return model, optimizer + + def main(args): sys.path.append(str(Path(__file__).parent)) @@ -207,6 +246,9 @@ def main(args): optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + if args.load_from: + model, optimizer = load_model_and_optimizer(args.load_from, args.cuda, model, optimizer) + if args.benchmark: for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) @@ -225,7 +267,7 @@ def main(args): print(f'Current best model from epoch {j} with p_k {val_pk} and threshold {threshold}') if val_pk < best_val_pk: test_pk = test(model, args, j, test_dl, logger, threshold) - logger.debug(colored(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}', 'green')) + print(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}') best_val_pk = val_pk torch.save({ 'model_state_dict': model.state_dict(), @@ -248,7 +290,7 @@ def main(args): parser.add_argument('--test_bs', help='Test batch size', type=int, default=5) parser.add_argument('--epochs', help='Number of epochs to run', type=int, default=10) parser.add_argument('--model', help='Model to run - will import and run',default='max_sentence_embedding') - parser.add_argument('--load_from', help='Location of a .t7 model file to load. Training will continue') + parser.add_argument('--load_from', help='Location of a .pt model file to load. 
Training will continue') parser.add_argument('--expname', help='Experiment name to appear on tensorboard', default='exp1') parser.add_argument('--checkpoint_dir', help='Checkpoint directory', default='checkpoints') parser.add_argument('--stop_after', help='Number of batches to stop after', type=int) From ded3aa26a83b8c02c90dcd177a13b4cde2749d48 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 19 Oct 2024 17:34:33 +0530 Subject: [PATCH 13/16] exception handling during testing --- .gitignore | 2 ++ run.py | 2 +- test_accuracy.py | 20 ++++++++++++++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 89aec3e..bcd0d7e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ data/ runs/ checkpoints/ *.pt +inference/ +outputs/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/run.py b/run.py index 01f30eb..2c326dc 100644 --- a/run.py +++ b/run.py @@ -197,7 +197,7 @@ def load_model_and_optimizer(checkpoint_path, is_cuda, model, optimizer): map_location = torch.device('cuda') if is_cuda else torch.device('cpu') - checkpoint = torch.load(checkpoint_path, map_location=map_location) + checkpoint = torch.load(checkpoint_path, map_location=map_location, weights_only=True) model.load_state_dict(checkpoint['model_state_dict']) diff --git a/test_accuracy.py b/test_accuracy.py index 7f9b861..058b737 100644 --- a/test_accuracy.py +++ b/test_accuracy.py @@ -1,6 +1,5 @@ import torch from torch.utils.data import DataLoader -from torch.autograd import Variable import numpy as np from choiloader import ChoiDataset, collate_fn from tqdm import tqdm @@ -15,6 +14,7 @@ import accuracy from models import naive from timeit import default_timer as timer +from models.max_sentence_embedding import Model logger = utils.setup_logger(__name__, 'test_accuracy.log') @@ -71,9 +71,10 @@ def main(args): print('Running on Choi') # Load the model - with open(args.model, 'rb') as f: - model = torch.load(f) - + model = Model(input_size=300, hidden=256, num_layers=2) + map_location = torch.device('cuda') if args.cuda else torch.device('cpu') + checkpoint = torch.load(args.model, map_location=map_location, weights_only=True) + model.load_state_dict(checkpoint['model_state_dict']) model = maybe_cuda(model) model.eval() @@ -107,10 +108,17 @@ def main(args): break pbar.update() - output = model(data) + + try: + output = model(data) + except Exception as e: + print(f"Error while passing batch {i+1} to the model") + print(f"Exception: {e}") + continue + targets_var = maybe_cuda(torch.cat(targets, 0), args.cuda) batch_loss = 0 - output_prob = softmax(output.cpu().numpy()) + output_prob = softmax(output.detach().cpu().numpy()) output_seg = output_prob[:, 1] > args.seg_threshold target_seg = targets_var.cpu().numpy() batch_accurate = (output_seg == target_seg).sum() From af475ba3168527b62facd6a0581c763a4b612014 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Sat, 19 Oct 2024 18:01:25 +0530 Subject: [PATCH 14/16] added requirements.txt --- requirements.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e1914bc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +gensim==4.3.3 +nltk==3.9.1 +numpy==1.26.4 +pandas==2.2.3 +pathlib2==2.3.7.post1 +segeval==2.0.11 +tensorboard-logger==0.1.0 +termcolor==2.5.0 +torch==2.4.1 +tqdm==4.66.5 +protobuf==3.20.1 \ No newline at end of file From 110b4e5e8bb3f09d0cedee2ee15d46f01bfc2dae Mon Sep 17 00:00:00 2001 From: 
jiteshm17 Date: Mon, 21 Oct 2024 11:46:39 +0530 Subject: [PATCH 15/16] support for multi GPU training --- run.py | 38 ++++++++++++++++++++++++++++++-------- test_accuracy.py | 5 ++++- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/run.py b/run.py index 2c326dc..41fe32c 100644 --- a/run.py +++ b/run.py @@ -1,7 +1,7 @@ import torch from torch.utils.data import DataLoader, Subset import torch.nn.functional as F - +from torch.nn import DataParallel from choiloader import ChoiDataset, collate_fn from tqdm import tqdm from argparse import ArgumentParser @@ -198,8 +198,11 @@ def load_model_and_optimizer(checkpoint_path, is_cuda, model, optimizer): map_location = torch.device('cuda') if is_cuda else torch.device('cpu') checkpoint = torch.load(checkpoint_path, map_location=map_location, weights_only=True) - - model.load_state_dict(checkpoint['model_state_dict']) + + if isinstance(model, torch.nn.DataParallel): + model.module.load_state_dict(checkpoint['model_state_dict']) + else: + model.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) @@ -234,16 +237,27 @@ def main(args): dev_dataset = Subset(dev_dataset,range(1000)) test_dataset = Subset(test_dataset,range(1000)) - train_dl = DataLoader(train_dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True, + train_batch_size,test_batch_size = args.bs, args.test_bs + + if torch.cuda.device_count() > 1: + num_gpus = torch.cuda.device_count() + print(f"Using {num_gpus} GPUs") + train_batch_size = args.bs * num_gpus + test_batch_size = args.test_bs * num_gpus + + train_dl = DataLoader(train_dataset, batch_size=train_batch_size, collate_fn=collate_fn, shuffle=True, num_workers=args.num_workers,pin_memory=args.pin_memory) - dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, + dev_dl = DataLoader(dev_dataset, batch_size=test_batch_size, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) - test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn, shuffle=False, + test_dl = DataLoader(test_dataset, batch_size=test_batch_size, collate_fn=collate_fn, shuffle=False, num_workers=args.num_workers,pin_memory=args.pin_memory) model = Model(input_size=300, hidden=256, num_layers=2) model = maybe_cuda(model) + if torch.cuda.device_count() > 1 and not args.infer: + model = DataParallel(model) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) if args.load_from: @@ -258,8 +272,12 @@ def main(args): best_val_pk = 1.0 for j in range(args.epochs): train(model, args, j, train_dl, logger, optimizer) + if isinstance(model, torch.nn.DataParallel): + model_state_dict = model.module.state_dict() + else: + model_state_dict = model.state_dict() torch.save({ - 'model_state_dict': model.state_dict(), + 'model_state_dict': model_state_dict, 'optimizer_state_dict': optimizer.state_dict() }, open(checkpoint_path / f'model{j:03d}.pt', 'wb')) @@ -269,8 +287,12 @@ def main(args): test_pk = test(model, args, j, test_dl, logger, threshold) print(f'Current best model from epoch {j} with p_k {test_pk} and threshold {threshold}') best_val_pk = val_pk + if isinstance(model, torch.nn.DataParallel): + model_state_dict = model.module.state_dict() + else: + model_state_dict = model.state_dict() torch.save({ - 'model_state_dict': model.state_dict(), + 'model_state_dict': model_state_dict, 'optimizer_state_dict': optimizer.state_dict() }, open(checkpoint_path / f'best_model.pt', 'wb')) diff --git 
a/test_accuracy.py b/test_accuracy.py index 058b737..543b758 100644 --- a/test_accuracy.py +++ b/test_accuracy.py @@ -74,7 +74,10 @@ def main(args): model = Model(input_size=300, hidden=256, num_layers=2) map_location = torch.device('cuda') if args.cuda else torch.device('cpu') checkpoint = torch.load(args.model, map_location=map_location, weights_only=True) - model.load_state_dict(checkpoint['model_state_dict']) + if isinstance(model, torch.nn.DataParallel): + model.module.load_state_dict(checkpoint['model_state_dict']) + else: + model.load_state_dict(checkpoint['model_state_dict']) model = maybe_cuda(model) model.eval() From d4921b358c41b440b023b2af58a20771a7cea464 Mon Sep 17 00:00:00 2001 From: jiteshm17 Date: Tue, 22 Oct 2024 07:46:26 +0530 Subject: [PATCH 16/16] flag for multi gpu training --- run.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/run.py b/run.py index 41fe32c..c7e3d76 100644 --- a/run.py +++ b/run.py @@ -99,7 +99,7 @@ def train(model, args, epoch, dataset, logger, optimizer): except Exception as e: print(f"Error while passing batch {i+1} to the model") print(f"Exception: {e}") - print(f"Paths: {paths}") + # print(f"Paths: {paths}") continue target_var = maybe_cuda(torch.cat(target, 0), args.cuda) @@ -130,7 +130,7 @@ def validate(model, args, epoch, dataset, logger): except Exception as e: print(f"Error while passing batch {i+1} to the model") print(f"Exception: {e}") - print(f"Paths: {paths}") + # print(f"Paths: {paths}") continue targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) @@ -164,7 +164,7 @@ def test(model, args, epoch, dataset, logger, threshold): except Exception as e: print(f"Error while passing batch {i+1} to the model") print(f"Exception: {e}") - print(f"Paths: {paths}") + # print(f"Paths: {paths}") continue targets_var = maybe_cuda(torch.cat(target, 0), args.cuda) @@ -239,7 +239,7 @@ def main(args): train_batch_size,test_batch_size = args.bs, args.test_bs - if torch.cuda.device_count() > 1: + if args.multi_gpu and torch.cuda.device_count() > 1: num_gpus = torch.cuda.device_count() print(f"Using {num_gpus} GPUs") train_batch_size = args.bs * num_gpus @@ -255,7 +255,7 @@ def main(args): model = Model(input_size=300, hidden=256, num_layers=2) model = maybe_cuda(model) - if torch.cuda.device_count() > 1 and not args.infer: + if args.multi_gpu and torch.cuda.device_count() > 1 and not args.infer: model = DataParallel(model) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) @@ -304,6 +304,7 @@ def main(args): if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--cuda', help='Use cuda?', action='store_true') + parser.add_argument('--multi_gpu', help='Use multiple GPUs', action='store_true') parser.add_argument('--pin_memory', help='Pin Memory?', action='store_true') parser.add_argument('--subset', help='Use a sample of 1000 rows', action='store_true') parser.add_argument('--benchmark', help='Use PyTorch profiler', action='store_true')
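For reference, a minimal sketch of how a checkpoint written by the updated run.py could be restored outside of training, assuming the Model class from models/max_sentence_embedding and the {'model_state_dict', 'optimizer_state_dict'} layout introduced in PATCH 09/16; the helper name load_for_inference is illustrative and not part of the series:

import torch
from models.max_sentence_embedding import Model

def load_for_inference(checkpoint_path, use_cuda=False):
    # Map the checkpoint onto CPU or GPU, mirroring load_model_and_optimizer in run.py.
    device = torch.device('cuda') if use_cuda and torch.cuda.is_available() else torch.device('cpu')
    model = Model(input_size=300, hidden=256, num_layers=2)
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
    state_dict = checkpoint['model_state_dict']
    # The series saves model.module.state_dict() when DataParallel is used, so keys
    # should already be unprefixed; stripping 'module.' here is only a safeguard.
    state_dict = {k[len('module.'):] if k.startswith('module.') else k: v
                  for k, v in state_dict.items()}
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model

In the training script itself, the same checkpoint file can be passed via --load_from to resume training, with --multi_gpu enabling DataParallel when more than one GPU is visible.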