9 changes: 9 additions & 0 deletions .gitignore
@@ -1,3 +1,12 @@
.DS_Store
.vscode/
data/
runs/
checkpoints/
*.pt
inference/
outputs/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
47 changes: 21 additions & 26 deletions accuracy.py
@@ -1,78 +1,75 @@
import segeval as seg
import numpy as np


def softmax(x):
max_each_row = np.max(x, axis=1, keepdims=True)
exps = np.exp(x - max_each_row)
sums = np.sum(exps, axis=1, keepdims=True)
return exps / sums


class Accuracy:
def __init__(self, threshold=0.3):
self.pk_to_weight = []
self.windiff_to_weight = []
self.threshold = threshold

def update(self, h, gold, sentences_length = None):
def update(self, h, gold, sentences_length=None):
h_boundaries = self.get_seg_boundaries(h, sentences_length)
gold_boundaries = self.get_seg_boundaries(gold, sentences_length)
pk, count_pk = self.pk(h_boundaries, gold_boundaries)
windiff, count_wd = -1, 400;# self.win_diff(h_boundaries, gold_boundaries)
windiff, count_wd = -1, 400 # Placeholder for windiff calculation

if pk != -1:
self.pk_to_weight.append((pk, count_pk))
else:
print ('pk error')
print('pk error')

if windiff != -1:
self.windiff_to_weight.append((windiff, count_wd))

def get_seg_boundaries(self, classifications, sentences_length = None):
def get_seg_boundaries(self, classifications, sentences_length=None):
"""
:param list of tuples, each tuple is a sentence and its class (1 if it the sentence starts a segment, 0 otherwise).
e.g: [(this is, 0), (a segment, 1) , (and another one, 1)
:return: boundaries of segmentation to use for pk method. For given example the function will return (4, 3)
:param classifications: list of tuples, each tuple is a sentence and its class (1 if the sentence starts a segment, 0 otherwise).
:param sentences_length: list of sentence lengths (optional)
:return: boundaries of segmentation for pk method.
"""
curr_seg_length = 0
boundaries = []
for i, classification in enumerate(classifications):
is_split_point = bool(classifications[i])
add_to_current_segment = 1 if sentences_length is None else sentences_length[i]
curr_seg_length += add_to_current_segment
if (is_split_point):
if is_split_point:
boundaries.append(curr_seg_length)
curr_seg_length = 0

return boundaries

def pk(self, h, gold, window_size=-1):
"""
:param gold: gold segmentation (item in the list contains the number of words in segment)
:param h: hypothesis segmentation (each item in the list contains the number of words in segment)
:param window_size: optional
:return: accuracy
:param h: hypothesis segmentation
:param gold: gold segmentation
:param window_size: optional window size
:return: pk accuracy
"""
if window_size != -1:
false_seg_count, total_count = seg.pk(h, gold, window_size=window_size, return_parts=True)
else:
false_seg_count, total_count = seg.pk(h, gold, return_parts=True)

if total_count == 0:
# TODO: Check when happens
false_prob = -1
else:
false_prob = float(false_seg_count) / float(total_count)
false_prob = float(false_seg_count) / total_count

return false_prob, total_count

def win_diff(self, h, gold, window_size=-1):
"""
:param gold: gold segmentation (item in the list contains the number of words in segment)
:param h: hypothesis segmentation (each item in the list contains the number of words in segment)
:param window_size: optional
:return: accuracy
:param h: hypothesis segmentation
:param gold: gold segmentation
:param window_size: optional window size
:return: win_diff accuracy
"""
if window_size != -1:
false_seg_count, total_count = seg.window_diff(h, gold, window_size=window_size, return_parts=True)
@@ -82,14 +79,12 @@ def win_diff(self, h, gold, window_size=-1):
if total_count == 0:
false_prob = -1
else:
false_prob = float(false_seg_count) / float(total_count)
false_prob = float(false_seg_count) / total_count

return false_prob, total_count

def calc_accuracy(self):
pk = sum([pw[0] * pw[1] for pw in self.pk_to_weight]) / sum([pw[1] for pw in self.pk_to_weight]) if len(
self.pk_to_weight) > 0 else -1.0
windiff = sum([pw[0] * pw[1] for pw in self.windiff_to_weight]) / sum(
[pw[1] for pw in self.windiff_to_weight]) if len(self.windiff_to_weight) > 0 else -1.0
pk = sum(pw[0] * pw[1] for pw in self.pk_to_weight) / sum(pw[1] for pw in self.pk_to_weight) if self.pk_to_weight else -1.0
windiff = sum(pw[0] * pw[1] for pw in self.windiff_to_weight) / sum(pw[1] for pw in self.windiff_to_weight) if self.windiff_to_weight else -1.0

return pk, windiff
return pk, windiff
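
For context, a minimal usage sketch of the refactored Accuracy class follows (not part of the diff; the import path and the toy labels are assumptions). Each element of h and gold is a 0/1 label per sentence, with 1 closing the current segment, so the boundary lists passed to segeval sum to the same document length:

    # Hypothetical usage of accuracy.Accuracy; labels and module path are assumptions.
    from accuracy import Accuracy

    acc = Accuracy()
    gold = [0, 0, 1, 0, 1]   # reference: segments of 3 and 2 sentences
    h    = [0, 1, 0, 0, 1]   # hypothesis: segments of 2 and 3 sentences
    acc.update(h, gold)
    pk, windiff = acc.calc_accuracy()
    print(pk)                # windiff stays -1.0 while win_diff is left as a placeholder
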
43 changes: 17 additions & 26 deletions annotate_wiki_file.py
@@ -1,56 +1,47 @@
from argparse import ArgumentParser
from wiki_loader import read_wiki_file
import pandas as pd
from pathlib2 import Path
from pathlib import Path # Use pathlib, not pathlib2
import os


def get_files(path):
all_objects = Path(path).glob('**/*')
all_objects = Path(path).rglob('*') # Use rglob for '**/*' pattern
files = (str(p) for p in all_objects if p.is_file())
return files

def generate_segmentation_template(path, output_path):
writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False)
df = pd.DataFrame({ 'Cut here': [0] * len(sentences),'Sentences': sentences})
df = df[['Cut here','Sentences']]

df.to_excel(writer, sheet_name='segment')
writer.save()

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: # Use context manager for ExcelWriter
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False)
df = pd.DataFrame({'Cut here': [0] * len(sentences), 'Sentences': sentences})
df = df[['Cut here', 'Sentences']]
df.to_excel(writer, sheet_name='segment')

def generate_test_article(path, output_path):
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False,
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False,
high_granularity=False)
article_text = "\n".join(sentences)
with open(output_path, "w") as f:
f.write(article_text.encode('utf-8'))
f.close()
with open(output_path, "w", encoding='utf-8') as f: # Use context manager and specify encoding
f.write(article_text)

def generate_folder(input_folder,output_folder):
def generate_folder(input_folder, output_folder, to_text):
counter = 0
input_files = get_files(input_folder)
for file in input_files:
id = os.path.basename(file)
file_name = id + ".xlsx" if not args.toText else id
file_name = f"{id}.xlsx" if not to_text else id
output_file = os.path.join(output_folder, file_name)
if (args.toText):
if to_text:
generate_test_article(file, output_file)
else:
generate_segmentation_template(file,output_file)
generate_segmentation_template(file, output_file)
counter += 1
print 'generates ' + str(counter) + ' files'


print(f'Generated {counter} files')

if __name__ == '__main__':

parser = ArgumentParser()
parser.add_argument('--path', help='input folder path', default='/home/michael/Downloads/migo/68943', type=str)
parser.add_argument('--output_path', help='output folder path', default='blah.xlsx', type=str)
parser.add_argument('--toText', help='output to text files ?', action='store_true')
parser.add_argument('--toText', help='output to text files?', action='store_true')
args = parser.parse_args()

generate_folder(args.path,args.output_path)

generate_folder(args.path, args.output_path, args.toText)
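
As a side note, a small sketch of the two idioms this rewrite relies on (illustrative only, assuming pandas and xlsxwriter are installed): Path.rglob('*') matches the same files as glob('**/*'), and pandas.ExcelWriter used as a context manager writes the workbook on exit, so no explicit save() call is needed:

    # Illustrative sketch; file names here are placeholders.
    from pathlib import Path
    import pandas as pd

    files = [p for p in Path('.').rglob('*') if p.is_file()]   # recursive file listing

    df = pd.DataFrame({'Cut here': [0, 0], 'Sentences': ['first sentence', 'second sentence']})
    with pd.ExcelWriter('example.xlsx', engine='xlsxwriter') as writer:  # saved on exit
        df.to_excel(writer, sheet_name='segment')
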
110 changes: 48 additions & 62 deletions calc_statistics.py
@@ -1,8 +1,5 @@
from __future__ import division

import torch
from torch.utils.data import DataLoader
from torch.autograd import Variable
import numpy as np

from choiloader import ChoiDataset, collate_fn
@@ -11,19 +8,17 @@
from utils import maybe_cuda
import utils
import sys
from pathlib2 import Path
from pathlib import Path # Use pathlib instead of pathlib2
from wiki_loader import WikipediaDataSet
import accuracy

logger = utils.setup_logger(__name__, 'train.log')



def main(args):
sys.path.append(str(Path(__file__).parent))

utils.read_config_file(args.config)
utils.config.update(args.__dict__)
utils.config.update(vars(args)) # Update config with args dictionary

logger.debug('Running with config %s', utils.config)
article_with_problems = 0
@@ -39,94 +34,85 @@ def main(args):
min_num_sentences = 1000
max_num_sentences = 0


dl = DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)
docs_num_segments_vec = np.zeros(len(dl))
segments_num_sentences_vec = []
print 'num of docs is ' + str(len(dl))
print(f'Number of documents: {len(dl)}')

with tqdm(desc='Testing', total=len(dl)) as pbar:

for i, (data, targets, paths) in enumerate(dl):
if (len(paths) == 0):
if len(paths) == 0:
article_with_problems += 1
docs_num_segments_vec[i] = np.nan
continue
try:

if ( ((i % 1000 ) == 0) & i > 0):
print i
if i % 1000 == 0 and i > 0:
print(i)
if len(targets) > 0:
targets_var = Variable(maybe_cuda(torch.cat(targets, 0), None), requires_grad=False)
target_seg = targets_var.data.cpu().numpy()
targets_var = maybe_cuda(torch.cat(targets, 0), None)
target_seg = targets_var.cpu().numpy()
target_seg = np.concatenate([target_seg, np.array([1])])
else:
target_seg = np.ones(1)
num_sentences += (len(target_seg))
doc_num_of_segment = (sum(target_seg))
if (doc_num_of_segment < min_num_segment):
min_num_segment = doc_num_of_segment
if (doc_num_of_segment > max_num_segment):
max_num_segment = doc_num_of_segment

num_sentences += len(target_seg)
doc_num_of_segment = sum(target_seg)

min_num_segment = min(min_num_segment, doc_num_of_segment)
max_num_segment = max(max_num_segment, doc_num_of_segment)

num_segments += doc_num_of_segment
num_documents += 1
docs_num_segments_vec[i] = doc_num_of_segment

one_inds = np.where(target_seg == 1)[0]
one_inds += 1
one_inds = np.concatenate((np.zeros(1),one_inds))
if (len(one_inds) == 1):
one_inds = np.concatenate(([0], one_inds))

if len(one_inds) == 1:
sentences_in_segments = [len(target_seg)]
else:
sentences_in_segments = one_inds[1:] - one_inds[:-1]
segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec,sentences_in_segments))

segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec, sentences_in_segments))
current_min = np.min(sentences_in_segments)
current_max = np.max(sentences_in_segments)
if (current_min < min_num_sentences):
min_num_sentences = current_min
if (current_max > max_num_sentences):
max_num_sentences = current_max



min_num_sentences = min(min_num_sentences, current_min)
max_num_sentences = max(max_num_sentences, current_max)

except Exception as e:
logger.info('Exception "%s" in batch %s', e, i)
logger.info(f'Exception "{e}" in batch {i}')
logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True)
raise

print(f'Total sentences: {num_sentences}.')
print(f'Total segments: {num_segments}.')
print(f'Total documents: {num_documents}.')
print(f'Average segment size: {num_sentences / num_segments:.3f}.')
print(f'Min #segments in a document: {min_num_segment}.')
print(f'Max #segments in a document: {max_num_segment}.')
print(f'Min #sentences in a segment: {min_num_sentences}.')
print(f'Max #sentences in a segment: {max_num_sentences}.')

print('\nNew computing method\n')
print(f'Number of documents: {len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum()}.')
print(f'Total segments: {np.nansum(docs_num_segments_vec)}.')
print(f'Total sentences: {np.sum(segments_num_sentences_vec)}.')

print(f'Min #segments in a document: {np.nanmin(docs_num_segments_vec)}.')
print(f'Max #segments in a document: {np.nanmax(docs_num_segments_vec)}.')
print(f'Mean segments in a document: {np.nanmean(docs_num_segments_vec):.3f}.')
print(f'Standard deviation of segments in a document: {np.nanstd(docs_num_segments_vec):.3f}.')

print(f'\nMin #sentences in a segment: {np.min(segments_num_sentences_vec)}.')
print(f'Max #sentences in a segment: {np.max(segments_num_sentences_vec)}.')
print(f'Average segment size: {np.mean(segments_num_sentences_vec):.3f}.')
print(f'Standard deviation of segment size: {np.std(segments_num_sentences_vec):.3f}.')

print 'total sentences: {}.'.format(num_sentences)
print 'total segments: {}.'.format(num_segments)
print 'total documents: {}.'.format(num_documents)
print 'average segment size is: {:.3}.'.format(np.true_divide(num_sentences,num_segments))
print 'min #segment in document: {}.'.format(min_num_segment)
print 'max #segment in document: {}.'.format(max_num_segment)
print 'min #sentence in segment: {}.'.format(min_num_sentences)
print 'max #sentence in segment: {}.'.format(max_num_sentences)


print ''
print 'new computing method'
print ''
print 'num of documents: {}.'.format(len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum())
print 'total segments: {}.'.format(np.nansum(docs_num_segments_vec))
print 'total sentences: {}.'.format(np.sum(segments_num_sentences_vec))
print ''
print 'min #segment in document: {}.'.format(np.nanmin(docs_num_segments_vec))
print 'max #segment in document: {}.'.format(np.nanmax(docs_num_segments_vec))
print 'mean segments in document: {:.3}.'.format(np.nanmean(docs_num_segments_vec))
print 'std segments in document: {:.3}.'.format(np.nanstd(docs_num_segments_vec))
print ''
print 'min #sentence in segment: {}.'.format(np.min(segments_num_sentences_vec))
print 'max #sentence in segment: {}.'.format(np.max(segments_num_sentences_vec))
print 'average segment size is: {:.3}.'.format(np.mean(segments_num_sentences_vec))
print 'std segment size is: {:.3}.'.format(np.std(segments_num_sentences_vec))

print ''
print 'article with problems {}'.format(article_with_problems)
print(f'\nArticles with problems: {article_with_problems}')

if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--config', help='Path to config.json', default='config.json')
main(parser.parse_args())
main(parser.parse_args())
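
For readers following the segment-size computation in main, here is a toy walk-through (an assumption for illustration, mirroring the boundary logic above): a 0/1 vector in which 1 marks the last sentence of a segment is turned into per-segment sentence counts by differencing the boundary positions:

    # Toy example of the one_inds logic; the input vector is made up.
    import numpy as np

    target_seg = np.array([0, 0, 1, 0, 1])               # two segments: 3 and 2 sentences
    one_inds = np.where(target_seg == 1)[0] + 1           # positions after each boundary -> [3, 5]
    one_inds = np.concatenate(([0], one_inds))            # prepend the document start -> [0, 3, 5]
    sentences_in_segments = one_inds[1:] - one_inds[:-1]  # segment sizes -> [3, 2]
    print(sentences_in_segments)
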