9 changes: 9 additions & 0 deletions .gitignore
@@ -1,3 +1,12 @@
.DS_Store
.vscode/
data/
runs/
checkpoints/
*.pt
inference/
outputs/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
47 changes: 21 additions & 26 deletions accuracy.py
@@ -1,78 +1,75 @@
import segeval as seg
import numpy as np


def softmax(x):
max_each_row = np.max(x, axis=1, keepdims=True)
exps = np.exp(x - max_each_row)
sums = np.sum(exps, axis=1, keepdims=True)
return exps / sums


class Accuracy:
def __init__(self, threshold=0.3):
self.pk_to_weight = []
self.windiff_to_weight = []
self.threshold = threshold

def update(self, h, gold, sentences_length = None):
def update(self, h, gold, sentences_length=None):
h_boundaries = self.get_seg_boundaries(h, sentences_length)
gold_boundaries = self.get_seg_boundaries(gold, sentences_length)
pk, count_pk = self.pk(h_boundaries, gold_boundaries)
windiff, count_wd = -1, 400;# self.win_diff(h_boundaries, gold_boundaries)
windiff, count_wd = -1, 400 # Placeholder for windiff calculation

if pk != -1:
self.pk_to_weight.append((pk, count_pk))
else:
print ('pk error')
print('pk error')

if windiff != -1:
self.windiff_to_weight.append((windiff, count_wd))

def get_seg_boundaries(self, classifications, sentences_length = None):
def get_seg_boundaries(self, classifications, sentences_length=None):
"""
:param list of tuples, each tuple is a sentence and its class (1 if it the sentence starts a segment, 0 otherwise).
e.g: [(this is, 0), (a segment, 1) , (and another one, 1)
:return: boundaries of segmentation to use for pk method. For given example the function will return (4, 3)
:param classifications: list of tuples, each tuple is a sentence and its class (1 if the sentence starts a segment, 0 otherwise).
:param sentences_length: list of sentence lengths (optional)
:return: boundaries of segmentation for pk method.
"""
curr_seg_length = 0
boundaries = []
for i, classification in enumerate(classifications):
is_split_point = bool(classifications[i])
add_to_current_segment = 1 if sentences_length is None else sentences_length[i]
curr_seg_length += add_to_current_segment
if (is_split_point):
if is_split_point:
boundaries.append(curr_seg_length)
curr_seg_length = 0

return boundaries

def pk(self, h, gold, window_size=-1):
"""
:param gold: gold segmentation (item in the list contains the number of words in segment)
:param h: hypothesis segmentation (each item in the list contains the number of words in segment)
:param window_size: optional
:return: accuracy
:param h: hypothesis segmentation
:param gold: gold segmentation
:param window_size: optional window size
:return: pk accuracy
"""
if window_size != -1:
false_seg_count, total_count = seg.pk(h, gold, window_size=window_size, return_parts=True)
else:
false_seg_count, total_count = seg.pk(h, gold, return_parts=True)

if total_count == 0:
# TODO: Check when happens
false_prob = -1
else:
false_prob = float(false_seg_count) / float(total_count)
false_prob = float(false_seg_count) / total_count

return false_prob, total_count

def win_diff(self, h, gold, window_size=-1):
"""
:param gold: gold segmentation (item in the list contains the number of words in segment)
:param h: hypothesis segmentation (each item in the list contains the number of words in segment)
:param window_size: optional
:return: accuracy
:param h: hypothesis segmentation
:param gold: gold segmentation
:param window_size: optional window size
:return: win_diff accuracy
"""
if window_size != -1:
false_seg_count, total_count = seg.window_diff(h, gold, window_size=window_size, return_parts=True)
@@ -82,14 +79,12 @@ def win_diff(self, h, gold, window_size=-1):
if total_count == 0:
false_prob = -1
else:
false_prob = float(false_seg_count) / float(total_count)
false_prob = float(false_seg_count) / total_count

return false_prob, total_count

def calc_accuracy(self):
pk = sum([pw[0] * pw[1] for pw in self.pk_to_weight]) / sum([pw[1] for pw in self.pk_to_weight]) if len(
self.pk_to_weight) > 0 else -1.0
windiff = sum([pw[0] * pw[1] for pw in self.windiff_to_weight]) / sum(
[pw[1] for pw in self.windiff_to_weight]) if len(self.windiff_to_weight) > 0 else -1.0
pk = sum(pw[0] * pw[1] for pw in self.pk_to_weight) / sum(pw[1] for pw in self.pk_to_weight) if self.pk_to_weight else -1.0
windiff = sum(pw[0] * pw[1] for pw in self.windiff_to_weight) / sum(pw[1] for pw in self.windiff_to_weight) if self.windiff_to_weight else -1.0

return pk, windiff
return pk, windiff
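
For context, a minimal usage sketch of the refactored Accuracy class follows (not part of the diff; the import path and the toy labels are assumptions). Each element of h and gold is a 0/1 label per sentence, with 1 closing the current segment, so the boundary lists passed to segeval sum to the same document length:

    # Hypothetical usage of accuracy.Accuracy; labels and module path are assumptions.
    from accuracy import Accuracy

    acc = Accuracy()
    gold = [0, 0, 1, 0, 1]   # reference: segments of 3 and 2 sentences
    h    = [0, 1, 0, 0, 1]   # hypothesis: segments of 2 and 3 sentences
    acc.update(h, gold)
    pk, windiff = acc.calc_accuracy()
    print(pk)                # windiff stays -1.0 while win_diff is left as a placeholder
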
43 changes: 17 additions & 26 deletions annotate_wiki_file.py
@@ -1,56 +1,47 @@
from argparse import ArgumentParser
from wiki_loader import read_wiki_file
import pandas as pd
from pathlib2 import Path
from pathlib import Path # Use pathlib, not pathlib2
import os


def get_files(path):
all_objects = Path(path).glob('**/*')
all_objects = Path(path).rglob('*') # Use rglob for '**/*' pattern
files = (str(p) for p in all_objects if p.is_file())
return files

def generate_segmentation_template(path, output_path):
writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False)
df = pd.DataFrame({ 'Cut here': [0] * len(sentences),'Sentences': sentences})
df = df[['Cut here','Sentences']]

df.to_excel(writer, sheet_name='segment')
writer.save()

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: # Use context manager for ExcelWriter
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False)
df = pd.DataFrame({'Cut here': [0] * len(sentences), 'Sentences': sentences})
df = df[['Cut here', 'Sentences']]
df.to_excel(writer, sheet_name='segment')

def generate_test_article(path, output_path):
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment= True, return_as_sentences=True, ignore_list=True, remove_special_tokens = False,
sentences, _, _ = read_wiki_file(path, None, remove_preface_segment=True, return_as_sentences=True, ignore_list=True, remove_special_tokens=False,
high_granularity=False)
article_text = "\n".join(sentences)
with open(output_path, "w") as f:
f.write(article_text.encode('utf-8'))
f.close()
with open(output_path, "w", encoding='utf-8') as f: # Use context manager and specify encoding
f.write(article_text)

def generate_folder(input_folder,output_folder):
def generate_folder(input_folder, output_folder, to_text):
counter = 0
input_files = get_files(input_folder)
for file in input_files:
id = os.path.basename(file)
file_name = id + ".xlsx" if not args.toText else id
file_name = f"{id}.xlsx" if not to_text else id
output_file = os.path.join(output_folder, file_name)
if (args.toText):
if to_text:
generate_test_article(file, output_file)
else:
generate_segmentation_template(file,output_file)
generate_segmentation_template(file, output_file)
counter += 1
print 'generates ' + str(counter) + ' files'


print(f'Generated {counter} files')

if __name__ == '__main__':

parser = ArgumentParser()
parser.add_argument('--path', help='input folder path', default='/home/michael/Downloads/migo/68943', type=str)
parser.add_argument('--output_path', help='output folder path', default='blah.xlsx', type=str)
parser.add_argument('--toText', help='output to text files ?', action='store_true')
parser.add_argument('--toText', help='output to text files?', action='store_true')
args = parser.parse_args()

generate_folder(args.path,args.output_path)

generate_folder(args.path, args.output_path, args.toText)
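
As a side note, a small sketch of the two idioms this rewrite relies on (illustrative only, assuming pandas and xlsxwriter are installed): Path.rglob('*') matches the same files as glob('**/*'), and pandas.ExcelWriter used as a context manager writes the workbook on exit, so no explicit save() call is needed:

    # Illustrative sketch; file names here are placeholders.
    from pathlib import Path
    import pandas as pd

    files = [p for p in Path('.').rglob('*') if p.is_file()]   # recursive file listing

    df = pd.DataFrame({'Cut here': [0, 0], 'Sentences': ['first sentence', 'second sentence']})
    with pd.ExcelWriter('example.xlsx', engine='xlsxwriter') as writer:  # saved on exit
        df.to_excel(writer, sheet_name='segment')
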
110 changes: 48 additions & 62 deletions calc_statistics.py
@@ -1,8 +1,5 @@
from __future__ import division

import torch
from torch.utils.data import DataLoader
from torch.autograd import Variable
import numpy as np

from choiloader import ChoiDataset, collate_fn
@@ -11,19 +8,17 @@
from utils import maybe_cuda
import utils
import sys
from pathlib2 import Path
from pathlib import Path # Use pathlib instead of pathlib2
from wiki_loader import WikipediaDataSet
import accuracy

logger = utils.setup_logger(__name__, 'train.log')



def main(args):
sys.path.append(str(Path(__file__).parent))

utils.read_config_file(args.config)
utils.config.update(args.__dict__)
utils.config.update(vars(args)) # Update config with args dictionary

logger.debug('Running with config %s', utils.config)
article_with_problems = 0
@@ -39,94 +34,85 @@ def main(args):
min_num_sentences = 1000
max_num_sentences = 0


dl = DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)
docs_num_segments_vec = np.zeros(len(dl))
segments_num_sentences_vec = []
print 'num of docs is ' + str(len(dl))
print(f'Number of documents: {len(dl)}')

with tqdm(desc='Testing', total=len(dl)) as pbar:

for i, (data, targets, paths) in enumerate(dl):
if (len(paths) == 0):
if len(paths) == 0:
article_with_problems += 1
docs_num_segments_vec[i] = np.nan
continue
try:

if ( ((i % 1000 ) == 0) & i > 0):
print i
if i % 1000 == 0 and i > 0:
print(i)
if len(targets) > 0:
targets_var = Variable(maybe_cuda(torch.cat(targets, 0), None), requires_grad=False)
target_seg = targets_var.data.cpu().numpy()
targets_var = maybe_cuda(torch.cat(targets, 0), None)
target_seg = targets_var.cpu().numpy()
target_seg = np.concatenate([target_seg, np.array([1])])
else:
target_seg = np.ones(1)
num_sentences += (len(target_seg))
doc_num_of_segment = (sum(target_seg))
if (doc_num_of_segment < min_num_segment):
min_num_segment = doc_num_of_segment
if (doc_num_of_segment > max_num_segment):
max_num_segment = doc_num_of_segment

num_sentences += len(target_seg)
doc_num_of_segment = sum(target_seg)

min_num_segment = min(min_num_segment, doc_num_of_segment)
max_num_segment = max(max_num_segment, doc_num_of_segment)

num_segments += doc_num_of_segment
num_documents += 1
docs_num_segments_vec[i] = doc_num_of_segment

one_inds = np.where(target_seg == 1)[0]
one_inds += 1
one_inds = np.concatenate((np.zeros(1),one_inds))
if (len(one_inds) == 1):
one_inds = np.concatenate(([0], one_inds))

if len(one_inds) == 1:
sentences_in_segments = [len(target_seg)]
else:
sentences_in_segments = one_inds[1:] - one_inds[:-1]
segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec,sentences_in_segments))

segments_num_sentences_vec = np.concatenate((segments_num_sentences_vec, sentences_in_segments))
current_min = np.min(sentences_in_segments)
current_max = np.max(sentences_in_segments)
if (current_min < min_num_sentences):
min_num_sentences = current_min
if (current_max > max_num_sentences):
max_num_sentences = current_max



min_num_sentences = min(min_num_sentences, current_min)
max_num_sentences = max(max_num_sentences, current_max)

except Exception as e:
logger.info('Exception "%s" in batch %s', e, i)
logger.info(f'Exception "{e}" in batch {i}')
logger.debug('Exception while handling batch with file paths: %s', paths, exc_info=True)
raise

print(f'Total sentences: {num_sentences}.')
print(f'Total segments: {num_segments}.')
print(f'Total documents: {num_documents}.')
print(f'Average segment size: {num_sentences / num_segments:.3f}.')
print(f'Min #segments in a document: {min_num_segment}.')
print(f'Max #segments in a document: {max_num_segment}.')
print(f'Min #sentences in a segment: {min_num_sentences}.')
print(f'Max #sentences in a segment: {max_num_sentences}.')

print('\nNew computing method\n')
print(f'Number of documents: {len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum()}.')
print(f'Total segments: {np.nansum(docs_num_segments_vec)}.')
print(f'Total sentences: {np.sum(segments_num_sentences_vec)}.')

print(f'Min #segments in a document: {np.nanmin(docs_num_segments_vec)}.')
print(f'Max #segments in a document: {np.nanmax(docs_num_segments_vec)}.')
print(f'Mean segments in a document: {np.nanmean(docs_num_segments_vec):.3f}.')
print(f'Standard deviation of segments in a document: {np.nanstd(docs_num_segments_vec):.3f}.')

print(f'\nMin #sentences in a segment: {np.min(segments_num_sentences_vec)}.')
print(f'Max #sentences in a segment: {np.max(segments_num_sentences_vec)}.')
print(f'Average segment size: {np.mean(segments_num_sentences_vec):.3f}.')
print(f'Standard deviation of segment size: {np.std(segments_num_sentences_vec):.3f}.')

print 'total sentences: {}.'.format(num_sentences)
print 'total segments: {}.'.format(num_segments)
print 'total documents: {}.'.format(num_documents)
print 'average segment size is: {:.3}.'.format(np.true_divide(num_sentences,num_segments))
print 'min #segment in document: {}.'.format(min_num_segment)
print 'max #segment in document: {}.'.format(max_num_segment)
print 'min #sentence in segment: {}.'.format(min_num_sentences)
print 'max #sentence in segment: {}.'.format(max_num_sentences)


print ''
print 'new computing method'
print ''
print 'num of documents: {}.'.format(len(docs_num_segments_vec) - np.isnan(docs_num_segments_vec).sum())
print 'total segments: {}.'.format(np.nansum(docs_num_segments_vec))
print 'total sentences: {}.'.format(np.sum(segments_num_sentences_vec))
print ''
print 'min #segment in document: {}.'.format(np.nanmin(docs_num_segments_vec))
print 'max #segment in document: {}.'.format(np.nanmax(docs_num_segments_vec))
print 'mean segments in document: {:.3}.'.format(np.nanmean(docs_num_segments_vec))
print 'std segments in document: {:.3}.'.format(np.nanstd(docs_num_segments_vec))
print ''
print 'min #sentence in segment: {}.'.format(np.min(segments_num_sentences_vec))
print 'max #sentence in segment: {}.'.format(np.max(segments_num_sentences_vec))
print 'average segment size is: {:.3}.'.format(np.mean(segments_num_sentences_vec))
print 'std segment size is: {:.3}.'.format(np.std(segments_num_sentences_vec))

print ''
print 'article with problems {}'.format(article_with_problems)
print(f'\nArticles with problems: {article_with_problems}')

if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--config', help='Path to config.json', default='config.json')
main(parser.parse_args())
main(parser.parse_args())
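
For readers following the segment-size computation in main, here is a toy walk-through (an assumption for illustration, mirroring the boundary logic above): a 0/1 vector in which 1 marks the last sentence of a segment is turned into per-segment sentence counts by differencing the boundary positions:

    # Toy example of the one_inds logic; the input vector is made up.
    import numpy as np

    target_seg = np.array([0, 0, 1, 0, 1])               # two segments: 3 and 2 sentences
    one_inds = np.where(target_seg == 1)[0] + 1           # positions after each boundary -> [3, 5]
    one_inds = np.concatenate(([0], one_inds))            # prepend the document start -> [0, 3, 5]
    sentences_in_segments = one_inds[1:] - one_inds[:-1]  # segment sizes -> [3, 2]
    print(sentences_in_segments)
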