From af95fefd3f27b08042bbbbe2aa3fc8e5b4c137af Mon Sep 17 00:00:00 2001 From: Mahmoud Aslan Date: Thu, 18 Jul 2019 20:32:15 +0300 Subject: [PATCH] Added optional fractional splitting to split_wiki To split only a fraction of the wiki data instead of the entire thing, useful for those of us who cannot afford splitting the entire wiki or want to experiment with a fraction of it. --- nlputils.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/nlputils.py b/nlputils.py index e72dfa4..09ac9db 100644 --- a/nlputils.py +++ b/nlputils.py @@ -26,8 +26,15 @@ def get_wiki(path,lang): shutil.rmtree(path/'text') -def split_wiki(path,lang): - dest = path/'docs' +def split_wiki(path,lang,frac=1): + if frac > 1 or frac <= 0: + print(f"frac must be > 0 and <= 1, {frac} was given!") + return + + if frac == 1: + dest = path/'docs' + else: + dest = path/f'docs_{frac}' name = f'{lang}wiki' if dest.exists(): print(f"{dest} already exists; not splitting") @@ -37,15 +44,26 @@ def split_wiki(path,lang): title_re = re.compile(rf'') lines = (path/name).open() f=None + + n_written = 0 + if frac < 1: + text = (path/name).read_text() + n_files = len(list(filter(lambda title: len(title) <= 150, title_re.findall(text)))) + del text + limit = n_files*frac + else: + limit = n_files for i,l in enumerate(lines): if i%100000 == 0: print(i) if l.startswith('