From dacd0aaef807524184902920f36ff73b511e65d8 Mon Sep 17 00:00:00 2001 From: Dan Date: Sat, 5 Jun 2021 18:16:00 +1000 Subject: [PATCH] Update nlputils.py Update the options passed when calling WikiExtractor. This is required by the argument changes at https://github.com/attardi/wikiextractor/blob/master/wikiextractor/WikiExtractor.py For example, --no_templates was renamed to --no-templates. In addition, other options (such as --min_text_length, --filter_disambig_pages, and --log_file) no longer exist. --- nlputils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nlputils.py b/nlputils.py index e72dfa4..8eebc42 100644 --- a/nlputils.py +++ b/nlputils.py @@ -20,8 +20,8 @@ def get_wiki(path,lang): with working_directory(path): if not (path/'wikiextractor').exists(): os.system('git clone https://github.com/attardi/wikiextractor.git') print("extracting...") - os.system("python wikiextractor/WikiExtractor.py --processes 4 --no_templates " + - f"--min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q {xml_fn}") + os.system("python -m wikiextractor.wikiextractor.WikiExtractor --no-templates -b 100G -q " + f"{xml_fn}") + shutil.move(str(path/'text/AA/wiki_00'), str(path/name)) shutil.rmtree(path/'text')