From 052051f8b171b0b5439e783b723388a0bf7629c8 Mon Sep 17 00:00:00 2001 From: Martin Borcin Date: Wed, 8 Sep 2021 17:48:30 +0200 Subject: [PATCH 1/2] fixed a bug creating empty labels, fixed a bug causing a fail on empty lines in jsonl file and added options to specify json field names --- jsonl_to_conll/cli.py | 7 +++++-- jsonl_to_conll/convert.py | 12 ++++++------ jsonl_to_conll/io.py | 11 ++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/jsonl_to_conll/cli.py b/jsonl_to_conll/cli.py index cdfb230..f920b9c 100644 --- a/jsonl_to_conll/cli.py +++ b/jsonl_to_conll/cli.py @@ -5,11 +5,14 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("input_filename", help="Input JSONL filename", type=str) parser.add_argument("output_filename", help="Output CONLL filename", type=str) + parser.add_argument("-s", "--separator", help="Separator to use between words and tags", type=str, default=' ', nargs='?') + parser.add_argument("--text_field", help="Name of the JSON field the text is stored in", type=str, default='text', nargs='?') + parser.add_argument("--label_field", help="Name of the JSON field the labels are stored in", type=str, default='label', nargs='?') args = parser.parse_args() data = io.read_jsonl(args.input_filename) - data = convert.flatten_all(data) - io.json_to_text(data, args.output_filename) + data = convert.flatten_all(data, args.text_field, args.label_field) + io.json_to_text(data, args.output_filename, args.separator) if __name__ == "__main__": main() diff --git a/jsonl_to_conll/convert.py b/jsonl_to_conll/convert.py index f7dfa2a..5047292 100644 --- a/jsonl_to_conll/convert.py +++ b/jsonl_to_conll/convert.py @@ -1,13 +1,13 @@ import json -def flatten(data): +def flatten(data, text_field, label_field): output_text = [] beg_index = 0 end_index = 0 - text = data["text"] - all_labels = sorted(data["labels"]) + text = data[text_field] + all_labels = sorted(data[label_field]) for ind in range(len(all_labels)): next_label = all_labels[ind] @@ -16,12 +16,12 @@ def flatten(data): label = next_label beg_index = label[0] end_index = label[1] - label_text = text[beg_index:end_index] + label_text = text[beg_index:end_index].strip() output_text += [(label_word, "B-" + label[2]) if not i else (label_word, "I-" + label[2]) for i, label_word in enumerate(label_text.split(" "))] output_text += [(label_word, "O") for label_word in text[end_index:].strip().split()] return output_text -def flatten_all(datas): - return [flatten(data) for data in datas] +def flatten_all(datas, text_field, label_field): + return [flatten(data, text_field, label_field) for data in datas] diff --git a/jsonl_to_conll/io.py b/jsonl_to_conll/io.py index c8f8c32..c27711a 100644 --- a/jsonl_to_conll/io.py +++ b/jsonl_to_conll/io.py @@ -1,15 +1,12 @@ import json -def json_to_text(jsons, output_filename): +def json_to_text(jsons, output_filename, separator): with open(output_filename, "w") as f: for each_json in jsons: for line in each_json: - f.writelines(" ".join(line) + "\n") - f.writelines("\n") + f.write(separator.join(line) + "\n") + f.write("\n") def read_jsonl(filename): - result = [] with open(filename, "r") as f: - for line in f.readlines(): - result.append(json.loads(line)) - return result + return [json.loads(line) for line in f if line.strip()] From 47e861515e0ff3439f4b387ca025094e6d503554 Mon Sep 17 00:00:00 2001 From: Martin Borcin Date: Wed, 8 Sep 2021 18:52:43 +0200 Subject: [PATCH 2/2] updated READMEs --- README.md | 23 +++++++++++++++++++++-- README.rst | 29 +++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b0f0ac4..b9014b9 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,31 @@ pip3 install --user jsonl-to-conll ## Usage ### Sample Usage +Basic usage: ```bash jsonl-to-conll input.jsonl output.conll ``` +To specify a separator for the conll file and atypical names for the 'text' and 'label' json fields: +```bash +jsonl-to-conll input.jsonl output.conll -s $'\t' --text_field 'data' --label_field 'labels' +``` + ### Documentation ```bash -usage: jsonl-to-conll [-h] input_filename output_filename -jsonl-to-conll: error: the following arguments are required: input_filename, output_filename +usage: jsonl-to-conll [-h] [-s [SEPARATOR]] [--text_field [TEXT_FIELD]] [--label_field [LABEL_FIELD]] input_filename output_filename + +positional arguments: + input_filename Input JSONL filename + output_filename Output CONLL filename + +optional arguments: + -h, --help show this help message and exit + -s [SEPARATOR], --separator [SEPARATOR] + Separator to use between words and tags + --text_field [TEXT_FIELD] + Name of the JSON field the text is stored in + --label_field [LABEL_FIELD] + Name of the JSON field the labels are stored in + ``` diff --git a/README.rst b/README.rst index cd2f280..26579de 100644 --- a/README.rst +++ b/README.rst @@ -19,14 +19,39 @@ Usage Sample Usage ^^^^^^^^^^^^ + +Basic usage: + .. code-block:: bash jsonl-to-conll input.jsonl output.conll + +To specify a separator for the conll file and atypical names for the 'text' and 'label' json fields: + +.. code-block:: bash + + jsonl-to-conll input.jsonl output.conll -s $'\t' --text_field 'data' --label_field 'labels' + + + Documentation ^^^^^^^^^^^^^ .. code-block:: bash + + usage: jsonl-to-conll [-h] [-s [SEPARATOR]] [--text_field [TEXT_FIELD]] [--label_field [LABEL_FIELD]] input_filename output_filename + + positional arguments: + input_filename Input JSONL filename + output_filename Output CONLL filename + + optional arguments: + -h, --help show this help message and exit + -s [SEPARATOR], --separator [SEPARATOR] + Separator to use between words and tags + --text_field [TEXT_FIELD] + Name of the JSON field the text is stored in + --label_field [LABEL_FIELD] + Name of the JSON field the labels are stored in - usage: jsonl-to-conll [-h] input_filename output_filename - jsonl-to-conll: error: the following arguments are required: input_filename, output_filename