Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,31 @@ pip3 install --user jsonl-to-conll

## Usage
### Sample Usage
Basic usage:
```bash
jsonl-to-conll input.jsonl output.conll
```

To specify the separator used in the CoNLL output and non-default names for the 'text' and 'label' JSON fields:
```bash
jsonl-to-conll input.jsonl output.conll -s $'\t' --text_field 'data' --label_field 'labels'
```

### Documentation
```bash
usage: jsonl-to-conll [-h] input_filename output_filename
jsonl-to-conll: error: the following arguments are required: input_filename, output_filename
usage: jsonl-to-conll [-h] [-s [SEPARATOR]] [--text_field [TEXT_FIELD]] [--label_field [LABEL_FIELD]] input_filename output_filename

positional arguments:
input_filename Input JSONL filename
output_filename Output CONLL filename

optional arguments:
-h, --help show this help message and exit
-s [SEPARATOR], --separator [SEPARATOR]
Separator to use between words and tags
--text_field [TEXT_FIELD]
Name of the JSON field the text is stored in
--label_field [LABEL_FIELD]
Name of the JSON field the labels are stored in

```
29 changes: 27 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,39 @@ Usage
Sample Usage
^^^^^^^^^^^^


Basic usage:

.. code-block:: bash

jsonl-to-conll input.jsonl output.conll


To specify the separator used in the CoNLL output and non-default names for the 'text' and 'label' JSON fields:

.. code-block:: bash

jsonl-to-conll input.jsonl output.conll -s $'\t' --text_field 'data' --label_field 'labels'



Documentation
^^^^^^^^^^^^^

.. code-block:: bash

usage: jsonl-to-conll [-h] [-s [SEPARATOR]] [--text_field [TEXT_FIELD]] [--label_field [LABEL_FIELD]] input_filename output_filename

positional arguments:
input_filename Input JSONL filename
output_filename Output CONLL filename

optional arguments:
-h, --help show this help message and exit
-s [SEPARATOR], --separator [SEPARATOR]
Separator to use between words and tags
--text_field [TEXT_FIELD]
Name of the JSON field the text is stored in
--label_field [LABEL_FIELD]
Name of the JSON field the labels are stored in

usage: jsonl-to-conll [-h] input_filename output_filename
jsonl-to-conll: error: the following arguments are required: input_filename, output_filename
7 changes: 5 additions & 2 deletions jsonl_to_conll/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("input_filename", help="Input JSONL filename", type=str)
parser.add_argument("output_filename", help="Output CONLL filename", type=str)
parser.add_argument("-s", "--separator", help="Separator to use between words and tags", type=str, default=' ', nargs='?')
parser.add_argument("--text_field", help="Name of the JSON field the text is stored in", type=str, default='text', nargs='?')
parser.add_argument("--label_field", help="Name of the JSON field the labels are stored in", type=str, default='label', nargs='?')
args = parser.parse_args()

data = io.read_jsonl(args.input_filename)
data = convert.flatten_all(data)
io.json_to_text(data, args.output_filename)
data = convert.flatten_all(data, args.text_field, args.label_field)
io.json_to_text(data, args.output_filename, args.separator)

if __name__ == "__main__":
main()
12 changes: 6 additions & 6 deletions jsonl_to_conll/convert.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import json


def flatten(data):
def flatten(data, text_field, label_field):
output_text = []
beg_index = 0
end_index = 0

text = data["text"]
all_labels = sorted(data["labels"])
text = data[text_field]
all_labels = sorted(data[label_field])

for ind in range(len(all_labels)):
next_label = all_labels[ind]
Expand All @@ -16,12 +16,12 @@ def flatten(data):
label = next_label
beg_index = label[0]
end_index = label[1]
label_text = text[beg_index:end_index]
label_text = text[beg_index:end_index].strip()
output_text += [(label_word, "B-" + label[2]) if not i else (label_word, "I-" + label[2]) for i, label_word in enumerate(label_text.split(" "))]

output_text += [(label_word, "O") for label_word in text[end_index:].strip().split()]
return output_text


def flatten_all(datas, text_field="text", label_field="labels"):
    """Flatten every annotated record in ``datas``.

    Args:
        datas: Iterable of records; each record is handed to ``flatten``
            unchanged.
        text_field: Key under which each record stores its text. Defaults
            to ``"text"``, the key used before the field name became
            configurable, so one-argument callers keep working.
        label_field: Key under which each record stores its labels.
            Defaults to ``"labels"`` for the same reason.

    Returns:
        A list holding one ``flatten`` result per record, in input order.
    """
    return [flatten(data, text_field, label_field) for data in datas]
11 changes: 4 additions & 7 deletions jsonl_to_conll/io.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import json

def json_to_text(jsons, output_filename, separator=" "):
    """Write flattened sentences to ``output_filename``, one token per line.

    Every item of a sentence is written on its own line with its parts
    joined by ``separator``; a blank line is written after each sentence.

    Args:
        jsons: Iterable of sentences, each an iterable of sequences of
            strings (for example ``(word, tag)`` pairs).
        output_filename: Path of the file to create or overwrite.
        separator: String placed between the parts of a line. Defaults to
            a single space, the separator used before it became
            configurable, so two-argument callers keep working.
    """
    # Explicit UTF-8 keeps the output independent of the platform's
    # locale encoding, which matters for non-ASCII tokens.
    with open(output_filename, "w", encoding="utf-8") as f:
        for each_json in jsons:
            f.writelines(separator.join(line) + "\n" for line in each_json)
            # Blank line marks the end of the sentence.
            f.write("\n")

def read_jsonl(filename):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        filename: Path of the JSONL file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON Lines files are UTF-8; reading with an explicit encoding avoids
    # depending on the platform's locale default.
    with open(filename, "r", encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a trailing newline) carry no record.
        return [json.loads(line) for line in f if line.strip()]