forked from google-research/bert
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
32 lines (27 loc) · 801 Bytes
/
preprocess.py
File metadata and controls
32 lines (27 loc) · 801 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from __future__ import print_function
import sys
import argparse
from nltk import tokenize
class Preprocess:
    """Line filter that turns marked-up document text into one sentence per line.

    Input is read line by line. Lines starting with ``<doc`` or ``[[`` are
    dropped, a ``</doc>`` line becomes an empty output line (a document
    separator), and every other non-blank line is split into sentences with
    ``nltk.tokenize.sent_tokenize``.
    """

    def __init__(self):
        # Label for this step; nothing in this class reads it.
        self.task = 'preprocess'

    def proc(self, instream=None, outstream=None):
        """Filter ``instream`` and write the resulting sentences to ``outstream``.

        Args:
            instream: Text stream providing ``readline()``. Defaults to
                ``sys.stdin``, resolved at call time.
            outstream: Text stream to print to. Defaults to ``sys.stdout``,
                resolved at call time.

        Processing stops at end of input, or when a ``KeyboardInterrupt``
        arrives while waiting for the next line.
        """
        src = sys.stdin if instream is None else instream
        dst = sys.stdout if outstream is None else outstream

        while True:
            # Only the (possibly blocking) read is guarded, so Ctrl-C at the
            # prompt ends the run quietly instead of with a traceback.
            try:
                line = src.readline()
            except KeyboardInterrupt:
                break
            if not line:
                # readline() returns an empty string only at end of input.
                break

            line = line.strip()
            if not line:
                continue

            if line == '</doc>':
                # End of a document: emit a blank separator line.
                print('', file=dst)
                continue

            # Skip document-opening tags and lines that begin with "[[".
            if line.startswith(('<doc', '[[')):
                continue

            for sentence in tokenize.sent_tokenize(line):
                print(sentence, file=dst)
if __name__ == '__main__':
    # No options are defined; parsing is still performed so that -h/--help
    # works and unexpected command-line arguments are rejected.
    argparse.ArgumentParser().parse_args()

    # Filter standard input to standard output.
    Preprocess().proc()