-
Notifications
You must be signed in to change notification settings - Fork 108
/
make_sentlines.py
41 lines (33 loc) · 1.08 KB
/
make_sentlines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import sys
from glob import glob
from blingfire import text_to_sentences
# Directory that is searched for '*.txt' input files; read from the first
# command-line argument (raises IndexError if no argument is given).
file_dir = sys.argv[1]
def convert_into_sentences(lines):
    """Split paragraph-structured lines of text into sentences.

    Consecutive non-blank lines are treated as one paragraph: they are
    stripped, joined with single spaces and handed to ``text_to_sentences``,
    whose result is split on newlines to obtain the individual sentences.
    A blank (whitespace-only) line ends the current paragraph.

    Args:
        lines: Iterable of strings, e.g. the lines of a text file.

    Returns:
        A ``(sentences, count)`` tuple. ``sentences`` is a list holding the
        sentences of every paragraph in order, with a ``'\\n'`` entry added
        after each paragraph that was terminated by a blank line. ``count``
        is the number of sentences, not counting those ``'\\n'`` entries.
    """
    def split_paragraph(pieces):
        # Collapse the paragraph to a single line before segmentation, then
        # break the segmenter's output into one string per sentence.
        paragraph = " ".join(pieces).strip().replace('\n', ' ')
        return text_to_sentences(paragraph).split('\n')

    sentences = []
    total = 0
    pending = []  # stripped lines of the paragraph currently being read
    for raw in lines:
        text = raw.strip()
        if text:
            pending.append(text)
        elif pending:
            # A blank line closes the paragraph collected so far; further
            # blank lines in a row are ignored because nothing is pending.
            found = split_paragraph(pending)
            sentences += found
            total += len(found)
            sentences.append('\n')
            pending = []

    # Flush a final paragraph that was not followed by a blank line; no
    # '\n' entry is added in this case.
    if pending:
        found = split_paragraph(pending)
        sentences += found
        total += len(found)
    return sentences, total
# Gather the '*.txt' files located directly in file_dir, in sorted order so
# that files are always processed in the same sequence.
file_list = sorted(glob(os.path.join(file_dir, '*.txt')))

for i, file_path in enumerate(file_list):
    # Open each input through a context manager so its handle is closed as
    # soon as the lines are read, instead of leaking one handle per file.
    with open(file_path) as f:
        sents, n_sent = convert_into_sentences(f.readlines())
    # Sentences are written to stdout one per line, followed by a run of
    # blank lines that separates this file's output from the next one.
    print('\n'.join(sents))
    print('\n\n\n\n')
    # Progress is reported on stderr to keep it out of the sentence stream:
    # zero-based file index / file count, sentence count, file path.
    sys.stderr.write(
        '{}/{}\t{}\t{}\n'.format(i, len(file_list), n_sent, file_path))