preprocess_copy.py
import argparse
import codecs
import json
from collections import Counter

# Progress bar for the preprocessing loop
from tqdm import tqdm
tqdm.monitor_interval = 0

print("Loaded libraries...")

parser = argparse.ArgumentParser(
    description="Builds extractive copy targets (token-level tags) by "
                "aligning each source line against its target summary.")
parser.add_argument('-src', required=True, type=str,
                    help="Path of the src file.")
parser.add_argument('-tgt', required=True, type=str,
                    help="Path of the tgt file.")
parser.add_argument('-output', type=str,
                    default='data/processed/multicopy',
                    help="Path prefix of the output files.")
parser.add_argument('-prune', type=int, default=200,
                    help="Prune each source to this number of words.")
parser.add_argument('-num_examples', type=int, default=100000,
                    help="Prune to this number of examples.")
opt = parser.parse_args()
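
# Example invocation (illustrative; the data paths are placeholders, not
# taken from the repository):
#   python preprocess_copy.py -src data/train.src.txt -tgt data/train.tgt.txt \
#       -output data/processed/multicopy -prune 200 -num_examples 100000
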

def compile_substring(start, end, split):
    """Join the tokens split[start..end] (inclusive) into one string."""
    if start == end:
        return split[start]
    return " ".join(split[start:end + 1])


def format_json(s):
    """Wrap a sentence in the one-JSON-object-per-line prediction format."""
    return json.dumps({'sentence': s}) + "\n"


def splits(s, num=200):
    """Whitespace-tokenize s and keep at most num tokens."""
    return s.split()[:num]


def make_BIO_tgt(s, t):
    """Tag each source token with "1" if it lies in the longest source span
    that also appears (as a raw substring) in the target t, else "0".
    A repeated span is only tagged the first time it occurs.
    s is already a list of tokens; t is the raw target string."""
    ssplit = s  # already tokenized by splits()
    startix = 0
    endix = 0
    matches = []
    matchstrings = Counter()
    while endix < len(ssplit):
        # Greedily grow the span while it still matches the target;
        # the second check makes sure that phrases at the end can be copied.
        searchstring = compile_substring(startix, endix, ssplit)
        if searchstring in t and endix < len(ssplit) - 1:
            endix += 1
        else:
            # Use `startix >= endix - 1` here if you only want
            # phrases of length > 1, not single words.
            if startix >= endix:
                # No match at all: tag the token with "0".
                matches.extend(["0"] * (endix - startix + 1))
                endix += 1
            else:
                # The span [startix, endix-1] matched the target.
                # Make the first tag "2" if you want B/I-style phrase tags.
                full_string = compile_substring(startix, endix - 1, ssplit)
                if matchstrings[full_string] >= 1:
                    # Span was already copied once: suppress the repeat.
                    matches.extend(["0"] * (endix - startix))
                else:
                    matches.extend(["1"] * (endix - startix))
                    matchstrings[full_string] += 1
            startix = endix
    return " ".join(matches)
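
# Illustrative example (made-up tokens, not from the repository's data):
#   s = ["the", "cat", "sat", "on", "the", "mat"]
#   t = "the cat sat quietly"
#   make_BIO_tgt(s, t) -> "1 1 1 0 1 0"
# "the cat sat" is the longest source span found in t, and the second "the"
# matches on its own. Note that `searchstring in t` is a raw substring test,
# so a short token can also match inside a longer target word.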

def main():
    max_total = opt.num_examples
    SOURCE_PATH = opt.src
    TARGET_PATH = opt.tgt
    NEW_TARGET_PATH = opt.output + ".txt"
    # The extensions below are kept as in the original script, even though
    # they look swapped relative to the variable names.
    PRED_SRC_PATH = opt.output + ".pred.txt"
    PRED_TGT_PATH = opt.output + ".src.txt"

    # First pass: count lines (up to max_total) so tqdm can show progress.
    lcounter = 0
    with codecs.open(SOURCE_PATH, 'r', "utf-8") as sfile:
        for _ in sfile:
            lcounter += 1
            if lcounter >= max_total:
                break

    sfile = codecs.open(SOURCE_PATH, 'r', "utf-8")
    tfile = codecs.open(TARGET_PATH, 'r', "utf-8")
    outf = codecs.open(NEW_TARGET_PATH, 'w', "utf-8", buffering=1)
    outf_tgt_src = codecs.open(PRED_SRC_PATH, 'w', "utf-8", buffering=1)
    outf_tgt_tgt = codecs.open(PRED_TGT_PATH, 'w', "utf-8", buffering=1)

    actual_lines = 0
    for s, t in tqdm(zip(sfile, tfile), total=lcounter):
        ssplit = splits(s, num=opt.prune)
        # Skip (near-)empty lines.
        if len(ssplit) < 2 or len(t.split()) < 2:
            continue
        actual_lines += 1
        # Build the tag sequence for this source/target pair.
        tgt = make_BIO_tgt(ssplit, t)
        # Training format for allennlp: space-separated token###tag pairs.
        for token, tag in zip(ssplit, tgt.split()):
            outf.write(token + "###" + tag + " ")
        outf.write("\n")
        # Prediction format for allennlp: one JSON sentence per line,
        # plus the gold tags in a parallel file.
        outf_tgt_src.write(format_json(" ".join(ssplit)))
        outf_tgt_tgt.write(tgt + "\n")
        if actual_lines >= max_total:
            break

    sfile.close()
    tfile.close()
    outf.close()
    outf_tgt_src.close()
    outf_tgt_tgt.close()


if __name__ == "__main__":
    main()
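
# Sketch of the resulting files, assuming the example tokens above
# (illustrative, not actual repository output):
#   <output>.txt       ->  the###1 cat###1 sat###1 on###0 the###1 mat###0
#   <output>.pred.txt  ->  {"sentence": "the cat sat on the mat"}
#   <output>.src.txt   ->  1 1 1 0 1 0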