-
Notifications
You must be signed in to change notification settings - Fork 0
/
simpleqe.py
228 lines (194 loc) · 8.07 KB
/
simpleqe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/python
# Mikel L Forcada 2017
#
# Weighted distance-based quality (postediting time) estimation
# The programme reads three training files and three testing files
# Three files: (1) time measurement, (2) source segment, (3) MTed segment
# The program performs a grid search in parameter space (2 parameters: alpha
# and beta).
# When the grid is 1 x 1, can be used to test
# To do (20170126)
# Lowercasing (not lowercased)
import sys
import argparse
# from functools import wraps
import math
import random
import mpmath # to avoid underflows in exponentials
import os
# Incorporating a better tokenizer which is Unicode-aware
from nltk.tokenize import word_tokenize
# Python 2-only hack: reload() re-exposes sys.setdefaultencoding (which
# site.py deletes at startup) so implicit str<->unicode conversions use
# UTF-8 instead of ASCII. This line fails on Python 3, but the rest of
# the script (print statements, xrange) is Python 2 anyway.
reload(sys)
sys.setdefaultencoding("utf-8")
def readdata(filename):
    """Read *filename* and return its contents as a list of lines.

    Trailing newlines are stripped before splitting, so a final newline
    in the file does not yield a spurious empty last element.

    :param filename: path of the text file to read
    :return: list of line strings (no newline characters)
    """
    # "with" guarantees the file handle is closed even on error;
    # the original left it to the garbage collector.
    with open(filename) as f:
        return f.read().rstrip("\n").split("\n")
# Levenshtein distance between two sequences, taken from
# https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    Works on any indexable sequences (strings for character-based
    distance, token lists for word-based distance). Uses the two-row
    dynamic-programming formulation: O(len(seq1)*len(seq2)) time,
    O(len(seq2)) space.

    Changes vs. the original: ``list(range(...))`` instead of the
    Python-2-only ``range(...) + [0]`` list concatenation, ``range``
    instead of ``xrange`` (both forms run on Python 2 and 3), and the
    never-read local ``twoago`` removed.
    """
    oneago = None
    # Row layout trick: slots 0..len(seq2)-1 hold column costs and the
    # LAST slot holds the row-initial cost, reached via index -1.
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1]
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            # Substitution costs 1 only when the elements differ
            # (bool acts as 0/1 here).
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
    return thisrow[len(seq2) - 1]
# main
# Argument parsing: six positional data files (training/testing time,
# source, MT), the (alpha, beta) grid bounds and resolution, plus
# behaviour switches.
parser = argparse.ArgumentParser()
parser.add_argument("tr_time",help="Training PE time")
parser.add_argument("tr_source", help="Training source segments")
parser.add_argument("tr_mt", help="Training MTed segments")
parser.add_argument("te_time",help="Testing PE time")
parser.add_argument("te_source", help="Testing source segments")
parser.add_argument("te_mt", help="Testing MTed segments")
parser.add_argument("low_alpha", type=float, help="Low value of alpha")
parser.add_argument("high_alpha", type=float, help="High value of alpha")
parser.add_argument("low_beta", type=float, help="Low value of beta")
parser.add_argument("high_beta", type=float, help="High value of beta")
parser.add_argument("npoints", type=int, help="Number of points")
parser.add_argument("--tokenize", action="store_true", dest="tokenize", help="Use advanced tokenization (default: word and space-based)")
# Fixed: help text was missing its closing parenthesis.
parser.add_argument("--character", action="store_true", dest="character", help="Use character-based edit distance (default: word-based)")
parser.add_argument("--mae", action="store_true", dest="mae", default=False, help="Minimize according to MAE (default RMSE)")
parser.add_argument("--verbose", action="store_true", dest="verbose", default=False, help="Print each calculation")
parser.add_argument("--alpha_only", action="store_true", dest="alpha_only", default=False, help="Optimize alpha only")
# Fixed: help text said "Optimize alpha only" (copy-paste error).
parser.add_argument("--beta_only", action="store_true", dest="beta_only", default=False, help="Optimize beta only")
parser.add_argument("--produce_output", nargs=1, dest="filename", help="Write output file")
args=parser.parse_args()
# Load the six data files. Each is one segment (or one time value) per
# line; the three training files and the three testing files are assumed
# to be line-aligned with each other.
train_pe_time = readdata(args.tr_time)
train_source = readdata(args.tr_source)
train_mt = readdata(args.tr_mt)
test_pe_time = readdata(args.te_time)
test_source = readdata(args.te_source)
test_mt = readdata(args.te_mt)
# Grid-search bounds and resolution for the two weighting parameters.
alpha1 = args.low_alpha
alpha2 = args.high_alpha
beta1 = args.low_beta
beta2 = args.high_beta
points = args.npoints
# Tokenize as the input is zipped (using lambda (!))
# This could be written more nicely I assume
# idea taken from http://stackoverflow.com/questions/8372399/zip-with-list-output-instead-of-tuple
# Each postzip variant maps (times, sources, mts) to a list of
# [time, tokenized_source, tokenized_mt] triples.
if args.tokenize :
    # Unicode-aware NLTK tokenization.
    postzip = lambda a,b,c : [ [ q[0], word_tokenize(q[1]), word_tokenize(q[2]) ] for q in zip(a,b,c) ]
elif args.character :
    # Character-based distance: keep the raw strings, levenshtein()
    # will then iterate over characters.
    postzip = lambda a,b,c : [ [ q[0], q[1], q[2] ] for q in zip(a,b,c) ]
else : # poor man's tokenization
    # BUG FIX: q[2].split was missing the call parentheses, so the MT
    # side was stored as an unsplit bound method instead of a token list.
    postzip = lambda a,b,c : [ [ q[0], q[1].split() , q[2].split() ] for q in zip(a,b,c) ]
# Input is tokenized before zipping.
train_zipped=postzip(train_pe_time,train_source,train_mt) # 0 is time, 1 is source, 2 is mt
test_zipped=postzip(test_pe_time,test_source,test_mt)
# print len(test_zipped)
# Rudimentary caches for the source (ds) and MT (dmt) edit distances of
# every train--test pair: the distances do not depend on (alpha, beta),
# so they are computed once and reused across all grid points.
# -1 marks a not-yet-computed entry.
# (Idiom fix: list multiplication instead of an O(n) append loop.)
npairs = len(train_zipped) * len(test_zipped)
dscache = [-1] * npairs
dmtcache = [-1] * npairs
# Cache for the exponents of one test segment's weights (reused between
# the two passes over the training set).
expcache = [None] * len(train_zipped)
# Exponent ranges precomputed to use mpmath only if necessary:
# exponents within [safeminval, safemaxval] fit in a regular float
# after re-centering, so math.exp suffices.
epsilon =0.01 # for safety, can be zero
safeminval = (1-epsilon)*math.log(sys.float_info.min)
safemaxval = (1-epsilon)*math.log(sys.float_info.max)
saferange = safemaxval-safeminval
safemidval = (safemaxval+safeminval)/2
# Initialize optimal values to starting values
bestalpha=alpha1
bestbeta=beta1
besterr=float("inf")
# Grid index ranges: when optimizing only one parameter, collapse the
# other axis to a single point (its low value). Note the pairing:
# alpha_only collapses the beta range (brange) and vice versa.
if args.alpha_only :
    brange = range(1)
else :
    brange=range(0,points+1)
if args.beta_only :
    arange= range(1)
else :
    arange=range(0,points+1)
# Optional per-test-segment prediction output, opened once here and
# written inside the grid loop (closed at the end of the script).
if args.filename :
    out = open(args.filename[0],"w")
# Grid search: one full pass over the test set per (alpha, beta) point.
# The prediction for a test segment is the weighted mean of training PE
# times, with weight exp(-alpha*ds - beta*dmt) per training example.
for ia in arange :
    for ib in brange :
        i=0 # cache index
        # Linear interpolation between the low and high grid bounds.
        currentalpha = alpha1 + (alpha2-alpha1)*ia/points
        currentbeta = beta1 + (beta2-beta1)*ib/points
        forRMSE=0
        forMAE=0
        test_samples=0
        for test in test_zipped :
            using_mpmath=False
            minexp=float("inf") # building a range for the exponential
            maxexp=-float("inf") # building a range for the exponential
            iexp = 0 # exponential cache index
            for train in train_zipped : # first loop over train just computes
                # exponents and their ranges
                # Distances are (alpha, beta)-independent: compute once,
                # reuse at every grid point.
                if dscache[i]==-1 :
                    ds=levenshtein(test[1],train[1])
                    dscache[i]=ds
                    dmt=levenshtein(test[2],train[2])
                    dmtcache[i]=dmt
                else :
                    ds=dscache[i]
                    dmt=dmtcache[i]
                exponent=-currentalpha*ds-currentbeta*dmt
                expcache[iexp]=exponent
                iexp = iexp + 1
                if exponent<minexp :
                    minexp = exponent
                if exponent>maxexp :
                    maxexp = exponent
                i = i + 1 # next train--test pair in cache
            # end for
            # To avoid using mpmath
            # The range must fit in that of regular floats
            # In that case, values are centered using an offset
            exprange=maxexp-minexp
            expmiddle=(maxexp+minexp)/2
            if exprange < saferange :
                using_mpmath=False
                # Common offset cancels in the numerator/denominator
                # ratio, so re-centering does not change the prediction.
                offset=safemidval-expmiddle
            else :
                using_mpmath=True
            iexp = 0 # exponential cache index
            numerator=0
            denominator=0
            for train in train_zipped : # second loop over train computes
                # the actual weight of each example
                if using_mpmath : # for this test example
                    factor=mpmath.exp(expcache[iexp])
                    if args.verbose :
                        print "Used mpmath:", i, iexp, expcache[iexp]
                else:
                    factor=math.exp(offset+expcache[iexp]) # the offset affects
                    # all terms in the
                    # numerator and the
                    # denominator
                iexp = iexp + 1
                numerator = numerator + factor*float(train[0])
                denominator = denominator + factor
            # end for
            # Prediction = distance-weighted mean of training PE times.
            predicted_time = numerator / denominator
            if args.filename:
                # NOTE(review): at this point iexp always equals
                # len(train_zipped); presumably a per-test-segment index
                # was intended here -- confirm with the author.
                out.write("AlaShefLen_word_mt\t{0}\t{1}".format(iexp,predicted_time)+os.linesep)
            test_samples = test_samples + 1
            dev=predicted_time-float(test[0])
            forRMSE = forRMSE + dev*dev
            forMAE = forMAE + math.fabs(dev)
        RMSE = math.sqrt(forRMSE/test_samples)
        MAE = forMAE/test_samples
        if args.verbose :
            print currentalpha, currentbeta, RMSE, MAE
        # Track the best grid point under the chosen error metric.
        if args.mae :
            err=MAE
        else :
            err=RMSE
        if err < besterr :
            besterr = err
            bestalpha = currentalpha
            bestbeta = currentbeta
            # Report each improvement as it is found.
            print bestalpha, bestbeta, "RMSE=", RMSE, "MAE=", MAE
# Final summary over the whole grid.
print "Best of", test_samples, ":"
print bestalpha, bestbeta, besterr
if args.filename :
    out.close()