#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Simple ukWaC corpus semantic roles API.
Allows you to parse ukWaC extracted head files.
ukWaC corpus: http://wacky.sslmit.unibo.it/doku.php?id=corpora
"A large corpus automatically annotated with semantic role information"
http://dgfs2017.uni-saarland.de/wordpress/abstracts/clposter/cl_5_saye.pdf
"An exploration of semantic features in an unsupervised thematic fit evaluation
framework"
http://www.coli.uni-saarland.de/~asayeed/roledm-journal.pdf
Usage examples:
from ukwac_api import Corpus
corpus = Corpus('ukwac_corpus') # set corpus dir with gzipped files
corpus.create('ukwac.h5', mode='set') # generates .pickles and train/valid/test .h5 files
CorpusReader is a subclass of Corpus, so you might prefer:
from ukwac_api import CorpusReader
reader = CorpusReader('ukwac_corpus')
reader.create() # generates .pickles only, since no .h5 fname is given
reader.connect('ukwac.h5') # set the generated .h5 file
reader.read() # returns iterator
You can read from corpus files directly without conversion to .h5.
from ukwac_api import CorpusReader
reader = CorpusReader('ukwac_corpus')
reader.read() # returns iterator
"""
from __future__ import division
import argparse
from itertools import chain, izip_longest
from collections import defaultdict as dd
from datetime import datetime
import gzip
import math
import random
import numpy as np
from keras.utils.np_utils import to_categorical
from nltk.stem.wordnet import WordNetLemmatizer
import xml.etree.cElementTree as cet
import os
import pickle
import re
import sys
import tables
class Corpus:
"""Converts ukWaC head xml file(s) to .h5 file."""
def __init__(self, corpus_path, filterfile='', algs=[]):
"""
Args:
        | *corpus_path* (str) -- directory with ukWaC head files
| *filterfile* (str) -- a file of allowed words
| *algs* (list) -- a list of allowed algorithms
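        Example (a minimal sketch; the directory and file names below are
        illustrative):
            corpus = Corpus('ukwac_corpus',
                            filterfile='wordfreq.txt',
                            algs=['malt', 'malt_span'])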
"""
self.path = corpus_path
self.wordfilter = filterfile # list of words to include
self.algs = set(algs) or set(['malt', 'malt_span', 'linear', 'failed'])
self._selfcheck()
self._setup()
def _setup(self):
"""Setting constants required for xml parsing."""
self.lem = WordNetLemmatizer()
# wordnet postags
self.wn = dd(lambda: u'n')
self.wn.update({'v': u'v', 'n': u'n', 'j': u'a', 'r': u'r'})
self.lemmatize = False
self.h5path = None # .h5 file path, default None
self.files = [os.path.join(self.path, p) for p in os.listdir(self.path)
if p.endswith(('.xml', '.gz'))]
# init wordfilter
if self.wordfilter:
self.set_wordfilter(self.wordfilter)
# couple of filtering regex for bad heads
self.junk = re.compile('|'.join(['&',
'@ord@',
'^[^\w]+.*',
'.*[^\w]+$',
'[0-9*]+',
'.*@[\w]+.[\w]+',
'_+',
'[\w]+/'
]))
# alg map
self.ALGS = {'malt': 1, 'malt_span': 2, 'linear': 3, 'failed': 4}
# roles map
self.ROLES = {'a0': 5,
'a1': 1,
'a2': 4,
'a3': 10,
'a4': 14,
'a5': 28,
'am': 25,
'am-adv': 12,
'am-cau': 8,
'am-dir': 18,
'am-dis': 6,
'am-ext': 24,
'am-loc': 2,
'am-mnr': 7,
'am-mod': 21,
'am-neg': 13,
'am-pnc': 15,
'am-prd': 27,
'am-tmp': 3,
'c-a1': 26,
'c-v': 9,
'r-a0': 11,
'r-a1': 17,
'r-a2': 20,
'r-a3': 29,
'r-am-cau': 23,
'r-am-loc': 16,
'r-am-mnr': 19,
'r-am-tmp': 22
}
def _selfcheck(self):
assert isinstance(self.path, str)
assert os.path.exists(self.path)
assert os.path.isdir(self.path)
if self.wordfilter:
assert isinstance(self.wordfilter, str)
assert os.path.exists(self.wordfilter)
assert os.path.isfile(self.wordfilter)
assert isinstance(self.algs, set)
def _valid(self, head, pos=None, alg=None, role=None):
"""
1) Check if a given head is in a word filter if the filter is set.
2) Check if alg parameter is in the allowed algorithms list. This way
you can control which results of malt, malt_span or linear algorithms
to include in the output.
3) Do various "junk" checking.
4) Check whether xml element is a singleton and should be included
into the output.
        <Even though we were able to successfully find a phrase head, it
        does not mean that the head or its governor is a valid word.
        Instead of a phrase head we might receive a chunk of punctuation
        chars, a garbled word or something else we do not want to see in
        our tensor. For this reason we need to do some light "junk"
        filtering.>
        <TODO:
        We need to check if a role contains infinitivals like "to live to
        dance": remove the "to" token and reconnect each verb with its
        governor. This is a relatively rare case, which is why I decided
        to skip it for now. Besides, it drags in unnecessary complexity.>
Args:
| *head* (str) -- phrase head or a governor token
| *pos* (str) -- word POS-tag
        | *alg* (str) -- algorithm (strategy) used to retrieve the token
(if the token is a phrase head, ``None`` if it is a governor)
| *role* (str) -- dependency type
Returns:
*bool* -- ``True`` if a given token is good, else ``False``
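        Example (illustrative, assuming no word filter is set):
            self._valid('dog', 'nn', alg='malt', role='a1')    # True
            self._valid('@ord@', 'cd', alg='malt', role='a1')  # False, junk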
"""
# check if word is in filtered list
if self.wordfilter and head not in self.wordfilter:
return False
# heads allowed algs check
if alg is not None and alg not in self.algs:
return False
# heads consistency check
if ' ' in head:
return False
if len(head) > 53: # max possible length in ukWaC
return False
elif len(re.sub(r'\w', '', head)) > 3:
return False
elif not re.sub(r'\W', '', head):
return False
elif len(head) <= 5: # filtering short words that end with numbers
if re.match(r'.*[0-9]+', head):
return False
elif len(re.sub(r'[^\.]', '', head)) > 1: # filter head with > 1 dots
return False
elif re.search(r'[;:&]', head):
return False
# heads junk check
if self.junk.match(head):
return False
# head singletons check
if alg == 'failed':
# singletons allowed postags
if pos not in {'jj', 'rb', 'vv', 'vb', 'vh'}:
return False
elif pos != 'md' and role == 'am-mod':
return False
elif role != 'am-neg':
return False
return True
def _lem(self, head, pos):
"""Lemmatize given head."""
if self.lemmatize:
return self.lem.lemmatize(head, self.wn[pos[0]])
else:
return head
def create(self, h5name=None, compress=0, lemmatize=False, mode='single'):
"""
        Create three .h5 files: train_${h5name}, valid_${h5name} and
        test_${h5name}.
        The ukWaC head files are split according to a 70/20/10
        train/valid/test ratio.
Args:
| *h5name* (str) -- .h5 file name
| *compress* (int) -- level of compression [0-9]
| *lemmatize* (bool) -- use nltk lemmatizer to lemmatize
govs and their deps (doubles processing time)
| *mode* (str) -- store mode, 'single' {gov,dep}, 'set' {gov,deps}
'single' mode stores in .h5
(sid, gov, gov pos, arg, arg pos, role, alg)
'set' mode stores in .h5
(sid, gov, gov pos, arg0, arg0 pos, role0, alg0,
arg1, arg1 pos, role1, alg1,
...)
WARNING!
.h5 files are created using "single" mode by default.
If you attempt to use "set" read mode on such .h5 files,
it will fail.
In other words, if you plan to read .h5 files using "single" mode,
do `create('ukwac_single.h5')`.
If you plan to read .h5 using "set" or "random_set" mode,
do `create('ukwac_set.h5', mode='set')`.
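        Example (a minimal sketch; file names are illustrative):
            corpus = Corpus('ukwac_corpus')
            corpus.create()  # .pickle maps only, no .h5 output
            corpus.create('ukwac_set.h5', compress=5, mode='set')
            # -> train_ukwac_set.h5, valid_ukwac_set.h5, test_ukwac_set.h5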
"""
# split files into train/valid/test groups
total = len(self.files)
s1 = int(round(total * 0.7)) # train sample size
s2 = int(round(total * 0.2)) # valid sample size
groups = [self.files[:s1], self.files[s1:s1+s2], self.files[s1+s2:]]
# word and postag mappings
wordfreq = dd(int)
wmap = dd() # word mapping from word --> int id
posmap = dd()
        for name, gr in zip(['train', 'valid', 'test'], groups):
            # use a separate name, otherwise prefixes accumulate across groups
            h5fname = name + '_' + h5name if h5name else None
            if h5fname:
                print("Creating {0} file...".format(h5fname))
            wordfreq, wmap, posmap = self._create(gr, [wordfreq, wmap, posmap],
                                                  h5fname, compress, lemmatize,
                                                  mode)
print('')
self._pickle('wordfreq.pickle', wordfreq)
self._pickle('wmap.pickle', wmap)
self._pickle('posmap.pickle', posmap)
def _create(self, files, maps, h5name, compress, lemmatize, mode):
"""
Iteratively parse ukWaC head files and convert them to .h5 file.
If h5name is None, generate .pickle word and postag mappings only.
"""
def update_progress():
progress = round((100 - (left / total * 100)), 1)
total0 = int(math.floor(total/100))
left0 = int(math.floor(left/100))
out = '[{0}] {1}%'.format('#'*(total0-left0)+'-'*(left0), progress)
elapsed = ' Elapsed: {}'.format(datetime.now() - start_time)
out = '\r' + out + ' {0}/{1}'.format(total-left, total) + elapsed
sys.stdout.write(out)
sys.stdout.flush()
            sys.stdout.write('\n') if left == 0 else None
if lemmatize:
self.lemmatize = True
# set vars for progress bar
total = len(files)
left = len(files)
start_time = datetime.now()
# continue from last time
wordfreq = maps[0]
wmap = maps[1]
posmap = maps[2]
# setup id number generators
wit = iter(xrange(int(3e6))) # approximate vocab size
pit = iter(xrange(80)) # approximate postag vocab size
# set pytables EArray
if h5name is not None:
h5file = tables.open_file(h5name, mode='w')
atom = tables.Int32Atom()
filters = None
if compress:
filters = tables.Filters(complevel=compress, complib='blosc',
fletcher32=True)
if mode == 'single':
expectedrows = int(4e9)
elif mode == 'set':
expectedrows = int(2.022e8)
earr = h5file.create_earray(h5file.root, 'ukwac', atom, (0,),
'earr', filters=filters,
expectedrows=expectedrows)
# we add -1 at the beginning of .h5 to delimit 1-st govset
if mode == 'set':
earr.append(tuple([-1])) # adding first set delimiter
# parse all corpus files
sid = 0 # sentence id num
cache = [] # gov,deps accumulator
seen = False # keep track of <s> tag
skip = 0 # number of (gov,deps) to skip in order to set correct sid
for fname in iter(files):
fname = fname if fname.endswith('.xml') else gzip.open(fname, 'r')
xmliter = cet.iterparse(fname, events=('end',))
for _, elem in xmliter:
if elem.tag == 'governor':
if mode == 'set' and cache and cache[1]:
if h5name is not None:
# sid, gov, gov pos, (arg, arg pos, role, alg)+
earr.append(tuple(cache[0] +
[i for i in chain(*cache[1])] +
[-1])) # -1 is set delimiter
cache = [] # reset
head, pos = elem.text.rsplit('/', 2)[:2]
if not self._valid(head, pos):
continue
head = self._lem(head, pos)
wordfreq[head] += 1
                    # check membership; `or` would remap the word that has id 0
                    if head not in wmap:
                        wmap[head] = wit.next()
                    if pos not in posmap:
                        posmap[pos] = pit.next()
                    cache.append([sid, wmap[head], posmap[pos]])
cache.append([]) # padding for dep, to keep row pair
elif elem.tag == 'dep' and elem.attrib.get('algorithm'):
head, pos = elem.text.rsplit('/', 2)[:2]
alg = elem.attrib.get('algorithm')
role = elem.attrib.get('type')
if cache and self._valid(head, pos, alg, role):
if pos in {'prp', 'vbg'}:
pos = 'nn'
if mode == 'single':
head = self._lem(head, pos)
wordfreq[head] += 1
                            if head not in wmap:
                                wmap[head] = wit.next()
                            if pos not in posmap:
                                posmap[pos] = pit.next()
                            cache[-1] = [wmap[head], posmap[pos],
                                         self.ROLES[role], self.ALGS[alg], sid]
if h5name is not None:
# sid, gov, gov pos, arg, arg pos, role, alg
earr.append(tuple(chain(*cache))) # flatten
continue
elif mode == 'set':
head = self._lem(head, pos)
wordfreq[head] += 1
                            if head not in wmap:
                                wmap[head] = wit.next()
                            if pos not in posmap:
                                posmap[pos] = pit.next()
                            cache[-1].append((wmap[head],
                                              posmap[pos],
                                              self.ROLES[role],
                                              self.ALGS[alg]))
# single 'end' event shifts <s> index, we need to adjust
if elem.tag == 's' and not seen:
seen = True
if seen and elem.tag != 'text':
skip += 1
if skip > 1:
seen = False
skip = 0
sid += 1
elem.clear()
del xmliter
fname.close() if isinstance(fname, gzip.GzipFile) else None
left -= 1
update_progress()
h5file.close() if h5name is not None else None
return wordfreq, wmap, posmap
def _pickle(self, fname, obj):
with open(fname, 'wb') as f:
pickle.dump(obj, f)
def _load_pickle(self, fname):
try:
with open(fname, 'rb') as f:
data = pickle.load(f)
except (IOError, OSError) as ex:
print 'ERROR! Pickle could not be loaded!'
print ex
data = {}
return data
def clean_up(self):
"""WARNING! All *.h5, *.pickle files in current dir will be removed"""
files = [fname for fname in os.listdir(os.getcwd())
if fname.endswith(('.h5', '.pickle'))]
for fname in files:
try:
os.remove(fname)
except OSError as ex:
                print 'ERROR! Cannot remove "{0}"'.format(fname)
print ex
def set_wordfilter(self, wfilter):
"""Word filter setter."""
        with open(wfilter, 'r') as f:
fdata = f.read().strip().splitlines()
self.wordfilter = frozenset(fdata)
def generate_freqlist(self, wordfreq='wordfreq.pickle', size=50000):
"""
Generate and write a list of most frequent words. You can then use this
list as a word filter to restrict the words included into the output.
Args:
| *wordfreq* (pickle) -- a word frequency dict
| *size* (int) -- number of words in the list
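        Example (a minimal sketch; assumes create() has already produced
        "wordfreq.pickle" in the current dir):
            corpus = Corpus('ukwac_corpus')
            corpus.generate_freqlist(size=20000)   # writes "wordfreq.txt"
            corpus.set_wordfilter('wordfreq.txt')  # reuse it as a word filter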
"""
wordfreq = self._load_pickle(wordfreq)
if not wordfreq:
sys.exit(1)
try:
os.remove('wordfreq.txt')
except OSError:
pass
sorted_freqs = sorted(wordfreq, key=wordfreq.get, reverse=True)[:size]
with open('wordfreq.txt', 'a') as f:
for w in sorted_freqs:
f.write(w)
f.write('\n')
print 'Successfully generated "wordfreq.txt", size:', size
class CorpusReader(Corpus):
"""Reads ukWaC head xml file(s) directly or from .h5 file."""
def __init__(self, corpus='.', filterfile='', algs=[]):
"""
Args:
        | *corpus* (str) -- directory with ukWaC head files
| *filterfile* (str) -- a file of allowed words
| *algs* (list) -- a list of allowed algorithms
"""
self.path = corpus # directory with gzipped ukWaC head files
self.wordfilter = filterfile # list of words to include
self.algs = set(algs) or set(['malt', 'malt_span', 'linear', 'failed'])
self._selfcheck()
self._setup()
def connect(self, h5file):
"""Set previously generated .h5 file and use it for reading.
Args:
| *h5file* (str) -- .h5 file name
"""
self.h5path = h5file
def disconnect(self):
"""Unset previously generated .h5 file. CorpusReader will switch into
direct reading mode."""
self.h5path = None
def _read_from_xml(self, mode, delex):
"""Read the corpus directly by iterating through its files and return
a corpus iterator. If you already generated .h5 file, load *.pickle
maps to delexicalize (if True) the output, else return xml strings.
<Make sure self.path contains ukwac files and set correctly upon
class instantiation!>
Args:
| *mode* (str) -- reading mode 'set' or 'single'
| *delex* (bool) -- use delexicalization if True
"""
# load *.pickle maps
# wordfreq = self._load_pickle('wordfreq.pickle') if delex else {}
wmap = self._load_pickle('wmap.pickle') if delex else {}
posmap = self._load_pickle('posmap.pickle') if delex else {}
# parse all corpus files
sid = 0 # sentence id num
cache = [] # gov,deps accumulator
seen = False # keep track of <s> tag
skip = 0 # number of (gov,deps) to skip in order to set correct sid
for fname in iter(self.files):
fname = fname if fname.endswith('.xml') else gzip.open(fname, 'r')
xmliter = cet.iterparse(fname, events=('end',))
for _, elem in xmliter:
if elem.tag == 'governor':
if mode == 'set' and cache and cache[1]:
yield (tuple(cache[0]), tuple(cache[1]))
cache = [] # reset
head, pos = elem.text.rsplit('/', 2)[:2]
if not self._valid(head, pos):
continue
head = self._lem(head, pos)
                    cache.append([sid, wmap.get(head, head),
                                  posmap.get(pos, pos)])
cache.append([]) # padding for dep, to keep row pair
elif elem.tag == 'dep' and elem.attrib.get('algorithm'):
head, pos = elem.text.rsplit('/', 2)[:2]
alg = elem.attrib.get('algorithm')
role = elem.attrib.get('type')
if cache and self._valid(head, pos, alg, role):
if pos in {'prp', 'vbg'}:
pos = 'nn'
if mode == 'single':
head = self._lem(head, pos)
                            cache[-1] = (wmap.get(head, head),
                                         posmap.get(pos, pos),
                                         self.ROLES[role],
                                         self.ALGS[alg])
yield tuple(chain(*cache))
# continue
elif mode == 'set':
head = self._lem(head, pos)
                            cache[-1].append((wmap.get(head, head),
                                              posmap.get(pos, pos),
                                              self.ROLES[role],
                                              self.ALGS[alg]))
# single 'end' event shifts <s> index, we need to adjust
if elem.tag == 's' and not seen:
seen = True
if seen and elem.tag != 'text':
skip += 1
if skip > 1:
seen = False
skip = 0
sid += 1
elem.clear()
del xmliter
fname.close() if isinstance(fname, gzip.GzipFile) else None
def _read_from_h5(self, mode):
"""Read from previously generated .h5 file.
Args:
mode (str) -- reading mode: "single", "set", "random_set"
WARNING!
.h5 file is created using "single" mode by default.
If you attempt to use "set" read mode on such .h5 file,
this function will crash.
In other words, if you plan to read .h5 file using "single" mode,
do create('ukwac_single.h5').
If you plan to read .h5 using "set" or "random_set" mode,
do create('ukwac_set.h5', mode='set').
"""
h5 = tables.open_file(self.h5path, mode='r')
ukwac = h5.root.ukwac
if mode == "single":
chunk = [iter(ukwac)] * 7 # chunk size is constant
for row in izip_longest(fillvalue=None, *chunk):
yield row
elif mode == "set":
cache = []
# in 'set' mode chunk size is not constant
itukwac = iter(ukwac)
itukwac.next() # we skip the first -1
for elem in itukwac:
if elem == -1:
yield tuple(cache)
cache = []
else:
cache.append(elem)
elif mode == "random_set":
sids = set()
while len(sids) < 202193377: # number of govsets in ukwac
start = random.randint(0, int(2.022e8)) # "set" earray size
end = start + 170 # extracted from ukwac
cache = []
begin = False
first = False
for i in ukwac.iterrows(start, end):
if i == -1 and not begin:
begin = True
first = True
elif i != -1 and begin:
                        # check whether this sent id has already been seen
if first and i in sids:
begin = False
first = False
continue
elif first and i not in sids:
cache.append(i)
first = False
continue
cache.append(i)
elif i == -1 and begin:
begin = False
sids.add(cache[0])
yield tuple(cache)
cache = []
h5.close()
def _extract_elems(self, mode, delex, extract):
"""
Read and extract given xml elements from corpus.
Args:
| *mode* (str) -- reading mode, "set"/"single"
| *delex* (bool) -- apply delexicalization (words -> digits)
        | *extract* (list) -- extract only the xml attributes named in
          the provided list.
"""
# load *.pickle maps
# wordfreq = self._load_pickle('wordfreq.pickle') if delex else {}
wmap = self._load_pickle('wmap.pickle') if delex else {}
posmap = self._load_pickle('posmap.pickle') if delex else {}
elems = frozenset(extract)
attrs = dd()
# parse all corpus files
sid = 0 # sentence id num
cache = [] # gov,deps accumulator
seen = False # keep track of <s> tag
skip = 0 # number of (gov,deps) to skip in order to set correct sid
for fname in iter(self.files):
fname = fname if fname.endswith('.xml') else gzip.open(fname, 'r')
xmliter = cet.iterparse(fname, events=('end',))
for _, elem in xmliter:
if elem.tag == 'governor':
if mode == 'set' and cache and cache[1]:
yield (tuple(cache[0]), tuple(cache[1]))
cache = [] # reset
head, pos = elem.text.rsplit('/', 2)[:2]
if not self._valid(head, pos):
continue
head = self._lem(head, pos)
                    cache.append([sid, wmap.get(head, head),
                                  posmap.get(pos, pos)])
cache.append([]) # padding for dep, to keep row pair
elif elem.tag == 'dep' and elem.attrib.get('algorithm'):
head, pos = elem.text.rsplit('/', 2)[:2]
alg = elem.attrib.get('algorithm')
role = elem.attrib.get('type')
if cache and self._valid(head, pos, alg, role):
if pos in {'prp', 'vbg'}:
pos = 'nn'
# collecting extract attributes
if elems is not None:
attrs = dd()
for el in elems:
attrs[el] = tuple(tuple(e.rsplit('/', 2))
for e in
elem.attrib.get(el).split())
if mode == 'single':
head = self._lem(head, pos)
                            cache[-1] = (wmap.get(head, head),
                                         posmap.get(pos, pos),
                                         self.ROLES[role],
                                         self.ALGS[alg],
                                         attrs)
yield tuple(chain(*cache))
elif mode == 'set':
head = self._lem(head, pos)
                            cache[-1].append((wmap.get(head, head),
                                              posmap.get(pos, pos),
                                              self.ROLES[role],
                                              self.ALGS[alg],
                                              attrs))
# single 'end' event shifts <s> index, we need to adjust
if elem.tag == 's' and not seen:
seen = True
if seen and elem.tag != 'text':
skip += 1
if skip > 1:
seen = False
skip = 0
sid += 1
elem.clear()
del xmliter
fname.close() if isinstance(fname, gzip.GzipFile) else None
def read(self, mode='set', delex=False, extract=[], lemmatize=False):
"""
Read the corpus using one of the given modes.
Reading modes: 'single', 'set', 'random_set'
        'single' mode returns a tuple of a governor and its dependent argument:
            ('sent id', 'gov', 'gov postag', 'arg', 'arg postag', 'role', 'alg')
        'set' mode returns a tuple of tuples of a governor and all its
        dependent arguments found within one sentence:
('sent id', 'governor', 'gov postag',
(('arg', 'arg postag', 'role', 'alg'), (...), (...))
)
'random_set' mode (.h5 only) returns a randomly (without replacement)
chosen part of the .h5 file, looks for a valid chunk of gov and its
args and returns it.
Delexicalization (here) is the process of replacing words with ints.
If h5 corpus file is connected, read from it. Keep in mind that mode
and delexicalization won't work when reading from h5. Reading from h5
is intended to be fast and its elements are already delexicalized.
If extract list of additional xml attributes is provided, read and
extract these attributes from the corpus and return together with the
main tuple. The attributes will be collected in a dict and supplied as
the last element of the output tuple.
        Mode will be applied accordingly if set.
        Delexicalization will be applied as well if *delex* is set and the
        .pickle files exist.
        If *lemmatize* is set, use the nltk lemmatizer on govs and deps. The
        "source" and "text" attribute values come from the malt parser and
        should already be lemmatized.
Args:
| *mode* (str) -- reading mode, "set"/"single"
| *delex* (bool) -- apply delexicalization (words -> digits)
        | *extract* (list) -- extract only the xml attributes named in the
          provided list. Xml head files contain 'source' and 'text'
          attributes whose values can be extracted in addition to the
          already existing ones.
| *lemmatize* (bool) -- use nltk lemmatizer to lemmatize
govs and their deps (doubles processing time)
Returns:
| *iterator* -- corpus line iterator
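        Example (a minimal sketch; file and directory names are illustrative):
            reader = CorpusReader('ukwac_corpus')
            for govset in reader.read(mode='set'):   # read xml files directly
                ...
            reader.connect('ukwac_set.h5')           # or read a "set" mode .h5
            for row in reader.read(mode='random_set'):
                ...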
"""
if lemmatize:
self.lemmatize = True
        if mode != 'random_set':
            if self.h5path is None and not extract:
                return self._read_from_xml(mode, delex)
            elif self.h5path is None and extract:
                return self._extract_elems(mode, delex, extract)
        return self._read_from_h5(mode)
# debug only
def decode(self, batch, wmap):
"""Decode batch ids into str"""
selor = dict([(v, k) for k, v in self.ROLES.items()])
pamw = dict([v, k] for k, v in wmap.items())
return dict([(selor.get(k, 'UNK_ROLE'), pamw.get(v))
for k, v in batch.items()])
def get_minibatch(self, filtered_roles, batch_size=1, random=False, neg=1):
"""
Generate k-noise samples for target role + 1 positive sample from data.
Noise and positive samples share inputs.
TODO: implement random reading
Args:
filtered_roles (set) -- roles included into batch
batch_size (int) -- batch size
random (bool) -- randomly pick data from .h5 file
neg (int) -- number of negative samples
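        Example (a minimal sketch; assumes "wmap.pickle" exists and an .h5
        file created with mode='set' is connected):
            reader = CorpusReader('ukwac_corpus')
            reader.connect('ukwac_set.h5')
            roles = {'a0', 'a1', 'am-loc', 'am-tmp'}
            for batch in reader.get_minibatch(roles, batch_size=128, neg=2):
                x_w, x_r, y_w, y_r, y_w_cat, y_r_cat, n_total, n_neg = batch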
"""
# wmap.pickle is generated by self.create() function
assert os.path.isfile('wmap.pickle'), '"wmap.pickle" is not found!'
with open('wmap.pickle', 'rb') as f:
wmap = pickle.load(f)
# unk_word_id, unk_role_id, nn_missing_word_id, nn_unk_word_id
unk_word_id = sorted(wmap.values())[-1] + 1
nn_missing_word_id = unk_word_id
# nn_unk_word_id = unk_word_id + 1
unk_role_id = sorted(self.ROLES.values())[-1] + 1
# get unigram counts
# n_words = len(wmap) # used for "unigram_counts"
n_roles = len(self.ROLES)
# unigram_counts = dict((r, [0]*n_words) for r in xrange(n_roles+1))
v_id = 0
assert type(filtered_roles) == set, '"filtered_roles" arg must be of set type!'
filtered_roles = filtered_roles ^ {unk_role_id}
# only set and random_set modes are available for minibatch
mode = 'set' if not random else 'random_set'
for govset in self.read(mode=mode):
# {role_id: word_id}
batch = dict([(self.ROLES.get(i, unk_role_id), unk_word_id)
for i in filtered_roles])
# govset=(sent_id, gov, gov_pos, arg0, arg0_pos, arg0_role, alg0,)
govid = govset[1]
# extract only [arg0, arg0_role, arg1, arg1_role, ...] ids
ids = [govset[3:][i] for i in xrange(0, len(govset[3:]), 2)]
# iterate over id pairs, fill batch with matching arg,role pairs
for arg, role in izip_longest(*[iter(ids)] * 2):
if batch.get(role):
batch[role] = arg
batch[v_id] = govid
# print batch
# print self.decode(batch,wmap)
# rest is taken from batcher.py
non_missing_inputs = [n for n, v in enumerate(batch.values())
if v != nn_missing_word_id]
roles, words = map(list, zip(*batch.items()))
x_w_i, x_r_i, y_w_i, y_r_i = [], [], [], []
n_total_samples, n_neg_samples = 0, 0
# Generate samples for each given (non-missing-word) role-word pair
for role_idx in non_missing_inputs:
# Positive sample
# Remove current role-word pair from context ...
input_words = words[:]
input_roles = roles[:]
del input_words[role_idx]
del input_roles[role_idx]
# ... and set it as target
target_word = words[role_idx]
target_role = roles[role_idx]
target_words = [target_word]
target_roles = [target_role]
x_w_i.append(input_words)
x_r_i.append(input_roles)
n_total_samples += 1
# generate k neg samples corrupting one non missing input role
for _ in xrange(neg):
noise_role = target_role
noise_word = np.random.randint(nn_missing_word_id)
# noise_word = np.random.choice(word_ids, p=unigram_counts[noise_role])
target_words.append(noise_word)
target_roles.append(noise_role)
n_neg_samples += 1
n_total_samples += 1
y_w_i.append(target_words)
y_r_i.append(target_roles)
if len(x_w_i) >= batch_size:
yield (
np.asarray(x_w_i, dtype=np.int32),
np.asarray(x_r_i, dtype=np.int32),
np.asarray(y_w_i, dtype=np.int32),
np.asarray(y_r_i, dtype=np.int32),
np.asarray(to_categorical(y_w_i, nn_missing_word_id+2),
dtype=np.int32),
# added +2 otherwise fails
np.asarray(to_categorical(y_r_i, n_roles+2),
dtype=np.int32),
n_total_samples,
n_neg_samples
)
x_w_i, x_r_i, y_w_i, y_r_i = [], [], [], []
n_total_samples, n_neg_samples = 0, 0
def main():
corpus = Corpus(args.dir, args.filter, args.algs)
if args.out and os.path.isdir(args.out):
h5file = os.path.join(args.out, 'ukwac.h5')
else:
h5file = args.out
corpus.create(h5file, args.compress, args.lemmatize)
if __name__ == "__main__":
prs = argparse.ArgumentParser(description="""Convert annotated ukWaC\
corpus to .h5 format.""")
prs.add_argument('-d', '--dir', required=True,
help='Specify the directory where annotated ukWaC corpus \
files are located.')
prs.add_argument('-o', '--out', default=None, required=False,
help='Specify the output file name. If not specified do \
not create .h5 file and generate only .pickle dicts.')
prs.add_argument('-filter', '--filter', default=[], required=False,
help='Specify the word-per-line filter. Words not in the \
file will be skipped.')
prs.add_argument('-alg', '--algs', required=False, default=[], nargs='*',
help='Specify the name of the head extraction algorithm \
whose results will be included in the output \
(default malt, malt_span, linear, failed).')
prs.add_argument('-lemmatize', '--lemmatize', required=False,
help='Apply lemmatization for govs and deps.',
action='store_true')
prs.add_argument('-c', '--compress', default=0, required=False, type=int,
help='Specify the level of compression [0-9] for h5 file\
(default 0, no compression).')
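    # Example invocation (illustrative paths):
    #   python ukwac_api.py -d ukwac_corpus -o ukwac.h5 -c 5 --lemmatize
    #   python ukwac_api.py -d ukwac_corpus --filter wordfreq.txt --algs malt linear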
args = prs.parse_args()
main()