-
Notifications
You must be signed in to change notification settings - Fork 30
/
AMICorpusHandler.py
440 lines (367 loc) · 19.4 KB
/
AMICorpusHandler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
from xml.dom import minidom
import utils
import os
import urllib.request
import zipfile
from collections import defaultdict
import nltk.data
__author__ = "Gwena Cunha"
"""
Handler for AMI Corpus
Functions:
1. Extract transcripts
2. Abstractive summary
3. Extractive summary
4. (Optional) Formatting: .TXT file to .STORY file
Parses XML in Python using minidom
"""
class AMICorpusHandler:
def __init__(self, args, in_file_ext='xml', out_file_ext='txt'):
self.args = args
self.in_file_ext = in_file_ext
self.out_file_ext = out_file_ext
self.ami_dir = self.args.ami_xml_dir + 'ami_public_manual_1.6.2'
self.download_corpus()
self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def download_corpus(self):
""" Check if AMI Corpus exists and only download it if it doesn't
:return: directory where AMI Corpus is located
"""
download_link = 'http://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip'
# directory = os.path.dirname(self.ami_dir)
if not os.path.exists(self.ami_dir):
# 1. Ensure data directory exists
utils.ensure_dir(self.args.ami_xml_dir)
# 2. Download AMI Corpus
print("Downloading AMI Corpus to: {}".format(self.ami_dir))
zipped_ami_filename = self.ami_dir + '.zip'
urllib.request.urlretrieve(download_link, zipped_ami_filename)
# 3. Unzip zip file
zip_ref = zipfile.ZipFile(zipped_ami_filename, 'r')
zip_ref.extractall(self.ami_dir)
zip_ref.close()
# 4. Delete zip file
os.remove(zipped_ami_filename)
else:
print("AMI Corpus has already been downloaded in: {}".format(self.ami_dir))
def get_corpus_directory(self):
return self.ami_dir
""" Begin 1/4: EXTRACT TRANSCRIPT """
def extract_transcript(self, do_transcripts_speaker=True):
""" Extracts transcript for each speaker (if boolean var is True) and also for the entire meeting,
where each transcript is a line in a new text file.
Summaries are originally saved in `data/ami_public_manual_1.6.2/words/`:
* Example: `EN2001a.A.words.xml`
* Meeting name: `EN2001`
* Meetings are divided into 1 hour parts: `a` (each hour is a consecutive lowercase letter)
* Speaker: `A` (usually there are four speakers named A, B, C and D, but E is sometimes also present)
Each `.xml` file has a number of tags with the words and their respective times in the audio/video file.
In order for us to extract the summaries, we have to put those words back together in sentences and paragraphs.
Thus xml parsing is required.
Output: 2 folders with corresponding .txt files
* `data/ami-transcripts-speaker/`: meeting transcripts for each speaker
* `data/ami-transcripts/`: complete meeting transcripts (all speakers together)
:param do_transcripts_speaker: boolean to indicate if speaker transcription is needed
:return:
"""
if do_transcripts_speaker:
self.extract_transcript_speaker()
print("\nSaving transcripts in: {}".format(self.args.results_transcripts_dir))
# Get every file
transcript_speaker_dir = self.args.results_transcripts_speaker_dir
transcript_speaker_files = [f for f in os.listdir(transcript_speaker_dir) if f.endswith('.{}'.format(self.out_file_ext))]
# Group speaker files by meeting
group_speaker_files = defaultdict(list)
for t in transcript_speaker_files:
meeting = t.split('.')[0]
if bool(group_speaker_files[meeting]):
group_speaker_files[meeting].append(t)
else:
group_speaker_files[meeting] = [t]
# Directory to save meetings transcripts
transcript_dir = self.args.results_transcripts_dir
utils.ensure_dir(transcript_dir)
# Group transcripts by meeting
for key in group_speaker_files.keys():
# print(group_speaker_files[key])
transcript_filename = '{}.transcript.txt'.format(key)
all_transcripts_file = open(transcript_dir + transcript_filename, 'w')
transcript_speakers_filename = group_speaker_files[key]
for transcript_filename in transcript_speakers_filename:
file_content = open(transcript_speaker_dir + transcript_filename, 'r').read()
all_transcripts_file.write(file_content + '\n')
all_transcripts_file.close()
print("Number of meetings: {}".format(len(group_speaker_files.keys())))
def extract_transcript_speaker(self):
""" Extracts transcript from each speaker (A, B, C, D, and sometimes E)
"""
print("Extracting speaker transcripts to: {}".format(self.args.results_transcripts_speaker_dir))
words_dir = self.ami_dir + '/words/'
utils.ensure_dir(self.args.results_transcripts_speaker_dir)
words_files = [f for f in os.listdir(words_dir) if f.endswith('.{}'.format(self.in_file_ext))]
for words_filename in words_files:
# print("Extracting transcript from {}".format(words_filename))
self.extract_transcript_single_file(words_dir, words_filename)
print("Transcripts: {}".format(len(words_files)))
def extract_transcript_single_file(self, words_dir, filename):
""" Extract transcript for a single .xml file
:param words_dir: directory containing .xml files with meeting transcription
:param filename: name of each .xml meeting file
:return:
"""
# parse an xml file by name
mydoc = minidom.parse(words_dir + filename)
items = mydoc.getElementsByTagName('w')
transcript = ""
count = 0
for elem in items:
elem_text = elem.firstChild.data
if count > 0 and not elem.hasAttribute('punc'):
transcript += " "
transcript += elem_text
count += 1
# transcript += '\n'
# Save transcript
results_filename = filename.split('.words.{}'.format(self.in_file_ext))[0] + '.transcript.{}'.format(self.out_file_ext)
self.save_file(transcript, self.args.results_transcripts_speaker_dir, results_filename)
return transcript
""" End: EXTRACT TRANSCRIPT """
""" Begin 2/4: ABSTRACTIVE SUMMARY """
def extract_abstractive_summary(self):
""" Obtain abstractive summary from each meeting
* Located in `data/ami_public_manual_1.6.2/abstractive/*.xml`
"""
print("\nObtaining abstractive summaries to: {}".format(self.args.results_summary_dir))
sum_dir = self.ami_dir + '/abstractive/'
sum_files = [f for f in os.listdir(sum_dir) if f.endswith('.{}'.format(self.in_file_ext))]
utils.ensure_dir(self.args.results_summary_dir)
# Obtain summary with only 1 sentence/highlight
results_abstractive_summary_dir = '{}/{}-one_highlight/'.format(self.args.results_summary_dir,
utils.ABSTRACTIVE_SUMMARY_TAG)
for sum_filename in sum_files:
self.extract_abstractive_summary_single_file_single_highlight(sum_dir, sum_filename, results_abstractive_summary_dir)
# Obtain summary with only all available sentences/highlights
results_abstractive_summary_dir = '{}/{}/'.format(self.args.results_summary_dir, utils.ABSTRACTIVE_SUMMARY_TAG)
for sum_filename in sum_files:
self.extract_abstractive_summary_single_file(sum_dir, sum_filename, results_abstractive_summary_dir)
def extract_abstractive_summary_single_file_single_highlight(self, summary_dir, summary_filename, results_dir):
""" Obtain abstractive summary (1 sentence/highlight) from one meeting (one file)
* Extract text between `abstract` tag
* Return first element inside this tag
:param summary_dir: directory containing .xml files with meeting abstract/summary
:param summary_filename: name of each summary file
:param results_dir: directory to save .txt files with summaries
:return:
"""
# parse an xml file by name
mydoc = minidom.parse(summary_dir + summary_filename)
items = mydoc.getElementsByTagName('abstract')
summary = items[0].firstChild.nextSibling.firstChild.data
# Save summary
# results_dir = self.args.results_summary_dir
results_filename = summary_filename.replace('.{}'.format(self.in_file_ext), '.{}'.format(self.out_file_ext))
# summary_filename.split('.{}'.format(self.in_file_ext))[0] + '.summary.txt' # '.summary.txt'
self.save_file(summary, results_dir, results_filename)
return summary
def extract_abstractive_summary_single_file(self, summary_dir, summary_filename, results_dir):
""" Obtain abstractive summary (ALL sentences/highlights) from one meeting (one file)
* Extract text between `abstract` tag
* Text between `abstract` tag is composed of text in `sentence` tags
* Return all these tags as a paragraph
:param summary_dir: directory containing .xml files with meeting abstract/summary
:param summary_filename: name of each summary file
:param results_dir: directory to save .txt files with summaries
:return:
"""
# parse an xml file by name
mydoc = minidom.parse(summary_dir + summary_filename)
items = mydoc.getElementsByTagName('abstract')
items_sentences = items[0].getElementsByTagName('sentence')
summary = ''
for item in items_sentences:
summary += item.firstChild.data + ' '
# Save summary
# results_dir = self.args.results_summary_dir
results_filename = summary_filename.replace('.{}'.format(self.in_file_ext), '.{}'.format(self.out_file_ext))
# summary_filename.split('.{}'.format(self.in_file_ext))[0] + '.summary.txt' # '.summary.txt'
self.save_file(summary, results_dir, results_filename)
return summary
""" End: ABSTRACTIVE SUMMARY """
""" Begin 3/4: EXTRACTIVE SUMMARY """
def extract_extractive_summary(self):
""" Extract summary from each meeting
* Located in `data/ami_public_manual_1.6.2/extractive/*.xml`
"""
print("\nObtaining extractive summaries to: {}".format(self.args.results_summary_dir))
sum_dir = self.ami_dir + '/extractive/'
utils.ensure_dir(self.args.results_summary_dir)
results_extractive_summary_dir = '{}/{}/'.format(self.args.results_summary_dir, utils.EXTRACTIVE_SUMMARY_TAG)
sum_files = [f for f in os.listdir(sum_dir) if f.endswith('.extsumm.{}'.format(self.in_file_ext))]
for sum_filename in sum_files:
self.extract_extractive_summary_single_file(sum_dir, sum_filename, results_extractive_summary_dir)
print("Summaries: {}".format(len(sum_files)))
def extract_extractive_summary_single_file(self, summary_dir, summary_filename, results_dir):
""" Obtain extractive summary (ALL sentences/highlights) from one meeting (one file)
* Extract text between `extsumm` tag
* Text between `extsumm` tag is composed of children nodes such as the below example:
* `<nite:child href="ES2002a.B.dialog-act.xml#id(ES2002a.B.dialog-act.dharshi.3)"/>`
* This node refers to a a node of ID `ES2002a.B.dialog-act.dharshi.3` in a file named `ES2002a.B.dialog-act.xml` in `data/dialogueActs/`
* This ID is:
```
<dact nite:id="ES2002a.B.dialog-act.dharshi.3" reflexivity="true">
<nite:pointer role="da-aspect" href="da-types.xml#id(ami_da_4)"/>
<nite:child href="ES2002a.B.words.xml#id(ES2002a.B.words4)..id(ES2002a.B.words16)"/>
</dact>
```
* This indicates words 4 to 16 in `ES2002a.B.words.xml`
* Return all the collected words as a paragraph
:param summary_dir: directory containing .xml files with meeting abstract/summary
:param summary_filename: name of each summary file
:param results_dir: directory to save .txt files with summaries
:return:
"""
# parse an xml file by name
mydoc = minidom.parse(summary_dir + summary_filename)
mydoc = mydoc.childNodes[0].childNodes[1] # getElementsByTagName('extsumm')
items_extsumm = mydoc.childNodes
summary = ""
for items_ext in items_extsumm:
if items_ext.attributes is not None: # Element
filename, init_id, final_id = self.obtain_ids(items_ext)
summary += self.obtain_dialogue_act_node_references(filename, init_id, final_id)
# Save summary
# results_dir = self.args.results_summary_dir
results_filename = summary_filename.replace('.{}'.format(self.in_file_ext), '.{}'.format(self.out_file_ext))
self.save_file(summary, results_dir, results_filename)
return summary
def obtain_dialogue_act_node_references(self, filename, init_id, final_id):
""" Obtain node references from extractive summary file
:param filename:
:param init_id: initial ID for extractive summary
:param final_id: final ID for extractive summary
:return:
"""
sum_dir = self.ami_dir + '/dialogueActs/'
# Obtain array with dialogAct IDs
print(init_id)
id_arr = [init_id]
if final_id is not None:
init_id_index = int(init_id.split('dialog-act.')[-1].split('.')[-1])
final_id_index = int(final_id.split('dialog-act.')[-1].split('.')[-1])
root_name = final_id.split(str(final_id_index))[0]
for i in range(init_id_index+1, final_id_index+1):
id_arr.append(root_name + str(i))
# parse an xml file by name
segment_nodes = self.obtain_node_from_textname(sum_dir + filename, id_arr, tag='dact')
words = self.obtain_dialogue_act_node(segment_nodes)
return words
def obtain_dialogue_act_node(self, segment_nodes):
""" Obtain nodes from references and word references in node.
:param segment_nodes: nodes representing segment
:return:
"""
sum_dir = self.ami_dir + '/words/'
sentence_segment = ""
for segment_node in segment_nodes:
# Obtain child node with words information
for seg in segment_node.childNodes:
if seg.attributes is not None:
if seg.nodeName == 'nite:child':
items_extsumm = seg # segment_node.childNodes[3] # Element
filename, init_id, final_id = self.obtain_ids(items_extsumm)
# Obtain array with words IDs
id_arr = [init_id]
if final_id is not None:
init_id_index = int(init_id.split('words')[-1])
final_id_index = int(final_id.split('words')[-1])
root_name = final_id.split(str(final_id_index))[0]
for i in range(init_id_index + 1, final_id_index + 1):
id_arr.append(root_name + str(i))
# parse an xml file by name
word_nodes = self.obtain_node_from_textname(sum_dir + filename, id_arr, tag='w')
sentence_segment += self.obtain_dialogue_act_word_references(word_nodes)
return sentence_segment
def obtain_dialogue_act_word_references(self, word_nodes):
""" Obtain words from word references
:param word_nodes: nodes representing words
:return: string with concatenated words
"""
words = ""
for word_node in word_nodes:
words += word_node.firstChild.data + " "
return words
def obtain_ids(self, items_ext):
href = items_ext.getAttribute('href')
filename = href.split('#')[0]
id_tmp = href.split('#')[-1].split(')..id(')
init_id = id_tmp[0].split('id(')[-1].split(')')[0]
final_id = None
if len(id_tmp) > 1:
final_id = id_tmp[1].split(')')[0]
return filename, init_id, final_id
def obtain_node_from_textname(self, path, id_arr, tag):
mydoc = minidom.parse(path)
items_dact = mydoc.getElementsByTagName(tag)
nodes = []
for item in items_dact:
nite_id = item.getAttribute('nite:id')
if nite_id in id_arr:
nodes.append(item)
return nodes
""" End: EXTRACTIVE SUMMARY """
""" Begin 4/4: FORMAT SUMMARY INTO .STORY """
def check_for_meeting_summary(self, meeting_name, summary_files):
""" Check if summary exists for a certain meeting
:param meeting_name: prefix indicating meeting name
:param summary_files: existing summary files
:return:
"""
for s in summary_files:
if meeting_name in s:
return s
return None
def transform_to_story(self, is_speaker_transcript=False):
""" Transform AMI Corpus into CNN-DailyMail News Dataset story format
:param is_speaker_transcript: boolean indicating if transcripts are from each speaker or of all together.
True means each speaker will have their own file. In this case, each speaker will have a different transcript
but the same summary.
:return:
"""
print("Make .story files")
if is_speaker_transcript:
transcript_dir = self.args.results_transcripts_speaker_dir
else:
transcript_dir = self.args.results_transcripts_dir
transcript_files = [f for f in os.listdir(transcript_dir) if f.endswith('.{}'.format(self.out_file_ext))]
summary_dir = self.args.results_summary_dir + self.args.summary_type + '/' #'abstractive-test/'
summary_files = [f for f in os.listdir(summary_dir) if f.endswith('.{}'.format(self.out_file_ext))]
story_dir = utils.get_new_data_dir_name(transcript_dir, '-stories')
utils.ensure_dir(story_dir)
story_dir += self.args.summary_type + '/'
utils.ensure_dir(story_dir)
for t in transcript_files:
# Check if meeting transcript exists
meeting_name = t.split('.')[0]
# print(meeting_name)
s = self.check_for_meeting_summary(meeting_name, summary_files)
if s is not None:
if is_speaker_transcript:
speaker_letter = t.split('.')[1]
story_filename = '{}_{}.story'.format(meeting_name, speaker_letter)
else:
story_filename = '{}.story'.format(meeting_name)
summary = open(summary_dir+s, 'r').read()
transcript = open(transcript_dir+t, 'r').read()
story_file = open(story_dir+story_filename, 'w')
# Separate summary into sentences
sentences = self.sent_detector.tokenize(summary.strip())
story_file.write('{}\n'.format(transcript))
for sent in sentences:
story_file.write('\n\n@highlight\n\n{}'.format(sent))
story_file.close()
""" End: FORMAT SUMMARY INTO .STORY """
def save_file(self, text, dir, filename):
utils.ensure_dir(dir)
file = open(dir + filename, 'w')
file.write(text)
file.close()