-
Notifications
You must be signed in to change notification settings - Fork 3
/
rte3_to_eopformat.pl
85 lines (70 loc) · 1.97 KB
/
rte3_to_eopformat.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# This small Perl script converts German RTE3 (translated) data
# into the EXCITEMENT (RTE5-based) XML format.
# It does not really parse XML, so it only works on the specific
# formats used for the IT data.
use strict;
use warnings;
#
# Global variables
# Opening lines of the output document: the XML declaration followed by the
# opening tag of the <entailment-corpus> root element. Each line ends in "\n".
my $XML_HEADER = join "\n",
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<entailment-corpus lang="DE">',
    '';    # empty last element supplies the trailing newline
# Closing tag of the root element, newline-terminated.
my $XML_FOOTER = "</entailment-corpus>\n";
#
# Start of the code
# Require the input file name as the first command-line argument.
# The test uses defined/length instead of plain truthiness so that a file
# literally named "0" is still accepted.
unless (defined $ARGV[0] and length $ARGV[0])
{
    # die writes to STDERR, which keeps the usage text out of STDOUT (the
    # stream that carries the converted XML and is normally redirected to a
    # file). The message ends in a newline so Perl does not append
    # "at ... line N." to it.
    die "Usage: perl rte3_to_eopformat.pl [rte3 XML file]\nConverted output will be printed on STDOUT. \n(e.g. perl rte3_to_eopformat.pl testin.xml > testout.xml)\n";
}
# Read the whole input file into @line, one element per input line.
# Three-argument open with a lexical handle is used so that characters in the
# file name (leading '<', '>', '|', surrounding whitespace) are never
# interpreted as an open mode, and so the handle is not a package global.
# The failure message includes $! to say why the open failed.
open my $in_fh, '<', $ARGV[0]
    or die "unable to open $ARGV[0]: $!";
print STDERR "Processing $ARGV[0]...\n";
my @line;
while (my $raw = <$in_fh>)
{
    $raw =~ s/\r\n$/\n/;    # change DOS \r\n to simple \n
    push @line, $raw;
}
close $in_fh;
#
# Start of the processing
# Now the file is in @line
# Emit the converted corpus on STDOUT: header, every <pair> element found in
# @line (with its opening tag rewritten), then the footer. Lines outside a
# <pair> ... </pair> span are not copied to the output.
print $XML_HEADER;
for (my $i = 0; $i < @line; $i++)
{
    # Only pairs are processed; skip ahead until an opening <pair id="..."> line.
    next unless $line[$i] =~ /<pair id="/;
    my $pairline   = $line[$i];
    my $pair_start = $i + 1;    # 1-based input line number, for diagnostics

    # Collect the lines between the opening line and the line holding </pair>.
    # Scanning starts on the line AFTER the opening tag, so the closing tag is
    # expected on a later line than the opening one.
    my @contentlines = ();
    $i++;
    while ($i < @line and $line[$i] !~ /<\/pair>/)
    {
        push @contentlines, $line[$i];
        $i++;
    }

    # Running off the end of @line means the pair was never closed. Stop with
    # a clear error; an unbounded scan would match against undef forever and
    # keep growing @contentlines.
    die "unterminated <pair> starting at line $pair_start of $ARGV[0]\n"
        if $i >= @line;
    my $pairend = $line[$i];

    # Remove the length attribute and the prediction attributes.
    $pairline =~ s/ BoW_Prediction="\S+"//;
    $pairline =~ s/ length="\S+"//;
    $pairline =~ s/ Triple_Prediction="\S+"//;

    # "Vico data" (the original author's label): opening tags that mention
    # both judgement and hLangFeatures.
    if ($pairline =~ /judgement/ and $pairline =~ /hLangFeatures/)
    {
        # Remove the unneeded attributes and add the missing task attribute
        # just before the first '>' on the line.
        $pairline =~ s/ judgement="\S+"//;
        $pairline =~ s/ hLangFeatures="\S+"//;
        $pairline =~ s/>/ task="SUM">/;
    }

    # Map the YES/NO entailment labels onto ENTAILMENT/NONENTAILMENT.
    $pairline =~ s/entailment="YES"/entailment="ENTAILMENT"/;
    $pairline =~ s/entailment="NO"/entailment="NONENTAILMENT"/;

    # Content lines and the closing line are passed through unchanged.
    print $pairline;
    print @contentlines;
    print $pairend;
}
print $XML_FOOTER;