-
Notifications
You must be signed in to change notification settings - Fork 0
/
utexer.py
executable file
·166 lines (146 loc) · 6.29 KB
/
utexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
"""uTeXer -- replace unicode signs through LaTeX equivalents."""
from optparse import OptionParser
import os
import sys
import xml.etree.ElementTree as ET
# Replacements for signs that can show up in text extracted with pdftotext.
# Keys are integer code points, values are the replacement strings, i.e. the
# same shape as a str.translate() table.
PDFTOTEXT = {0xFF:'ß', 0x1C:'fi'}
################################################################################
class WholeProgram():
    """Command-line driver replacing unicode signs with LaTeX equivalents.

    The program is small enough to live in a single class: the constructor
    parses the command line, setup_table() builds the translation table and
    translate() converts the input and writes the result.
    """

    # Stand-alone dieresis (U+00A8) followed by a vowel -> precomposed umlaut.
    # Upper-case vowels must map to the upper-case umlauts.
    _DIERESIS = {
        '\xa8o': '\xf6', '\xa8u': '\xfc', '\xa8a': '\xe4',
        '\xa8O': '\xd6', '\xa8U': '\xdc', '\xa8A': '\xc4',
    }

    def __init__(self):
        """Set up the command-line parser and parse the arguments.

        Stores the parsed options in self.options, the positional arguments
        in self.args and starts with an empty translation table.
        """
        usage = """usage: %prog [options] INPUTFILE
If no output file is specified with the -o option, the input file will be
overwritten. If no input file is specified, stdin/stdout will be used (but you
can redirect stdout with -o too)."""
        parser = OptionParser(usage=usage)
        parser.add_option("-e", "--encoding", dest="encoding",
                          help="Set encoding for stdin (default UTF-8)",
                          metavar="ENC", default='UTF-8')
        parser.add_option("-l", "--ligature",
                          action="store_true", dest="ligature", default=False,
                          help=('replace ligatures through normal letters '
                                '(at least in Latin languages where they are '
                                'only for better readibility)'))
        parser.add_option("-o", "--output", dest="output",
                          help="set output file (if unset, overwrite input "
                               "file)",
                          metavar="FILE")
        parser.add_option("-p", "--pdftotext",
                          action="store_true", dest="pdftotext", default=False,
                          help='Replace some signs generated just by '
                               'PDFtotext')
        parser.add_option("-s", "--strip-pagebreak",
                          action="store_true", dest="strip_pagebreak",
                          default=False,
                          help='Strip the pagebreak character')
        parser.add_option("-u", "--userdict", dest="userdict",
                          help=("set path to user-defined "
                                "replacements/additions for unicode mappings "
                                "(format described in README)"),
                          metavar="FILE", default=None)
        # binary input stream; set by read_input()
        self.input = None
        (self.options, self.args) = parser.parse_args()
        # translation table: code point (int) -> replacement string
        self.table = {}

    def parse_unicode_table(self):
        """Fill self.table from the XML table of unicode symbols.

        unicode.xml is searched next to this script and then in
        ../share/utexer/ relative to it. Only direct children of the root
        with tag 'character' and mode 'math' are used; code points below 128
        are never added, so ASCII is left untouched.

        Exits with status 127 if unicode.xml cannot be found.
        """
        spath = os.path.dirname(os.path.realpath(__file__))
        candidates = (
            os.path.join(spath, 'unicode.xml'),
            os.path.join(spath, '..', 'share', 'utexer', 'unicode.xml'),
        )
        unicodexml = next((path for path in candidates
                           if os.path.exists(path)), None)
        if unicodexml is None:
            print("Error: unicode.xml not found!", file=sys.stderr)
            sys.exit(127)
        # Let the XML parser read the file itself so that the encoding
        # declared in the document is honoured (and the handle is closed).
        root = ET.parse(unicodexml).getroot()
        for child in root:
            attr = child.attrib
            if child.tag != 'character' or attr.get('mode') != 'math':
                continue
            latex = child.find('latex')
            char_id = attr.get('id')
            # A None replacement would make str.translate() delete the
            # character, so entries without LaTeX text are skipped.
            if latex is None or latex.text is None or not char_id:
                continue
            try:
                # ids look like 'U02200' or 'U0003D-020E5': drop the single
                # leading letter, then split the hex code points.
                codepoints = [int(part, 16)
                              for part in char_id[1:].split('-')]
            except ValueError:
                continue
            # NOTE(review): for multi-code-point ids every code point gets
            # the LaTeX of the whole sequence -- confirm this is intended.
            for codepoint in codepoints:
                if codepoint >= 128:  # do not translate ascii!
                    self.table[codepoint] = latex.text

    def setup_table(self):
        """Set up the unicode translation table.

        Parses the XML table first, then layers the optional replacements on
        top in this order (later ones win): ligatures, pdftotext signs, the
        user dictionary and finally the page-break replacement.
        """
        self.parse_unicode_table()
        # translate (Latin) ligatures U+FB00..U+FB04?
        if self.options.ligature:
            self.table.update({
                0xFB00: 'ff',
                0xFB01: 'fi',
                0xFB02: 'fl',
                0xFB03: 'ffi',
                0xFB04: 'ffl',
            })
        if self.options.pdftotext:
            self.table.update(PDFTOTEXT)
        if self.options.userdict:
            with open(self.options.userdict) as userdict:
                lines = userdict.read().split('\n')
            for line in lines:
                # Expected shape: decimal code point, one tab, replacement.
                # Anything else (blank lines included) is ignored.
                try:
                    num, replacement = line.split('\t')
                    self.table[int(num)] = replacement
                except ValueError:
                    continue
        # replace the page break (form feed, code point 12) by a newline
        if self.options.strip_pagebreak:
            self.table[12] = '\n'

    def translate(self):
        """Translate the input using self.table and write the result.

        Reads the input, replaces every character found in self.table by its
        LaTeX equivalent, additionally repairs pdftotext dieresis sequences
        if requested, and writes the output.

        Exits with status 1 if the input cannot be decoded.
        """
        try:
            cnt = self.read_input()
        except UnicodeDecodeError:
            print("Could not decode input stream, wrong encoding?",
                  file=sys.stderr)
            sys.exit(1)
        except LookupError:
            # raised by bytes.decode() for an unknown codec name
            print("Unknown encoding: %s" % self.options.encoding,
                  file=sys.stderr)
            sys.exit(1)
        # the actual translation
        cnt = cnt.translate(self.table)
        if self.options.pdftotext:
            cnt = self.replace_dieresis(cnt)
        self.write_output(cnt)

    def read_input(self):
        """Read the input from stdin or from the file given as argument.

        Returns the content decoded with self.options.encoding. The stream
        is stored in self.input and closed before returning.
        """
        if not self.args:
            self.input = sys.stdin.detach()
        else:
            self.input = open(self.args[0], 'rb')
        with self.input:
            raw = self.input.read()
        return raw.decode(self.options.encoding)

    def write_output(self, string):
        """Write a unicode string to the output file or stdout.

        The target is the -o file if given, else the input file (which gets
        overwritten), else stdout. self.options.encoding is used as the
        output encoding.
        """
        # Encode before opening: opening with 'wb' truncates the target, and
        # that target may be the input file, so an encoding error must not
        # happen after the truncation.
        data = string.encode(self.options.encoding)
        if self.options.output:
            output = open(self.options.output, 'wb')
        elif self.args:
            output = open(self.args[0], 'wb')
        else:
            output = sys.stdout.detach()
        with output:
            output.write(data)

    def replace_dieresis(self, cnt):
        """Replace dieresis + vowel pairs by the proper umlaut.

        Such pairs occur when using pdftotext on German texts that were set
        improperly in LaTeX. The case of the vowel is preserved. Returns the
        repaired string.
        """
        for broken, umlaut in self._DIERESIS.items():
            cnt = cnt.replace(broken, umlaut)
        return cnt
def main():
    """Run the program: parse the command line, build the table, translate."""
    converter = WholeProgram()
    converter.setup_table()
    converter.translate()


if __name__ == '__main__':
    main()