-
Notifications
You must be signed in to change notification settings - Fork 0
/
FootnoteParser.py
88 lines (72 loc) · 2.66 KB
/
FootnoteParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import sys, traceback
import os
import re
import logging
from html.parser import HTMLParser
from ParsingRules import *
# Tag names whose text FootnoteHTMLParser discards.  The parser compares
# against literal tag names rather than reading this list.
IGNORE_TAGS = ["title", "style"]
# NOTE(review): presumably the tags allowed through to footnote output;
# nothing visible in this file reads the list -- confirm against importers.
ALLOWED_TAGS = ["i", "img", "br"]
class FootnoteHTMLParser(HTMLParser):
    """Flatten an HTML file into footnote text.

    Tags are dropped and their text content is concatenated into
    ``self.content`` so that it can be aggregated as a footnote easily.
    ``<br>`` and ``<img>`` are preserved as markup, comments and
    character/entity references are re-emitted verbatim, and the text inside
    ``<title>`` / ``<style>`` is discarded.

    Attributes:
        content: Output accumulated so far.
        html_file: Path of the HTML file read by ``process_html``.
        ignore_data: 1 while inside ``<title>``/``<style>``, otherwise 0.
    """

    def __init__(self, html_file):
        """Create a parser for the file at *html_file* (not opened yet)."""
        # convert_charrefs defaults to True on Python 3.5+, which decodes
        # references inside the parser and never calls handle_charref /
        # handle_entityref.  Turn it off so references survive verbatim.
        HTMLParser.__init__(self, convert_charrefs=False)
        self.content = ''
        self.html_file = html_file
        self.ignore_data = 0

    def handle_starttag(self, tag, attrs):
        """Start ignoring text for title/style; re-emit ``br`` and ``img``.

        Every other tag is dropped.  For ``img`` only the attributes listed
        in ``DTP_SUPPORTED_ATTRIBS['img']`` are kept.
        """
        if tag == 'title' or tag == 'style':
            self.ignore_data = 1
        elif tag == 'br':
            self.content += '<br/>'
        elif tag == 'img':
            # The table is looked up per attribute, so an <img> without
            # attributes never touches DTP_SUPPORTED_ATTRIBS.
            kept = [(name, value) for name, value in attrs
                    if name in DTP_SUPPORTED_ATTRIBS[tag]]
            # A valueless attribute arrives as None; emit it as an empty
            # value instead of the literal text "None".
            strattrs = ''.join(
                ' %s="%s"' % (name, '' if value is None else value)
                for name, value in kept)
            self.content += '<%s%s>' % (tag, strattrs)

    def handle_endtag(self, tag):
        """Resume collecting text once ``title`` or ``style`` is closed."""
        if tag == 'title' or tag == 'style':
            self.ignore_data = 0

    def handle_data(self, data):
        """Append text to the output unless it is currently being ignored."""
        if self.ignore_data == 1:
            return
        self.content += data

    def handle_comment(self, data):
        """Re-emit an HTML comment unchanged."""
        self.content += '<!--' + data + '-->'

    def handle_charref(self, name):
        """Re-emit a numeric character reference such as ``&#169;``."""
        # References inside an ignored element belong to its text, so they
        # are dropped together with it.
        if self.ignore_data == 1:
            return
        self.content += '&#%s;' % name

    def handle_entityref(self, name):
        """Re-emit a named entity reference such as ``&amp;``."""
        if self.ignore_data == 1:
            return
        self.content += '&%s;' % name

    def pre_process_html(self, str):
        """Return *str* with every ``'start -->'`` replaced by ``'>'``.

        NOTE(review): presumably repairs a malformed marker in the input
        files before parsing -- confirm against real inputs.
        """
        return str.replace('start -->', '>')

    def process_html(self):
        """Parse ``self.html_file`` and return ``(content, anchor)``.

        ``content`` is the accumulated output with surrounding whitespace
        stripped; ``anchor`` is the file's base name up to its first ``'.'``.
        The file is opened with the platform default encoding.

        Raises:
            SystemExit: via ``sys.exit(-5)`` when the file cannot be opened,
                or ``sys.exit(-4)`` when parsing raises.
        """
        try:
            source = open(self.html_file)
        except OSError:
            traceback.print_exc(file=sys.stdout)
            logging.warning(" Warning : Unresolved link to " + self.html_file)
            sys.exit(-5)
        with source:
            html_contents = source.read()

        html_contents = self.pre_process_html(html_contents)
        try:
            self.feed(html_contents)
            # Flush whatever the parser is still buffering at end of input.
            self.close()
        except Exception as err:
            msg = 'Error while parsing file %s : %s ' % (self.html_file, err)
            traceback.print_exc(file=sys.stdout)
            sys.stderr.write(msg + '\n')
            logging.critical(msg)
            sys.exit(-4)

        anchor = os.path.basename(self.html_file).split('.')[0]
        self.content = self.content.strip()
        return self.content, anchor