#!/usr/bin/python
# Categorise Darlington web content
import Tkinter
import handclassifier
import datetime
import random
import os
import cPickle as pickle
from collections import defaultdict
# This can be installed with 'pip install warctools'. Beware that there are
# several old versions floating around under different names in the index.
from hanzo.warctools import WarcRecord
from hanzo.httptools import RequestMessage, ResponseMessage

#####
# UTILITY FUNCTIONS
#####

def parse_http_response(record):
    """Parses the payload of an HTTP 'response' record, returning code,
    content type and body.

    Adapted from github's internetarchive/warctools hanzo/warcfilter.py,
    commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced."""
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder or not message.complete():
        if remainder:
            print 'trailing data in http response for', record.url
        if not message.complete():
            print 'truncated http response for', record.url
    header = message.header
    mime_type = [v for k, v in header.headers if k.lower() == b'content-type']
    if mime_type:
        mime_type = mime_type[0].split(b';')[0]
    else:
        mime_type = None
    return header.code, mime_type, message.get_body()
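
# A minimal usage sketch for the function above (assuming 'record' is a WARC
# 'response' record, as produced by the loop in the MAIN section below):
#
#     code, mime, body = parse_http_response(record)
#     if code == 200 and mime == b'text/html':
#         pass  # work with the HTML in 'body'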

#####
# MAIN
#####

categories = ("1 - Information transmission",
              "2 - Electronic service delivery",
              "3 - Participation and collaboration",
              "4 - Interactive democracy",
              "D - Data, not for browsing",
              "X - Exclude",
              "? - Unable to determine")
dirname = 'dton-test-5'
picklefn = 'dton-content.pickle'
outfn = 'dton-hand-classifications.csv'
# Due to an error in lis.darlington.gov.uk/robots.txt, we have a lot of pages
# in our set which should not have been collected. Let's drop them.
discardurls = ('http://lis.darlington.gov.uk/profiles/',
               'http://lis.darlington.gov.uk/dataviews/',
               'http://lis.darlington.gov.uk/advancedprofiles/',
               'http://lis.darlington.gov.uk/advanceddataviews/',
               'http://lis.darlington.gov.uk/explorer/',
               'http://lis.darlington.gov.uk/map/',
               'http://lis.darlington.gov.uk/cache/',
               'http://lis.darlington.gov.uk/ajax/')
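# Note: str.startswith() accepts a tuple of prefixes, so a single call in the
# loop below tests a URL against all of these discard prefixes at once.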
# HTTP status codes which indicate a successfully returned record - these
# are all we are interested in sampling
successcodes = (200, 201, 202, 203, 206)
# Total number of items is ~612k
proptoclassify = 0.002
r = random.Random()
r.seed(1818118181) # Arbitrary
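# The fixed seed makes the ~0.2% random sample reproducible across runs: with
# ~612k records and proptoclassify = 0.002, we expect roughly 612000 * 0.002
# ~= 1200 items to be selected for hand classification.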
content = []
rejects = defaultdict(int)
# Load all the objects into memory first
try:
    with open(picklefn, "rb") as fh:
        print "Unpickling selected sample."
        content = pickle.load(fh)
except IOError:
    print "Pickled file does not appear to exist. Loading content."
    for fn in os.listdir(dirname):
        if not fn.endswith('.warc.gz'):
            continue
        wf = WarcRecord.open_archive(dirname + '/' + fn, mode='rb')
        try:
            print fn
            for record in wf:
                if record.type not in [WarcRecord.RESPONSE,
                                       WarcRecord.RESOURCE,
                                       WarcRecord.CONVERSION]:
                    continue
                if (record.type == WarcRecord.RESPONSE
                        and record.url.startswith('http')):
                    ccode, cmime, cbody = parse_http_response(record)
                    if ccode not in successcodes:
                        continue
                else:
                    ccode = None
                    cmime = record.content[0]
                    cbody = record.content[1]
                # This could be 'None' if there is no Content-Type field in
                # the header.
                # if not cmime.startswith(('text', 'application/xhtml', 'None')):
                #     print "Rejecting", cmime, "\n\tfor", record.type, record.url
                #     rejects[cmime] += 1
                #     continue
                # if cmime.startswith(('text/csv', 'text/css')):
                #     print "Rejecting", cmime, "\n\tfor", record.type, record.url
                #     rejects[cmime] += 1
                #     continue
                if record.url.startswith(discardurls):
                    # print "Rejecting", record.url
                    rejects['discardurls'] += 1
                    continue
                rval = r.random()
                if rval > proptoclassify:
                    # print "Not selecting (" + str(rval) + ")", record.url
                    rejects['not sampled'] += 1
                    continue
# print "Adding:", ccode, cmime, record.url
# Read article URL into memory. Don't need the article body with
# the Wayback classfier as it's fetched through the Wayback index.
# Not sending it through here as the second part of the tuple
# saves a good deal of memory.
# TODO: Could make this a FilePart or similar to vastly
# reduce the memory load if this is a problem.
# TODO: Could change interface to pass the mimetype - maybe
# make it easier to send to an appropriate program, or to name
# the file correctly when it's sent to a web browser?
content.append((record.url,None,ccode,cmime))
        except IOError as e:
            print e
        wf.close()
    with open(picklefn, 'wb') as output:
        print "Pickling content for use next time."
        pickle.dump(content, output)
print "There are", len(content), "objects to classify."
print "Rejects:", rejects
try:
    output = open(outfn, 'r')
    # First check how many classifications have already been done
    completed = 0
    for line in output:
        completed += 1
    output.close()
    print completed, "classifications already completed"
    content = content[completed:]
except IOError:
    # Only the output file being absent is expected here; a bare except
    # would also mask real bugs.
    print "Nothing classified yet"
if len(content) == 0:
    exit("Nothing to classify. Exiting.")
# TODO: Check for records which are recorded as "? - Unable to determine",
# remove them from the output file and add them to the end of the content
# list to try again

# Now we are ready to classify
output = open(outfn, 'a')
# Initialise and run the GUI
classifier = handclassifier.ManualWaybackClassifierSingle(items=content,
                                                          labels=categories,
                                                          output=output)
Tkinter.mainloop()
output.close()
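
# Usage sketch (assuming a Python 2 interpreter, which the print statements
# and the Tkinter/cPickle imports require, and that the companion
# 'handclassifier' module is importable):
#
#     python darlington_classifier.py
#
# Classifications are appended to the output CSV one line per decision (the
# resume logic above depends on this), so the session can be stopped and
# restarted freely.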