forked from openspending/dpkg-uk25k
-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve.py
88 lines (77 loc) · 2.61 KB
/
retrieve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import urlparse
import urllib
import urllib2
import sqlaload as sl
import sys
from datetime import datetime
import traceback
from common import *
from functools import partial
# Format labels collected under the name "binary": a mix of file extensions
# and free-text format names.
# NOTE(review): nothing in the visible part of this file reads this list --
# presumably it is consumed by another module; confirm before tidying up
# entries such as the near-duplicates '.xls' / 'xls' or the odd 'xlx'.
binary_formats = [
    '.xls',
    'xls',
    'xlx',
    'xlsx',
    'zip',
    'pdf',
    'Zipped CSV',
    'Excel',
]
def fix_url(url):
    """Clean up a raw URL string so that it can be fetched.

    The "correct" character set for URLs is ill-defined in practice; the
    steps below are a best-effort approximation, applied in this order:

    * anything that is not already a native ``str`` (on Python 2: a
      ``unicode`` object) is encoded to UTF-8, silently dropping characters
      that cannot be encoded;
    * surrounding whitespace and double quotes are stripped;
    * the path component is percent-quoted, leaving ``/`` and ``%``
      untouched so that existing escapes are not encoded a second time;
    * any spaces left over (e.g. in the query string) become ``%20``;
    * ``http://`` is prepended unless the URL already starts with
      ``http://`` or ``https://`` (compared case-insensitively).

    Returns the cleaned-up URL string.
    """
    # On Python 2 ``str`` is the byte-string type, so this branch is taken
    # for ``unicode`` input only.
    if not isinstance(url, str):
        url = url.encode('utf-8', 'ignore')

    # Remove stray quoting *before* splitting.  Once the path has been
    # percent-quoted, a leading '"' has already become '%22', so testing for
    # it after quoting can never match.
    url = url.strip().strip('"').strip()

    scheme, netloc, path, qs, anchor = urlparse.urlsplit(url)
    path = urllib.quote(path, '/%')
    url = urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
    url = url.replace(" ", "%20")

    # Scheme-less input such as "example.com/file.csv" is assumed to be HTTP.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    return url
def retrieve(row, engine, force):
    """Download the resource described by ``row`` and log the outcome.

    Parameters:
        row: mapping for one source record; ``row['url']`` and
            ``row['resource_id']`` are read here, and the whole row is
            passed to ``source_path`` to obtain the destination file name.
        engine: database handle passed to ``sl.get_table`` / ``sl.add_row``.
        force: when false, nothing is fetched (and nothing is logged) if the
            destination file already exists.

    On success the body is written to ``source_path(row)`` and a row with
    status ``'200'`` and the response's content type is added to the
    ``retrieval_log`` table.

    On any failure the traceback is printed, a row holding the exception's
    ``code`` attribute (``0`` when it has none) and its message is added to
    ``retrieval_log``, and ``AssertionError`` is raised with the message.
    """
    import os  # function-scope import, needed for os.path.exists below

    ret_table = sl.get_table(engine, 'retrieval_log')
    try:
        if not force and os.path.exists(source_path(row)):
            return
        url = fix_url(row['url'])
        print("Fetching %s" % url)
        res = urllib2.urlopen(url)
        try:
            # Read the whole body before the destination file is opened.
            # Opening first would leave an empty file behind when the
            # download fails, and later non-forced runs would then skip the
            # resource as if it had been retrieved.
            body = res.read()
            content_type = res.headers.get('content-type', '')
        finally:
            res.close()
        with open(source_path(row), 'wb') as fh:
            fh.write(body)
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': '200',
            'message': "",
            'content-type': content_type,
            'timestamp': datetime.now()
        })
    except Exception as ioe:
        print(traceback.format_exc())
        # HTTP errors carry the status in ``code``; anything else logs 0.
        status = getattr(ioe, 'code', 0)
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': status,
            'message': unicode(ioe),
            'timestamp': datetime.now()
        })
        # Raised explicitly rather than via ``assert False`` so the failure
        # is still reported when Python runs with -O (asserts stripped).
        raise AssertionError(unicode(ioe).encode('utf-8'))
def connect():
    """Open the database and look up the ``source`` table.

    Returns a ``(engine, src_table)`` tuple, where ``engine`` is whatever
    ``db_connect()`` returns and ``src_table`` is the table named
    ``'source'`` obtained through ``sl.get_table``.
    """
    db = db_connect()
    return db, sl.get_table(db, 'source')
def describe(row):
    """Return a one-line label for ``row``.

    ``row`` must be a mapping providing the keys ``package_name``,
    ``resource_id`` and ``url``.
    """
    template = 'retrieve: %(package_name)s/%(resource_id)s (%(url)s)'
    return template % row
def test_retrieve_all():
    """Yield one non-forced retrieval check per row of the source table.

    Each yielded item is a 1-tuple holding a zero-argument callable that
    runs ``retrieve(row, engine, False)``; its ``description`` attribute is
    set to ``describe(row)``.
    NOTE(review): this looks like a nose-style test generator -- confirm
    which runner collects it.
    """
    engine, src_table = connect()
    for row in sl.all(engine, src_table):
        check = partial(retrieve, row, engine, False)
        check.description = describe(row)
        yield (check,)
if __name__ == '__main__':
    # Command-line use: each argument is a resource_id to look up in the
    # source table and, when found, to re-download with force=True.
    engine, src_table = connect()
    for resource_id in sys.argv[1:]:
        row = sl.find_one(engine, src_table, resource_id=resource_id)
        if row is not None:
            print(describe(row))
            retrieve(row, engine, True)
        else:
            print("Could not find row %s" % resource_id)