forked from openspending/dpkg-uk25k
-
Notifications
You must be signed in to change notification settings - Fork 0
/
common.py
53 lines (43 loc) · 1.75 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import sqlaload as sl
def source_path(row):
    """Return the local path for a resource under the ``sources`` directory.

    The ``sources`` directory (relative to the current working directory)
    is created when it does not exist yet.

    :param row: mapping that provides a ``'resource_id'`` key.
    :returns: ``sources/<resource_id>`` joined with the platform separator.
    """
    target_dir = 'sources'
    already_there = os.path.isdir(target_dir)
    if not already_there:
        os.makedirs(target_dir)
    file_name = row['resource_id']
    return os.path.join(target_dir, file_name)
def db_connect():
    """Open the project database via ``sqlaload``.

    :returns: whatever ``sl.connect`` returns for the ``uk25k`` PostgreSQL
        database URL.
    """
    database_url = "postgresql:///uk25k"
    connection = sl.connect(database_url)
    return connection
def normalise_header(h):
    """Reduce a raw column header to a compact, comparable form.

    The header is lower-cased and stripped of surrounding whitespace, every
    occurrence of ``'no.'`` is expanded to ``'number'``, and then all
    spaces, full stops and commas are removed.

    :param h: raw header string.
    :returns: the normalised header string.
    """
    cleaned = h.lower().strip().replace('no.', 'number')
    # Drop the separator characters one at a time, in the original order.
    for unwanted in (' ', '.', ','):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned
def normalise_header_map(headers):
    """Map each raw header to a unique normalised column name.

    Every header is passed through ``normalise_header``. When several
    headers normalise to the same value, each of them receives a ``.N``
    suffix, with ``N`` counting up from 1 in input order. Headers whose
    normalised form occurs only once are left unsuffixed.

    :param headers: iterable of raw header strings.
    :returns: dict mapping raw header -> normalised (possibly suffixed) name.
        The result is keyed by the raw header, so two identical raw headers
        share a single entry and the later occurrence wins.
    """
    # This is pretty grotesque. We need to disambiguate between
    # multiple identically-named columns after
    # normalisation. Impedance mismatch with messytables here, it was
    # supposed to take care of this.

    # Pass 1: normalise while counting the occurrences of each normalised
    # value, remembering the (normalised, raw) pairs in input order.
    occurrences = {}
    normalised_pairs = []
    for h in headers:
        n = normalise_header(h)
        occurrences[n] = occurrences.get(n, 0) + 1
        normalised_pairs.append((n, h))

    # We need an incrementing counter for each normalised header that
    # occurs more than once. ``.items()`` works on both Python 2 and 3,
    # unlike ``.iteritems()``.
    counters = {n: 0 for n, count in occurrences.items() if count > 1}

    # Pass 2: assemble the final names, suffixing only the duplicates.
    norm_headers = {}
    for n, h in normalised_pairs:
        # ``in`` replaces ``dict.has_key``, which no longer exists on
        # Python 3.
        if n in counters:
            counters[n] += 1
            norm_headers[h] = "%s.%d" % (n, counters[n])
        else:
            norm_headers[h] = n
    return norm_headers
def normalise_header_list(headers):
    """Return the normalised names for ``headers`` as a sorted list.

    :param headers: iterable of raw header strings.
    :returns: sorted list of the values produced by
        ``normalise_header_map``.
    """
    names = list(normalise_header_map(headers).values())
    names.sort()
    return names
def normalised_columns_map(table):
    """Build the normalised-name map for a table's columns.

    The column called ``'id'`` is left out before normalising.

    :param table: object whose ``c`` attribute iterates over columns that
        each expose a ``name`` attribute.
    :returns: dict of column name -> normalised name, as produced by
        ``normalise_header_map``.
    """
    column_names = [column.name for column in table.c]
    without_id = [name for name in column_names if name != 'id']
    return normalise_header_map(without_id)