-
Notifications
You must be signed in to change notification settings - Fork 7
/
spend_diff.py
125 lines (108 loc) · 4.43 KB
/
spend_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
'''
Diffs spending data, so you can load into OpenSpending only the transactions
that are new or changed, compared to the last load.
The reason you cannot use (the other) 'diff' to do this is because of the
presence of carriage returns within lines. This program handles them correctly.
Rows from the Previous CSV that are not present in the New CSV are ignored,
as OpenSpending does not handle transaction removal from its database.
Each CSV containing transaction data can be gigabytes large, but this script
does not load it all into memory. It loads the IDs and 8-character checksums for every
line in Previous CSV into memory and then works through the New CSV with a line
in memory at a time.
'''
import argparse
import sys
import os.path
import csv
import _csv
import hashlib
import re
import shlex
class DiffError(Exception):
    '''Error raised by spend_diff for problems with its inputs, e.g. a CSV
    file that cannot be found or a key column missing from a header.
    '''
# NOTE(review): str1 and str2 are not referenced anywhere else in the visible
# file; presumably leftovers -- confirm nothing imports them before removing.
str1 = ''
str2 = ''
def _read_key_column_index(file_handler, key_column):
    '''Consume the header line of an open CSV and locate the key column.

    Returns a (header, index) tuple: the raw header line as read, and the
    position of key_column among the header cells.
    Raises DiffError if key_column is not one of the header cells.
    '''
    header = file_handler.readline()
    header_cells = parse_csv_line(header)
    try:
        return header, header_cells.index(key_column)
    except ValueError:
        raise DiffError('Could not find key %r in header column %r' %
                        (key_column, header_cells))


def _keyed_rows(file_handler, key_column_index):
    '''Yield (key, line) for every non-blank row left in an open CSV.

    Blank lines are skipped. Raises DiffError for a row that has too few
    cells to contain the key column.
    '''
    for line, row in csv_rows(file_handler):
        if not row:
            # A blank line parses as a row with no cells; there is nothing
            # to key it on, so ignore it rather than fail.
            continue
        try:
            key = row[key_column_index]
        except IndexError:
            raise DiffError('Row has too few columns to contain the key: %r'
                            % line)
        yield key, line


def spend_diff(previous_csv_filepath, new_csv_filepath, key_column):
    '''Yield the rows of the New CSV that are new or changed.

    previous_csv_filepath -- path of the CSV that was loaded last time
    new_csv_filepath -- path of the CSV to be loaded now
    key_column -- header title of the column holding each row's unique key

    This is a generator. Nothing is yielded if no row differs. Otherwise the
    New CSV's header line is yielded first, followed by every row whose key
    is absent from the Previous CSV or whose checksum differs from the row
    with the same key there. Yielded lines have no trailing newline.

    Rows only present in the Previous CSV are ignored.

    Raises DiffError if either file does not exist, if key_column is missing
    from either header, or if a row is too short to contain the key. Because
    this is a generator, those checks run when iteration starts, not when
    the function is called.
    '''
    # Check CSVs exist
    previous_csv_filepath = os.path.expanduser(previous_csv_filepath)
    if not os.path.exists(previous_csv_filepath):
        raise DiffError('Could not find Previous CSV file: %s' % previous_csv_filepath)
    new_csv_filepath = os.path.expanduser(new_csv_filepath)
    if not os.path.exists(new_csv_filepath):
        raise DiffError('Could not find New CSV file: %s' % new_csv_filepath)

    # Remember only a short checksum per key of the Previous CSV, so memory
    # use stays small even for very large files.
    previous_lines = {}  # id: line hash
    with open(previous_csv_filepath, 'rb') as f:
        _, key_column_index = _read_key_column_index(f, key_column)
        for key, line in _keyed_rows(f, key_column_index):
            previous_lines[key] = line_checksum(line)

    # Stream the New CSV, emitting rows that are new or changed.
    with open(new_csv_filepath, 'rb') as f:
        header, key_column_index = _read_key_column_index(f, key_column)
        header_yielded_yet = False
        for key, line in _keyed_rows(f, key_column_index):
            previous_hash = previous_lines.get(key)
            if previous_hash is not None and previous_hash == line_checksum(line):
                # line is the same
                continue
            # line is new or has changed
            if not header_yielded_yet:
                # Emit the header lazily so an unchanged file produces no
                # output at all.
                yield header.rstrip('\n\r')
                header_yielded_yet = True
            yield line
def csv_rows(file_handler):
    '''Generate each row of a CSV as a (text, cells) pair.

    file_handler -- an iterable of lines, e.g. an open file positioned after
                    the header

    text is the raw row with trailing newline / carriage return characters
    stripped; cells is the list produced by parse_csv_line for that row.

    A quoted field may contain newlines, in which case the row spans several
    physical lines. Such a row is detected by the csv module raising an
    Error whose message contains 'newline inside string'; further lines are
    then appended until the row parses, and the returned text keeps the
    embedded newlines.

    Any other csv Error is re-raised. If the input ends while a quoted field
    is still open, the accumulated partial row is not yielded.
    '''
    row = ''
    for line in file_handler:
        row += line
        try:
            row_cells = parse_csv_line(row)
        except csv.Error as e:
            if 'newline inside string' in str(e):
                # Row continues on the next physical line.
                continue
            # Some other parse failure: propagate it rather than yielding
            # this row with the cells of a previous one.
            raise
        yield row.rstrip('\n\r'), row_cells
        row = ''
def parse_csv_line(line):
    '''Parse one CSV row given as a string and return its cells as a list.'''
    reader = csv.reader([line])
    parsed_rows = list(reader)
    first_row = parsed_rows[0]
    return first_row
def line_checksum(line):
    '''Return the first 8 hexadecimal characters of the MD5 digest of line.'''
    digest = hashlib.md5(line)
    hex_digest = digest.hexdigest()
    return hex_digest[0:8]
# Command-line entry point: print the header and the new/changed rows of
# NEW.CSV to stdout; on a DiffError print the error and usage help to stderr
# and exit with status 1.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Diffs spending data to find what is new and changed.')
    parser.add_argument('previous_csv', metavar='PREVIOUS.CSV', type=str,
                        help='Previous spend data')
    parser.add_argument('new_csv', metavar='NEW.CSV', type=str,
                        help='New spend data')
    parser.add_argument('key_column', metavar='KEY_COLUMN', type=str,
                        help='Title of the column which is the unique key')
    args = parser.parse_args()
    try:
        for line in spend_diff(args.previous_csv, args.new_csv, args.key_column):
            # Single-argument form: works both as a print statement and as a
            # print function call.
            print(line)
    except DiffError as e:
        # Message, blank line, then the usage help, all on stderr.
        sys.stderr.write('ERROR: %s\n\n' % e)
        parser.print_help(sys.stderr)
        sys.exit(1)