import os
import datetime
import sys
import numpy as np
"""
Recommender system based on latent factors, to be uncovered by
singular value decomposition (SVD), with training by manual gradient
descent (GD).
James McDermott <[email protected]>
Based mostly on Funk, Netflix Update: Try This at Home,
http://sifter.org/~simon/journal/20061211.html and on Paterek,
Improving regularized singular value decomposition for collaborative
filtering.
Several other recommenders out there keep things simpler by using a
library for SVD, but my goal is to teach (and learn about) the
gradient descent.
TODO
Clip predictions to the range [1, 5].
We have K, an offset for the whole datset, and C, an offset per user
(some users are more generous than others) and D, an offset per item
(some movies are better than others). Funk initialises them and then
GDs them. I have implemented this but at the moment it's causing ML to
diverge after a few iterations. On the fake data it actually helps, a
little. The alternative is to initialise them and then leave them
alone.
Also, Funk recommends to calculate their (initial) values as
posteriors after observing the data, rather than as straight-up
averages.
Implement a train-test split? Check
https://github.com/ahaldar/Movie-Recommender/blob/master/movie-reco.py
for this and for cosine similarity.
Momentum.
My implementation GDs all parameters at once, rather than working on
"one factor at a time" as suggested. I've tried the alternative
several times but not systematically, finding no improvement. Should
try this test properly.
Initialisation of U and V seems to matter. Funk uses a constant 0.1
for all. I tried a light Gaussian noise and it's far better ON THE
FAKE DATASET -- almost always. Need to test whether it helps on the ML
dataset. The disadvantage is it makes things stochastic.
"""
# reading/processing input data
###############################
def make_tiny_test_data():
    # I usually start with a hand-written dataset of this small size
    # to make things concrete for myself and make sure my code will be
    # dimensionally right. Note user 1 has not rated item 3, so one
    # entry of the 2x4 ratings matrix is missing.
    A = np.array([
        # user, item, rating
        [0, 0, 3],
        [1, 0, 4],
        [0, 1, 2],
        [1, 1, 3],
        [0, 2, 5],
        [1, 2, 4],
        [0, 3, 3]
    ])
    n = 2  # users
    m = 4  # items
    return A, n, m
def read_fake_ratings():
    # fake ratings generated with an assumption of 2 hidden factors,
    # see make_fake_ratings.py
    filename = "fake_ratings.csv"
    A = np.genfromtxt(filename, delimiter=",", skip_header=1)
    return make_consecutive(A)
def read_movielens():
    # A well-known test dataset of ratings from real users. It comes
    # from:
    # http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html
    # Thanks due to the GroupLens research group, U of Minnesota.
    filename = "movielens/ratings.csv"
    A = np.genfromtxt(filename, delimiter=",", skip_header=1)
    A = A[:, :3]  # keep user, movie, rating; drop the timestamp column
    return make_consecutive(A)
def make_consecutive(A):
    # We need consecutive user-ids and movie-ids because we're going
    # to refer to indices in matrices. The raw data mightn't have
    # that, eg we might have user 17 and user 19 but no user 18. This
    # function just renames users and movies to make them all consecutive.
    users_to_IDs = {}  # map of renames
    movies_to_IDs = {}  # map of renames
    B = np.zeros_like(A)  # our output, same shape as input
    n_users_visited = 0  # this means "visited during our iteration"
    n_movies_visited = 0
    for idx, (user, movie, rating) in enumerate(A):
        user = int(user)  # A might be floating-point
        movie = int(movie)  # so we convert
        if user not in users_to_IDs:
            users_to_IDs[user] = n_users_visited
            n_users_visited += 1
        if movie not in movies_to_IDs:
            movies_to_IDs[movie] = n_movies_visited
            n_movies_visited += 1
        B[idx] = (users_to_IDs[user], movies_to_IDs[movie], rating)
    n = len(set(A[:, 0]))
    m = len(set(A[:, 1]))
    # Let's check we got things right
    assert n == n_users_visited
    assert m == n_movies_visited
    assert set(B[:, 0]) == set(range(n))
    assert set(B[:, 1]) == set(range(m))
    return B, n, m
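# For example (hypothetical raw rows, just to illustrate the renaming):
# [[17, 50, 3.0], [19, 50, 4.0]] would become [[0, 0, 3.0], [1, 0, 4.0]]:
# users 17 and 19 are renamed 0 and 1, and movie 50 is renamed 0.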
# methods for display
############################################
def display(A, Ahat):
    # Correlation between the known ratings and the corresponding
    # predictions, plus a few example (actual, predicted) pairs.
    # Uses the module-level `ratings` defined below.
    Ahat_at_known = [Ahat[int(i), int(j)] for i, j, rating in A]
    c = np.corrcoef(ratings, Ahat_at_known)[0][1]
    examples = [(ratings[i], Ahat_at_known[i]) for i in range(5)]
    return c, examples
def calc_rmse(Ahat):
    # RMSE against known ratings only
    SSE = sum((rating - Ahat[int(i), int(j)])**2 for i, j, rating in A)
    MSE = SSE / len(A)
    return np.sqrt(MSE)
def cost(U, V, Ahat):
    error = calc_rmse(Ahat)
    regularisation = np.linalg.norm(U) + np.linalg.norm(V)
    return error + Lambda * regularisation
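# Note: cost() adds the regularisation penalty to the data error, but it
# isn't called in the main loop below, which monitors calc_rmse() only.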
# core of the model
###################
def predict(U, V, C=None, D=None, M=None):
    # A rating is modelled as:
    #
    # R = M + C[i] + D[j] + sum_k U[i,k] V[k,j]
    #
    # M is a scalar: the mean of all ratings.
    #
    # C is a value per user: how generous a user is (positive or
    # negative, ie above or below M).
    #
    # D is a value per movie: how good a movie is (positive or
    # negative, ie above or below M).
    #
    # UV then models the user-factor-movie interaction; it can be
    # positive or negative for any particular user-movie pair, but
    # will be centred at 0.

    # @ means matrix multiplication, ie np.dot(U, V). It's new in
    # Python 3.5+.
    Ahat = U @ V
    if C is not None:
        Ahat += C  # C has shape (n, 1): broadcasting adds C[i] to all of user i's ratings
    if D is not None:
        Ahat += D  # D has shape (1, m): broadcasting adds D[j] to all of movie j's ratings
    if M is not None:
        Ahat += M  # M is a scalar
    return Ahat
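# A tiny worked instance of the model above (made-up numbers, not from any
# dataset): with one user, one movie and one factor,
#   U = [[0.2]], V = [[0.5]], C = [[0.3]], D = [[-0.1]], M = 3.5
# predict(U, V, C, D, M) gives 3.5 + 0.3 + (-0.1) + 0.2*0.5 = 3.8.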
# settings and setup
####################
timestamp = str(datetime.datetime.now()).replace(" ", "_").replace(":", "_")
dirname = "saves"
if not os.path.isdir(dirname):
    os.mkdir(dirname)
np.random.seed(0) # for replication
print_every = 5 # how often should we print?
maxits = 10000 # number of iterations allowed
tol = 0.01 # tolerance: stop when cost less than this
alpha = 0.001   # learning rate. Funk recommends 0.001
Lambda = 0.02   # Regularisation strength for U and V. Funk
                # recommends 0.02. (We use uppercase L because "lambda"
                # is a Python reserved word.)
Lambda2 = 0.05  # Regularisation strength for C and D. Paterek recommends 0.05.
GD_on_CD = True # Should we do gradient descent on C and D?
# Several datasets for testing. We customize some of the parameters
# per-dataset.
dataset_name = "fake"
if dataset_name == "movielens":
    # Achieves about RMSE 0.60, R 0.82. Very slow start (first 150 its)
    A, n, m = read_movielens()
    h = 10  # h is the number of latent factors
    maxits = 500
    alpha = 0.001
    GD_on_CD = False  # I haven't been able to make it work on this dataset
    print_every = 1
elif dataset_name == "fake":
    # Achieves about RMSE 0.22, R 0.97 with these settings.
    A, n, m = read_fake_ratings()
    h = 2
    maxits = 1000
    alpha = 0.01
    print_every = 10
elif dataset_name == "tiny":
    A, n, m = make_tiny_test_data()
    h = 2
    maxits = 1000
    alpha = 0.001
    print_every = 10
print(n, m, h)
ratings = A[:, 2]
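# A sketch of the train-test split mentioned in the TODO at the top (a
# hypothetical helper, not called anywhere below; it assumes we split the
# rows of A at random):
def train_test_split(A, train_frac=0.8):
    # shuffle the row indices, then slice off the first train_frac of them
    idx = np.random.permutation(len(A))
    cut = int(train_frac * len(A))
    return A[idx[:cut]], A[idx[cut:]]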
def initialise(A):
    # Initialise all the variables: U, V, C, D, M.
    # Initialise U and V with light Gaussian noise (Funk's constant-0.1
    # alternative is left commented out below).
    # U = 0.1 * np.ones((n, h))
    # V = 0.1 * np.ones((h, m))
    U = np.random.normal(loc=0, scale=0.01, size=(n, h))
    V = np.random.normal(loc=0, scale=0.01, size=(h, m))

    # Initialise C, D and M based on the data.
    C = np.zeros((n, 1), dtype=float)
    C_count = np.zeros((n, 1))
    D = np.zeros((1, m), dtype=float)
    D_count = np.zeros((1, m))
    M = 0.0
    M_count = 0
    for i, j, rating in A:
        i = int(i)
        j = int(j)
        C[i, :] += rating
        C_count[i, :] += 1
        D[:, j] += rating
        D_count[:, j] += 1
        M += rating
        M_count += 1
    M = M / float(M_count)  # mean of the data
    C = C / C_count - M     # per-user mean, less M
    D = D / D_count - M     # per-movie mean, less M
    return U, V, C, D, M
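# For concreteness (arithmetic on the tiny dataset above, as an assumed
# example): M = (3+4+2+3+5+4+3)/7 ≈ 3.43, and user 0's offset would be
# C[0] = mean(3, 2, 5, 3) - M = 3.25 - 3.43 ≈ -0.18.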
U, V, C, D, M = initialise(A)
# main loop carrying out gradient descent
#########################################
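# The updates inside the loop follow from the regularised squared error on
# a single known rating r_ij. Writing err = r_ij - Ahat[i, j], the per-factor
# objective is
#
#   E = err**2 + Lambda * (U[i, k]**2 + V[k, j]**2)
#
# so dE/dU[i, k] = -2 * err * V[k, j] + 2 * Lambda * U[i, k] (and symmetrically
# for V[k, j]). Stepping downhill, and absorbing the constant factor of 2 into
# the learning rate alpha, gives the update lines below:
#
#   U[i, k] += alpha * (err * V[k, j] - Lambda * U[i, k])
#   V[k, j] += alpha * (err * U[i, k] - Lambda * V[k, j])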
for it in range(maxits):
    Ahat = predict(U, V, C, D, M)  # make a prediction

    # We copy, and make updates on the copy (it's slightly inaccurate otherwise)
    U_ = U.copy()
    V_ = V.copy()
    if GD_on_CD:
        C_ = C.copy()
        D_ = D.copy()

    if print_every is not None and it % print_every == 0:
        c = calc_rmse(Ahat)
        print(it, c, display(A, Ahat))
        if c < tol:
            break

    for i, j, rating in A:
        i = int(i)
        j = int(j)
        err = rating - Ahat[i, j]  # How bad was our prediction? Can be negative!
        for k in range(h):
            # The most important lines: GD on U and V. Paterek Section 3.2
            U_[i, k] += alpha * (err * V[k, j] - Lambda * U[i, k])
            V_[k, j] += alpha * (err * U[i, k] - Lambda * V[k, j])
        if GD_on_CD:
            # Paterek Section 3.3. These are done once per rating, not k times.
            C_[i, 0] += alpha * (err - Lambda2 * (C[i, 0] + D[0, j] - M))
            D_[0, j] += alpha * (err - Lambda2 * (C[i, 0] + D[0, j] - M))

    # Overwrite variables with updated versions.
    U = U_
    V = V_
    if GD_on_CD:
        C = C_
        D = D_
# Save everything.
basefilename = os.path.join(dirname, timestamp)
np.savetxt(basefilename + "_U.dat", U)
np.savetxt(basefilename + "_V.dat", V)
np.savetxt(basefilename + "_C.dat", C)
np.savetxt(basefilename + "_D.dat", D)
np.savetxt(basefilename + "_M.dat", np.array([M]))
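# The saved factors can be re-loaded later with np.loadtxt, something like
# the following sketch (note that np.loadtxt returns C and D as 1-D arrays
# and M as a 0-D array, so they need reshaping/conversion before use):
# U = np.loadtxt(basefilename + "_U.dat")
# V = np.loadtxt(basefilename + "_V.dat")
# C = np.loadtxt(basefilename + "_C.dat").reshape(n, 1)
# D = np.loadtxt(basefilename + "_D.dat").reshape(1, m)
# M = float(np.loadtxt(basefilename + "_M.dat"))
# Ahat = predict(U, V, C, D, M)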