# models.py
# In this file we define some models to use for prediction that were not
# available in any library. They are based on lab 10 of the course.
import numpy as np
import scipy.sparse as sp
from itertools import groupby

def split_data(ratings, p_test=0.1, verbose=False):
    """Split the ratings into a training set and a test set.

    Args:
        ratings: the data to be split
        p_test: the fraction of each user's ratings to put in the test set
    Returns:
        the train set and the test set
    """
    num_rows, num_cols = ratings.shape
    train = sp.lil_matrix((num_rows, num_cols))
    test = sp.lil_matrix((num_rows, num_cols))
    nz_items, nz_users = ratings.nonzero()
    for user in set(nz_users):
        row, col = ratings[:, user].nonzero()
        # sample without replacement so the test set really receives a
        # p_test fraction of this user's ratings
        selects = np.random.choice(row, size=int(len(row) * p_test), replace=False)
        residual = list(set(row) - set(selects))
        # add to train set
        for r in residual:
            train[r, user] = ratings[r, user]
        # add to test set
        for s in selects:
            test[s, user] = ratings[s, user]
    if verbose:
        print("Total number of nonzero elements in train data: {v}".format(v=train.nnz))
        print("Total number of nonzero elements in test data: {v}".format(v=test.nnz))
    return train, test
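
# Usage sketch for split_data (the tiny matrix below is made up purely for
# illustration):
#
#   ratings = sp.lil_matrix(np.array([[4, 0, 3, 5],
#                                     [0, 5, 4, 0],
#                                     [2, 1, 0, 4]]))
#   train, test = split_data(ratings, p_test=0.25, verbose=True)
#   # every nonzero entry of `ratings` ends up in exactly one of train / test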

def init_MF(train, num_features):
    """Initialize the parameters for matrix factorization."""
    num_items, num_users = train.shape
    user_features = np.random.rand(num_features, num_users) / num_users
    user_features[0, :] = np.ones((num_users,))
    item_features = np.random.rand(num_features, num_items) / num_items
    # initialize the first item feature with the item means, so that together
    # with the all-ones first user feature it acts as an item-bias term
    item_features[0, :] = np.asarray(train.mean(axis=1)).ravel()
    return user_features, item_features

def compute_error(data, user_features, item_features, nz):
    """Compute the RMSE of the prediction over the given nonzero elements."""
    mse = 0
    for row, col in nz:
        w_d = item_features[:, row]
        z_n = user_features[:, col]
        prediction = w_d @ z_n
        mse += (data[row, col] - prediction) ** 2
    return np.sqrt(mse / len(nz))

def group_by(data, index):
    """Group a list of tuples by the entry at the given index."""
    sorted_data = sorted(data, key=lambda x: x[index])
    return groupby(sorted_data, key=lambda x: x[index])

def build_index_groups(train):
    """Build groups of nonzero indices, keyed by row and by column."""
    nz_row, nz_col = train.nonzero()
    nz_train = list(zip(nz_row, nz_col))
    grouped_nz_train_byrow = group_by(nz_train, index=0)
    nz_row_colindices = [(g, np.array([v[1] for v in value]))
                         for g, value in grouped_nz_train_byrow]
    grouped_nz_train_bycol = group_by(nz_train, index=1)
    nz_col_rowindices = [(g, np.array([v[0] for v in value]))
                         for g, value in grouped_nz_train_bycol]
    return nz_train, nz_row_colindices, nz_col_rowindices
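
# Usage sketch for build_index_groups (continuing the illustrative `ratings`
# matrix above): each entry of nz_row_colindices pairs an item with the users
# who rated it, and nz_col_rowindices does the converse:
#
#   nz_train, nz_row_colindices, nz_col_rowindices = build_index_groups(ratings)
#   # nz_row_colindices[0] == (0, array([0, 2, 3]))  # item 0 rated by users 0, 2, 3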

def baseline_global_mean(train, test, verbose=False):
    """Compute the baseline prediction: the global mean of the train data.

    Args:
        train: the data used to fit the baseline
        test: the data used to measure the RMSE
    Returns:
        the global mean of the train data
    """
    global_mean_train = train[train.nonzero()].mean()
    # flatten the nonzero test ratings into a 1-D array
    test_nonzero = np.asarray(test[test.nonzero()].todense()).ravel()
    mse = calculate_mse(test_nonzero, global_mean_train)
    rmse = np.sqrt(mse / test_nonzero.shape[0])
    if verbose:
        print("Baseline global RMSE on test: ", rmse)
    return global_mean_train

def calculate_mse(real_label, prediction):
    """Calculate the sum of squared errors (the caller divides by the count)."""
    t = real_label - prediction
    return 1.0 * t.dot(t.T)

def baseline_user_mean(train, test, verbose=False):
    """Compute the baseline prediction: the per-user mean of the train data.

    Args:
        train: the data used to fit the baseline
        test: the data used to measure the RMSE
    Returns:
        the user means of the train data
    """
    mse = 0
    count = 0
    num_items, num_users = train.shape
    sums = train.sum(axis=0)  # sum of ratings for each user
    mean_user = np.zeros((1, num_users))
    for j in range(num_users):
        if sums[0, j] != 0:
            elems = train[:, j]
            elems_nonzero = elems[elems.nonzero()]
            mean_user[0, j] = elems_nonzero.mean()
            for i in range(test.shape[0]):
                if test[i, j] != 0:
                    mse += (test[i, j] - mean_user[0, j]) ** 2
                    count += 1
    rmse = np.sqrt(mse / count)
    if verbose:
        print("Baseline User RMSE on test: ", rmse)
    return mean_user

def baseline_movie_mean(train, test, verbose=False):
    """Compute the baseline prediction: the per-movie mean of the train data.

    Args:
        train: the data used to fit the baseline
        test: the data used to measure the RMSE
    Returns:
        the movie means of the train data
    """
    mse = 0
    count = 0
    num_items, num_users = train.shape
    sums = train.sum(axis=1)  # sum of ratings for each movie
    mean_item = np.zeros((num_items, 1))
    for i in range(num_items):
        if sums[i, 0] != 0:
            elems = train[i, :]
            elems_nonzero = elems[elems.nonzero()]
            mean_item[i, 0] = elems_nonzero.mean()
            for j in range(test.shape[1]):
                if test[i, j] != 0:
                    mse += (test[i, j] - mean_item[i, 0]) ** 2
                    count += 1
    rmse = np.sqrt(mse / count)
    if verbose:
        print("Baseline Movie RMSE on test: ", rmse)
    return mean_item
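
# Usage sketch: the three baselines share one interface, so they can be
# compared directly on a single split (the printed values depend on the data):
#
#   baseline_global_mean(train, test, verbose=True)
#   baseline_user_mean(train, test, verbose=True)
#   baseline_movie_mean(train, test, verbose=True)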

def matrix_factorization_SGD(train, test, gamma, num_features, lambda_user,
                             lambda_item, num_epochs, reg=True, verbose=False):
    """Matrix factorization using stochastic gradient descent (SGD).

    Args:
        train: the data used for learning
        test: the data used to measure the final RMSE
        gamma: the initial learning rate, decayed at every epoch
        num_features: the number of latent features
        lambda_user, lambda_item: the regularization weights (used if reg=True)
        num_epochs: the number of passes over the training ratings
    Returns:
        the learned user and item feature matrices
    """
    errors = [0]
    # init matrix
    user_features, item_features = init_MF(train, num_features)  # Z0.T, W0
    # find the non-zero ratings indices
    nz_row, nz_col = train.nonzero()
    nz_train = list(zip(nz_row, nz_col))
    nz_row, nz_col = test.nonzero()
    nz_test = list(zip(nz_row, nz_col))
    print("learn the matrix factorization using SGD...")
    for it in range(num_epochs):
        # shuffle the training rating indices
        np.random.shuffle(nz_train)
        # decrease step size
        gamma /= 1.2
        for d, n in nz_train:
            item_data = item_features[:, d]
            user_data = user_features[:, n]
            prediction_error = train[d, n] - item_data @ user_data
            # gradient of the squared error w.r.t. the item feature w
            grad_w = -prediction_error * user_data
            # gradient of the squared error w.r.t. the user feature z
            grad_z = -prediction_error * item_data
            # update
            if reg:
                item_features[:, d] -= gamma * (grad_w + lambda_item * item_data)
                user_features[:, n] -= gamma * (grad_z + lambda_user * user_data)
            else:
                item_features[:, d] -= gamma * grad_w
                user_features[:, n] -= gamma * grad_z
        rmse = compute_error(train, user_features, item_features, nz_train)
        if verbose:
            print("iter: {}, RMSE on training set: {}.".format(it, rmse))
        errors.append(rmse)
    rmse = compute_error(test, user_features, item_features, nz_test)
    if verbose:
        print("RMSE on test data: {}.".format(rmse))
    return user_features, item_features
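
# Usage sketch for matrix_factorization_SGD (the hyperparameter values here
# are illustrative, not tuned):
#
#   user_features, item_features = matrix_factorization_SGD(
#       train, test, gamma=0.01, num_features=20,
#       lambda_user=0.1, lambda_item=0.7, num_epochs=20, verbose=True)
#   # predicted rating of item d by user n:
#   #   item_features[:, d] @ user_features[:, n]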

def update_user_feature(
        train, item_features, lambda_user,
        nnz_items_per_user, nz_user_itemindices):
    """Update the user feature matrix.

    The regularizer for each user is taken to be
    nnz_items_per_user[user] * lambda_user.
    """
    num_user = nnz_items_per_user.shape[0]
    num_feature = item_features.shape[0]
    lambda_I = lambda_user * np.eye(num_feature)
    updated_user_features = np.zeros((num_feature, num_user))
    for user, items in nz_user_itemindices:
        # extract the columns of item_features for the items this user rated
        M = item_features[:, items]
        # solve the regularized least-squares system for this user's column
        V = M @ train[items, user]
        A = M @ M.T + nnz_items_per_user[user] * lambda_I
        X = np.linalg.solve(A, V)
        updated_user_features[:, user] = np.copy(X.T)
    return updated_user_features

def update_item_feature(
        train, user_features, lambda_item,
        nnz_users_per_item, nz_item_userindices):
    """Update the item feature matrix.

    The regularizer for each item is taken to be
    nnz_users_per_item[item] * lambda_item.
    """
    num_item = nnz_users_per_item.shape[0]
    num_feature = user_features.shape[0]
    lambda_I = lambda_item * np.eye(num_feature)
    updated_item_features = np.zeros((num_feature, num_item))
    for item, users in nz_item_userindices:
        # extract the columns of user_features for the users who rated this item
        M = user_features[:, users]
        # solve the regularized least-squares system for this item's column
        V = M @ train[item, users].T
        A = M @ M.T + nnz_users_per_item[item] * lambda_I
        X = np.linalg.solve(A, V)
        updated_item_features[:, item] = np.copy(X.T)
    return updated_item_features

def ALS(train, test, num_features, lambda_user, lambda_item, stop_criterion,
        verbose=False):
    """Matrix factorization using Alternating Least Squares (ALS).

    Alternately re-solves the regularized least-squares problems for the user
    and the item features until the training RMSE changes by less than
    stop_criterion, then reports the RMSE on the test data.

    Returns:
        the learned user and item feature matrices
    """
    change = 1
    error_list = [0, 0]
    # init ALS
    user_features, item_features = init_MF(train, num_features)
    # get the number of non-zero ratings for each user and item
    nnz_items_per_user, nnz_users_per_item = train.getnnz(axis=0), train.getnnz(axis=1)
    # group the indices by row or column index
    nz_train, nz_item_userindices, nz_user_itemindices = build_index_groups(train)
    # run ALS
    print("\nstart the ALS algorithm...")
    while change > stop_criterion:
        # update user feature & item feature
        user_features = update_user_feature(
            train, item_features, lambda_user,
            nnz_items_per_user, nz_user_itemindices)
        item_features = update_item_feature(
            train, user_features, lambda_item,
            nnz_users_per_item, nz_item_userindices)
        error = compute_error(train, user_features, item_features, nz_train)
        if verbose:
            print("RMSE on training set: {}.".format(error))
        error_list.append(error)
        change = np.fabs(error_list[-1] - error_list[-2])
    # evaluate the test error
    nnz_row, nnz_col = test.nonzero()
    nnz_test = list(zip(nnz_row, nnz_col))
    rmse = compute_error(test, user_features, item_features, nnz_test)
    if verbose:
        print("test RMSE after running ALS: {v}.".format(v=rmse))
    return user_features, item_features
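
# A minimal end-to-end sketch. Everything below is illustrative: the synthetic
# ratings matrix and all hyperparameter values are made up for demonstration
# and are not tuned.
if __name__ == "__main__":
    np.random.seed(42)
    # small synthetic items-x-users matrix with integer ratings in 0..5,
    # where 0 means "not rated"
    ratings = sp.lil_matrix(np.random.randint(0, 6, size=(50, 40)))
    train, test = split_data(ratings, p_test=0.1, verbose=True)
    baseline_global_mean(train, test, verbose=True)
    baseline_user_mean(train, test, verbose=True)
    baseline_movie_mean(train, test, verbose=True)
    matrix_factorization_SGD(train, test, gamma=0.01, num_features=20,
                             lambda_user=0.1, lambda_item=0.7,
                             num_epochs=10, verbose=True)
    ALS(train, test, num_features=20, lambda_user=0.1, lambda_item=0.7,
        stop_criterion=1e-4, verbose=True)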