-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGoodBookData.py
165 lines (132 loc) · 6.85 KB
/
GoodBookData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import numpy as np
import pandas as pd
def load_user_item_matrix_GB_All(
        max_user=943,
        max_item=761,
        path="/Users/mslokom/Documents/RecSys_News/goodbook/ratings_filtered_goodbook.csv"):
    """
    Load the goodbook ratings file into a dense user x item matrix.

    Every non-blank line of the file must hold the four comma-separated columns
    ``user_id,item_id,rating,genre``. Ids are 1-based: the rating of user ``u`` for
    item ``i`` is stored at ``[u - 1, i - 1]``. The genre column is read but not used.
    Cells without a rating stay 0; if a (user, item) pair occurs more than once the
    last occurrence wins.

    Author's notes on the default file: 943 unique user ids, 761 unique book ids,
    sparsity 0.9816.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param path: ratings file to read (defaults to the originally hard-coded location)
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises ValueError: if a non-blank line does not have exactly four columns or a
        value cannot be parsed as a number
    """
    matrix = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            user_id, item_id, rating, _genre = line.split(",")
            user_id, item_id, rating = int(user_id), int(item_id), float(rating)
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix
def load_user_item_matrix_GB_TrainingSet(
        max_user=943,
        max_item=729,
        path="/Users/mslokom/Documents/RecSys_News/goodbook/trainVal_small.csv"):
    """
    Load the goodbook training ratings file into a dense user x item matrix.

    Every non-blank line of the file must hold the three comma-separated columns
    ``user_id,item_id,rating``. Ids are 1-based: the rating of user ``u`` for item
    ``i`` is stored at ``[u - 1, i - 1]``. Cells without a rating stay 0; if a
    (user, item) pair occurs more than once the last occurrence wins.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param path: ratings file to read (defaults to the originally hard-coded location)
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises ValueError: if a non-blank line does not have exactly three columns or a
        value cannot be parsed as a number
    """
    matrix = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            user_id, item_id, rating = line.split(",")
            user_id, item_id, rating = int(user_id), int(item_id), float(rating)
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix
def load_user_item_matrix_GB_TestSet(
        max_user=943,
        max_item=729,
        path="/Users/mslokom/Documents/RecSys_News/goodbook/test_small.csv"):
    """
    Load the goodbook test ratings file into a dense user x item matrix.

    Every non-blank line of the file must hold the three comma-separated columns
    ``user_id,item_id,rating``. Ids are 1-based: the rating of user ``u`` for item
    ``i`` is stored at ``[u - 1, i - 1]``. Cells without a rating stay 0; if a
    (user, item) pair occurs more than once the last occurrence wins.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param path: ratings file to read (defaults to the originally hard-coded location)
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises ValueError: if a non-blank line does not have exactly three columns or a
        value cannot be parsed as a number
    """
    matrix = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            user_id, item_id, rating = line.split(",")
            user_id, item_id, rating = int(user_id), int(item_id), float(rating)
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix
def load_user_item_matrix_GB_Test(
        max_user=943,
        max_item=761,
        path="/Users/mslokom/Documents/RecSys_News/goodbook/test_small.csv"):
    """
    Load the goodbook test ratings file into a dense user x item matrix.

    Every non-blank line of the file must start with the comma-separated columns
    ``user_id,item_id,rating``; any further columns (such as a genre) are ignored.
    The previous parser insisted on exactly four columns even though
    ``load_user_item_matrix_GB_TestSet`` reads the same default file as three
    columns, so one of the two could never succeed on it.

    Ids are 1-based: the rating of user ``u`` for item ``i`` is stored at
    ``[u - 1, i - 1]``. Cells without a rating stay 0; if a (user, item) pair occurs
    more than once the last occurrence wins.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param path: ratings file to read (defaults to the originally hard-coded location)
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises ValueError: if a non-blank line has fewer than three columns or a value
        cannot be parsed as a number
    """
    matrix = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            fields = line.split(",")
            if len(fields) < 3:
                # Keep the exception type the tuple-unpacking parser used to raise.
                raise ValueError(
                    "expected at least 3 comma-separated columns, got %d: %r"
                    % (len(fields), line))
            user_id, item_id, rating = int(fields[0]), int(fields[1]), float(fields[2])
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix
def load_user_item_GB_Complet(
        max_user=2370,
        max_item=2835,
        path="Flixster/With_Fancy_KNN/TrainingSet_2370_allUsers_KNN_fancy_imputation_GB_k_30.dat"):
    """
    Load a ``::``-separated ratings file into a dense user x item matrix.

    Every non-blank line of the file must hold the four columns
    ``user_id::item_id::rating::timestamp``. Ids are 1-based: the rating of user
    ``u`` for item ``i`` is stored at ``[u - 1, i - 1]``. The timestamp column is
    required but not used. Cells without a rating stay 0; if a (user, item) pair
    occurs more than once the last occurrence wins.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param path: ratings file to read (defaults to the originally hard-coded,
        working-directory-relative location)
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises ValueError: if a non-blank line does not have exactly four columns or an
        id / rating cannot be parsed as a number
    """
    matrix = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            user_id, item_id, rating, _timestamp = line.split("::")
            user_id, item_id, rating = int(user_id), int(item_id), float(rating)
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix
def load_user_item_matrix_GB_limited_ratings(limit=20):
    """
    Build a copy of the full user x item matrix that keeps at most ``limit``
    randomly chosen ratings per user.

    The full matrix comes from ``load_user_item_matrix_GB_All()`` with its default
    arguments. For every user the positions of the positive ratings are shuffled
    with numpy's global random state and the first ``limit`` of them are copied
    over; every other cell of the result is 0.

    :param limit: maximum number of ratings kept per user
    :return: numpy array with the same shape as the full user x item matrix
    """
    full_matrix = load_user_item_matrix_GB_All()
    limited = np.zeros(shape=full_matrix.shape)
    for row, ratings in enumerate(full_matrix):
        # column positions this user has rated
        rated = np.flatnonzero(ratings > 0)
        # random order, so the kept subset is a random sample
        np.random.shuffle(rated)
        kept = rated[:limit]
        limited[row, kept] = ratings[kept]
    return limited
def load_user_item_matrix_GB_trainMasked(max_user=2370, max_item=2835, file_index=-1, masked_files=None):
    """
    Load one of the masked training files into a dense user x item matrix.

    Every non-blank line of the chosen file must hold four ``::``-separated columns,
    of which the first three are ``user_id::item_id::rating``; the fourth is
    ignored. Ids are 1-based: the rating of user ``u`` for item ``i`` is stored at
    ``[u - 1, i - 1]``. Cells without a rating stay 0.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param file_index: position of the file to load within the file list
        (negative values count from the end, so -1 is the last entry)
    :param masked_files: optional list of file paths to choose from; when omitted the
        built-in list is used, which is currently empty
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises IndexError: if the file list is empty or ``file_index`` is out of range
    :raises ValueError: if a non-blank line does not have exactly four columns or an
        id / rating cannot be parsed as a number
    """
    if masked_files is None:
        masked_files = [
            # add the paths of the masked training files here, starting at index 0
        ]
    if not masked_files:
        # Same exception type as indexing the empty list, but with an actionable message.
        raise IndexError(
            "no masked training files configured; pass masked_files=[...] or add "
            "paths to the built-in list")
    matrix = np.zeros(shape=(max_user, max_item))
    with open(masked_files[file_index], 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            user_id, item_id, rating, _ = line.split("::")
            user_id, item_id, rating = int(user_id), int(item_id), float(rating)
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix
def load_gender_vector_GB(max_user=2370, path="Flixster/subset_GB_User_O.csv"):
    """
    Load the gender labels found in the first ``max_user`` lines of the user file.

    Every non-blank line must hold three comma-separated columns, the second being
    the gender. Users are selected by line position, not by the value of their id
    column. A gender of ``"M"`` (surrounding whitespace ignored) is encoded as 0;
    any other value is encoded as 1.

    :param max_user: number of leading lines of the file to consider
    :param path: user file to read (defaults to the originally hard-coded,
        working-directory-relative location)
    :return: numpy array with one 0/1 entry per non-blank line considered
    :raises ValueError: if a non-blank line does not have exactly three columns
    """
    gender_vec = []
    with open(path, 'r') as f:
        # Slicing keeps the established "first max_user lines" semantics.
        for line in f.readlines()[:max_user]:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            _user_id, gender, _ = line.split(",")
            # Strip so that a padded "M" is not mistaken for the other class.
            gender_vec.append(0 if gender.strip() == "M" else 1)
    return np.asarray(gender_vec)
def load_user_item_matrix_GB_masked(max_user=2370, max_item=2835, file_index=-1, files=None):
    """
    Load one of the masked (obfuscated) rating files into a dense user x item matrix.

    Every non-blank line of the chosen file must hold four ``::``-separated columns,
    of which the first three are ``user_id::item_id::rating``; the fourth is
    ignored. Ids are 1-based: the rating of user ``u`` for item ``i`` is stored at
    ``[u - 1, i - 1]``. Cells without a rating stay 0.

    :param max_user: highest user id to keep; also the number of rows of the result
    :param max_item: highest item id to keep; also the number of columns of the result
    :param file_index: position of the file to load within the file list
        (negative values count from the end, so -1 is the last entry)
    :param files: optional list of file paths to choose from; when omitted the
        built-in list below is used
    :return: numpy array of shape (max_user, max_item) holding the ratings
    :raises IndexError: if ``file_index`` is out of range for the file list
    :raises ValueError: if a non-blank line does not have exactly four columns or an
        id / rating cannot be parsed as a number
    """
    if files is None:
        files = [
            # Here add path to your files. Please note that we start from #0 like in the example
            "Flixster/BlurMe/All_GB_blurme_obfuscated_0.01_greedy_avg_top-1.dat",  # 0
        ]
    matrix = np.zeros(shape=(max_user, max_item))
    with open(files[file_index], 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            user_id, item_id, rating, _ = line.split("::")
            user_id, item_id, rating = int(user_id), int(item_id), float(rating)
            # The lower bound matters: an id of 0 (or below) would otherwise become a
            # negative index and silently overwrite the last row / column.
            if 1 <= user_id <= max_user and 1 <= item_id <= max_item:
                matrix[user_id - 1, item_id - 1] = rating
    return matrix