Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Option to add user and/or item features #159

Open
wants to merge 54 commits into
base: user_item_features
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
8a3532f
added asym_rmse and asym_mae
martincousi Mar 26, 2018
de2cd0c
Merge pull request #1 from NicolasHug/master
martincousi Mar 26, 2018
6d18af6
Merge pull request #2 from martincousi/asymetric-measures
martincousi Mar 26, 2018
3f6b1d0
disable print in AlgoBase.compute_baselines()
martincousi Mar 27, 2018
daab1ba
Cancel printing of computation of similarities
martincousi Mar 28, 2018
05ef072
Cancel printing of similarity computation
martincousi Mar 28, 2018
902246f
add load_features_df() method
martincousi Mar 29, 2018
fb64e98
modified construct_trainset() and load_features_df()
martincousi Mar 29, 2018
13f3a28
modified Trainset.__init__()
martincousi Mar 29, 2018
900c0c0
corrected bugs in print statement
martincousi Mar 29, 2018
68ccfca
use user_features_nb to test if initialized
martincousi Mar 30, 2018
f7fa4d8
revert back changes to accuracy.py
martincousi Mar 30, 2018
c6591ae
revert back changes to AlgoBase
martincousi Mar 30, 2018
e31e857
Update .gitignore
martincousi Mar 30, 2018
7d67963
Update .gitignore
martincousi Mar 30, 2018
73bea50
fixed python 2 compatibility
martincousi Mar 30, 2018
4063da8
construction of Lasso.fit()
martincousi Apr 4, 2018
34dd04b
modified predict and estimate methods
martincousi Apr 4, 2018
d275f84
include features in testset and prediction objects
martincousi Apr 4, 2018
14d1248
update matrix factorization estimate method
martincousi Apr 4, 2018
a2b87c4
adapt estimate methods for all prediction algorithms
martincousi Apr 4, 2018
3c5f7e6
add sklearn arguments to Lasso
martincousi Apr 5, 2018
7b82e78
single underscore for dummy variable
martincousi Apr 5, 2018
bf335c2
update documentation for Lasso and change filename
martincousi Apr 5, 2018
e34a5f9
correct conflict with master
martincousi Apr 5, 2018
4081244
add interaction terms in Lasso
martincousi Apr 5, 2018
d3dd0dd
add interaction terms to Lasso.estimate
martincousi Apr 5, 2018
47ff477
correct conflicts with master
martincousi Apr 5, 2018
1279424
correct verbose conflicts in knns
martincousi Apr 5, 2018
62ccd84
add add_interactions to self in Lasso
martincousi Apr 5, 2018
53b8697
change add_interactions fn name
martincousi Apr 5, 2018
7e34298
remove add_interactions_fn
martincousi Apr 5, 2018
39c2601
correct bad index
martincousi Apr 5, 2018
4f3c3a8
pep8 and description
martincousi Apr 5, 2018
aab90a5
add feature labels
martincousi Apr 5, 2018
bfb2b8d
resolve conflicts
martincousi Apr 5, 2018
f9255e1
remove Lasso
martincousi Apr 5, 2018
1d74bef
Merge branch 'master' into features-dataset
martincousi Apr 5, 2018
e63d5ad
Merge pull request #3 from martincousi/features-dataset
martincousi Apr 5, 2018
ed7180d
Revert "Features dataset"
martincousi Apr 5, 2018
40cc8d2
Merge pull request #4 from martincousi/revert-3-features-dataset
martincousi Apr 5, 2018
32082ce
initialize features_labels
martincousi Apr 5, 2018
e3de208
Revert "Revert "Features dataset""
martincousi Apr 5, 2018
c52d707
Merge pull request #5 from martincousi/revert-4-revert-3-features-dat…
martincousi Apr 5, 2018
4fabe29
add lasso
martincousi Apr 5, 2018
e7adc87
Merge pull request #6 from martincousi/lasso
martincousi Apr 5, 2018
4fc4242
Merge pull request #7 from NicolasHug/master
martincousi Apr 6, 2018
5eaccce
Merge branch 'master' into features-dataset
martincousi Apr 6, 2018
7427b22
Delete linear.py
martincousi Apr 9, 2018
773bd24
Update __init__.py
martincousi Apr 9, 2018
bb0012c
Update __init__.py
martincousi Apr 9, 2018
c76a51a
remove features from Prediction object + typos
martincousi Apr 13, 2018
c34a817
correct accuracy
martincousi Apr 13, 2018
fec4d4f
Correct tests
martincousi Apr 13, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ _site

.coverage
tags
settings.json
settings.json
61 changes: 58 additions & 3 deletions surprise/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ class Dataset:
def __init__(self, reader):

self.reader = reader
self.user_features_nb = 0
self.item_features_nb = 0
self.user_features = {}
self.item_features = {}
self.user_features_labels = []
self.item_features_labels = []

@classmethod
def load_builtin(cls, name='ml-100k'):
Expand Down Expand Up @@ -165,6 +171,36 @@ def load_from_df(cls, df, reader):

return DatasetAutoFolds(reader=reader, df=df)

def load_features_df(self, features_df, user_features=True):
    """Load features from a pandas dataframe into a dataset.

    Use this if you want to add user or item features to a dataset. Only
    certain prediction algorithms in the :mod:`prediction_algorithms`
    package support this additional data.

    Args:
        features_df(`DataFrame`): The dataframe containing the features. It
            must have two columns or more, corresponding to the user or
            item (raw) ids, and the features, in this order. If a raw id
            appears in several rows, the last row is kept.
        user_features(:obj:`bool`): Whether the features are for the users
            or the items. Default is ``True``.

    Returns:
        The dataset itself (``self``), so that calls can be chained.
    """

    # Build the raw id -> feature tuple mapping once for both cases.
    # ``name=None`` yields plain tuples, skipping namedtuple creation;
    # the slices below are identical either way.
    features = {row[0]: row[1:]
                for row in features_df.itertuples(index=False, name=None)}
    # The first column holds the raw ids, the others are the features.
    labels = features_df.columns.values.tolist()[1:]

    if user_features:
        self.user_features_df = features_df
        self.user_features = features
        self.user_features_labels = labels
        self.user_features_nb = len(labels)
    else:
        self.item_features_df = features_df
        self.item_features = features
        self.item_features_labels = labels
        self.item_features_nb = len(labels)

    return self

def read_ratings(self, file_name):
"""Return a list of ratings (user, item, rating, timestamp) read from
file_name"""
Expand Down Expand Up @@ -208,20 +244,28 @@ def construct_trainset(self, raw_trainset):
ur = defaultdict(list)
ir = defaultdict(list)

u_features = {}
i_features = {}

# user raw id, item raw id, translated rating, time stamp
for urid, irid, r, timestamp in raw_trainset:
for urid, irid, r, _ in raw_trainset:
try:
uid = raw2inner_id_users[urid]
except KeyError:
uid = current_u_index
raw2inner_id_users[urid] = current_u_index
current_u_index += 1
if self.user_features_nb > 0:
u_features[uid] = self.user_features.get(urid, None)

try:
iid = raw2inner_id_items[irid]
except KeyError:
iid = current_i_index
raw2inner_id_items[irid] = current_i_index
current_i_index += 1
if self.item_features_nb > 0:
i_features[iid] = self.item_features.get(irid, None)

ur[uid].append((iid, r))
ir[iid].append((uid, r))
Expand All @@ -232,8 +276,14 @@ def construct_trainset(self, raw_trainset):

trainset = Trainset(ur,
ir,
u_features,
i_features,
n_users,
n_items,
self.user_features_nb,
self.item_features_nb,
self.user_features_labels,
self.item_features_labels,
n_ratings,
self.reader.rating_scale,
self.reader.offset,
Expand All @@ -244,8 +294,13 @@ def construct_trainset(self, raw_trainset):

def construct_testset(self, raw_testset):

return [(ruid, riid, r_ui_trans)
for (ruid, riid, r_ui_trans, _) in raw_testset]
testset = []
for (ruid, riid, r_ui_trans, _) in raw_testset:
u_features = self.user_features.get(ruid, None)
i_features = self.item_features.get(riid, None)
testset.append((ruid, riid, u_features, i_features, r_ui_trans))

return testset


class DatasetUserFolds(Dataset):
Expand Down
2 changes: 2 additions & 0 deletions surprise/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict):
Used for the returned dict, so that users can use perf['RMSE'] or
perf['rmse'] indifferently.
"""

def __setitem__(self, key, value):
super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value)

Expand Down Expand Up @@ -333,4 +334,5 @@ def seed_and_eval(seed, *args):
different processes."""

random.seed(seed)

return evaluate(*args, verbose=0)
2 changes: 2 additions & 0 deletions surprise/model_selection/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV):
into a pandas `DataFrame` (see :ref:`example
<cv_results_example>`).
"""

def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'],
cv=None, refit=False, return_train_measures=False, n_jobs=1,
pre_dispatch='2*n_jobs', joblib_verbose=0):
Expand Down Expand Up @@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV):
into a pandas `DataFrame` (see :ref:`example
<cv_results_example>`).
"""

def __init__(self, algo_class, param_distributions, n_iter=10,
measures=['rmse', 'mae'], cv=None, refit=False,
return_train_measures=False, n_jobs=1,
Expand Down
2 changes: 1 addition & 1 deletion surprise/model_selection/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def split(self, data):

Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
ratings that will be divided into trainsets and testsets.

Yields:
tuple of (trainset, testset)
Expand Down
16 changes: 12 additions & 4 deletions surprise/prediction_algorithms/algo_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, **kwargs):
self.skip_train = False

if (guf(self.__class__.fit) is guf(AlgoBase.fit) and
guf(self.__class__.train) is not guf(AlgoBase.train)):
guf(self.__class__.train) is not guf(AlgoBase.train)):
warnings.warn('It looks like this algorithm (' +
str(self.__class__) +
') implements train() '
Expand Down Expand Up @@ -96,7 +96,8 @@ def fit(self, trainset):

return self

def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None,
clip=True, verbose=False):
"""Compute the rating prediction for given user and item.

The ``predict`` method converts raw ids to inner ids and then calls the
Expand All @@ -108,6 +109,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
Args:
uid: (Raw) id of the user. See :ref:`this note<raw_inner_note>`.
iid: (Raw) id of the item. See :ref:`this note<raw_inner_note>`.
u_features: List of user features in the same order as used in
the ``fit`` method. Optional, default is ``None``.
i_features: List of item features in the same order as used in
the ``fit`` method. Optional, default is ``None``.
r_ui(float): The true rating :math:`r_{ui}`. Optional, default is
``None``.
clip(bool): Whether to clip the estimation into the rating scale.
Expand Down Expand Up @@ -143,7 +148,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):

details = {}
try:
est = self.estimate(iuid, iiid)
est = self.estimate(iuid, iiid, u_features, i_features)

# If the details dict was also returned
if isinstance(est, tuple):
Expand Down Expand Up @@ -207,9 +212,12 @@ def test(self, testset, verbose=False):
# The ratings are translated back to their original scale.
predictions = [self.predict(uid,
iid,
u_features,
i_features,
r_ui_trans - self.trainset.offset,
verbose=verbose)
for (uid, iid, r_ui_trans) in testset]
for (uid, iid, u_features, i_features, r_ui_trans)
in testset]
return predictions

def compute_baselines(self):
Expand Down
2 changes: 1 addition & 1 deletion surprise/prediction_algorithms/baseline_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def fit(self, trainset):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

est = self.trainset.global_mean
if self.trainset.knows_user(u):
Expand Down
4 changes: 2 additions & 2 deletions surprise/prediction_algorithms/co_clustering.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class CoClustering(AlgoBase):
self.n_cltr_u = n_cltr_u
self.n_cltr_i = n_cltr_i
self.n_epochs = n_epochs
self.verbose=verbose
self.verbose = verbose
self.random_state = random_state

def fit(self, trainset):
Expand Down Expand Up @@ -236,7 +236,7 @@ class CoClustering(AlgoBase):

return avg_cltr_u, avg_cltr_i, avg_cocltr

def estimate(self, u, i):
def estimate(self, u, i, *_):

if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
return self.trainset.global_mean
Expand Down
16 changes: 7 additions & 9 deletions surprise/prediction_algorithms/knns.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,10 @@ def fit(self, trainset):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
raise PredictionImpossible('User and/or item is unkown.')
raise PredictionImpossible('User and/or item is unknown.')

x, y = self.switch(u, i)

Expand Down Expand Up @@ -183,10 +183,10 @@ def fit(self, trainset):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
raise PredictionImpossible('User and/or item is unkown.')
raise PredictionImpossible('User and/or item is unknown.')

x, y = self.switch(u, i)

Expand Down Expand Up @@ -254,13 +254,11 @@ class KNNBaseline(SymmetricAlgo):
measure. See :ref:`similarity_measures_configuration` for accepted
options. It is recommended to use the :func:`pearson_baseline
<surprise.similarities.pearson_baseline>` similarity measure.

bsl_options(dict): A dictionary of options for the baseline estimates
computation. See :ref:`baseline_estimates_configuration` for
accepted options.
verbose(bool): Whether to print trace messages of bias estimation,
similarity, etc. Default is True.

"""

def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={},
Expand All @@ -282,7 +280,7 @@ def fit(self, trainset):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

est = self.trainset.global_mean
if self.trainset.knows_user(u):
Expand Down Expand Up @@ -384,10 +382,10 @@ def fit(self, trainset):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
raise PredictionImpossible('User and/or item is unkown.')
raise PredictionImpossible('User and/or item is unknown.')

x, y = self.switch(u, i)

Expand Down
10 changes: 5 additions & 5 deletions surprise/prediction_algorithms/matrix_factorization.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ class SVD(AlgoBase):
self.pu = pu
self.qi = qi

def estimate(self, u, i):
def estimate(self, u, i, *_):
# Should we cythonize this as well?

known_user = self.trainset.knows_user(u)
Expand All @@ -275,7 +275,7 @@ class SVD(AlgoBase):
if known_user and known_item:
est = np.dot(self.qi[i], self.pu[u])
else:
raise PredictionImpossible('User and item are unkown.')
raise PredictionImpossible('User and item are unknown.')

return est

Expand Down Expand Up @@ -484,7 +484,7 @@ class SVDpp(AlgoBase):
self.qi = qi
self.yj = yj

def estimate(self, u, i):
def estimate(self, u, i, *_):

est = self.trainset.global_mean

Expand Down Expand Up @@ -715,7 +715,7 @@ class NMF(AlgoBase):
self.pu = pu
self.qi = qi

def estimate(self, u, i):
def estimate(self, u, i, *_):
# Should we cythonize this as well?

known_user = self.trainset.knows_user(u)
Expand All @@ -737,6 +737,6 @@ class NMF(AlgoBase):
if known_user and known_item:
est = np.dot(self.qi[i], self.pu[u])
else:
raise PredictionImpossible('User and item are unkown.')
raise PredictionImpossible('User and item are unknown.')

return est
2 changes: 1 addition & 1 deletion surprise/prediction_algorithms/slope_one.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
raise PredictionImpossible('User and/or item is unkown.')
Expand Down
Loading