diff --git a/.gitignore b/.gitignore index bd32b905..45019cb0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ _site .coverage tags -settings.json \ No newline at end of file +settings.json diff --git a/surprise/dataset.py b/surprise/dataset.py index 17638b6c..da8a5ba4 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,6 +53,12 @@ class Dataset: def __init__(self, reader): self.reader = reader + self.user_features_nb = 0 + self.item_features_nb = 0 + self.user_features = {} + self.item_features = {} + self.user_features_labels = [] + self.item_features_labels = [] @classmethod def load_builtin(cls, name='ml-100k'): @@ -165,6 +171,36 @@ def load_from_df(cls, df, reader): return DatasetAutoFolds(reader=reader, df=df) + def load_features_df(self, features_df, user_features=True): + """Load features from a pandas dataframe into a dataset. + + Use this if you want to add user or item features to a dataset. Only + certain prediction algorithms in the :mod:`prediction_algorithms` + package support this additional data. + + Args: + features_df(`Dataframe`): The dataframe containing the features. It + must have two columns or more, corresponding to the user or + item (raw) ids, and the features, in this order. + user_features(:obj:`bool`): Whether the features are for the users + or the items. Default is ``True``. + """ + + if user_features: + self.user_features_df = features_df + self.user_features = {tup[0]: tup[1:] for tup in + features_df.itertuples(index=False)} + self.user_features_labels = features_df.columns.values.tolist()[1:] + self.user_features_nb = len(self.user_features_labels) + else: + self.item_features_df = features_df + self.item_features = {tup[0]: tup[1:] for tup in + features_df.itertuples(index=False)} + self.item_features_labels = features_df.columns.values.tolist()[1:] + self.item_features_nb = len(self.item_features_labels) + + return self + def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from file_name""" @@ -208,20 +244,28 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) + u_features = {} + i_features = {} + # user raw id, item raw id, translated rating, time stamp - for urid, irid, r, timestamp in raw_trainset: + for urid, irid, r, _ in raw_trainset: try: uid = raw2inner_id_users[urid] except KeyError: uid = current_u_index raw2inner_id_users[urid] = current_u_index current_u_index += 1 + if self.user_features_nb > 0: + u_features[uid] = self.user_features.get(urid, None) + try: iid = raw2inner_id_items[irid] except KeyError: iid = current_i_index raw2inner_id_items[irid] = current_i_index current_i_index += 1 + if self.item_features_nb > 0: + i_features[iid] = self.item_features.get(irid, None) ur[uid].append((iid, r)) ir[iid].append((uid, r)) @@ -232,8 +276,14 @@ def construct_trainset(self, raw_trainset): trainset = Trainset(ur, ir, + u_features, + i_features, n_users, n_items, + self.user_features_nb, + self.item_features_nb, + self.user_features_labels, + self.item_features_labels, n_ratings, self.reader.rating_scale, self.reader.offset, @@ -244,8 +294,13 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): - return [(ruid, riid, r_ui_trans) - for (ruid, riid, r_ui_trans, _) in raw_testset] + testset = [] + for (ruid, riid, r_ui_trans, _) in raw_testset: + u_features = self.user_features.get(ruid, None) + i_features = self.item_features.get(riid, None) + testset.append((ruid, riid, u_features, i_features, r_ui_trans)) + + return testset class DatasetUserFolds(Dataset): diff --git a/surprise/evaluate.py b/surprise/evaluate.py index 55764d8b..65ff2b86 100644 --- a/surprise/evaluate.py +++ b/surprise/evaluate.py @@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict): Used for the returned dict, so that users can use perf['RMSE'] or perf['rmse'] indifferently. """ + def __setitem__(self, key, value): super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value) @@ -333,4 +334,5 @@ def seed_and_eval(seed, *args): different processes.""" random.seed(seed) + return evaluate(*args, verbose=0) diff --git a/surprise/model_selection/search.py b/surprise/model_selection/search.py index 0510c60f..d1811218 100644 --- a/surprise/model_selection/search.py +++ b/surprise/model_selection/search.py @@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=1, pre_dispatch='2*n_jobs', joblib_verbose=0): @@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=1, diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py index 14697911..5c656565 100644 --- a/surprise/model_selection/split.py +++ b/surprise/model_selection/split.py @@ -372,7 +372,7 @@ def split(self, data): Args: data(:obj:`Dataset`): The data containing - ratings that will be devided into trainsets and testsets. + ratings that will be divided into trainsets and testsets. Yields: tuple of (trainset, testset) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 9becf7bd..5d4c5e02 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -37,7 +37,7 @@ def __init__(self, **kwargs): self.skip_train = False if (guf(self.__class__.fit) is guf(AlgoBase.fit) and - guf(self.__class__.train) is not guf(AlgoBase.train)): + guf(self.__class__.train) is not guf(AlgoBase.train)): warnings.warn('It looks like this algorithm (' + str(self.__class__) + ') implements train() ' @@ -96,7 +96,8 @@ def fit(self, trainset): return self - def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): + def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, + clip=True, verbose=False): """Compute the rating prediction for given user and item. The ``predict`` method converts raw ids to inner ids and then calls the @@ -108,6 +109,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): Args: uid: (Raw) id of the user. See :ref:`this note`. iid: (Raw) id of the item. See :ref:`this note`. + u_features: List of user features in the same order as used in + the ``fit`` method. Optional, default is ``None``. + i_features: List of item features in the same order as used in + the ``fit`` method. Optional, default is ``None``. r_ui(float): The true rating :math:`r_{ui}`. Optional, default is ``None``. clip(bool): Whether to clip the estimation into the rating scale. @@ -143,7 +148,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): details = {} try: - est = self.estimate(iuid, iiid) + est = self.estimate(iuid, iiid, u_features, i_features) # If the details dict was also returned if isinstance(est, tuple): @@ -207,9 +212,12 @@ def test(self, testset, verbose=False): # The ratings are translated back to their original scale. predictions = [self.predict(uid, iid, + u_features, + i_features, r_ui_trans - self.trainset.offset, verbose=verbose) - for (uid, iid, r_ui_trans) in testset] + for (uid, iid, u_features, i_features, r_ui_trans) + in testset] return predictions def compute_baselines(self): diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index a5b3036e..a6eb707f 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -37,7 +37,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 408780fc..85837718 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -62,7 +62,7 @@ class CoClustering(AlgoBase): self.n_cltr_u = n_cltr_u self.n_cltr_i = n_cltr_i self.n_epochs = n_epochs - self.verbose=verbose + self.verbose = verbose self.random_state = random_state def fit(self, trainset): @@ -236,7 +236,7 @@ class CoClustering(AlgoBase): return avg_cltr_u, avg_cltr_i, avg_cocltr - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): return self.trainset.global_mean diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 0a15e1d4..245a83dc 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -100,10 +100,10 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) @@ -183,10 +183,10 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) @@ -254,13 +254,11 @@ class KNNBaseline(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. It is recommended to use the :func:`pearson_baseline ` similarity measure. - bsl_options(dict): A dictionary of options for the baseline estimates computation. See :ref:`baseline_estimates_configuration` for accepted options. verbose(bool): Whether to print trace messages of bias estimation, similarity, etc. Default is True. - """ def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, @@ -282,7 +280,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -384,10 +382,10 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 0e898632..7a3cede5 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -275,7 +275,7 @@ class SVD(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -737,6 +737,6 @@ class NMF(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx index 8049a6cf..f986e496 100644 --- a/surprise/prediction_algorithms/slope_one.pyx +++ b/surprise/prediction_algorithms/slope_one.pyx @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/trainset.py b/surprise/trainset.py index ebb95204..c7d091f6 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -33,21 +33,37 @@ class Trainset: ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a dictionary containing lists of tuples of the form ``(user_inner_id, rating)``. The keys are item inner ids. + u_features(:obj:`defaultdict` of :obj:`list`): The user features. This + is a dictionary containing lists of features. The keys are user + inner ids. + i_features(:obj:`defaultdict` of :obj:`list`): The item features. This + is a dictionary containing lists of features. The keys are item + inner ids. n_users: Total number of users :math:`|U|`. n_items: Total number of items :math:`|I|`. + n_user_features: Total number of user features. + n_item_features: Total number of item features. n_ratings: Total number of ratings :math:`|R_{train}|`. rating_scale(tuple): The minimum and maximal rating of the rating scale. global_mean: The mean of all ratings :math:`\\mu`. """ - def __init__(self, ur, ir, n_users, n_items, n_ratings, rating_scale, - offset, raw2inner_id_users, raw2inner_id_items): + def __init__(self, ur, ir, u_features, i_features, n_users, n_items, + n_user_features, n_item_features, user_features_labels, + item_features_labels, n_ratings, rating_scale, offset, + raw2inner_id_users, raw2inner_id_items): self.ur = ur self.ir = ir + self.u_features = u_features + self.i_features = i_features self.n_users = n_users self.n_items = n_items + self.n_user_features = n_user_features + self.n_item_features = n_item_features + self.user_features_labels = user_features_labels + self.item_features_labels = item_features_labels self.n_ratings = n_ratings self.rating_scale = rating_scale self.offset = offset @@ -87,6 +103,30 @@ def knows_item(self, iid): return iid in self.ir + def has_user_features(self, uid): + """Indicate if the user features are part of the trainset. + + Args: + uid(int): The (inner) user id. See :ref:`this + note`. + Returns: + ``True`` if user features are part of the trainset, else ``False``. + """ + + return uid in self.u_features + + def has_item_features(self, iid): + """Indicate if the item features are part of the trainset. + + Args: + iid(int): The (inner) item id. See :ref:`this + note`. + Returns: + ``True`` if item features are part of the trainset, else ``False``. + """ + + return iid in self.i_features + def to_inner_uid(self, ruid): """Convert a **user** raw id to an inner id. @@ -200,8 +240,14 @@ def build_testset(self): cases where you want to to test your algorithm on the trainset. """ - return [(self.to_raw_uid(u), self.to_raw_iid(i), r) - for (u, i, r) in self.all_ratings()] + testset = [] + for (u, i, r) in self.all_ratings(): + u_features = self.u_features.get(u, None) + i_features = self.i_features.get(i, None) + testset.append((self.to_raw_uid(u), self.to_raw_iid(i), u_features, + i_features, r)) + + return testset def build_anti_testset(self, fill=None): """Return a list of ratings that can be used as a testset in the @@ -228,9 +274,13 @@ def build_anti_testset(self, fill=None): anti_testset = [] for u in self.all_users(): user_items = set([j for (j, _) in self.ur[u]]) - anti_testset += [(self.to_raw_uid(u), self.to_raw_iid(i), fill) for - i in self.all_items() if - i not in user_items] + anti_testset += [(self.to_raw_uid(u), + self.to_raw_iid(i), + self.u_features.get(u, None), + self.i_features.get(i, None), + fill) + for i in self.all_items() + if i not in user_items] return anti_testset def all_users(self): diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 69311404..2526d20c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -145,12 +145,12 @@ def test_trainset_testset(): for i in range(4): assert trainset.to_inner_uid('user' + str(i)) == i with pytest.raises(ValueError): - trainset.to_inner_uid('unkown_user') + trainset.to_inner_uid('unknown_user') for i in range(2): assert trainset.to_inner_iid('item' + str(i)) == i with pytest.raises(ValueError): - trainset.to_inner_iid('unkown_item') + trainset.to_inner_iid('unknown_item') # test inner2raw assert trainset._inner2raw_id_users is None @@ -167,19 +167,19 @@ def test_trainset_testset(): algo.fit(trainset) testset = trainset.build_testset() algo.test(testset) # ensure an algorithm can manage the data - assert ('user0', 'item0', 4) in testset - assert ('user3', 'item1', 5) in testset - assert ('user3', 'item1', 0) not in testset + assert ('user0', 'item0', None, None, 4) in testset + assert ('user3', 'item1', None, None, 5) in testset + assert ('user3', 'item1', None, None, 0) not in testset # Test the build_anti_testset() method algo = BaselineOnly() algo.fit(trainset) testset = trainset.build_anti_testset() algo.test(testset) # ensure an algorithm can manage the data - assert ('user0', 'item0', trainset.global_mean) not in testset - assert ('user3', 'item1', trainset.global_mean) not in testset - assert ('user0', 'item1', trainset.global_mean) in testset - assert ('user3', 'item0', trainset.global_mean) in testset + assert ('user0', 'item0', None, None, trainset.global_mean) not in testset + assert ('user3', 'item1', None, None, trainset.global_mean) not in testset + assert ('user0', 'item1', None, None, trainset.global_mean) in testset + assert ('user3', 'item0', None, None, trainset.global_mean) in testset def test_load_form_df(): @@ -238,11 +238,11 @@ def test_build_anti_testset(): # fill with some specific value for fillvalue in (0, 42., -1): anti = trainset.build_anti_testset(fill=fillvalue) - for (u, i, r) in anti: + for (u, i, u_f, i_f, r) in anti: assert r == fillvalue # fill with global_mean anti = trainset.build_anti_testset(fill=None) - for (u, i, r) in anti: + for (u, i, u_f, i_f, r) in anti: assert r == trainset.global_mean expect = trainset.n_users * trainset.n_items assert trainset.n_ratings + len(anti) == expect diff --git a/tests/test_split.py b/tests/test_split.py index 0c12cb53..d55eb5ad 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -299,7 +299,7 @@ def test_LeaveOneOut(): # Make sure only one rating per user is present in the testset loo = LeaveOneOut() for _, testset in loo.split(data): - cnt = Counter([uid for (uid, _, _) in testset]) + cnt = Counter([uid for (uid, _, _, _, _) in testset]) assert all(val == 1 for val in itervalues(cnt)) # test the min_n_ratings parameter diff --git a/tests/test_train2fit.py b/tests/test_train2fit.py index ab0634e4..b2993031 100644 --- a/tests/test_train2fit.py +++ b/tests/test_train2fit.py @@ -35,7 +35,7 @@ def fit(self, trainset): self.bu, self.bi = 1, 1 self.cnt += 1 - def estimate(self, u, i): + def estimate(self, u, i, *_): return self.est algo = CustomAlgoFit() @@ -91,7 +91,7 @@ def train(self, trainset): self.bu, self.bi = 1, 1 self.cnt += 1 - def estimate(self, u, i): + def estimate(self, u, i, *_): return self.est with pytest.warns(UserWarning):