NicolasHug · martincousi · Mar 26, 2018 · Mar 26, 2018 · Mar 26, 2018 · Mar 27, 2018
diff --git a/.gitignore b/.gitignore
@@ -23,4 +23,4 @@ _site
 
 .coverage
 tags
-settings.json
+settings.json
diff --git a/surprise/dataset.py b/surprise/dataset.py
@@ -53,6 +53,12 @@ class Dataset:
     def __init__(self, reader):
 
         self.reader = reader
+        # Optional side information, filled in by ``load_features_df``:
+        # dicts mapping a raw id to its feature values, the feature labels
+        # and the number of features, for the users and for the items.
+        self.user_features, self.item_features = {}, {}
+        self.user_features_labels, self.item_features_labels = [], []
+        self.user_features_nb = self.item_features_nb = 0
 
     @classmethod
     def load_builtin(cls, name='ml-100k'):
@@ -165,6 +171,36 @@ def load_from_df(cls, df, reader):
 
         return DatasetAutoFolds(reader=reader, df=df)
 
+    def load_features_df(self, features_df, user_features=True):
+        """Attach user or item features, read from a pandas dataframe.
+
+        Only some algorithms of the :mod:`prediction_algorithms` package
+        support this additional data. The dataset itself is returned.
+
+        Args:
+            features_df(`DataFrame`): The dataframe containing the features.
+                Its first column holds the user or item (raw) ids and each
+                remaining column (at least one) holds a feature.
+            user_features(:obj:`bool`): Whether the features are for the users
+                or the items. Default is ``True``.
+        """
+
+        features = {row[0]: row[1:]
+                    for row in features_df.itertuples(index=False)}
+        labels = features_df.columns.values.tolist()[1:]
+        if user_features:
+            self.user_features_df = features_df
+            self.user_features = features
+            self.user_features_labels = labels
+            self.user_features_nb = len(labels)
+        else:
+            self.item_features_df = features_df
+            self.item_features = features
+            self.item_features_labels = labels
+            self.item_features_nb = len(labels)
+
+        return self
+
     def read_ratings(self, file_name):
         """Return a list of ratings (user, item, rating, timestamp) read from
         file_name"""
@@ -208,20 +244,28 @@ def construct_trainset(self, raw_trainset):
         ur = defaultdict(list)
         ir = defaultdict(list)
 
+        u_features = {}
+        i_features = {}
+
         # user raw id, item raw id, translated rating, time stamp
-        for urid, irid, r, timestamp in raw_trainset:
+        for urid, irid, r, _ in raw_trainset:
             try:
                 uid = raw2inner_id_users[urid]
             except KeyError:
                 uid = current_u_index
                 raw2inner_id_users[urid] = current_u_index
                 current_u_index += 1
+                if self.user_features_nb > 0:
+                    u_features[uid] = self.user_features.get(urid, None)
+
             try:
                 iid = raw2inner_id_items[irid]
             except KeyError:
                 iid = current_i_index
                 raw2inner_id_items[irid] = current_i_index
                 current_i_index += 1
+                if self.item_features_nb > 0:
+                    i_features[iid] = self.item_features.get(irid, None)
 
             ur[uid].append((iid, r))
             ir[iid].append((uid, r))
@@ -232,8 +276,14 @@ def construct_trainset(self, raw_trainset):
 
         trainset = Trainset(ur,
                             ir,
+                            u_features,
+                            i_features,
                             n_users,
                             n_items,
+                            self.user_features_nb,
+                            self.item_features_nb,
+                            self.user_features_labels,
+                            self.item_features_labels,
                             n_ratings,
                             self.reader.rating_scale,
                             self.reader.offset,
@@ -244,8 +294,13 @@ def construct_trainset(self, raw_trainset):
 
     def construct_testset(self, raw_testset):
 
-        return [(ruid, riid, r_ui_trans)
-                for (ruid, riid, r_ui_trans, _) in raw_testset]
+        # Each rating is paired with the features of its user and of its
+        # item; ``dict.get`` yields ``None`` when an id has no known
+        # features.
+        u_feats, i_feats = self.user_features, self.item_features
+
+        return [(ruid, riid, u_feats.get(ruid), i_feats.get(riid), r_ui_trans)
+                for (ruid, riid, r_ui_trans, _) in raw_testset]
 
 
 class DatasetUserFolds(Dataset):

diff --git a/surprise/evaluate.py b/surprise/evaluate.py
@@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict):
         Used for the returned dict, so that users can use perf['RMSE'] or
         perf['rmse'] indifferently.
     """
+
     def __setitem__(self, key, value):
         super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value)
 
@@ -333,4 +334,5 @@ def seed_and_eval(seed, *args):
     different processes."""
 
     random.seed(seed)
+
     return evaluate(*args, verbose=0)
diff --git a/surprise/model_selection/search.py b/surprise/model_selection/search.py
@@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV):
             into a pandas `DataFrame` (see :ref:`example
             <cv_results_example>`).
     """
+
     def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'],
                  cv=None, refit=False, return_train_measures=False, n_jobs=1,
                  pre_dispatch='2*n_jobs', joblib_verbose=0):
@@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV):
             into a pandas `DataFrame` (see :ref:`example
             <cv_results_example>`).
     """
+
     def __init__(self, algo_class, param_distributions, n_iter=10,
                  measures=['rmse', 'mae'], cv=None, refit=False,
                  return_train_measures=False, n_jobs=1,

diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py
@@ -372,7 +372,7 @@ def split(self, data):
 
         Args:
             data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
-                ratings that will be devided into trainsets and testsets.
+                ratings that will be divided into trainsets and testsets.
 
         Yields:
             tuple of (trainset, testset)

diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py
@@ -37,7 +37,7 @@ def __init__(self, **kwargs):
         self.skip_train = False
 
         if (guf(self.__class__.fit) is guf(AlgoBase.fit) and
-           guf(self.__class__.train) is not guf(AlgoBase.train)):
+                guf(self.__class__.train) is not guf(AlgoBase.train)):
             warnings.warn('It looks like this algorithm (' +
                           str(self.__class__) +
                           ') implements train() '
@@ -96,7 +96,8 @@ def fit(self, trainset):
 
         return self
 
-    def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
+    def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None,
+                clip=True, verbose=False):
         """Compute the rating prediction for given user and item.
 
         The ``predict`` method converts raw ids to inner ids and then calls the
@@ -108,6 +109,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
         Args:
             uid: (Raw) id of the user. See :ref:`this note<raw_inner_note>`.
             iid: (Raw) id of the item. See :ref:`this note<raw_inner_note>`.
+            u_features: List of user features in the same order as used in
+                the ``fit`` method. Optional, default is ``None``.
+            i_features: List of item features in the same order as used in
+                the ``fit`` method. Optional, default is ``None``.
             r_ui(float): The true rating :math:`r_{ui}`. Optional, default is
                 ``None``.
             clip(bool): Whether to clip the estimation into the rating scale.
@@ -143,7 +148,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
 
         details = {}
         try:
-            est = self.estimate(iuid, iiid)
+            est = self.estimate(iuid, iiid, u_features, i_features)
 
             # If the details dict was also returned
             if isinstance(est, tuple):
@@ -207,9 +212,12 @@ def test(self, testset, verbose=False):
         # The ratings are translated back to their original scale.
         predictions = [self.predict(uid,
                                     iid,
+                                    u_features,
+                                    i_features,
                                     r_ui_trans - self.trainset.offset,
                                     verbose=verbose)
-                       for (uid, iid, r_ui_trans) in testset]
+                       for (uid, iid, u_features, i_features, r_ui_trans)
+                       in testset]
         return predictions
 
     def compute_baselines(self):

diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py
@@ -37,7 +37,7 @@ def fit(self, trainset):
 
         return self
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         est = self.trainset.global_mean
         if self.trainset.knows_user(u):

diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx
@@ -62,7 +62,7 @@ class CoClustering(AlgoBase):
         self.n_cltr_u = n_cltr_u
         self.n_cltr_i = n_cltr_i
         self.n_epochs = n_epochs
-        self.verbose=verbose
+        self.verbose = verbose
         self.random_state = random_state
 
     def fit(self, trainset):
@@ -236,7 +236,7 @@ class CoClustering(AlgoBase):
 
         return avg_cltr_u, avg_cltr_i, avg_cocltr
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
             return self.trainset.global_mean

diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py
@@ -100,10 +100,10 @@ def fit(self, trainset):
 
         return self
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
-            raise PredictionImpossible('User and/or item is unkown.')
+            raise PredictionImpossible('User and/or item is unknown.')
 
         x, y = self.switch(u, i)
 
@@ -183,10 +183,10 @@ def fit(self, trainset):
 
         return self
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
-            raise PredictionImpossible('User and/or item is unkown.')
+            raise PredictionImpossible('User and/or item is unknown.')
 
         x, y = self.switch(u, i)
 
@@ -254,13 +254,11 @@ class KNNBaseline(SymmetricAlgo):
             measure. See :ref:`similarity_measures_configuration` for accepted
             options. It is recommended to use the :func:`pearson_baseline
             <surprise.similarities.pearson_baseline>` similarity measure.
-
         bsl_options(dict): A dictionary of options for the baseline estimates
             computation. See :ref:`baseline_estimates_configuration` for
             accepted options.
         verbose(bool): Whether to print trace messages of bias estimation,
             similarity, etc.  Default is True.
-
     """
 
     def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={},
@@ -282,7 +280,7 @@ def fit(self, trainset):
 
         return self
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         est = self.trainset.global_mean
         if self.trainset.knows_user(u):
@@ -384,10 +382,10 @@ def fit(self, trainset):
 
         return self
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
-            raise PredictionImpossible('User and/or item is unkown.')
+            raise PredictionImpossible('User and/or item is unknown.')
 
         x, y = self.switch(u, i)
 

diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx
@@ -253,7 +253,7 @@ class SVD(AlgoBase):
         self.pu = pu
         self.qi = qi
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
         # Should we cythonize this as well?
 
         known_user = self.trainset.knows_user(u)
@@ -275,7 +275,7 @@ class SVD(AlgoBase):
             if known_user and known_item:
                 est = np.dot(self.qi[i], self.pu[u])
             else:
-                raise PredictionImpossible('User and item are unkown.')
+                raise PredictionImpossible('User and item are unknown.')
 
         return est
 
@@ -484,7 +484,7 @@ class SVDpp(AlgoBase):
         self.qi = qi
         self.yj = yj
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         est = self.trainset.global_mean
 
@@ -715,7 +715,7 @@ class NMF(AlgoBase):
         self.pu = pu
         self.qi = qi
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
         # Should we cythonize this as well?
 
         known_user = self.trainset.knows_user(u)
@@ -737,6 +737,6 @@ class NMF(AlgoBase):
             if known_user and known_item:
                 est = np.dot(self.qi[i], self.pu[u])
             else:
-                raise PredictionImpossible('User and item are unkown.')
+                raise PredictionImpossible('User and item are unknown.')
 
         return est
diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx
@@ -79,7 +79,7 @@ class SlopeOne(AlgoBase):
 
         return self
 
-    def estimate(self, u, i):
+    def estimate(self, u, i, *_):
 
         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
             raise PredictionImpossible('User and/or item is unkown.')