Refactor the pipelines to avoid fitting before splitting #3877

Merged · 6 commits · Dec 2, 2023
40 changes: 13 additions & 27 deletions bugbug/commit_features.py
@@ -3,7 +3,6 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import sys
from collections import defaultdict
from typing import Sequence

import pandas as pd
@@ -808,34 +807,14 @@ def __call__(self, commit, **kwargs):


class Files(object):
def __init__(self, min_freq=0.0014):
self.min_freq = min_freq
name = "files"

def fit(self, commits):
self.count = defaultdict(int)

self.total_commits = 0

for commit in commits:
self.total_commits += 1

for f in commit["files"]:
self.count[f] += 1

# We no longer need to store counts for files which have low frequency.
to_del = set(
f for f, c in self.count.items() if c / self.total_commits < self.min_freq
)
def __call__(self, commit, **kwargs):
return commit["files"]

for f in to_del:
del self.count[f]

def __call__(self, commit, **kwargs):
return [
f
for f in commit["files"]
if (self.count[f] / self.total_commits) > self.min_freq
]
def _pass_through_tokenizer(doc):
return doc


class FileTouchedPrev(object):
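Note: a minimal before/after sketch of the `Files` change above (the names `OldFiles`/`NewFiles` are illustrative, not from the diff). The point of the refactor is that the extractor no longer learns file frequencies in `fit`, so nothing is computed over the full dataset before the train/test split; the rare-file cutoff moves into each model's `CountVectorizer(min_df=...)`.

```python
from collections import defaultdict


class OldFiles:
    """Pre-refactor shape: stateful, fitted on *all* commits."""

    def __init__(self, min_freq=0.0014):
        self.min_freq = min_freq

    def fit(self, commits):
        # Document frequencies were counted over every commit, so when
        # fit_transform ran before the split, test rows leaked in here.
        self.count = defaultdict(int)
        self.total_commits = 0
        for commit in commits:
            self.total_commits += 1
            for f in commit["files"]:
                self.count[f] += 1

    def __call__(self, commit, **kwargs):
        return [
            f
            for f in commit["files"]
            if self.count[f] / self.total_commits > self.min_freq
        ]


class NewFiles:
    """Post-refactor shape: a stateless pass-through; the frequency
    filtering happens later, inside the fitted classifier pipeline."""

    name = "files"

    def __call__(self, commit, **kwargs):
        return commit["files"]
```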
@@ -1008,6 +987,7 @@ def transform(self, commits):

for commit in commits():
data = {}
result = {"data": data}

for feature_extractor in self.feature_extractors:
if "bug_features" in feature_extractor.__module__:
@@ -1028,6 +1008,13 @@
else:
feature_extractor_name = feature_extractor.__class__.__name__

# FIXME: This is a workaround to pass the value to the
# union transformer independently. This will be dropped when we
# resolve https://github.com/mozilla/bugbug/issues/3876
if isinstance(feature_extractor, Files):
result[sys.intern(feature_extractor_name)] = res
continue

if isinstance(res, dict):
for key, value in res.items():
data[sys.intern(key)] = value
@@ -1040,7 +1027,6 @@

data[sys.intern(feature_extractor_name)] = res

result = {"data": data}
if "desc" in commit:
for cleanup_function in self.cleanup_functions:
result["desc"] = cleanup_function(commit["desc"])
41 changes: 15 additions & 26 deletions bugbug/model.py
@@ -18,7 +18,6 @@
make_index_balanced_accuracy,
specificity_score,
)
from imblearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
@@ -148,7 +147,6 @@ def __init__(self, lemmatization=False):
self.text_vectorizer = TfidfVectorizer

self.cross_validation_enabled = True
self.sampler = None

self.calculate_importance = True

@@ -196,6 +194,8 @@ def get_human_readable_feature_names(self):
feature_name = f"Comments contain '{feature_name}'"
elif type_ == "text":
feature_name = f"Combined text contains '{feature_name}'"
elif type_ == "files":
feature_name = f"File '{feature_name}'"
elif type_ not in ("data", "couple_data"):
raise ValueError(f"Unexpected feature type for: {full_feature_name}")
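Note: a tiny sketch of the naming convention the new `files` branch relies on; the `step__feature` format of `full_feature_name` is assumed from `ColumnTransformer.get_feature_names_out()`, not spelled out in this hunk.

```python
# ColumnTransformer prefixes each output with the step that produced it.
full_feature_name = "files__dom/base/Document.cpp"  # hypothetical name
type_, feature_name = full_feature_name.split("__", 1)
if type_ == "files":
    feature_name = f"File '{feature_name}'"
assert feature_name == "File 'dom/base/Document.cpp'"
```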

@@ -348,7 +348,7 @@ def train(self, importance_cutoff=0.15, limit=None):
X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))

# Extract features from the items.
X = self.extraction_pipeline.fit_transform(X_gen)
X = self.extraction_pipeline.transform(X_gen)

# Calculate labels.
y = np.array(y)
@@ -365,10 +365,6 @@

# Split dataset in training and test.
X_train, X_test, y_train, y_test = self.train_test_split(X, y)
if self.sampler is not None:
pipeline = make_pipeline(self.sampler, self.clf)
else:
pipeline = self.clf

tracking_metrics = {}

@@ -379,7 +375,7 @@
scorings += ["precision", "recall"]

scores = cross_validate(
pipeline, X_train, self.le.transform(y_train), scoring=scorings, cv=5
self.clf, X_train, self.le.transform(y_train), scoring=scorings, cv=5
)

logger.info("Cross Validation scores:")
@@ -394,16 +390,10 @@
)

logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

# Training on the resampled dataset if sampler is provided.
if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X_train, y_train)

logger.info(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}")

logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

self.clf.fit(X_train, self.le.transform(y_train))
logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)

logger.info("Model trained")

@@ -558,11 +548,8 @@ def train(self, importance_cutoff=0.15, limit=None):
if self.entire_dataset_training:
logger.info("Retraining on the entire dataset...")

if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X, y)
else:
X_train = X
y_train = y
X_train = X
y_train = y

logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

@@ -571,13 +558,15 @@
model_directory = self.__class__.__name__.lower()
makedirs(model_directory, exist_ok=True)

if issubclass(type(self.clf), XGBModel):
step_name, estimator = self.clf.steps.pop()
if issubclass(type(estimator), XGBModel):
xgboost_model_path = path.join(model_directory, "xgboost.ubj")
self.clf.save_model(xgboost_model_path)
estimator.save_model(xgboost_model_path)

# Since we save the classifier separately, we need to clear the clf
# attribute to prevent it from being pickled with the model object.
self.clf = self.clf.__class__(**self.hyperparameter)
# Since we save the estimator separately, we need to reset it to
# prevent its data from being pickled with the pipeline.
estimator = estimator.__class__(**self.hyperparameter)
self.clf.steps.append((step_name, estimator))

model_path = path.join(model_directory, "model.pkl")
with open(model_path, "wb") as f:
@@ -600,7 +589,7 @@ def load(model_directory: str) -> "Model":

xgboost_model_path = path.join(model_directory, "xgboost.ubj")
if path.exists(xgboost_model_path):
model.clf.load_model(xgboost_model_path)
model.clf.named_steps["estimator"].load_model(xgboost_model_path)

return model
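Note: the save/load logic above, extracted into a sketch; the function names `save_pipeline`/`load_weights` are illustrative. The fitted estimator is popped off the pipeline, stored in XGBoost's native format, and replaced by an unfitted copy, so the pickled pipeline carries the vectorizers but not the booster weights; `load` later restores the weights into the `"estimator"` step.

```python
from os import path

from sklearn.pipeline import Pipeline
from xgboost import XGBModel


def save_pipeline(clf: Pipeline, model_directory: str, hyperparameter: dict) -> None:
    # After this PR, the last step is the ("estimator", XGBClassifier) pair.
    step_name, estimator = clf.steps.pop()
    if issubclass(type(estimator), XGBModel):
        # Store the trained booster in XGBoost's native format...
        estimator.save_model(path.join(model_directory, "xgboost.ubj"))
        # ...and re-append an unfitted copy to keep the pickle small.
        estimator = estimator.__class__(**hyperparameter)
    clf.steps.append((step_name, estimator))


def load_weights(clf: Pipeline, model_directory: str) -> None:
    # Counterpart of the load() hunk: restore the saved weights into the
    # pipeline's "estimator" step after unpickling.
    xgboost_model_path = path.join(model_directory, "xgboost.ubj")
    if path.exists(xgboost_model_path):
        clf.named_steps["estimator"].load_model(xgboost_model_path)
```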

30 changes: 24 additions & 6 deletions bugbug/models/annotate_ignore.py
@@ -6,9 +6,11 @@
import logging

import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bugzilla, commit_features, feature_cleanup, labels, repository, utils
@@ -23,11 +25,10 @@ def __init__(self, lemmatization: bool = False) -> None:
CommitModel.__init__(self, lemmatization)

self.calculate_importance = False
self.cross_validation_enabled = False

self.training_dbs += [bugzilla.BUGS_DB]

self.sampler = RandomUnderSampler(random_state=0)

feature_extractors = [
commit_features.SourceCodeFileSize(),
commit_features.OtherFileSize(),
@@ -67,21 +68,38 @@ def __init__(self, lemmatization: bool = False) -> None:
feature_extractors, cleanup_functions
),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = ImblearnPipeline(
[
(
"union",
ColumnTransformer(
[
("data", DictVectorizer(), "data"),
("desc", self.text_vectorizer(min_df=0.0001), "desc"),
(
"files",
CountVectorizer(
analyzer=utils.keep_as_is,
lowercase=False,
min_df=0.0014,
),
"files",
),
]
),
),
("sampler", RandomUnderSampler(random_state=0)),
(
"estimator",
xgboost.XGBClassifier(**self.hyperparameter),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = xgboost.XGBClassifier(**self.hyperparameter)

def get_labels(self):
classes = {}

@@ -123,4 +141,4 @@ def get_labels(self):
return classes, [0, 1]

def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
return self.clf.named_steps["union"].get_feature_names_out()
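Note: an end-to-end toy of the new `annotate_ignore` classifier pipeline on hypothetical records (the real input is the DataFrame produced by `CommitExtractor`). `analyzer=lambda doc: doc` stands in for `utils.keep_as_is`, and the `min_df` on the file vectorizer replaces the frequency cutoff that the old `Files.fit` applied to the whole dataset.

```python
import pandas as pd
import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hypothetical extracted records; real ones come from CommitExtractor.
X = pd.DataFrame(
    {
        "data": [{"size": 1.0}, {"size": 9.0}, {"size": 2.0}, {"size": 8.0}],
        "desc": ["fix crash", "add test", "fix leak", "update docs"],
        "files": [["a.cpp"], ["a.cpp", "b.h"], ["b.h"], ["c.py"]],
    }
)
y = [0, 1, 0, 1]

clf = ImblearnPipeline(
    [
        (
            "union",
            ColumnTransformer(
                [
                    ("data", DictVectorizer(), "data"),
                    ("desc", TfidfVectorizer(min_df=0.0001), "desc"),
                    (
                        "files",
                        # Each row is already a token list, so the analyzer
                        # is a pass-through; min_df is now fitted on the
                        # data the pipeline is trained on, nothing more.
                        CountVectorizer(analyzer=lambda doc: doc, min_df=0.0014),
                        "files",
                    ),
                ]
            ),
        ),
        ("sampler", RandomUnderSampler(random_state=0)),
        ("estimator", xgboost.XGBClassifier(n_estimators=10)),
    ]
)

clf.fit(X, y)  # every fitted statistic now comes from X only
print(clf.predict(X))  # the sampler is skipped at predict time
```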
15 changes: 11 additions & 4 deletions bugbug/models/assignee.py
@@ -67,6 +67,12 @@ def __init__(self, lemmatization=False):
rollback_when=self.rollback,
),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = Pipeline(
[
(
"union",
ColumnTransformer(
@@ -81,12 +87,13 @@
]
),
),
(
"estimator",
xgboost.XGBClassifier(**self.hyperparameter),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = xgboost.XGBClassifier(**self.hyperparameter)

def get_labels(self):
classes = {}

@@ -117,7 +124,7 @@ def get_labels(self):
return classes, set(classes.values())

def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
return self.clf.named_steps["union"].get_feature_names_out()

def rollback(self, change):
return change["field_name"].startswith("assigned_to")
31 changes: 24 additions & 7 deletions bugbug/models/backout.py
Original file line number Diff line number Diff line change
@@ -9,9 +9,11 @@
import dateutil.parser
import xgboost
from dateutil.relativedelta import relativedelta
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bug_features, commit_features, feature_cleanup, repository, utils
@@ -27,8 +29,6 @@ def __init__(self, lemmatization=False, bug_data=False):

self.calculate_importance = False

self.sampler = RandomUnderSampler(random_state=0)

feature_extractors = [
commit_features.SourceCodeFilesModifiedNum(),
commit_features.OtherFilesModifiedNum(),
@@ -74,29 +74,46 @@ def __init__(self, lemmatization=False, bug_data=False):
feature_cleanup.synonyms(),
]

        self.extraction_pipeline = Pipeline(
[
(
"commit_extractor",
commit_features.CommitExtractor(
feature_extractors, cleanup_functions
),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
        self.clf = ImblearnPipeline(
[
(
"union",
ColumnTransformer(
[
("data", DictVectorizer(), "data"),
("desc", self.text_vectorizer(), "desc"),
(
"files",
CountVectorizer(
analyzer=utils.keep_as_is,
lowercase=False,
min_df=0.0014,
),
"files",
),
]
),
),
("sampler", RandomUnderSampler(random_state=0)),
(
"estimator",
xgboost.XGBClassifier(**self.hyperparameter),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = xgboost.XGBClassifier(**self.hyperparameter)

def get_labels(self):
classes = {}

@@ -123,4 +140,4 @@ def get_labels(self):
return classes, [0, 1]

def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
return self.clf.named_steps["union"].get_feature_names_out()
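Note: a minimal, self-contained illustration of the leak this PR removes, on toy documents; `min_df=2` stands in for the frequency cutoffs used above. Fitting before splitting lets document frequencies from test rows decide which features exist.

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

docs = ["a b", "a b", "a c", "a d", "a e", "a f"]
y = [0, 1, 0, 1, 0, 1]

# Leaky order (the old fit_transform-before-split): min_df is evaluated
# against document frequencies counted over all rows, test rows included.
leaky = CountVectorizer(min_df=2).fit(docs)

# Fixed order (this PR): split first, then fit on the training rows, so
# the vocabulary and the min_df cutoff never see test data.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, y, test_size=0.5, random_state=0
)
fixed = CountVectorizer(min_df=2).fit(docs_train)

print(sorted(leaky.vocabulary_))  # may keep terms that only pass min_df
print(sorted(fixed.vocabulary_))  # thanks to counts from the test rows
```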