From 4cc4089397009ed3addadea2d6042e37d96d30ae Mon Sep 17 00:00:00 2001
From: Suhaib Mujahid <smujahid@mozilla.com>
Date: Fri, 1 Dec 2023 01:24:38 -0500
Subject: [PATCH 1/6] Perform the column transforming step in the clf pipeline

---
 bugbug/models/annotate_ignore.py              | 15 +++++++++++----
 bugbug/models/assignee.py                     | 15 +++++++++++----
 bugbug/models/backout.py                      | 15 +++++++++++----
 bugbug/models/browsername.py                  | 15 +++++++++++----
 bugbug/models/bugtype.py                      | 15 +++++++++++----
 bugbug/models/component.py                    | 15 +++++++++++----
 bugbug/models/defect.py                       | 15 +++++++++++----
 bugbug/models/devdocneeded.py                 | 15 +++++++++++----
 bugbug/models/fixtime.py                      | 15 +++++++++++----
 bugbug/models/invalid_compatibility_report.py | 15 +++++++++++----
 bugbug/models/needsdiagnosis.py               | 15 +++++++++++----
 bugbug/models/qaneeded.py                     | 15 +++++++++++----
 bugbug/models/rcatype.py                      | 15 +++++++++++----
 bugbug/models/regressionrange.py              | 15 +++++++++++----
 bugbug/models/regressor.py                    | 13 +++++++++----
 bugbug/models/spambug.py                      | 15 +++++++++++----
 bugbug/models/stepstoreproduce.py             | 15 +++++++++++----
 bugbug/models/testfailure.py                  | 13 ++++++++++---
 bugbug/models/testselect.py                   | 13 ++++++++++---
 bugbug/models/tracking.py                     | 15 +++++++++++----
 bugbug/models/uplift.py                       | 15 +++++++++++----
 21 files changed, 227 insertions(+), 82 deletions(-)

diff --git a/bugbug/models/annotate_ignore.py b/bugbug/models/annotate_ignore.py
index 5df3e3658c..10818ba845 100644
--- a/bugbug/models/annotate_ignore.py
+++ b/bugbug/models/annotate_ignore.py
@@ -67,6 +67,12 @@ def __init__(self, lemmatization: bool = False) -> None:
                         feature_extractors, cleanup_functions
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -76,12 +82,13 @@ def __init__(self, lemmatization: bool = False) -> None:
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -123,4 +130,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/assignee.py b/bugbug/models/assignee.py
index 07ec2bedc4..4ec9f7b36f 100644
--- a/bugbug/models/assignee.py
+++ b/bugbug/models/assignee.py
@@ -67,6 +67,12 @@ def __init__(self, lemmatization=False):
                         rollback_when=self.rollback,
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -81,12 +87,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -117,7 +124,7 @@ def get_labels(self):
         return classes, set(classes.values())
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def rollback(self, change):
         return change["field_name"].startswith("assigned_to")
diff --git a/bugbug/models/backout.py b/bugbug/models/backout.py
index e6100d9676..6ad9255305 100644
--- a/bugbug/models/backout.py
+++ b/bugbug/models/backout.py
@@ -82,6 +82,12 @@ def __init__(self, lemmatization=False, bug_data=False):
                         feature_extractors, cleanup_functions
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -91,12 +97,13 @@ def __init__(self, lemmatization=False, bug_data=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -123,4 +130,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/browsername.py b/bugbug/models/browsername.py
index 4d781e86a2..35d83adea4 100644
--- a/bugbug/models/browsername.py
+++ b/bugbug/models/browsername.py
@@ -38,6 +38,12 @@ def __init__(self, lemmatization=False):
                         feature_extractors, cleanup_functions
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -52,12 +58,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -81,4 +88,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/bugtype.py b/bugbug/models/bugtype.py
index 70a6111d1f..9e2ea95259 100644
--- a/bugbug/models/bugtype.py
+++ b/bugbug/models/bugtype.py
@@ -128,6 +128,12 @@ def __init__(self, lemmatization=False, historical=False):
                     "bug_extractor",
                     bug_features.BugExtractor(feature_extractors, cleanup_functions),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -147,12 +153,13 @@ def __init__(self, lemmatization=False, historical=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter)),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter))
-
     def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]:
         classes = {}
 
@@ -175,7 +182,7 @@ def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]:
         return classes, TYPE_LIST
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def overwrite_classes(
         self,
diff --git a/bugbug/models/component.py b/bugbug/models/component.py
index b1a8ada587..02aa953752 100644
--- a/bugbug/models/component.py
+++ b/bugbug/models/component.py
@@ -97,6 +97,12 @@ def __init__(self, lemmatization=False):
                         feature_extractors, cleanup_functions, rollback=True
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -111,12 +117,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
         self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
             v: k for k, v in self.CONFLATED_COMPONENTS_MAPPING.items()
         }
@@ -231,7 +238,7 @@ def get_meaningful_product_components(self, full_comp_tuples, threshold_ratio=10
         )
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def check(self):
         success = super().check()
diff --git a/bugbug/models/defect.py b/bugbug/models/defect.py
index 2cd5aef90f..3896b2d1ea 100644
--- a/bugbug/models/defect.py
+++ b/bugbug/models/defect.py
@@ -64,6 +64,12 @@ def __init__(self, lemmatization=False, historical=False):
                     "bug_extractor",
                     bug_features.BugExtractor(feature_extractors, cleanup_functions),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -83,12 +89,13 @@ def __init__(self, lemmatization=False, historical=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_bugbug_labels(self, kind="bug") -> dict[int, Any]:
         assert kind in ["bug", "regression", "defect_enhancement_task"]
 
@@ -264,7 +271,7 @@ def get_labels(self) -> tuple[dict[int, Any], list[Any]]:
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def overwrite_classes(self, bugs, classes, probabilities):
         for i, bug in enumerate(bugs):
diff --git a/bugbug/models/devdocneeded.py b/bugbug/models/devdocneeded.py
index 400d934600..bceca9f62c 100644
--- a/bugbug/models/devdocneeded.py
+++ b/bugbug/models/devdocneeded.py
@@ -59,6 +59,12 @@ def __init__(self, lemmatization=False):
                         commit_data=True,
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -69,12 +75,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def rollback(self, change):
         return change["field_name"] == "keywords" and any(
             keyword in change["added"]
@@ -121,4 +128,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/fixtime.py b/bugbug/models/fixtime.py
index 89919f8678..2c3d684dd3 100644
--- a/bugbug/models/fixtime.py
+++ b/bugbug/models/fixtime.py
@@ -54,6 +54,12 @@ def __init__(self, lemmatization=False):
                         feature_extractors, cleanup_functions, rollback=True
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -68,12 +74,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         bug_fix_times = []
 
@@ -118,4 +125,4 @@ def _quantiles(n):
         return classes, list(range(len(quantiles) + 1))
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/invalid_compatibility_report.py b/bugbug/models/invalid_compatibility_report.py
index b4460fae2e..859991e740 100644
--- a/bugbug/models/invalid_compatibility_report.py
+++ b/bugbug/models/invalid_compatibility_report.py
@@ -35,6 +35,12 @@ def __init__(self, lemmatization=False):
                         feature_extractors, cleanup_functions, rollback=False
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -47,12 +53,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def items_gen(self, classes):
         # Do cleanup separately from extraction pipeline to
         # make sure it's not applied during classification due to differences
@@ -103,4 +110,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/needsdiagnosis.py b/bugbug/models/needsdiagnosis.py
index b41da12672..3e2699ed83 100644
--- a/bugbug/models/needsdiagnosis.py
+++ b/bugbug/models/needsdiagnosis.py
@@ -39,6 +39,12 @@ def __init__(self, lemmatization=False):
                         feature_extractors, cleanup_functions, rollback=True
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -52,12 +58,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -92,4 +99,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/qaneeded.py b/bugbug/models/qaneeded.py
index 5643b24882..171647feff 100644
--- a/bugbug/models/qaneeded.py
+++ b/bugbug/models/qaneeded.py
@@ -51,6 +51,12 @@ def __init__(self, lemmatization=False):
                         rollback_when=self.rollback,
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -61,12 +67,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def rollback(self, change):
         return any(
             change["added"].startswith(prefix)
@@ -109,4 +116,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/rcatype.py b/bugbug/models/rcatype.py
index 202f38835c..dd9ce6b60d 100644
--- a/bugbug/models/rcatype.py
+++ b/bugbug/models/rcatype.py
@@ -101,6 +101,12 @@ def __init__(
                     "bug_extractor",
                     bug_features.BugExtractor(feature_extractors, cleanup_functions),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -120,12 +126,13 @@ def __init__(
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter)),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter))
-
     # return rca from a whiteboard string
     def get_rca_from_whiteboard(self, whiteboard_data):
         rca = []
@@ -161,7 +168,7 @@ def get_labels(self):
         return classes, self.RCA_LIST
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def overwrite_classes(self, bugs, classes, probabilities):
         rca_values = self.get_rca(bugs)
diff --git a/bugbug/models/regressionrange.py b/bugbug/models/regressionrange.py
index 58a2af5169..62f4868f3b 100644
--- a/bugbug/models/regressionrange.py
+++ b/bugbug/models/regressionrange.py
@@ -50,6 +50,12 @@ def __init__(self, lemmatization=False):
                     "bug_extractor",
                     bug_features.BugExtractor(feature_extractors, cleanup_functions),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -60,12 +66,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -93,4 +100,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/regressor.py b/bugbug/models/regressor.py
index 01adbc2212..af43532033 100644
--- a/bugbug/models/regressor.py
+++ b/bugbug/models/regressor.py
@@ -126,15 +126,20 @@ def __init__(
                         feature_extractors, cleanup_functions
                     ),
                 ),
-                ("union", ColumnTransformer(column_transformers)),
             ]
         )
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
+        estimator = xgboost.XGBClassifier(**self.hyperparameter)
         if calibration:
-            self.clf = IsotonicRegressionCalibrator(self.clf)
+            estimator = IsotonicRegressionCalibrator(estimator)
             # This is a temporary workaround for the error : "Model type not yet supported by TreeExplainer"
             self.calculate_importance = False
+        self.clf = Pipeline(
+            [
+                ("union", ColumnTransformer(column_transformers)),
+                ("estimator", estimator),
+            ]
+        )
 
     def get_labels(self):
         classes = {}
@@ -365,7 +370,7 @@ def evaluation(self) -> None:
                 )
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def overwrite_classes(self, commits, classes, probabilities):
         for i, commit in enumerate(commits):
diff --git a/bugbug/models/spambug.py b/bugbug/models/spambug.py
index 332f1e13a1..bde977843e 100644
--- a/bugbug/models/spambug.py
+++ b/bugbug/models/spambug.py
@@ -63,6 +63,12 @@ def __init__(self, lemmatization=False):
                         feature_extractors, cleanup_functions, rollback=True
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -77,12 +83,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -131,7 +138,7 @@ def items_gen(self, classes):
         )
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def overwrite_classes(self, bugs, classes, probabilities):
         for i, bug in enumerate(bugs):
diff --git a/bugbug/models/stepstoreproduce.py b/bugbug/models/stepstoreproduce.py
index 76947527ad..43724ba8ae 100644
--- a/bugbug/models/stepstoreproduce.py
+++ b/bugbug/models/stepstoreproduce.py
@@ -50,6 +50,12 @@ def __init__(self, lemmatization=False):
                     "bug_extractor",
                     bug_features.BugExtractor(feature_extractors, cleanup_functions),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -60,12 +66,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def get_labels(self):
         classes = {}
 
@@ -106,4 +113,4 @@ def overwrite_classes(self, bugs, classes, probabilities):
         return classes
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/testfailure.py b/bugbug/models/testfailure.py
index 04e5af4cda..eaf769c2b5 100644
--- a/bugbug/models/testfailure.py
+++ b/bugbug/models/testfailure.py
@@ -59,12 +59,19 @@ def __init__(self, lemmatization=False):
                     "commit_extractor",
                     commit_features.CommitExtractor(feature_extractors, []),
                 ),
-                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
             ]
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
+        self.clf = Pipeline(
+            [
+                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
+            ]
+        )
 
     def items_gen(self, classes):
         commit_map = {}
@@ -110,4 +117,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/bugbug/models/testselect.py b/bugbug/models/testselect.py
index 684b1b495e..a6f2780c45 100644
--- a/bugbug/models/testselect.py
+++ b/bugbug/models/testselect.py
@@ -448,12 +448,19 @@ def __init__(self, lemmatization=False, granularity="label", failures_skip=None)
                     "commit_extractor",
                     commit_features.CommitExtractor(feature_extractors, []),
                 ),
-                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
             ]
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
+        self.clf = Pipeline(
+            [
+                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
+            ]
+        )
 
     def get_pushes(
         self, apply_filters: bool = False
@@ -859,7 +866,7 @@ def do_eval(
                     do_eval(executor, confidence_threshold, reduction, cap, minimum)
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
 
 class TestLabelSelectModel(TestSelectModel):
diff --git a/bugbug/models/tracking.py b/bugbug/models/tracking.py
index 417da45497..d07a3661e6 100644
--- a/bugbug/models/tracking.py
+++ b/bugbug/models/tracking.py
@@ -67,6 +67,12 @@ def __init__(self, lemmatization=False):
                         rollback_when=self.rollback,
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -81,12 +87,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def rollback(self, change):
         return change["field_name"].startswith("cf_tracking_firefox")
 
@@ -132,7 +139,7 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()
 
     def overwrite_classes(self, bugs, classes, probabilities):
         for i, bug in enumerate(bugs):
diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py
index 27bba853be..c374abf778 100644
--- a/bugbug/models/uplift.py
+++ b/bugbug/models/uplift.py
@@ -51,6 +51,12 @@ def __init__(self, lemmatization=False):
                         rollback_when=self.rollback,
                     ),
                 ),
+            ]
+        )
+
+        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
+        self.clf = Pipeline(
+            [
                 (
                     "union",
                     ColumnTransformer(
@@ -61,12 +67,13 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(**self.hyperparameter),
+                ),
             ]
         )
 
-        self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = xgboost.XGBClassifier(**self.hyperparameter)
-
     def rollback(self, change):
         return (
             change["field_name"] == "flagtypes.name"
@@ -95,4 +102,4 @@ def get_labels(self):
         return classes, [0, 1]
 
     def get_feature_names(self):
-        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
+        return self.clf.named_steps["union"].get_feature_names_out()

From 5af9a8ad2ade28e7772eaca2ba5118ab57c87b3b Mon Sep 17 00:00:00 2001
From: Suhaib Mujahid <smujahid@mozilla.com>
Date: Fri, 1 Dec 2023 02:19:06 -0500
Subject: [PATCH 2/6] Integrate the sampler as a step in the clf pipeline

---
 bugbug/model.py                   | 22 +++-------------------
 bugbug/models/annotate_ignore.py  |  6 +++---
 bugbug/models/backout.py          |  6 +++---
 bugbug/models/defect.py           |  6 +++---
 bugbug/models/devdocneeded.py     |  6 +++---
 bugbug/models/qaneeded.py         |  6 +++---
 bugbug/models/regressionrange.py  |  6 +++---
 bugbug/models/regressor.py        |  5 +++--
 bugbug/models/spambug.py          |  5 +++--
 bugbug/models/stepstoreproduce.py |  6 +++---
 bugbug/models/testfailure.py      |  6 +++---
 bugbug/models/testselect.py       |  6 +++---
 bugbug/models/tracking.py         |  6 +++---
 bugbug/models/uplift.py           |  6 +++---
 14 files changed, 42 insertions(+), 56 deletions(-)

diff --git a/bugbug/model.py b/bugbug/model.py
index 22097af553..9a0bd83077 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -18,7 +18,6 @@
     make_index_balanced_accuracy,
     specificity_score,
 )
-from imblearn.pipeline import make_pipeline
 from sklearn import metrics
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import precision_recall_fscore_support
@@ -148,7 +147,6 @@ def __init__(self, lemmatization=False):
             self.text_vectorizer = TfidfVectorizer
 
         self.cross_validation_enabled = True
-        self.sampler = None
 
         self.calculate_importance = True
 
@@ -365,10 +363,6 @@ def train(self, importance_cutoff=0.15, limit=None):
 
         # Split dataset in training and test.
         X_train, X_test, y_train, y_test = self.train_test_split(X, y)
-        if self.sampler is not None:
-            pipeline = make_pipeline(self.sampler, self.clf)
-        else:
-            pipeline = self.clf
 
         tracking_metrics = {}
 
@@ -379,7 +373,7 @@ def train(self, importance_cutoff=0.15, limit=None):
                 scorings += ["precision", "recall"]
 
             scores = cross_validate(
-                pipeline, X_train, self.le.transform(y_train), scoring=scorings, cv=5
+                self.clf, X_train, self.le.transform(y_train), scoring=scorings, cv=5
             )
 
             logger.info("Cross Validation scores:")
@@ -394,13 +388,6 @@ def train(self, importance_cutoff=0.15, limit=None):
                 )
 
         logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
-
-        # Training on the resampled dataset if sampler is provided.
-        if self.sampler is not None:
-            X_train, y_train = self.sampler.fit_resample(X_train, y_train)
-
-            logger.info(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}")
-
         logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
 
         self.clf.fit(X_train, self.le.transform(y_train))
@@ -558,11 +545,8 @@ def train(self, importance_cutoff=0.15, limit=None):
         if self.entire_dataset_training:
             logger.info("Retraining on the entire dataset...")
 
-            if self.sampler is not None:
-                X_train, y_train = self.sampler.fit_resample(X, y)
-            else:
-                X_train = X
-                y_train = y
+            X_train = X
+            y_train = y
 
             logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
 
diff --git a/bugbug/models/annotate_ignore.py b/bugbug/models/annotate_ignore.py
index 10818ba845..307c64b770 100644
--- a/bugbug/models/annotate_ignore.py
+++ b/bugbug/models/annotate_ignore.py
@@ -6,6 +6,7 @@
 import logging
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -26,8 +27,6 @@ def __init__(self, lemmatization: bool = False) -> None:
 
         self.training_dbs += [bugzilla.BUGS_DB]
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             commit_features.SourceCodeFileSize(),
             commit_features.OtherFileSize(),
@@ -71,7 +70,7 @@ def __init__(self, lemmatization: bool = False) -> None:
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -82,6 +81,7 @@ def __init__(self, lemmatization: bool = False) -> None:
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/backout.py b/bugbug/models/backout.py
index 6ad9255305..9cd3dbbb99 100644
--- a/bugbug/models/backout.py
+++ b/bugbug/models/backout.py
@@ -9,6 +9,7 @@
 import dateutil.parser
 import xgboost
 from dateutil.relativedelta import relativedelta
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -27,8 +28,6 @@ def __init__(self, lemmatization=False, bug_data=False):
 
         self.calculate_importance = False
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             commit_features.SourceCodeFilesModifiedNum(),
             commit_features.OtherFilesModifiedNum(),
@@ -74,7 +73,7 @@ def __init__(self, lemmatization=False, bug_data=False):
             feature_cleanup.synonyms(),
         ]
 
-        self.extraction_pipeline = Pipeline(
+        self.extraction_pipeline = ImblearnPipeline(  # NOTE(review): sibling models switch self.clf here, not extraction_pipeline — confirm the "sampler" step ends up in an imblearn Pipeline
             [
                 (
                     "commit_extractor",
@@ -97,6 +96,7 @@ def __init__(self, lemmatization=False, bug_data=False):
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/defect.py b/bugbug/models/defect.py
index 3896b2d1ea..f80ed65246 100644
--- a/bugbug/models/defect.py
+++ b/bugbug/models/defect.py
@@ -9,6 +9,7 @@
 
 import xgboost
 from imblearn.over_sampling import BorderlineSMOTE
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline
@@ -24,8 +25,6 @@ class DefectModel(BugModel):
     def __init__(self, lemmatization=False, historical=False):
         BugModel.__init__(self, lemmatization)
 
-        self.sampler = BorderlineSMOTE(random_state=0)
-
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.Severity(),
@@ -68,7 +67,7 @@ def __init__(self, lemmatization=False, historical=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -89,6 +88,7 @@ def __init__(self, lemmatization=False, historical=False):
                         ]
                     ),
                 ),
+                ("sampler", BorderlineSMOTE(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/devdocneeded.py b/bugbug/models/devdocneeded.py
index bceca9f62c..0c2a313757 100644
--- a/bugbug/models/devdocneeded.py
+++ b/bugbug/models/devdocneeded.py
@@ -4,6 +4,7 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -19,8 +20,6 @@ def __init__(self, lemmatization=False):
 
         self.cross_validation_enabled = False
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.HasRegressionRange(),
@@ -63,7 +62,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -75,6 +74,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/qaneeded.py b/bugbug/models/qaneeded.py
index 171647feff..1c82cd6366 100644
--- a/bugbug/models/qaneeded.py
+++ b/bugbug/models/qaneeded.py
@@ -4,6 +4,7 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -17,8 +18,6 @@ class QANeededModel(BugModel):
     def __init__(self, lemmatization=False):
         BugModel.__init__(self, lemmatization)
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.HasRegressionRange(),
@@ -55,7 +54,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -67,6 +66,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/regressionrange.py b/bugbug/models/regressionrange.py
index 62f4868f3b..67a760db35 100644
--- a/bugbug/models/regressionrange.py
+++ b/bugbug/models/regressionrange.py
@@ -6,6 +6,7 @@
 import logging
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -22,8 +23,6 @@ class RegressionRangeModel(BugModel):
     def __init__(self, lemmatization=False):
         BugModel.__init__(self, lemmatization)
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.Severity(),
@@ -54,7 +53,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -66,6 +65,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/regressor.py b/bugbug/models/regressor.py
index af43532033..4149793ab5 100644
--- a/bugbug/models/regressor.py
+++ b/bugbug/models/regressor.py
@@ -11,6 +11,7 @@
 import numpy as np
 import xgboost
 from dateutil.relativedelta import relativedelta
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -66,7 +67,6 @@ def __init__(
             self.training_dbs.append(BUG_FIXING_COMMITS_DB)
 
         self.store_dataset = True
-        self.sampler = RandomUnderSampler(random_state=0)
 
         self.use_finder = use_finder
         self.exclude_finder = exclude_finder
@@ -134,9 +134,10 @@ def __init__(
             estimator = IsotonicRegressionCalibrator(estimator)
             # This is a temporary workaround for the error : "Model type not yet supported by TreeExplainer"
             self.calculate_importance = False
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 ("union", ColumnTransformer(column_transformers)),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 ("estimator", estimator),
             ]
         )
diff --git a/bugbug/models/spambug.py b/bugbug/models/spambug.py
index bde977843e..63246121d4 100644
--- a/bugbug/models/spambug.py
+++ b/bugbug/models/spambug.py
@@ -7,6 +7,7 @@
 
 import xgboost
 from imblearn.over_sampling import BorderlineSMOTE
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline
@@ -22,7 +23,6 @@ class SpamBugModel(BugModel):
     def __init__(self, lemmatization=False):
         BugModel.__init__(self, lemmatization)
 
-        self.sampler = BorderlineSMOTE(random_state=0)
         self.calculate_importance = False
 
         feature_extractors = [
@@ -67,7 +67,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -83,6 +83,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", BorderlineSMOTE(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/stepstoreproduce.py b/bugbug/models/stepstoreproduce.py
index 43724ba8ae..e6a6350dee 100644
--- a/bugbug/models/stepstoreproduce.py
+++ b/bugbug/models/stepstoreproduce.py
@@ -6,6 +6,7 @@
 import logging
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -22,8 +23,6 @@ class StepsToReproduceModel(BugModel):
     def __init__(self, lemmatization=False):
         BugModel.__init__(self, lemmatization)
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             bug_features.HasRegressionRange(),
             bug_features.Severity(),
@@ -54,7 +53,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -66,6 +65,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/testfailure.py b/bugbug/models/testfailure.py
index eaf769c2b5..eab436818e 100644
--- a/bugbug/models/testfailure.py
+++ b/bugbug/models/testfailure.py
@@ -6,6 +6,7 @@
 import logging
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -24,8 +25,6 @@ def __init__(self, lemmatization=False):
 
         self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             commit_features.SourceCodeFileSize(),
             commit_features.OtherFileSize(),
@@ -63,9 +62,10 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/testselect.py b/bugbug/models/testselect.py
index a6f2780c45..a70a01a20c 100644
--- a/bugbug/models/testselect.py
+++ b/bugbug/models/testselect.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from ortools.linear_solver import pywraplp
 from sklearn.compose import ColumnTransformer
@@ -423,8 +424,6 @@ def __init__(self, lemmatization=False, granularity="label", failures_skip=None)
 
         self.entire_dataset_training = True
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             test_scheduling_features.PrevFailures(),
         ]
@@ -452,9 +451,10 @@ def __init__(self, lemmatization=False, granularity="label", failures_skip=None)
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/tracking.py b/bugbug/models/tracking.py
index d07a3661e6..d54191a58e 100644
--- a/bugbug/models/tracking.py
+++ b/bugbug/models/tracking.py
@@ -4,6 +4,7 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import InstanceHardnessThreshold
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -19,8 +20,6 @@ def __init__(self, lemmatization=False):
 
         self.calculate_importance = False
 
-        self.sampler = InstanceHardnessThreshold(random_state=0)
-
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.HasRegressionRange(),
@@ -71,7 +70,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -87,6 +86,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", InstanceHardnessThreshold(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),
diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py
index c374abf778..e3fd90d0f3 100644
--- a/bugbug/models/uplift.py
+++ b/bugbug/models/uplift.py
@@ -4,6 +4,7 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
@@ -17,8 +18,6 @@ class UpliftModel(BugModel):
     def __init__(self, lemmatization=False):
         BugModel.__init__(self, lemmatization)
 
-        self.sampler = RandomUnderSampler(random_state=0)
-
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.HasRegressionRange(),
@@ -55,7 +54,7 @@ def __init__(self, lemmatization=False):
         )
 
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
-        self.clf = Pipeline(
+        self.clf = ImblearnPipeline(
             [
                 (
                     "union",
@@ -67,6 +66,7 @@ def __init__(self, lemmatization=False):
                         ]
                     ),
                 ),
+                ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
                     xgboost.XGBClassifier(**self.hyperparameter),

From b2a4a8e7bc3f5f9141c777edee8a3368a0317ad0 Mon Sep 17 00:00:00 2001
From: Suhaib Mujahid <smujahid@mozilla.com>
Date: Fri, 1 Dec 2023 21:25:50 -0500
Subject: [PATCH 3/6] Fit the files feature in the clf pipeline

---
 bugbug/commit_features.py        | 40 +++++++++++---------------------
 bugbug/model.py                  |  2 ++
 bugbug/models/annotate_ignore.py | 11 +++++++++
 bugbug/models/backout.py         | 10 ++++++++
 bugbug/models/regressor.py       | 14 ++++++++++-
 bugbug/models/testfailure.py     | 19 ++++++++++++++-
 bugbug/utils.py                  |  5 ++++
 7 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/bugbug/commit_features.py b/bugbug/commit_features.py
index 12614f67c4..4d7f3b0bda 100644
--- a/bugbug/commit_features.py
+++ b/bugbug/commit_features.py
@@ -3,7 +3,6 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 import sys
-from collections import defaultdict
 from typing import Sequence
 
 import pandas as pd
@@ -808,34 +807,14 @@ def __call__(self, commit, **kwargs):
 
 
 class Files(object):
-    def __init__(self, min_freq=0.0014):
-        self.min_freq = min_freq
+    name = "files"
 
-    def fit(self, commits):
-        self.count = defaultdict(int)
-
-        self.total_commits = 0
-
-        for commit in commits:
-            self.total_commits += 1
-
-            for f in commit["files"]:
-                self.count[f] += 1
-
-        # We no longer need to store counts for files which have low frequency.
-        to_del = set(
-            f for f, c in self.count.items() if c / self.total_commits < self.min_freq
-        )
+    def __call__(self, commit, **kwargs):
+        return commit["files"]
 
-        for f in to_del:
-            del self.count[f]
 
-    def __call__(self, commit, **kwargs):
-        return [
-            f
-            for f in commit["files"]
-            if (self.count[f] / self.total_commits) > self.min_freq
-        ]
+def _pass_through_tokenizer(doc):  # identity: returns `doc` unchanged
+    return doc  # NOTE(review): no caller visible in this patch (vectorizers use utils.keep_as_is) — confirm still needed
 
 
 class FileTouchedPrev(object):
@@ -1008,6 +987,7 @@ def transform(self, commits):
 
         for commit in commits():
             data = {}
+            result = {"data": data}
 
             for feature_extractor in self.feature_extractors:
                 if "bug_features" in feature_extractor.__module__:
@@ -1028,6 +1008,13 @@ def transform(self, commits):
                 else:
                     feature_extractor_name = feature_extractor.__class__.__name__
 
+                # FIXME: This is a workaround to pass the value to the
+                # union transformer independently. This will be dropped when we
+                # resolve https://github.com/mozilla/bugbug/issues/3876
+                if isinstance(feature_extractor, Files):
+                    result[sys.intern(feature_extractor_name)] = res
+                    continue
+
                 if isinstance(res, dict):
                     for key, value in res.items():
                         data[sys.intern(key)] = value
@@ -1040,7 +1027,6 @@ def transform(self, commits):
 
                 data[sys.intern(feature_extractor_name)] = res
 
-            result = {"data": data}
             if "desc" in commit:
                 for cleanup_function in self.cleanup_functions:
                     result["desc"] = cleanup_function(commit["desc"])
diff --git a/bugbug/model.py b/bugbug/model.py
index 9a0bd83077..be7a7f6857 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -194,6 +194,8 @@ def get_human_readable_feature_names(self):
                 feature_name = f"Comments contain '{feature_name}'"
             elif type_ == "text":
                 feature_name = f"Combined text contains '{feature_name}'"
+            elif type_ == "files":
+                feature_name = f"File '{feature_name}'"
             elif type_ not in ("data", "couple_data"):
                 raise ValueError(f"Unexpected feature type for: {full_feature_name}")
 
diff --git a/bugbug/models/annotate_ignore.py b/bugbug/models/annotate_ignore.py
index 307c64b770..bccb418662 100644
--- a/bugbug/models/annotate_ignore.py
+++ b/bugbug/models/annotate_ignore.py
@@ -10,6 +10,7 @@
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.pipeline import Pipeline
 
 from bugbug import bugzilla, commit_features, feature_cleanup, labels, repository, utils
@@ -24,6 +25,7 @@ def __init__(self, lemmatization: bool = False) -> None:
         CommitModel.__init__(self, lemmatization)
 
         self.calculate_importance = False
+        self.cross_validation_enabled = False
 
         self.training_dbs += [bugzilla.BUGS_DB]
 
@@ -78,6 +80,15 @@ def __init__(self, lemmatization: bool = False) -> None:
                         [
                             ("data", DictVectorizer(), "data"),
                             ("desc", self.text_vectorizer(min_df=0.0001), "desc"),
+                            (
+                                "files",
+                                CountVectorizer(
+                                    analyzer=utils.keep_as_is,
+                                    lowercase=False,
+                                    min_df=0.0014,
+                                ),
+                                "files",
+                            ),
                         ]
                     ),
                 ),
diff --git a/bugbug/models/backout.py b/bugbug/models/backout.py
index 9cd3dbbb99..8874ee98c0 100644
--- a/bugbug/models/backout.py
+++ b/bugbug/models/backout.py
@@ -13,6 +13,7 @@
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.pipeline import Pipeline
 
 from bugbug import bug_features, commit_features, feature_cleanup, repository, utils
@@ -93,6 +94,15 @@ def __init__(self, lemmatization=False, bug_data=False):
                         [
                             ("data", DictVectorizer(), "data"),
                             ("desc", self.text_vectorizer(), "desc"),
+                            (
+                                "files",
+                                CountVectorizer(
+                                    analyzer=utils.keep_as_is,
+                                    lowercase=False,
+                                    min_df=0.0014,
+                                ),
+                                "files",
+                            ),
                         ]
                     ),
                 ),
diff --git a/bugbug/models/regressor.py b/bugbug/models/regressor.py
index 4149793ab5..77a7d3a978 100644
--- a/bugbug/models/regressor.py
+++ b/bugbug/models/regressor.py
@@ -15,6 +15,7 @@
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.pipeline import Pipeline
 
 from bugbug import bugzilla, commit_features, db, feature_cleanup, repository, utils
@@ -111,7 +112,18 @@ def __init__(
             feature_cleanup.synonyms(),
         ]
 
-        column_transformers = [("data", DictVectorizer(), "data")]
+        column_transformers = [
+            ("data", DictVectorizer(), "data"),
+            (
+                "files",
+                CountVectorizer(
+                    analyzer=utils.keep_as_is,
+                    lowercase=False,
+                    min_df=0.0014,
+                ),
+                "files",
+            ),
+        ]
 
         if not interpretable:
             column_transformers.append(
diff --git a/bugbug/models/testfailure.py b/bugbug/models/testfailure.py
index eab436818e..5e88f61661 100644
--- a/bugbug/models/testfailure.py
+++ b/bugbug/models/testfailure.py
@@ -10,6 +10,7 @@
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.pipeline import Pipeline
 
 from bugbug import commit_features, repository, test_scheduling, utils
@@ -64,7 +65,23 @@ def __init__(self, lemmatization=False):
         self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
         self.clf = ImblearnPipeline(
             [
-                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
+                (
+                    "union",
+                    ColumnTransformer(
+                        [
+                            ("data", DictVectorizer(), "data"),
+                            (
+                                "files",
+                                CountVectorizer(
+                                    analyzer=utils.keep_as_is,
+                                    lowercase=False,
+                                    min_df=0.0014,
+                                ),
+                                "files",
+                            ),
+                        ]
+                    ),
+                ),
                 ("sampler", RandomUnderSampler(random_state=0)),
                 (
                     "estimator",
diff --git a/bugbug/utils.py b/bugbug/utils.py
index b576c92858..2a737023fc 100644
--- a/bugbug/utils.py
+++ b/bugbug/utils.py
@@ -553,3 +553,8 @@ def escape_markdown(text: str) -> str:
         .replace(")", "\\)")
         .replace("|", "\\|")
     )
+
+
+def keep_as_is(x):
+    """Return `x` unchanged; identity analyzer for already-tokenized input."""
+    return x

From d9cff732d71a5a5476edc51ab60b18bdd47fe0e7 Mon Sep 17 00:00:00 2001
From: Suhaib Mujahid <smujahid@mozilla.com>
Date: Fri, 1 Dec 2023 22:08:16 -0500
Subject: [PATCH 4/6] Do not fit when transforming the entire data

---
 bugbug/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bugbug/model.py b/bugbug/model.py
index be7a7f6857..259d5b388d 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -348,7 +348,7 @@ def train(self, importance_cutoff=0.15, limit=None):
         X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))
 
         # Extract features from the items.
-        X = self.extraction_pipeline.fit_transform(X_gen)
+        X = self.extraction_pipeline.transform(X_gen)
 
         # Calculate labels.
         y = np.array(y)

From a09e6caf6364c3ff459c64588e41f558c0f27488 Mon Sep 17 00:00:00 2001
From: Suhaib Mujahid <smujahid@mozilla.com>
Date: Fri, 1 Dec 2023 21:56:37 -0500
Subject: [PATCH 5/6] Support saving and loading xgboost models in the pipeline

---
 bugbug/model.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/bugbug/model.py b/bugbug/model.py
index 259d5b388d..deb41ab707 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -557,13 +557,15 @@ def train(self, importance_cutoff=0.15, limit=None):
         model_directory = self.__class__.__name__.lower()
         makedirs(model_directory, exist_ok=True)
 
-        if issubclass(type(self.clf), XGBModel):
+        step_name, estimator = self.clf.steps.pop()
+        if issubclass(type(estimator), XGBModel):
             xgboost_model_path = path.join(model_directory, "xgboost.ubj")
-            self.clf.save_model(xgboost_model_path)
+            estimator.save_model(xgboost_model_path)
 
-            # Since we save the classifier separately, we need to clear the clf
-            # attribute to prevent it from being pickled with the model object.
-            self.clf = self.clf.__class__(**self.hyperparameter)
+            # Since we save the estimator separately, we need to reset it to
+            # prevent its data from being pickled with the pipeline.
+            estimator = estimator.__class__(**self.hyperparameter)
+        self.clf.steps.append((step_name, estimator))
 
         model_path = path.join(model_directory, "model.pkl")
         with open(model_path, "wb") as f:
@@ -586,7 +588,7 @@ def load(model_directory: str) -> "Model":
 
         xgboost_model_path = path.join(model_directory, "xgboost.ubj")
         if path.exists(xgboost_model_path):
-            model.clf.load_model(xgboost_model_path)
+            model.clf.named_steps["estimator"].load_model(xgboost_model_path)
 
         return model
 

From d739773a8ccf2315455ee26ea7fd8a2947e748b8 Mon Sep 17 00:00:00 2001
From: Suhaib Mujahid <smujahid@mozilla.com>
Date: Sat, 2 Dec 2023 10:01:53 -0500
Subject: [PATCH 6/6] Log the number of features

---
 bugbug/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bugbug/model.py b/bugbug/model.py
index deb41ab707..963d837d7a 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -393,6 +393,7 @@ def train(self, importance_cutoff=0.15, limit=None):
         logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
 
         self.clf.fit(X_train, self.le.transform(y_train))
+        logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)
 
         logger.info("Model trained")