diff --git a/bugbug/commit_features.py b/bugbug/commit_features.py index 12614f67c4..4d7f3b0bda 100644 --- a/bugbug/commit_features.py +++ b/bugbug/commit_features.py @@ -3,7 +3,6 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. import sys -from collections import defaultdict from typing import Sequence import pandas as pd @@ -808,34 +807,14 @@ def __call__(self, commit, **kwargs): class Files(object): - def __init__(self, min_freq=0.0014): - self.min_freq = min_freq + name = "files" - def fit(self, commits): - self.count = defaultdict(int) - - self.total_commits = 0 - - for commit in commits: - self.total_commits += 1 - - for f in commit["files"]: - self.count[f] += 1 - - # We no longer need to store counts for files which have low frequency. - to_del = set( - f for f, c in self.count.items() if c / self.total_commits < self.min_freq - ) + def __call__(self, commit, **kwargs): + return commit["files"] - for f in to_del: - del self.count[f] - def __call__(self, commit, **kwargs): - return [ - f - for f in commit["files"] - if (self.count[f] / self.total_commits) > self.min_freq - ] +def _pass_through_tokenizer(doc): + return doc class FileTouchedPrev(object): @@ -1008,6 +987,7 @@ def transform(self, commits): for commit in commits(): data = {} + result = {"data": data} for feature_extractor in self.feature_extractors: if "bug_features" in feature_extractor.__module__: @@ -1028,6 +1008,13 @@ def transform(self, commits): else: feature_extractor_name = feature_extractor.__class__.__name__ + # FIXME: This is a workaround to pass the value to the + # union transformer independently. 
This will be dropped when we + # resolve https://github.com/mozilla/bugbug/issues/3876 + if isinstance(feature_extractor, Files): + result[sys.intern(feature_extractor_name)] = res + continue + if isinstance(res, dict): for key, value in res.items(): data[sys.intern(key)] = value @@ -1040,7 +1027,6 @@ def transform(self, commits): data[sys.intern(feature_extractor_name)] = res - result = {"data": data} if "desc" in commit: for cleanup_function in self.cleanup_functions: result["desc"] = cleanup_function(commit["desc"]) diff --git a/bugbug/model.py b/bugbug/model.py index 22097af553..963d837d7a 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -18,7 +18,6 @@ make_index_balanced_accuracy, specificity_score, ) -from imblearn.pipeline import make_pipeline from sklearn import metrics from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics import precision_recall_fscore_support @@ -148,7 +147,6 @@ def __init__(self, lemmatization=False): self.text_vectorizer = TfidfVectorizer self.cross_validation_enabled = True - self.sampler = None self.calculate_importance = True @@ -196,6 +194,8 @@ def get_human_readable_feature_names(self): feature_name = f"Comments contain '{feature_name}'" elif type_ == "text": feature_name = f"Combined text contains '{feature_name}'" + elif type_ == "files": + feature_name = f"File '{feature_name}'" elif type_ not in ("data", "couple_data"): raise ValueError(f"Unexpected feature type for: {full_feature_name}") @@ -348,7 +348,7 @@ def train(self, importance_cutoff=0.15, limit=None): X_gen, y = split_tuple_generator(lambda: self.items_gen(classes)) # Extract features from the items. - X = self.extraction_pipeline.fit_transform(X_gen) + X = self.extraction_pipeline.transform(X_gen) # Calculate labels. y = np.array(y) @@ -365,10 +365,6 @@ def train(self, importance_cutoff=0.15, limit=None): # Split dataset in training and test. 
X_train, X_test, y_train, y_test = self.train_test_split(X, y) - if self.sampler is not None: - pipeline = make_pipeline(self.sampler, self.clf) - else: - pipeline = self.clf tracking_metrics = {} @@ -379,7 +375,7 @@ def train(self, importance_cutoff=0.15, limit=None): scorings += ["precision", "recall"] scores = cross_validate( - pipeline, X_train, self.le.transform(y_train), scoring=scorings, cv=5 + self.clf, X_train, self.le.transform(y_train), scoring=scorings, cv=5 ) logger.info("Cross Validation scores:") @@ -394,16 +390,10 @@ def train(self, importance_cutoff=0.15, limit=None): ) logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}") - - # Training on the resampled dataset if sampler is provided. - if self.sampler is not None: - X_train, y_train = self.sampler.fit_resample(X_train, y_train) - - logger.info(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}") - logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}") self.clf.fit(X_train, self.le.transform(y_train)) + logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_) logger.info("Model trained") @@ -558,11 +548,8 @@ def train(self, importance_cutoff=0.15, limit=None): if self.entire_dataset_training: logger.info("Retraining on the entire dataset...") - if self.sampler is not None: - X_train, y_train = self.sampler.fit_resample(X, y) - else: - X_train = X - y_train = y + X_train = X + y_train = y logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}") @@ -571,13 +558,15 @@ def train(self, importance_cutoff=0.15, limit=None): model_directory = self.__class__.__name__.lower() makedirs(model_directory, exist_ok=True) - if issubclass(type(self.clf), XGBModel): + step_name, estimator = self.clf.steps.pop() + if issubclass(type(estimator), XGBModel): xgboost_model_path = path.join(model_directory, "xgboost.ubj") - self.clf.save_model(xgboost_model_path) + estimator.save_model(xgboost_model_path) - # Since we save the classifier separately, we need to 
clear the clf - # attribute to prevent it from being pickled with the model object. - self.clf = self.clf.__class__(**self.hyperparameter) + # Since we save the estimator separately, we need to reset it to + # prevent its data from being pickled with the pipeline. + estimator = estimator.__class__(**self.hyperparameter) + self.clf.steps.append((step_name, estimator)) model_path = path.join(model_directory, "model.pkl") with open(model_path, "wb") as f: @@ -600,7 +589,7 @@ def load(model_directory: str) -> "Model": xgboost_model_path = path.join(model_directory, "xgboost.ubj") if path.exists(xgboost_model_path): - model.clf.load_model(xgboost_model_path) + model.clf.named_steps["estimator"].load_model(xgboost_model_path) return model diff --git a/bugbug/models/annotate_ignore.py b/bugbug/models/annotate_ignore.py index 5df3e3658c..bccb418662 100644 --- a/bugbug/models/annotate_ignore.py +++ b/bugbug/models/annotate_ignore.py @@ -6,9 +6,11 @@ import logging import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import CountVectorizer from sklearn.pipeline import Pipeline from bugbug import bugzilla, commit_features, feature_cleanup, labels, repository, utils @@ -23,11 +25,10 @@ def __init__(self, lemmatization: bool = False) -> None: CommitModel.__init__(self, lemmatization) self.calculate_importance = False + self.cross_validation_enabled = False self.training_dbs += [bugzilla.BUGS_DB] - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ commit_features.SourceCodeFileSize(), commit_features.OtherFileSize(), @@ -67,21 +68,38 @@ def __init__(self, lemmatization: bool = False) -> None: feature_extractors, cleanup_functions ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + 
[ ( "union", ColumnTransformer( [ ("data", DictVectorizer(), "data"), ("desc", self.text_vectorizer(min_df=0.0001), "desc"), + ( + "files", + CountVectorizer( + analyzer=utils.keep_as_is, + lowercase=False, + min_df=0.0014, + ), + "files", + ), ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -123,4 +141,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/assignee.py b/bugbug/models/assignee.py index 07ec2bedc4..4ec9f7b36f 100644 --- a/bugbug/models/assignee.py +++ b/bugbug/models/assignee.py @@ -67,6 +67,12 @@ def __init__(self, lemmatization=False): rollback_when=self.rollback, ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -81,12 +87,13 @@ def __init__(self, lemmatization=False): ] ), ), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -117,7 +124,7 @@ def get_labels(self): return classes, set(classes.values()) def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def rollback(self, change): return change["field_name"].startswith("assigned_to") diff --git a/bugbug/models/backout.py b/bugbug/models/backout.py index e6100d9676..8874ee98c0 100644 --- a/bugbug/models/backout.py +++ b/bugbug/models/backout.py @@ -9,9 +9,11 @@ 
import dateutil.parser import xgboost from dateutil.relativedelta import relativedelta +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import CountVectorizer from sklearn.pipeline import Pipeline from bugbug import bug_features, commit_features, feature_cleanup, repository, utils @@ -27,8 +29,6 @@ def __init__(self, lemmatization=False, bug_data=False): self.calculate_importance = False - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ commit_features.SourceCodeFilesModifiedNum(), commit_features.OtherFilesModifiedNum(), @@ -74,7 +74,7 @@ def __init__(self, lemmatization=False, bug_data=False): feature_cleanup.synonyms(), ] - self.extraction_pipeline = Pipeline( + self.extraction_pipeline = ImblearnPipeline( [ ( "commit_extractor", @@ -82,21 +82,38 @@ def __init__(self, lemmatization=False, bug_data=False): feature_extractors, cleanup_functions ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( [ ("data", DictVectorizer(), "data"), ("desc", self.text_vectorizer(), "desc"), + ( + "files", + CountVectorizer( + analyzer=utils.keep_as_is, + lowercase=False, + min_df=0.0014, + ), + "files", + ), ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -123,4 +140,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git
a/bugbug/models/browsername.py b/bugbug/models/browsername.py index 4d781e86a2..35d83adea4 100644 --- a/bugbug/models/browsername.py +++ b/bugbug/models/browsername.py @@ -38,6 +38,12 @@ def __init__(self, lemmatization=False): feature_extractors, cleanup_functions ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -52,12 +58,13 @@ def __init__(self, lemmatization=False): ] ), ), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -81,4 +88,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/bugtype.py b/bugbug/models/bugtype.py index 70a6111d1f..9e2ea95259 100644 --- a/bugbug/models/bugtype.py +++ b/bugbug/models/bugtype.py @@ -128,6 +128,12 @@ def __init__(self, lemmatization=False, historical=False): "bug_extractor", bug_features.BugExtractor(feature_extractors, cleanup_functions), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -147,12 +153,13 @@ def __init__(self, lemmatization=False, historical=False): ] ), ), + ( + "estimator", + OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter)), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter)) - def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]: classes = {} @@ -175,7 +182,7 @@ def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]: return classes, TYPE_LIST def get_feature_names(self): - return 
self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def overwrite_classes( self, diff --git a/bugbug/models/component.py b/bugbug/models/component.py index b1a8ada587..02aa953752 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -97,6 +97,12 @@ def __init__(self, lemmatization=False): feature_extractors, cleanup_functions, rollback=True ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -111,12 +117,13 @@ def __init__(self, lemmatization=False): ] ), ), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - self.CONFLATED_COMPONENTS_INVERSE_MAPPING = { v: k for k, v in self.CONFLATED_COMPONENTS_MAPPING.items() } @@ -231,7 +238,7 @@ def get_meaningful_product_components(self, full_comp_tuples, threshold_ratio=10 ) def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def check(self): success = super().check() diff --git a/bugbug/models/defect.py b/bugbug/models/defect.py index 2cd5aef90f..f80ed65246 100644 --- a/bugbug/models/defect.py +++ b/bugbug/models/defect.py @@ -9,6 +9,7 @@ import xgboost from imblearn.over_sampling import BorderlineSMOTE +from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -24,8 +25,6 @@ class DefectModel(BugModel): def __init__(self, lemmatization=False, historical=False): BugModel.__init__(self, lemmatization) - self.sampler = BorderlineSMOTE(random_state=0) - feature_extractors = [ bug_features.HasSTR(), bug_features.Severity(), @@ -64,6 
+63,12 @@ def __init__(self, lemmatization=False, historical=False): "bug_extractor", bug_features.BugExtractor(feature_extractors, cleanup_functions), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -83,12 +88,14 @@ def __init__(self, lemmatization=False, historical=False): ] ), ), + ("sampler", BorderlineSMOTE(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_bugbug_labels(self, kind="bug") -> dict[int, Any]: assert kind in ["bug", "regression", "defect_enhancement_task"] @@ -264,7 +271,7 @@ def get_labels(self) -> tuple[dict[int, Any], list[Any]]: return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def overwrite_classes(self, bugs, classes, probabilities): for i, bug in enumerate(bugs): diff --git a/bugbug/models/devdocneeded.py b/bugbug/models/devdocneeded.py index 400d934600..0c2a313757 100644 --- a/bugbug/models/devdocneeded.py +++ b/bugbug/models/devdocneeded.py @@ -4,6 +4,7 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. 
import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -19,8 +20,6 @@ def __init__(self, lemmatization=False): self.cross_validation_enabled = False - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ bug_features.HasSTR(), bug_features.HasRegressionRange(), @@ -59,6 +58,12 @@ def __init__(self, lemmatization=False): commit_data=True, ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -69,12 +74,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def rollback(self, change): return change["field_name"] == "keywords" and any( keyword in change["added"] @@ -121,4 +128,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/fixtime.py b/bugbug/models/fixtime.py index 89919f8678..2c3d684dd3 100644 --- a/bugbug/models/fixtime.py +++ b/bugbug/models/fixtime.py @@ -54,6 +54,12 @@ def __init__(self, lemmatization=False): feature_extractors, cleanup_functions, rollback=True ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -68,12 +74,13 @@ def __init__(self, lemmatization=False): ] ), ), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = 
xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): bug_fix_times = [] @@ -118,4 +125,4 @@ def _quantiles(n): return classes, list(range(len(quantiles) + 1)) def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/invalid_compatibility_report.py b/bugbug/models/invalid_compatibility_report.py index b4460fae2e..859991e740 100644 --- a/bugbug/models/invalid_compatibility_report.py +++ b/bugbug/models/invalid_compatibility_report.py @@ -35,6 +35,12 @@ def __init__(self, lemmatization=False): feature_extractors, cleanup_functions, rollback=False ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -47,12 +53,13 @@ def __init__(self, lemmatization=False): ] ), ), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def items_gen(self, classes): # Do cleanup separately from extraction pipeline to # make sure it's not applied during classification due to differences @@ -103,4 +110,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/needsdiagnosis.py b/bugbug/models/needsdiagnosis.py index b41da12672..3e2699ed83 100644 --- a/bugbug/models/needsdiagnosis.py +++ b/bugbug/models/needsdiagnosis.py @@ -39,6 +39,12 @@ def __init__(self, lemmatization=False): feature_extractors, cleanup_functions, rollback=True ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -52,12 +58,13 @@ def __init__(self, 
lemmatization=False): ] ), ), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -92,4 +99,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/qaneeded.py b/bugbug/models/qaneeded.py index 5643b24882..1c82cd6366 100644 --- a/bugbug/models/qaneeded.py +++ b/bugbug/models/qaneeded.py @@ -4,6 +4,7 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -17,8 +18,6 @@ class QANeededModel(BugModel): def __init__(self, lemmatization=False): BugModel.__init__(self, lemmatization) - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ bug_features.HasSTR(), bug_features.HasRegressionRange(), @@ -51,6 +50,12 @@ def __init__(self, lemmatization=False): rollback_when=self.rollback, ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -61,12 +66,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def rollback(self, change): return any( change["added"].startswith(prefix) @@ -109,4 +116,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return 
self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/rcatype.py b/bugbug/models/rcatype.py index 202f38835c..dd9ce6b60d 100644 --- a/bugbug/models/rcatype.py +++ b/bugbug/models/rcatype.py @@ -101,6 +101,12 @@ def __init__( "bug_extractor", bug_features.BugExtractor(feature_extractors, cleanup_functions), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = Pipeline( + [ ( "union", ColumnTransformer( @@ -120,12 +126,13 @@ def __init__( ] ), ), + ( + "estimator", + OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter)), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = OneVsRestClassifier(xgboost.XGBClassifier(**self.hyperparameter)) - # return rca from a whiteboard string def get_rca_from_whiteboard(self, whiteboard_data): rca = [] @@ -161,7 +168,7 @@ def get_labels(self): return classes, self.RCA_LIST def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def overwrite_classes(self, bugs, classes, probabilities): rca_values = self.get_rca(bugs) diff --git a/bugbug/models/regressionrange.py b/bugbug/models/regressionrange.py index 58a2af5169..67a760db35 100644 --- a/bugbug/models/regressionrange.py +++ b/bugbug/models/regressionrange.py @@ -6,6 +6,7 @@ import logging import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -22,8 +23,6 @@ class RegressionRangeModel(BugModel): def __init__(self, lemmatization=False): BugModel.__init__(self, lemmatization) - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ bug_features.HasSTR(), bug_features.Severity(), @@ -50,6 
+49,12 @@ def __init__(self, lemmatization=False): "bug_extractor", bug_features.BugExtractor(feature_extractors, cleanup_functions), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -60,12 +65,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -93,4 +100,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/regressor.py b/bugbug/models/regressor.py index 01adbc2212..77a7d3a978 100644 --- a/bugbug/models/regressor.py +++ b/bugbug/models/regressor.py @@ -11,9 +11,11 @@ import numpy as np import xgboost from dateutil.relativedelta import relativedelta +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import CountVectorizer from sklearn.pipeline import Pipeline from bugbug import bugzilla, commit_features, db, feature_cleanup, repository, utils @@ -66,7 +68,6 @@ def __init__( self.training_dbs.append(BUG_FIXING_COMMITS_DB) self.store_dataset = True - self.sampler = RandomUnderSampler(random_state=0) self.use_finder = use_finder self.exclude_finder = exclude_finder @@ -111,7 +112,18 @@ def __init__( feature_cleanup.synonyms(), ] - column_transformers = [("data", DictVectorizer(), "data")] + column_transformers = [ + ("data", DictVectorizer(), "data"), + ( + "files", + CountVectorizer( + 
analyzer=utils.keep_as_is, + lowercase=False, + min_df=0.0014, + ), + "files", + ), + ] if not interpretable: column_transformers.append( @@ -126,15 +138,21 @@ def __init__( feature_extractors, cleanup_functions ), ), - ("union", ColumnTransformer(column_transformers)), ] ) self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) + estimator = xgboost.XGBClassifier(**self.hyperparameter) if calibration: - self.clf = IsotonicRegressionCalibrator(self.clf) + estimator = IsotonicRegressionCalibrator(estimator) # This is a temporary workaround for the error : "Model type not yet supported by TreeExplainer" self.calculate_importance = False + self.clf = ImblearnPipeline( + [ + ("union", ColumnTransformer(column_transformers)), + ("sampler", RandomUnderSampler(random_state=0)), + ("estimator", estimator), + ] + ) def get_labels(self): classes = {} @@ -365,7 +383,7 @@ def evaluation(self) -> None: ) def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def overwrite_classes(self, commits, classes, probabilities): for i, commit in enumerate(commits): diff --git a/bugbug/models/spambug.py b/bugbug/models/spambug.py index 332f1e13a1..63246121d4 100644 --- a/bugbug/models/spambug.py +++ b/bugbug/models/spambug.py @@ -7,6 +7,7 @@ import xgboost from imblearn.over_sampling import BorderlineSMOTE +from imblearn.pipeline import Pipeline as ImblearnPipeline from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline @@ -22,7 +23,6 @@ class SpamBugModel(BugModel): def __init__(self, lemmatization=False): BugModel.__init__(self, lemmatization) - self.sampler = BorderlineSMOTE(random_state=0) self.calculate_importance = False feature_extractors = [ @@ -63,6 +63,12 @@ def __init__(self, lemmatization=False): feature_extractors, 
cleanup_functions, rollback=True ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -77,12 +83,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", BorderlineSMOTE(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -131,7 +139,7 @@ def items_gen(self, classes): ) def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def overwrite_classes(self, bugs, classes, probabilities): for i, bug in enumerate(bugs): diff --git a/bugbug/models/stepstoreproduce.py b/bugbug/models/stepstoreproduce.py index 76947527ad..e6a6350dee 100644 --- a/bugbug/models/stepstoreproduce.py +++ b/bugbug/models/stepstoreproduce.py @@ -6,6 +6,7 @@ import logging import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -22,8 +23,6 @@ class StepsToReproduceModel(BugModel): def __init__(self, lemmatization=False): BugModel.__init__(self, lemmatization) - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ bug_features.HasRegressionRange(), bug_features.Severity(), @@ -50,6 +49,12 @@ def __init__(self, lemmatization=False): "bug_extractor", bug_features.BugExtractor(feature_extractors, cleanup_functions), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -60,12 +65,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + 
"estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def get_labels(self): classes = {} @@ -106,4 +113,4 @@ def overwrite_classes(self, bugs, classes, probabilities): return classes def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/testfailure.py b/bugbug/models/testfailure.py index 04e5af4cda..5e88f61661 100644 --- a/bugbug/models/testfailure.py +++ b/bugbug/models/testfailure.py @@ -6,9 +6,11 @@ import logging import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import CountVectorizer from sklearn.pipeline import Pipeline from bugbug import commit_features, repository, test_scheduling, utils @@ -24,8 +26,6 @@ def __init__(self, lemmatization=False): self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB) - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ commit_features.SourceCodeFileSize(), commit_features.OtherFileSize(), @@ -59,12 +59,36 @@ def __init__(self, lemmatization=False): "commit_extractor", commit_features.CommitExtractor(feature_extractors, []), ), - ("union", ColumnTransformer([("data", DictVectorizer(), "data")])), ] ) self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) + self.clf = ImblearnPipeline( + [ + ( + "union", + ColumnTransformer( + [ + ("data", DictVectorizer(), "data"), + ( + "files", + CountVectorizer( + analyzer=utils.keep_as_is, + lowercase=False, + min_df=0.0014, + ), + "files", + ), + ] + ), + ), + ("sampler", 
RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), + ] + ) def items_gen(self, classes): commit_map = {} @@ -110,4 +134,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/models/testselect.py b/bugbug/models/testselect.py index 684b1b495e..a70a01a20c 100644 --- a/bugbug/models/testselect.py +++ b/bugbug/models/testselect.py @@ -15,6 +15,7 @@ import numpy as np import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from ortools.linear_solver import pywraplp from sklearn.compose import ColumnTransformer @@ -423,8 +424,6 @@ def __init__(self, lemmatization=False, granularity="label", failures_skip=None) self.entire_dataset_training = True - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ test_scheduling_features.PrevFailures(), ] @@ -448,12 +447,20 @@ def __init__(self, lemmatization=False, granularity="label", failures_skip=None) "commit_extractor", commit_features.CommitExtractor(feature_extractors, []), ), - ("union", ColumnTransformer([("data", DictVectorizer(), "data")])), ] ) self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) + self.clf = ImblearnPipeline( + [ + ("union", ColumnTransformer([("data", DictVectorizer(), "data")])), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), + ] + ) def get_pushes( self, apply_filters: bool = False @@ -859,7 +866,7 @@ def do_eval( do_eval(executor, confidence_threshold, reduction, cap, minimum) def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return 
self.clf.named_steps["union"].get_feature_names_out() class TestLabelSelectModel(TestSelectModel): diff --git a/bugbug/models/tracking.py b/bugbug/models/tracking.py index 417da45497..d54191a58e 100644 --- a/bugbug/models/tracking.py +++ b/bugbug/models/tracking.py @@ -4,6 +4,7 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import InstanceHardnessThreshold from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -19,8 +20,6 @@ def __init__(self, lemmatization=False): self.calculate_importance = False - self.sampler = InstanceHardnessThreshold(random_state=0) - feature_extractors = [ bug_features.HasSTR(), bug_features.HasRegressionRange(), @@ -67,6 +66,12 @@ def __init__(self, lemmatization=False): rollback_when=self.rollback, ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -81,12 +86,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", InstanceHardnessThreshold(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def rollback(self, change): return change["field_name"].startswith("cf_tracking_firefox") @@ -132,7 +139,7 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() def overwrite_classes(self, bugs, classes, probabilities): for i, bug in enumerate(bugs): diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py index 27bba853be..e3fd90d0f3 100644 --- a/bugbug/models/uplift.py +++ b/bugbug/models/uplift.py @@ -4,6 +4,7 @@ # You can obtain one at 
http://mozilla.org/MPL/2.0/. import xgboost +from imblearn.pipeline import Pipeline as ImblearnPipeline from imblearn.under_sampling import RandomUnderSampler from sklearn.compose import ColumnTransformer from sklearn.feature_extraction import DictVectorizer @@ -17,8 +18,6 @@ class UpliftModel(BugModel): def __init__(self, lemmatization=False): BugModel.__init__(self, lemmatization) - self.sampler = RandomUnderSampler(random_state=0) - feature_extractors = [ bug_features.HasSTR(), bug_features.HasRegressionRange(), @@ -51,6 +50,12 @@ def __init__(self, lemmatization=False): rollback_when=self.rollback, ), ), + ] + ) + + self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} + self.clf = ImblearnPipeline( + [ ( "union", ColumnTransformer( @@ -61,12 +66,14 @@ def __init__(self, lemmatization=False): ] ), ), + ("sampler", RandomUnderSampler(random_state=0)), + ( + "estimator", + xgboost.XGBClassifier(**self.hyperparameter), + ), ] ) - self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()} - self.clf = xgboost.XGBClassifier(**self.hyperparameter) - def rollback(self, change): return ( change["field_name"] == "flagtypes.name" @@ -95,4 +102,4 @@ def get_labels(self): return classes, [0, 1] def get_feature_names(self): - return self.extraction_pipeline.named_steps["union"].get_feature_names_out() + return self.clf.named_steps["union"].get_feature_names_out() diff --git a/bugbug/utils.py b/bugbug/utils.py index b576c92858..2a737023fc 100644 --- a/bugbug/utils.py +++ b/bugbug/utils.py @@ -553,3 +553,8 @@ def escape_markdown(text: str) -> str: .replace(")", "\\)") .replace("|", "\\|") ) + + +def keep_as_is(x): + """A tokenizer that does nothing.""" + return x