Refactor the pipelines to avoid fitting before splitting #3877

Merged · 6 commits · Dec 2, 2023
40 changes: 13 additions & 27 deletions bugbug/commit_features.py
@@ -3,7 +3,6 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import sys
from collections import defaultdict
from typing import Sequence

import pandas as pd
@@ -808,34 +807,14 @@ def __call__(self, commit, **kwargs):


class Files(object):
def __init__(self, min_freq=0.0014):
self.min_freq = min_freq
name = "files"

def fit(self, commits):
self.count = defaultdict(int)

self.total_commits = 0

for commit in commits:
self.total_commits += 1

for f in commit["files"]:
self.count[f] += 1

# We no longer need to store counts for files which have low frequency.
to_del = set(
f for f, c in self.count.items() if c / self.total_commits < self.min_freq
)
def __call__(self, commit, **kwargs):
return commit["files"]

for f in to_del:
del self.count[f]

def __call__(self, commit, **kwargs):
return [
f
for f in commit["files"]
if (self.count[f] / self.total_commits) > self.min_freq
]
def _pass_through_tokenizer(doc):
return doc


class FileTouchedPrev(object):
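Note: a minimal before/after sketch of the `Files` change above (the names `OldFiles`/`NewFiles` are illustrative, not from the diff). The point of the refactor is that the extractor no longer learns file frequencies in `fit`, so nothing is computed over the full dataset before the train/test split; the rare-file cutoff moves into each model's `CountVectorizer(min_df=...)`.

```python
from collections import defaultdict


class OldFiles:
    """Pre-refactor shape: stateful, fitted on *all* commits."""

    def __init__(self, min_freq=0.0014):
        self.min_freq = min_freq

    def fit(self, commits):
        # Document frequencies were counted over every commit, so when
        # fit_transform ran before the split, test rows leaked in here.
        self.count = defaultdict(int)
        self.total_commits = 0
        for commit in commits:
            self.total_commits += 1
            for f in commit["files"]:
                self.count[f] += 1

    def __call__(self, commit, **kwargs):
        return [
            f
            for f in commit["files"]
            if self.count[f] / self.total_commits > self.min_freq
        ]


class NewFiles:
    """Post-refactor shape: a stateless pass-through; the frequency
    filtering happens later, inside the fitted classifier pipeline."""

    name = "files"

    def __call__(self, commit, **kwargs):
        return commit["files"]
```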
@@ -1008,6 +987,7 @@ def transform(self, commits):

for commit in commits():
data = {}
result = {"data": data}

for feature_extractor in self.feature_extractors:
if "bug_features" in feature_extractor.__module__:
@@ -1028,6 +1008,13 @@
else:
feature_extractor_name = feature_extractor.__class__.__name__

# FIXME: This is a workaround to pass the value to the
# union transformer independently. This will be dropped when we
# resolve https://github.com/mozilla/bugbug/issues/3876
if isinstance(feature_extractor, Files):
result[sys.intern(feature_extractor_name)] = res
continue

if isinstance(res, dict):
for key, value in res.items():
data[sys.intern(key)] = value
@@ -1040,7 +1027,6 @@

data[sys.intern(feature_extractor_name)] = res

result = {"data": data}
if "desc" in commit:
for cleanup_function in self.cleanup_functions:
result["desc"] = cleanup_function(commit["desc"])
41 changes: 15 additions & 26 deletions bugbug/model.py
@@ -18,7 +18,6 @@
make_index_balanced_accuracy,
specificity_score,
)
from imblearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
@@ -148,7 +147,6 @@ def __init__(self, lemmatization=False):
self.text_vectorizer = TfidfVectorizer

self.cross_validation_enabled = True
self.sampler = None

self.calculate_importance = True

@@ -196,6 +194,8 @@ def get_human_readable_feature_names(self):
feature_name = f"Comments contain '{feature_name}'"
elif type_ == "text":
feature_name = f"Combined text contains '{feature_name}'"
elif type_ == "files":
feature_name = f"File '{feature_name}'"
elif type_ not in ("data", "couple_data"):
raise ValueError(f"Unexpected feature type for: {full_feature_name}")
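Note: a tiny sketch of the naming convention the new `files` branch relies on; the `step__feature` format of `full_feature_name` is assumed from `ColumnTransformer.get_feature_names_out()`, not spelled out in this hunk.

```python
# ColumnTransformer prefixes each output with the step that produced it.
full_feature_name = "files__dom/base/Document.cpp"  # hypothetical name
type_, feature_name = full_feature_name.split("__", 1)
if type_ == "files":
    feature_name = f"File '{feature_name}'"
assert feature_name == "File 'dom/base/Document.cpp'"
```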

@@ -348,7 +348,7 @@ def train(self, importance_cutoff=0.15, limit=None):
X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))

# Extract features from the items.
X = self.extraction_pipeline.fit_transform(X_gen)
X = self.extraction_pipeline.transform(X_gen)

# Calculate labels.
y = np.array(y)
@@ -365,10 +365,6 @@

# Split dataset in training and test.
X_train, X_test, y_train, y_test = self.train_test_split(X, y)
if self.sampler is not None:
pipeline = make_pipeline(self.sampler, self.clf)
else:
pipeline = self.clf

tracking_metrics = {}

@@ -379,7 +375,7 @@
scorings += ["precision", "recall"]

scores = cross_validate(
pipeline, X_train, self.le.transform(y_train), scoring=scorings, cv=5
self.clf, X_train, self.le.transform(y_train), scoring=scorings, cv=5
)

logger.info("Cross Validation scores:")
@@ -394,16 +390,10 @@
)

logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

# Training on the resampled dataset if sampler is provided.
if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X_train, y_train)

logger.info(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}")

logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

self.clf.fit(X_train, self.le.transform(y_train))
logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)

logger.info("Model trained")

@@ -558,11 +548,8 @@ def train(self, importance_cutoff=0.15, limit=None):
if self.entire_dataset_training:
logger.info("Retraining on the entire dataset...")

if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X, y)
else:
X_train = X
y_train = y
X_train = X
y_train = y

logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

@@ -571,13 +558,15 @@
model_directory = self.__class__.__name__.lower()
makedirs(model_directory, exist_ok=True)

if issubclass(type(self.clf), XGBModel):
step_name, estimator = self.clf.steps.pop()
if issubclass(type(estimator), XGBModel):
xgboost_model_path = path.join(model_directory, "xgboost.ubj")
self.clf.save_model(xgboost_model_path)
estimator.save_model(xgboost_model_path)

# Since we save the classifier separately, we need to clear the clf
# attribute to prevent it from being pickled with the model object.
self.clf = self.clf.__class__(**self.hyperparameter)
# Since we save the estimator separately, we need to reset it to
# prevent its data from being pickled with the pipeline.
estimator = estimator.__class__(**self.hyperparameter)
self.clf.steps.append((step_name, estimator))

model_path = path.join(model_directory, "model.pkl")
with open(model_path, "wb") as f:
@@ -600,7 +589,7 @@ def load(model_directory: str) -> "Model":

xgboost_model_path = path.join(model_directory, "xgboost.ubj")
if path.exists(xgboost_model_path):
model.clf.load_model(xgboost_model_path)
model.clf.named_steps["estimator"].load_model(xgboost_model_path)

return model
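Note: the save/load logic above, extracted into a sketch; the function names `save_pipeline`/`load_weights` are illustrative. The fitted estimator is popped off the pipeline, stored in XGBoost's native format, and replaced by an unfitted copy, so the pickled pipeline carries the vectorizers but not the booster weights; `load` later restores the weights into the `"estimator"` step.

```python
from os import path

from sklearn.pipeline import Pipeline
from xgboost import XGBModel


def save_pipeline(clf: Pipeline, model_directory: str, hyperparameter: dict) -> None:
    # After this PR, the last step is the ("estimator", XGBClassifier) pair.
    step_name, estimator = clf.steps.pop()
    if issubclass(type(estimator), XGBModel):
        # Store the trained booster in XGBoost's native format...
        estimator.save_model(path.join(model_directory, "xgboost.ubj"))
        # ...and re-append an unfitted copy to keep the pickle small.
        estimator = estimator.__class__(**hyperparameter)
    clf.steps.append((step_name, estimator))


def load_weights(clf: Pipeline, model_directory: str) -> None:
    # Counterpart of the load() hunk: restore the saved weights into the
    # pipeline's "estimator" step after unpickling.
    xgboost_model_path = path.join(model_directory, "xgboost.ubj")
    if path.exists(xgboost_model_path):
        clf.named_steps["estimator"].load_model(xgboost_model_path)
```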

30 changes: 24 additions & 6 deletions bugbug/models/annotate_ignore.py
@@ -6,9 +6,11 @@
import logging

import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bugzilla, commit_features, feature_cleanup, labels, repository, utils
@@ -23,11 +25,10 @@ def __init__(self, lemmatization: bool = False) -> None:
CommitModel.__init__(self, lemmatization)

self.calculate_importance = False
self.cross_validation_enabled = False

self.training_dbs += [bugzilla.BUGS_DB]

self.sampler = RandomUnderSampler(random_state=0)

feature_extractors = [
commit_features.SourceCodeFileSize(),
commit_features.OtherFileSize(),
@@ -67,21 +68,38 @@ def __init__(self, lemmatization: bool = False) -> None:
feature_extractors, cleanup_functions
),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = ImblearnPipeline(
[
(
"union",
ColumnTransformer(
[
("data", DictVectorizer(), "data"),
("desc", self.text_vectorizer(min_df=0.0001), "desc"),
(
"files",
CountVectorizer(
analyzer=utils.keep_as_is,
lowercase=False,
min_df=0.0014,
),
"files",
),
]
),
),
("sampler", RandomUnderSampler(random_state=0)),
(
"estimator",
xgboost.XGBClassifier(**self.hyperparameter),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = xgboost.XGBClassifier(**self.hyperparameter)

def get_labels(self):
classes = {}

@@ -123,4 +141,4 @@ def get_labels(self):
return classes, [0, 1]

def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
return self.clf.named_steps["union"].get_feature_names_out()
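Note: an end-to-end toy of the new `annotate_ignore` classifier pipeline on hypothetical records (the real input is the DataFrame produced by `CommitExtractor`). `analyzer=lambda doc: doc` stands in for `utils.keep_as_is`, and the `min_df` on the file vectorizer replaces the frequency cutoff that the old `Files.fit` applied to the whole dataset.

```python
import pandas as pd
import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hypothetical extracted records; real ones come from CommitExtractor.
X = pd.DataFrame(
    {
        "data": [{"size": 1.0}, {"size": 9.0}, {"size": 2.0}, {"size": 8.0}],
        "desc": ["fix crash", "add test", "fix leak", "update docs"],
        "files": [["a.cpp"], ["a.cpp", "b.h"], ["b.h"], ["c.py"]],
    }
)
y = [0, 1, 0, 1]

clf = ImblearnPipeline(
    [
        (
            "union",
            ColumnTransformer(
                [
                    ("data", DictVectorizer(), "data"),
                    ("desc", TfidfVectorizer(min_df=0.0001), "desc"),
                    (
                        "files",
                        # Each row is already a token list, so the analyzer
                        # is a pass-through; min_df is now fitted on the
                        # data the pipeline is trained on, nothing more.
                        CountVectorizer(analyzer=lambda doc: doc, min_df=0.0014),
                        "files",
                    ),
                ]
            ),
        ),
        ("sampler", RandomUnderSampler(random_state=0)),
        ("estimator", xgboost.XGBClassifier(n_estimators=10)),
    ]
)

clf.fit(X, y)  # every fitted statistic now comes from X only
print(clf.predict(X))  # the sampler is skipped at predict time
```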
15 changes: 11 additions & 4 deletions bugbug/models/assignee.py
@@ -67,6 +67,12 @@ def __init__(self, lemmatization=False):
rollback_when=self.rollback,
),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = Pipeline(
[
(
"union",
ColumnTransformer(
@@ -81,12 +87,13 @@
]
),
),
(
"estimator",
xgboost.XGBClassifier(**self.hyperparameter),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = xgboost.XGBClassifier(**self.hyperparameter)

def get_labels(self):
classes = {}

@@ -117,7 +124,7 @@ def get_labels(self):
return classes, set(classes.values())

def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
return self.clf.named_steps["union"].get_feature_names_out()

def rollback(self, change):
return change["field_name"].startswith("assigned_to")
31 changes: 24 additions & 7 deletions bugbug/models/backout.py
Original file line number Diff line number Diff line change
@@ -9,9 +9,11 @@
import dateutil.parser
import xgboost
from dateutil.relativedelta import relativedelta
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bug_features, commit_features, feature_cleanup, repository, utils
@@ -27,8 +29,6 @@ def __init__(self, lemmatization=False, bug_data=False):

self.calculate_importance = False

self.sampler = RandomUnderSampler(random_state=0)

feature_extractors = [
commit_features.SourceCodeFilesModifiedNum(),
commit_features.OtherFilesModifiedNum(),
@@ -74,29 +74,46 @@ def __init__(self, lemmatization=False, bug_data=False):
feature_cleanup.synonyms(),
]

        self.extraction_pipeline = Pipeline(
[
(
"commit_extractor",
commit_features.CommitExtractor(
feature_extractors, cleanup_functions
),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
        self.clf = ImblearnPipeline(
[
(
"union",
ColumnTransformer(
[
("data", DictVectorizer(), "data"),
("desc", self.text_vectorizer(), "desc"),
(
"files",
CountVectorizer(
analyzer=utils.keep_as_is,
lowercase=False,
min_df=0.0014,
),
"files",
),
]
),
),
("sampler", RandomUnderSampler(random_state=0)),
(
"estimator",
xgboost.XGBClassifier(**self.hyperparameter),
),
]
)

self.hyperparameter = {"n_jobs": utils.get_physical_cpu_count()}
self.clf = xgboost.XGBClassifier(**self.hyperparameter)

def get_labels(self):
classes = {}

@@ -123,4 +140,4 @@ def get_labels(self):
return classes, [0, 1]

def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names_out()
return self.clf.named_steps["union"].get_feature_names_out()
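Note: a minimal, self-contained illustration of the leak this PR removes, on toy documents; `min_df=2` stands in for the frequency cutoffs used above. Fitting before splitting lets document frequencies from test rows decide which features exist.

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

docs = ["a b", "a b", "a c", "a d", "a e", "a f"]
y = [0, 1, 0, 1, 0, 1]

# Leaky order (the old fit_transform-before-split): min_df is evaluated
# against document frequencies counted over all rows, test rows included.
leaky = CountVectorizer(min_df=2).fit(docs)

# Fixed order (this PR): split first, then fit on the training rows, so
# the vocabulary and the min_df cutoff never see test data.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, y, test_size=0.5, random_state=0
)
fixed = CountVectorizer(min_df=2).fit(docs_train)

print(sorted(leaky.vocabulary_))  # may keep terms that only pass min_df
print(sorted(fixed.vocabulary_))  # thanks to counts from the test rows
```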