-
Notifications
You must be signed in to change notification settings - Fork 307
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a model to identify invalid reports for Firefox in-product reporter (#3790)
- Loading branch information
Showing
7 changed files
with
283 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# -*- coding: utf-8 -*- | ||
# This Source Code Form is subject to the terms of the Mozilla Public | ||
# License, v. 2.0. If a copy of the MPL was not distributed with this file, | ||
# You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
||
import logging | ||
|
||
import xgboost | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.pipeline import Pipeline | ||
|
||
from bugbug import feature_cleanup, issue_features, utils | ||
from bugbug.model import IssueModel | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class InvalidCompatibilityReportModel(IssueModel):
    """Binary model separating invalid compatibility reports from the rest.

    The model is configured for the ``webcompat/web-bugs`` GitHub repository
    and learns from the text of the ``first_comment`` column only. Label ``1``
    marks an issue that was triaged as invalid, label ``0`` an issue that was
    milestoned as ``needsdiagnosis`` or ``moved`` at some point.
    """

    def __init__(self, lemmatization=False):
        """Build the extraction pipeline and the XGBoost classifier.

        Args:
            lemmatization: Forwarded unchanged to the ``IssueModel`` constructor.
        """
        super().__init__(
            owner="webcompat", repo="web-bugs", lemmatization=lemmatization
        )

        self.calculate_importance = False

        # The extractor gets no additional feature extractors and no cleanup
        # functions: cleanup is applied in items_gen() instead.
        report_extractor = issue_features.IssueExtractor([], [], rollback=False)

        # Only the "first_comment" column is vectorized.
        text_columns = ColumnTransformer(
            [
                (
                    "first_comment",
                    self.text_vectorizer(min_df=0.0001),
                    "first_comment",
                ),
            ]
        )

        self.extraction_pipeline = Pipeline(
            [
                ("report_extractor", report_extractor),
                ("union", text_columns),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

    def items_gen(self, classes):
        """Yield ``(issue, label)`` pairs with the issue body cleaned up.

        The cleanup is deliberately kept out of the extraction pipeline so
        that it is not applied during classification: the text structure of
        GitHub issues differs from that of reports.
        """
        clean_description = feature_cleanup.CleanCompatibilityReportDescription()

        for issue, label in super().items_gen(classes):
            # Work on a shallow copy so the issue yielded by the parent
            # generator is left untouched.
            cleaned_issue = dict(issue)
            cleaned_issue["body"] = clean_description(issue["body"])
            yield cleaned_issue, label

    def get_labels(self):
        """Assign a 0/1 label to every issue that can be labelled.

        Returns:
            A tuple ``(classes, [0, 1])`` where ``classes`` maps issue numbers
            to ``1`` (invalid) or ``0`` (not invalid). Issues matching neither
            rule are left out of the mapping.
        """
        labels = {}

        for issue in self.github.get_issues():
            if not (issue["title"] and issue["body"]):
                continue

            # Skip issues that are not moderated yet as they don't have a
            # meaningful title or body.
            if issue["title"] == "In the moderation queue.":
                continue

            milestone = issue["milestone"]
            triaged_as_invalid = (
                milestone
                and milestone["title"] in ("invalid", "incomplete")
                and any(label["name"] == "wcrt-invalid" for label in issue["labels"])
            )
            if triaged_as_invalid:
                labels[issue["number"]] = 1
                continue

            # Not invalid: the issue was assigned a "needsdiagnosis" or
            # "moved" milestone at some point in its event history.
            was_accepted = any(
                event["event"] == "milestoned"
                and event["milestone"]["title"] in ("needsdiagnosis", "moved")
                for event in issue["events"]
            )
            if was_accepted:
                labels[issue["number"]] = 0

        assigned = list(labels.values())
        logger.info("%d issues have been moved to invalid", assigned.count(1))
        logger.info("%d issues have not been moved to invalid", assigned.count(0))

        return labels, [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the pipeline's ``union`` step."""
        union_step = self.extraction_pipeline.named_steps["union"]
        return union_step.get_feature_names_out()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import argparse | ||
import os | ||
from logging import INFO, basicConfig, getLogger | ||
|
||
import numpy as np | ||
import requests | ||
|
||
from bugbug.models import get_model_class | ||
from bugbug.utils import download_model | ||
|
||
basicConfig(level=INFO) | ||
logger = getLogger(__name__) | ||
|
||
|
||
def classify_reports(model_name: str, report_text: str) -> None:
    """Classify a single report text with the named model and log the result.

    The model is loaded from the file ``<model_name>model``; when that file
    does not exist it is downloaded first. The predicted class and the class
    probabilities are logged, then the function waits on ``input()`` before
    returning.

    Args:
        model_name: Name of the model to load (and download if missing).
        report_text: Text used as the report body; the title is left empty.

    Raises:
        SystemExit: With status 1 when downloading the model fails with
            ``requests.HTTPError``.
    """
    model_path = f"{model_name}model"

    model_is_available = os.path.exists(model_path)
    if not model_is_available:
        logger.info("%s does not exist. Downloading the model....", model_path)
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model = get_model_class(model_name).load(model_path)

    logger.info("%s", report_text)

    report = dict(body=report_text, title="")

    if not model.calculate_importance:
        probas = model.classify(report, probabilities=True, importances=False)
    else:
        probas, importance = model.classify(
            report, probabilities=True, importances=True
        )
        model.print_feature_importances(
            importance["importances"], class_probabilities=probas
        )

    # Only one report was submitted, so only the first row is of interest.
    probability = probas[0]
    pred_index = np.argmax(probability)

    if len(probability) <= 2:
        # Binary case: report a generic Positive/Negative verdict.
        pred_class = "Negative" if pred_index != 1 else "Positive"
    else:
        # More than two classes: map the index back through the model's
        # label encoder.
        pred_class = model.le.inverse_transform([pred_index])[0]

    logger.info("%s %s", pred_class, probability)

    # Pause until the user presses Enter.
    input()
|
||
|
||
def main() -> None:
    """Parse the command line and classify the given report text.

    Command-line arguments:
        model: Name of the model to use for the evaluation.
        --report-text: Text of the report to classify (required).

    Raises:
        SystemExit: Raised by ``argparse`` with status 2 when the arguments
            are invalid, e.g. when ``--report-text`` is missing.
    """
    description = "Perform evaluation of user report using the specified model"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("model", type=str, help="Which model to use for evaluation")
    # The report text is mandatory: without it ``None`` would be passed on as
    # the report body, which presumably fails deep inside the model instead
    # of producing a clear usage error here.
    parser.add_argument(
        "--report-text",
        help="Report text to classify",
        type=str,
        required=True,
    )

    args = parser.parse_args()

    classify_reports(args.model, args.report_text)
|
||
|
||
if __name__ == "__main__": | ||
main() |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# -*- coding: utf-8 -*- | ||
# This Source Code Form is subject to the terms of the Mozilla Public | ||
# License, v. 2.0. If a copy of the MPL was not distributed with this file, | ||
# You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
||
from bugbug.models.invalid_compatibility_report import InvalidCompatibilityReportModel | ||
|
||
|
||
def test_get_invalid_labels():
    """Check the labels assigned to a few known issues."""
    model = InvalidCompatibilityReportModel()
    labels, _ = model.get_labels()

    # Issues expected to be labelled as invalid (truthy label).
    for issue_number in (70960, 70978):
        assert labels[issue_number]

    # Issues expected to be labelled as not invalid (falsy label).
    for issue_number in (71052, 71011):
        assert not labels[issue_number]