Skip to content

Commit

Permalink
Add a model to identify invalid reports for Firefox in-product report…
Browse files Browse the repository at this point in the history
…er (#3790)
  • Loading branch information
ksy36 authored Nov 19, 2023
1 parent d454491 commit 97a0f86
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 2 deletions.
63 changes: 63 additions & 0 deletions bugbug/feature_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,66 @@ def __init__(self):

def __call__(self, text):
    """Return ``text`` with every match of ``self.pattern`` replaced by ``__CRASH_STATS_LINK__``."""
    return self.pattern.sub("__CRASH_STATS_LINK__", text)


class CleanCompatibilityReportDescription(object):
    """Reduce a compatibility report body to its free-form text.

    Calling an instance first deletes known boilerplate fragments (collapsed
    ``<details>`` sections, the webcompat.com footer, console-log links,
    screenshots and the watchers list) and then pulls out the
    ``**Problem type**``, ``**Description**`` and ``**Steps to Reproduce**``
    fields to build the returned text.
    """

    def __init__(self):
        # Fragments that are removed from the report before any extraction.
        # They are applied in insertion order.
        self.sub_patterns = {
            "details": re.compile(r"<details>.*?</details>", re.DOTALL),
            "footer": re.compile(
                r"_From \[webcompat\.com\]\(https://webcompat\.com/\) with ❤️_"
            ),
            "link": re.compile(
                r"\[View console log messages\]\(https://webcompat\.com/console_logs/.*?\)"
            ),
            "screenshot": re.compile(r"\[\!\[Screenshot Description\]\(.*?\)\]\(.*?\)"),
            "screenshot_md": re.compile(
                r'\*\*Screenshot\*\*\s*\r?\n\<img width="[\d]+" alt="[^"]*" src="https?://[^"]+"[^>]*>'
            ),
            "watchers": re.compile(r"\*\*Watchers:\*\*(?:\r?\n@[\w-]+)+"),
        }

        # Fields whose first capture group is extracted from the cleaned text.
        self.extract_patterns = {
            field: re.compile(expression, re.DOTALL)
            for field, expression in (
                ("description", r"\*\*Description\*\*: (.*?)\n"),
                ("problem_type", r"\*\*Problem type\*\*: (.*?)\n"),
                ("steps", r"\*\*Steps to Reproduce\*\*:?(.*)"),
            )
        }

        # Descriptions listed here are dropped from the output unless the
        # problem type is "Something else" (see __call__).
        self.default_problems = {
            "Desktop site instead of mobile site",
            "Browser unsupported",
            "Page not loading correctly",
            "Missing items",
            "Buttons or links not working",
            "Unable to type",
            "Unable to login",
            "Problems with Captcha",
            "Images not loaded",
            "Items are overlapped",
            "Items are misaligned",
            "Items not fully visible",
            "There is no video",
            "There is no audio",
            "Media controls are broken or missing",
            "The video or audio does not play",
        }

    def _extract_and_strip(self, pattern, text):
        """Return the stripped first group of ``pattern`` in ``text``, or ``""`` when it does not match."""
        found = pattern.search(text)
        if found is None:
            return ""
        return found.group(1).strip()

    def __call__(self, text):
        """Clean ``text`` and return the description and/or reproduction steps.

        The description is kept when the problem type is "Something else" or
        when the description is not one of ``self.default_problems``;
        otherwise only the steps are returned.
        """
        cleaned = text
        for boilerplate in self.sub_patterns.values():
            cleaned = boilerplate.sub("", cleaned)

        patterns = self.extract_patterns
        problem_type = self._extract_and_strip(patterns["problem_type"], cleaned)
        description = self._extract_and_strip(patterns["description"], cleaned)
        steps = self._extract_and_strip(patterns["steps"], cleaned)

        is_default_description = (
            problem_type != "Something else" and description in self.default_problems
        )
        if is_default_description:
            return steps

        if not steps:
            return description

        return f"{description}\n {steps}"
1 change: 1 addition & 0 deletions bugbug/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"devdocneeded": "bugbug.models.devdocneeded.DevDocNeededModel",
"duplicate": "bugbug.models.duplicate.DuplicateModel",
"fixtime": "bugbug.models.fixtime.FixTimeModel",
"invalidcompatibilityreport": "bugbug.models.invalid_compatibility_report.InvalidCompatibilityReportModel",
"needsdiagnosis": "bugbug.models.needsdiagnosis.NeedsDiagnosisModel",
"qaneeded": "bugbug.models.qaneeded.QANeededModel",
"rcatype": "bugbug.models.rcatype.RCATypeModel",
Expand Down
105 changes: 105 additions & 0 deletions bugbug/models/invalid_compatibility_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging

import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from bugbug import feature_cleanup, issue_features, utils
from bugbug.model import IssueModel

logger = logging.getLogger(__name__)


class InvalidCompatibilityReportModel(IssueModel):
    """Binary model over webcompat/web-bugs issues: 1 = invalid, 0 = not invalid.

    The only feature is the vectorized ``first_comment`` text. Report bodies
    are cleaned in ``items_gen`` rather than in the extraction pipeline.
    """

    def __init__(self, lemmatization=False):
        super().__init__(
            owner="webcompat", repo="web-bugs", lemmatization=lemmatization
        )

        self.calculate_importance = False

        # No extra feature extractors and no cleanup functions are registered
        # on the extractor itself.
        report_extractor = issue_features.IssueExtractor([], [], rollback=False)
        text_columns = ColumnTransformer(
            [
                (
                    "first_comment",
                    self.text_vectorizer(min_df=0.0001),
                    "first_comment",
                ),
            ]
        )

        self.extraction_pipeline = Pipeline(
            [
                ("report_extractor", report_extractor),
                ("union", text_columns),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

    def items_gen(self, classes):
        """Yield ``(issue, label)`` pairs from the parent generator with a cleaned ``body``."""
        # The cleanup lives here, outside the extraction pipeline, so that it
        # is not applied during classification: GitHub issues and reports
        # differ in text structure.
        clean_body = feature_cleanup.CleanCompatibilityReportDescription()

        for issue, label in super().items_gen(classes):
            cleaned_issue = dict(issue)
            cleaned_issue["body"] = clean_body(issue["body"])
            yield cleaned_issue, label

    def get_labels(self):
        """Return ``(classes, [0, 1])`` where ``classes`` maps issue number to label.

        Label 1: the issue's current milestone is "invalid" or "incomplete"
        and it carries the "wcrt-invalid" label. Label 0: otherwise, if any
        "milestoned" event moved it to "needsdiagnosis" or "moved". Issues
        matching neither rule are left out.
        """
        invalid_milestones = ("invalid", "incomplete")
        valid_milestones = ("needsdiagnosis", "moved")

        classes = {}
        for issue in self.github.get_issues():
            if not (issue["title"] and issue["body"]):
                continue

            # Issues still awaiting moderation have no meaningful title or
            # body yet.
            if issue["title"] == "In the moderation queue.":
                continue

            milestone = issue["milestone"]
            if (
                milestone
                and milestone["title"] in invalid_milestones
                and any(label["name"] == "wcrt-invalid" for label in issue["labels"])
            ):
                classes[issue["number"]] = 1
                continue

            for event in issue["events"]:
                if (
                    event["event"] == "milestoned"
                    and event["milestone"]["title"] in valid_milestones
                ):
                    classes[issue["number"]] = 0
                    break

        assigned = list(classes.values())
        logger.info("%d issues have been moved to invalid", assigned.count(1))
        logger.info("%d issues have not been moved to invalid", assigned.count(0))

        return classes, [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the pipeline's ``union`` step."""
        union_step = self.extraction_pipeline.named_steps["union"]
        return union_step.get_feature_names_out()
71 changes: 71 additions & 0 deletions scripts/compatibility_report_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-

import argparse
import os
from logging import INFO, basicConfig, getLogger

import numpy as np
import requests

from bugbug.models import get_model_class
from bugbug.utils import download_model

basicConfig(level=INFO)
logger = getLogger(__name__)


def classify_reports(model_name: str, report_text: str) -> None:
    """Classify one report text with the named model and log the prediction.

    The model is loaded from ``"<model_name>model"`` in the current directory
    and downloaded first if that file is missing. The function then waits for
    a line on standard input before returning.

    Args:
        model_name: Name used to look up the model class and its file.
        report_text: Text used as the body of the report to classify.

    Raises:
        SystemExit: With status 1 when downloading the model fails with an
            HTTP error.
    """
    model_path = f"{model_name}model"

    if not os.path.exists(model_path):
        logger.info("%s does not exist. Downloading the model....", model_path)
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model = get_model_class(model_name).load(model_path)

    logger.info("%s", report_text)

    # The report is given an empty title alongside the supplied body.
    report = {"body": report_text, "title": ""}

    if not model.calculate_importance:
        probas = model.classify(report, probabilities=True, importances=False)
    else:
        probas, importance = model.classify(
            report, probabilities=True, importances=True
        )
        model.print_feature_importances(
            importance["importances"], class_probabilities=probas
        )

    probability = probas[0]
    pred_index = np.argmax(probability)

    # With more than two classes the index is mapped back through the model's
    # label encoder; otherwise index 1 is reported as "Positive".
    if len(probability) > 2:
        pred_class = model.le.inverse_transform([pred_index])[0]
    elif pred_index == 1:
        pred_class = "Positive"
    else:
        pred_class = "Negative"

    logger.info("%s %s", pred_class, probability)
    input()


def main() -> None:
    """Parse the command line and classify the supplied report text.

    Expects a positional ``model`` name and a mandatory ``--report-text``
    option, then hands both to ``classify_reports``. argparse exits with a
    usage error when either is missing.
    """
    description = "Perform evaluation of user report using the specified model"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("model", type=str, help="Which model to use for evaluation")
    # Without this option the report body would be ``None``; reject the
    # invocation up front with a usage message instead.
    parser.add_argument(
        "--report-text",
        help="Report text to classify",
        type=str,
        required=True,
    )

    args = parser.parse_args()

    classify_reports(args.model, args.report_text)


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions tests/fixtures/github_webcompat_web-bugs_issues.json

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions tests/test_feature_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,29 @@ def test_crash():
]
for orig_text, cleaned_text in tests:
assert feature_cleanup.crash()(orig_text) == cleaned_text


def test_clean_compatibility_report_description():
    """Check the cleaner's output for four sample report bodies."""
    cases = [
        # Problem type is not "Something else" and the description is a
        # default one: only the steps are expected.
        (
            '<!-- @browser: Firefox 117.0 -->\n<!-- @ua_header: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0 -->\n<!-- @reported_with: unknown -->\n<!-- @public_url: https://github.com/webcompat/web-bugs/issues/126685 -->\n\n**URL**: https://www.lequipe.fr/explore/video/la-course-en-tete/20177528\n\n**Browser / Version**: Firefox 117.0\n**Operating System**: Windows 10\n**Tested Another Browser**: Yes Chrome\n\n**Problem type**: Video or audio doesn\'t play\n**Description**: Media controls are broken or missing\n**Steps to Reproduce**:\nVideo is starting but we cannot use the video panel control. It working on Brave.\r\n<details>\r\n <summary>View the screenshot</summary>\r\n <img alt="Screenshot" src="https://webcompat.com/uploads/2023/9/501af310-e646-4b2c-8eb9-7f21ce8725fe.jpg">\r\n </details>\n\n<details>\n<summary>Browser Configuration</summary>\n<ul>\n <li>None</li>\n</ul>\n</details>\n\n_From [webcompat.com](https://webcompat.com/) with ❤️_',
            "Video is starting but we cannot use the video panel control. It working on Brave.",
        ),
        # Problem type "Something else": description and steps are both kept.
        (
            "<!-- @browser: Firefox Mobile 120.0 -->\n<!-- @ua_header: Mozilla/5.0 (Android 10; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0 -->\n<!-- @reported_with: unknown -->\n<!-- @public_url: https://github.com/webcompat/web-bugs/issues/128961 -->\n\n**URL**: https://www.jianshu.com/p/ba52ec38ac51\n\n**Browser / Version**: Firefox Mobile 120.0\n**Operating System**: Android 10\n**Tested Another Browser**: Yes Edge\n\n**Problem type**: Something else\n**Description**: Couldn't scroll down\n**Steps to Reproduce**:\nScroll down the page, then scroll to top, scroll down again, the page couldn't scroll (will always back to top). \n\n<details>\n<summary>Browser Configuration</summary>\n<ul>\n <li>None</li>\n</ul>\n</details>\n\n_From [webcompat.com](https://webcompat.com/) with ❤️_",
            "Couldn't scroll down\n Scroll down the page, then scroll to top, scroll down again, the page couldn't scroll (will always back to top).",
        ),
        # No "**Description**: " field at all: the expected text starts with
        # an empty description followed by the steps.
        (
            '**URL**:\r\nhttps://samarabags.com/collections/all-bags/products/the-jewelry-box?variant=40390455820322\r\n\r\n**Browser/Version**:\r\nFirefox 112.0.2\r\n\r\n**Operating System**:\r\nMacOS Ventura 13.3.1 (a) (22E772610a)\r\nPrivate window\r\n\r\n**What seems to be the trouble?(Required)**\r\n- [ ] Desktop site instead of mobile site\r\n- [ ] Mobile site is not usable\r\n- [ ] Video doesn\'t play\r\n- [X] Layout is messed up\r\n- [X] Text is not visible\r\n- [ ] Something else (Add details below)\r\n\r\n**Steps to Reproduce**\r\n\r\n1. Navigate to: (www.samarabags.com)\r\n2. Select a product and open its page.\r\n\r\n*__Expected Behavior:__*\r\nThe customer review, Instagram and the footer are visible.\r\n\r\n*__Actual Behavior:__*\r\nAnything below the product\'s image is just blank. "This page slowing down Firefox" message appears on the top.\r\n\r\n**Screenshot**\r\n<img width="1510" alt="Screenshot 2023-05-12 at 6 24 29 PM" src="https://github.com/webcompat/web-bugs/assets/1740517/20423943-c0a2-42b4-a763-ff814fa48ecb">\r\n',
            '\n 1. Navigate to: (www.samarabags.com)\r\n2. Select a product and open its page.\r\n\r\n*__Expected Behavior:__*\r\nThe customer review, Instagram and the footer are visible.\r\n\r\n*__Actual Behavior:__*\r\nAnything below the product\'s image is just blank. "This page slowing down Firefox" message appears on the top.',
        ),
        # Default description plus a watchers list and screenshot details
        # that must be removed from the steps.
        (
            '<!-- @browser: Firefox Nightly 108.0a1 (2022-10-18) -->\r\n<!-- @ua_header: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0 -->\r\n<!-- @reported_with: unknown -->\r\n\r\n**URL**: https://dlive.tv/s/dashboard#0\r\n\r\n**Browser / Version**: Firefox Nightly 108.0a1 (2022-10-18)\r\n**Operating System**: Windows 10\r\n**Tested Another Browser**: Yes Chrome\r\n\r\n**Problem type**: Design is broken\r\n**Description**: Items are misaligned\r\n\r\n**Prerequisites**: \r\nAccount created and signed in.\r\n\r\n**Steps to Reproduce**:\r\n1. Navigate to https://dlive.tv/s/dashboard#0\r\n2. Type in a message in the "Chat". \r\n3. Observe text alignment. \r\n\r\n**Expected Behavior:**\r\nThe text is centered in the message field.\r\n\r\n**Actual Behavior:**\r\nThe text is aligned on the top side of the message field.\r\n\r\n**Notes:**\r\n1. The issue is not reproducible on Chrome.\r\n2. The issue is also reproducible on Firefox Release.\r\n3. The issue is also reproducible for the hint text in the message field.\r\n3. Screenshot attached. \r\n\r\n**Watchers:**\r\n@softvision-oana-arbuzov\r\n@softvision-raul-bucata\r\n@sv-calin \r\n<details>\r\n <summary>View the screenshot</summary>\r\n <img alt="Screenshot" src="https://webcompat.com/uploads/2022/10/b4a296a5-ee2f-4a18-a5da-b1e20ee8d27d.jpg">\r\n </details>\r\n\r\n<details>\r\n<summary>Browser Configuration</summary>\r\n<ul>\r\n <li>None</li>\r\n</ul>\r\n</details>\r\n\r\n_From [webcompat.com](https://webcompat.com/) with ❤️_',
            '1. Navigate to https://dlive.tv/s/dashboard#0\r\n2. Type in a message in the "Chat". \r\n3. Observe text alignment. \r\n\r\n**Expected Behavior:**\r\nThe text is centered in the message field.\r\n\r\n**Actual Behavior:**\r\nThe text is aligned on the top side of the message field.\r\n\r\n**Notes:**\r\n1. The issue is not reproducible on Chrome.\r\n2. The issue is also reproducible on Firefox Release.\r\n3. The issue is also reproducible for the hint text in the message field.\r\n3. Screenshot attached.',
        ),
    ]

    clean_description = feature_cleanup.CleanCompatibilityReportDescription()
    for raw_report, expected in cases:
        assert clean_description(raw_report) == expected
15 changes: 15 additions & 0 deletions tests/test_invalid_compatibility_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from bugbug.models.invalid_compatibility_report import InvalidCompatibilityReportModel


def test_get_invalid_labels():
    """Check the labels assigned to four known issue numbers."""
    classes, _ = InvalidCompatibilityReportModel().get_labels()

    # Issues expected to carry a truthy (invalid) label.
    for issue_number in (70960, 70978):
        assert classes[issue_number]

    # Issues expected to carry a falsy (not invalid) label.
    for issue_number in (71052, 71011):
        assert not classes[issue_number]

0 comments on commit 97a0f86

Please sign in to comment.