Add a model to identify invalid reports for Firefox in-product reporter (#3790)
mozilla · Nov 19, 2023 · 97a0f86 · 97a0f86
1 parent d454491
commit 97a0f86
Show file tree

Hide file tree

Showing 7 changed files with 283 additions and 2 deletions.
diff --git a/bugbug/feature_cleanup.py b/bugbug/feature_cleanup.py
@@ -198,3 +198,66 @@ def __init__(self):
 
     def __call__(self, text):
         return self.pattern.sub("__CRASH_STATS_LINK__", text)
+
+
+class CleanCompatibilityReportDescription(object):
+    """Reduce a webcompat report body to its description and steps text.
+
+    Calling an instance first deletes boilerplate fragments (``<details>``
+    blocks, the webcompat.com footer, console-log links, screenshots and
+    watcher lists) and then extracts the "Problem type", "Description" and
+    "Steps to Reproduce" fields from what remains.
+    """
+
+    def __init__(self):
+        # Fragments removed from the text before any field is extracted.
+        self.sub_patterns = {
+            "details": re.compile(r"<details>.*?</details>", re.DOTALL),
+            "footer": re.compile(
+                r"_From \[webcompat\.com\]\(https://webcompat\.com/\) with ❤️_"
+            ),
+            "link": re.compile(
+                r"\[View console log messages\]\(https://webcompat\.com/console_logs/.*?\)"
+            ),
+            "screenshot": re.compile(r"\[\!\[Screenshot Description\]\(.*?\)\]\(.*?\)"),
+            "screenshot_md": re.compile(
+                r'\*\*Screenshot\*\*\s*\r?\n\<img width="[\d]+" alt="[^"]*" src="https?://[^"]+"[^>]*>'
+            ),
+            "watchers": re.compile(r"\*\*Watchers:\*\*(?:\r?\n@[\w-]+)+"),
+        }
+
+        # Fields pulled out of the cleaned text; each exposes one group.
+        self.extract_patterns = {
+            "description": re.compile(r"\*\*Description\*\*: (.*?)\n", re.DOTALL),
+            "problem_type": re.compile(r"\*\*Problem type\*\*: (.*?)\n", re.DOTALL),
+            "steps": re.compile(r"\*\*Steps to Reproduce\*\*:?(.*)", re.DOTALL),
+        }
+
+        # Descriptions treated as stock choices rather than user-written text;
+        # they are left out of the result unless the problem type is
+        # "Something else".
+        self.default_problems = {
+            "Desktop site instead of mobile site",
+            "Browser unsupported",
+            "Page not loading correctly",
+            "Missing items",
+            "Buttons or links not working",
+            "Unable to type",
+            "Unable to login",
+            "Problems with Captcha",
+            "Images not loaded",
+            "Items are overlapped",
+            "Items are misaligned",
+            "Items not fully visible",
+            "There is no video",
+            "There is no audio",
+            "Media controls are broken or missing",
+            "The video or audio does not play",
+        }
+
+    def _extract_and_strip(self, pattern, text):
+        """Return the stripped first group of ``pattern`` in ``text``, or ""."""
+        found = pattern.search(text)
+        if found is None:
+            return ""
+        return found.group(1).strip()
+
+    def __call__(self, text):
+        """Return the informative part of the report body ``text``.
+
+        The result is the steps alone when the description is one of the
+        stock problems (and the problem type is not "Something else");
+        otherwise it is the description, followed by ``"\\n "`` and the
+        steps when steps are present.
+        """
+        for boilerplate in self.sub_patterns.values():
+            text = boilerplate.sub("", text)
+
+        fields = self.extract_patterns
+        problem_type = self._extract_and_strip(fields["problem_type"], text)
+        description = self._extract_and_strip(fields["description"], text)
+        steps = self._extract_and_strip(fields["steps"], text)
+
+        is_stock_description = (
+            problem_type != "Something else"
+            and description in self.default_problems
+        )
+        if is_stock_description:
+            return steps
+
+        if steps:
+            return f"{description}\n {steps}"
+        return description
diff --git a/bugbug/models/__init__.py b/bugbug/models/__init__.py
@@ -21,6 +21,7 @@
     "devdocneeded": "bugbug.models.devdocneeded.DevDocNeededModel",
     "duplicate": "bugbug.models.duplicate.DuplicateModel",
     "fixtime": "bugbug.models.fixtime.FixTimeModel",
+    "invalidcompatibilityreport": "bugbug.models.invalid_compatibility_report.InvalidCompatibilityReportModel",
     "needsdiagnosis": "bugbug.models.needsdiagnosis.NeedsDiagnosisModel",
     "qaneeded": "bugbug.models.qaneeded.QANeededModel",
     "rcatype": "bugbug.models.rcatype.RCATypeModel",

diff --git a/bugbug/models/invalid_compatibility_report.py b/bugbug/models/invalid_compatibility_report.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import logging
+
+import xgboost
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+from bugbug import feature_cleanup, issue_features, utils
+from bugbug.model import IssueModel
+
+# Module-level logger; get_labels uses it to report how many issues ended up
+# in each class.
+logger = logging.getLogger(__name__)
+
+
+class InvalidCompatibilityReportModel(IssueModel):
+    """Binary classifier for invalid reports in the webcompat/web-bugs repo.
+
+    Label 1 is given to issues triaged as invalid and label 0 to issues that
+    were milestoned as ``needsdiagnosis`` or ``moved``. The only feature is
+    the vectorized ``first_comment`` column fed to an XGBoost classifier.
+    """
+
+    def __init__(self, lemmatization=False):
+        super().__init__(
+            owner="webcompat", repo="web-bugs", lemmatization=lemmatization
+        )
+
+        self.calculate_importance = False
+
+        # No extra feature extractors or cleanup steps are registered in the
+        # extraction pipeline itself; body cleanup happens in items_gen.
+        feature_extractors = []
+        cleanup_functions = []
+
+        report_extractor = issue_features.IssueExtractor(
+            feature_extractors, cleanup_functions, rollback=False
+        )
+        text_columns = ColumnTransformer(
+            [
+                (
+                    "first_comment",
+                    self.text_vectorizer(min_df=0.0001),
+                    "first_comment",
+                ),
+            ]
+        )
+        self.extraction_pipeline = Pipeline(
+            [
+                ("report_extractor", report_extractor),
+                ("union", text_columns),
+            ]
+        )
+
+        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
+
+    def items_gen(self, classes):
+        """Yield ``(issue, label)`` pairs with the issue body cleaned up.
+
+        Cleanup is done here instead of in the extraction pipeline so that it
+        is not applied during classification, because the text structure of
+        GitHub issues differs from that of reports.
+        """
+        clean_body = feature_cleanup.CleanCompatibilityReportDescription()
+
+        for issue, label in super().items_gen(classes):
+            yield {**issue, "body": clean_body(issue["body"])}, label
+
+    def get_labels(self):
+        """Build the training labels from the GitHub issues.
+
+        Returns:
+            A tuple ``(classes, [0, 1])`` where ``classes`` maps an issue
+            number to 1 (invalid) or 0 (not invalid). Issues matching
+            neither rule are left out.
+        """
+        classes = {}
+        for issue in self.github.get_issues():
+            if not (issue["title"] and issue["body"]):
+                continue
+
+            # Issues still awaiting moderation have no meaningful title or
+            # body yet, so they are skipped.
+            if issue["title"] == "In the moderation queue.":
+                continue
+
+            milestone = issue["milestone"]
+            marked_invalid = (
+                milestone
+                and milestone["title"] in ("invalid", "incomplete")
+                and any(label["name"] == "wcrt-invalid" for label in issue["labels"])
+            )
+            if marked_invalid:
+                classes[issue["number"]] = 1
+                continue
+
+            # Events are only inspected for issues not already labelled 1.
+            was_triaged = any(
+                event["event"] == "milestoned"
+                and event["milestone"]["title"] in ("needsdiagnosis", "moved")
+                for event in issue["events"]
+            )
+            if was_triaged:
+                classes[issue["number"]] = 0
+
+        assigned = list(classes.values())
+        logger.info("%d issues have been moved to invalid", assigned.count(1))
+        logger.info("%d issues have not been moved to invalid", assigned.count(0))
+
+        return classes, [0, 1]
+
+    def get_feature_names(self):
+        """Return the output feature names of the pipeline's "union" step."""
+        union_step = self.extraction_pipeline.named_steps["union"]
+        return union_step.get_feature_names_out()
diff --git a/scripts/compatibility_report_classifier.py b/scripts/compatibility_report_classifier.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+from logging import INFO, basicConfig, getLogger
+
+import numpy as np
+import requests
+
+from bugbug.models import get_model_class
+from bugbug.utils import download_model
+
+# Configure logging at INFO level on import so the messages logged by
+# classify_reports are shown.
+basicConfig(level=INFO)
+logger = getLogger(__name__)
+
+
+def classify_reports(model_name: str, report_text: str) -> None:
+    """Classify one report text with the named model and log the outcome.
+
+    The model file ``<model_name>model`` is downloaded first when it is not
+    present in the current directory.
+
+    Args:
+        model_name: Name of the model to load (and download if necessary).
+        report_text: Text of the report, used as the issue body.
+
+    Raises:
+        SystemExit: With code 1 when the model is missing locally and the
+            download fails with an HTTP error.
+    """
+    model_file_name = f"{model_name}model"
+
+    if not os.path.exists(model_file_name):
+        logger.info("%s does not exist. Downloading the model....", model_file_name)
+        try:
+            download_model(model_name)
+        except requests.HTTPError:
+            logger.error(
+                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
+            )
+            raise SystemExit(1)
+
+    model_class = get_model_class(model_name)
+    model = model_class.load(model_file_name)
+
+    logger.info("%s", report_text)
+
+    # The report is wrapped as an issue-like dict with an empty title.
+    report = {"body": report_text, "title": ""}
+
+    if model.calculate_importance:
+        probas, importance = model.classify(
+            report, probabilities=True, importances=True
+        )
+
+        model.print_feature_importances(
+            importance["importances"], class_probabilities=probas
+        )
+    else:
+        probas = model.classify(report, probabilities=True, importances=False)
+
+    probability = probas[0]
+    pred_index = np.argmax(probability)
+    if len(probability) > 2:
+        # More than two classes: map the index back to its class label.
+        pred_class = model.le.inverse_transform([pred_index])[0]
+    else:
+        pred_class = "Positive" if pred_index == 1 else "Negative"
+    logger.info("%s %s", pred_class, probability)
+
+    # Pause until the user presses Enter. With a closed or redirected stdin
+    # input() raises EOFError; the result has already been logged, so that
+    # must not turn a successful run into a traceback.
+    try:
+        input()
+    except EOFError:
+        pass
+
+
+def main() -> None:
+    """Parse the command line and classify the given report text.
+
+    ``--report-text`` is mandatory: without it ``None`` would be passed on
+    as the report body instead of a string, so argparse now rejects the
+    call with a usage error.
+    """
+    description = "Perform evaluation of user report using the specified model"
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument("model", type=str, help="Which model to use for evaluation")
+    parser.add_argument(
+        "--report-text",
+        help="Report text to classify",
+        type=str,
+        required=True,
+    )
+
+    args = parser.parse_args()
+
+    classify_reports(args.model, args.report_text)
+
+
+# Run the command-line interface only when executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/tests/fixtures/github_webcompat_web-bugs_issues.json b/tests/fixtures/github_webcompat_web-bugs_issues.json
diff --git a/tests/test_feature_cleanup.py b/tests/test_feature_cleanup.py
@@ -134,3 +134,29 @@ def test_crash():
     ]
     for orig_text, cleaned_text in tests:
         assert feature_cleanup.crash()(orig_text) == cleaned_text
+
+
+def test_clean_compatibility_report_description():
+    """Check the cleaner's output for four sample report bodies."""
+    cases = [
+        # Stock description ("Media controls are broken or missing"):
+        # only the steps are kept.
+        (
+            '<!-- @browser: Firefox 117.0 -->\n<!-- @ua_header: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0 -->\n<!-- @reported_with: unknown -->\n<!-- @public_url: https://github.com/webcompat/web-bugs/issues/126685 -->\n\n**URL**: https://www.lequipe.fr/explore/video/la-course-en-tete/20177528\n\n**Browser / Version**: Firefox 117.0\n**Operating System**: Windows 10\n**Tested Another Browser**: Yes Chrome\n\n**Problem type**: Video or audio doesn\'t play\n**Description**: Media controls are broken or missing\n**Steps to Reproduce**:\nVideo is starting but we cannot use the video panel control. It working on Brave.\r\n<details>\r\n      <summary>View the screenshot</summary>\r\n      <img alt="Screenshot" src="https://webcompat.com/uploads/2023/9/501af310-e646-4b2c-8eb9-7f21ce8725fe.jpg">\r\n      </details>\n\n<details>\n<summary>Browser Configuration</summary>\n<ul>\n  <li>None</li>\n</ul>\n</details>\n\n_From [webcompat.com](https://webcompat.com/) with ❤️_',
+            "Video is starting but we cannot use the video panel control. It working on Brave.",
+        ),
+        # Problem type "Something else": description and steps are kept.
+        (
+            "<!-- @browser: Firefox Mobile 120.0 -->\n<!-- @ua_header: Mozilla/5.0 (Android 10; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0 -->\n<!-- @reported_with: unknown -->\n<!-- @public_url: https://github.com/webcompat/web-bugs/issues/128961 -->\n\n**URL**: https://www.jianshu.com/p/ba52ec38ac51\n\n**Browser / Version**: Firefox Mobile 120.0\n**Operating System**: Android 10\n**Tested Another Browser**: Yes Edge\n\n**Problem type**: Something else\n**Description**: Couldn't scroll down\n**Steps to Reproduce**:\nScroll down the page, then scroll to top, scroll down again, the page couldn't scroll (will always back to top). \n\n<details>\n<summary>Browser Configuration</summary>\n<ul>\n  <li>None</li>\n</ul>\n</details>\n\n_From [webcompat.com](https://webcompat.com/) with ❤️_",
+            "Couldn't scroll down\n Scroll down the page, then scroll to top, scroll down again, the page couldn't scroll (will always back to top).",
+        ),
+        # Body without "Problem type"/"Description" fields: the empty
+        # description is still joined to the steps with "\n ".
+        (
+            '**URL**:\r\nhttps://samarabags.com/collections/all-bags/products/the-jewelry-box?variant=40390455820322\r\n\r\n**Browser/Version**:\r\nFirefox 112.0.2\r\n\r\n**Operating System**:\r\nMacOS Ventura 13.3.1 (a) (22E772610a)\r\nPrivate window\r\n\r\n**What seems to be the trouble?(Required)**\r\n- [ ] Desktop site instead of mobile site\r\n- [ ] Mobile site is not usable\r\n- [ ] Video doesn\'t play\r\n- [X] Layout is messed up\r\n- [X] Text is not visible\r\n- [ ] Something else (Add details below)\r\n\r\n**Steps to Reproduce**\r\n\r\n1. Navigate to: (www.samarabags.com)\r\n2. Select a product and open its page.\r\n\r\n*__Expected Behavior:__*\r\nThe customer review, Instagram and the footer are visible.\r\n\r\n*__Actual Behavior:__*\r\nAnything below the product\'s image is just blank. "This page slowing down Firefox" message appears on the top.\r\n\r\n**Screenshot**\r\n<img width="1510" alt="Screenshot 2023-05-12 at 6 24 29 PM" src="https://github.com/webcompat/web-bugs/assets/1740517/20423943-c0a2-42b4-a763-ff814fa48ecb">\r\n',
+            '\n 1. Navigate to: (www.samarabags.com)\r\n2. Select a product and open its page.\r\n\r\n*__Expected Behavior:__*\r\nThe customer review, Instagram and the footer are visible.\r\n\r\n*__Actual Behavior:__*\r\nAnything below the product\'s image is just blank. "This page slowing down Firefox" message appears on the top.',
+        ),
+        # CRLF body with a watchers list and a stock description: the
+        # watchers are removed and only the steps are kept.
+        (
+            '<!-- @browser: Firefox Nightly 108.0a1 (2022-10-18) -->\r\n<!-- @ua_header: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0 -->\r\n<!-- @reported_with: unknown -->\r\n\r\n**URL**: https://dlive.tv/s/dashboard#0\r\n\r\n**Browser / Version**: Firefox Nightly 108.0a1 (2022-10-18)\r\n**Operating System**: Windows 10\r\n**Tested Another Browser**: Yes Chrome\r\n\r\n**Problem type**: Design is broken\r\n**Description**: Items are misaligned\r\n\r\n**Prerequisites**: \r\nAccount created and signed in.\r\n\r\n**Steps to Reproduce**:\r\n1. Navigate to https://dlive.tv/s/dashboard#0\r\n2. Type in a message in the "Chat". \r\n3. Observe text alignment. \r\n\r\n**Expected Behavior:**\r\nThe text is centered in the message field.\r\n\r\n**Actual Behavior:**\r\nThe text is aligned on the top side of the message field.\r\n\r\n**Notes:**\r\n1. The issue is not reproducible on Chrome.\r\n2. The issue is also reproducible on Firefox Release.\r\n3. The issue is also reproducible for the hint text in the message field.\r\n3. Screenshot attached. \r\n\r\n**Watchers:**\r\n@softvision-oana-arbuzov\r\n@softvision-raul-bucata\r\n@sv-calin \r\n<details>\r\n      <summary>View the screenshot</summary>\r\n      <img alt="Screenshot" src="https://webcompat.com/uploads/2022/10/b4a296a5-ee2f-4a18-a5da-b1e20ee8d27d.jpg">\r\n      </details>\r\n\r\n<details>\r\n<summary>Browser Configuration</summary>\r\n<ul>\r\n  <li>None</li>\r\n</ul>\r\n</details>\r\n\r\n_From [webcompat.com](https://webcompat.com/) with ❤️_',
+            '1. Navigate to https://dlive.tv/s/dashboard#0\r\n2. Type in a message in the "Chat". \r\n3. Observe text alignment. \r\n\r\n**Expected Behavior:**\r\nThe text is centered in the message field.\r\n\r\n**Actual Behavior:**\r\nThe text is aligned on the top side of the message field.\r\n\r\n**Notes:**\r\n1. The issue is not reproducible on Chrome.\r\n2. The issue is also reproducible on Firefox Release.\r\n3. The issue is also reproducible for the hint text in the message field.\r\n3. Screenshot attached.',
+        ),
+    ]
+    clean = feature_cleanup.CleanCompatibilityReportDescription()
+    for raw_body, expected in cases:
+        assert clean(raw_body) == expected
diff --git a/tests/test_invalid_compatibility_report.py b/tests/test_invalid_compatibility_report.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from bugbug.models.invalid_compatibility_report import InvalidCompatibilityReportModel
+
+
+def test_get_invalid_labels():
+    """Check the labels assigned to four known issues from the fixture."""
+    labels, _ = InvalidCompatibilityReportModel().get_labels()
+
+    # Issues expected to be labelled invalid (1).
+    for number in (70960, 70978):
+        assert labels[number]
+
+    # Issues expected to be labelled not invalid (0).
+    for number in (71052, 71011):
+        assert not labels[number]