mozilla · suhaibmujahid · Jan 3, 2024 · Dec 7, 2023 · Dec 8, 2023 · Dec 8, 2023
diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py
@@ -9,6 +9,7 @@
 from datetime import datetime, timezone
 from functools import partial
 from multiprocessing.pool import Pool
+from typing import Optional
 
 import dateutil.parser
 import pandas as pd
@@ -17,7 +18,7 @@
 from libmozdata.bugzilla import Bugzilla
 from sklearn.base import BaseEstimator, TransformerMixin
 
-from bugbug import bug_snapshot, repository
+from bugbug import bug_snapshot, bugzilla, repository
 
 
 def field(bug, field):
@@ -690,3 +691,185 @@ def apply_rollback(bugs_iter):
             bugs_iter = apply_rollback(bugs_iter)
 
         return pd.DataFrame(apply_transform(bug) for bug in bugs_iter)
+
+
+class IsPerformanceBug(SingleBugFeature):
+    """Flag bugs that are about performance.
+
+    A bug matches when its lowercased whiteboard contains one of the known
+    performance tags, its ``cf_performance_impact`` field is "low", "medium"
+    or "high", or one of its keywords starts with a prefix in ``self.keywords``.
+    """
+
+    name = "Is Performance Bug"
+
+    def __init__(self):
+        self.keywords = {"perf", "topperf", "main-thread-io"}
+
+    def __call__(self, bug: bugzilla.BugDict) -> bool:
+        # Tags are looked up as "[tag" substrings of the lowercased whiteboard.
+        whiteboard = bug["whiteboard"].lower()
+        tag_names = (
+            "fxperf",
+            "fxperfsize",
+            "snappy",
+            "pdfjs-c-performance",
+            "pdfjs-performance",
+            "sp3",
+        )
+        if any(f"[{tag_name}" in whiteboard for tag_name in tag_names):
+            return True
+
+        if bug.get("cf_performance_impact") in ("low", "medium", "high"):
+            return True
+
+        # str.startswith accepts a tuple of prefixes and matches any of them.
+        prefixes = tuple(self.keywords)
+        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])
+
+
+class IsMemoryBug(SingleBugFeature):
+    """Flag bugs that are about memory usage.
+
+    A bug matches when any of the following holds:
+    - its lowercased whiteboard contains "overhead" or "memshrink";
+    - ``bug_map`` is given and a bug it blocks, present in ``bug_map``, has
+      an alias starting with "memshrink";
+    - one of its keywords starts with a prefix in ``self.keywords``.
+
+    ``bug_map`` maps bug IDs to bug dictionaries; without it the check on
+    blocked bugs is skipped.
+    """
+
+    name = "Is Memory Bug"
+
+    def __init__(self):
+        self.keywords = {"memory-"}
+
+    def __call__(
+        self,
+        bug: bugzilla.BugDict,
+        bug_map: Optional[dict[int, bugzilla.BugDict]] = None,
+    ) -> bool:
+        whiteboard = bug["whiteboard"].lower()
+        if "overhead" in whiteboard or "memshrink" in whiteboard:
+            return True
+
+        if bug_map is not None:
+            # Blocked bugs that are missing from bug_map are ignored.
+            aliases = (bug_map[i]["alias"] for i in bug["blocks"] if i in bug_map)
+            if any(alias and alias.startswith("memshrink") for alias in aliases):
+                return True
+
+        # str.startswith accepts a tuple of prefixes and matches any of them.
+        prefixes = tuple(self.keywords)
+        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])
+
+
+class IsPowerBug(SingleBugFeature):
+    """Flag bugs that are about power usage.
+
+    A bug matches when its lowercased whiteboard contains "[power", or when
+    one of its keywords starts with a prefix in ``self.keywords``.
+
+    """
+
+    name = "Is Power Bug"
+
+    def __init__(self):
+        self.keywords = {"power"}
+
+    def __call__(self, bug: bugzilla.BugDict) -> bool:
+        has_whiteboard_tag = "[power" in bug["whiteboard"].lower()
+        if has_whiteboard_tag:
+            return True
+
+        prefixes = tuple(self.keywords)
+        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])
+
+
+class IsSecurityBug(SingleBugFeature):
+    """Flag bugs that are about security.
+
+    A bug matches when either of the following holds:
+    - its lowercased whiteboard contains "[client-bounty-form" or
+      "[sec-survey";
+    - one of its keywords starts with a prefix in ``self.keywords``.
+    """
+
+    name = "Is Security Bug"
+
+    def __init__(self):
+        self.keywords = {"sec-", "csectype-"}
+
+    def __call__(self, bug: bugzilla.BugDict) -> bool:
+        whiteboard = bug["whiteboard"].lower()
+        tag_names = ("client-bounty-form", "sec-survey")
+        if any(f"[{tag_name}" in whiteboard for tag_name in tag_names):
+            return True
+
+        # str.startswith accepts a tuple of prefixes and matches any of them.
+        prefixes = tuple(self.keywords)
+        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])
+
+
+class IsCrashBug(SingleBugFeature):
+    """Flag bugs that are about crashes.
+
+    A bug matches when it has a ``cf_crash_signature`` field that is neither
+    "" nor "---", or when one of its keywords starts with a prefix in
+    ``self.keywords``.
+    """
+
+    name = "Is Crash Bug"
+
+    def __init__(self):
+        self.keywords = {"crash", "crashreportid"}
+
+    def __call__(self, bug: bugzilla.BugDict) -> bool:
+        signature = bug.get("cf_crash_signature", "")
+        if signature not in ("", "---"):
+            return True
+
+        prefixes = tuple(self.keywords)
+        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])
+
+
+def infer_bug_types(
+    bug: bugzilla.BugDict, bug_map: Optional[dict[int, bugzilla.BugDict]] = None
+) -> list[str]:
+    """Infer bug types based on various bug characteristics.
+
+    Args:
+    - bug (bugzilla.BugDict): A dictionary containing bug data.
+    - bug_map (Optional[dict[int, bugzilla.BugDict]]): A mapping of bug IDs
+        to bug dictionaries, passed to the memory check. Default is None.
+
+    Returns:
+    - list[str]: The inferred types among "memory", "power", "performance",
+        "security" and "crash", sorted so the order is reproducible.
+    """
+    is_performance_bug = IsPerformanceBug()
+    is_memory_bug = IsMemoryBug()
+    is_power_bug = IsPowerBug()
+    is_security_bug = IsSecurityBug()
+    is_crash_bug = IsCrashBug()
+
+    types = set()
+
+    if is_memory_bug(bug, bug_map):
+        types.add("memory")
+
+    if is_power_bug(bug):
+        types.add("power")
+
+    if is_performance_bug(bug):
+        types.add("performance")
+
+    if is_security_bug(bug):
+        types.add("security")
+
+    if is_crash_bug(bug):
+        types.add("crash")
+
+    return sorted(types)
diff --git a/bugbug/models/bugtype.py b/bugbug/models/bugtype.py
@@ -4,7 +4,7 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import logging
-from typing import Iterable, Optional
+from typing import Iterable
 
 import numpy as np
 import xgboost
@@ -13,83 +13,13 @@
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.pipeline import Pipeline
 
-from bugbug import bug_features, bugzilla, feature_cleanup, utils
+from bugbug import bug_features, bugzilla, feature_cleanup
 from bugbug.model import BugModel
+from bugbug.utils import get_physical_cpu_count
 
 logger = logging.getLogger(__name__)
 
-KEYWORD_DICT = {
-    "sec-": "security",
-    "csectype-": "security",
-    "memory-": "memory",
-    "crash": "crash",
-    "crashreportid": "crash",
-    "perf": "performance",
-    "topperf": "performance",
-    "main-thread-io": "performance",
-    "power": "power",
-}
-TYPE_LIST = sorted(set(KEYWORD_DICT.values()))
-
-
-def bug_to_types(
-    bug: bugzilla.BugDict, bug_map: Optional[dict[int, bugzilla.BugDict]] = None
-) -> list[str]:
-    types = set()
-
-    bug_whiteboard = bug["whiteboard"].lower()
-
-    if any(
-        f"{whiteboard_text}" in bug_whiteboard
-        for whiteboard_text in ("overhead", "memshrink")
-    ):
-        types.add("memory")
-
-    if "[power" in bug_whiteboard:
-        types.add("power")
-
-    if any(
-        f"[{whiteboard_text}" in bug_whiteboard
-        for whiteboard_text in (
-            "fxperf",
-            "fxperfsize",
-            "snappy",
-            "pdfjs-c-performance",
-            "pdfjs-performance",
-            "sp3",
-        )
-    ):
-        types.add("performance")
-
-    if any(
-        f"[{whiteboard_text}" in bug_whiteboard
-        for whiteboard_text in ("client-bounty-form", "sec-survey")
-    ):
-        types.add("security")
-
-    if "cf_performance_impact" in bug and bug["cf_performance_impact"] not in (
-        "---",
-        "?",
-    ):
-        types.add("performance")
-
-    if "cf_crash_signature" in bug and bug["cf_crash_signature"] not in ("", "---"):
-        types.add("crash")
-
-    if bug_map is not None:
-        for bug_id in bug["blocks"]:
-            if bug_id not in bug_map:
-                continue
-
-            alias = bug_map[bug_id]["alias"]
-            if alias and alias.startswith("memshrink"):
-                types.add("memory")
-
-    for keyword_start, type in KEYWORD_DICT.items():
-        if any(keyword.startswith(keyword_start) for keyword in bug["keywords"]):
-            types.add(type)
-
-    return list(types)
+TYPE_LIST = sorted(["security", "memory", "crash", "performance", "power"])
 
 
 class BugTypeModel(BugModel):
@@ -98,12 +28,26 @@ def __init__(self, lemmatization=False, historical=False):
 
         self.calculate_importance = False
 
+        self.label_extractors = [
+            bug_features.IsPerformanceBug(),
+            bug_features.IsMemoryBug(),
+            bug_features.IsPowerBug(),
+            bug_features.IsSecurityBug(),
+            bug_features.IsCrashBug(),
+        ]
+
+        keywords = {
+            keyword
+            for extractor in self.label_extractors
+            for keyword in extractor.keywords
+        }
+
         feature_extractors = [
             bug_features.HasSTR(),
             bug_features.Severity(),
             # Ignore keywords that would make the ML completely skewed
             # (we are going to use them as 100% rules in the evaluation phase).
-            bug_features.Keywords(set(KEYWORD_DICT.keys())),
+            bug_features.Keywords(keywords),
             bug_features.IsCoverityIssue(),
             bug_features.HasCrashSignature(),
             bug_features.HasURL(),
@@ -158,7 +102,7 @@ def __init__(self, lemmatization=False, historical=False):
                 (
                     "estimator",
                     OneVsRestClassifier(
-                        xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
+                        xgboost.XGBClassifier(n_jobs=get_physical_cpu_count())
                     ),
                 ),
             ]
@@ -171,7 +115,7 @@ def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]:
 
         for bug_data in bug_map.values():
             target = np.zeros(len(TYPE_LIST))
-            for type_ in bug_to_types(bug_data, bug_map):
+            for type_ in bug_features.infer_bug_types(bug_data, bug_map):
                 target[TYPE_LIST.index(type_)] = 1
 
             classes[int(bug_data["id"])] = target
@@ -195,7 +139,7 @@ def overwrite_classes(
         probabilities: bool,
     ):
         for i, bug in enumerate(bugs):
-            for type_ in bug_to_types(bug):
+            for type_ in bug_features.infer_bug_types(bug):
                 if probabilities:
                     classes[i][TYPE_LIST.index(type_)] = 1.0
                 else:

diff --git a/scripts/generate_landings_risk_report.py b/scripts/generate_landings_risk_report.py
@@ -30,7 +30,6 @@
 from tqdm import tqdm
 
 from bugbug import bug_features, bugzilla, db, phabricator, repository, test_scheduling
-from bugbug.models.bugtype import bug_to_types
 from bugbug.models.regressor import BUG_FIXING_COMMITS_DB, RegressorModel
 from bugbug.utils import (
     download_check_etag,
@@ -662,7 +661,7 @@ def get_commit_data(commit_list: list[repository.CommitDict]) -> list[dict]:
                 ),
                 "summary": bug["summary"],
                 "fixed": bug["status"] in ("VERIFIED", "RESOLVED"),
-                "types": bug_to_types(bug, bug_map)
+                "types": bug_features.infer_bug_types(bug, bug_map)
                 + (
                     ["intermittent"]
                     if "intermittent-failure" in bug["keywords"]
@@ -881,7 +880,7 @@ def go(self, days: int) -> None:
 
         bug_map = {}
         for bug in bugzilla.get_bugs():
-            # Only add to the map bugs we are interested in, bugs that are blocked by other bugs (needed for the bug_to_types call) and bugs that caused regressions.
+            # Only add to the map bugs we are interested in, bugs that are blocked by other bugs (needed for the infer_bug_types call) and bugs that caused regressions.
             if (
                 bug["id"] in bugs_set
                 or len(bug["depends_on"]) > 0