Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract the logic to identify bug types into individual feature classes #3907

Merged
merged 16 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 184 additions & 1 deletion bugbug/bug_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from datetime import datetime, timezone
from functools import partial
from multiprocessing.pool import Pool
from typing import Optional

import dateutil.parser
import pandas as pd
Expand All @@ -17,7 +18,7 @@
from libmozdata.bugzilla import Bugzilla
from sklearn.base import BaseEstimator, TransformerMixin

from bugbug import bug_snapshot, repository
from bugbug import bug_snapshot, bugzilla, repository


def field(bug, field):
Expand Down Expand Up @@ -690,3 +691,185 @@ def apply_rollback(bugs_iter):
bugs_iter = apply_rollback(bugs_iter)

return pd.DataFrame(apply_transform(bug) for bug in bugs_iter)


class IsPerformanceBug(SingleBugFeature):
    """Tell whether a bug looks performance-related.

    A bug is flagged when any of the following holds:

    - its lower-cased whiteboard contains one of the known performance tags
      (matched from the opening bracket, e.g. ``[fxperf``);
    - its ``cf_performance_impact`` field is ``low``, ``medium`` or ``high``;
    - one of its keywords starts with a prefix listed in ``self.keywords``.
    """

    name = "Is Performance Bug"

    def __init__(self):
        # Keyword prefixes that mark a bug as a performance bug.
        self.keywords = {"perf", "topperf", "main-thread-io"}

    def __call__(self, bug: bugzilla.BugDict) -> bool:
        whiteboard = bug["whiteboard"].lower()
        whiteboard_tags = (
            "[fxperf",
            "[fxperfsize",
            "[snappy",
            "[pdfjs-c-performance",
            "[pdfjs-performance",
            "[sp3",
        )
        for tag in whiteboard_tags:
            if tag in whiteboard:
                return True

        if bug.get("cf_performance_impact") in ("low", "medium", "high"):
            return True

        # str.startswith accepts a tuple, so one call covers every prefix.
        prefixes = tuple(self.keywords)
        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])


class IsMemoryBug(SingleBugFeature):
    """Tell whether a bug looks memory-related.

    A bug is flagged when any of the following holds:

    - its lower-cased whiteboard contains ``overhead`` or ``memshrink``;
    - ``bug_map`` is given and the bug blocks a bug present in that map whose
      alias starts with ``memshrink``;
    - one of its keywords starts with a prefix listed in ``self.keywords``.
    """

    name = "Is Memory Bug"

    def __init__(self):
        # Keyword prefixes that mark a bug as a memory bug.
        self.keywords = {"memory-"}

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: Optional[dict[int, bugzilla.BugDict]] = None,
    ) -> bool:
        whiteboard = bug["whiteboard"].lower()
        if "overhead" in whiteboard or "memshrink" in whiteboard:
            return True

        if bug_map is not None:
            # Only blocked bugs that are present in the map can be inspected.
            blocked_aliases = (
                bug_map[blocked_id]["alias"]
                for blocked_id in bug["blocks"]
                if blocked_id in bug_map
            )
            if any(
                alias and alias.startswith("memshrink") for alias in blocked_aliases
            ):
                return True

        prefixes = tuple(self.keywords)
        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])


class IsPowerBug(SingleBugFeature):
    """Tell whether a bug looks power-related.

    A bug is flagged when its lower-cased whiteboard contains ``[power`` or
    when one of its keywords starts with a prefix listed in ``self.keywords``.
    """

    name = "Is Power Bug"

    def __init__(self):
        # Keyword prefixes that mark a bug as a power bug.
        self.keywords = {"power"}

    def __call__(self, bug: bugzilla.BugDict) -> bool:
        whiteboard = bug["whiteboard"].lower()
        if "[power" in whiteboard:
            return True

        prefixes = tuple(self.keywords)
        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])


class IsSecurityBug(SingleBugFeature):
    """Tell whether a bug looks security-related.

    A bug is flagged when its lower-cased whiteboard contains
    ``[client-bounty-form`` or ``[sec-survey``, or when one of its keywords
    starts with a prefix listed in ``self.keywords``.
    """

    name = "Is Security Bug"

    def __init__(self):
        # Keyword prefixes that mark a bug as a security bug.
        self.keywords = {"sec-", "csectype-"}

    def __call__(self, bug: bugzilla.BugDict) -> bool:
        whiteboard = bug["whiteboard"].lower()
        if "[client-bounty-form" in whiteboard or "[sec-survey" in whiteboard:
            return True

        prefixes = tuple(self.keywords)
        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])


class IsCrashBug(SingleBugFeature):
    """Tell whether a bug looks crash-related.

    A bug is flagged when it has a ``cf_crash_signature`` field whose value is
    neither the empty string nor ``---``, or when one of its keywords starts
    with a prefix listed in ``self.keywords``.
    """

    name = "Is Crash Bug"

    def __init__(self):
        # Keyword prefixes that mark a bug as a crash bug.
        self.keywords = {"crash", "crashreportid"}

    def __call__(self, bug: bugzilla.BugDict) -> bool:
        # A missing field is treated like an empty signature.
        signature = bug.get("cf_crash_signature", "")
        if signature not in ("", "---"):
            return True

        prefixes = tuple(self.keywords)
        return any(keyword.startswith(prefixes) for keyword in bug["keywords"])


def infer_bug_types(
    bug: bugzilla.BugDict, bug_map: Optional[dict[int, bugzilla.BugDict]] = None
) -> list[str]:
    """Infer bug types based on various bug characteristics.

    Each type is decided by its dedicated feature class (``IsMemoryBug``,
    ``IsPowerBug``, ``IsPerformanceBug``, ``IsSecurityBug``, ``IsCrashBug``).

    Args:
        bug: A dictionary containing bug data.
        bug_map: A mapping of bug IDs to bug dictionaries. It is only passed
            to the memory check. Default is None.

    Returns:
        The inferred bug types, without duplicates, always in the order
        "memory", "power", "performance", "security", "crash" (types that do
        not apply are omitted).
    """
    # A dict keeps insertion order, so the result order is stable across
    # runs; building the list from a set of strings would make the order
    # depend on hash randomization.
    detected = {
        "memory": IsMemoryBug()(bug, bug_map),
        "power": IsPowerBug()(bug),
        "performance": IsPerformanceBug()(bug),
        "security": IsSecurityBug()(bug),
        "crash": IsCrashBug()(bug),
    }

    return [bug_type for bug_type, applies in detected.items() if applies]
PromiseFru marked this conversation as resolved.
Show resolved Hide resolved
100 changes: 22 additions & 78 deletions bugbug/models/bugtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging
from typing import Iterable, Optional
from typing import Iterable

import numpy as np
import xgboost
Expand All @@ -13,83 +13,13 @@
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

from bugbug import bug_features, bugzilla, feature_cleanup, utils
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
from bugbug.utils import get_physical_cpu_count

logger = logging.getLogger(__name__)

# Maps a keyword prefix to the bug type it implies. bug_to_types matches each
# prefix against the bug's keywords with str.startswith, and the keys are also
# passed to bug_features.Keywords in BugTypeModel.
KEYWORD_DICT = {
"sec-": "security",
"csectype-": "security",
"memory-": "memory",
"crash": "crash",
"crashreportid": "crash",
"perf": "performance",
"topperf": "performance",
"main-thread-io": "performance",
"power": "power",
}
# Sorted, de-duplicated list of the bug types; a type's position in this list
# is its index in the label vectors built by BugTypeModel.
TYPE_LIST = sorted(set(KEYWORD_DICT.values()))


def bug_to_types(
    bug: bugzilla.BugDict, bug_map: Optional[dict[int, bugzilla.BugDict]] = None
) -> list[str]:
    """Return the bug types implied by a bug's whiteboard, fields and keywords.

    The whiteboard is searched case-insensitively for known tags, the
    ``cf_performance_impact`` and ``cf_crash_signature`` fields are inspected
    when present, bugs blocked by this one are looked up in ``bug_map`` (when
    given) for a ``memshrink`` alias, and every keyword is matched against the
    prefixes in ``KEYWORD_DICT``.
    """
    found = set()

    whiteboard = bug["whiteboard"].lower()

    def whiteboard_has(*fragments):
        # True when at least one fragment occurs in the lower-cased whiteboard.
        return any(fragment in whiteboard for fragment in fragments)

    if whiteboard_has("overhead", "memshrink"):
        found.add("memory")

    if whiteboard_has("[power"):
        found.add("power")

    if whiteboard_has(
        "[fxperf",
        "[fxperfsize",
        "[snappy",
        "[pdfjs-c-performance",
        "[pdfjs-performance",
        "[sp3",
    ):
        found.add("performance")

    if whiteboard_has("[client-bounty-form", "[sec-survey"):
        found.add("security")

    # A missing field behaves like the "unset" placeholder value.
    if bug.get("cf_performance_impact", "---") not in ("---", "?"):
        found.add("performance")

    if bug.get("cf_crash_signature", "") not in ("", "---"):
        found.add("crash")

    if bug_map is not None:
        for blocked_id in bug["blocks"]:
            if blocked_id in bug_map:
                alias = bug_map[blocked_id]["alias"]
                if alias and alias.startswith("memshrink"):
                    found.add("memory")

    for prefix, bug_type in KEYWORD_DICT.items():
        for keyword in bug["keywords"]:
            if keyword.startswith(prefix):
                found.add(bug_type)
                break

    return list(found)
TYPE_LIST = sorted(["security", "memory", "crash", "performance", "power"])


class BugTypeModel(BugModel):
Expand All @@ -98,12 +28,26 @@ def __init__(self, lemmatization=False, historical=False):

self.calculate_importance = False

self.label_extractors = [
bug_features.IsPerformanceBug(),
bug_features.IsMemoryBug(),
bug_features.IsPowerBug(),
bug_features.IsSecurityBug(),
bug_features.IsCrashBug(),
]

keywords = {
PromiseFru marked this conversation as resolved.
Show resolved Hide resolved
keyword
for extractor in self.label_extractors
for keyword in extractor.keywords
}

feature_extractors = [
bug_features.HasSTR(),
bug_features.Severity(),
# Ignore keywords that would make the ML completely skewed
# (we are going to use them as 100% rules in the evaluation phase).
bug_features.Keywords(set(KEYWORD_DICT.keys())),
bug_features.Keywords(keywords),
bug_features.IsCoverityIssue(),
bug_features.HasCrashSignature(),
bug_features.HasURL(),
Expand Down Expand Up @@ -158,7 +102,7 @@ def __init__(self, lemmatization=False, historical=False):
(
"estimator",
OneVsRestClassifier(
xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
xgboost.XGBClassifier(n_jobs=get_physical_cpu_count())
PromiseFru marked this conversation as resolved.
Show resolved Hide resolved
),
),
]
Expand All @@ -171,7 +115,7 @@ def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]:

for bug_data in bug_map.values():
target = np.zeros(len(TYPE_LIST))
for type_ in bug_to_types(bug_data, bug_map):
for type_ in bug_features.infer_bug_types(bug_data, bug_map):
target[TYPE_LIST.index(type_)] = 1
PromiseFru marked this conversation as resolved.
Show resolved Hide resolved

classes[int(bug_data["id"])] = target
Expand All @@ -195,7 +139,7 @@ def overwrite_classes(
probabilities: bool,
):
for i, bug in enumerate(bugs):
for type_ in bug_to_types(bug):
for type_ in bug_features.infer_bug_types(bug):
if probabilities:
classes[i][TYPE_LIST.index(type_)] = 1.0
else:
Expand Down
5 changes: 2 additions & 3 deletions scripts/generate_landings_risk_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from tqdm import tqdm

from bugbug import bug_features, bugzilla, db, phabricator, repository, test_scheduling
from bugbug.models.bugtype import bug_to_types
from bugbug.models.regressor import BUG_FIXING_COMMITS_DB, RegressorModel
from bugbug.utils import (
download_check_etag,
Expand Down Expand Up @@ -662,7 +661,7 @@ def get_commit_data(commit_list: list[repository.CommitDict]) -> list[dict]:
),
"summary": bug["summary"],
"fixed": bug["status"] in ("VERIFIED", "RESOLVED"),
"types": bug_to_types(bug, bug_map)
"types": bug_features.infer_bug_types(bug, bug_map)
+ (
["intermittent"]
if "intermittent-failure" in bug["keywords"]
Expand Down Expand Up @@ -881,7 +880,7 @@ def go(self, days: int) -> None:

bug_map = {}
for bug in bugzilla.get_bugs():
# Only add to the map bugs we are interested in, bugs that are blocked by other bugs (needed for the bug_to_types call) and bugs that caused regressions.
# Only add to the map bugs we are interested in, bugs that are blocked by other bugs (needed for the infer_bug_types call) and bugs that caused regressions.
if (
bug["id"] in bugs_set
or len(bug["depends_on"]) > 0
Expand Down
Loading