Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract the logic to identify bug types into individual feature classes #3907

Merged
merged 16 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 190 additions & 1 deletion bugbug/bug_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from libmozdata.bugzilla import Bugzilla
from sklearn.base import BaseEstimator, TransformerMixin

from bugbug import bug_snapshot, repository
from bugbug import bug_snapshot, bugzilla, repository


def field(bug, field):
Expand Down Expand Up @@ -687,3 +687,192 @@ def apply_rollback(bugs_iter):
bugs_iter = apply_rollback(bugs_iter)

return pd.DataFrame(apply_transform(bug) for bug in bugs_iter)


class IsPerformanceBug(SingleBugFeature):
    """Flag bugs that carry performance-related markers.

    A bug counts as a performance bug when any of the following holds:

    - its ``cf_performance_impact`` field is "low", "medium" or "high";
    - one of its keywords starts with an entry of ``keyword_prefixes``;
    - its lower-cased whiteboard contains an entry of
      ``whiteboard_prefixes`` (matched anywhere in the whiteboard text).
    """

    name = "Is Performance Bug"
    type_name = "performance"
    keyword_prefixes = ("perf", "topperf", "main-thread-io")
    whiteboard_prefixes = (
        "[fxperf",
        "[fxperfsize",
        "[snappy",
        "[pdfjs-c-performance",
        "[pdfjs-performance",
        "[sp3",
    )

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when ``bug`` looks performance related.

        ``bug_map`` is accepted for signature parity with the other bug type
        features; it is not consulted here.
        """
        impact = bug.get("cf_performance_impact")
        if impact in ("low", "medium", "high"):
            return True

        # ``str.startswith`` accepts a tuple, so one call tests every prefix.
        for keyword in bug["keywords"]:
            if keyword.startswith(self.keyword_prefixes):
                return True

        whiteboard = bug["whiteboard"].lower()
        return any(marker in whiteboard for marker in self.whiteboard_prefixes)


class IsMemoryBug(SingleBugFeature):
    """Flag bugs that carry memory-related markers.

    A bug counts as a memory bug when any of the following holds:

    - it blocks a bug (looked up in ``bug_map``) whose alias starts with
      "memshrink";
    - one of its keywords starts with an entry of ``keyword_prefixes``;
    - its lower-cased whiteboard contains an entry of
      ``whiteboard_prefixes`` (matched anywhere in the whiteboard text).
    """

    name = "Is Memory Bug"
    type_name = "memory"
    keyword_prefixes = ("memory-",)
    whiteboard_prefixes = ("[overhead", "[memshrink")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when ``bug`` looks memory related.

        The blocked-bug alias check only runs when ``bug_map`` is provided;
        blocked IDs that are absent from the map are ignored.
        """
        if bug_map is not None:
            # Lazily walk the aliases of the blocked bugs we know about.
            blocked_aliases = (
                bug_map[blocked_id]["alias"]
                for blocked_id in bug["blocks"]
                if blocked_id in bug_map
            )
            if any(
                alias and alias.startswith("memshrink") for alias in blocked_aliases
            ):
                return True

        # ``str.startswith`` accepts a tuple, so one call tests every prefix.
        for keyword in bug["keywords"]:
            if keyword.startswith(self.keyword_prefixes):
                return True

        whiteboard = bug["whiteboard"].lower()
        return any(marker in whiteboard for marker in self.whiteboard_prefixes)


class IsPowerBug(SingleBugFeature):
    """Flag bugs that carry power-related markers.

    A bug counts as a power bug when one of its keywords starts with an entry
    of ``keyword_prefixes`` or when its lower-cased whiteboard contains an
    entry of ``whiteboard_prefixes`` (matched anywhere in the whiteboard).
    """

    name = "Is Power Bug"
    type_name = "power"
    keyword_prefixes = ("power",)
    whiteboard_prefixes = ("[power",)

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when ``bug`` looks power related.

        ``bug_map`` is accepted for signature parity with the other bug type
        features; it is not consulted here.
        """
        # ``str.startswith`` accepts a tuple, so one call tests every prefix.
        for keyword in bug["keywords"]:
            if keyword.startswith(self.keyword_prefixes):
                return True

        whiteboard = bug["whiteboard"].lower()
        return any(marker in whiteboard for marker in self.whiteboard_prefixes)


class IsSecurityBug(SingleBugFeature):
    """Flag bugs that carry security-related markers.

    A bug counts as a security bug when one of its keywords starts with an
    entry of ``keyword_prefixes`` or when its lower-cased whiteboard contains
    an entry of ``whiteboard_prefixes`` (matched anywhere in the whiteboard).
    """

    name = "Is Security Bug"
    type_name = "security"
    keyword_prefixes = ("sec-", "csectype-")
    whiteboard_prefixes = ("[client-bounty-form", "[sec-survey")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when ``bug`` looks security related.

        ``bug_map`` is accepted for signature parity with the other bug type
        features; it is not consulted here.
        """
        # ``str.startswith`` accepts a tuple, so one call tests every prefix.
        for keyword in bug["keywords"]:
            if keyword.startswith(self.keyword_prefixes):
                return True

        whiteboard = bug["whiteboard"].lower()
        return any(marker in whiteboard for marker in self.whiteboard_prefixes)


class IsCrashBug(SingleBugFeature):
    """Flag bugs that carry crash-related markers.

    A bug counts as a crash bug when its ``cf_crash_signature`` field
    contains "[@" or when one of its keywords starts with an entry of
    ``keyword_prefixes``.
    """

    name = "Is Crash Bug"
    type_name = "crash"
    keyword_prefixes = ("crash", "crashreportid")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when ``bug`` looks crash related.

        ``bug_map`` is accepted for signature parity with the other bug type
        features; it is not consulted here.
        """
        # Requiring `[@` leaves out some bugs whose signature field is set
        # but does not hold a valid signature: https://mzl.la/46XAqRF
        signature = bug.get("cf_crash_signature")
        if signature and "[@" in signature:
            return True

        # ``str.startswith`` accepts a tuple, so one call tests every prefix.
        for keyword in bug["keywords"]:
            if keyword.startswith(self.keyword_prefixes):
                return True

        return False


class BugTypes(SingleBugFeature):
    """Collect the type names of every bug type extractor that matches a bug."""

    name = "Infer Bug Type"
    # One extractor instance per supported bug type; each is called with
    # (bug, bug_map) and exposes the label to report as ``type_name``.
    bug_type_extractors: list = [
        IsCrashBug(),
        IsMemoryBug(),
        IsPerformanceBug(),
        IsPowerBug(),
        IsSecurityBug(),
    ]

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> list[str]:
        """Infer the types of a bug by running every registered extractor.

        Args:
            bug: A dictionary containing bug data.
            bug_map: An optional mapping of bug IDs to bug dictionaries,
                forwarded unchanged to each extractor. Defaults to None.

        Returns:
            The ``type_name`` of each extractor that returned a truthy value
            (e.g. "memory", "power", "performance", "security", "crash"),
            in the order the extractors are listed in
            ``bug_type_extractors``.
        """
        inferred_types = []
        for extractor in self.bug_type_extractors:
            if extractor(bug, bug_map):
                inferred_types.append(extractor.type_name)
        return inferred_types
113 changes: 27 additions & 86 deletions bugbug/models/bugtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,92 +18,27 @@

logger = logging.getLogger(__name__)

# Maps a Bugzilla keyword prefix to the bug type it indicates. bug_to_types()
# assigns the type when any keyword of the bug starts with the prefix.
KEYWORD_DICT = {
"sec-": "security",
"csectype-": "security",
"memory-": "memory",
"crash": "crash",
"crashreportid": "crash",
"perf": "performance",
"topperf": "performance",
"main-thread-io": "performance",
"power": "power",
}
# Alphabetically sorted, de-duplicated list of the type names above.
TYPE_LIST = sorted(set(KEYWORD_DICT.values()))


def bug_to_types(
    bug: bugzilla.BugDict, bug_map: dict[int, bugzilla.BugDict] | None = None
) -> list[str]:
    """Infer the types of a bug from its whiteboard, fields and keywords.

    Args:
        bug: A dictionary containing bug data.
        bug_map: An optional mapping of bug IDs to bug dictionaries, used to
            look up the aliases of the bugs that ``bug`` blocks.

    Returns:
        The distinct inferred type names, in no particular order.
    """
    found = set()

    whiteboard = bug["whiteboard"].lower()

    if "overhead" in whiteboard or "memshrink" in whiteboard:
        found.add("memory")

    if "[power" in whiteboard:
        found.add("power")

    performance_tags = (
        "[fxperf",
        "[fxperfsize",
        "[snappy",
        "[pdfjs-c-performance",
        "[pdfjs-performance",
        "[sp3",
    )
    if any(tag in whiteboard for tag in performance_tags):
        found.add("performance")

    if "[client-bounty-form" in whiteboard or "[sec-survey" in whiteboard:
        found.add("security")

    # A missing field is treated like the "unset" placeholder value.
    if bug.get("cf_performance_impact", "---") not in ("---", "?"):
        found.add("performance")

    if bug.get("cf_crash_signature", "") not in ("", "---"):
        found.add("crash")

    if bug_map is not None:
        for blocked_id in bug["blocks"]:
            if blocked_id in bug_map:
                alias = bug_map[blocked_id]["alias"]
                if alias and alias.startswith("memshrink"):
                    found.add("memory")

    keywords = bug["keywords"]
    for prefix, bug_type in KEYWORD_DICT.items():
        if any(keyword.startswith(prefix) for keyword in keywords):
            found.add(bug_type)

    return list(found)


class BugTypeModel(BugModel):
def __init__(self, lemmatization=False, historical=False):
BugModel.__init__(self, lemmatization)

self.calculate_importance = False

self.bug_type_extractors = bug_features.BugTypes.bug_type_extractors

label_keyword_prefixes = {
keyword
for extractor in self.bug_type_extractors
for keyword in extractor.keyword_prefixes
}

feature_extractors = [
bug_features.HasSTR(),
bug_features.Severity(),
# Ignore keywords that would make the ML completely skewed
# (we are going to use them as 100% rules in the evaluation phase).
bug_features.Keywords(set(KEYWORD_DICT.keys())),
bug_features.Keywords(label_keyword_prefixes),
bug_features.IsCoverityIssue(),
bug_features.HasCrashSignature(),
bug_features.HasURL(),
Expand Down Expand Up @@ -170,20 +105,23 @@ def get_labels(self) -> tuple[dict[int, np.ndarray], list[str]]:
bug_map = {bug["id"]: bug for bug in bugzilla.get_bugs()}

for bug_data in bug_map.values():
target = np.zeros(len(TYPE_LIST))
for type_ in bug_to_types(bug_data, bug_map):
target[TYPE_LIST.index(type_)] = 1
target = np.zeros(len(self.bug_type_extractors))
for i, is_type in enumerate(self.bug_type_extractors):
if is_type(bug_data, bug_map):
target[i] = 1

classes[int(bug_data["id"])] = target

for type_ in TYPE_LIST:
bug_types = [extractor.type_name for extractor in self.bug_type_extractors]

for i, bug_type in enumerate(bug_types):
logger.info(
"%d %s bugs",
sum(target[TYPE_LIST.index(type_)] == 1 for target in classes.values()),
type_,
sum(target[i] for target in classes.values()),
bug_type,
)

return classes, TYPE_LIST
return classes, bug_types

def get_feature_names(self):
    """Return the output feature names reported by the "union" pipeline step."""
    union_step = self.clf.named_steps["union"]
    return union_step.get_feature_names_out()
Expand All @@ -194,11 +132,14 @@ def overwrite_classes(
classes: dict[int, np.ndarray],
probabilities: bool,
):
bug_map = {bug["id"]: bug for bug in bugs}

for i, bug in enumerate(bugs):
for type_ in bug_to_types(bug):
if probabilities:
classes[i][TYPE_LIST.index(type_)] = 1.0
else:
classes[i][TYPE_LIST.index(type_)] = 1
for j, is_type_applicable in enumerate(self.bug_type_extractors):
if is_type_applicable(bug, bug_map):
if probabilities:
classes[i][j] = 1.0
else:
classes[i][j] = 1

return classes
3 changes: 2 additions & 1 deletion scripts/generate_landings_risk_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from tqdm import tqdm

from bugbug import bug_features, bugzilla, db, phabricator, repository, test_scheduling
from bugbug.models.bugtype import bug_to_types
from bugbug.models.regressor import BUG_FIXING_COMMITS_DB, RegressorModel
from bugbug.utils import (
download_check_etag,
Expand Down Expand Up @@ -555,6 +554,8 @@ def get_commit_data(commit_list: list[repository.CommitDict]) -> list[dict]:

component_team_mapping = get_component_team_mapping()

bug_to_types = bug_features.BugTypes()

bug_summaries = []
for bug_id in bugs:
if bug_id not in bug_map:
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/bug_features/bug_types.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"keywords": ["meta", "perf"], "whiteboard": "", "cf_crash_signature": ""}
{"keywords": ["memory-leak", "regression"], "whiteboard": "[MemShrink:P1]", "cf_crash_signature": ""}
{"whiteboard": "", "keywords": ["power"]}
{"keywords": ["sec-want"], "whiteboard": "[sg:want][psm-padlock]"}
{"keywords": ["crash", "regression"], "whiteboard": "", "cf_crash_signature": "[@ audiounit_property_listener_callback]"}
9 changes: 9 additions & 0 deletions tests/test_bug_features.py
suhaibmujahid marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
BlockedBugsNumber,
BugExtractor,
BugReporter,
BugTypes,
CommentCount,
CommentLength,
Component,
Expand Down Expand Up @@ -178,3 +179,11 @@ def test_BugExtractor():
BugExtractor([HasSTR(), HasSTR()], [fileref(), url()])
with pytest.raises(AssertionError):
BugExtractor([HasSTR(), HasURL()], [fileref(), fileref()])


def test_BugTypes(read) -> None:
    """Check the types inferred for each bug in the ``bug_types.json`` fixture."""
    # One expected result per fixture line, in file order.
    expected_types = [
        ["performance"],
        ["memory"],
        ["power"],
        ["security"],
        ["crash"],
    ]
    read("bug_types.json", BugTypes, expected_types)