Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving the monitoring portability #20

Merged
merged 9 commits into from
May 31, 2024
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Running the **simple.conf** job:
## Mandatory
- python >= 3.9
- [python dependencies](./requirements/base.in)
- turbostat >= 2022.07.28
- turbostat >= 2022.04.16
- numactl
- dmidecode
- util-linux >= 2.32
Expand All @@ -75,4 +75,4 @@ Running the **simple.conf** job:
## Optional
- ipmitool
- ilorest (for HPE servers)
- stress-ng >= 0.17.04
- stress-ng >= 0.17.04
7 changes: 7 additions & 0 deletions hwbench/bench/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Optional

from ..utils.external import External
from ..utils.helpers import fatal
from .parameters import BenchmarkParameters


Expand Down Expand Up @@ -47,6 +48,12 @@ def __init__(
self.engine_name = name
self.binary = binary
self.modules = modules
# FIXME: If the import is done at the file level, the mocking is lost here
# So I'm importing is_binary_available just before the call :/
from ..utils.helpers import is_binary_available

if not is_binary_available(self.binary):
fatal(f"Engine {name} requires '{binary}' binary, please install it.")

def get_binary(self) -> str:
return self.binary
Expand Down
28 changes: 14 additions & 14 deletions hwbench/bench/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,52 +57,52 @@ def __set_metric(self, metric: Metrics, value: dict[str, dict[str, MonitorMetric

def prepare(self):
"""Preparing the monitoring"""
# Let's be sure the monitoring is functional by
# - checking the BMC is actually connected to the network
if self.vendor.get_bmc().get_ip() == "0.0.0.0":
h.fatal("BMC has no IP, monitoring will not be possible")
print(
f"Starting monitoring for {self.vendor.name()} vendor with {self.vendor.get_bmc().get_ip()}"
)
v = self.vendor
bmc = self.vendor.get_bmc()

def check_monitoring(metric: Metrics):
def check_monitoring(source: str, metric: Metrics):
data = self.get_metric(metric)
if not len(data):
h.fatal(f"Cannot detect {str(metric)} metrics")

print(
f"Monitoring {str(metric)} metrics:"
f"Monitoring/{source}: {str(metric)} metrics:"
+ ", ".join(
[f"{len(data[pc])}x{pc}" for pc in data if len(data[pc]) > 0]
)
)

# - checking if the CPU monitoring works
if self.hardware.cpu.get_arch() == "x86_64":
print("Monitoring/turbostat: initialize")
self.turbostat = Turbostat(
self.hardware,
self.get_metric(Metrics.FREQ),
self.get_metric(Metrics.POWER_CONSUMPTION),
)
check_monitoring(Metrics.FREQ)
check_monitoring("turbostat", Metrics.FREQ)

print(
f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} driver @ {bmc.get_ip()}"
)

# - checking if the bmc monitoring works
# These calls will also initialize the datastructures out of the monitoring loop
self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL))
check_monitoring(Metrics.THERMAL)
check_monitoring("BMC", Metrics.THERMAL)

self.vendor.get_bmc().read_fans(self.get_metric(Metrics.FANS))
check_monitoring(Metrics.FANS)
check_monitoring("BMC", Metrics.FANS)

self.vendor.get_bmc().read_power_consumption(
self.get_metric(Metrics.POWER_CONSUMPTION)
)
check_monitoring(Metrics.POWER_CONSUMPTION)
check_monitoring("BMC", Metrics.POWER_CONSUMPTION)

self.vendor.get_bmc().read_power_supplies(
self.get_metric(Metrics.POWER_SUPPLIES)
)
check_monitoring(Metrics.POWER_SUPPLIES)
check_monitoring("BMC", Metrics.POWER_SUPPLIES)

def __monitor_bmc(self):
"""Monitor the bmc metrics"""
Expand Down
14 changes: 9 additions & 5 deletions hwbench/bench/test_benchmarks_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,15 @@ def get_benches(self):

def parse_jobs_config(self, validate_parameters=True):
# We need to mock turbostat when parsing config with monitoring
# We mock the run() command to get a constant output
with patch("hwbench.environment.turbostat.Turbostat.run") as ts:
with open("tests/parsing/turbostat/run", "r") as f:
ts.return_value = ast.literal_eval(f.read())
return self.benches.parse_jobs_config(validate_parameters)
with patch("hwbench.utils.helpers.is_binary_available") as iba:
iba.return_value = True
# We mock the run() and check_version() command to get a constant output
with patch("hwbench.environment.turbostat.Turbostat.check_version") as cv:
cv.return_value = True
with patch("hwbench.environment.turbostat.Turbostat.run") as ts:
with open("tests/parsing/turbostat/run", "r") as f:
ts.return_value = ast.literal_eval(f.read())
return self.benches.parse_jobs_config(validate_parameters)

def get_jobs_config(self) -> config.Config:
return self.jobs_config
Expand Down
4 changes: 3 additions & 1 deletion hwbench/config/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def test_keywords(self):
.read_bytes()
.split(b":", 1)
)
self.get_jobs_config().validate_sections()
with patch("hwbench.utils.helpers.is_binary_available") as iba:
iba.return_value = True
self.get_jobs_config().validate_sections()
except Exception as exc:
assert False, f"'validate_sections' detected a syntax error {exc}"

Expand Down
18 changes: 11 additions & 7 deletions hwbench/engines/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@
def mock_engine(version: str) -> StressNG:
# We need to patch list_module_parameters() function
# to avoid considering the local stress-ng binary
with patch("hwbench.engines.stressng.EngineModuleCpu.list_module_parameters") as p:
p.return_value = (
pathlib.Path(f"./tests/parsing/stressngmethods/{version}/stdout")
.read_bytes()
.split(b":", 1)
)
return StressNG()
with patch("hwbench.utils.helpers.is_binary_available") as iba:
iba.return_value = True
with patch(
"hwbench.engines.stressng.EngineModuleCpu.list_module_parameters"
) as p:
p.return_value = (
pathlib.Path(f"./tests/parsing/stressngmethods/{version}/stdout")
.read_bytes()
.split(b":", 1)
)
return StressNG()


class TestParse(unittest.TestCase):
Expand Down
1 change: 1 addition & 0 deletions hwbench/environment/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def dump(self) -> dict[str, Optional[str | int] | dict]:
return {
"dmi": self.dmi.dump(),
"cpu": self.cpu.dump(),
"bmc": self.vendor.get_bmc().dump(),
}

def cpu_flags(self) -> list[str]:
Expand Down
58 changes: 51 additions & 7 deletions hwbench/environment/turbostat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import os
import re
import subprocess
from enum import Enum
from packaging.version import Version
from ..environment.hardware import BaseHardware
from ..bench.monitoring_structs import MonitorMetric, CPUContext, PowerContext
from ..utils.helpers import is_binary_available, fatal

CORE = "core"
PACKAGE = "package"
Expand Down Expand Up @@ -48,16 +51,49 @@ def __init__(
CPUSTATS.CORE_WATTS,
CPUSTATS.PACKAGE_WATTS,
}
self.min_release = Version("2022.04.16")
self.header = ""
self.freq_metrics = freq_metrics
self.power_metrics = power_metrics
self.hardware = hardware
self.process: subprocess.Popen[bytes] = None # type: ignore[assignment]
self.freq_metrics[str(CPUContext.CPU)] = {} # type: ignore[no-redef]
self.power_metrics[str(PowerContext.CPU)] = {} # type: ignore[no-redef]

# Let's make a first quick run to detect system
self.check_version()
self.pre_run()

def check_version(self):
english_env = os.environ.copy()
english_env["LC_ALL"] = "C"

if not is_binary_available("turbostat"):
fatal("Missing turbostat binary, please install it.")

self.process = subprocess.Popen(
["turbostat", "--version"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
env=english_env,
stdin=subprocess.DEVNULL,
)
# turbostat version 2022.04.16 - Len Brown <[email protected]>
match = re.search(
r"turbostat version (?P<version>[0-9]+\.[0-9]+\.[0-9]+).*",
str(self.get_process_output()),
)

current_version = Version(match.group("version"))
if not match:
fatal("Monitoring/turbostat: Cannot detect turbostat version")

print(f"Monitoring/turbostat: Detected release {current_version}")
if current_version < self.min_release:
fatal(
f"Monitoring/turbostat: minimal expected release is {self.min_release}"
)

def reset_metrics(self, power_metrics=None):
if power_metrics is not None:
self.power_metrics = power_metrics
Expand Down Expand Up @@ -115,14 +151,14 @@ def run(self, interval: float = 1, wait=False):
"-c",
f"{self.hardware.get_cpu().get_logical_cores_count()-1}",
"turbostat",
"-c",
"--cpu",
"core",
"-q",
"--quiet",
"--interval",
str(interval),
"-n",
"--num_iterations",
"1",
"-s",
"--show",
]
sensors = ""
for sensor in CPUSTATS:
Expand Down Expand Up @@ -189,9 +225,17 @@ def parse(self):
items = line.split()
core_nb = items[int(self.__get_field_position(CPUSTATS.CPU))]
if self.has(CPUSTATS.CORE_WATTS):
self.power_metrics[str(PowerContext.CPU)][f"Core_{core_nb}"].add(
float(items[int(self.__get_field_position(CPUSTATS.CORE_WATTS))])
)
try:
self.power_metrics[str(PowerContext.CPU)][f"Core_{core_nb}"].add(
float(
items[int(self.__get_field_position(CPUSTATS.CORE_WATTS))]
)
)
except IndexError:
# Some processors reports the corewatt in the header but not for all cores ...
# So let's ignore if the metrics does not exist for this core
pass

self.freq_metrics[str(CPUContext.CPU)][f"Core_{core_nb}"].add(
float(items[int(self.__get_field_position(CPUSTATS.BUSY_MHZ))])
)
Expand Down
8 changes: 8 additions & 0 deletions hwbench/environment/vendors/vendor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def get_ip(self) -> str:

return ip

def get_driver_name(self) -> str:
"""Return the BMC driver name"""
return type(self).__name__

def dump(self) -> dict[str, str]:
"""Return the dump of the BMC"""
return {"driver": self.get_driver_name()}

def connect_redfish(self):
"""Connect to the BMC using Redfish."""
if not self.vendor.get_monitoring_config_filename():
Expand Down
3 changes: 3 additions & 0 deletions hwbench/tuning/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ def run(self):
for dirname in dirnames:
diskdir = pathlib.Path(rootpath) / dirname
file = diskdir / "queue/scheduler"
# Some block devices like zram do not have scheduler
if not os.path.isfile(file):
continue
previous = file.read_text(encoding="utf-8").rstrip()
# see https://docs.kernel.org/block/switching-sched.html
# for deeper explanation
Expand Down
30 changes: 17 additions & 13 deletions hwbench/utils/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib
import subprocess
from abc import abstractmethod, ABC
from .helpers import fatal


class External(ABC):
Expand Down Expand Up @@ -39,24 +40,27 @@ def run(self):
"""Returns the output of parse_cmd (a json-able type)"""
english_env = os.environ.copy()
english_env["LC_ALL"] = "C"
if self.run_cmd_version():
ver = subprocess.run(
self.run_cmd_version(),
try:
if self.run_cmd_version():
ver = subprocess.run(
self.run_cmd_version(),
capture_output=True,
cwd=self.out_dir,
env=english_env,
stdin=subprocess.DEVNULL,
)
self._write_output("version-stdout", ver.stdout)
self._write_output("version-stderr", ver.stderr)
self.parse_version(ver.stdout, ver.stderr)
out = subprocess.run(
self.run_cmd(),
capture_output=True,
cwd=self.out_dir,
env=english_env,
stdin=subprocess.DEVNULL,
)
self._write_output("version-stdout", ver.stdout)
self._write_output("version-stderr", ver.stderr)
self.parse_version(ver.stdout, ver.stderr)
out = subprocess.run(
self.run_cmd(),
capture_output=True,
cwd=self.out_dir,
env=english_env,
stdin=subprocess.DEVNULL,
)
except FileNotFoundError as e:
fatal(f"Missing {e.filename} binary, please install it.")
# save outputs

self._write_output("stdout", out.stdout)
Expand Down
6 changes: 6 additions & 0 deletions hwbench/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import sys
from datetime import timedelta
from shutil import which
from typing import NoReturn


Expand All @@ -24,3 +25,8 @@ def time_to_next_sync(safe_start=True):
# Let's bump to the next minute o'clock
next_sync += timedelta(seconds=60 - next_sync.second)
return (next_sync - now).total_seconds(), next_sync


def is_binary_available(binary_name: str) -> bool:
"""A function to check if a binary is available"""
return which(binary_name) is not None
Loading