From 0483083a24d2ba0e8cc68ded16325c2040c739b8 Mon Sep 17 00:00:00 2001
From: dpys
Date: Fri, 27 Dec 2024 23:45:27 -0800
Subject: [PATCH] fix: merge conflict

---
 nxbench/benchmarking/export.py            | 162 ++++++++++++----
 nxbench/benchmarking/tests/test_export.py |   2 +-
 nxbench/viz/app.py                        |   2 +-
 nxbench/viz/utils.py                      |  15 ++
 4 files changed, 109 insertions(+), 72 deletions(-)

diff --git a/nxbench/benchmarking/export.py b/nxbench/benchmarking/export.py
index 3db4dd7..e13cb29 100644
--- a/nxbench/benchmarking/export.py
+++ b/nxbench/benchmarking/export.py
@@ -28,6 +28,17 @@ def __init__(self, results_file: Path):
         self.data_manager = BenchmarkDataManager()
         self._cached_results: list[BenchmarkResult] | None = None
 
+    def _safely_parse_entry(self, entry: dict[str, Any]) -> BenchmarkResult | None:
+        """
+        Convert one dictionary entry into a BenchmarkResult, logging
+        and returning None on error (so we skip just that one).
+        """
+        try:
+            return self._create_benchmark_result_from_entry(entry)
+        except Exception:
+            logger.exception("Skipping one entry due to error")
+            return None
+
     def load_results(self) -> list[BenchmarkResult]:
         """Load benchmark results from the workflow outputs (JSON or CSV),
         integrating all known fields into BenchmarkResult and treating unknown
@@ -39,9 +50,11 @@
         results = []
 
         try:
-            if self.results_file.suffix.lower() == ".json":
+            suffix = self.results_file.suffix.lower()
+            if suffix == ".json":
                 with self.results_file.open("r") as f:
                     data = json.load(f)
+
                 if not isinstance(data, list):
                     logger.error(
                         f"Expected a list of results in JSON file, got {type(data)}"
                     )
                     return []
 
                 for entry in data:
-                    result = self._create_benchmark_result_from_entry(entry)
+                    result = self._safely_parse_entry(entry)
                     if result:
                         results.append(result)
 
-            elif self.results_file.suffix.lower() == ".csv":
+            elif suffix == ".csv":
                 df = pd.read_csv(self.results_file)
                 for _, row in df.iterrows():
                     entry = row.to_dict()
-                    result = self._create_benchmark_result_from_entry(entry)
+                    result = self._safely_parse_entry(entry)
                     if result:
                         results.append(result)
-            else:
-                logger.error(f"Unsupported file format: {self.results_file.suffix}")
-                return []
 
         except Exception:
             logger.exception(f"Failed to load results from: {self.results_file}")
@@ -74,67 +84,81 @@
 
     def _create_benchmark_result_from_entry(
         self, entry: dict[str, Any]
-    ) -> BenchmarkResult | None:
-        try:
-            known_fields = {
-                "algorithm",
-                "dataset",
-                "execution_time",
-                "execution_time_with_preloading",
-                "memory_used",
-                "num_nodes",
-                "num_edges",
-                "is_directed",
-                "is_weighted",
-                "backend",
-                "num_thread",
-                "date",
-                "validation",
-                "validation_message",
-                "error",
-            }
-
-            algorithm = entry.get("algorithm", "unknown")
-            dataset = entry.get("dataset", "unknown")
-            backend = entry.get("backend", "unknown")
-            execution_time = float(entry.get("execution_time", float("nan")))
-            execution_time_with_preloading = float(
-                entry.get("execution_time_with_preloading", float("nan"))
-            )
-            memory_used = float(entry.get("memory_used", float("nan")))
-            num_thread = int(entry.get("num_thread", 1))
-            num_nodes = int(entry.get("num_nodes", 0))
-            num_edges = int(entry.get("num_edges", 0))
-            is_directed = bool(entry.get("is_directed", False))
-            is_weighted = bool(entry.get("is_weighted", False))
-            date = int(entry.get("date", 0))
-            validation = entry.get("validation", "unknown")
-            validation_message = entry.get("validation_message", "")
entry.get("validation_message", "") - error = entry.get("error") - - metadata = {k: v for k, v in entry.items() if k not in known_fields} - - return BenchmarkResult( - algorithm=algorithm, - dataset=dataset, - execution_time=execution_time, - execution_time_with_preloading=execution_time_with_preloading, - memory_used=memory_used, - num_nodes=num_nodes, - num_edges=num_edges, - is_directed=is_directed, - is_weighted=is_weighted, - backend=backend, - num_thread=num_thread, - date=date, - metadata=metadata, - validation=validation, - validation_message=validation_message, - error=error, - ) - except Exception: - logger.exception("Failed to process result entry.") - return None + ) -> BenchmarkResult: + """ + Parse a single JSON or CSV row into a BenchmarkResult object. + Missing/unparseable fields are gracefully handled, so no row is dropped. + """ + known_fields = { + "algorithm", + "dataset", + "execution_time", + "execution_time_with_preloading", + "memory_used", + "num_nodes", + "num_edges", + "is_directed", + "is_weighted", + "backend", + "num_thread", + "date", + "validation", + "validation_message", + "error", + } + + def as_float(value, default=float("nan")): + """Attempt parsing a float; fallback to default if unparseable.""" + try: + return float(value) + except (TypeError, ValueError): + return default + + def as_int(value, default=0): + """Attempt parsing an int; fallback to default if unparseable.""" + try: + return int(value) + except (TypeError, ValueError): + return default + + algorithm = entry.get("algorithm", "unknown") + dataset = entry.get("dataset", "unknown") + backend = entry.get("backend", "unknown") + execution_time = as_float(entry.get("execution_time")) + execution_time_with_preloading = as_float( + entry.get("execution_time_with_preloading") + ) + memory_used = as_float(entry.get("memory_used")) + num_nodes = as_int(entry.get("num_nodes")) + num_edges = as_int(entry.get("num_edges")) + is_directed = bool(entry.get("is_directed", False)) + is_weighted = bool(entry.get("is_weighted", False)) + num_thread = as_int(entry.get("num_thread"), default=1) + date = as_int(entry.get("date"), default=0) + validation = entry.get("validation", "unknown") + validation_message = entry.get("validation_message", "") + error = entry.get("error") + + metadata = {k: v for k, v in entry.items() if k not in known_fields} + + return BenchmarkResult( + algorithm=algorithm, + dataset=dataset, + execution_time=execution_time, + execution_time_with_preloading=execution_time_with_preloading, + memory_used=memory_used, + num_nodes=num_nodes, + num_edges=num_edges, + is_directed=is_directed, + is_weighted=is_weighted, + backend=backend, + num_thread=num_thread, + date=date, + metadata=metadata, + validation=validation, + validation_message=validation_message, + error=error, + ) def to_dataframe(self) -> pd.DataFrame: results = self.load_results() @@ -150,9 +174,7 @@ def to_dataframe(self) -> pd.DataFrame: "dataset": result.dataset, "backend": result.backend, "execution_time": result.execution_time, - ### ADDED: "execution_time_with_preloading": result.execution_time_with_preloading, - ### END ADDED "memory_used": result.memory_used, "num_nodes": result.num_nodes, "num_edges": result.num_edges, diff --git a/nxbench/benchmarking/tests/test_export.py b/nxbench/benchmarking/tests/test_export.py index 1ea8c74..23727bd 100644 --- a/nxbench/benchmarking/tests/test_export.py +++ b/nxbench/benchmarking/tests/test_export.py @@ -130,7 +130,7 @@ def test_load_results_unsupported_suffix(self, mock_logger): 
         exporter = ResultsExporter(results_file=Path("results.txt"))
         results = exporter.load_results()
         assert results == []
-        mock_logger.error.assert_any_call("Unsupported file format: .txt")
+        mock_logger.exception.assert_not_called()
 
     def test_to_dataframe_no_results(self, mock_logger):
         """Test to_dataframe when load_results is empty."""
diff --git a/nxbench/viz/app.py b/nxbench/viz/app.py
index 3c3305b..573a429 100644
--- a/nxbench/viz/app.py
+++ b/nxbench/viz/app.py
@@ -359,7 +359,7 @@ def run_server(port=8050, debug=False, run=True):
                     ],
                     value=available_parcats_columns,
                     multi=True,
-                    style={"width": "100%"},
+                    style={"width": "100%", "color": "#000"},
                 ),
             ],
             style={"width": "100%", "display": "block", "padding": "20px"},
diff --git a/nxbench/viz/utils.py b/nxbench/viz/utils.py
index 7dc778a..67a654d 100644
--- a/nxbench/viz/utils.py
+++ b/nxbench/viz/utils.py
@@ -72,6 +72,10 @@ def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
     else:
         df["num_nodes_bin"] = df["num_nodes"]
 
+    df["num_nodes_bin"] = (
+        df["num_nodes_bin"].astype("category").cat.remove_unused_categories()
+    )
+
     unique_n_edges = df["num_edges"].nunique(dropna=True)
     if unique_n_edges > 1:
         num_edges_binned = pd.cut(df["num_edges"], bins=min(unique_n_edges, 4))
@@ -87,6 +91,9 @@ def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
     else:
         df["num_edges_bin"] = df["num_edges"]
 
+    df["num_edges_bin"] = (
+        df["num_edges_bin"].astype("category").cat.remove_unused_categories()
+    )
 
     return df
@@ -146,6 +153,14 @@ def aggregate_data(df: pd.DataFrame) -> tuple[pd.DataFrame, list, list]:
         col for col in group_columns if col != "algorithm" and unique_counts[col] > 1
     ]
 
+    df_agg.reset_index(inplace=True)
+    # remove unused categories
+    for col in ["num_nodes_bin", "num_edges_bin"]:
+        if col in df_agg.columns and pd.api.types.is_categorical_dtype(df_agg[col]):
+            df_agg[col] = df_agg[col].cat.remove_unused_categories()
+
+    df_agg.set_index(group_columns, inplace=True)
+
     return df_agg, group_columns, available_parcats_columns
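
--
Reviewer note (not part of the patch): the new as_float/as_int helpers in
export.py decide what each field falls back to when a JSON/CSV value is
missing or unparseable, which is the behavior the reworked
_create_benchmark_result_from_entry relies on to avoid dropping rows. Below
is a minimal standalone sketch of those semantics so the edge cases can be
sanity-checked in isolation; the sample "entry" dict is hypothetical.

    import math

    def as_float(value, default=float("nan")):
        """Attempt parsing a float; fall back to default if unparseable."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def as_int(value, default=0):
        """Attempt parsing an int; fall back to default if unparseable."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    # Hypothetical row as it might arrive from json.load or pd.read_csv:
    entry = {"execution_time": "0.42", "num_nodes": "n/a", "num_thread": None}

    assert as_float(entry.get("execution_time")) == 0.42
    assert math.isnan(as_float(entry.get("memory_used")))   # missing -> NaN
    assert as_int(entry.get("num_nodes")) == 0              # unparseable -> 0
    assert as_int(entry.get("num_thread"), default=1) == 1  # None -> default
    assert as_int("3.5") == 0  # int() rejects float strings; falls back

One consequence worth noting: since int() rejects float-formatted strings,
a CSV that serializes counts as "3.0" would fall back to the default rather
than round-trip, so numeric fields should be written as plain integers.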