diff --git a/nxbench/benchmarks/benchmark.py b/nxbench/benchmarks/benchmark.py index cb2a74a..13de097 100644 --- a/nxbench/benchmarks/benchmark.py +++ b/nxbench/benchmarks/benchmark.py @@ -61,18 +61,16 @@ def make_benchmark_method(algo_config, dataset_name, backend, num_thread): and number of threads combination. """ algo_name = algo_config.name - orig_dataset_name = dataset_name - method_name = f"track_{algo_name}_{orig_dataset_name}_{backend}_{num_thread}" + safe_dataset_name = dataset_name.replace("-", "_") + method_name = f"track_{algo_name}_{safe_dataset_name}_{backend}_{num_thread}" def track_method(self): """Run benchmark and return metrics for the unique combination.""" logger.debug( f"Starting track_method for {method_name} with backend={backend}, " - f"threads={num_thread}, dataset={orig_dataset_name}" - ) - metrics = self.do_benchmark( - algo_config, orig_dataset_name, backend, num_thread + f"threads={num_thread}, dataset={dataset_name}" ) + metrics = self.do_benchmark(algo_config, dataset_name, backend, num_thread) logger.debug(f"Track {method_name} results: {metrics}") return metrics diff --git a/nxbench/cli.py b/nxbench/cli.py index e611819..aeb591a 100644 --- a/nxbench/cli.py +++ b/nxbench/cli.py @@ -110,7 +110,7 @@ def get_latest_commit_hash(github_url: str) -> str: def safe_run( cmd: Sequence[str | Path], - check: bool = True, + check: bool = False, capture_output: bool = False, **kwargs, ) -> subprocess.CompletedProcess: @@ -121,7 +121,7 @@ def safe_run( ---------- cmd : Sequence[str | Path] The command and arguments to execute. - check : bool, default=True + check : bool, default=False If True, raise an exception if the command fails. capture_output : bool, default=False If True, capture stdout and stderr. 
diff --git a/nxbench/configs/asv.conf.json b/nxbench/configs/asv.conf.json index 1f76a1b..947501e 100644 --- a/nxbench/configs/asv.conf.json +++ b/nxbench/configs/asv.conf.json @@ -5,10 +5,10 @@ "project_url": "https://github.com/dpys/nxbench", "dvcs": "git", "branches": [ - "main" + "cli-gotchas" ], "repo": "https://github.com/dpys/nxbench", - "environment_type": "conda", + "environment_type": "virtualenv", "show_commit_url": "https://github.com/dpys/nxbench/commit/", "matrix": {}, "benchmark_dir": "nxbench/benchmarks", diff --git a/nxbench/configs/dummy.yaml b/nxbench/configs/dummy.yaml index 9258547..309b939 100644 --- a/nxbench/configs/dummy.yaml +++ b/nxbench/configs/dummy.yaml @@ -1,14 +1,11 @@ algorithms: - - name: "pagerank" - func: "networkx.pagerank" - params: - alpha: 0.9 - tol: 1.0e-6 + - name: "average_clustering" + func: "networkx.average_clustering" + params: {} requires_directed: false - groups: ["centrality", "random_walk"] - min_rounds: 10 - warmup: true - warmup_iterations: 50 + groups: ["clustering", "graph_structure"] + min_rounds: 3 + validate_result: "nxbench.validation.validate_scalar_result" datasets: - name: "erdos_renyi_small" @@ -35,9 +32,10 @@ matrix: - "1" env_config: - repo: "https://github.com/dpys/nxbench.git" - branches: - - "main" req: - "networkx==3.4.2" + - "nx-parallel-0.3rc0.dev0" - "graphblas_algorithms==2023.10.0" + pythons: + - "3.10" + - "3.11" diff --git a/nxbench/configs/all.yaml b/nxbench/configs/example2.yaml similarity index 78% rename from nxbench/configs/all.yaml rename to nxbench/configs/example2.yaml index 9b765c5..03ad96e 100644 --- a/nxbench/configs/all.yaml +++ b/nxbench/configs/example2.yaml @@ -19,17 +19,17 @@ algorithms: warmup_iterations: 20 validate_result: "nxbench.validation.validate_node_scores" - - name: "betweenness_centrality" - func: "networkx.betweenness_centrality" - params: - normalized: true - endpoints: false - requires_directed: false - groups: ["centrality", "path_based"] - min_rounds: 5 - 
warmup: true - warmup_iterations: 20 - validate_result: "nxbench.validation.validate_node_scores" + # - name: "betweenness_centrality" + # func: "networkx.betweenness_centrality" + # params: + # normalized: true + # endpoints: false + # requires_directed: false + # groups: ["centrality", "path_based"] + # min_rounds: 5 + # warmup: true + # warmup_iterations: 20 + # validate_result: "nxbench.validation.validate_node_scores" # - name: "edge_betweenness_centrality" # func: "networkx.edge_betweenness_centrality" @@ -160,45 +160,21 @@ algorithms: # min_rounds: 3 datasets: - - name: "twitter" - source: "networkrepository" - params: {} - - - name: "08blocks" - source: "networkrepository" - params: {} - - - name: "amazon" - source: "networkrepository" - params: {} - - - name: "google" - source: "networkrepository" - params: {} - - - name: "enron" - source: "networkrepository" - params: {} + # - name: "enron" + # source: "networkrepository" + # params: {} - - name: "citationCiteseer" - source: "networkrepository" - params: {} + # - name: "citationCiteseer" + # source: "networkrepository" + # params: {} - name: "karate" source: "networkrepository" params: {} - - name: "netscience" - source: "networkrepository" - params: {} - - - name: "email-Eu-core" - source: "networkrepository" - params: {} - - - name: "jazz" - source: "networkrepository" - params: {} + # - name: "patentcite" + # source: "networkrepository" + # params: {} - name: "erdos_renyi_small" source: "generator" @@ -214,43 +190,43 @@ datasets: source: "generator" params: generator: "networkx.watts_strogatz_graph" - n: 10000 + n: 1000 k: 6 p: 0.1 metadata: directed: false weighted: false - - name: "barabasi_albert_small" - source: "generator" - params: - generator: "networkx.barabasi_albert_graph" - n: 1000 - m: 3 - metadata: - directed: false - weighted: false + # - name: "barabasi_albert_small" + # source: "generator" + # params: + # generator: "networkx.barabasi_albert_graph" + # n: 1000 + # m: 3 + # metadata: + # 
directed: false + # weighted: false - - name: "powerlaw_cluster_small" - source: "generator" - params: - generator: "networkx.powerlaw_cluster_graph" - n: 1000 - m: 2 - p: 0.1 - metadata: - directed: false - weighted: false + # - name: "powerlaw_cluster_small" + # source: "generator" + # params: + # generator: "networkx.powerlaw_cluster_graph" + # n: 1000 + # m: 2 + # p: 0.1 + # metadata: + # directed: false + # weighted: false - - name: "erdos_renyi_small" - source: "generator" - params: - generator: "networkx.erdos_renyi_graph" - n: 1000 - p: 0.01 - metadata: - directed: false - weighted: false + # - name: "erdos_renyi_large" + # source: "generator" + # params: + # generator: "networkx.erdos_renyi_graph" + # n: 10000 + # p: 0.01 + # metadata: + # directed: false + # weighted: false # - name: "watts_strogatz_large" # source: "generator" @@ -295,7 +271,7 @@ matrix: - "networkx" - "parallel" - "graphblas" - - "cugraph" + # - "cugraph" num_threads: - "1" - "2" @@ -306,7 +282,7 @@ env_config: - "networkx==3.4.2" - "nx-parallel-0.3rc0.dev0" - "graphblas_algorithms==2023.10.0" - - "nx-cugraph_cu11==24.8.0" + # - "nx-cugraph_cu11==24.8.0" pythons: - "3.10" - "3.11" diff --git a/nxbench/data/loader.py b/nxbench/data/loader.py index 86160a4..f0ad4e0 100644 --- a/nxbench/data/loader.py +++ b/nxbench/data/loader.py @@ -14,6 +14,7 @@ from nxbench.benchmarks.config import DatasetConfig from nxbench.data.synthesize import generate_graph +from nxbench.data.utils import detect_delimiter, fix_matrix_market_file warnings.filterwarnings("ignore") @@ -36,7 +37,7 @@ def __init__(self, data_dir: str | Path | None = None): self._metadata_df = self._load_metadata() def _normalize_name(self, name: str) -> str: - return name.lower().replace("-", "_") + return name.lower().strip().replace("-", "_") def _load_metadata(self) -> pd.DataFrame: try: @@ -68,6 +69,7 @@ async def load_network( return self._generate_graph(config) metadata = self.get_metadata(config.name) + if config.name in 
self._network_cache: logger.debug(f"Loading network '{config.name}' from cache") return self._network_cache[config.name] @@ -117,8 +119,32 @@ def check_graph_validity(graph, file_path): suffix = graph_file.suffix.lower() if suffix == ".mtx": logger.info(f"Loading Matrix Market file from {graph_file}") + graph_path = Path(graph_file) + corrected_file = graph_path.with_name( + f"{graph_path.stem}_corrected{graph_path.suffix}" + ) + try: - sparse_matrix = mmread(graph_file) + # check if the corrected file already exists + if corrected_file.exists(): + logger.info( + f"Using existing corrected Matrix Market file: " + f"{corrected_file}" + ) + sparse_matrix = mmread(corrected_file) + else: + try: + # attempt to read the original file + sparse_matrix = mmread(graph_file) + except Exception: + logger.info(f"Fixing Matrix Market file: {graph_file}") + # fix the file and load the corrected version + corrected_file = fix_matrix_market_file(graph_path) + sparse_matrix = mmread(corrected_file) + except Exception: + logger.exception(f"Failed to load Matrix Market file {graph_file}") + raise ValueError("Matrix Market file not in expected format") + else: graph = nx.from_scipy_sparse_array( sparse_matrix, create_using=( @@ -127,14 +153,18 @@ def check_graph_validity(graph, file_path): else nx.Graph() ), ) - check_graph_validity(graph, graph_file) - except ValueError: - logger.exception(f"Failed to load Matrix Market file {graph_file}") - raise ValueError("Matrix Market file not in expected format") - except Exception: - logger.exception(f"Failed to load Matrix Market file {graph_file}") - raise + graph.graph.update(metadata) + return graph elif suffix in [".edgelist", ".edges"]: + try: + delimiter = detect_delimiter(graph_file) + logger.debug(f"Detected delimiter: '{delimiter}'") + except Exception: + logger.debug( + "No valid delimiter found, falling back to whitespace split" + ) + delimiter = " " + create_using = ( nx.DiGraph() if metadata.get("directed", False) else nx.Graph() 
) @@ -179,6 +209,7 @@ def edge_parser(): ) graph = nx.read_edgelist( edge_iter, + delimiter=delimiter, nodetype=str, create_using=create_using, data=False, @@ -194,6 +225,7 @@ def edge_parser(): ) graph = nx.read_edgelist( edge_iter, + delimiter=delimiter, nodetype=str, create_using=create_using, data=False, @@ -211,6 +243,7 @@ def edge_parser(): ) graph = nx.read_edgelist( edge_iter, + delimiter=delimiter, nodetype=str, create_using=create_using, data=False, @@ -245,6 +278,8 @@ def edge_parser(): raise else: graph.graph.update(metadata) + if graph.number_of_edges() == 0: + raise ValueError(f"Graph file {graph_file} contains no valid edges.") logger.info(f"Loaded network from '{graph_file}' successfully.") return graph diff --git a/nxbench/data/repository.py b/nxbench/data/repository.py index a45b70a..15c0ffc 100644 --- a/nxbench/data/repository.py +++ b/nxbench/data/repository.py @@ -14,6 +14,7 @@ from urllib.parse import urljoin import aiofiles +import aiofiles.os import aiohttp import chardet from aiohttp import ClientSession, ClientTimeout @@ -176,7 +177,7 @@ async def __aexit__(self, exc_type, exc, tb): async def _fetch_text( self, url: str, method: str = "GET", retries: int = 3, **kwargs - ) -> str: + ) -> str | None: """Fetch the text content of a URL using aiohttp with retries and robust encoding handling. 
""" @@ -252,7 +253,7 @@ async def _fetch_text( async def _fetch_response( self, url: str, method: str = "GET", retries: int = 3, **kwargs - ) -> aiohttp.ClientResponse: + ) -> aiohttp.ClientResponse | None: """Fetch the response object of a URL using aiohttp with retries.""" if not self.session: raise RuntimeError("HTTP session is not initialized.") diff --git a/nxbench/data/tests/test_loader.py b/nxbench/data/tests/test_loader.py index b820fd9..6d62799 100644 --- a/nxbench/data/tests/test_loader.py +++ b/nxbench/data/tests/test_loader.py @@ -209,7 +209,7 @@ async def test_load_unweighted_with_comments(data_manager, create_edge_file): graph, (nx.Graph, nx.DiGraph) ), "Graph should be NetworkX Graph or DiGraph" assert graph.number_of_nodes() == 4, "Graph should have 4 nodes" - assert graph.number_of_edges() == 3, "Graph should have 3 edges" + assert graph.number_of_edges() == 2, "Graph should have 2 edges" for u, v, data in graph.edges(data=True): assert "weight" not in data, f"Edge ({u}, {v}) should not have a 'weight'" @@ -629,15 +629,21 @@ async def test_generate_graph_exception(data_manager): metadata={"directed": False, "weighted": False}, ) - with patch( - "nxbench.data.loader.generate_graph", side_effect=Exception("Generator failed") - ) as mock_generate_graph: - with pytest.raises(Exception, match="Generator failed"): - await data_manager.load_network(config) - - mock_generate_graph.assert_called_once_with( - "networkx.invalid_generator", {"n": 100, "p": 0.1}, False - ) + with patch.object( + data_manager, + "get_metadata", + return_value={"directed": False, "weighted": False}, + ): + with patch( + "nxbench.data.loader.generate_graph", + side_effect=Exception("Generator failed"), + ) as mock_generate_graph: + with pytest.raises(Exception, match="Generator failed"): + await data_manager.load_network(config) + + mock_generate_graph.assert_called_once_with( + "networkx.invalid_generator", {"n": 100, "p": 0.1}, False + ) def 
def fix_matrix_market_file(in_path: Path) -> Path:
    """Write a copy of a coordinate Matrix Market file with a repaired size line.

    The first non-blank, non-comment line after the ``%%MatrixMarket`` banner
    is treated as the size line. If it already holds three fields the file is
    copied unchanged. If it holds fewer (i.e. the entry count is missing), the
    number of rows, columns and entries is inferred from the data lines and a
    new ``M N NNZ`` size line is written.

    Parameters
    ----------
    in_path : Path
        Path to the Matrix Market file to repair.

    Returns
    -------
    Path
        Path of the written file, ``<stem>_corrected<suffix>`` next to
        ``in_path``.

    Raises
    ------
    FileNotFoundError
        If ``in_path`` does not exist or is not a regular file.
    ValueError
        If the banner is missing, the file is not in coordinate format, no
        size/data lines are present, or a data line cannot be parsed as two
        integer coordinates.
    """
    if not in_path.exists() or not in_path.is_file():
        raise FileNotFoundError(
            f"Input file '{in_path!s}' does not exist or is not a file."
        )

    with in_path.open("r") as f:
        lines = [line.rstrip("\n") for line in f]

    header_index = next(
        (i for i, line in enumerate(lines) if line.startswith("%%MatrixMarket")),
        None,
    )
    if header_index is None:
        raise ValueError("No %%MatrixMarket header line found.")

    # The qualifiers on the banner line are matched case-insensitively, so
    # both checks below run against the lower-cased header.
    header_line = lines[header_index].lower()
    if "coordinate" not in header_line:
        raise ValueError(
            "This fix only applies to coordinate format Matrix Market files."
        )
    symmetric = "symmetric" in header_line

    after_header = lines[header_index + 1 :]

    # Keep the position of every payload line so the size line never has to be
    # searched for again by text (which breaks on stray whitespace).
    # Whitespace-only lines are skipped rather than parsed as data.
    payload = []
    for pos, text in enumerate(after_header):
        stripped = text.strip()
        if stripped and not stripped.startswith("%"):
            payload.append((pos, text))

    if not payload:
        raise ValueError("No dimension or data lines found after header and comments.")

    dim_pos, dimension_line = payload[0]
    parts = dimension_line.split()

    out_file_path = in_path.with_name(f"{in_path.stem}_corrected{in_path.suffix}")

    if len(parts) == 3:
        # Size line is already complete: emit an unchanged copy.
        out_file_path.write_text("\n".join(lines) + "\n")
        return out_file_path

    if len(parts) < 2:
        raise ValueError(
            f"Dimension line '{dimension_line}' does not have enough integers."
        )

    data_lines = [text for _, text in payload[1:]]
    if not data_lines:
        raise ValueError("No data lines found; cannot infer NNZ, M, N.")

    # Infer M and N from the largest 1-based row/column index seen.
    max_row = 0
    max_col = 0
    for line in data_lines:
        coords = line.split()
        if len(coords) < 2:
            raise ValueError(f"Data line '{line}' does not have two coordinates.")
        r, c = map(int, coords[:2])
        max_row = max(max_row, r)
        max_col = max(max_col, c)
    nnz = len(data_lines)

    # A symmetric matrix must be square; widen to the larger dimension.
    if symmetric and max_row != max_col:
        max_row = max_col = max(max_row, max_col)

    # Only comments that preceded the size line are carried over.
    comment_lines_before_dim = [
        text for text in after_header[:dim_pos] if text.lstrip().startswith("%")
    ]

    output_lines = [
        *lines[: header_index + 1],
        *comment_lines_before_dim,
        f"{max_row} {max_col} {nnz}",
        *data_lines,
    ]
    out_file_path.write_text("\n".join(output_lines) + "\n")

    return out_file_path


def detect_delimiter(file_path: Path, sample_size: int = 5) -> str:
    """Detect the most common delimiter in the first few data lines of a file.

    Blank lines and lines starting with ``#`` or ``%`` are skipped and do not
    count towards ``sample_size``, so a long comment preamble does not prevent
    detection.

    Parameters
    ----------
    file_path : Path
        File to inspect.
    sample_size : int, default=5
        Maximum number of data lines to examine.

    Returns
    -------
    str
        One of ``","``, ``"\\t"``, ``" "`` or ``";"``; the candidate with the
        highest total occurrence count across the sampled lines. Ties resolve
        to the candidate that was seen first.

    Raises
    ------
    ValueError
        If none of the candidate delimiters occurs in the sampled lines.
    """
    delimiters = (",", "\t", " ", ";")
    delimiter_counts = Counter()
    sampled = 0

    with file_path.open("r") as f:
        for raw_line in f:
            if sampled >= sample_size:
                break
            line = raw_line.strip()
            if not line or line.startswith(("#", "%")):
                continue
            sampled += 1
            for delimiter in delimiters:
                occurrences = line.count(delimiter)
                # Only record delimiters that actually occur so that absent
                # candidates never enter the counter with a zero count.
                if occurrences:
                    delimiter_counts[delimiter] += occurrences

    if delimiter_counts:
        return delimiter_counts.most_common(1)[0][0]

    raise ValueError("No valid delimiter found in the file.")