Merge pull request #17 from dPys/cli-gotchas

Cli gotchas

dPys authored Dec 13, 2024
2 parents 2a0e09b + f0ce92a · commit 9babc43

Showing 10 changed files with 276 additions and 118 deletions.
10 changes: 4 additions & 6 deletions nxbench/benchmarks/benchmark.py
@@ -61,18 +61,16 @@ def make_benchmark_method(algo_config, dataset_name, backend, num_thread):
     and number of threads combination.
     """
     algo_name = algo_config.name
-    orig_dataset_name = dataset_name
-    method_name = f"track_{algo_name}_{orig_dataset_name}_{backend}_{num_thread}"
+    safe_dataset_name = dataset_name.replace("-", "_")
+    method_name = f"track_{algo_name}_{safe_dataset_name}_{backend}_{num_thread}"
 
     def track_method(self):
         """Run benchmark and return metrics for the unique combination."""
         logger.debug(
             f"Starting track_method for {method_name} with backend={backend}, "
-            f"threads={num_thread}, dataset={orig_dataset_name}"
-        )
-        metrics = self.do_benchmark(
-            algo_config, orig_dataset_name, backend, num_thread
+            f"threads={num_thread}, dataset={dataset_name}"
         )
+        metrics = self.do_benchmark(algo_config, dataset_name, backend, num_thread)
         logger.debug(f"Track {method_name} results: {metrics}")
         return metrics
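Why this change matters: ASV discovers these generated track_* methods by attribute name, and a dataset name containing dashes (e.g. "email-Eu-core") would otherwise produce a method name that is not a valid Python identifier. A minimal sketch of the naming rule (the standalone helper below is illustrative, not part of the codebase):

    def make_track_name(algo_name, dataset_name, backend, num_thread):
        """Build an ASV track method name; dashes must become underscores."""
        safe_dataset_name = dataset_name.replace("-", "_")
        return f"track_{algo_name}_{safe_dataset_name}_{backend}_{num_thread}"

    # "email-Eu-core" -> "track_pagerank_email_Eu_core_networkx_1"
    assert make_track_name("pagerank", "email-Eu-core", "networkx", 1).isidentifier()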
4 changes: 2 additions & 2 deletions nxbench/cli.py
@@ -110,7 +110,7 @@ def get_latest_commit_hash(github_url: str) -> str:
 
 def safe_run(
     cmd: Sequence[str | Path],
-    check: bool = True,
+    check: bool = False,
     capture_output: bool = False,
     **kwargs,
 ) -> subprocess.CompletedProcess:
@@ -121,7 +121,7 @@ def safe_run(
     ----------
     cmd : Sequence[str | Path]
         The command and arguments to execute.
-    check : bool, default=True
+    check : bool, default=False
         If True, raise an exception if the command fails.
     capture_output : bool, default=False
         If True, capture stdout and stderr.
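For context, a minimal sketch of what a wrapper with this signature typically looks like; only the signature and docstring fragments above come from the diff, the body here is an assumption:

    import subprocess
    from collections.abc import Sequence
    from pathlib import Path

    def safe_run(
        cmd: Sequence[str | Path],
        check: bool = False,
        capture_output: bool = False,
        **kwargs,
    ) -> subprocess.CompletedProcess:
        """Run a command without a shell, stringifying Path arguments."""
        # shell=False avoids shell injection; the list form keeps arguments intact
        return subprocess.run(
            [str(c) for c in cmd],
            check=check,
            capture_output=capture_output,
            shell=False,
            **kwargs,
        )

With check=False as the new default, a failing command no longer raises CalledProcessError; callers that care must inspect the returned CompletedProcess.returncode themselves.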
4 changes: 2 additions & 2 deletions nxbench/configs/asv.conf.json
@@ -5,10 +5,10 @@
     "project_url": "https://github.com/dpys/nxbench",
     "dvcs": "git",
     "branches": [
-        "main"
+        "cli-gotchas"
     ],
     "repo": "https://github.com/dpys/nxbench",
-    "environment_type": "conda",
+    "environment_type": "virtualenv",
     "show_commit_url": "https://github.com/dpys/nxbench/commit/",
     "matrix": {},
     "benchmark_dir": "nxbench/benchmarks",
22 changes: 10 additions & 12 deletions nxbench/configs/dummy.yaml
@@ -1,14 +1,11 @@
 algorithms:
-  - name: "pagerank"
-    func: "networkx.pagerank"
-    params:
-      alpha: 0.9
-      tol: 1.0e-6
+  - name: "average_clustering"
+    func: "networkx.average_clustering"
+    params: {}
     requires_directed: false
-    groups: ["centrality", "random_walk"]
-    min_rounds: 10
-    warmup: true
-    warmup_iterations: 50
+    groups: ["clustering", "graph_structure"]
+    min_rounds: 3
+    validate_result: "nxbench.validation.validate_scalar_result"
 
 datasets:
   - name: "erdos_renyi_small"
@@ -35,9 +32,10 @@ matrix:
     - "1"
 
 env_config:
+  repo: "https://github.com/dpys/nxbench.git"
+  branches:
+    - "main"
   req:
     - "networkx==3.4.2"
     - "nx-parallel-0.3rc0.dev0"
     - "graphblas_algorithms==2023.10.0"
   pythons:
     - "3.10"
     - "3.11"
126 changes: 51 additions & 75 deletions nxbench/configs/all.yaml → nxbench/configs/example2.yaml
@@ -19,17 +19,17 @@ algorithms:
     warmup_iterations: 20
     validate_result: "nxbench.validation.validate_node_scores"
 
-  - name: "betweenness_centrality"
-    func: "networkx.betweenness_centrality"
-    params:
-      normalized: true
-      endpoints: false
-    requires_directed: false
-    groups: ["centrality", "path_based"]
-    min_rounds: 5
-    warmup: true
-    warmup_iterations: 20
-    validate_result: "nxbench.validation.validate_node_scores"
+  # - name: "betweenness_centrality"
+  #   func: "networkx.betweenness_centrality"
+  #   params:
+  #     normalized: true
+  #     endpoints: false
+  #   requires_directed: false
+  #   groups: ["centrality", "path_based"]
+  #   min_rounds: 5
+  #   warmup: true
+  #   warmup_iterations: 20
+  #   validate_result: "nxbench.validation.validate_node_scores"
 
   # - name: "edge_betweenness_centrality"
   #   func: "networkx.edge_betweenness_centrality"
@@ -160,45 +160,21 @@ algorithms:
   #   min_rounds: 3
 
 datasets:
-  - name: "twitter"
-    source: "networkrepository"
-    params: {}
 
-  - name: "08blocks"
-    source: "networkrepository"
-    params: {}
 
-  - name: "amazon"
-    source: "networkrepository"
-    params: {}
 
-  - name: "google"
-    source: "networkrepository"
-    params: {}
 
-  - name: "enron"
-    source: "networkrepository"
-    params: {}
+  # - name: "enron"
+  #   source: "networkrepository"
+  #   params: {}
 
-  - name: "citationCiteseer"
-    source: "networkrepository"
-    params: {}
+  # - name: "citationCiteseer"
+  #   source: "networkrepository"
+  #   params: {}
 
-  - name: "karate"
-    source: "networkrepository"
-    params: {}
-
-  - name: "netscience"
-    source: "networkrepository"
-    params: {}
-
-  - name: "email-Eu-core"
-    source: "networkrepository"
-    params: {}
-
-  - name: "jazz"
-    source: "networkrepository"
-    params: {}
+  # - name: "patentcite"
+  #   source: "networkrepository"
+  #   params: {}
 
   - name: "erdos_renyi_small"
     source: "generator"
@@ -214,43 +190,43 @@ datasets:
     source: "generator"
     params:
       generator: "networkx.watts_strogatz_graph"
-      n: 10000
+      n: 1000
       k: 6
       p: 0.1
     metadata:
       directed: false
       weighted: false
 
-  - name: "barabasi_albert_small"
-    source: "generator"
-    params:
-      generator: "networkx.barabasi_albert_graph"
-      n: 1000
-      m: 3
-    metadata:
-      directed: false
-      weighted: false
+  # - name: "barabasi_albert_small"
+  #   source: "generator"
+  #   params:
+  #     generator: "networkx.barabasi_albert_graph"
+  #     n: 1000
+  #     m: 3
+  #   metadata:
+  #     directed: false
+  #     weighted: false
 
-  - name: "powerlaw_cluster_small"
-    source: "generator"
-    params:
-      generator: "networkx.powerlaw_cluster_graph"
-      n: 1000
-      m: 2
-      p: 0.1
-    metadata:
-      directed: false
-      weighted: false
+  # - name: "powerlaw_cluster_small"
+  #   source: "generator"
+  #   params:
+  #     generator: "networkx.powerlaw_cluster_graph"
+  #     n: 1000
+  #     m: 2
+  #     p: 0.1
+  #   metadata:
+  #     directed: false
+  #     weighted: false
 
-  - name: "erdos_renyi_small"
-    source: "generator"
-    params:
-      generator: "networkx.erdos_renyi_graph"
-      n: 1000
-      p: 0.01
-    metadata:
-      directed: false
-      weighted: false
+  # - name: "erdos_renyi_large"
+  #   source: "generator"
+  #   params:
+  #     generator: "networkx.erdos_renyi_graph"
+  #     n: 10000
+  #     p: 0.01
+  #   metadata:
+  #     directed: false
+  #     weighted: false
 
   # - name: "watts_strogatz_large"
   #   source: "generator"
@@ -295,7 +271,7 @@ matrix:
     - "networkx"
     - "parallel"
     - "graphblas"
-    - "cugraph"
+    # - "cugraph"
   num_threads:
     - "1"
     - "2"
@@ -306,7 +282,7 @@ env_config:
     - "networkx==3.4.2"
     - "nx-parallel-0.3rc0.dev0"
    - "graphblas_algorithms==2023.10.0"
-    - "nx-cugraph_cu11==24.8.0"
+    # - "nx-cugraph_cu11==24.8.0"
   pythons:
     - "3.10"
     - "3.11"
53 changes: 44 additions & 9 deletions nxbench/data/loader.py
@@ -14,6 +14,7 @@
 
 from nxbench.benchmarks.config import DatasetConfig
 from nxbench.data.synthesize import generate_graph
+from nxbench.data.utils import detect_delimiter, fix_matrix_market_file
 
 warnings.filterwarnings("ignore")
 
@@ -36,7 +37,7 @@ def __init__(self, data_dir: str | Path | None = None):
         self._metadata_df = self._load_metadata()
 
     def _normalize_name(self, name: str) -> str:
-        return name.lower().replace("-", "_")
+        return name.lower().strip().replace("-", "_")
 
     def _load_metadata(self) -> pd.DataFrame:
         try:
@@ -68,6 +69,7 @@ async def load_network(
             return self._generate_graph(config)
 
         metadata = self.get_metadata(config.name)
+
        if config.name in self._network_cache:
             logger.debug(f"Loading network '{config.name}' from cache")
             return self._network_cache[config.name]
@@ -117,8 +119,32 @@ def check_graph_validity(graph, file_path):
         suffix = graph_file.suffix.lower()
         if suffix == ".mtx":
             logger.info(f"Loading Matrix Market file from {graph_file}")
+            graph_path = Path(graph_file)
+            corrected_file = graph_path.with_name(
+                f"{graph_path.stem}_corrected{graph_path.suffix}"
+            )
+
             try:
-                sparse_matrix = mmread(graph_file)
+                # check if the corrected file already exists
+                if corrected_file.exists():
+                    logger.info(
+                        f"Using existing corrected Matrix Market file: "
+                        f"{corrected_file}"
+                    )
+                    sparse_matrix = mmread(corrected_file)
+                else:
+                    try:
+                        # attempt to read the original file
+                        sparse_matrix = mmread(graph_file)
+                    except Exception:
+                        logger.info(f"Fixing Matrix Market file: {graph_file}")
+                        # fix the file and load the corrected version
+                        corrected_file = fix_matrix_market_file(graph_path)
+                        sparse_matrix = mmread(corrected_file)
+            except Exception:
+                logger.exception(f"Failed to load Matrix Market file {graph_file}")
+                raise ValueError("Matrix Market file not in expected format")
+            else:
                 graph = nx.from_scipy_sparse_array(
                     sparse_matrix,
                     create_using=(
@@ ... @@
                         nx.DiGraph() if metadata.get("directed", False)
                         else nx.Graph()
                     ),
                 )
                 check_graph_validity(graph, graph_file)
-            except ValueError:
-                logger.exception(f"Failed to load Matrix Market file {graph_file}")
-                raise ValueError("Matrix Market file not in expected format")
-            except Exception:
-                logger.exception(f"Failed to load Matrix Market file {graph_file}")
-                raise
             graph.graph.update(metadata)
             return graph
         elif suffix in [".edgelist", ".edges"]:
+            try:
+                delimiter = detect_delimiter(graph_file)
+                logger.debug(f"Detected delimiter: '{delimiter}'")
+            except Exception:
+                logger.debug(
+                    "No valid delimiter found, falling back to whitespace split"
+                )
+                delimiter = " "
+
             create_using = (
                 nx.DiGraph() if metadata.get("directed", False) else nx.Graph()
             )
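fix_matrix_market_file is imported from nxbench.data.utils, but its body is not part of this diff. A plausible sketch of such a repair step, assuming (as the except-path above suggests) that the usual problem is a file scipy's mmread rejects for a missing header banner:

    from pathlib import Path

    def fix_matrix_market_file(path: Path) -> Path:
        """Hypothetical repair: prepend a MatrixMarket banner if it is missing."""
        corrected = path.with_name(f"{path.stem}_corrected{path.suffix}")
        text = path.read_text()
        if not text.lstrip().startswith("%%MatrixMarket"):
            text = "%%MatrixMarket matrix coordinate pattern general\n" + text
        corrected.write_text(text)
        return corrected

The corrected-file naming here mirrors the "{stem}_corrected{suffix}" convention visible in the loader code above.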
@@ -179,6 +209,7 @@ def edge_parser():
                     )
                     graph = nx.read_edgelist(
                         edge_iter,
+                        delimiter=delimiter,
                         nodetype=str,
                         create_using=create_using,
                         data=False,
@@ -194,6 +225,7 @@ def edge_parser():
                     )
                     graph = nx.read_edgelist(
                         edge_iter,
+                        delimiter=delimiter,
                         nodetype=str,
                         create_using=create_using,
                         data=False,
@@ -211,6 +243,7 @@ def edge_parser():
                     )
                     graph = nx.read_edgelist(
                         edge_iter,
+                        delimiter=delimiter,
                         nodetype=str,
                         create_using=create_using,
                         data=False,
@@ -245,6 +278,8 @@ def edge_parser():
                 raise
         else:
             graph.graph.update(metadata)
+            if graph.number_of_edges() == 0:
+                raise ValueError(f"Graph file {graph_file} contains no valid edges.")
             logger.info(f"Loaded network from '{graph_file}' successfully.")
             return graph
 
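detect_delimiter likewise lives in nxbench.data.utils and is not shown in this diff. One minimal way to implement it with the standard library (an assumption, using csv.Sniffer on the first non-comment line):

    import csv
    from pathlib import Path

    def detect_delimiter(path: Path, candidates: str = " \t,;") -> str:
        """Hypothetical sniffing: infer the separator from the first data line."""
        with open(path) as fh:
            for line in fh:
                if line.strip() and not line.startswith(("%", "#")):
                    return csv.Sniffer().sniff(line, delimiters=candidates).delimiter
        raise ValueError(f"no data lines found in {path}")

If sniffing fails, csv.Sniffer raises csv.Error, which the except branch in the loader above converts into the whitespace fallback.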
(Diffs for the remaining 4 changed files not shown.)