From 3aee58821d85fe1e3847602689ff020fc1d928af Mon Sep 17 00:00:00 2001 From: dpys Date: Thu, 12 Dec 2024 16:15:02 -0500 Subject: [PATCH 1/7] feat: add mtx file de-corruptor --- nxbench/benchmarks/benchmark.py | 10 ++- nxbench/configs/all.yaml | 32 +++------ nxbench/configs/dummy.yaml | 52 ++++++++------ nxbench/data/loader.py | 48 ++++++++++--- nxbench/data/repository.py | 5 +- nxbench/data/utils.py | 116 ++++++++++++++++++++++++++++++++ nxbench/validation/registry.py | 5 ++ 7 files changed, 205 insertions(+), 63 deletions(-) diff --git a/nxbench/benchmarks/benchmark.py b/nxbench/benchmarks/benchmark.py index cb2a74a..13de097 100644 --- a/nxbench/benchmarks/benchmark.py +++ b/nxbench/benchmarks/benchmark.py @@ -61,18 +61,16 @@ def make_benchmark_method(algo_config, dataset_name, backend, num_thread): and number of threads combination. """ algo_name = algo_config.name - orig_dataset_name = dataset_name - method_name = f"track_{algo_name}_{orig_dataset_name}_{backend}_{num_thread}" + safe_dataset_name = dataset_name.replace("-", "_") + method_name = f"track_{algo_name}_{safe_dataset_name}_{backend}_{num_thread}" def track_method(self): """Run benchmark and return metrics for the unique combination.""" logger.debug( f"Starting track_method for {method_name} with backend={backend}, " - f"threads={num_thread}, dataset={orig_dataset_name}" - ) - metrics = self.do_benchmark( - algo_config, orig_dataset_name, backend, num_thread + f"threads={num_thread}, dataset={dataset_name}" ) + metrics = self.do_benchmark(algo_config, dataset_name, backend, num_thread) logger.debug(f"Track {method_name} results: {metrics}") return metrics diff --git a/nxbench/configs/all.yaml b/nxbench/configs/all.yaml index 9b765c5..3bd1fd8 100644 --- a/nxbench/configs/all.yaml +++ b/nxbench/configs/all.yaml @@ -160,41 +160,25 @@ algorithms: # min_rounds: 3 datasets: - - name: "twitter" - source: "networkrepository" - params: {} - - name: "08blocks" source: "networkrepository" params: {} - - name: "amazon" - source: "networkrepository" - params: {} - - - name: "google" - source: "networkrepository" - params: {} - - name: "enron" source: "networkrepository" params: {} - - name: "citationCiteseer" - source: "networkrepository" - params: {} + # - name: "citationCiteseer" + # source: "networkrepository" + # params: {} - name: "karate" source: "networkrepository" params: {} - - name: "netscience" - source: "networkrepository" - params: {} - - - name: "email-Eu-core" - source: "networkrepository" - params: {} + # - name: "ca-netscience" + # source: "networkrepository" + # params: {} - name: "jazz" source: "networkrepository" @@ -295,7 +279,7 @@ matrix: - "networkx" - "parallel" - "graphblas" - - "cugraph" + # - "cugraph" num_threads: - "1" - "2" @@ -306,7 +290,7 @@ env_config: - "networkx==3.4.2" - "nx-parallel-0.3rc0.dev0" - "graphblas_algorithms==2023.10.0" - - "nx-cugraph_cu11==24.8.0" + # - "nx-cugraph_cu11==24.8.0" pythons: - "3.10" - "3.11" diff --git a/nxbench/configs/dummy.yaml b/nxbench/configs/dummy.yaml index 9258547..9377da4 100644 --- a/nxbench/configs/dummy.yaml +++ b/nxbench/configs/dummy.yaml @@ -1,25 +1,34 @@ algorithms: - - name: "pagerank" - func: "networkx.pagerank" - params: - alpha: 0.9 - tol: 1.0e-6 + - name: "average_clustering" + func: "networkx.average_clustering" + params: {} requires_directed: false - groups: ["centrality", "random_walk"] - min_rounds: 10 - warmup: true - warmup_iterations: 50 + groups: ["clustering", "graph_structure"] + min_rounds: 3 + validate_result: 
"nxbench.validation.validate_scalar_result" datasets: - - name: "erdos_renyi_small" - source: "generator" - params: - generator: "networkx.erdos_renyi_graph" - n: 1000 - p: 0.01 - metadata: - directed: false - weighted: false + # - name: "erdos_renyi_small" + # source: "generator" + # params: + # generator: "networkx.erdos_renyi_graph" + # n: 1000 + # p: 0.01 + # metadata: + # directed: false + # weighted: false + + - name: "enron" + source: "networkrepository" + params: {} + + - name: "citationCiteseer" + source: "networkrepository" + params: {} + + # - name: "ca-netscience" + # source: "networkrepository" + # params: {} validation: skip_slow: false @@ -35,9 +44,10 @@ matrix: - "1" env_config: - repo: "https://github.com/dpys/nxbench.git" - branches: - - "main" req: - "networkx==3.4.2" + - "nx-parallel-0.3rc0.dev0" - "graphblas_algorithms==2023.10.0" + pythons: + - "3.10" + - "3.11" diff --git a/nxbench/data/loader.py b/nxbench/data/loader.py index 86160a4..99d6b62 100644 --- a/nxbench/data/loader.py +++ b/nxbench/data/loader.py @@ -14,6 +14,7 @@ from nxbench.benchmarks.config import DatasetConfig from nxbench.data.synthesize import generate_graph +from nxbench.data.utils import fix_matrix_market_file warnings.filterwarnings("ignore") @@ -36,7 +37,7 @@ def __init__(self, data_dir: str | Path | None = None): self._metadata_df = self._load_metadata() def _normalize_name(self, name: str) -> str: - return name.lower().replace("-", "_") + return name.lower().strip().replace("-", "_") def _load_metadata(self) -> pd.DataFrame: try: @@ -55,7 +56,15 @@ def get_metadata(self, name: str) -> dict[str, Any]: normalized_name = self._normalize_name(name) network = self._metadata_df[self._metadata_df["name"] == normalized_name] if len(network) == 0: - raise ValueError(f"Network {name} not found in metadata cache") + logger.warning( + f"Network {name} not found in metadata cache. Returning dummy metadata." 
+ ) + return { + "name": name, + "download_url": None, + "directed": False, + "weighted": False, + } return network.iloc[0].to_dict() async def load_network( @@ -117,8 +126,32 @@ def check_graph_validity(graph, file_path): suffix = graph_file.suffix.lower() if suffix == ".mtx": logger.info(f"Loading Matrix Market file from {graph_file}") + graph_path = Path(graph_file) + corrected_file = graph_path.with_name( + f"{graph_path.stem}_corrected{graph_path.suffix}" + ) + try: - sparse_matrix = mmread(graph_file) + # check if the corrected file already exists + if corrected_file.exists(): + logger.info( + f"Using existing corrected Matrix Market file: " + f"{corrected_file}" + ) + sparse_matrix = mmread(corrected_file) + else: + try: + # attempt to read the original file + sparse_matrix = mmread(graph_file) + except Exception: + logger.info(f"Fixing Matrix Market file: {graph_file}") + # fix the file and load the corrected version + corrected_file = fix_matrix_market_file(graph_path) + sparse_matrix = mmread(corrected_file) + except Exception: + logger.exception(f"Failed to load Matrix Market file {graph_file}") + raise + else: graph = nx.from_scipy_sparse_array( sparse_matrix, create_using=( @@ -127,13 +160,8 @@ def check_graph_validity(graph, file_path): else nx.Graph() ), ) - check_graph_validity(graph, graph_file) - except ValueError: - logger.exception(f"Failed to load Matrix Market file {graph_file}") - raise ValueError("Matrix Market file not in expected format") - except Exception: - logger.exception(f"Failed to load Matrix Market file {graph_file}") - raise + graph.graph.update(metadata) + return graph elif suffix in [".edgelist", ".edges"]: create_using = ( nx.DiGraph() if metadata.get("directed", False) else nx.Graph() diff --git a/nxbench/data/repository.py b/nxbench/data/repository.py index a45b70a..15c0ffc 100644 --- a/nxbench/data/repository.py +++ b/nxbench/data/repository.py @@ -14,6 +14,7 @@ from urllib.parse import urljoin import aiofiles +import aiofiles.os import aiohttp import chardet from aiohttp import ClientSession, ClientTimeout @@ -176,7 +177,7 @@ async def __aexit__(self, exc_type, exc, tb): async def _fetch_text( self, url: str, method: str = "GET", retries: int = 3, **kwargs - ) -> str: + ) -> str | None: """Fetch the text content of a URL using aiohttp with retries and robust encoding handling. """ @@ -252,7 +253,7 @@ async def _fetch_text( async def _fetch_response( self, url: str, method: str = "GET", retries: int = 3, **kwargs - ) -> aiohttp.ClientResponse: + ) -> aiohttp.ClientResponse | None: """Fetch the response object of a URL using aiohttp with retries.""" if not self.session: raise RuntimeError("HTTP session is not initialized.") diff --git a/nxbench/data/utils.py b/nxbench/data/utils.py index bc00b5a..d8ee935 100644 --- a/nxbench/data/utils.py +++ b/nxbench/data/utils.py @@ -60,3 +60,119 @@ def safe_extract(filepath, extracted_path): if name.startswith("/") or ".." in name: raise ValueError(f"Malicious path in archive: {name}") zf.extractall(extracted_path) + + +def fix_matrix_market_file(in_path: Path) -> Path: + if not in_path.exists() or not in_path.is_file(): + raise FileNotFoundError( + f"Input file '{in_path!s}' does not exist or is not a file." 
+ ) + + with in_path.open("r") as f: + lines = [line.rstrip("\n") for line in f] + + header_index = None + for i, line in enumerate(lines): + if line.startswith("%%MatrixMarket"): + header_index = i + break + + if header_index is None: + raise ValueError("No %%MatrixMarket header line found.") + + header_line = lines[header_index] + if "coordinate" not in header_line: + raise ValueError( + "This fix only applies to coordinate format Matrix Market files." + ) + + symmetric = "symmetric" in header_line.lower() + content_lines = lines[header_index + 1 :] + + non_comment_lines = [ln for ln in content_lines if ln and not ln.startswith("%")] + + if not non_comment_lines: + raise ValueError("No dimension or data lines found after header and comments.") + + dimension_line = non_comment_lines[0] + parts = dimension_line.split() + + out_file_path = in_path.with_name(f"{in_path.stem}_corrected{in_path.suffix}") + + if len(parts) == 3: + out_file_path.write_text("\n".join(lines) + "\n") + return out_file_path + + if len(parts) < 2: + raise ValueError( + f"Dimension line '{dimension_line}' does not have enough integers." + ) + + data_lines = non_comment_lines[1:] + if not data_lines: + raise ValueError("No data lines found; cannot infer NNZ, M, N.") + + # parse data lines to determine M, N, and NNZ + max_row = 0 + max_col = 0 + NNZ = 0 + for line in data_lines: + coords = line.split() + if len(coords) < 2: + raise ValueError(f"Data line '{line}' does not have two coordinates.") + + r, c = map(int, coords[:2]) # row and col are 1-based + if r > max_row: + max_row = r + if c > max_col: + max_col = c + NNZ += 1 + + # infer M and N from max indices + M = max_row + N = max_col + + # if symmetric and not square, make it square by taking max dimension + if symmetric and M != N: + dim = max(M, N) + M = dim + N = dim + + # construct corrected dimension line + corrected_dimension_line = f"{M} {N} {NNZ}" + + # extract comment lines after header and before dimension line: + after_header = lines[header_index + 1 :] + dim_line_index_in_after = None + for idx, val in enumerate(after_header): + if val.strip() == dimension_line: + dim_line_index_in_after = idx + break + + if dim_line_index_in_after is None: + raise ValueError( + "Could not locate dimension line in the file after header. File may be " + "malformed." 
+ ) + + # comment lines before dimension line: + comment_lines_before_dim = [] + for val in after_header[:dim_line_index_in_after]: + if val.startswith("%"): + comment_lines_before_dim.append(val) + elif not val.strip(): + pass + + with out_file_path.open("w") as out_f: + for i in range(header_index + 1): + out_f.write(lines[i] + "\n") + + for cl in comment_lines_before_dim: + out_f.write(cl + "\n") + + out_f.write(corrected_dimension_line + "\n") + + for dl in data_lines: + out_f.write(dl + "\n") + + return out_file_path diff --git a/nxbench/validation/registry.py b/nxbench/validation/registry.py index 13bd938..2246634 100644 --- a/nxbench/validation/registry.py +++ b/nxbench/validation/registry.py @@ -112,6 +112,11 @@ class ValidationRegistry: params={}, expected_type=dict, ), + "average_clustering": ValidationConfig( + validator=validate_node_scores, + params={"require_normalized": False}, + expected_type=dict, + ), "square_clustering": ValidationConfig( validator=validate_node_scores, params={"require_normalized": False}, From 04483f64d7bfb4db73a051e28c398c5652db6e28 Mon Sep 17 00:00:00 2001 From: dpys Date: Thu, 12 Dec 2024 16:19:01 -0500 Subject: [PATCH 2/7] rebase: main --- nxbench/configs/dummy.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nxbench/configs/dummy.yaml b/nxbench/configs/dummy.yaml index 9377da4..e2b40ce 100644 --- a/nxbench/configs/dummy.yaml +++ b/nxbench/configs/dummy.yaml @@ -18,11 +18,15 @@ datasets: # directed: false # weighted: false - - name: "enron" - source: "networkrepository" - params: {} + # - name: "enron" + # source: "networkrepository" + # params: {} + + # - name: "citationCiteseer" + # source: "networkrepository" + # params: {} - - name: "citationCiteseer" + - name: "patentcite" source: "networkrepository" params: {} From ced9b309c9179a9575ff9bf5b1c8fde105fe9636 Mon Sep 17 00:00:00 2001 From: dpys Date: Thu, 12 Dec 2024 17:40:22 -0500 Subject: [PATCH 3/7] feat: add dynamic delimiter discovery --- nxbench/configs/all.yaml | 16 ++++++--------- nxbench/configs/dummy.yaml | 34 ++++++++----------------------- nxbench/data/loader.py | 30 ++++++++++++++++----------- nxbench/data/tests/test_loader.py | 26 ++++++++++++++--------- nxbench/data/utils.py | 23 +++++++++++++++++++++ 5 files changed, 72 insertions(+), 57 deletions(-) diff --git a/nxbench/configs/all.yaml b/nxbench/configs/all.yaml index 3bd1fd8..362bea7 100644 --- a/nxbench/configs/all.yaml +++ b/nxbench/configs/all.yaml @@ -160,30 +160,26 @@ algorithms: # min_rounds: 3 datasets: - - name: "08blocks" + - name: "enron" source: "networkrepository" params: {} - - name: "enron" + - name: "citationCiteseer" source: "networkrepository" params: {} - # - name: "citationCiteseer" - # source: "networkrepository" - # params: {} - - name: "karate" source: "networkrepository" params: {} + - name: "patentcite" + source: "networkrepository" + params: {} + # - name: "ca-netscience" # source: "networkrepository" # params: {} - - name: "jazz" - source: "networkrepository" - params: {} - - name: "erdos_renyi_small" source: "generator" params: diff --git a/nxbench/configs/dummy.yaml b/nxbench/configs/dummy.yaml index e2b40ce..309b939 100644 --- a/nxbench/configs/dummy.yaml +++ b/nxbench/configs/dummy.yaml @@ -8,31 +8,15 @@ algorithms: validate_result: "nxbench.validation.validate_scalar_result" datasets: - # - name: "erdos_renyi_small" - # source: "generator" - # params: - # generator: "networkx.erdos_renyi_graph" - # n: 1000 - # p: 0.01 - # metadata: - # directed: false - # 
weighted: false - - # - name: "enron" - # source: "networkrepository" - # params: {} - - # - name: "citationCiteseer" - # source: "networkrepository" - # params: {} - - - name: "patentcite" - source: "networkrepository" - params: {} - - # - name: "ca-netscience" - # source: "networkrepository" - # params: {} + - name: "erdos_renyi_small" + source: "generator" + params: + generator: "networkx.erdos_renyi_graph" + n: 1000 + p: 0.01 + metadata: + directed: false + weighted: false validation: skip_slow: false diff --git a/nxbench/data/loader.py b/nxbench/data/loader.py index 99d6b62..c8b0ac4 100644 --- a/nxbench/data/loader.py +++ b/nxbench/data/loader.py @@ -14,7 +14,7 @@ from nxbench.benchmarks.config import DatasetConfig from nxbench.data.synthesize import generate_graph -from nxbench.data.utils import fix_matrix_market_file +from nxbench.data.utils import detect_delimiter, fix_matrix_market_file warnings.filterwarnings("ignore") @@ -56,27 +56,19 @@ def get_metadata(self, name: str) -> dict[str, Any]: normalized_name = self._normalize_name(name) network = self._metadata_df[self._metadata_df["name"] == normalized_name] if len(network) == 0: - logger.warning( - f"Network {name} not found in metadata cache. Returning dummy metadata." - ) - return { - "name": name, - "download_url": None, - "directed": False, - "weighted": False, - } + raise ValueError(f"Network {name} not found in metadata cache") return network.iloc[0].to_dict() async def load_network( self, config: DatasetConfig, session: aiohttp.ClientSession | None = None ) -> tuple[nx.Graph | nx.DiGraph, dict[str, Any]]: """Load or generate a network based on config.""" + metadata = self.get_metadata(config.name) source_lower = config.source.lower() if source_lower == "generator": return self._generate_graph(config) - metadata = self.get_metadata(config.name) if config.name in self._network_cache: logger.debug(f"Loading network '{config.name}' from cache") return self._network_cache[config.name] @@ -150,7 +142,7 @@ def check_graph_validity(graph, file_path): sparse_matrix = mmread(corrected_file) except Exception: logger.exception(f"Failed to load Matrix Market file {graph_file}") - raise + raise ValueError("Matrix Market file not in expected format") else: graph = nx.from_scipy_sparse_array( sparse_matrix, @@ -163,6 +155,15 @@ def check_graph_validity(graph, file_path): graph.graph.update(metadata) return graph elif suffix in [".edgelist", ".edges"]: + try: + delimiter = detect_delimiter(graph_file) + logger.debug(f"Detected delimiter: '{delimiter}'") + except Exception: + logger.debug( + "No valid delimiter found, falling back to whitespace split" + ) + delimiter = " " + create_using = ( nx.DiGraph() if metadata.get("directed", False) else nx.Graph() ) @@ -207,6 +208,7 @@ def edge_parser(): ) graph = nx.read_edgelist( edge_iter, + delimiter=delimiter, nodetype=str, create_using=create_using, data=False, @@ -222,6 +224,7 @@ def edge_parser(): ) graph = nx.read_edgelist( edge_iter, + delimiter=delimiter, nodetype=str, create_using=create_using, data=False, @@ -239,6 +242,7 @@ def edge_parser(): ) graph = nx.read_edgelist( edge_iter, + delimiter=delimiter, nodetype=str, create_using=create_using, data=False, @@ -273,6 +277,8 @@ def edge_parser(): raise else: graph.graph.update(metadata) + if graph.number_of_edges() == 0: + raise ValueError(f"Graph file {graph_file} contains no valid edges.") logger.info(f"Loaded network from '{graph_file}' successfully.") return graph diff --git a/nxbench/data/tests/test_loader.py 
b/nxbench/data/tests/test_loader.py index b820fd9..6d62799 100644 --- a/nxbench/data/tests/test_loader.py +++ b/nxbench/data/tests/test_loader.py @@ -209,7 +209,7 @@ async def test_load_unweighted_with_comments(data_manager, create_edge_file): graph, (nx.Graph, nx.DiGraph) ), "Graph should be NetworkX Graph or DiGraph" assert graph.number_of_nodes() == 4, "Graph should have 4 nodes" - assert graph.number_of_edges() == 3, "Graph should have 3 edges" + assert graph.number_of_edges() == 2, "Graph should have 2 edges" for u, v, data in graph.edges(data=True): assert "weight" not in data, f"Edge ({u}, {v}) should not have a 'weight'" @@ -629,15 +629,21 @@ async def test_generate_graph_exception(data_manager): metadata={"directed": False, "weighted": False}, ) - with patch( - "nxbench.data.loader.generate_graph", side_effect=Exception("Generator failed") - ) as mock_generate_graph: - with pytest.raises(Exception, match="Generator failed"): - await data_manager.load_network(config) - - mock_generate_graph.assert_called_once_with( - "networkx.invalid_generator", {"n": 100, "p": 0.1}, False - ) + with patch.object( + data_manager, + "get_metadata", + return_value={"directed": False, "weighted": False}, + ): + with patch( + "nxbench.data.loader.generate_graph", + side_effect=Exception("Generator failed"), + ) as mock_generate_graph: + with pytest.raises(Exception, match="Generator failed"): + await data_manager.load_network(config) + + mock_generate_graph.assert_called_once_with( + "networkx.invalid_generator", {"n": 100, "p": 0.1}, False + ) def test_generate_graph_missing_generator_name(data_manager): diff --git a/nxbench/data/utils.py b/nxbench/data/utils.py index d8ee935..0ea4650 100644 --- a/nxbench/data/utils.py +++ b/nxbench/data/utils.py @@ -1,5 +1,6 @@ import re import zipfile +from collections import Counter from pathlib import Path import networkx as nx @@ -176,3 +177,25 @@ def fix_matrix_market_file(in_path: Path) -> Path: out_f.write(dl + "\n") return out_file_path + + +def detect_delimiter(file_path: Path, sample_size: int = 5) -> str: + """Detect the most common delimiter in the first few lines of a file.""" + delimiters = [",", "\t", " ", ";"] + delimiter_counts = Counter() + + with file_path.open("r") as f: + for i, line in enumerate(f): + if i >= sample_size: + break + line = line.strip() + if not line or line.startswith(("#", "%")): + continue + for delimiter in delimiters: + if delimiter in line: + delimiter_counts[delimiter] += line.count(delimiter) + + if delimiter_counts: + return delimiter_counts.most_common(1)[0][0] + + raise ValueError("No valid delimiter found in the file.") From aaa0a171abafcbc1e756fe6f6e85353e31f12500 Mon Sep 17 00:00:00 2001 From: dpys Date: Thu, 12 Dec 2024 17:42:52 -0500 Subject: [PATCH 4/7] feat: add dynamic delimiter discovery --- nxbench/data/loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nxbench/data/loader.py b/nxbench/data/loader.py index c8b0ac4..f0ad4e0 100644 --- a/nxbench/data/loader.py +++ b/nxbench/data/loader.py @@ -63,12 +63,13 @@ async def load_network( self, config: DatasetConfig, session: aiohttp.ClientSession | None = None ) -> tuple[nx.Graph | nx.DiGraph, dict[str, Any]]: """Load or generate a network based on config.""" - metadata = self.get_metadata(config.name) source_lower = config.source.lower() if source_lower == "generator": return self._generate_graph(config) + metadata = self.get_metadata(config.name) + if config.name in self._network_cache: logger.debug(f"Loading network '{config.name}' from 
cache") return self._network_cache[config.name] From 1fee8939fcd50bb9b9dd82546d6fe89f269bf0bf Mon Sep 17 00:00:00 2001 From: dpys Date: Thu, 12 Dec 2024 22:18:21 -0500 Subject: [PATCH 5/7] feat: check=False --- nxbench/cli.py | 4 +- nxbench/configs/asv.conf.json | 2 +- nxbench/configs/{all.yaml => example2.yaml} | 136 ++++++++++---------- 3 files changed, 69 insertions(+), 73 deletions(-) rename nxbench/configs/{all.yaml => example2.yaml} (75%) diff --git a/nxbench/cli.py b/nxbench/cli.py index e611819..aeb591a 100644 --- a/nxbench/cli.py +++ b/nxbench/cli.py @@ -110,7 +110,7 @@ def get_latest_commit_hash(github_url: str) -> str: def safe_run( cmd: Sequence[str | Path], - check: bool = True, + check: bool = False, capture_output: bool = False, **kwargs, ) -> subprocess.CompletedProcess: @@ -121,7 +121,7 @@ def safe_run( ---------- cmd : Sequence[str | Path] The command and arguments to execute. - check : bool, default=True + check : bool, default=False If True, raise an exception if the command fails. capture_output : bool, default=False If True, capture stdout and stderr. diff --git a/nxbench/configs/asv.conf.json b/nxbench/configs/asv.conf.json index 1f76a1b..d4c0b4f 100644 --- a/nxbench/configs/asv.conf.json +++ b/nxbench/configs/asv.conf.json @@ -8,7 +8,7 @@ "main" ], "repo": "https://github.com/dpys/nxbench", - "environment_type": "conda", + "environment_type": "virtualenv", "show_commit_url": "https://github.com/dpys/nxbench/commit/", "matrix": {}, "benchmark_dir": "nxbench/benchmarks", diff --git a/nxbench/configs/all.yaml b/nxbench/configs/example2.yaml similarity index 75% rename from nxbench/configs/all.yaml rename to nxbench/configs/example2.yaml index 362bea7..db121c2 100644 --- a/nxbench/configs/all.yaml +++ b/nxbench/configs/example2.yaml @@ -19,17 +19,17 @@ algorithms: warmup_iterations: 20 validate_result: "nxbench.validation.validate_node_scores" - - name: "betweenness_centrality" - func: "networkx.betweenness_centrality" - params: - normalized: true - endpoints: false - requires_directed: false - groups: ["centrality", "path_based"] - min_rounds: 5 - warmup: true - warmup_iterations: 20 - validate_result: "nxbench.validation.validate_node_scores" + # - name: "betweenness_centrality" + # func: "networkx.betweenness_centrality" + # params: + # normalized: true + # endpoints: false + # requires_directed: false + # groups: ["centrality", "path_based"] + # min_rounds: 5 + # warmup: true + # warmup_iterations: 20 + # validate_result: "nxbench.validation.validate_node_scores" # - name: "edge_betweenness_centrality" # func: "networkx.edge_betweenness_centrality" @@ -65,13 +65,13 @@ algorithms: min_rounds: 3 validate_result: "nxbench.validation.validate_node_scores" - - name: "transitivity" - func: "networkx.transitivity" - params: {} - requires_directed: false - groups: ["clustering", "graph_structure"] - min_rounds: 3 - validate_result: "nxbench.validation.validate_scalar_result" + # - name: "transitivity" + # func: "networkx.transitivity" + # params: {} + # requires_directed: false + # groups: ["clustering", "graph_structure"] + # min_rounds: 3 + # validate_result: "nxbench.validation.validate_scalar_result" # - name: "all_pairs_node_connectivity" # func: "networkx.algorithms.connectivity.connectivity.all_pairs_node_connectivity" @@ -160,23 +160,19 @@ algorithms: # min_rounds: 3 datasets: - - name: "enron" - source: "networkrepository" - params: {} - - - name: "citationCiteseer" - source: "networkrepository" - params: {} + # - name: "enron" + # source: "networkrepository" + # 
params: {} - - name: "karate" - source: "networkrepository" - params: {} + # - name: "citationCiteseer" + # source: "networkrepository" + # params: {} - - name: "patentcite" - source: "networkrepository" - params: {} + # - name: "karate" + # source: "networkrepository" + # params: {} - # - name: "ca-netscience" + # - name: "patentcite" # source: "networkrepository" # params: {} @@ -190,47 +186,47 @@ datasets: directed: false weighted: false - - name: "watts_strogatz_small" - source: "generator" - params: - generator: "networkx.watts_strogatz_graph" - n: 10000 - k: 6 - p: 0.1 - metadata: - directed: false - weighted: false + # - name: "watts_strogatz_small" + # source: "generator" + # params: + # generator: "networkx.watts_strogatz_graph" + # n: 10000 + # k: 6 + # p: 0.1 + # metadata: + # directed: false + # weighted: false - - name: "barabasi_albert_small" - source: "generator" - params: - generator: "networkx.barabasi_albert_graph" - n: 1000 - m: 3 - metadata: - directed: false - weighted: false + # - name: "barabasi_albert_small" + # source: "generator" + # params: + # generator: "networkx.barabasi_albert_graph" + # n: 1000 + # m: 3 + # metadata: + # directed: false + # weighted: false - - name: "powerlaw_cluster_small" - source: "generator" - params: - generator: "networkx.powerlaw_cluster_graph" - n: 1000 - m: 2 - p: 0.1 - metadata: - directed: false - weighted: false + # - name: "powerlaw_cluster_small" + # source: "generator" + # params: + # generator: "networkx.powerlaw_cluster_graph" + # n: 1000 + # m: 2 + # p: 0.1 + # metadata: + # directed: false + # weighted: false - - name: "erdos_renyi_small" - source: "generator" - params: - generator: "networkx.erdos_renyi_graph" - n: 1000 - p: 0.01 - metadata: - directed: false - weighted: false + # - name: "erdos_renyi_small" + # source: "generator" + # params: + # generator: "networkx.erdos_renyi_graph" + # n: 1000 + # p: 0.01 + # metadata: + # directed: false + # weighted: false # - name: "watts_strogatz_large" # source: "generator" From e9dd1ec3ebdb2bc7483eacfca8cfca6bfb4750e7 Mon Sep 17 00:00:00 2001 From: dpys Date: Thu, 12 Dec 2024 22:20:14 -0500 Subject: [PATCH 6/7] feat: check=False --- nxbench/configs/example2.yaml | 62 +++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/nxbench/configs/example2.yaml b/nxbench/configs/example2.yaml index db121c2..1a33ccf 100644 --- a/nxbench/configs/example2.yaml +++ b/nxbench/configs/example2.yaml @@ -65,13 +65,13 @@ algorithms: min_rounds: 3 validate_result: "nxbench.validation.validate_node_scores" - # - name: "transitivity" - # func: "networkx.transitivity" - # params: {} - # requires_directed: false - # groups: ["clustering", "graph_structure"] - # min_rounds: 3 - # validate_result: "nxbench.validation.validate_scalar_result" + - name: "transitivity" + func: "networkx.transitivity" + params: {} + requires_directed: false + groups: ["clustering", "graph_structure"] + min_rounds: 3 + validate_result: "nxbench.validation.validate_scalar_result" # - name: "all_pairs_node_connectivity" # func: "networkx.algorithms.connectivity.connectivity.all_pairs_node_connectivity" @@ -160,21 +160,21 @@ algorithms: # min_rounds: 3 datasets: - # - name: "enron" - # source: "networkrepository" - # params: {} + - name: "enron" + source: "networkrepository" + params: {} - # - name: "citationCiteseer" - # source: "networkrepository" - # params: {} + - name: "citationCiteseer" + source: "networkrepository" + params: {} - # - name: "karate" - # source: "networkrepository" 
- # params: {} + - name: "karate" + source: "networkrepository" + params: {} - # - name: "patentcite" - # source: "networkrepository" - # params: {} + - name: "patentcite" + source: "networkrepository" + params: {} - name: "erdos_renyi_small" source: "generator" @@ -186,16 +186,16 @@ datasets: directed: false weighted: false - # - name: "watts_strogatz_small" - # source: "generator" - # params: - # generator: "networkx.watts_strogatz_graph" - # n: 10000 - # k: 6 - # p: 0.1 - # metadata: - # directed: false - # weighted: false + - name: "watts_strogatz_small" + source: "generator" + params: + generator: "networkx.watts_strogatz_graph" + n: 1000 + k: 6 + p: 0.1 + metadata: + directed: false + weighted: false # - name: "barabasi_albert_small" # source: "generator" @@ -218,11 +218,11 @@ datasets: # directed: false # weighted: false - # - name: "erdos_renyi_small" + # - name: "erdos_renyi_large" # source: "generator" # params: # generator: "networkx.erdos_renyi_graph" - # n: 1000 + # n: 10000 # p: 0.01 # metadata: # directed: false From f0ce92aac134edacc66f1c352f26c8f3d0a24e8f Mon Sep 17 00:00:00 2001 From: dpys Date: Fri, 13 Dec 2024 00:19:33 -0500 Subject: [PATCH 7/7] feat: update example config 2 --- nxbench/configs/asv.conf.json | 2 +- nxbench/configs/example2.yaml | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nxbench/configs/asv.conf.json b/nxbench/configs/asv.conf.json index d4c0b4f..947501e 100644 --- a/nxbench/configs/asv.conf.json +++ b/nxbench/configs/asv.conf.json @@ -5,7 +5,7 @@ "project_url": "https://github.com/dpys/nxbench", "dvcs": "git", "branches": [ - "main" + "cli-gotchas" ], "repo": "https://github.com/dpys/nxbench", "environment_type": "virtualenv", diff --git a/nxbench/configs/example2.yaml b/nxbench/configs/example2.yaml index 1a33ccf..03ad96e 100644 --- a/nxbench/configs/example2.yaml +++ b/nxbench/configs/example2.yaml @@ -160,21 +160,21 @@ algorithms: # min_rounds: 3 datasets: - - name: "enron" - source: "networkrepository" - params: {} + # - name: "enron" + # source: "networkrepository" + # params: {} - - name: "citationCiteseer" - source: "networkrepository" - params: {} + # - name: "citationCiteseer" + # source: "networkrepository" + # params: {} - name: "karate" source: "networkrepository" params: {} - - name: "patentcite" - source: "networkrepository" - params: {} + # - name: "patentcite" + # source: "networkrepository" + # params: {} - name: "erdos_renyi_small" source: "generator"
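---

Note (not part of the series): below is a minimal usage sketch of the two helpers
introduced in PATCH 1/7 (fix_matrix_market_file) and PATCH 3/7 (detect_delimiter),
mirroring the fallback logic that nxbench/data/loader.py now uses. The file paths
are hypothetical; the function names, signatures, and fallback behavior are taken
from the patch bodies above.

    from pathlib import Path

    import networkx as nx
    from scipy.io import mmread

    from nxbench.data.utils import detect_delimiter, fix_matrix_market_file

    # Matrix Market repair path: try the raw file first; if parsing fails,
    # fix_matrix_market_file infers M, N, and NNZ from the data lines and
    # writes "<stem>_corrected.mtx" next to the original, which the loader
    # then reuses on subsequent runs instead of re-fixing.
    mtx = Path("data/08blocks.mtx")  # hypothetical path
    try:
        matrix = mmread(mtx)
    except Exception:
        matrix = mmread(fix_matrix_market_file(mtx))
    graph = nx.from_scipy_sparse_array(matrix)

    # Edgelist path: detect_delimiter samples the first few non-comment
    # lines, counting ",", "\t", " ", and ";" occurrences, and raises
    # ValueError when none appear; the loader then falls back to a plain
    # whitespace split.
    edges = Path("data/karate.edges")  # hypothetical path
    try:
        delimiter = detect_delimiter(edges)
    except ValueError:
        delimiter = " "
    graph = nx.read_edgelist(edges, delimiter=delimiter, nodetype=str)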