Skip to content

Commit

Permalink
Merge pull request #1 from BenJourdan/dev
Browse files Browse the repository at this point in the history
Finished first sprint.
  • Loading branch information
BenJourdan authored Nov 8, 2024
2 parents a489068 + 2591fa4 commit 96bab9f
Show file tree
Hide file tree
Showing 22 changed files with 2,727 additions and 45 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,10 @@ __pycache__/
tests/__pycache__/*
*.py[cod]
*.pyc
*.so


# results:
*.png
!csc.png
*.json
4 changes: 0 additions & 4 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
{
"rust-analyzer.check.features": null,
"rust-analyzer.cargo.features": [
"bindings"
],
"rust-analyzer.check.features": "bindings"
}
43 changes: 37 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,44 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "coreset_sc"
crate-type = ["cdylib"]
crate-type = ["cdylib", "rlib"]
path = "src/lib.rs"


[features]
default = []
bindings = ["pyo3"]


[dependencies]
clap = "4.5.20"
pyo3 = {version = "0.22.4", optional = true, features = ["extension-module"]}
ndarray = "0.16.1"
rand = "0.8.5"
pyo3 = {version = "0.22.6", features = ["extension-module"]}
faer = { version = "0.19.4", features = ["rayon"] }
numpy = "0.22.1"

sampling-tree = "0.1.0"
rand_distr = "0.4.3"
criterion = "0.5.1"
rayon = "1.10.0"


faer-ext = { version = "0.3.0", features = ["ndarray"] }
ndarray-rand = "0.15.0"


[workspace]
resolver = "2"

[profile.dev]
incremental = true



[profile.release]
debug = true





[[bench]]
name = "sbm"
harness = false
40 changes: 38 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,43 @@
# coreset-sc
Coreset Spectral Clustering
A minimal implementation of the Coreset Spectral Clustering (CSC) algorithm given in [this paper](https://openreview.net/pdf?id=1qgZXeMTTU).

Work in progress....
The presented method repeatedly jumps across the equivalence between the normalised cut and weighted kernel k-means problems to apply coreset methods to spectral clustering.

Combined with [recent work on fast spectral clustering](https://neurips.cc/virtual/2023/poster/71723), this gives us a method for clustering very large graphs (millions of nodes) in seconds by only running (fast) spectral clustering on a much smaller induced subgraph. It can do so in even the most extreme case, where the number of clusters is linear in the number of nodes. See the experiments section of the [paper](https://openreview.net/pdf?id=1qgZXeMTTU).

![Coreset Spectral Clustering](csc.png)




Basic usage:
```python
from coreset_sc import CoresetSpectralClustering, gen_sbm
from sklearn.metrics.cluster import adjusted_rand_score

# Generate a graph from the stochastic block model
n = 1000 # number of nodes per cluster
k = 50 # number of clusters
p = 0.5 # probability of an intra-cluster edge
q = (1.0 / n) / k # probability of an inter-cluster edge


# A is a sparse scipy CSR matrix of a symmetric adjacency graph
A,ground_truth_labels = gen_sbm(n, k, p, q)

coreset_ratio = 0.1 # fraction of the data to use for the coreset graph

csc = CoresetSpectralClustering(
num_clusters=k, coreset_ratio=coreset_ratio
)
csc.fit(A) # sample extract and cluster the coreset graph
csc.label_full_graph() # label the rest of the graph given the coreset labels
pred_labels = csc.labels_ # get the full labels

# Alternatively, label the full graph in one line:
pred_labels = csc.fit_predict(A)
ari = adjusted_rand_score(ground_truth_labels,pred_labels)
```


[Python Docs](https://benjourdan.github.io/coreset-sc/)
37 changes: 37 additions & 0 deletions benches/sbm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@



use criterion::{criterion_group, criterion_main, Criterion, black_box};


use coreset_sc::gen_sbm_with_self_loops;


/// Benchmark SBM graph generation over a grid of (n, k) sizes.
///
/// `n` is the number of nodes per cluster and `k` the number of clusters;
/// `p`/`q` are the intra-/inter-cluster edge probabilities. Inputs and the
/// result are wrapped in `black_box` so the optimizer cannot const-fold the
/// generation away.
fn bench_sbm(c: &mut Criterion) {
    let mut group = c.benchmark_group("sbm");

    // Group-wide measurement settings are loop-invariant: configure them
    // once instead of re-applying them on every (n, k) combination.
    group.sample_size(10);
    group.nresamples(10);
    group.measurement_time(std::time::Duration::from_secs(10));

    let ns = [1000];
    let ks = [20];

    for n in ns {
        for k in ks {
            let p = 0.5;
            // Scale the inter-cluster probability down with both n and k so
            // the expected number of between-cluster edges stays bounded.
            let q = (1.0 / (n as f64)) / (k as f64);
            group.bench_function(format!("gen_sbm_{}_{}", n, k).as_str(), |b| {
                b.iter(|| {
                    black_box(gen_sbm_with_self_loops(
                        black_box(n),
                        black_box(k),
                        black_box(p),
                        black_box(q),
                    ))
                })
            });
        }
    }
    group.finish();
}



criterion_group!(benches, bench_sbm);
criterion_main!(benches);
Binary file added csc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,12 @@ classifiers = [
]
dynamic = ["version"]
[tool.maturin]
features = ["pyo3/extension-module", "bindings"]

cargo-extra-args = ["--quiet"]
features = ["pyo3/extension-module"]
python-source = "python"
# build with release optimizations
release = true
strip = false


[tool.ruff]
Expand Down
71 changes: 71 additions & 0 deletions python/coreset_sc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from . import coreset_sc, utils
from .csc import CoresetSpectralClustering as CoresetSpectralClustering

# import maturin_import_hook
# import scipy
# import stag.random
# from maturin_import_hook.settings import MaturinSettings

# maturin_import_hook.install(
# enable_project_importer=True,
# enable_rs_file_importer=True,
# settings=MaturinSettings(
# release=True,
# strip=True,
# ),
# show_warnings=False,
# )


def gen_sbm(n, k, p, q):
    """
    Draw an approximate sample from a Stochastic Block Model (SBM).

    Parameters
    ----------
    n : int
        Nodes per cluster.
    k : int
        Number of clusters.
    p : float
        Probability of an edge between two nodes in the same cluster.
    q : float
        Probability of an edge between two nodes in different clusters.

    Returns
    -------
    adj_mat : scipy.sparse.csr_matrix, shape = (n*k, n*k)
        Symmetric adjacency matrix of the sampled graph, with self loops added.
    labels : numpy.ndarray, shape = (n*k,)
        Ground-truth cluster assignment for every node.
    """
    # Fail fast on malformed arguments before crossing into the extension.
    assert isinstance(n, int), "n must be an integer"
    assert isinstance(k, int), "k must be an integer"
    assert isinstance(p, float), "p must be a float"
    assert isinstance(q, float), "q must be a float"
    assert n > 0, "n must be greater than 0"
    assert k > 0, "k must be greater than 0"
    assert 0 <= p <= 1, "p must be between 0 and 1"
    assert 0 <= q <= 1, "q must be between 0 and 1"

    # The native core hands back raw CSR components; assemble them into a
    # scipy sparse matrix on the Python side.
    num_nodes, csr_data, csr_indices, csr_indptr, labels = coreset_sc.gen_sbm(n, k, p, q)
    adjacency = utils.convert_to_csr_matrix(num_nodes, csr_data, csr_indptr, csr_indices)
    return adjacency, labels


# def stag_sbm(n, k, p, q):
# assert isinstance(n, int), "n must be an integer"
# assert isinstance(k, int), "k must be an integer"
# assert isinstance(p, float), "p must be a float"
# assert isinstance(q, float), "q must be a float"
# assert n > 0, "n must be greater than 0"
# assert k > 0, "k must be greater than 0"
# assert 0 <= p <= 1, "p must be between 0 and 1"
# assert 0 <= q <= 1, "q must be between 0 and 1"
# N = int(n * k)
# g = stag.random.sbm(N, k, p, q, False)
# adj = g.adjacency().to_scipy()
# adj = (adj + scipy.sparse.eye(int(n * k))).tocsr()
# labels = stag.random.sbm_gt_labels(N, k)

# return adj, labels
Loading

0 comments on commit 96bab9f

Please sign in to comment.