Skip to content

Commit

Permalink
Merge pull request #1 from BenJourdan/dev
Browse files Browse the repository at this point in the history
Finished first sprint.
  • Loading branch information
BenJourdan authored Nov 8, 2024
2 parents a489068 + 2591fa4 commit 96bab9f
Show file tree
Hide file tree
Showing 22 changed files with 2,727 additions and 45 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,10 @@ __pycache__/
tests/__pycache__/*
*.py[cod]
*.pyc
*.so


# results:
*.png
!csc.png
*.json
4 changes: 0 additions & 4 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
{
"rust-analyzer.check.features": null,
"rust-analyzer.cargo.features": [
"bindings"
],
"rust-analyzer.check.features": "bindings"
}
43 changes: 37 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,44 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "coreset_sc"
crate-type = ["cdylib"]
crate-type = ["cdylib", "rlib"]
path = "src/lib.rs"


[features]
default = []
bindings = ["pyo3"]


[dependencies]
clap = "4.5.20"
pyo3 = {version = "0.22.4", optional = true, features = ["extension-module"]}
ndarray = "0.16.1"
rand = "0.8.5"
pyo3 = {version = "0.22.6", features = ["extension-module"]}
faer = { version = "0.19.4", features = ["rayon"] }
numpy = "0.22.1"

sampling-tree = "0.1.0"
rand_distr = "0.4.3"
criterion = "0.5.1"
rayon = "1.10.0"


faer-ext = { version = "0.3.0", features = ["ndarray"] }
ndarray-rand = "0.15.0"


[workspace]
resolver = "2"

[profile.dev]
incremental = true



[profile.release]
debug = true





[[bench]]
name = "sbm"
harness = false
40 changes: 38 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,43 @@
# coreset-sc
Coreset Spectral Clustering
A minimal implementation of the Coreset Spectral Clustering (CSC) algorithm given in [this paper](https://openreview.net/pdf?id=1qgZXeMTTU).

Work in progress....
The presented method repeatedly jumps across the equivalence between the normalised cut and weighted kernel k-means problems to apply coreset methods to spectral clustering.

Combined with [recent work on fast spectral clustering](https://neurips.cc/virtual/2023/poster/71723), this gives us a method for clustering very large graphs (millions of nodes) in seconds by only running (fast) spectral clustering on a much smaller induced subgraph. It can do so in even the most extreme case, where the number of clusters is linear in the number of nodes. See the experiments section of the [paper](https://openreview.net/pdf?id=1qgZXeMTTU).

![Coreset Spectral Clustering](csc.png)




Basic usage:
```python
from coreset_sc import CoresetSpectralClustering, gen_sbm
from sklearn.metrics.cluster import adjusted_rand_score

# Generate a graph from the stochastic block model
n = 1000 # number of nodes per cluster
k = 50 # number of clusters
p = 0.5 # probability of an intra-cluster edge
q = (1.0 / n) / k # probability of an inter-cluster edge


# A is a sparse scipy CSR matrix of a symmetric adjacency graph
A,ground_truth_labels = gen_sbm(n, k, p, q)

coreset_ratio = 0.1 # fraction of the data to use for the coreset graph

csc = CoresetSpectralClustering(
num_clusters=k, coreset_ratio=coreset_ratio
)
csc.fit(A) # sample extract and cluster the coreset graph
csc.label_full_graph() # label the rest of the graph given the coreset labels
pred_labels = csc.labels_ # get the full labels

# Alternatively, label the full graph in one line:
pred_labels = csc.fit_predict(A)
ari = adjusted_rand_score(ground_truth_labels,pred_labels)
```


[Python Docs](https://benjourdan.github.io/coreset-sc/)
37 changes: 37 additions & 0 deletions benches/sbm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@



use criterion::{criterion_group, criterion_main, Criterion, black_box};


use coreset_sc::gen_sbm_with_self_loops;


/// Benchmark SBM graph generation over a grid of (n, k) sizes.
///
/// `n` is the number of nodes per cluster and `k` the number of clusters;
/// `p`/`q` are the intra-/inter-cluster edge probabilities. Inputs and the
/// result are wrapped in `black_box` so the optimizer cannot const-fold the
/// generation away.
fn bench_sbm(c: &mut Criterion) {
    let mut group = c.benchmark_group("sbm");

    // Group-wide measurement settings are loop-invariant: configure them
    // once instead of re-applying them on every (n, k) combination.
    group.sample_size(10);
    group.nresamples(10);
    group.measurement_time(std::time::Duration::from_secs(10));

    let ns = [1000];
    let ks = [20];

    for n in ns {
        for k in ks {
            let p = 0.5;
            // Scale the inter-cluster probability down with both n and k so
            // the expected number of between-cluster edges stays bounded.
            let q = (1.0 / (n as f64)) / (k as f64);
            group.bench_function(format!("gen_sbm_{}_{}", n, k).as_str(), |b| {
                b.iter(|| {
                    black_box(gen_sbm_with_self_loops(
                        black_box(n),
                        black_box(k),
                        black_box(p),
                        black_box(q),
                    ))
                })
            });
        }
    }
    group.finish();
}



criterion_group!(benches, bench_sbm);
criterion_main!(benches);
Binary file added csc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,12 @@ classifiers = [
]
dynamic = ["version"]
[tool.maturin]
features = ["pyo3/extension-module", "bindings"]

cargo-extra-args = ["--quiet"]
features = ["pyo3/extension-module"]
python-source = "python"
# build with release optimizations
release = true
strip = false


[tool.ruff]
Expand Down
71 changes: 71 additions & 0 deletions python/coreset_sc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from . import coreset_sc, utils
from .csc import CoresetSpectralClustering as CoresetSpectralClustering

# import maturin_import_hook
# import scipy
# import stag.random
# from maturin_import_hook.settings import MaturinSettings

# maturin_import_hook.install(
# enable_project_importer=True,
# enable_rs_file_importer=True,
# settings=MaturinSettings(
# release=True,
# strip=True,
# ),
# show_warnings=False,
# )


def gen_sbm(n, k, p, q):
    """
    Draw an approximate sample from a Stochastic Block Model (SBM).

    Parameters
    ----------
    n : int
        Nodes per cluster.
    k : int
        Number of clusters.
    p : float
        Probability of an edge between two nodes in the same cluster.
    q : float
        Probability of an edge between two nodes in different clusters.

    Returns
    -------
    adj_mat : scipy.sparse.csr_matrix, shape = (n*k, n*k)
        Symmetric adjacency matrix of the sampled graph, with self loops added.
    labels : numpy.ndarray, shape = (n*k,)
        Ground-truth cluster assignment for every node.
    """
    # Fail fast on malformed arguments before crossing into the extension.
    assert isinstance(n, int), "n must be an integer"
    assert isinstance(k, int), "k must be an integer"
    assert isinstance(p, float), "p must be a float"
    assert isinstance(q, float), "q must be a float"
    assert n > 0, "n must be greater than 0"
    assert k > 0, "k must be greater than 0"
    assert 0 <= p <= 1, "p must be between 0 and 1"
    assert 0 <= q <= 1, "q must be between 0 and 1"

    # The native core hands back raw CSR components; assemble them into a
    # scipy sparse matrix on the Python side.
    num_nodes, csr_data, csr_indices, csr_indptr, labels = coreset_sc.gen_sbm(n, k, p, q)
    adjacency = utils.convert_to_csr_matrix(num_nodes, csr_data, csr_indptr, csr_indices)
    return adjacency, labels


# def stag_sbm(n, k, p, q):
# assert isinstance(n, int), "n must be an integer"
# assert isinstance(k, int), "k must be an integer"
# assert isinstance(p, float), "p must be a float"
# assert isinstance(q, float), "q must be a float"
# assert n > 0, "n must be greater than 0"
# assert k > 0, "k must be greater than 0"
# assert 0 <= p <= 1, "p must be between 0 and 1"
# assert 0 <= q <= 1, "q must be between 0 and 1"
# N = int(n * k)
# g = stag.random.sbm(N, k, p, q, False)
# adj = g.adjacency().to_scipy()
# adj = (adj + scipy.sparse.eye(int(n * k))).tocsr()
# labels = stag.random.sbm_gt_labels(N, k)

# return adj, labels
Loading

0 comments on commit 96bab9f

Please sign in to comment.