-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from BenJourdan/dev
Finished first sprint.
- Loading branch information
Showing
22 changed files
with
2,727 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,3 +26,10 @@ __pycache__/ | |
tests/__pycache__/* | ||
*.py[cod] | ||
*.pyc | ||
*.so | ||
|
||
|
||
# results: | ||
*.png | ||
!csc.png | ||
*.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,3 @@ | ||
{ | ||
"rust-analyzer.check.features": null, | ||
"rust-analyzer.cargo.features": [ | ||
"bindings" | ||
], | ||
"rust-analyzer.check.features": "bindings" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,43 @@ | ||
# coreset-sc | ||
Coreset Spectral Clustering | ||
A minimal implementation of the Coreset Spectral Clustering (CSC) algorithm given in [this paper](https://openreview.net/pdf?id=1qgZXeMTTU). | ||
|
||
Work in progress.... | ||
The presented method repeatedly jumps across the equivalence between the normalised cut and weighted kernel k-means problems to apply coreset methods to spectral clustering. | ||
|
||
Combined with [recent work on fast spectral clustering](https://neurips.cc/virtual/2023/poster/71723), this gives us a method for clustering very large graphs (millions of nodes) in seconds by only running (fast) spectral clustering on a much smaller induced subgraph. It can do so in even the most extreme case, where the number of clusters is linear in the number of nodes. See the experiments section of the [paper](https://openreview.net/pdf?id=1qgZXeMTTU). | ||
|
||
![Coreset Spectral Clustering](csc.png) | ||
|
||
|
||
|
||
|
||
Basic usage: | ||
```python | ||
from coreset_sc import CoresetSpectralClustering, gen_sbm | ||
from sklearn.metrics.cluster import adjusted_rand_score | ||
|
||
# Generate a graph from the stochastic block model | ||
n = 1000 # number of nodes per cluster | ||
k = 50 # number of clusters | ||
p = 0.5 # probability of an intra-cluster edge | ||
q = (1.0 / n) / k # probability of an inter-cluster edge | ||
|
||
|
||
# A is a sparse scipy CSR matrix of a symmetric adjacency graph | ||
A,ground_truth_labels = gen_sbm(n, k, p, q) | ||
|
||
coreset_ratio = 0.1 # fraction of the data to use for the coreset graph | ||
|
||
csc = CoresetSpectralClustering( | ||
num_clusters=k, coreset_ratio=coreset_ratio | ||
) | ||
csc.fit(A) # sample, extract and cluster the coreset graph | ||
csc.label_full_graph() # label the rest of the graph given the coreset labels | ||
pred_labels = csc.labels_ # get the full labels | ||
|
||
# Alternatively, label the full graph in one line: | ||
pred_labels = csc.fit_predict(A) | ||
ari = adjusted_rand_score(ground_truth_labels,pred_labels) | ||
``` | ||
|
||
|
||
[Python Docs](https://benjourdan.github.io/coreset-sc/) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
|
||
|
||
|
||
use criterion::{criterion_group, criterion_main, Criterion, black_box}; | ||
|
||
|
||
use coreset_sc::gen_sbm_with_self_loops; | ||
|
||
|
||
fn bench_sbm(c: &mut Criterion) { | ||
let mut group = c.benchmark_group("sbm"); | ||
|
||
let ns = [1000]; | ||
let ks = [20]; | ||
|
||
|
||
for n in ns{ | ||
for k in ks{ | ||
let p = 0.5; | ||
let q = (1.0/(n as f64))/(k as f64); | ||
group.sample_size(10); | ||
group.nresamples(10); | ||
group.measurement_time(std::time::Duration::from_secs(10)); | ||
group.bench_function( | ||
format!("gen_sbm_{}_{}",n,k).as_str(), | ||
|b| b.iter(|| black_box(gen_sbm_with_self_loops(black_box(n),black_box(k),black_box(p),black_box(q))) | ||
)); | ||
|
||
} | ||
} | ||
group.finish(); | ||
} | ||
|
||
|
||
|
||
criterion_group!(benches, bench_sbm); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from . import coreset_sc, utils | ||
from .csc import CoresetSpectralClustering as CoresetSpectralClustering | ||
|
||
# import maturin_import_hook | ||
# import scipy | ||
# import stag.random | ||
# from maturin_import_hook.settings import MaturinSettings | ||
|
||
# maturin_import_hook.install( | ||
# enable_project_importer=True, | ||
# enable_rs_file_importer=True, | ||
# settings=MaturinSettings( | ||
# release=True, | ||
# strip=True, | ||
# ), | ||
# show_warnings=False, | ||
# ) | ||
|
||
|
||
def gen_sbm(n, k, p, q):
    """
    Generate an approximate sample from a Stochastic Block Model (SBM) graph.

    Parameters
    ----------
    n : int
        Number of nodes in each cluster.
    k : int
        Number of clusters.
    p : float
        Probability of an edge within the same cluster. Integer values
        (e.g. ``0`` or ``1``) are accepted and converted to ``float``.
    q : float
        Probability of an edge between different clusters. Integer values
        (e.g. ``0`` or ``1``) are accepted and converted to ``float``.

    Returns
    -------
    adj_mat : scipy.sparse.csr_matrix, shape = (n*k, n*k)
        The symmetric adjacency matrix of the generated graph with self loops added.
    labels : numpy.ndarray, shape = (n*k,)
        The ground truth cluster labels

    Raises
    ------
    AssertionError
        If an argument has the wrong type or lies outside its valid range.
    """
    assert isinstance(n, int), "n must be an integer"
    assert isinstance(k, int), "k must be an integer"
    # The range checks below allow the boundary probabilities 0 and 1, so plain
    # ints are accepted here as well as floats. bool is excluded explicitly
    # because it is a subclass of int.
    assert isinstance(p, (int, float)) and not isinstance(p, bool), "p must be a float"
    assert isinstance(q, (int, float)) and not isinstance(q, bool), "q must be a float"
    assert n > 0, "n must be greater than 0"
    assert k > 0, "k must be greater than 0"
    assert 0 <= p <= 1, "p must be between 0 and 1"
    assert 0 <= q <= 1, "q must be between 0 and 1"

    # Hand the native generator real floats regardless of how the caller
    # spelled the probabilities.
    p = float(p)
    q = float(q)

    size, data, indices, indptr, labels = coreset_sc.gen_sbm(n, k, p, q)
    # NOTE(review): indptr and indices are passed in the opposite order to how
    # they are unpacked above; presumably this matches the parameter order of
    # utils.convert_to_csr_matrix -- confirm against its definition.
    adj_mat = utils.convert_to_csr_matrix(size, data, indptr, indices)
    return adj_mat, labels
|
||
|
||
# def stag_sbm(n, k, p, q): | ||
# assert isinstance(n, int), "n must be an integer" | ||
# assert isinstance(k, int), "k must be an integer" | ||
# assert isinstance(p, float), "p must be a float" | ||
# assert isinstance(q, float), "q must be a float" | ||
# assert n > 0, "n must be greater than 0" | ||
# assert k > 0, "k must be greater than 0" | ||
# assert 0 <= p <= 1, "p must be between 0 and 1" | ||
# assert 0 <= q <= 1, "q must be between 0 and 1" | ||
# N = int(n * k) | ||
# g = stag.random.sbm(N, k, p, q, False) | ||
# adj = g.adjacency().to_scipy() | ||
# adj = (adj + scipy.sparse.eye(int(n * k))).tocsr() | ||
# labels = stag.random.sbm_gt_labels(N, k) | ||
|
||
# return adj, labels |
Oops, something went wrong.