Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add interpretability example notebooks #21

Open
wants to merge 37 commits into
base: obliquepr
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
d7b607b
grid search parameters
jshinm May 26, 2022
3d599b0
upload iris notebook
jshinm May 26, 2022
496969c
add 5000 sample
jshinm May 26, 2022
6e6f787
add delta plot
jshinm May 29, 2022
b3d3aa6
add mnist notebook
jshinm May 29, 2022
c5af203
fix delta calculation
jshinm May 30, 2022
6e1187c
preserve comprehensive run
jshinm Jun 8, 2022
8d45399
optimize runtime
jshinm Jun 8, 2022
6006426
fix bug and reparameterize
jshinm Jun 10, 2022
d45d2a8
add roc_auc and confusion matrix
jshinm Jun 10, 2022
6360f3d
add 3d visualization
jshinm Jun 12, 2022
d5b19f2
add narratives and descriptions
jshinm Jun 12, 2022
64169d8
add description
jshinm Jun 13, 2022
ef5d9bc
remove long notebook
jshinm Jun 13, 2022
faa0c53
added oblique trees
jshinm Jun 13, 2022
7a23b9b
remove ovr wrapper
jshinm Jun 13, 2022
83cda7a
add grid search results
jshinm Jun 15, 2022
7030191
remove over feature selection filter for RF
jshinm Jun 15, 2022
01826d6
add plotly io for plot rendering
jshinm Jun 15, 2022
0a016e3
add robustness test
jshinm Jun 16, 2022
b18d356
add description and rerun notebook
jshinm Jun 16, 2022
c56f351
new parameter search
jshinm Jun 16, 2022
22f643d
change plot style
jshinm Jun 16, 2022
01136c5
optimize robustness test with new parameters
jshinm Jun 16, 2022
dccfdd8
change plot style
jshinm Jun 16, 2022
14b172d
run appendix block
jshinm Jun 16, 2022
d3087a5
Merge branch 'neurodata:obliquepr' into obliquepr
jshinm Jun 29, 2022
5a15c53
added score vs performance metrics
jshinm Jun 29, 2022
1b3c991
uploading pickled dataframe
jshinm Jun 29, 2022
30b38d3
added refitting function and plot on score vs performance metrics
jshinm Jun 29, 2022
e2691a9
added simulation run dataframe
jshinm Jun 29, 2022
25a1e98
Added binning figure and plotly figures
jshinm Jun 29, 2022
5e8f1a6
Added binning figure and changed unit of the size to MB
jshinm Jun 29, 2022
b73915e
Add sparse parity example under ensemble section
jshinm Jul 25, 2022
67e04ef
Add cc18 example under ensemble section
jshinm Jul 25, 2022
da00a8b
Use tuned parameters and improve reproducibility
jshinm Aug 19, 2022
e30af84
Use selected datasets from cc18 suite and pre-tuned parameters
jshinm Aug 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions examples/ensemble/plot_oblique_axis_aligned_forests_cc18.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
===============================================================================
Plot oblique forest and axis-aligned random forest predictions on cc18 datasets
===============================================================================

A performance comparison between oblique forest and standard axis-
aligned random forest using three datasets from OpenML benchmarking suites.

Two of these datasets, namely [WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)
and [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534) datasets
consist of 31 features where the former dataset is entirely numeric
and the latter dataset is entirely nominal. The third dataset, dubbed
[cnae-9](https://www.openml.org/search?type=data&status=active&id=1468), is a
numeric dataset that has a notably large feature space of 857 features. As you
will notice, of these three datasets, the oblique forest outperforms axis-aligned
random forest on cnae-9 by utilizing its sparse random projection mechanism. All datasets
are subsampled due to computational constraints.
"""

import numpy as np
import pandas as pd
from datetime import datetime
import openml
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ObliqueRandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate

random_state = 123456
t0 = datetime.now()
data_ids = [11, 40499] # openml dataset id
df = pd.DataFrame()


def load_cc18(data_id):
    """Fetch an OpenML dataset and return a 10% random subsample of it.

    Parameters
    ----------
    data_id : int
        OpenML dataset id passed to ``openml.datasets.get_dataset``.

    Returns
    -------
    X : all columns of the subsample except the last one
    y : the last column of the subsample
    d_name : the ``name`` attribute of the OpenML dataset object
    """
    dataset = openml.datasets.get_dataset(data_id, download_data=False)
    # First element of what get_data() returns; presumably the full table
    # as a pandas DataFrame -- confirm against the openml API.
    table = dataset.get_data()[0]

    # Keep a tenth of the rows; the draw is fixed by the module-level
    # ``random_state`` so repeated runs select the same rows.
    n_keep = int(0.1 * table.shape[0])
    subsample = table.sample(n_keep, random_state=random_state)

    # NOTE(review): the label is assumed to live in the last column.
    features = subsample.iloc[:, :-1]
    target = subsample.iloc[:, -1]

    return features, target, dataset.name


def get_scores(X, y, d_name="UNK", n_cv=5, n_repeats=2, random_state=1, kwargs=None):
    """Cross-validate a random forest and an oblique forest on one dataset.

    Parameters
    ----------
    X, y : feature matrix and labels handed to ``cross_validate``.
    d_name : str, default="UNK"
        Dataset name stored in the ``dataset`` column and used in the log line.
    n_cv : int, default=5
        Number of folds (``n_splits`` of ``RepeatedKFold``).
    n_repeats : int, default=2
        Number of repetitions of the k-fold split.
    random_state : int, default=1
        Seed shared by both estimators and by the splitter.
    kwargs : sequence of two dicts or None, default=None
        ``kwargs[0]`` holds the keyword arguments for
        ``RandomForestClassifier`` and ``kwargs[1]`` those for
        ``ObliqueRandomForestClassifier``. ``None`` means "use the estimator
        defaults for both".

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) with columns ``dataset``, ``model``,
        ``score`` (fold accuracy) and ``mean`` (mean accuracy of that model).
    """
    # The signature advertises ``kwargs=None``; indexing None would raise a
    # TypeError, so fall back to empty keyword dictionaries instead.
    if kwargs is None:
        rf_kwargs, of_kwargs = {}, {}
    else:
        rf_kwargs, of_kwargs = kwargs[0], kwargs[1]

    models = [
        ("RF", RandomForestClassifier(**rf_kwargs, random_state=random_state)),
        ("OF", ObliqueRandomForestClassifier(**of_kwargs, random_state=random_state)),
    ]

    rows = []
    for label, clf in models:
        # A fresh splitter with the same seed gives both models identical folds.
        cv = RepeatedKFold(
            n_splits=n_cv, n_repeats=n_repeats, random_state=random_state
        )
        fold_scores = cross_validate(
            estimator=clf, X=X, y=y, cv=cv, scoring="accuracy"
        )["test_score"]
        mean_score = fold_scores.mean()

        rows.append([d_name, label, fold_scores, mean_score])
        print(f"{d_name} mean test score for {label}: {mean_score}")

    df = pd.DataFrame(rows, columns=["dataset", "model", "score", "mean"])
    # Each row holds an array of fold scores; explode to one row per fold.
    df = df.explode("score")
    df["score"] = df["score"].astype(float)
    df.reset_index(inplace=True, drop=True)

    return df


def load_best_params(data_ids):
    """Return the best hyper-parameters for RF and OF, one pair per dataset.

    While ``folder_path`` below is unset, a hard-coded list of pre-tuned
    parameters is returned and ``data_ids`` is not consulted (the list is
    presumably ordered to match the module-level ``data_ids`` -- verify when
    changing either). When ``folder_path`` is set, the best-scoring
    parameters are read from one grid-search CSV per dataset id.

    Parameters
    ----------
    data_ids : iterable of int
        OpenML dataset ids; only used to build the CSV file names.

    Returns
    -------
    list of [dict, dict]
        For each dataset, the keyword arguments for the random forest
        followed by those for the oblique forest.
    """
    # folder_path = "/home/jshinm/Desktop/workstation/sklearn-jms/notebook/hidden/output/"
    folder_path = None

    if not folder_path:
        # pre-tuned hyper-parameters
        return [
            [
                {"max_depth": 5, "max_features": "sqrt", "n_estimators": 100},
                {"max_depth": 5, "max_features": None, "n_estimators": 100},
            ],
            [
                {"max_depth": 10, "max_features": "log2", "n_estimators": 200},
                {"max_depth": 10, "max_features": 80, "n_estimators": 200},
            ],
        ]

    # Only needed on the CSV path, so imported locally.
    import ast
    import os

    params = []
    for data_id in data_ids:
        file_name = f"OFvsRF_grid_search_cv_results_openml_{data_id}.csv"
        # Sort so that the first row of each classifier is its best run.
        results = pd.read_csv(os.path.join(folder_path, file_name)).sort_values(
            "mean_test_score", ascending=False
        )
        # NOTE(review): this used to be ``eval`` on the raw CSV cell, which
        # executes arbitrary code from the file. ``literal_eval`` only accepts
        # Python literals, which is sufficient for a dict of plain
        # hyper-parameter values.
        params.append(
            [
                ast.literal_eval(results.query(f'clf=="{clf}"')["params"].iloc[0])
                for clf in ["RF", "OF"]
            ]
        )

    return params


# Look up the tuned hyper-parameters, then score both forests on each dataset.
params = load_best_params(data_ids=data_ids)

for i, data_id in enumerate(data_ids):
    X, y, d_name = load_cc18(data_id=data_id)
    print(f"Loading [{d_name}] dataset..")
    tmp = get_scores(
        X=X,
        y=y,
        d_name=d_name,
        random_state=random_state,
        kwargs=params[i],
    )
    # Accumulate the per-dataset score tables into the module-level frame.
    df = pd.concat([df, tmp])

elapsed = datetime.now() - t0
t_d = elapsed.seconds
print(f"It took {t_d} seconds to run the script")

# Draw a comparison plot: one panel per dataset
d_names = df.dataset.unique()
N = len(d_names)

fig, ax = plt.subplots(1, N, figsize=(6 * N, 6))

for i, name in enumerate(d_names):
    # plt.subplots hands back a bare Axes (not an array) for a single panel.
    axs = ax if N == 1 else ax[i]
    dff = df.query(f'dataset == "{name}"')

    # Individual fold scores on top of a white box plot.
    sns.stripplot(data=dff, x="model", y="score", ax=axs, dodge=True)
    sns.boxplot(data=dff, x="model", y="score", ax=axs, color="white")
    axs.set_title(f"{name} (#{data_ids[i]})")

    rf = dff.query('model=="RF"')["mean"].iloc[0]
    of = dff.query('model=="OF"')["mean"].iloc[0]
    axs.legend(
        [
            f"RF (Mean Test Score: {round(rf,3)})",
            f"OF (Mean Test Score: {round(of,3)})",
        ],
        loc=4,
    )

    # Only the left-most panel carries the y-axis label.
    axs.set_ylabel("Accuracy" if i == 0 else "")
    axs.set_xlabel("")

plt.savefig(f"plot_cc18_{t_d}s.jpg")
plt.show()
100 changes: 100 additions & 0 deletions examples/ensemble/plot_oblique_axis_aligned_forests_sparse_parity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""
==========================================================================================
Plot oblique forest and axis-aligned random forest predictions on sparse parity simulation
==========================================================================================

A performance comparison between oblique forest and standard axis-
aligned random forest using sparse parity simulation dataset.

Sparse parity is a variation of the noisy parity problem,
which itself is a multivariate generalization of the noisy XOR problem.
This is a binary classification task in high dimensions. The simulation
will generate uniformly distributed `n_samples` number of sample points
in the range of -1 and +1 with `p` number of features. `p*` is a
parameter used to limit features that carry information about the class.
The binary label of a sample is then defined as 1 if an odd number of
its first `p*` features in `X` are greater than 0,
otherwise the label is defined as 0. The simulation is further detailed
in this [publication](https://epubs.siam.org/doi/epdf/10.1137/1.9781611974973.56).
"""

import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ObliqueRandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate

random_state = 123456
t0 = datetime.now()


def sparse_parity(n_samples, p=20, p_star=3, random_seed=None, **kwarg):
    """Generate a sparse parity classification dataset.

    Parameters
    ----------
    n_samples : int
        Number of samples (rows) to draw.
    p : int, default=20
        Total number of features (columns).
    p_star : int, default=3
        Number of leading features that determine the label.
    random_seed : int or None, default=None
        If not None, the global NumPy random state is seeded with this value
        before sampling. Any integer seed, including 0, is honoured.
    **kwarg
        Accepted for call-site compatibility and ignored.

    Returns
    -------
    X : ndarray of shape (n_samples, p)
        Features drawn with ``np.random.uniform(-1, 1, ...)``.
    y : ndarray of shape (n_samples,), dtype float
        1.0 where an odd number of the first ``p_star`` features are
        positive, 0.0 otherwise.
    """
    # ``if random_seed:`` would treat a seed of 0 as "no seed"; compare
    # against None explicitly.
    if random_seed is not None:
        np.random.seed(random_seed)

    X = np.random.uniform(-1, 1, (n_samples, p))

    # Parity of the per-row count of positive informative features, kept as
    # float to match the previous ``np.zeros``-based output.
    y = ((X[:, :p_star] > 0).sum(axis=1) % 2).astype(float)

    return X, y


def get_scores(X, y, n_cv=5, n_repeats=1, random_state=1, kwargs=None):
    """Cross-validate a random forest and an oblique forest on ``X``, ``y``.

    Parameters
    ----------
    X, y : feature matrix and labels handed to ``cross_validate``.
    n_cv : int, default=5
        Number of folds (``n_splits`` of ``RepeatedKFold``).
    n_repeats : int, default=1
        Number of repetitions of the k-fold split.
    random_state : int, default=1
        Seed shared by both estimators and by the splitter.
    kwargs : sequence of two dicts or None, default=None
        ``kwargs[0]`` holds the keyword arguments for
        ``RandomForestClassifier`` and ``kwargs[1]`` those for
        ``ObliqueRandomForestClassifier``. ``None`` means "use the estimator
        defaults for both".

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) with columns ``model``, ``score``
        (fold accuracy) and ``mean`` (mean accuracy of that model).
    """
    # The signature advertises ``kwargs=None``; indexing None would raise a
    # TypeError, so fall back to empty keyword dictionaries instead.
    if kwargs is None:
        rf_kwargs, of_kwargs = {}, {}
    else:
        rf_kwargs, of_kwargs = kwargs[0], kwargs[1]

    models = [
        ("RF", RandomForestClassifier(**rf_kwargs, random_state=random_state)),
        ("OF", ObliqueRandomForestClassifier(**of_kwargs, random_state=random_state)),
    ]

    rows = []
    for label, clf in models:
        # A fresh splitter with the same seed gives both models identical folds.
        cv = RepeatedKFold(
            n_splits=n_cv, n_repeats=n_repeats, random_state=random_state
        )
        fold_scores = cross_validate(
            estimator=clf, X=X, y=y, cv=cv, scoring="accuracy"
        )["test_score"]
        rows.append([label, fold_scores, fold_scores.mean()])

    df = pd.DataFrame(rows, columns=["model", "score", "mean"])
    # Each row holds an array of fold scores; explode to one row per fold.
    df = df.explode("score")
    df["score"] = df["score"].astype(float)
    df.reset_index(inplace=True, drop=True)

    return df


# Hyper-parameters found by grid search
# (first entry is passed to the random forest, second to the oblique forest)
params = [
    {"max_features": None, "n_estimators": 100, "max_depth": None},
    {"max_features": 40, "n_estimators": 100, "max_depth": 20},
]

# Simulate the data and score both forests with 3-fold cross-validation.
X, y = sparse_parity(n_samples=10000, random_seed=random_state)
df = get_scores(
    X=X,
    y=y,
    n_cv=3,
    n_repeats=1,
    random_state=random_state,
    kwargs=params,
)

elapsed = datetime.now() - t0
t_d = elapsed.seconds
print(f"It took {t_d} seconds to run the script")

# Draw a comparison plot: individual fold scores over a white box plot
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

for draw, extra in (
    (sns.stripplot, {"dodge": True}),
    (sns.boxplot, {"color": "white"}),
):
    draw(data=df, x="model", y="score", ax=ax, **extra)
ax.set_title("Sparse Parity")

# The legend entries report the mean accuracy of each model.
rf = df.query('model=="RF"')["mean"].iloc[0]
of = df.query('model=="OF"')["mean"].iloc[0]
ax.legend(
    [
        f"RF (Mean Test Score: {round(rf,3)})",
        f"OF (Mean Test Score: {round(of,3)})",
    ],
    loc=4,
)

plt.savefig(f"plot_sim_{t_d}s.jpg")
plt.show()
100 changes: 56 additions & 44 deletions examples/tree/plot_iris_dtc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.
Plot the decision surface of a decision tree and oblique decision tree
trained on pairs of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

Expand All @@ -27,61 +27,73 @@
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier, ObliqueDecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay


# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
# We only take the two corresponding features
X = iris.data[:, pair]
y = iris.target

# Train
clf = DecisionTreeClassifier().fit(X, y)

# Plot the decision boundary
ax = plt.subplot(2, 3, pairidx + 1)
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
DecisionBoundaryDisplay.from_estimator(
clf,
X,
cmap=plt.cm.RdYlBu,
response_method="predict",
ax=ax,
xlabel=iris.feature_names[pair[0]],
ylabel=iris.feature_names[pair[1]],
)

# Plot the training points
for i, color in zip(range(n_classes), plot_colors):
idx = np.where(y == i)
plt.scatter(
X[idx, 0],
X[idx, 1],
c=color,
label=iris.target_names[i],
clf_labels = ['Random', 'Oblique']
random_state = 123456

clfs = [
DecisionTreeClassifier(random_state=random_state),
ObliqueDecisionTreeClassifier(random_state=random_state)
]

for clf, clf_lab in zip(clfs, clf_labels):

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
# We only take the two corresponding features
X = iris.data[:, pair]
y = iris.target

# Train
clf.fit(X, y)

# Plot the decision boundary
ax = plt.subplot(2, 3, pairidx + 1)
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
DecisionBoundaryDisplay.from_estimator(
clf,
X,
cmap=plt.cm.RdYlBu,
edgecolor="black",
s=15,
response_method="predict",
ax=ax,
xlabel=iris.feature_names[pair[0]],
ylabel=iris.feature_names[pair[1]],
)

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")
# Plot the training points
for i, color in zip(range(n_classes), plot_colors):
idx = np.where(y == i)
plt.scatter(
X[idx, 0],
X[idx, 1],
c=color,
label=iris.target_names[i],
cmap=plt.cm.RdYlBu,
edgecolor="black",
s=15,
)

plt.suptitle(f"Decision surface of {clf_lab} decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")
plt.show()

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
for clf, clf_lab in zip(clfs, clf_labels):
plt.figure()
clf.fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title(f"{clf_lab} decision tree trained on all the iris features")
plt.show()

# %%
Loading