From 76aa854206734b8a59c2a950ada13ec1322e4a34 Mon Sep 17 00:00:00 2001
From: Hannah Bruce Macdonald <hannah.brucemacdonald@choderalab.org>
Date: Fri, 4 Sep 2020 15:56:47 +0100
Subject: [PATCH] Adding r2max function Issue #14

---
 arsenic/stats.py | 51 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/arsenic/stats.py b/arsenic/stats.py
index abc9a61..012901f 100644
--- a/arsenic/stats.py
+++ b/arsenic/stats.py
@@ -1,6 +1,43 @@
 import numpy as np
 
-def bootstrap_statistic(y_true, y_pred, dy_true=None, dy_pred=None, ci=0.95, statistic='RMSE', nbootstrap = 1000, plot_type='dG'):
+
+def calc_R2max(affinities, errors):
+    """Calculates the maximum achievable R2 given a set of affinities and their uncertainties.
+
+    [1] https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b01067
+    Sheridan, Robert P., et al. "Experimental Error, Kurtosis, Activity Cliffs, and Methodology: What Limits the
+    Predictivity of Quantitative Structure–Activity Relationship Models?."
+    Journal of Chemical Information and Modeling
+    60.4 (2020): 1969-1982.
+
+    Parameters
+    ----------
+    affinities : list[float]
+        List of affinities for a set (order not important)
+    errors : list[float] or float
+        Either a single experimental error for a set of compounds
+        or a per-ligand list of errors (order not important)
+
+    Returns
+    -------
+    float
+        value of R2max
+
+    """
+    if isinstance(errors, float):
+        errors = [errors for _ in affinities]
+    assert len(errors) == len(affinities), f'Need the same number of errors as affinities.\
+     {len(errors)}  errors for {len(affinities)} affinities,\
+      or one error as a float for the whole sett'
+    return 1. - (np.std(affinities) / np.std(errors))**0.5
+
+
+def bootstrap_statistic(y_true, y_pred,
+                        dy_true=None, dy_pred=None,
+                        ci=0.95,
+                        statistic='RMSE',
+                        nbootstrap=1000,
+                        plot_type='dG'):
     import sklearn.metrics
     import scipy
     """Compute mean and confidence intervals of specified statistic.
@@ -89,11 +126,12 @@ def unique_differences(x):
     assert len(y_true) == len(dy_true)
     assert len(y_true) == len(dy_pred)
     sample_size = len(y_true)
-    s_n = np.zeros([nbootstrap], np.float64) # s_n[n] is the statistic computed for bootstrap sample n
+    # s_n[n] is the statistic computed for bootstrap sample n
+    s_n = np.zeros([nbootstrap], np.float64)
     for replicate in range(nbootstrap):
         y_true_sample = np.zeros_like(y_true)
         y_pred_sample = np.zeros_like(y_pred)
-        for i,j in enumerate(np.random.choice(np.arange(sample_size), size=[sample_size], replace=True)):
+        for i, j in enumerate(np.random.choice(np.arange(sample_size), size=[sample_size], replace=True)):
             y_true_sample[i] = np.random.normal(loc=y_true[j], scale=np.fabs(dy_true[j]), size=1)
             y_pred_sample[i] = np.random.normal(loc=y_pred[j], scale=np.fabs(dy_pred[j]), size=1)
         s_n[replicate] = compute_statistic(y_true_sample, y_pred_sample, statistic)
@@ -121,7 +159,9 @@ def mle(g, factor='f_ij', node_factor=None):
     We assume the free energy of node 0 is zero.
 
     Reference : https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00528
-    Xu, Huafeng. "Optimal measurement network of pairwise differences." Journal of Chemical Information and Modeling 59.11 (2019): 4720-4728.
+    Xu, Huafeng. "Optimal measurement network of pairwise differences."
+    Journal of Chemical Information and Modeling
+    59.11 (2019): 4720-4728.
 
     Parameters
     ----------
@@ -151,7 +191,8 @@ def mle(g, factor='f_ij', node_factor=None):
         df_ij = form_edge_matrix(g, factor.replace('_', '_d'), action='symmetrize')
     else:
         f_ij = form_edge_matrix(g, factor, action='antisymmetrize', node_label=node_factor)
-        df_ij = form_edge_matrix(g, factor.replace('_', '_d'), action='symmetrize', node_label=node_factor.replace('_', '_d'))
+        df_ij = form_edge_matrix(g, factor.replace('_', '_d'),                   action='symmetrize',
+                                 node_label=node_factor.replace('_', '_d'))
 
     node_name_to_index = {}
     for i, name in enumerate(g.nodes()):