-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresid-plot.py
78 lines (61 loc) · 2.4 KB
/
resid-plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import toolshed as ts
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence
import scipy.stats as ss
from statsmodels.formula.api import ols
import pandas as pd
import csv
import sys
csv.field_size_limit(14365000)
cutoff = 0.3
def fn(scores, covs):
c = np.array(covs)**2
#c /= c.sum()
return sum(sc * cov for sc, cov, oc in zip(scores, c, covs) if oc > cutoff)
sigma = []
cgs, zs, xs, ys, genes = [], [], [], [], []
try:
for i, d in enumerate(ts.reader(1)):
gerps = [float(x)+13 for x in d['gerp'].split(",")]
genes.append((d['chrom'], str(d['start']), str(d['end']), d['gene'],
d['exon'], str(len(gerps))))
coverage = map(float, d['coverage'].split(","))
score = fn(gerps, coverage)
cgs.append(float(d['cg_content']))
ys.append(score)
xs.append(sum(c for c in coverage if c > cutoff))
#sigma.append(np.std([sc * cov for sc, cov in zip(gerps, coverage)]) or 1)
# just subtract scaled gerp from scaled coverage.
zs.append(sum(cov - g / 25. for cov, g in zip(coverage, gerps) if cov >
cutoff))
except:
raise
X = pd.DataFrame({"CpG": cgs, "sum-coverage": xs})
#X = np.array(cgs)
results = sm.OLS(ys, X, hasconst=False).fit()
l = results.predict([np.zeros(X.shape[1] if len(X.shape) > 1 else 1), X.max(0)])
print >>sys.stderr, results.params
plt.plot(cgs, ys, marker='.', ls='none')
plt.plot([0, max(cgs)], l)
plt.title("split by missense")
plt.xlabel("sum(coverage)")
plt.ylabel("sum(cov * g for cov, g in zip(coverage, gerp))")
#plt.show()
resid = results.resid
resid = OLSInfluence(results).get_resid_studentized_external()
pctile = 100.0 * np.sort(resid).searchsorted(resid) / float(len(resid))
score_pctile = 100.0 * np.sort(xs).searchsorted(xs) / float(len(xs))
z_pctile = 100.0 * np.sort(zs).searchsorted(zs) / float(len(zs))
print "chrom\tstart\tend\tgene\texon\tn\tgerp_resid\tgerp_resid_pctile\tscore_pctile\tz_pctile"
for i, row in enumerate(genes):
print "\t".join(list(row) + ["%.3f" % resid[i], "%.9f" % pctile[i],
"%.9f" % score_pctile[i],
"%.9f" % z_pctile[i],
])
#plt.close()
#fig = plt.figure(figsize=(12,8))
#fig = sm.graphics.plot_partregress_grid(results, fig=fig)
#plt.show()