forked from norvig/pytudes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathibol.py
193 lines (168 loc) · 7.02 KB
/
ibol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from collections import defaultdict
def get_genomes(fname="byronbayseqs.fas.txt"):
"Return a list of genomes, and a list of their corresponding names."
import re
names, species, genomes = [], [], []
for name, g in re.findall('>(.*?)\r([^\r]*)\r*', file(fname).read()):
names.append(name)
species.append(name.split('|')[-1])
genomes.append(g)
return names, species, genomes
def get_neighbors(fname="editdistances.txt"):
"Return dict: neighbors[i][j] = neighbors[j][i] = d means i,j are d apart."
## Read the data pre-computed from the Java program
neighbors = dict((i, {}) for i in range(n))
for line in file(fname):
i,j,d = map(int, line.split())
neighbors[i][j] = neighbors[j][i] = d
return neighbors
def cluster(neighbors, d, dc):
"""Return a list of clusters, each cluster element is within d of another
and within dc of every other cluster element."""
unclustered = set(neighbors) ## set of g's not yet clustered
return [closure(g, set(), unclustered, d, dc)
for g in neighbors if g in unclustered]
def closure(g, s, unclustered, d, dc):
"Accumulate in set s the transitive closure of 'near', starting at g"
if g not in s and g in unclustered and near(g, s, d, dc):
s.add(g); unclustered.remove(g)
for g2 in neighbors[g]:
closure(g2, s, unclustered, d, dc)
return s
def dist(i, j):
"Distance between two genomes."
if i == j: return 0
return neighbors[min(i, j)].get(max(i, j), max_distance)
def near(g, cluster, d, dc):
"Is g within d of some member of c, and within dc of every member of c?"
distances = [dist(g, g2) for g2 in cluster] or [0]
return min(distances) <= d and max(distances) <= dc
def diameter(cluster):
"The largest distance between two elements of the cluster"
return max([dist(i, j) for i in cluster for j in cluster] or [0])
def margin(cluster):
"The distance from a cluster to the nearest g2 outside this cluster."
return min([d for g in cluster for g2,d in neighbors[g].items()
if g2 not in cluster] or [max_distance])
################################################################ Analysis
def pct(num, den):
"Return a string representing the percentage. "
if '__len__' in dir(den): den = len(den)
if num==den: return ' 100%'
return '%.1f%%' % (num*100.0/den)
def histo(items):
"Make a histogram from a sequence of items or (item, count) tuples."
D = defaultdict(int)
for item in items:
if isinstance(item, tuple): D[item[0]] += item[1]
else: D[item] += 1
return D
def showh(d):
"Show a histogram"
if not isinstance(d, dict): d = histo(d)
return ' '.join('%s:%s' % i for i in sorted(d.items()))
def greport(genomes):
print "Number of genomes: %d (%d distinct)" % (len(genomes), len(set(genomes)))
G = dict((g, set()) for g in genomes)
for i in range(n):
G[genomes[i]].add(species[i])
print "Multi-named genomes:", (
len([s for s in G.values() if len(s) > 1]))
lens = map(len, genomes)
print "Genome lengths: min=%d, max=%d" % (min(lens), max(lens))
print "Character counts: ", showh(c for g in genomes for c in g)
def nreport(neighbors):
NN, NumN = defaultdict(int), defaultdict(int) ## Nearest, Number of neighbors
for n in neighbors:
nn = min(neighbors[n].values() or ['>25'])
NN[nn] += 1
for d2 in neighbors[n].values():
NumN[d2] += 1
print
print "Nearest neighbor counts:", showh(NN)
print "Number of neighbors at each distance:", showh(NumN)
def nspecies(c): return len(set(species[g] for g in c))
def showc(c):
return "N=%d, D=%d, M=%d: %s %s" % (
len(c), diameter(c), margin(c), list(c), showh(species[g] for g in c))
def creport(drange, dcrange):
def table(what, fn):
print "\n" + what
print ' '*8, ' '.join([' '+pct(dc, glen) for dc in dcrange])
for d in drange:
print '%s (%2d)' % (pct(d, glen), d),
for dc in dcrange:
print '%5s' % fn(cluster(neighbors, d, dc)),
print
print '\nNearest neighbor must be closer than this percentage (places). '
print 'Each column: all genomes in cluster within this percentage of each other.'
table("Number of clusters", len)
cluster1 = cluster(neighbors, 8, 15) ## splits Cleora
print '\nNumber of clusters of different sizes:', showh(len(c) for c in cluster1)
M, T = defaultdict(int), defaultdict(int)
for c in cluster1:
M[margin(c)] += 1; T[margin(c)] += len(c)
for x in M: print '%d\t%d\t%d'% (x,M[x],T[x])
print '\nMargins', showh(M)
for c in cluster1:
if margin(c) <= 16:
print showc(c)
print '\nScatter plot of cluster diameter vs. margin.'
for c in cluster1:
if diameter(c) > 0:
pass
#print '%d\t%d' % (diameter(c), margin(c))
print '\nDifference from cluster(neighbors, 11, 14):'
#table(lambda cl: pct(len(cluster1)-compare(cluster1, cl),max(len(cluster1),len(cl))))
print '\nNumber of clusters witth more than one species name:'
#table(lambda cl: sum(nspecies(c) > 1 for c in cl))
def pct_near_another(clusters, P=1.25):
total = 0
for c in clusters:
d = diameter(c)
for g in c:
for g2 in neighbors[g]:
if g2 not in c and dist(g, g2) < P*d:
total += 1
return pct(total, n)
def f(P):
print '\nPercent of individuals within %.2f*diameter of another cluster.'%P
table(lambda cl: pct_near_another(cl, P))
#map(f, [1.2, 1.33, 1.5])
def sreport(species):
SS = defaultdict(int)
print
for s in set(species):
c = [g for g in range(n) if species[g] == s]
d = diameter(c)
if d > 14:
if d==glen: d = '>25'
print 'diameter %s for %s (%d elements)' % (d, s, len(c))
SS[d] += 1
print 'Diameters of %d labelled clusters: %s' % (len(set(species)), showh(SS))
def compare(cl1, cl2):
"Compare two lists of clusters"
return sum(c1==c2 or 0.5*(abs(len(c1)-len(c2))==1 and
(c1.issubset(c2) or c2.issubset(c1)))
for c1 in cl1 for c2 in cl2)
def unit_tests():
assert set(len(g) for g in genomes) == set([glen])
clusters = cluster(neighbors, 11, 11)
assert sum(len(c) for c in clusters) == len(genomes)
assert len(set(g for c in clusters for g in c)) == len(genomes)
assert dist(17, 42) == dist(42, 17)
assert diameter(set()) == 0
assert diameter([17, 42]) == dist(17, 42)
assert pct(1, 2) == '50.0%'
print '\nAll tests pass.\n'
################################################################ Main body
max_distance = 26
names, species, genomes = get_genomes() ## genomes = ['ACT...', ...]
n = len(genomes)
glen = len(genomes[0])
neighbors = get_neighbors() ## neighbor[g] = {g2:d2, g3:g3, ...}
greport(genomes)
nreport(neighbors)
creport(range(6, 15), [glen,16,15,14,13, 12, 11])
#sreport(species)
unit_tests()