-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathfasta_cython.pyx
113 lines (97 loc) · 2.75 KB
/
fasta_cython.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# The Computer Language Benchmarks Game
# http://shootout.alioth.debian.org/
#
# modified by Ian Osgood
# modified again by Heinrich Acker
cdef extern from *:
void memcpy(void *, void *, int)
import sys
alu = (
'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG'
'GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA'
'CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT'
'ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA'
'GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG'
'AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC'
'AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA')
iub = zip('acgtBDHKMNRSVWY', [0.27, 0.12, 0.12, 0.27] + [0.02]*11)
homosapiens = [
('a', 0.3029549426680),
('c', 0.1979883004921),
('g', 0.1975473066391),
('t', 0.3015094502008),
]
cdef unsigned long seed = 42
cdef inline float pseudo_random(float lim):
cdef unsigned long ia = 3877, ic = 29573, im = 139968
cdef float imf = im
global seed
seed = (seed * ia + ic) % im
return lim * <long>seed / imf
def makeCumulative(table):
P = []
C = []
prob = 0.
for char, p in table:
prob += p
P += [prob]
C += [char]
return (P, C)
def repeatFasta(src, int n):
cdef int i, j, r = len(src)
cdef int width = 60
s = src * 2
line = ' ' * width
cdef char* ss = s
cdef char* buf = line
i = 0
for j in xrange(n // width):
memcpy(buf, ss + i, width)
print line
i += width
i -= r * (i>r)
if n % width:
memcpy(buf, ss + i, width)
print line[:n % width]
def randomFasta(table, int n):
probs, chars = makeCumulative(table)
cdef int width = 60
cdef float cprobs[15]
cdef char cchars[15]
cdef int i, j, k
cdef float r
line = ' ' * width
# buf is a pointer to the contents of line
cdef char* buf = line
cdef int bisect
cdef float bisect_prob
for i in range(len(table)):
cprobs[i] = probs[i]
cchars[i] = ord(chars[i])
for j in xrange(n // width):
for i in range(width):
r = pseudo_random(1.0)
k = (r >= cprobs[0]) + (r >= cprobs[1]) + (r >= cprobs[2])
while cprobs[k] < r:
k += 1
buf[i] = cchars[k]
print line
if n % width:
for i in range(n % width):
r = pseudo_random(1.0)
k = 0
while cprobs[k] < r:
k += 1
buf[i] = cchars[k]
print line[:n % width]
if __name__ == '__main__':
try:
n = int(sys.argv[1])
except IndexError:
n = 25000000
print '>ONE Homo sapiens alu'
repeatFasta(alu, n*2)
print '>TWO IUB ambiguity codes'
randomFasta(iub, n*3)
print '>THREE Homo sapiens frequency'
randomFasta(homosapiens, n*5)