-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetSequences.py
86 lines (66 loc) · 2.05 KB
/
getSequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
def setOutFile(outputFile):
    """Return the absolute path of *outputFile*, creating its parent directory.

    Args:
        outputFile: Path (relative or absolute) of the file that will be
            written later. The file itself is not created here.

    Returns:
        The absolute form of *outputFile*.

    Raises:
        OSError: If the parent directory cannot be created (for example, a
            path component already exists as a regular file).
    """
    outputFile = os.path.abspath(outputFile)
    # exist_ok=True avoids the check-then-create race: another process may
    # create the directory between an isdir() test and the makedirs() call.
    os.makedirs(os.path.dirname(outputFile), exist_ok=True)
    return outputFile
def getListSequences(sequencesListFile):
    """Read sequence identifiers from a text file, one identifier per line.

    Empty lines are skipped. When a line starts with '>' (FASTA header
    style), that single leading character is removed; the rest of the line
    is kept unchanged.

    Args:
        sequencesListFile: Path of the text file listing the identifiers.

    Returns:
        A list of identifier strings in file order.
    """
    list_sequences = []
    # Plain 'rt': the legacy 'U' flag is rejected by open() on Python 3.11+,
    # and text mode already translates '\r\n' and '\r' line endings.
    with open(sequencesListFile, 'rt') as lines:
        for line in lines:
            # Drop the line terminator.
            line = line.splitlines()[0]
            if not line:
                continue
            if line[0] == '>':
                line = line[1:]
            list_sequences.append(line)
    return list_sequences
def writeOutFile(fastaFile, list_sequences, outputFile):
    """Copy the requested records of a FASTA file into a new FASTA file.

    A record is selected when the text of its header after the leading '>'
    is in *list_sequences*, or otherwise when the first whitespace-delimited
    word of that text is. Every selected record is written as its original
    header line followed by its sequence joined onto a single line.

    Args:
        fastaFile: Path of the FASTA file to read.
        list_sequences: Iterable of identifier strings to extract.
        outputFile: Path of the FASTA file to write (overwritten if present).

    Returns:
        A tuple ``(number_sequences, number_bases)``: how many records were
        written and the total length of their sequences.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    wanted = frozenset(list_sequences)
    number_sequences = 0
    number_bases = 0
    # Plain 'rt': the legacy 'U' flag is rejected by open() on Python 3.11+.
    # Both handles are context-managed so they are closed even on error.
    with open(outputFile, 'wt') as writer, open(fastaFile, 'rt') as reader:
        for header, sequence in _iter_fasta_records(reader):
            if not _is_requested(header, wanted):
                continue
            writer.write(header + '\n')
            writer.write(sequence + '\n')
            number_bases += len(sequence)
            number_sequences += 1
    return number_sequences, number_bases


def _iter_fasta_records(lines):
    """Yield ``(header_line, sequence)`` for each record in FASTA *lines*.

    Empty lines are skipped, and any text before the first header is ignored.
    An input with no header yields nothing.
    """
    header = None
    chunks = []
    for raw in lines:
        # Drop the line terminator.
        line = raw.splitlines()[0]
        if not line:
            continue
        if line[0] == '>':
            if header is not None:
                yield header, ''.join(chunks)
            header = line
            chunks = []
        elif header is not None:
            # Collect pieces and join once, avoiding repeated concatenation.
            chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)


def _is_requested(header, wanted):
    """Tell whether a FASTA *header* line matches an identifier in *wanted*."""
    name = header[1:]
    # First search: the whole header text.
    if name in wanted:
        return True
    # Second search: only the first word. A bare '>' header has no words, so
    # guard against indexing an empty list.
    words = name.split()
    return bool(words) and words[0] in wanted
def getSequences(fastaFile, sequencesListFile, outputFile):
    """Extract the sequences named in a list file from a FASTA file.

    Runs the three steps in order: resolve the output path with
    ``setOutFile``, load the identifiers with ``getListSequences``, then
    write the matching records with ``writeOutFile``.

    Args:
        fastaFile: Path of the FASTA file to read.
        sequencesListFile: Path of the file listing the identifiers.
        outputFile: Path of the FASTA file to write.

    Returns:
        A tuple ``(outputFile, number_sequences, number_bases)`` holding the
        path returned by ``setOutFile`` and the two counts returned by
        ``writeOutFile``.
    """
    resolved_output = setOutFile(outputFile)
    wanted_ids = getListSequences(sequencesListFile)
    written_records, written_bases = writeOutFile(
        fastaFile, wanted_ids, resolved_output
    )
    return resolved_output, written_records, written_bases