-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_unequal_fasta.py
57 lines (46 loc) · 1.39 KB
/
merge_unequal_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python3
import sys
import fileinput
def _read_names(names_path):
    """Return the unique, non-blank lines of *names_path* in file order.

    Trailing whitespace is stripped from every line.  Blank lines are
    skipped so that a trailing empty line does not turn into a record with
    an empty header, and repeated names are collapsed so that a sample is
    only extended once per FASTA file.
    """
    with open(names_path) as handle:
        stripped = (line.rstrip() for line in handle)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(name for name in stripped if name))


def _read_fasta(fasta_path):
    """Parse *fasta_path* into a ``{header line: sequence}`` dict.

    Keys are the complete header lines (leading ``>`` included, trailing
    whitespace stripped).  Sequences that are wrapped over several lines
    are joined into one string.  Blank lines and any text that precedes
    the first header are ignored.  A header with no sequence lines is left
    out, so the caller pads that sample with gaps.  If a header occurs more
    than once, the last occurrence that has a sequence wins.
    """
    records = {}
    header = None
    chunks = []
    with open(fasta_path) as handle:
        for raw in handle:
            line = raw.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                if header is not None and chunks:
                    records[header] = ''.join(chunks)
                header = line
                chunks = []
            elif header is not None:
                chunks.append(line)
    # Flush the final record; there is no following header to trigger it.
    if header is not None and chunks:
        records[header] = ''.join(chunks)
    return records


def merge_fasta_files(all_names, output, fastas):
    """Concatenate per-sample sequences from several FASTA files.

    Args:
        all_names: Path to a text file listing one sequence header per
            line.  Each entry has to match the full header line used in
            the FASTA files, including the leading ``>``, because it is
            looked up verbatim and written verbatim to the output.
        output: Output path without extension; ``.fasta`` is appended.
        fastas: FASTA file paths, in the order their sequences should be
            concatenated.

    Returns:
        A dict mapping each listed header to its concatenated sequence, in
        the order of the names file.  The same content is written to
        ``output + '.fasta'``.

    For every FASTA file, a listed sample that is absent from that file is
    padded with ``-`` characters.  The pad length is the length of the
    first sequence found in the file (0 if the file holds no sequences).
    """
    names = _read_names(all_names)
    # Collect the pieces per sample and join once at the end.
    pieces = {name: [] for name in names}

    for fasta_path in fastas:
        records = _read_fasta(fasta_path)
        seqlength = len(next(iter(records.values()), ''))
        print(fasta_path, seqlength, "bp in length")
        emptyseq = '-' * seqlength
        for name in names:
            pieces[name].append(records.get(name, emptyseq))

    merged = {name: ''.join(parts) for name, parts in pieces.items()}

    # The output is opened only after every input has been read, so a
    # missing or unreadable input does not leave an empty output behind.
    with open(output + '.fasta', 'w') as fullfas:
        for name, sequence in merged.items():
            fullfas.write(name + '\n' + sequence + '\n')
    return merged


def main(argv=None):
    """Command-line entry point.

    Usage: merge_unequal_fasta.py NAMES_FILE OUTPUT_PREFIX [FASTA ...]

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If fewer than two arguments are supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        raise SystemExit(
            "usage: merge_unequal_fasta.py NAMES_FILE OUTPUT_PREFIX [FASTA ...]"
        )
    all_names = args[0]  # text file with all sequence headers
    output = args[1]  # output path prefix
    fastas = args[2:]  # fasta files in the order they are to be merged
    print(all_names)
    print(output)
    print(fastas)
    print(len(fastas), "fasta files")
    merge_fasta_files(all_names, output, fastas)


if __name__ == "__main__":
    main()