-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathParseCDHIT.py
110 lines (87 loc) · 3.15 KB
/
ParseCDHIT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
import sys
import argparse
import re
import traceback
from itertools import groupby
try:
from Bio import SeqIO
except ImportError:
msg = """
Could not import the BioPython module which means it is probably
not installed, or at least not available in the PYTHONPATH for this
particular binary.
If you have conda (recommended) try running:
$ conda install -c anaconda biopython
or, alternatively with python/pip:
$ python -m pip install biopython
"""
sys.stderr.write(msg)
sys.exit(1)
__author__ = "Joe R. J. Healey"
__version__ = "1.0"
__title__ = "ParseCDHIT"
__license__ = "GPLv3"
__author_email__ = "[email protected]"
def get_args():
"""Parse command line arguments"""
desc = (
"""Retrieve cluster sequences from the result of CD-HIT using the .clstr file"""
)
epi = """Return the sequences for each cluster defined by a run of CD-HIT.
A typical CD-HIT run would look like:
$ cd-hit -i seqs.fasta -d 0 -c 0.80 -o clust.fasta
Since this script is reliant on SeqIDs, your fasta SeqIDs must be well formed.
Ideally this means they are short and unique. The -d 0 flag is required to get
the SeqIDs as long as possible, thereby minimising the chance that sequences
get mixed up.
"""
try:
parser = argparse.ArgumentParser(description=desc, epilog=epi)
parser.add_argument(
"clusterfile", action="store", help='CD-HIT output file (ends in ".clstr").'
)
parser.add_argument(
"fastafile",
action="store",
help="CD-HIT input file of sequences (a multifasta).",
)
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
except NameError:
sys.stderr.write(
"An exception occurred with argument parsing. Check your provided options."
)
sys.exit(1)
return parser.parse_args()
def main():
"""Call functions and parse the CDHIT file"""
args = get_args()
# Load the fastafile ready to be queried
try:
idx = SeqIO.index(args.fastafile, "fasta")
except ValueError:
traceback.print_exc()
sys.stderr.write(
"BioPython caught a duplicate Sequence Identifier in the input fasta.\
Biopython's index(), and this script absolutely requires that all sequences have unique IDs. It will now exit.\n"
)
sys.exit(1)
# Get all of the sequence IDs from the cluster file
with open(args.clusterfile, "r") as cfh:
groups = [
list(group)
for key, group in groupby(cfh, lambda line: line.startswith(">Cluster"))
if not key
]
# Get all ID strings from the cluster file
ids = [re.findall(r">(.*)\.{3}", "".join(g)) for g in groups]
for i, cluster in enumerate(ids):
print("Parsing Cluster %s, with IDs:" % i)
print(cluster)
with open("Cluster_" + str(i) + ".fasta", "w") as ofh:
for identifier in cluster:
ofh.write(idx[identifier].format("fasta"))
if __name__ == "__main__":
main()