-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUpdateKrakenDatabases.py
160 lines (144 loc) · 6.91 KB
/
UpdateKrakenDatabases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
@author: sejalmodha
"""
#import OS module to use os methods and functions
import os
import subprocess
import sys
import datetime
import re
# you'll see this alias in documentation, examples, etc.
import pandas as pd
#import biopython SeqIO
from Bio import SeqIO
#set present working directory
if len(sys.argv) > 1:
cwd = sys.argv[1]
else:
cwd=os.getcwd()
#os.chdir(pwd)
print(cwd)
#get the current working directory
os.chdir(cwd)
print(os.getcwd())
#function to process ftp url file that is created from assembly files
def process_url_file(inputurlfile):
url_file=open(inputurlfile,'r')
file_suffix=r'genomic.gbff.gz'
for line in url_file:
url=line.rstrip('\n').split(',')
ftp_url= url[0]+'/'+url[1]+'_'+url[2]+'_'+file_suffix
#print(new_url)
print("Downloading"+ ftp_url)
#Download the files in the gbff format
subprocess.call("wget "+ftp_url,shell=True)
#unzip the files
subprocess.call("gunzip *.gz",shell=True)
return
#function to download bacterial sequences
def download_bacterial_genomes(outfile='outfile.txt'):
assembly_summary_file=r'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
#Download the file using wget sysyem call
subprocess.call("wget "+assembly_summary_file, shell=True)
#Reformat the file to pandas-friendly format
subprocess.call("sed -i '1d' assembly_summary.txt",shell=True)
subprocess.call("sed -i 's/^# //' assembly_summary.txt", shell=True)
#Read the file as a dataframe - using read_table
#Use read_table if the column separator is tab
assembly_sum = pd.read_table('assembly_summary.txt')
#filter the dataframe and save the URLs of the complete genomes in a new file
my_df=assembly_sum[(assembly_sum['version_status'] == 'latest') &
(assembly_sum['assembly_level']=='Complete Genome')
]
my_df=my_df[['ftp_path','assembly_accession','asm_name']]
#output_file.write
my_df.to_csv(outfile,mode='w',index=False,header=None)
process_url_file(outfile)
logdata = my_df[['organism_name','asm_name','seq_rel_date']].to_csv(sep='\t',header=False,index=False)
return(logdata)
#function to download reference genomes
#this function downloads latest version human reference genome by default
def download_refseq_genome(taxid=9606,outfile='refseq_genome.txt'):
assembly_summary_file="ftp://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
#Download the file using wget sysyem call
subprocess.call("wget "+assembly_summary_file, shell=True)
#Reformat the file to pandas-friendly format
subprocess.call("sed -i '1d' assembly_summary_refseq.txt",shell=True)
subprocess.call("sed -i 's/^# //' assembly_summary_refseq.txt", shell=True)
#Read the file as a dataframe - using read_table
#Use read_table if the column separator is tab
assembly_sum = pd.read_table('assembly_summary_refseq.txt')
my_df=assembly_sum[(assembly_sum['taxid'] == taxid) &
(assembly_sum['refseq_category'] == 'reference genome')]
my_df=my_df[['ftp_path','assembly_accession','asm_name']]
#Process the newly created file and download genomes from NCBI website
my_df.to_csv(outfile,mode='w',index=False,header=None)
process_url_file(outfile)
logdata = my_df[['organism_name','asm_name','seq_rel_date']].to_csv(sep='\t',index=False)
return(logdata)
#format genbank files to generate kraken-friendly formatted fasta files
def get_fasta_in_kraken_format(outfile_fasta='sequences.fa'):
output=open(outfile_fasta,'w')
for file_name in os.listdir(cwd):
if file_name.endswith('.gbff'):
records = SeqIO.parse(file_name, "genbank")
for seq_record in records:
seq_id=seq_record.id
seq=seq_record.seq
for feature in seq_record.features:
if 'source' in feature.type:
print(feature.qualifiers)
taxid=''.join(feature.qualifiers['db_xref'])
taxid=re.sub(r'.*taxon:','kraken:taxid|',taxid)
print(''.join(taxid))
outseq=">"+seq_id+"|"+taxid+"\n"+str(seq)+"\n"
output.write(outseq)
os.remove(file_name)
output.close()
return
#------------------------------------------------------------------------------------------
# DOWNLOAD DATABASE GENOMES
#------------------------------------------------------------------------------------------
# Create log file to store date of last update
date=datetime.date.today().strftime ("%Y%m%d")
logfile = open("{}update_DB.log".format(date),'w')
print('Downloading human genome'+'\n')
#change argument in the following function if you want to download other reference genomes
#taxonomy ID 9606 (human) should be replaced with taxonomy ID of genome of interest
logfile.write('HUMANS:\n')
logfile.write(download_refseq_genome(9606,'human_genome_url.txt'))
print('Converting sequences to kraken input format'+'\n')
get_fasta_in_kraken_format('human_genome.fa')
logfile.write('BACTERIA:\n')
print('Downloading bacterial genomes'+'\n')
logfile.write(download_bacterial_genomes('bacterial_complete_genome_url.txt'))
print('Converting sequences to kraken input format'+'\n')
get_fasta_in_kraken_format('bacterial_genomes.fa')
print('Downloading viral genomes'+'\n')
subprocess.call('wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.*.genomic.gbff.gz',shell=True)
subprocess.call('gunzip *.gz',shell=True)
print('Converting sequences to kraken input format'+'\n')
get_fasta_in_kraken_format('viral_genomes.fa')
print('Downloading archaeal genomes'+'\n')
subprocess.call('wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/archaea/archaea.*.genomic.gbff.gz',shell=True)
subprocess.call('gunzip *.gz',shell=True)
print('Converting sequences to kraken input format'+'\n')
get_fasta_in_kraken_format('archaeal_genomes.fa')
print('Downloading plasmid sequences'+'\n')
subprocess.call('wget ftp://ftp.ncbi.nlm.nih.gov/refseq/release/plasmid/plasmid.*.genomic.gbff.gz', shell=True)
subprocess.call('gunzip *.gz',shell=True)
print('Converting sequences to kraken input format'+'\n')
get_fasta_in_kraken_format('plasmids_genomes.fa')
logfile.close()
#Set name for the krakendb directory
krakendb='HumanVirusBacteria'
# subprocess.call('/tools/kraken/kraken-build --download-taxonomy --db '+krakendb, shell=True) # If it fails, can be done manually from download_taxonomy.sh
print('Running Kraken DB build for '+krakendb+'\n')
print('This might take a while '+'\n')
for fasta_file in os.listdir(cwd):
if fasta_file.endswith('.fa') or fasta_file.endswith('.fasta'):
print (fasta_file)
subprocess.call('/tools/kraken/kraken-build --add-to-library '+fasta_file +' --db '+krakendb,shell=True)
subprocess.call('/tools/kraken/kraken-build --build --db '+krakendb+' --threads 12',shell=True)