#!/bin/bash
## Swift Biosciences 16S SNAPP (snapp.sh) workflow
## Authors: Benli Chai & Sukhinder Sandhu, 2020-05-02
## Main wrapper script for running the workflow.
## Edit/set the parameters in the config.txt file before running.
## Run as: snapp.sh config.txt inputDir workDir
## Make sure the work dir exists before running the pipeline.
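## Example invocation (paths are illustrative):
##   ./snapp.sh config.txt /data/run01/fastq /data/run01/work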
START=$(date +%s.%N)
if [ $# -ne 3 ]
then
    echo 'Usage: snapp.sh config.txt inputdir workdir'
    exit 1
fi
set -e
set -x
export SCRIPTS=$(dirname $(readlink -f $0)) #the path to scripts
source $1 #load config.txt
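## A minimal config.txt sketch: the variable names are the ones this script references;
## the example values/paths are assumptions and must be adapted to the local installation.
#   CUTADAPT=/usr/local/bin/cutadapt          # path to the cutadapt executable
#   VSEARCH=/usr/local/bin/vsearch            # path to the vsearch executable
#   RDPHOME=/opt/rdp_classifier/dist          # directory containing classifier.jar
#   RDP_CLASSIFIER=                           # optional custom RDP training properties file (-t)
#   PRIMERS=/path/to/swift_16S_primers.fasta  # primer FASTA passed to cutadapt via -g/-G file:
#   READLEN=130                               # minimum read length after trimming; also passed to DADA2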
cat ${1} #print the configuration settings being used
INPUTDIR=$(readlink -f $2) # the absolute dir where the fastq.gz files are located
export WD=$(readlink -f $3) #work folder
runlog='log'
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
echo $current_time
export log=${runlog}.${current_time}
[ ! -d "$WD" ] && echo "Directory $WD does not exist, please create it first..." && exit 1
##Make a directory for primer-trimmed sequence files
mkdir ${WD}/trimmed
mkdir RESDIR
export RESDIR=$(readlink -f $PWD/RESDIR)
cd ${WD}
##Match and trim primers from PE reads
echo -e "Matching/trimming primers...\n Starts: $(date)" >> $log
start=$(date +%s.%N)
trimStats='trimStats.txt'
printf "SampleID\tStarting read count\tPrimer trimmed\n" >> ${trimStats}
for R1 in ${INPUTDIR}/*_R1_*fastq.gz; do
    echo $PRIMERS
    R2=${R1/_R1_/_R2_} #the path to the read 2 file
    basenameR1=${R1##*/}
    basenameR2=${R2##*/}
    prefixR1=${basenameR1%".fastq.gz"}
    prefixR2=${basenameR2%".fastq.gz"}
    prefix=${prefixR1%_L001_R1_*}
    totalCount=$(bc <<< $(zcat ${R1}|wc -l)/4)
    #PE option to trim primers off the reads
    $CUTADAPT -e 0.10 -g file:$PRIMERS -G file:$PRIMERS \
        -o trimmed/${basenameR1} \
        -p trimmed/${basenameR2} \
        --untrimmed-output ${prefixR1}_NotTrimmed.fastq \
        --untrimmed-paired-output ${prefixR2}_NotTrimmed.fastq \
        $R1 $R2 \
        --max-n 0 \
        --minimum-length ${READLEN}
    [[ ! -s trimmed/${basenameR1} ]] \
        && rm trimmed/${basenameR1} \
        && rm trimmed/${basenameR2} \
        && echo "${prefix}: no reads matched the primers" \
        && rm ${prefix}* \
        && printf "${prefix}\t${totalCount}\t0\n" >> ${trimStats} \
        && continue
    trimmedCount=$(bc <<< $(zcat trimmed/${basenameR1}|wc -l)/4)
    trimPCT=$(bc <<< "scale=4 ; (${trimmedCount}/${totalCount})*100" )
    printf "${prefix}\t${totalCount}\t${trimmedCount}\n" >> ${trimStats}
    echo "$prefix trimmed: ${trimPCT}%" >> $log
done
rm *NotTrimmed.fastq
echo " Ends: $(date)">>$log
end=$(date +%s.%N)
runtime=$(python -c "print(${end} - ${start})")
echo " Primer trimming Runtime: $runtime sec" >> $log
##Run Dada2 to obtain ASVs and remove chimeric reads from primer-trimmed
##fastq PE files of all samples
echo -e "\nRunning DADA2...\n Starts: $(date)">>$log
start=$(date +%s.%N)
printf '\n' >> $log
echo -e "\nDADA2 processing stats:" >> $log
#Prepare ASV count tables and sequences in paired-end and single-read formats
${SCRIPTS}/run_dada2.R trimmed/ . ${READLEN} >> $log #run dada2 with R
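#run_dada2.R is expected to leave DADA2_summary.csv and asv_seqNcount.csv in the
#current directory; both files are consumed by the steps below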
cat "DADA2_summary.csv" | sed -e 's/"//g' >> $log
${SCRIPTS}/get_asv_files.py asv_seqNcount.csv asv
echo -e "\n Unique ASV pairs from DADA2: $(bc <<< "$(grep -c ">" \
asv_seq.fasta)/2")" >>$log
echo " Ends: $(date)">>$log
end=$(date +%s.%N)
runtime=$(python -c "print(${end} - ${start})")
echo " DADA2 Runtime: $runtime sec" >> $log
##Classify asv PEs with RDP classifier
echo -e "\nRunning RDP classifier...\n Starts: $(date)">>$log
start=$(date +%s.%N)
if [[ -z $RDP_CLASSIFIER ]]
then
    java -Xmx4g -jar ${RDPHOME}/classifier.jar \
        -f fixrank \
        -o asv_PE.cls \
        asv_PE.fasta
else
    java -Xmx4g -jar ${RDPHOME}/classifier.jar \
        -t $RDP_CLASSIFIER \
        -o asv_PE.cls \
        asv_PE.fasta
fi
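#With RDP_CLASSIFIER unset, the classifier's bundled default (16S rRNA) training set is
#used with fixed-rank output; otherwise the custom training properties file is passed via -t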
end=$(date +%s.%N)
runtime=$(python -c "print(${end} - ${start})")
echo " Read classfication Runtime: $runtime sec" >> $log
##Data reduction, i.e. reduce the number of reads for downstream processing
#Retain only unique sequences comparing both strands
echo -e "\nDereplicating ASVs...\n Starts: $(date)" >>$log
start=$(date +%s.%N)
$VSEARCH --cluster_size asv_seq.fasta \
    --strand both \
    --iddef 1 \
    --id 1.00 \
    --uc asv.uc \
    --centroids asv_uniq.fasta
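#--id 1.00 with --strand both keeps one centroid per set of sequences that are identical
#on either strand; asv.uc records which ASV is assigned to which centroid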
end=$(date +%s.%N)
runtime=$(python -c "print(${end} - ${start})")
echo " Dereplication Runtime: $runtime sec" >> $log
${SCRIPTS}/blastn_for_templates.sh
##Determine the candidate reference sequences, associate reads, allocate read counts,
##and classify each consensus sequence representing each associated amplicon set.
echo -e "\nConverging candidate template sequences...\n Starts: $(date)">>$log
start=$(date +%s.%N)
${SCRIPTS}/converge.py \
    asv_count.csv \
    asv_PE.fasta \
    asv_PE.cls \
    $log
end=$(date +%s.%N)
runtime=$(python -c "print(${end} - ${start})")
echo " Converging Runtime: $runtime sec" >> $log
echo "\n" >> $log
##Cluster all consensus sequences into OTUs via all-vs-all minimap2 alignment
start=$(date +%s.%N)
cat *consensus.fasta > all_cons.fasta
${SCRIPTS}/sum_count.py . all
$VSEARCH --sortbysize all_cons.fasta --output all_cons_sorted.fasta
mv all_cons_sorted.fasta all_cons.fasta
[ ! -d "tmp" ] && echo "creating tmp directory" && mkdir tmp
#split the sorted fasta file into smaller ones for running minimap2 alignment
${SCRIPTS}/splitFasta.py all_cons.fasta 10000 tmp
for fas in tmp/*.fas; do
    prefix=${fas%.fas}
    prefix=${prefix##*/all_cons_}
    minimap2 -a --eqx -o tmp/${prefix}.sam all_cons.fasta ${fas}
done
cat tmp/*.sam | grep -v "^@" > all_cons.sam
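#the cutoff below (0.03) is presumably the maximum pairwise divergence tolerated within
#a cluster, i.e. consensus sequences are grouped into OTUs at roughly 97% identity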
cutoff=0.03
${SCRIPTS}/parse_minimap2_clr.py all_cons.sam all_cons.clr $cutoff
${SCRIPTS}/get_OTU_table.py . all_cons.fasta all_cons.clr OTUs
END=$(date +%s.%N)
runtime=$(python -c "print(${END} - ${start})")
echo " Clustering time: $runtime sec" >> $log
echo "\n" >> $log
echo "\n" >> $log
runtime=$(python -c "print(${END} - ${START})")
echo -e "\nWhole process completed in: $runtime sec">>$log
#clean up
rm -rf tmp