This repository has been archived by the owner on May 11, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPROFYLE_TMB.recipe
216 lines (189 loc) · 11 KB
/
PROFYLE_TMB.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
BootStrap: yum
OSVersion: 7
MirrorURL: http://mirror.centos.org/centos-%{OSVERSION}/%{OSVERSION}/os/x86_64/
Include: yum
%runscript
usage_simple="
\n
TMB container V0.1\n
\n
Usage: singularity run --bind /path/to/reference/:/ref,/path/to/tumour/bam/:/tumour,//path/to/normal/bam/:/normal\n
PROFYLE_TMB -t tumour_bam_file.bam -n normal_bam_file.bam -r /ref/yourRef.fa -o /path/to/output\n
\n
Options:\n
--bind\tThe ABSOLUTE paths to your tumour bam, normal bam and reference files.\n
\t\t You can find the absolute path to symlinks using 'readlink -f myFile'\n
-t\tThe name of the tumour Bam file\n
-n\tThe name of the normal Bam file\n
-r\tThe name of the fasta file (must be indexed)\n
-o\tThe output folder to use\n
-h\tPrint the long help message\n
\n
For example:\n
singularity run --bind /projects/alignment_references/Homo_sapiens/hg19a/genome/fasta/:/ref,/projects/cellLineControls/genome/COLO829/hg19/dilutions/mem/:/tumour,/projects/cellLineControls/genome/COLO829BL/:/normal PROFYLE_TMB -t /tumour/A36971_2_lanes_dupsFlagged.bam -n /normal/A36973_1_lane_dupsFlagged.bam -r /ref/hg19a.fa -o `pwd`\n
"
unset tumour_bam
while getopts ":t:n:r:o:h" opt; do
case ${opt} in
t ) #Tumour bam file
tumour_bam=$OPTARG
;;
n ) #normal bam file
normal_bam=$OPTARG
;;
r ) #reference file
ref=$OPTARG
;;
o ) #base output folder
out_base=$OPTARG
;;
h )
echo -e ${usage_simple}
exit 1
;;
\? )
echo "Invalid option: $OPTARG" 1>&2
echo -e ${usage_simple}
exit 1
;;
: )
echo "Invalid option: $OPTARG requires an argument" 1>&2
echo -e ${usage_simple}
exit 1
;;
esac
done
shift $((OPTIND -1))
#handle the no parameters case
if [ -z "$tumour_bam" ]
then
echo -e ${usage_simple}
exit 1
fi
#make sure files exist
if [ ! -f "$tumour_bam" ] || [ ! -f "$normal_bam" ] || [ ! -f "$ref" ]
then
echo "One or more files don't exist. Make sure you are providing the correct"
echo "folder paths to the singularity --bind argument. Also check your file"
echo "names. You may find the "
echo "singularity shell"
echo "command useful to make sure your files are visible from within the"
echo "container."
fi
# Setup variables for filenames
tumour_name=`basename ${tumour_bam}`
normal_name=`basename ${normal_bam}`
ref_name=`basename ${ref}`
#results folder
results_folder=${out_base}/"counts_"${tumour_name}_${normal_name}_${ref_name}
mkdir -p ${results_folder}
results_file="${results_folder}/report.txt"
log_file="${results_folder}/log.txt"
echo "TMB Pipeline V0.1" | tee ${log_file}
echo "Tumour BAM File: ${tumour_bam}" | tee -a ${log_file}
echo "Normal BAM File: ${normal_bam}"| tee -a ${log_file}
echo "Reference: ${ref}"| tee -a ${log_file}
echo "Manta V1.6.0"| tee -a ${log_file}
echo "Strelka 2.9.2"| tee -a ${log_file}
echo "SnpEff 4.3t GRCh37.75"| tee -a ${log_file}
echo "msisensor 0.2" | tee -a ${log_file}
printf "Output going to: %s\n" ${out_base} | tee -a ${log_file}
#get reference base counts
echo "checking the reference bases to use for calculations..." | tee -a ${log_file}
count_file=${results_folder}/genome_ACTG_count.txt
#Took out the check that would skip counting bases if the file exists. If someone ran a test and killed the job early, the file would be empty but never overridden
# if test ! -f ${count_file}; then
/usr/TMB/seqtk-1.3/seqtk comp ${ref} | grep -E '^[123456789XY]{1,2}\s' | awk '{ print $3+$4+$5+$6 }' | paste -sd+ | bc > ${count_file}
# fi
#Run Manta
echo "running manta..." | tee -a ${log_file}
manta_folder=${out_base}/manta_${tumour_name}_${normal_name}_${ref_name}
/usr/TMB/manta-1.6.0.centos6_x86_64/bin/configManta.py --normalBam=${normal_bam} --tumorBam=${tumour_bam} --referenceFasta=${ref} --runDir $manta_folder 2>> ${log_file}
${manta_folder}/runWorkflow.py -m local -j 48 2>> ${log_file}
#Run Strelka
echo "running strelka..." | tee -a ${log_file}
strelka_folder=${out_base}/strelka_${tumour_name}_${normal_name}_${ref_name}
/usr/TMB/strelka-2.9.2.centos6_x86_64/bin//configureStrelkaSomaticWorkflow.py --normalBam ${normal_bam} --tumorBam ${tumour_bam} --referenceFasta ${ref} --runDir ${strelka_folder} --indelCandidates ${manta_folder}/results/variants/candidateSmallIndels.vcf.gz 2>> ${log_file}
${strelka_folder}/runWorkflow.py -m local -j 48 2>> ${log_file}
#Filter to get just the passed variants int he counts below
echo "annotating variants..." | tee -a ${log_file}
java -Xmx16g -jar /usr/TMB/snpEff/SnpSift.jar filter "(FILTER = 'PASS')" ${manta_folder}/results/variants/somaticSV.vcf.gz > ${manta_folder}/results/variants/somatic.SV.PASS.vcf | tee -a ${log_file}
java -Xmx16g -jar /usr/TMB/snpEff/SnpSift.jar filter "(FILTER = 'PASS')" ${strelka_folder}/results/variants/somatic.snvs.vcf.gz > ${strelka_folder}/results/variants/somatic.snvs.PASS.vcf | tee -a ${log_file}
java -Xmx16g -jar /usr/TMB/snpEff/SnpSift.jar filter "(FILTER = 'PASS')" ${strelka_folder}/results/variants/somatic.indels.vcf.gz > ${strelka_folder}/results/variants/somatic.indels.PASS.vcf | tee -a ${log_file}
#Run SnpEff
java -Xmx48g -jar /usr/TMB/snpEff/snpEff.jar GRCh37.75 -s ${results_folder}/snpEff_summary_snv_pass.html ${strelka_folder}/results/variants/somatic.snvs.PASS.vcf > ${results_folder}/somatic.snvs.PASS.SnpEff.vcf | tee -a ${log_file}
java -Xmx48g -jar /usr/TMB/snpEff/snpEff.jar GRCh37.75 -s ${results_folder}/snpEff_summary_indel_pass.html ${strelka_folder}/results/variants/somatic.indels.PASS.vcf > ${results_folder}/somatic.indels.PASS.SnpEff.vcf | tee -a ${log_file}
java -Xmx48g -jar /usr/TMB/snpEff/snpEff.jar GRCh37.75 -s ${results_folder}/snpEff_summary_sv_pass.html ${manta_folder}/results/variants/somatic.SV.PASS.vcf > ${results_folder}/somatic.SV.PASS.SnpEff.vcf | tee -a ${log_file}
#run msisensor and send the output to the counts folder
echo "estimating MSI..." | tee -a ${log_file}
/usr/TMB/msisensor.linux msi -d /usr/TMB/microsatellites.rep1 -n ${normal_bam} -t ${tumour_bam} -o ${results_folder}/msi_out.txt -b 48 > ${results_folder}/msi_stdout.txt 2> ${results_folder}/msi_stderr.txt
#Print Report:
total_bases=$(cat ${count_file})
CDS_bases=$(cat /usr/TMB/CDS_size.txt)
total_SNVs=$(cat ${results_folder}/somatic.snvs.PASS.SnpEff.vcf | grep -v ^# | wc -l)
total_Indels=$(cat ${results_folder}//somatic.indels.PASS.SnpEff.vcf | grep -v ^# | wc -l)
coding_SNVs=$(java -Xmx16g -jar /usr/TMB/snpEff/SnpSift.jar filter "(EFF[*].IMPACT = 'MODERATE') | (EFF[*].IMPACT = 'HIGH')" ${results_folder}/somatic.snvs.PASS.SnpEff.vcf | grep -E '^[1234567890XY]{1,2}\s' | wc -l)
coding_Indels=$(java -Xmx16g -jar /usr/TMB/snpEff/SnpSift.jar filter "(EFF[*].IMPACT = 'MODERATE') | (EFF[*].IMPACT = 'HIGH')" ${results_folder}/somatic.indels.PASS.SnpEff.vcf | grep -E '^[1234567890XY]{1,2}\s' | wc -l)
MSI_score=$(awk 'NR==2{ print $NF }' ${results_folder}/msi_out.txt)
echo "TMB Pipeline V0.1" | tee ${results_file}
echo "Tumour BAM File: ${tumour_bam}" | tee -a ${results_file}
echo "Normal BAM File: ${normal_bam}"| tee -a ${results_file}
printf "Non-N bases in 1-22,X,Y: %d\n" ${total_bases} | tee -a ${results_file}
printf "CDS bases in 1-22,X,Y: %d\n" ${CDS_bases} | tee -a ${results_file}
printf "Total genome SNVs: %d\n" ${total_SNVs} | tee -a ${results_file}
printf "Total genome Indels: %d\n" ${total_Indels} | tee -a ${results_file}
printf "Coding SNVs: %d\n" ${coding_SNVs} | tee -a ${results_file}
printf "Coding Indels: %d\n" ${coding_Indels} | tee -a ${results_file}
printf "============== Final results below ================\n" | tee -a ${results_file}
printf "Genome SNV TMB: %3.2f\n" `echo "scale=8; ${total_SNVs} * 1000000 / ${total_bases}" | bc` | tee -a ${results_file}
printf "Genome Indel TMB: %3.2f\n" `echo "scale=8; ${total_Indels} * 1000000 / ${total_bases}" | bc` | tee -a ${results_file}
printf "Coding SNV TMB: %3.2f\n" `echo "scale=8; ${coding_SNVs} * 1000000 / ${CDS_bases}" | bc` | tee -a ${results_file}
printf "Coding Indel TMB: %3.2f\n" `echo "scale=8; ${coding_Indels} * 1000000 / ${CDS_bases}" | bc` | tee -a ${results_file}
printf "MSI score: %3.2f\n" ${MSI_score} | tee -a ${results_file}
printf "Report Complete!" | tee -a ${results_file}
%post
mkdir /usr/TMB
cd /usr/TMB
yum -y update
yum -y install wget unzip tar bzip2 java-1.8.0-openjdk
yum -y install zlib-devel ncurses-devel ncurses
yum -y install libgomp
yum -y install make
yum -y install gcc
yum -y install bc
#install strelka2.9.2
wget https://github.com/Illumina/strelka/releases/download/v2.9.2/strelka-2.9.2.centos6_x86_64.tar.bz2
tar xvjf strelka-2.9.2.centos6_x86_64.tar.bz2
#run install tests
# bash strelka-2.9.2.centos6_x86_64/bin/runStrelkaSomaticWorkflowDemo.bash
# bash strelka-2.9.2.centos6_x86_64/bin/runStrelkaGermlineWorkflowDemo.bash
#install Manta for SV calls, and upstream Strelka calling
wget https://github.com/Illumina/manta/releases/download/v1.6.0/manta-1.6.0.centos6_x86_64.tar.bz2
tar xvjf manta-1.6.0.centos6_x86_64.tar.bz2
#run a small tests in Manta
# python manta-1.6.0.centos6_x86_64/bin//runMantaWorkflowDemo.py
#bedtools
wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools.static.binary
chmod 775 bedtools.static.binary
#seqtk for counting fasta bases quickly
wget https://github.com/lh3/seqtk/archive/v1.3.tar.gz
tar -xvf v1.3.tar.gz
cd seqtk-1.3/
make
cd /usr/TMB
#install snpEff
wget https://cfhcable.dl.sourceforge.net/project/snpeff/snpEff_v4_3t_core.zip
unzip snpEff_v4_3t_core.zip
#Set up Ensembl 75 GRCh37 database (will take a few minutes)
java -jar /usr/TMB/snpEff/snpEff.jar download GRCh37.75
#extract the stats for the annotations - needs about 16Gb od RAM so this won't build on singularityhub where the RAM limit is 3.5Gb.
java -jar /usr/TMB/snpEff/snpEff.jar dump -v -bed GRCh37.75 > GRCh37.75.bed
#get the size of the CDS
grep -E '^[1234567890XY]{1,2}\s' GRCh37.75.bed | grep CDS | ./bedtools.static.binary sort | ./bedtools.static.binary merge | awk '{ print $3-$2 }' | paste -sd+ | bc > CDS_size.txt
#msisensor install - would be nice to scan the reference at setup, but it needs to match the reference supplied at runtime.
wget https://github.com/ding-lab/msisensor/releases/download/0.2/msisensor.linux
chmod 775 msisensor.linux
wget https://www.bcgsc.ca/downloads/rcorbett/TMB/microsatellites.rep1
%environment
export PATH=/usr/TMB:$PATH
#export LC_ALL=C