-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVCFmerger.sh
executable file
·171 lines (148 loc) · 5.87 KB
/
VCFmerger.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/bin/bash
# VCF MERGER 0.7 - Merges the selected regions of the two VCF files with bcftools. Calculates statistics with R.
# WARNING: This script requires a terminating newline in the BED file. Information on how to circumvent this limitation is available here:
# http://stackoverflow.com/questions/4165135/how-to-use-while-read-bash-to-read-the-last-line-in-a-file-if-there-s-no-new
# WARNING: This script requires the usage of the latest experimental version of BCFTOOLS, available at http://pd3.github.io/bcftools/
# WARNING: VCFAnalysis.R is assumed to be in an environment variable. If not, modify the address.
# UPDATE: Argument parsing has been improved. Window size = 1000 by default. Though 200 was originally proposed based on the paper by
# Xiao et al., it was adjusted to 1000 for the sake of statistical power. The window size was assessed with 'WinEvaluation.R'.
# UPDATE2: Multiple flags added to enable greater flexibility. Script adapted to MKT.
# NOTE: Accessibility masks downloaded from: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/accessible_genome_masks/
display_usage() {
echo -e "\nThis script merges regions of two compressed VCF files defined in a BED file using BCFtools. To work, it must be provided with one BED file containing positions in the 2nd and 3rd columns, and two VCF files."
echo -e "\nUsage:\n $(basename "$0") [BED file] [1000GP VCF.GZ file] [Alignment VCF.GZ file] [-w --window] [-c --cnvs] [-db --database]"
echo -e "\nOptions:\n -c, --cnvs \t Remove copy number variants (CNV) from the 1000GP file,\n\t\t which may extend beyond the limits of the interval [False]"
echo -e " -w, --window \t Specifies the size of the window to be analyzed [1000]"
echo -e " -db, --database \t Specifies the name of the data table where the results will be stored [Genomics]"
echo -e " -mkt, --mktest \t Determines whether the MKT will be conducted [TRUE for windows >= 10,000]"
echo -e " -pop, --population \t Specifies the name of the population under analysis [ALL]"
}
#####################################
## ARGUMENT EVALUATION AND PARSING ##
#####################################
## ARGUMENT EVALUATION ##
# Check whether user had supplied -h or --help . If yes display usage
if [[ ( $1 == "--help") || $1 == "-h" ]]
then
display_usage
exit 0
fi
# If less than two arguments supplied, display usage
if [ $# -le 2 ]
then
echo -e "\nERROR: Missing arguments"
display_usage
exit 1
fi
## ARGUMENT PARSING ##
# POSITIONAL ARGUMENTS:
bedfile=$1; shift # BED file containing the regions
gpfile=$1; shift # 1000 GP file
alnfile=$1; shift # Alignment human-mouse file
# Check whether the supplied files exist:
if [ ! -e "$bedfile" ] || [ ! -e "$gpfile" ] || [ ! -e "$alnfile" ]
then
if [ ! -e "$bedfile" ]
then
echo -e "ERROR: $bedfile not found.\n"
fi
if [ ! -e "$gpfile" ]
then
echo -e "ERROR: $gpfile not found.\n"
fi
if [ ! -e "$alnfile" ]
then
echo -e "ERROR: $alnfile not found.\n"
fi
exit 1
fi
# OPTIONAL ARGUMENTS:
WINDOW=1000
DB="GenomicsMKT"
MKT="FALSE"
POP="ALL"
echo $POP
while [[ $# > 0 ]]
do
case "$1" in
-w|--window)
WINDOW="$2" # $1 has the name, $2 the value
echo -e "Window size set to $WINDOW"
shift 2 # next two arguments (window + size)
if [ "$WINDOW" -ge "10000" ] # For windows >= 10000 bp, MKT will be automatically calculated
then
MKT="TRUE"
fi
;;
-c|--cnvs)
CNVS="CNVS" # $1 has the name, $2 the value
echo -e "CNVS will be removed"
shift # next argument
;;
-db|--database)
DB="$2" # $1 has the name, $2 the value
echo -e "Database name set to $DB"
shift 2
;;
-mkt|--mktest)
MKT="$2" # $1 has the name, $2 the value
shift 2
;;
-pop|--population)
POP="$2"
shift 2
;;
*) # No more options
;;
esac
done
if [ "$MKT" == "TRUE" ]; then
echo -e "MKT will be calculated"
else
echo -e "MKT will NOT be calculated"
fi
###############################
## VCF MERGER AND R ANALYSIS ##
###############################
k=1
while read chrom pos1 pos2 level
do
# EXTRACT BED INFORMATION
chrom="${chrom##*chr}" # To extract the chromosome number/X/Y
#echo $chrom $pos1 $pos2
if [ "$(($pos2 - $pos1))" -le "$WINDOW" ]; then # If size region < size window, there is no need to examine it
echo -e "$pos2 - $pos1 is inferior to $WINDOW. This segment will be skipped."
continue
fi
# MERGE THE VCF FILES, REPLACING MISSING GENOTYPES
# The genome accessibility mask is in exclusive 1-based format, i.e. the last position is not included [half open].
# Since bcftools assumes inclusive 1-based format [closed], we subtract 1 from the end coordinate.
pos2=$((pos2-1)) # The last base is included by BCFtools
if [ "$CNVS" == "CNVS" ] ; then
bcftools merge -Ov --missing-to-ref -r $chrom:$pos1-$pos2 $gpfile $alnfile | awk '$5 !~ "<CN"' > merge.$k.vcf
bgzip merge.$k.vcf
else
bcftools merge -Oz --missing-to-ref -o merge.$k.vcf.gz -r $chrom:$pos1-$pos2 $gpfile $alnfile
fi
if [[ $? -ne 0 ]]; then
echo 'Error: Bcftools merge failed!'
exit -1
else
echo -e "Files merged: merge.$k.vcf.gz generated"
fi
tabix -p vcf merge.$k.vcf.gz # Tabixing for analysis with PopGenome
# R ANALYSIS OF NUCLEOTIDE VARIATION:
# PopGenome does not use the first position [left open], so we subtract -1 from its initial position (internal).
echo merge.$k.vcf.gz $chrom $pos1 $pos2 $WINDOW $DB $MKT $POP
VCFAnalysis.R merge.$k.vcf.gz $chrom $pos1 $pos2 $WINDOW $DB $MKT $POP # Avoid the message visualization!!
# Filename = merge.$k.vcf; ini = pos1; end = pos2 // --slave >/dev/null [slave to cut startup messages]
if [[ $? -ne 0 ]]; then
echo 'Error: VCFAnalysis.R failed!'
exit -1
else
echo -e "Analysis with R complete. Data saved and exported to the MySQL database."
rm merge.$k.vcf.gz merge.$k.vcf.gz.tbi # Removes the current merge file in order to free disk space
fi
((k++))
done < "$bedfile"
echo -e "\nAll regions in the BED file merged and analyzed."