-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDAFCalculator.R
executable file
·157 lines (122 loc) · 6.54 KB
/
DAFCalculator.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/Rscript
# setwd("/home/roger/Documents/Genomics/MyScripts")
# install.packages("PopGenome",repos="http://cran.r-project.org")
# ini=45886885; end=45920909; chrom <- "22"; filename <- "chr22.vcf.gz"
#########################
## GET AND CHECK INPUT ##
#########################
## Collect arguments
args <- commandArgs(trailingOnly = TRUE)
## Default setting when no arguments passed
if(length(args) < 4) {
args <- c("--help")
}
## Help section
if("--help" %in% args || "-h" %in% args) {
cat("
DAFCalculator - A simple script to calculate DAF using 1000 GP data and PopGenome.
Usage: DAFCalculator.R <VCF-file> <chr> <start> <end> [-o|--output] [-h]
Positional arguments:
<vcf-file> STR A VCF file containing the variants of interest
<chr> INT Chromosome of the region under study (e.g. 22)
<start> INT Start of the region under study (e.g. 1)
<end> INT End of the region under study (e.g. 1500)
Optional arguments:
--interval INT Interval of frequencies for the DAF histogram
--output FILENAME Name of the output [DEFAULT: INPUT_daf.txt]
-h, --help Print this text
Example usage: Rscript DAFcal.R chr21.vcf.gz 21 100 20000
")
q(save="no") # Exit without saving
}
## Retrieve positional arguments
filename <- args[1]
if(!file.exists(filename)) {
cat("\nFile does not exist! Exiting\n")
q(save="no")
}
chrom <- args[2] # Chromosome number
ini <- as.numeric(args[3])-1; end <- as.numeric(args[4])-1 # Although the mask file is supposed to be in BED format,
# comparison with the FASTA file reveals that is it actually 1-based, half-open. That is, the end is not
# included. START matches the position in the FASTA, but PopGenome tends to add 1; therefore, we remove it
## Retrieve optional arguments
output.name <- "DAFTable"; interval <- 0.05
if(length(args) > 5) {
optional.args <- args[5:length(args)]
if (length(index <- grep("--output",optional.args)) != 0){
output.name <- optional.args[index+1]
}
if (length(index <- grep("--interval",optional.args)) != 0){
interval <- as.numeric(optional.args[index+1])
}
}
cat(sprintf("Settings: %s:%s-%s %s %s\n",chrom,ini,end,output.name,interval))
########################
## LOAD GENOME OBJECT ##
########################
cat("Loading Popgenome object...\n")
suppressMessages(library("PopGenome"))
# We attempt to read the VCF file. This is not possible when, for instance, all human positions are monomorphic and some
# chimpanzee ones are unknown, probably due to an alignment with multiple sites at once. Another possibility is the complete
# absence of variants in that region, both between humans and chimpanzees and among humans.
region <- tryCatch({readVCF(filename,numcols=10000,tid=chrom,from=ini,to=end,include.unknown=TRUE)},error = function(e) {
message(e); write(sprintf("%s:%d-%d -- %s",chrom,ini,end,e),sprintf("error_chr%s_%s.log",chrom,pop),append=TRUE)
return(NULL)})
# Verify that the region object is not null (failure to load), contains variants and has been loaded onto R. If it is, then we assume
# that there are no variants, and therefore S,D and all related metrics are 0 (except for alpha)
if (is.null(region)||is.logical(region)||[email protected]==0) { # If readVCF fails, region=FALSE. If no variants, region=NULL
cat("This region does not contain any variants: DAF set to NA\n\n")
export <- data.frame(chr=chrom,start=ini,end=end,DAF=NA)
write.table(export,file=paste(output.name,".tab",sep=""),quote=FALSE,sep="\t",row.names=F,append=TRUE,
col.names=!file.exists(paste(output.name,".tab",sep=""))) # Column names written if file does not exist
quit()
}
## DEFINE POPULATION/OUTGROUP ##
individuals <- get.individuals(region)[[1]]
humans <- individuals[which(individuals != c("Chimp","Chimp.2"))]
chimp <- individuals[which(individuals == c("Chimp","Chimp.2"))]
region <- set.populations(region,list(humans,chimp))
region <- set.outgroup(region,c("Chimp"), diploid=TRUE)
###################
## CALCULATE DAF ##
###################
## OBTAIN MAF
cat("\nCalculating minor allele frequencies...")
n <- length(humans) # N = samples. Diploid, not divided by 2
bialhuman <- get.biallelic.matrix(region,1) [1:n,,drop=F] # Whole biallelic matrix, remove outgroup
MAF <- colSums(bialhuman == 1)/nrow(bialhuman)
MAF.df <- data.frame(POS=names(MAF),MAF=unname(MAF)) # Position and MAF of biallelic variants
cat("done\n")
## IDENTIFY ANCESTRAL ALLELES (AA) FOR SNPS IN THE REGION
cat("Obtaining ancestral allele from 1000KG file...")
library(stringr)
temp <- system(sprintf('gunzip -c %s | cut -f2,4,8',filename),intern=TRUE) # Extract all variants from the merge.vcf.gz file
gpimport <- read.table(textConnection(temp),sep="\t",stringsAsFactors = FALSE,header=TRUE)
# gpimport <- read.table("gpimport.txt",sep="\t",stringsAsFactors = FALSE,header=TRUE)
gpances <- toupper(sapply(gpimport[,3],function(x){str_match(x,"AA=([:alpha:])\\|")[2]},USE.NAMES=FALSE)) # Capture the ancestral allele
aarefs <- na.omit(cbind(gpimport[,1:2],ANC=gpances)) # Na.omit removes positions without known AA and divergent sites
cat("done\n")
## CALCULATE DERIVED ALLELE FREQUENCY (DAF)
cat("Calculating DAF based on ancestral allele...")
mafan <- merge(MAF.df,aarefs) # Intersection between SNPs in the biallelic matrix and the AA in the VCF file
if (dim(mafan)[1] > 0) { # Avoid windows where variants have no known AA
mafan$DAF <- NA # DAF is equal to MAF if derived = minor, but 1 - MAF if otherwise
mafan[mafan$REF != mafan$ANC,]$DAF <- 1- mafan[mafan$REF != mafan$ANC,]$MAF
mafan[mafan$REF == mafan$ANC,]$DAF <- mafan[mafan$REF == mafan$ANC,]$MAF
}
mafan <- merge(MAF.df[,1,drop=F],mafan,all.x=TRUE) # Retain originals
# Frequencies do not exactly match those of the VCF file! Why? Rounding perhaps
# Some sites do not have AA. In some cases because they are divergent, but others are polymorphic:
# MAF.df[which(is.na(mafan$MAF))[!(which(is.na(mafan$MAF)) %in% which(MAF.df$MAF == 0)),]
if (is.numeric (mafan$DAF)) {
DAF <- hist(mafan$DAF,seq(0.0,1,interval),plot=F)$counts
DAF <- paste(DAF,collapse = ";")
} else {DAF <- NA}
cat("done\n")
######################
## DATA EXPORTATION ##
######################
export <- data.frame(chr=chrom,start=ini,end=end,DAF=DAF)
write.table(export,file=paste(output.name,".tab",sep=""),quote=FALSE,sep="\t",row.names=F,append=TRUE,
col.names=!file.exists(paste(output.name,".tab",sep=""))) # Column names written if file does not exist
cat(sprintf("Data saved to file %s\n\n",paste(output.name,".tab",sep="")))