VCFAnalysis_notac.R
#!/usr/bin/Rscript
#setwd("~/Documents/2_GenomicsData/Final")
# filename <- "merge.3.vcf.gz"
#filename <- "merge.vcf.gz"; ini <- 31768081-1; end <- 31899603-1; chrom <- "22"; wsize <- 10000; MK <- TRUE; ref.chr="chr22.fa"; window = 7
# Region Analysis v0.8 - Imports the VCF file containing human and chimpanzee data and calculates
# metrics of polymorphism and divergence. The output is sent to a MySQL database.
# WARNING: The program is meant to be called from the terminal (VCFmerger.sh)
# UPDATE: 1) Tests of neutrality added; 2) MKT removed; 3) SFS is calculated from the derived allele.
# UPDATE2: Monomorphic positions in humans are not considered for SFS calculation. Mean is replaced with histogram.
# UPDATE3: Windows are converted into 0-based (BED format). Instead of a single column with the range, two columns (start,end)
# are created. Generalization to multiple chromosomes is available.
# UPDATE4: DAF instead of SFS. Dots to underscores for MySQL. DB name changeable.
# UPDATE5: MKT re-added, optional. It requires a GFF and a FASTA file.
# UPDATE6: 1) Improved DAF with 1000 AA 2) Improved MKT with 4fold and extended MKT.
# UPDATE7: GFFtoFASTA has replaced 'set.synnonsyn_alt' as the method of choice for MKT.
# UPDATE8: Change in the coordinate system according to the mask in pseudo-BED format.
# UPDATE9: Fixing how empty regions are dealt with
#############################
## IMPORT AND PREPARE DATA ##
#############################
if (suppressMessages(!require("PopGenome"))) { # Can be removed
print ("The 'PopGenome' package is missing and will be installed")
install.packages("PopGenome")
library("PopGenome")
}
args <- commandArgs(trailingOnly = TRUE) # Import arguments from command line
filename <- args[1] # Name of the file specified after the script
chrom <- args[2] # Chromosome number
ini <- as.numeric(args[3])-1; end <- as.numeric(args[4])-1 # Although the mask file is supposed to be in BED format,
# comparison with the FASTA file reveals that it is actually 1-based, half-open: the end position is not
# included. START matches the position in the FASTA, but PopGenome tends to add 1, so we subtract 1 here.
# For example, a mask entry 31768081-31899603 is analysed as from=31768080, to=31899602.
wsize <- as.numeric(args[5]) # Window size
db <- args[6] # Name of the data table [Genomics]
MK <- args[7] # Calculate MKT
pop <- args[8] # Population name
notac <- as.numeric(args[9]) # Number of non-accessible positions
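# Example invocation (hypothetical values, matching the argument order parsed above):
# Rscript VCFAnalysis_notac.R merge.vcf.gz 22 31768081 31899603 10000 WindowsData_chr22 TRUE CEU 250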
################## FOR TESTS START HERE ##################
# library("PopGenome")
# setwd("/home/scasillas/Results_Windows/chrX")
# # #[1] "merge.1.vcf.gz" "22" "16847853"
# # #[4] "16861622" "10000" "WindowsData_chr22"
# # #[7] "TRUE" "CEU"
# filename <- "merge.1.vcf.gz"
# chrom <- "X"
# ini <- 347642-1
# end <- 362360-1
# wsize <- 10000;
# db <- "WindowsData_chrX"
# MK <- TRUE
# pop <- "CEU"
#
# #if (ini<46164109) {
# # quit()
# #}
##########################################################
# (1) We need to keep the .tbi file in the same folder as the vcf.gz (which must be compressed)
# (2) The chromosome identifier in the GFF has to be identical to the identifier used in the VCF file
# Since VCFs from the 1000GP use numeric-only identifiers (e.g. '22'), a conversion is required:
# In bash: cat Chr22.gff | sed 's/^chr22/22/'. Also, tid must be identical to those two.
# (3) numcols sets how many SNPs are read into RAM at once; for a sample of 1,000 individuals,
# 10,000 is recommended on a computer with 4 GB of RAM.
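# Optional sanity check (a minimal sketch added for illustration; not part of the original pipeline):
# readVCF() needs the tabix index (.tbi) next to the compressed VCF, so warn early if it is missing.
if (!file.exists(paste(filename,".tbi",sep=""))) {
warning(sprintf("Tabix index %s.tbi not found; readVCF() will fail. Create it with 'tabix -p vcf %s'",filename,filename))
}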
start.time <- Sys.time()
# We attempt to read the VCF file. This is not possible when, for instance, all human positions are monomorphic and some
# chimpanzee ones are unknown, probably due to an alignment with multiple sites at once. Another possibility is the complete
# absence of variants in that region, both between humans and chimpanzees and among humans.
region <- tryCatch({readVCF(filename,numcols=10000,tid=chrom,from=ini,to=end,include.unknown=TRUE)},error = function(e) {
message(e); write(sprintf("%s:%d-%d -- %s",chrom,ini,end,conditionMessage(e)),sprintf("error_chr%s_%s.log",chrom,pop),append=TRUE)
return(NULL)})
# Verify that the region object is not NULL (loading failed), is not FALSE and actually contains variants. If any of
# these checks fails, we assume that there are no variants, and therefore S, D and all related metrics are 0 (except for alpha)
if (is.null(region)||is.logical(region)||region@n.biallelic.sites==0) { # readVCF returns FALSE on failure; the tryCatch handler returns NULL
print("This region does not contain any variants: S, Pi and D set to 0")
nwin=floor((end+1-ini)/wsize)
windows <- cbind(start=seq(ini,end-wsize+1,by=wsize),end=seq(ini+wsize,end+1,by=wsize))
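# For example, with the test coordinates above (ini=31768080, end=31899602) and wsize=10000,
# nwin = floor(131523/10000) = 13 windows of 10 kb are generated.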
newrows <- matrix(rep(c(0,0,0,0,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,NA,0,0,0,0,0,0,0,0,NA,NA,NA,NA,0,0,0,0,0,0,0,NA,NA,0,0,notac),nwin),nrow=nwin,byrow=TRUE)
colnames(newrows) <- c("S","Pi","theta","S2","Tajima_D","FuLi_F","FuLi_D","FayWu_H","Zeng_E",
"Wall_B","Wall_Q","Rozas_ZA","Rozas_ZZ","Kelly_ZnS","nucleotide_FST","haplotype_FST",
"Nei_GST","Hudson_GST","Hudson_HST","Hudson_KST","nuc_diversity_within","hap_diversity_within",
"DAF","Divsites","D","K","Unknown","Pneu","Psel","Dneu","Dsel","NI","alpha","DoS",
"Fisher1","Pneu.less5","Pneu.more5","Psel.less5","Psel.more5","Psel.neutral.less5",
"Psel.neutral","Psel.weak","alpha.cor","Fisher2","msel","mneu","notac")
export <- data.frame(population=pop,chr=rep(paste(c("chr",chrom),collapse=""),nwin),windows,newrows)
write.table(export,file=paste(db,"_",pop,".tab",sep=""),quote=FALSE,sep="\t",row.names=F,append=TRUE,
col.names=!file.exists(paste(db,"_",pop,".tab",sep=""))) # Column names are written only if the file does not exist yet
quit()
}
load(sprintf("gffseq_chr%s.RData",chrom)) # Annotation data from GFFtoFASTA
## DEFINE POPULATION/OUTGROUP ##
# The outgroup can be defined as a different population so that calculations of diversity and neutrality
# are performed separately.
individuals <- get.individuals(region)[[1]]
humans <- individuals[!(individuals %in% c("Chimp","Chimp.2"))]
chimp <- individuals[individuals %in% c("Chimp","Chimp.2")]
region <- set.populations(region,list(humans,chimp))
region <- set.outgroup(region,chimp)
windows <- data.frame(start=ini,end=end)
nwin = 1 ; window = 1
#####################################
## GLOBAL STATISTICS AND VARIABLES ##
#####################################
## NEUTRALITY TESTS ##
# This calculates neutrality stats (Watterson's Theta, Fu and Li, Tajima's D) and the
# number of segregating sites (S). However, S excludes missing information in the outgroup.
region <- neutrality.stats(region)
## SITE FREQUENCY SPECTRA (IMPROVED) ##
n <- length(humans) # N = samples. Diploid, not divided by 2
bialhuman <- get.biallelic.matrix(region,1) [1:n,,drop=F] # Whole biallelic matrix, remove outgroup
MAF <- colSums(bialhuman == 1)/nrow(bialhuman) # Frequency of the alternative (1) allele at each biallelic site
MAF.df <- data.frame(POS=names(MAF),MAF=unname(MAF)) # Position and MAF of biallelic variants
# IDENTIFY ANCESTRAL ALLELES (AA) FOR SNPS IN THE REGION
library(stringr)
temp <- system(sprintf('gunzip -c %s | grep -v "##" | cut -f2,4,8',filename),intern=TRUE) # Extract all variants from the merge.vcf.gz file
gpimport <- read.table(textConnection(temp),sep="\t",stringsAsFactors = FALSE,header=TRUE)
# gpimport <- read.table("gpimport.txt",sep="\t",stringsAsFactors = FALSE,header=TRUE)
gpances <- toupper(sapply(gpimport[,3],function(x){str_match(x,"AA=([[:alpha:]])\\|")[2]},USE.NAMES=FALSE)) # Capture the ancestral allele from the INFO field
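# Illustrative 1000GP-style INFO excerpt: "...;AA=g|||;..." -> the regex captures "g", which toupper() converts to "G"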
aarefs <- na.omit(cbind(gpimport[,1:2],ANC=gpances)) # Na.omit removes positions without known AA and divergent sites
# CALCULATE DERIVED ALLELE FREQUENCY (DAF)
mafan <- merge(MAF.df,aarefs) # Intersection between SNPs in the biallelic matrix and the AA in the VCF file
if (dim(mafan)[1] > 0) { # Avoid windows where variants have no known AA
mafan$DAF <- NA # DAF is equal to MAF if derived = minor, but 1 - MAF if otherwise
mafan[mafan$REF != mafan$ANC,]$DAF <- 1- mafan[mafan$REF != mafan$ANC,]$MAF
mafan[mafan$REF == mafan$ANC,]$DAF <- mafan[mafan$REF == mafan$ANC,]$MAF
}
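# Worked example (illustrative): MAF here is the frequency of the alternative (1) allele. For a site
# with REF=A, ANC=G and MAF=0.10, the reference allele is the derived one, so DAF = 1 - 0.10 = 0.90;
# if instead ANC=A (= REF), the alternative allele is derived and DAF = MAF = 0.10.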
mafan <- merge(MAF.df[,1,drop=F],mafan,all.x=TRUE) # Keep all original positions, even those without a known AA
# Frequencies do not exactly match those in the VCF file; rounding is a possible explanation.
# Some sites lack an AA: in some cases because they are divergent, but others are polymorphic:
# MAF.df[which(is.na(mafan$MAF))[!(which(is.na(mafan$MAF)) %in% which(MAF.df$MAF == 0))],]
################ These counts are not added to the output because they refer to the whole "region" object, i.e. the full consecutive region (>= one window)
# Counts for different types of sites: region@n.sites, n.biallelic.sites, n.gaps, n.unknowns, n.valid.sites, n.polyallelic.sites, trans.transv.ratio
n_sites <- region@n.sites
n_biallelic_sites <- region@n.biallelic.sites
n_gaps <- region@n.gaps
n_unknowns <- region@n.unknowns
n_valid_sites <- region@n.valid.sites
n_polyallelic_sites <- region@n.polyallelic.sites
trans_transv_ratio <- region@trans.transv.ratio
################
######################################
## CUSTOM FUNCTION WINDOW BY WINDOW ##
######################################
measures <- function(object) {
n <- length(humans) # N = samples. Diploid, not divided by 2
total <- length(individuals) # Total number of samples (including outgroup)
## POLYMORPHISM ##
# PopGenome has its own 'neutrality' and 'diversity' stats modules, but diversity takes a
# very long time (55 seconds including Pi for a region of 100 kb).
Pi <- function(k,m,n) {
comb <- choose(n,2) # Binomial coefficient = combination without repetition
pi <- k/(comb*m)
return(round(pi,7))
}
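# Pi() implements pi = k / (choose(n,2) * m): the average number of pairwise differences per analysed site.
# For instance, k=45 total pairwise differences among n=10 sequences over m=1000 sites gives 45/(45*1000) = 0.001.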
# TABLE CONTAINING THE DATA:
tabsum <- as.data.frame(matrix(numeric(nwin*27),ncol=27,nrow=nwin))
colnames(tabsum) <- c("S","Pi","DAF","Divsites","D","K","Unknown","Pneu","Psel","Dneu","Dsel","NI","alpha","DoS",
"Fisher1","Pneu.less5","Pneu.more5","Psel.less5","Psel.more5","Psel.neutral.less5",
"Psel.neutral","Psel.weak","alpha.cor","Fisher2","msel","mneu","notac")
# EXTRACT SEGREGATING SITES (EXCLUDING OUTGROUP)
bial <- get.biallelic.matrix(object,1) # Biallelic matrix of the region
if (is.null(bial)||dim(bial)[2]==0) { # When no variants are detected
newrow <- rep(0,27) # Empty row, one value per column
tabsum[window,] <- newrow
return(tabsum)
}
bialhuman <- bial[1:n,,drop=F] # Remove outgroup (drop = F to keep 1 dimension)
misshuman <- colSums(is.na(bialhuman))>0 # Sites missing in humans (e.g. structural variants)
poly.sites <- apply(bialhuman,2,sum)>0 & !misshuman # Sites polymorphic in humans w/o missing
bialhuman <- bialhuman[,poly.sites,drop=FALSE] # Keep only polymorphic sites
# DETERMINE M (EXCLUDING MISSING AND POLYALLELIC)
winstart <- ini # Since we have only 1 window, winstart is the same as ini
# Total number of sites: 1) Remove NA in humans 2) Remove polyallelic sites 3) Remove non-accessible regions
multisites <- object@n.polyallelic.sites # N polyallelic sites in that window
m <- wsize-sum(misshuman,na.rm=T)-multisites-notac
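# e.g. wsize=10000 with 12 sites missing in humans, 3 polyallelic sites and notac=250 non-accessible
# positions gives m = 10000 - 12 - 3 - 250 = 9735 analysable sites (illustrative numbers only).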
# DETERMINE S AND K (WHEN VARIANTS ARE AVAILABLE)
if (is.null(bialhuman)||dim(bialhuman)[2]==0) {
S <- 0
k <- 0
} else {
S <- ncol(bialhuman) # Number of variants
freqs <- apply(bialhuman,2,table)
k <- sum(freqs[1,]*freqs[2,]) # Note that k and K are different!
}
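# Here k is the total number of mismatching pairs summed over segregating sites: a site with 190
# copies of one allele and 10 of the other contributes 190*10 = 1900 pairwise differences.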
## SITE FREQUENCY SPECTRUM (SFS) DERIVED ALLELE FREQUENCY (DAF) ##
# 'winsites' (defined below) holds the coordinates of the biallelic variants in this window and could be used to
# subset 'mafan'; since this script analyses a single window spanning the whole region, the histogram uses all of 'mafan'.
if (!is.null(mafan$DAF)) {
DAF <- hist(as.vector(na.omit(mafan$DAF)),seq(0.0,1,0.05),plot=F)$counts
DAF <- paste(DAF,collapse = ";")
} else {DAF <- NA}
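# The DAF column is exported as 20 semicolon-separated counts, one per 5% bin:
# e.g. "3;1;0;...;2" = number of sites with DAF in [0,0.05], (0.05,0.1], ..., (0.95,1].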
## DIVERGENCE ##
# COUNT THE NUMBER OF UNKNOWNS IN THE OUTGROUP:
unknowns <- sum(is.nan(bial[total,])) # Displayed as NaN in the biallelic matrix
# COUNT THE NUMBER OF DIVERGENCE SITES:
divtotal <- bial[total,] == 1 & !poly.sites # 1/1 in the outgroup and excluding polymorphic
divsites <- sum (divtotal,na.rm=T) # Polymorphic cannot be counted for divergence
# CALCULATE D AND K:
mout <- m - unknowns
D <- round(divsites/mout,7) # Observed divergence: Proportion of sites with divergent nucleotides
K <- round(-3/4*log(1-4/3*D),7) # Real divergence: Jukes and Cantor model
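# e.g. an observed divergence D = 0.01 gives K = -3/4 * log(1 - 4/3*0.01) ≈ 0.01007 under the Jukes-Cantor model.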
## NATURAL SELECTION REGIMES ##
winsites <- object@region.data@biallelic.sites[[1]] # Biallelic sites in this window
bial.class <- gffseq[winsites] # GFF feature in each site
## MKT CALCULATION
# STANDARD: Same result as MKT function if done with all syn.
Pneu <- sum(poly.sites[bial.class==4],na.rm=TRUE) # Neutral=4-fold
Psel <- sum(poly.sites[bial.class==0],na.rm=TRUE) # Selected=0-fold
Dneu <- sum(divtotal[bial.class==4],na.rm=TRUE)
Dsel <- sum(divtotal[bial.class==0],na.rm=TRUE)
NI <- (Psel/Pneu)/(Dsel/Dneu)
alpha <- 1-NI
DoS <- Dsel/(Dsel+Dneu)-Psel/(Psel+Pneu) # Direction of selection
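# e.g. Pneu=10, Psel=5, Dneu=20, Dsel=30 gives NI = (5/10)/(30/20) = 0.33, alpha = 0.67 and
# DoS = 30/50 - 5/15 ≈ 0.27 (illustrative numbers only).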
contingency.std <- matrix(c(Psel,Pneu,Dsel,Dneu),c(2,2))
test1 <- if(!is.na(sum(contingency.std))){
if(sum(contingency.std)>0){fisher.test(contingency.std)$p.value}
} else {NA}
test1 <- if (is.null(test1)) {NA} else {test1}
# ADAPTED:
# 1) Use four-fold degenerated sites 2) Remove sites according to DAF
# We use MAF for the adjustment because it is available for all positions, whereas some
# do not have DAF due to the lack of AA. For DAF, please use winDAF.
neuMAF <- as.vector(na.omit(MAF[bial.class==4 & poly.sites==TRUE]))
selMAF <- as.vector(na.omit(MAF[bial.class==0 & poly.sites==TRUE]))
Pneu.less5 <- sum(neuMAF<0.05) # P0 MAF less than 5%
Pneu.more5 <- sum(neuMAF>0.05) # P0 MAF more than 5%
Psel.less5 <- sum(selMAF<0.05) # Pi MAF less than 5%
Psel.more5 <- sum(selMAF>0.05) # Pi MAF more than 5%
Psel.neutral.less5 <- Psel*(Pneu.less5/Pneu) # Proportion of neutral within the MAF < 5% class
Psel.neutral <- Psel.neutral.less5 + Psel.more5 # For alpha
Psel.weak <- Psel.less5 - Psel.neutral.less5
alpha.cor <- 1-(Psel.neutral/Pneu)*(Dneu/Dsel)
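# e.g. with Pneu=10 (4 below and 6 above 5% MAF), Psel=5 (3 below, 2 above), Dneu=20, Dsel=30:
# Psel.neutral.less5 = 5*(4/10) = 2, Psel.neutral = 2+2 = 4, Psel.weak = 3-2 = 1 and
# alpha.cor = 1 - (4/10)*(20/30) ≈ 0.73 (illustrative numbers only).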
contingency <- matrix(c(Psel.neutral,Pneu,Dsel,Dneu),c(2,2))
test2 <- if(!is.na(sum(contingency))){
if(sum(contingency)>0){fisher.test(contingency)$p.value}
} else {NA}
test2 <- if (is.null(test2)) {NA} else {test2}
# OTHER ESTIMATORS:
# To fully implement extended MKT, we need ms/mns, that is, the number of sites
# of each class. Doing that would require modifying the 'set.synnonsyn2' function
# to obtain all codons and check fold in every position (0,1,2)
m.neu <- sum(gffseq[winstart:(winstart+wsize)] == 4,na.rm=T) # neu=4-fold
m.sel <- sum(gffseq[winstart:(winstart+wsize)] == 0,na.rm=T) # sel=0-fold
## ADD NEW ROW ##
newrow <- c(S,Pi(k,m,n),DAF,divsites,D,K,unknowns,Pneu,Psel,Dneu,Dsel,NI,alpha,DoS,test1,
Pneu.less5,Pneu.more5,Psel.less5,Psel.more5,Psel.neutral.less5,Psel.neutral,Psel.weak,
alpha.cor,test2,m.sel,m.neu,notac)
tabsum[window,] <- newrow
return(tabsum)
}
## LINKAGE DISEQUILIBRIUM ##
##### I do not think I will calculate this one...
#region <- calc.R2(region)
#region@region.stats@linkage.disequilibrium # not sure, it returns just a single number...
##### This one does seem to work fine...
region <- linkage.stats(region,detail=FALSE)
#get.linkage(region)[[1]]
# Wall.B Wall.Q Rozas.ZA Rozas.ZZ Kelly.Z_nS
#16847853 - 16857852 : 0.1715976 0.1764706 0.3055362 0.03736886 0.2681674
# or access each metric as
#region@Wall.B[,1] # should this also be divided by wsize???
Wall_B <- round(region@Wall.B[,1],7)
Wall_Q <- round(region@Wall.Q[,1],7)
Rozas_ZA <- round(region@Rozas.ZA[,1],7)
Rozas_ZZ <- round(region@Rozas.ZZ[,1],7)
Kelly_ZnS <- round(region@Kelly.Z_nS[,1],7)
region <- F_ST.stats(region,detail=FALSE)
# region@region.stats@haplotype.diversity
# or access each value as
#region@hap.diversity.within[,1] # should this also be divided by wsize???
#region@nucleotide.F_ST[,1]
#region@haplotype.F_ST[,1]
# others: haplotype.F_ST, nucleotide.F_ST, Nei.G_ST, Hudson.G_ST, Hudson.H_ST, Hudson.K_ST, nuc.diversity.within, hap.diversity.within, Pi
nucleotide_FST <- round(region@nucleotide.F_ST[,1],7)
haplotype_FST <- round(region@haplotype.F_ST[,1],7)
Nei_GST <- round(region@Nei.G_ST[,1],7)
Hudson_GST <- round(region@Hudson.G_ST[,1],7)
Hudson_HST <- round(region@Hudson.H_ST[,1],7)
Hudson_KST <- round(region@Hudson.K_ST[,1],7)
nuc_diversity_within <- round(region@nuc.diversity.within[,1]/wsize,7)
hap_diversity_within <- round(region@hap.diversity.within[,1],7)
## INTEGRATION OF NEUTRALITY, DIVERSITY AND DIVERGENCE METRICS ##
# Results from the neutrality stats module are divided by the number of sites (region@n.sites)
regiondata <- measures(region)
S2 <- region@n.segregating.sites[,1] # Segregating sites excluding unknowns
Tajima_D <- round(region@Tajima.D[,1]/wsize,7)
FuLi_F <- round(region@Fu.Li.F[,1]/wsize,7)
FuLi_D <- round(region@Fu.Li.D[,1]/wsize,7)
FayWu_H <- round(region@Fay.Wu.H[,1]/wsize,7)
Zeng_E <- round(region@Zeng.E[,1]/wsize,7)
#Rozas_R2 <- round(region@Rozas.R_2[,1]/wsize,7) # this is a detailed statistic; if wanted, use the do.R2 option
theta <- round(region@theta_Watterson[,1]/wsize,7)
if (exists("S2")) {regiondata <- cbind(regiondata[,1:2],theta,S2,Tajima_D,FuLi_F,FuLi_D,FayWu_H,Zeng_E,Wall_B,Wall_Q,Rozas_ZA,Rozas_ZZ,Kelly_ZnS,
nucleotide_FST,haplotype_FST,Nei_GST,Hudson_GST,Hudson_HST,Hudson_KST,nuc_diversity_within,hap_diversity_within,regiondata[,3:ncol(regiondata)])
} else { regiondata <- cbind(regiondata[,1:2],theta=NA,S2=0,Tajima_D=NA,FuLi_F=NA,FuLi_D=NA,FayWu_H=NA,Zeng_E=NA,Wall_B=NA,Wall_Q=NA,Rozas_ZA=NA,Rozas_ZZ=NA,Kelly_ZnS=NA,
nucleotide_FST=NA,haplotype_FST=NA,Nei_GST=NA,Hudson_GST=NA,Hudson_HST=NA,Hudson_KST=NA,nuc_diversity_within=0,hap_diversity_within=0,
regiondata[,3:ncol(regiondata)]) }
######################
## DATA EXPORTATION ##
######################
# We add the chromosome info and the coordinates. For convenience in the next steps:
# a) Coordinates are converted to BED format: 0-based, subtract 1 from start column.
# b) Chromosome is expressed in "chrNN" format.
windows[,1] <- windows[,1]-1
export <- cbind(population=pop,chr=rep(paste(c("chr",chrom),collapse=""),NROW(windows)),windows,regiondata)
write.table(export,file=paste(db,"_",pop,"_",wsize,".tab",sep=""),quote=FALSE,sep="\t",row.names=F,append=TRUE,
col.names=!file.exists(paste(db,"_",pop,"_",wsize,".tab",sep=""))) # Column names written if file does not exist
print(Sys.time() - start.time)