# calico.R

#!/usr/bin/env Rscript

#CALICO: Iterative clustering assisted by coarse graining
#Author: Billy Lau, billylau@stanford.edu (Hanlee Ji Lab/Stanford Genome Technology Center)
#See the most recent source at https://github.com/billytcl/calico
#Distributed under MIT License

# Create the per-user library directory if it is missing and prepend it to the
# library search path.
user_lib <- Sys.getenv("R_LIBS_USER")
# R_LIBS_USER may be unset, in which case Sys.getenv() returns "" and there is
# no directory to create or register.
if (nzchar(user_lib) && !dir.exists(user_lib)) {
  dir.create(user_lib, recursive = TRUE)
  .libPaths(c(user_lib, .libPaths()))
}

# Install every required package that is not already present in one of the
# library paths.
list.of.packages <- c("ggplot2", "png", "optparse", "MASS", "raster")
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages) > 0) {
  # Fetch over HTTPS so the downloaded packages cannot be altered in transit.
  install.packages(new.packages, repos = "https://cran.rstudio.com")
}


library(ggplot2)
library(png)
library(optparse)
library(MASS)
library(raster)

###PARSE ARGUMENTS

# Command-line interface definition.
# Three small constructors cover the three kinds of option used below:
# boolean mode switches, free-text values and numeric values.
mode_flag <- function(flags, help) {
  make_option(flags, action = "store_true", type = "logical", default = FALSE,
              help = help)
}

text_option <- function(flags, default, help, metavar) {
  make_option(flags, type = "character", default = default,
              help = help, metavar = metavar)
}

numeric_option <- function(flags, default, help, metavar) {
  make_option(flags, type = "double", default = default,
              help = help, metavar = metavar)
}

option_list <- list(
  # Mode switches (exactly one is expected).
  mode_flag(c("-m", "--model"),
            "Specify to generate clustering model using reference data. Cannot be used alongside --cluster or --plot."),
  mode_flag(c("-c", "--cluster"),
            "Specify to cluster sample data. Cannot be used alongside --model or --plot"),
  mode_flag(c("-p", "--plot"),
            "Specify to plot clustered sample data. Cannot be used alongside --model or --cluster"),

  # File arguments.
  text_option(c("-i", "--input"), NULL,
              "Input droplet intensity reference/sample/clustered file", "INPUT"),
  text_option(c("-o", "--output"), "out.txt",
              "Output clustering model or clustered intensity file [default= %default]", "OUTPUT"),
  text_option(c("-r", "--reference"), NULL,
              "Reference model generated by --model.", "REFERENCE"),

  # Numeric tuning parameters.
  numeric_option(c("-q", "--quality"), 0,
                 "Quality score cutoff for de-noising", "QUALITY"),
  numeric_option(c("-t", "--tiles"), 9,
                 "Number of tiles (actually tiles^2) to average for sliding window. Must be an odd number.", "TILES"),
  numeric_option(c("--hard_ch1"), 15000,
                 "Hard limit for Ch1 signal. Set to a very high number to disable.", "HARDCH1"),
  numeric_option(c("--hard_ch2"), 15000,
                 "Hard limit for Ch2 signal. Set to a very high number to disable.", "HARDCH2"),
  numeric_option(c("--subsample"), 1,
                 "Subsample droplets. Set to 1 to disable. Useful for densely packed data.", "SUBSAMPLE"),
  numeric_option(c("-x", "--resolution"), 480,
                 "Rastering resolution (linear; number of pixels is resolution^2). Useful for densely packed data.", "RESOLUTION"),
  numeric_option(c("-n", "--nclust"), 3,
                 "Number of clusters to generate", "NCLUST")
)

# Parse the process command line into a named list of option values.
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Validate the parsed options before any data is read.

# Print the usage text, then abort with `msg` (no call context in the error).
abort_with_help <- function(msg) {
  print_help(opt_parser)
  stop(msg, call. = FALSE)
}

if (is.null(opt$input)) {
  abort_with_help("Argument must be supplied (input file).\n")
}

if (is.null(opt$output)) {
  abort_with_help("Argument must be supplied (output file).\n")
}

# --cluster and --plot both load the model file written by --model.
if ((opt$cluster || opt$plot) && is.null(opt$reference)) {
  abort_with_help("Argument must be supplied (reference file).\n")
}

# Exactly one of the three mode switches must be set.
if (sum(c(opt$model, opt$cluster, opt$plot)) != 1) {
  abort_with_help("Only either --model or --cluster or --plot must be specified.\n")
}

# --tiles is the side length of the sliding window used in --model mode and is
# documented as having to be odd; reject bad values here instead of failing
# later in the middle of the analysis.
if (opt$model &&
    (is.na(opt$tiles) || opt$tiles < 1 || opt$tiles %% 2 != 1)) {
  abort_with_help("--tiles must be a positive odd whole number.\n")
}
 

## MAIN
# Make the black-and-white ggplot2 theme with base font size 35 the default.
theme_set(theme_bw(base_size = 35))

# --model: build a clustering model from reference droplet intensities.
# Steps: trim outliers, rasterise the droplets onto a grid, run k-means on the
# occupied grid cells to get starting centres, run k-means on the droplets
# themselves, then store a centre and robust covariance per cluster.
if (opt$model) {
  # The first line of the input is skipped; the first two comma-separated
  # columns are taken as Ch1 and Ch2.
  ddpcr_data <- read.table(opt$input, skip = 1, sep = ",")
  ddpcr_data <- as.data.frame(ddpcr_data[, c("V1", "V2")])
  colnames(ddpcr_data) <- c("Ch1", "Ch2")

  # Outlier trimming: draw four seeded 90% subsamples and record the Ch1/Ch2
  # range of each. The usable window is the intersection of those ranges
  # (largest minimum, smallest maximum).
  n_droplets <- nrow(ddpcr_data)
  rep_ranges <- vapply(1:4, function(rep_seed) {
    set.seed(rep_seed)
    keep <- sample(n_droplets, floor(0.9 * n_droplets))
    c(range(ddpcr_data$Ch1[keep], na.rm = TRUE),
      range(ddpcr_data$Ch2[keep], na.rm = TRUE))
  }, numeric(4))
  min_ch1 <- max(rep_ranges[1, ])
  max_ch1 <- min(rep_ranges[2, ])
  min_ch2 <- max(rep_ranges[3, ])
  max_ch2 <- min(rep_ranges[4, ])

  # Never let the upper bounds exceed the user-supplied hard limits.
  max_ch1 <- min(max_ch1, opt$hard_ch1)
  max_ch2 <- min(max_ch2, opt$hard_ch2)

  ddpcr_data <- subset(
    ddpcr_data,
    Ch1 > min_ch1 & Ch1 < max_ch1 & Ch2 > min_ch2 & Ch2 < max_ch2
  )

  # Optional random thinning of the droplets.
  if (opt$subsample < 1) {
    n_keep <- floor(opt$subsample * nrow(ddpcr_data))
    ddpcr_data <- ddpcr_data[sample(nrow(ddpcr_data), n_keep), ]
  }

  # convert to grid, use sliding window to average in nearby cells on grid
  # http://gis.stackexchange.com/questions/24588/converting-point-data-into-gridded-dataframe-for-histogram-analysis-using-r
  ddpcr_copy <- ddpcr_data
  coordinates(ddpcr_copy) <- ~Ch2+Ch1  # x axis = Ch2, y axis = Ch1
  ddpcr_raster <- raster(ncols = opt$resolution, nrows = opt$resolution)
  extent(ddpcr_raster) <- extent(ddpcr_copy)
  # Occupancy raster: 1 where a cell received at least one droplet, else 0.
  ddpcr_raster <- rasterize(
    ddpcr_copy, ddpcr_raster, 1, background = 0,
    fun = function(X, ...) {
      if (length(X) > 0) 1 else 0
    }
  )
  ddpcr_raster <- flip(ddpcr_raster, direction = "y")

  # Sliding-window sum over a tiles x tiles neighbourhood. Only the sign of
  # the result is used below, so the exact weight value does not matter.
  focal_window <- matrix(1 / opt$tiles, ncol = opt$tiles, nrow = opt$tiles)
  ddpcr_raster_nonzero_focal <- focal(ddpcr_raster, w = focal_window)
  # (row, col) indices of every grid cell with a positive smoothed value.
  ddpcr_raster_nonzero <- as.data.frame(
    which(as.matrix(ddpcr_raster_nonzero_focal) > 0, arr.ind = TRUE)
  )
  ddpcr_raster_kmeans <- kmeans(ddpcr_raster_nonzero, opt$nclust, nstart = 25)

  # get extents of raster
  # xmin xmax ymin ymax
  raster_extents <- extent(ddpcr_raster)
  raster_extents <- c(raster_extents[1], raster_extents[2],
                      raster_extents[3], raster_extents[4])

  # Convert the grid-space centres back to intensity units. The row index
  # runs along the y axis (Ch1) and the column index along the x axis (Ch2);
  # the result is ordered (Ch1, Ch2) to match the columns of ddpcr_data.
  raster_centers <- ddpcr_raster_kmeans$centers
  raster_centers_locs <- as.matrix(t(apply(raster_centers, 1, function(X) {
    ch2 <- raster_extents[1] +
      (raster_extents[2] - raster_extents[1]) * X[2] / opt$resolution
    ch1 <- raster_extents[3] +
      (raster_extents[4] - raster_extents[3]) * X[1] / opt$resolution
    c(ch1, ch2)
  })))

  # perform second k-means round to get real variances and centers
  set.seed(12345)
  ddpcr_kc <- kmeans(ddpcr_data, centers = raster_centers_locs,
                     trace = TRUE, nstart = 25)
  ddpcr_final <- ddpcr_data[, c("Ch1", "Ch2")]

  # Renumber the clusters so that they are sorted by Ch1, then Ch2.
  # ddpcr_order[j] is the k-means label that becomes cluster j, so a droplet's
  # new label is the POSITION of its old label in ddpcr_order (the inverse
  # permutation, via match()). Indexing ddpcr_order by the old label would
  # only be right when the permutation is its own inverse.
  # drop = FALSE keeps a matrix when there is a single cluster.
  ddpcr_cluster_centers <- ddpcr_kc$centers
  ddpcr_order <- order(ddpcr_cluster_centers[, 1], ddpcr_cluster_centers[, 2])
  ddpcr_cluster_centers <- ddpcr_cluster_centers[ddpcr_order, , drop = FALSE]
  ddpcr_final$cluster <- match(ddpcr_kc$cluster, ddpcr_order)

  print(ddpcr_cluster_centers)
  print(table(ddpcr_final$cluster))

  # Per-cluster model: k-means centre plus a robust (minimum volume
  # ellipsoid) covariance estimate of the member droplets.
  ddpcr_cluster_cov <- vector("list", opt$nclust)
  for (i in seq_len(opt$nclust)) {
    tmp <- subset(ddpcr_final, cluster == i, select = c("Ch1", "Ch2"))
    ddpcr_cluster_cov[[i]] <- list(
      center = ddpcr_cluster_centers[i, ],
      cov = cov.rob(tmp, method = "mve")$cov
    )
  }

  # Persist the model and the labelled reference droplets for the --cluster
  # and --plot modes.
  save(ddpcr_cluster_cov, ddpcr_final, file = opt$output)
}


# --cluster: assign sample droplets to the clusters of a reference model.
# Each droplet gets one rescaled Mahalanobis score per cluster, a quality score
# q, and a cluster label (NA when q does not exceed --quality).
if (opt$cluster) {

  # Cluster sample data based on empirical model.
  # load() returns the names of the objects it restored; check those rather
  # than whatever happens to be visible in the session.
  loaded_objects <- load(opt$reference)
  if (!all(c("ddpcr_final", "ddpcr_cluster_cov") %in% loaded_objects)) {
    print_help(opt_parser)
    stop("Input reference file seems to be invalid.\n", call.=FALSE)
  }

  # The first line of the input is skipped; the first two comma-separated
  # columns are taken as Ch1 and Ch2.
  ddpcr_data_new <- read.table(opt$input, skip = 1, sep = ",")
  ddpcr_data_new <- ddpcr_data_new[, c("V1", "V2")]
  colnames(ddpcr_data_new) <- c("Ch1", "Ch2")

  # One score column per cluster: the squared Mahalanobis distance of each
  # sample droplet, shifted and scaled by the minimum and range of the
  # distances that the reference droplets have to that same cluster.
  scores <- as.data.frame(lapply(ddpcr_cluster_cov, function(x) {
    tmp_model <- mahalanobis(x = ddpcr_final[, c("Ch1", "Ch2")],
                             cov = x$cov, center = x$center)
    tmp_m <- mahalanobis(x = ddpcr_data_new[, c("Ch1", "Ch2")],
                         cov = x$cov, center = x$center)
    scale(tmp_m, center = min(tmp_model), scale = diff(range(tmp_model)))
  }))

  # occasionally the scores can become a tiny negative number; force these to zero
  scores[scores < 0] <- 0

  colnames(scores) <- paste0("Cluster", seq_along(ddpcr_cluster_cov), "Score")

  # Quality score: large when the best (smallest) score is much smaller than
  # the others. 0/0 (best score exactly zero) yields NaN and is mapped to a
  # very large value.
  scores$q <- apply(scores, 1, function(x) {
    tmp_score <- (sum(min(x) / x) - 1)^-1
    if (is.nan(tmp_score)) {
      tmp_score <- 10000000
    }
    tmp_score
  })

  # Label = index of the smallest cluster score, or 0 when the quality score
  # does not exceed the cutoff. A missing quality score (e.g. a droplet with
  # missing intensities) is treated as "not assignable" instead of aborting
  # the whole run.
  scores$cluster <- apply(scores, 1, function(x) {
    q_value <- x[["q"]]
    best <- which.min(x[-length(x)])  # q is the last element; drop it
    if (length(best) == 1 && !is.na(q_value) && q_value > opt$quality) {
      best
    } else {
      0
    }
  })

  ddpcr_scoring_final <- cbind(ddpcr_data_new[, c("Ch1", "Ch2")], scores)
  # Label 0 is not among the levels, so unassigned droplets become NA.
  ddpcr_scoring_final$cluster <- factor(ddpcr_scoring_final$cluster,
                                        levels = seq_along(ddpcr_cluster_cov))

  # One summary line on stdout: input file, number of assigned droplets, and
  # the count per cluster (tab-separated).
  ddpcr_table <- as.matrix(table(ddpcr_scoring_final$cluster))
  cat(sprintf("%s\t%s\t%s\n", opt$input, sum(ddpcr_table),
              paste(c(ddpcr_table), collapse = "\t")))

  # write to file
  write.csv(ddpcr_scoring_final, opt$output)
}


# --plot: scatter plot of clustered sample droplets, with unassigned droplets
# marked by a red "*" and a 99% ellipse of the reference droplets per cluster.
if (opt$plot) {

  # read cluster table
  ddpcr_scoring_final <- read.csv(opt$input)

  if (!all(c("q", "cluster") %in% colnames(ddpcr_scoring_final))) {
    print_help(opt_parser)
    stop("This doesn't look like a clustered file. Please run -c first.\n", call.=FALSE)
  }

  # load() returns the names of the restored objects; make sure the labelled
  # reference droplets are really in the file before using them.
  loaded_objects <- load(opt$reference)
  if (!("ddpcr_final" %in% loaded_objects)) {
    print_help(opt_parser)
    stop("Input reference file seems to be invalid.\n", call.=FALSE)
  }

  # Without a single assigned droplet there is no cluster range to build the
  # colour levels from; fail with a clear message.
  if (all(is.na(ddpcr_scoring_final$cluster))) {
    stop("No droplet in the input has a cluster assignment; nothing to plot.\n",
         call. = FALSE)
  }

  n_model_clusters <- max(ddpcr_final$cluster, na.rm = TRUE)
  n_sample_clusters <- max(ddpcr_scoring_final$cluster, na.rm = TRUE)
  ddpcr_final$cluster <- factor(ddpcr_final$cluster,
                                levels = seq_len(n_model_clusters))
  ddpcr_scoring_final$cluster <- factor(ddpcr_scoring_final$cluster,
                                        levels = seq_len(n_sample_clusters))

  # Droplets without a cluster label are overplotted with a red asterisk.
  unclassified <- as.data.frame(
    ddpcr_scoring_final[which(is.na(ddpcr_scoring_final$cluster)), ]
  )

  p <- ggplot(ddpcr_scoring_final, aes(x = Ch1, y = Ch2, color = cluster)) +
    geom_point(size = 2) +
    xlab("Channel 1 Fluorescence (AFU)") +
    ylab("Channel 2 Fluorescence (AFU)") +
    guides(color = guide_legend(title = "Cluster")) +
    geom_text(data = unclassified, inherit.aes = TRUE, color = "red",
              label = "*", size = 20) +
    theme(legend.background = element_blank())

  # One dashed 99% ellipse per cluster present in the sample, computed from
  # the reference droplets of that cluster.
  for (i in seq_len(n_sample_clusters)) {
    reference_cluster <- subset(ddpcr_final, cluster == i,
                                select = c("Ch1", "Ch2", "cluster"))
    p <- p + stat_ellipse(data = reference_cluster, level = 0.99, size = 2,
                          linetype = 2, color = "black")
  }

  # Pass the plot explicitly rather than relying on the implicit last plot.
  ggsave(opt$output, plot = p, dpi = 600, width = 10, height = 10)

}