gkmSVM 0.55 initial commit

mghandi · Jul 1, 2015 · 841b197 · 841b197
commit 841b197
Show file tree

Hide file tree

Showing 73 changed files with 14,167 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: gkmSVM
+Type: Package
+Title: Gapped-Kmer Support Vector Machine
+Version: 0.55
+Date: 2015-06-29
+Author: Mahmoud Ghandi
+Maintainer: Mahmoud Ghandi <mghandi@gmail.com>
+Description: Imports the 'gkmSVM' v2.0 functionalities into R (www.beerlab.org/gkmsvm). It also uses the 'kernlab' library (separate R package by different authors) for various SVM algorithms.   
+License: GPL (>= 2)
+Imports: Rcpp, kernlab, seqinr, utils
+LinkingTo: Rcpp
+SystemRequirements: C++11
diff --git a/LICENSE b/LICENSE
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,3 @@
+useDynLib(gkmSVM)
+exportPattern("^[[:alpha:]]+")
+importFrom(Rcpp, evalCpp)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,11 @@
+# This file was generated by Rcpp::compileAttributes
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#gkmsvm_classify <- function(params) {
+#    invisible(.Call('gkmSVM_gkmsvm_classify', PACKAGE = 'gkmSVM', params))
+#}
+
+#gkmsvm_kernel <- function(params) {
+#    invisible(.Call('gkmSVM_gkmsvm_kernel', PACKAGE = 'gkmSVM', params))
+#}
+
diff --git a/R/gkmsvm_classify.R b/R/gkmsvm_classify.R
@@ -0,0 +1,52 @@
+
+# Score a set of sequences with a trained gkm-SVM model.
+#
+# Gathers all arguments into a named list and passes it to the native routine
+# 'gkmSVM_gkmsvm_classify'.
+#
+# Arguments:
+#   seqfile            input sequences file name (FASTA format)
+#   svmfnprfx          SVM model file name prefix; only used to derive
+#                      svseqfile / alphafile when those are left as NA
+#   outfile            output file name
+#   L, K, maxnmm, maxseqlen, maxnumseq, useTgkm, alg, addRC, usePseudocnt,
+#   batchSize, wildcardLambda, wildcardMismatchM, alphabetFN
+#                      forwarded unchanged to the native routine
+#   svseqfile          support vector sequence file name;
+#                      NA means '<svmfnprfx>_svseq.fa'
+#   alphafile          support vector weights file name;
+#                      NA means '<svmfnprfx>_svalpha.out'
+#
+# Returns (invisibly) the value returned by the native routine.
+gkmsvm_classify <- function(seqfile,
+                            svmfnprfx,
+                            outfile,
+                            L=10,
+                            K=6,
+                            maxnmm=3,
+                            maxseqlen=10000,
+                            maxnumseq=1000000,
+                            useTgkm=1,
+                            alg=0,
+                            addRC=TRUE,
+                            usePseudocnt=FALSE,
+                            batchSize=100000,
+                            wildcardLambda=1.0,
+                            wildcardMismatchM=2,
+                            alphabetFN="NULL",
+                            svseqfile=NA,
+                            alphafile=NA) {
+
+  # Default each model file independently. Previously both were derived only
+  # when svseqfile was NA, so giving just svseqfile left alphafile as NA and
+  # giving just alphafile caused it to be silently overwritten.
+  if (is.na(svseqfile)) {
+    svseqfile <- paste(svmfnprfx, 'svseq.fa', sep='_')
+  }
+  if (is.na(alphafile)) {
+    alphafile <- paste(svmfnprfx, 'svalpha.out', sep='_')
+  }
+
+  params <- list(seqfile=seqfile,
+                 svseqfile=svseqfile,
+                 alphafile=alphafile,
+                 outfile=outfile,
+                 L=L,
+                 K=K,
+                 maxnmm=maxnmm,
+                 maxseqlen=maxseqlen,
+                 maxnumseq=maxnumseq,
+                 useTgkm=useTgkm,
+                 alg=alg,
+                 addRC=addRC,
+                 usePseudocnt=usePseudocnt,
+                 batchSize=batchSize,
+                 wildcardLambda=wildcardLambda,
+                 wildcardMismatchM=wildcardMismatchM,
+                 alphabetFN=alphabetFN)
+
+  invisible(.Call('gkmSVM_gkmsvm_classify', PACKAGE = 'gkmSVM', params))
+}
+
+
+
+
+
diff --git a/R/gkmsvm_kernel.R b/R/gkmsvm_kernel.R
@@ -0,0 +1,38 @@
+
+# Compute the gkm-SVM kernel matrix for a positive and a negative sequence set.
+#
+# Packs the arguments into a named list (together with OutputBinary=FALSE,
+# which is not exposed to callers) and passes it to the native routine
+# 'gkmSVM_gkmsvm_kernel'.
+#
+# Arguments:
+#   posfile, negfile   positive / negative sequence file names
+#   outfile            output file name, forwarded to the native routine
+#   remaining args     kernel settings, forwarded unchanged
+#
+# Returns (invisibly) the value returned by the native routine.
+gkmsvm_kernel <- function(posfile, negfile, outfile,
+                          L=10, K=6, maxnmm=3,
+                          maxseqlen=10000, maxnumseq=1000000,
+                          useTgkm=1, alg=0,
+                          addRC=TRUE, usePseudocnt=FALSE,
+                          wildcardLambda=1.0, wildcardMismatchM=2,
+                          alphabetFN="NULL") {
+
+  kernel_args <- list(
+    L=L, K=K, maxnmm=maxnmm,
+    maxseqlen=maxseqlen, maxnumseq=maxnumseq,
+    useTgkm=useTgkm, alg=alg,
+    addRC=addRC, usePseudocnt=usePseudocnt,
+    OutputBinary=FALSE,
+    posfile=posfile, negfile=negfile, outfile=outfile,
+    wildcardLambda=wildcardLambda, wildcardMismatchM=wildcardMismatchM,
+    alphabetFN=alphabetFN
+  )
+
+  native_result <- .Call('gkmSVM_gkmsvm_kernel', PACKAGE = 'gkmSVM', kernel_args)
+  invisible(native_result)
+}
diff --git a/R/gkmsvm_train.R b/R/gkmsvm_train.R
@@ -0,0 +1,50 @@
+
+# Train an SVM on a precomputed kernel matrix and write the model files.
+#
+# Arguments:
+#   kernelfn   kernel matrix file, read with utils::read.table(fill=TRUE);
+#              the upper triangle is filled in from the lower triangle
+#   posfn      positive sequences file (FASTA), labelled 1
+#   negfn      negative sequences file (FASTA), labelled 0
+#   svmfnprfx  prefix for the output files '<prefix>_svalpha.out' and
+#              '<prefix>_svseq.fa'
+#   Type, C    passed to kernlab::ksvm as 'type' and 'C'
+#   ...        further arguments forwarded to kernlab::ksvm
+#
+# Side effects: when the fitted model has at least one support vector, writes
+# the signed alphas (tab separated, one support vector per line) and the
+# support vector sequences. Otherwise nothing is written and a warning is
+# issued.
+#
+# Returns NULL invisibly.
+gkmsvm_train <- function (kernelfn, posfn, negfn, svmfnprfx,  Type="C-svc", C=1, ...){
+  # TODO: add bootstrapping and cross-validation capabilities; also automatic
+  #       choice of C (check whether kernlab already does that).
+
+  # Fail loudly instead of silently returning NULL when a dependency is absent.
+  required <- c("seqinr", "utils", "kernlab")
+  available <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
+  if (!all(available)) {
+    stop("gkmsvm_train requires the following package(s): ",
+         paste(required[!available], collapse = ", "), call. = FALSE)
+  }
+
+  pos <- seqinr::read.fasta(posfn)
+  neg <- seqinr::read.fasta(negfn)
+  npos <- length(pos)
+  nneg <- length(neg)
+  nseq <- npos + nneg
+
+  mat <- data.matrix(utils::read.table(file=kernelfn, fill=TRUE,
+                                       col.names=paste("V", seq_len(nseq))))
+  # The kernel must have one row per training sequence; otherwise the
+  # symmetrisation below would fail with an unhelpful subscript error.
+  if (nrow(mat) != nseq) {
+    stop("kernel matrix in '", kernelfn, "' has ", nrow(mat),
+         " rows but ", nseq, " sequences were read from '", posfn,
+         "' and '", negfn, "'", call. = FALSE)
+  }
+  mat[upper.tri(mat)] <- t(mat)[upper.tri(mat)]
+  rownames(mat) <- colnames(mat)
+  kmat <- kernlab::as.kernelMatrix(mat)
+
+  y <- c(rep(1, npos), rep(0, nneg))
+  names(y) <- rownames(mat)
+
+  svp <- kernlab::ksvm(kmat, y, type=Type, C=C, ...)
+
+  seqnames <- c(names(pos), names(neg))
+
+  if (svp@nSV > 0) {
+    alpha <- unlist(svp@alpha)
+    ii <- unlist(svp@SVindex)
+    # Support vectors that come from the negative set get a negative weight.
+    jj <- which(ii > npos)
+    alpha[jj] <- -alpha[jj]
+
+    utils::write.table(cbind(seqnames[ii], sprintf("%11.6e", alpha)),
+                       file = paste(svmfnprfx, 'svalpha.out', sep='_'),
+                       col.names=FALSE, row.names=FALSE, quote=FALSE, sep='\t')
+
+    svseqs <- c(pos, neg)[ii]
+    seqinr::write.fasta(svseqs, names(svseqs),
+                        file.out = paste(svmfnprfx, 'svseq.fa', sep='_'))
+  } else {
+    warning("SVM training produced no support vectors; no model files written",
+            call. = FALSE)
+  }
+
+  invisible(NULL)
+}
diff --git a/gkmSVM.Rproj b/gkmSVM.Rproj
@@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/man/gkmSVM-package.Rd b/man/gkmSVM-package.Rd
@@ -0,0 +1,128 @@
+\name{gkmSVM-package}
+\alias{gkmSVM-package}
+\alias{gkmSVM}
+\docType{package}
+\title{
+Gapped-Kmer Support Vector Machine
+}
+\description{
+Imports the 'gkmSVM' v2.0 functionalities into R (www.beerlab.org/gkmsvm). It also uses the 'kernlab' library (separate R package by different authors) for various SVM algorithms.   
+}
+\details{
+\tabular{ll}{
+Package: \tab gkmSVM\cr
+Type: \tab Package\cr
+Version: \tab 0.55\cr
+Date: \tab 2015-06-29\cr
+License: \tab GPL (>= 2)\cr
+}
+
+  The gkm-SVM provides implementation of a new SVM kernel method using gapped
+  k-mers as features for DNA or Protein sequences.
+
+  There are three main functions in the gkmSVM package: 
+
+  gkmsvm_kernel: computes the kernel matrix
+
+  gkmsvm_train:  computes the SVM coefficients
+
+  gkmsvm_classify: scores new sequences using the SVM model  
+
+
+Tutorial
+
+========
+
+  We introduce the users to the basic workflow of our gkmSVM step-by-step.
+  Please refer to help messages for more detailed information of each function.
+
+  1) making a kernel matrix
+
+  First of all, we should calculate a full kernel matrix before training SVM
+  classifiers. In this tutorial, we are going to use test_positives.fa
+  as a positive set, and test_negatives.fa as a negative set. 
+
+  #Input file names:  
+
+  posfn= 'test_positives.fa'   #positive set (FASTA format)
+
+  negfn= 'test_negatives.fa'   #negative set (FASTA format)
+
+  testfn= 'test_testset.fa'    #test set (FASTA format)
+
+
+  #Output file names:  
+
+  kernelfn= 'test_kernel.txt' #kernel matrix
+
+  svmfnprfx= 'test_svmtrain'  #SVM files 
+
+  outfn =   'output.txt'      #output scores for sequences in the test set       
+
+  gkmsvm_kernel(posfn, negfn, kernelfn);                #computes kernel 
+
+  2) training SVM
+
+  We can now train an SVM classifier using the kernel matrix generated above. For that we use the gkmsvm_train function. It takes four arguments: kernel file, positive sequences file, negative sequences file, and prefix of output file names for the SVM model.
+
+  gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx);        #trains SVM
+
+  It will generate two files, test_svmtrain_svalpha.out and
+  test_svmtrain_svseq.fa, which will then be used for classification/scoring
+  of test sequences as described below.
+
+  3) classification using SVM
+
+  gkmsvm_classify can be used to score any set of sequences. Here, we will 
+  score the test sequences which are given in test_testset.fa. Note that the same
+  set of parameters used in the gkmsvm_kernel should always be specified for
+  optimal classification (here we used default parameters).
+
+  gkmsvm_classify(testfn, svmfnprfx, outfn);            #scores test sequences 
+
+
+  In a more advanced example, we set the word length L=18, and the number of non gapped positions K=7, and maximum number of mismatches maxnmm=4:
+
+  gkmsvm_kernel(posfn, negfn, kernelfn, L=18, K=7, maxnmm=4);     #computes kernel 
+
+  gkmsvm_train(kernelfn,posfn, negfn, svmfnprfx);                 #trains SVM
+
+  gkmsvm_classify(testfn, svmfnprfx, outfn, L=18, K=7, maxnmm=4); #scores test sequences 
+
+}
+\author{
+Mahmoud Ghandi
+
+Maintainer: Mahmoud Ghandi <mghandi@gmail.com>
+}
+\references{
+Ghandi M, Lee D, Mohammad-Noori M, Beer MA. 2014. Enhanced Regulatory Sequence Prediction Using Gapped k-mer Features. PLoS Comput Biol 10: e1003711. 
+
+Ghandi M et al., gkmSVM a package for gapped-kmer SVM in R, Bioinformatics (in prep.)
+}
+\keyword{package}
+\keyword{gkmSVM}
+\keyword{kernel}
+\keyword{SVM}
+\examples{
+  #Input file names:  
+  posfn= 'test_positives.fa'   #positive set (FASTA format)
+  negfn= 'test_negatives.fa'   #negative set (FASTA format)
+  testfn= 'test_testset.fa'    #test set (FASTA format)
+
+  #Output file names:  
+  kernelfn= 'test_kernel.txt' #kernel matrix
+  svmfnprfx= 'test_svmtrain'  #SVM files 
+  outfn =   'output.txt'      #output scores for sequences in the test set       
+
+#  gkmsvm_kernel(posfn, negfn, kernelfn);                #computes kernel 
+#  gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx);       #trains SVM
+#  gkmsvm_classify(testfn, svmfnprfx, outfn);            #scores test sequences 
+
+#  using L=18, K=7, maxnmm=4
+
+#  gkmsvm_kernel(posfn, negfn, kernelfn, L=18, K=7, maxnmm=4);     #computes kernel 
+#  gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx);                 #trains SVM
+#  gkmsvm_classify(testfn, svmfnprfx, outfn, L=18, K=7, maxnmm=4); #scores test sequences 
+
+}
diff --git a/man/gkmsvm_classify.Rd b/man/gkmsvm_classify.Rd
@@ -0,0 +1,57 @@
+\name{gkmsvm_classify}
+\alias{gkmsvm_classify}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{Classifying(/scoring) new sequences using the gkmSVM model}
+\description{Given support vectors SVs and corresponding coefficients alphas and a set of 
+  sequences, calculates the SVM scores for the sequences.}
+\usage{gkmsvm_classify(seqfile, svmfnprfx, outfile, L=10, K=6, maxnmm=3, 
+maxseqlen=10000, maxnumseq=1000000, useTgkm=1, alg=0, addRC=TRUE, usePseudocnt=FALSE, 
+batchSize=100000, wildcardLambda=1.0, wildcardMismatchM=2, alphabetFN="NULL", 
+svseqfile=NA, alphafile=NA)}
+\arguments{
+\item{seqfile}{input sequences file name (FASTA format)}
+\item{svmfnprfx}{SVM model file name prefix}
+\item{outfile}{output file name}
+\item{L}{word length, default=10}
+\item{K}{number of informative columns, default=6}
+\item{maxnmm}{maximum number of mismatches to consider, default=3}
+\item{maxseqlen}{maximum sequence length in the sequence files, default=10000}
+\item{maxnumseq}{maximum number of sequences in the sequence files, default=1000000}
+\item{useTgkm}{filter type: 0(use full filter), 1(use truncated filter: this guarantees non-negative counts for all L-mers), 2(use h[m], gkm count vector), 3(wildcard), 4(mismatch), default=1}
+\item{alg}{algorithm type: 0(auto), 1(XOR Hashtable), 2(tree), default=0}
+\item{addRC}{adds reverse complement sequences, default=TRUE}
+\item{usePseudocnt}{adds a constant to count estimates, default=FALSE}
+\item{batchSize}{number of sequences to compute scores for in batch, default=100000}
+\item{wildcardLambda}{lambda for wildcard kernel, default=1.0}
+\item{wildcardMismatchM}{max mismatch for Mismatch kernel or wildcard kernel, default=2}
+\item{alphabetFN}{alphabets file name, if not specified, it is assumed the inputs are DNA sequences}
+\item{svseqfile}{SVM support vectors sequence file name (not needed if svmfnprfx is provided)}
+\item{alphafile}{SVM support vectors weights file name (not needed if svmfnprfx is provided)}
+
+}
+\details{ classification using SVM:
+  gkmsvm_classify can be used to score any set of sequences. Note that the same
+  set of parameters (L, K, maxnmm) used in the gkmsvm_kernel should be specified for
+  optimal classification.
+
+  gkmsvm_classify(testfn, svmfnprfx, outfn);            #scores test sequences  }
+\author{Mahmoud Ghandi}
+\examples{
+  #Input file names:  
+  posfn= 'test_positives.fa'   #positive set (FASTA format)
+  negfn= 'test_negatives.fa'   #negative set (FASTA format)
+  testfn= 'test_testset.fa'    #test set (FASTA format)
+
+  #Output file names:  
+  kernelfn= 'test_kernel.txt' #kernel matrix
+  svmfnprfx= 'test_svmtrain'  #SVM files 
+  outfn =   'output.txt'      #output scores for sequences in the test set       
+
+#  gkmsvm_kernel(posfn, negfn, kernelfn);                #computes kernel 
+#  gkmsvm_train(kernelfn,posfn, negfn, svmfnprfx);       #trains SVM
+#  gkmsvm_classify(testfn, svmfnprfx, outfn);            #scores test sequences 
+}
+
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+%\keyword{gkmsvm_classify}