Skip to content

Commit

Permalink
gkmSVM 0.55 initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mghandi committed Jul 1, 2015
0 parents commit 841b197
Show file tree
Hide file tree
Showing 73 changed files with 14,167 additions and 0 deletions.
12 changes: 12 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Package: gkmSVM
Type: Package
Title: Gapped-Kmer Support Vector Machine
Version: 0.55
Date: 2015-06-29
Author: Mahmoud Ghandi
Maintainer: Mahmoud Ghandi <mghandi@gmail.com>
Description: Imports the 'gkmSVM' v2.0 functionalities into R (www.beerlab.org/gkmsvm). It also uses the 'kernlab' library (separate R package by different authors) for various SVM algorithms.
License: GPL (>= 2)
Imports: Rcpp, kernlab, seqinr, utils
LinkingTo: Rcpp
SystemRequirements: C++11
675 changes: 675 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
useDynLib(gkmSVM)
exportPattern("^[[:alpha:]]+")
importFrom(Rcpp, evalCpp)
11 changes: 11 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#gkmsvm_classify <- function(params) {
# invisible(.Call('gkmSVM_gkmsvm_classify', PACKAGE = 'gkmSVM', params))
#}

#gkmsvm_kernel <- function(params) {
# invisible(.Call('gkmSVM_gkmsvm_kernel', PACKAGE = 'gkmSVM', params))
#}

52 changes: 52 additions & 0 deletions R/gkmsvm_classify.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

# Score a set of sequences with a trained gkm-SVM model.
#
# Collects all arguments into a named list and passes it to the compiled
# routine 'gkmSVM_gkmsvm_classify'; the result of that call is returned
# invisibly.
#
# Arguments:
#   seqfile            input sequences file name (FASTA format)
#   svmfnprfx          SVM model file name prefix; used to derive the
#                      support-vector file names when they are not given
#   outfile            output file name
#   L                  word length
#   K                  number of informative columns
#   maxnmm             maximum number of mismatches to consider
#   maxseqlen          maximum sequence length in the sequence files
#   maxnumseq          maximum number of sequences in the sequence files
#   useTgkm            filter type (0 full, 1 truncated, 2 h[m], 3 wildcard,
#                      4 mismatch)
#   alg                algorithm type (0 auto, 1 XOR hashtable, 2 tree)
#   addRC              add reverse complement sequences
#   usePseudocnt       add a constant to count estimates
#   batchSize          number of sequences to score per batch
#   wildcardLambda     lambda for the wildcard kernel
#   wildcardMismatchM  max mismatch for the mismatch or wildcard kernel
#   alphabetFN         alphabets file name ("NULL" when not specified)
#   svseqfile          support-vector sequence file name; defaults to
#                      '<svmfnprfx>_svseq.fa' when NA
#   alphafile          support-vector weights file name; defaults to
#                      '<svmfnprfx>_svalpha.out' when NA
gkmsvm_classify <- function(seqfile,
                            svmfnprfx,
                            outfile,
                            L = 10,
                            K = 6,
                            maxnmm = 3,
                            maxseqlen = 10000,
                            maxnumseq = 1000000,
                            useTgkm = 1,
                            alg = 0,
                            addRC = TRUE,
                            usePseudocnt = FALSE,
                            batchSize = 100000,
                            wildcardLambda = 1.0,
                            wildcardMismatchM = 2,
                            alphabetFN = "NULL",
                            svseqfile = NA,
                            alphafile = NA) {
  # Each model file falls back to its prefix-derived name on its own, so a
  # caller may override just one of the two without the other ending up NA
  # (or being silently replaced).
  if (is.na(svseqfile)) {
    svseqfile <- paste(svmfnprfx, "svseq.fa", sep = "_")
  }
  if (is.na(alphafile)) {
    alphafile <- paste(svmfnprfx, "svalpha.out", sep = "_")
  }

  params <- list(
    seqfile = seqfile,
    svseqfile = svseqfile,
    alphafile = alphafile,
    outfile = outfile,
    L = L,
    K = K,
    maxnmm = maxnmm,
    maxseqlen = maxseqlen,
    maxnumseq = maxnumseq,
    useTgkm = useTgkm,
    alg = alg,
    addRC = addRC,
    usePseudocnt = usePseudocnt,
    batchSize = batchSize,
    wildcardLambda = wildcardLambda,
    wildcardMismatchM = wildcardMismatchM,
    alphabetFN = alphabetFN
  )

  invisible(.Call("gkmSVM_gkmsvm_classify", PACKAGE = "gkmSVM", params))
}





38 changes: 38 additions & 0 deletions R/gkmsvm_kernel.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

# Compute the gkm kernel matrix for a positive and a negative sequence set.
#
# Packs the arguments into a named list and forwards it to the compiled
# routine 'gkmSVM_gkmsvm_kernel'; the result of that call is returned
# invisibly.
#
# Arguments:
#   posfile            positive set file name (FASTA format)
#   negfile            negative set file name (FASTA format)
#   outfile            output (kernel matrix) file name
#   L                  word length
#   K                  number of informative columns
#   maxnmm             maximum number of mismatches to consider
#   maxseqlen          maximum sequence length in the sequence files
#   maxnumseq          maximum number of sequences in the sequence files
#   useTgkm            filter type
#   alg                algorithm type
#   addRC              add reverse complement sequences
#   usePseudocnt       add a constant to count estimates
#   wildcardLambda     lambda for the wildcard kernel
#   wildcardMismatchM  max mismatch for the mismatch or wildcard kernel
#   alphabetFN         alphabets file name ("NULL" when not specified)
#
# 'OutputBinary' is not exposed to the caller; it is always passed as FALSE.
gkmsvm_kernel <- function(posfile,
                          negfile,
                          outfile,
                          L = 10,
                          K = 6,
                          maxnmm = 3,
                          maxseqlen = 10000,
                          maxnumseq = 1000000,
                          useTgkm = 1,
                          alg = 0,
                          addRC = TRUE,
                          usePseudocnt = FALSE,
                          wildcardLambda = 1.0,
                          wildcardMismatchM = 2,
                          alphabetFN = "NULL") {
  # Element order is kept exactly as the native routine has always received it.
  native_args <- list(
    L = L, K = K, maxnmm = maxnmm,
    maxseqlen = maxseqlen, maxnumseq = maxnumseq,
    useTgkm = useTgkm, alg = alg,
    addRC = addRC, usePseudocnt = usePseudocnt,
    OutputBinary = FALSE,
    posfile = posfile, negfile = negfile, outfile = outfile,
    wildcardLambda = wildcardLambda,
    wildcardMismatchM = wildcardMismatchM,
    alphabetFN = alphabetFN
  )

  native_result <- .Call("gkmSVM_gkmsvm_kernel", PACKAGE = "gkmSVM", native_args)
  invisible(native_result)
}
50 changes: 50 additions & 0 deletions R/gkmsvm_train.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

# Train an SVM on a precomputed kernel matrix and save its support vectors.
#
# Reads the positive and negative FASTA files to obtain the sequence counts
# and names, reads the kernel matrix from 'kernelfn' (the lower triangle is
# mirrored into the upper triangle), fits the model with kernlab::ksvm and,
# when the model has support vectors, writes two files:
#   <svmfnprfx>_svalpha.out  tab-separated sequence name and weight, one
#                            support vector per line
#   <svmfnprfx>_svseq.fa     the support-vector sequences (FASTA)
#
# Arguments:
#   kernelfn   kernel matrix file name
#   posfn      positive sequences file name (FASTA format)
#   negfn      negative sequences file name (FASTA format)
#   svmfnprfx  prefix of the output file names for the SVM model
#   Type       forwarded to kernlab::ksvm as 'type'
#   C          forwarded to kernlab::ksvm as 'C'
#   ...        further arguments forwarded to kernlab::ksvm
#
# Stops with an error when a required package is unavailable or when the
# kernel matrix does not have one row and column per input sequence. Emits a
# warning (and writes nothing) when the fitted model has no support vectors.
# Returns NULL invisibly.
gkmsvm_train <- function(kernelfn, posfn, negfn, svmfnprfx, Type = "C-svc", C = 1, ...) {
  # TODO: add bootstrapping and cross-validation capabilities, and automatic
  # choice of C (check whether kernlab already does that).

  # Fail loudly instead of silently doing nothing when a dependency is missing.
  for (pkg in c("seqinr", "utils", "kernlab")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("gkmsvm_train requires the '", pkg, "' package", call. = FALSE)
    }
  }

  pos <- seqinr::read.fasta(posfn)
  npos <- length(pos)
  neg <- seqinr::read.fasta(negfn)
  nneg <- length(neg)
  nseq <- npos + nneg

  # fill = TRUE pads the short rows of a triangular file with NA.
  mat <- data.matrix(utils::read.table(
    file = kernelfn, fill = TRUE,
    col.names = paste("V", seq_len(nseq))
  ))
  if (nrow(mat) != nseq || ncol(mat) != nseq) {
    stop("kernel matrix in '", kernelfn, "' is ", nrow(mat), " x ", ncol(mat),
         " but the sequence files contain ", nseq, " sequences",
         call. = FALSE)
  }

  # Mirror the lower triangle into the upper triangle to get a full
  # symmetric matrix.
  mat[upper.tri(mat)] <- t(mat)[upper.tri(mat)]
  rownames(mat) <- colnames(mat)
  K <- kernlab::as.kernelMatrix(mat)

  # Positives come first in the matrix, so they are labelled 1 and the
  # negatives 0.
  y <- c(rep(1, npos), rep(0, nneg))
  names(y) <- rownames(mat)

  svp <- kernlab::ksvm(K, y, type = Type, C = C, ...)

  seqnames <- c(names(pos), names(neg))

  if (svp@nSV > 0) {
    alpha <- unlist(svp@alpha)
    ii <- unlist(svp@SVindex)
    # Support vectors whose index falls in the negative set get a negated
    # weight.
    jj <- which(ii > npos)
    alpha[jj] <- -alpha[jj]

    utils::write.table(cbind(seqnames[ii], sprintf("%11.6e", alpha)),
                       file = paste(svmfnprfx, "svalpha.out", sep = "_"),
                       col.names = FALSE, row.names = FALSE, quote = FALSE,
                       sep = "\t")

    svseqs <- c(pos, neg)[ii]
    seqinr::write.fasta(svseqs, names(svseqs),
                        file.out = paste(svmfnprfx, "svseq.fa", sep = "_"))
  } else {
    warning("gkmsvm_train: the fitted model has no support vectors; ",
            "no model files were written", call. = FALSE)
  }

  invisible(NULL)
}
17 changes: 17 additions & 0 deletions gkmSVM.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
128 changes: 128 additions & 0 deletions man/gkmSVM-package.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
\name{gkmSVM-package}
\alias{gkmSVM-package}
\alias{gkmSVM}
\docType{package}
\title{
Gapped-Kmer Support Vector Machine
}
\description{
Imports the 'gkmSVM' v2.0 functionalities into R (www.beerlab.org/gkmsvm). It also uses the 'kernlab' library (separate R package by different authors) for various SVM algorithms.
}
\details{
\tabular{ll}{
Package: \tab gkmSVM\cr
Type: \tab Package\cr
Version: \tab 0.55\cr
Date: \tab 2015-06-29\cr
License: \tab GPL (>= 2)\cr
}

The gkm-SVM provides implementation of a new SVM kernel method using gapped
k-mers as features for DNA or Protein sequences.

There are three main functions in the gkmSVM package:

gkmsvm_kernel: computes the kernel matrix

gkmsvm_train: computes the SVM coefficients

gkmsvm_classify: scores new sequences using the SVM model


Tutorial

========

We introduce the users to the basic workflow of our gkmSVM step-by-step.
Please refer to help messages for more detailed information of each function.

1) making a kernel matrix

First of all, we should calculate a full kernel matrix before training SVM
classifiers. In this tutorial, we are going to use test_positives.fa
as a positive set, and test_negatives.fa as a negative set.

#Input file names:

posfn= 'test_positives.fa' #positive set (FASTA format)

negfn= 'test_negatives.fa' #negative set (FASTA format)

testfn= 'test_testset.fa' #test set (FASTA format)


#Output file names:

kernelfn= 'test_kernel.txt' #kernel matrix

svmfnprfx= 'test_svmtrain' #SVM files

outfn = 'output.txt' #output scores for sequences in the test set

gkmsvm_kernel(posfn, negfn, kernelfn); #computes kernel

2) training SVM

We can now train an SVM classifier using the kernel matrix generated above. For that we use the gkmsvm_train function. It takes four arguments: kernel file, positive sequences file, negative sequences file, and prefix of output file names for the SVM model.

gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx); #trains SVM

It will generate two files, test_svmtrain_svalpha.out and
test_svmtrain_svseq.fa, which will then be used for classification/scoring
of test sequences as described below.

3) classification using SVM

gkmsvm_classify can be used to score any set of sequences. Here, we will
score the test sequences which are given in test_testset.fa. Note that the same
set of parameters used in the gkmsvm_kernel should always be specified for
optimal classification (here we used default parameters).

gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences


In a more advanced example, we set the word length L=18, and the number of non gapped positions K=7, and maximum number of mismatches maxnmm=4:

gkmsvm_kernel(posfn, negfn, kernelfn, L=18, K=7, maxnmm=4); #computes kernel

gkmsvm_train(kernelfn,posfn, negfn, svmfnprfx); #trains SVM

gkmsvm_classify(testfn, svmfnprfx, outfn, L=18, K=7, maxnmm=4); #scores test sequences

}
\author{
Mahmoud Ghandi

Maintainer: Mahmoud Ghandi <mghandi@gmail.com>
}
\references{
Ghandi M, Lee D, Mohammad-Noori M, Beer MA. 2014. Enhanced Regulatory Sequence Prediction Using Gapped k-mer Features. PLoS Comput Biol 10: e1003711.

Ghandi M et al., gkmSVM a package for gapped-kmer SVM in R, Bioinformatics (in prep.)
}
\keyword{package}
\keyword{gkmSVM}
\keyword{kernel}
\keyword{SVM}
\examples{
#Input file names:
posfn= 'test_positives.fa' #positive set (FASTA format)
negfn= 'test_negatives.fa' #negative set (FASTA format)
testfn= 'test_testset.fa' #test set (FASTA format)

#Output file names:
kernelfn= 'test_kernel.txt' #kernel matrix
svmfnprfx= 'test_svmtrain' #SVM files
outfn = 'output.txt' #output scores for sequences in the test set

# gkmsvm_kernel(posfn, negfn, kernelfn); #computes kernel
# gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx); #trains SVM
# gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences

# using L=18, K=7, maxnmm=4

# gkmsvm_kernel(posfn, negfn, kernelfn, L=18, K=7, maxnmm=4); #computes kernel
# gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx); #trains SVM
# gkmsvm_classify(testfn, svmfnprfx, outfn, L=18, K=7, maxnmm=4); #scores test sequences

}
57 changes: 57 additions & 0 deletions man/gkmsvm_classify.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
\name{gkmsvm_classify}
\alias{gkmsvm_classify}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{Classifying(/scoring) new sequences using the gkmSVM model}
\description{Given support vectors SVs and corresponding coefficients alphas and a set of
sequences, calculates the SVM scores for the sequences.}
\usage{gkmsvm_classify(seqfile, svmfnprfx, outfile, L=10, K=6, maxnmm=3,
maxseqlen=10000, maxnumseq=1000000, useTgkm=1, alg=0, addRC=TRUE, usePseudocnt=FALSE,
batchSize=100000, wildcardLambda=1.0, wildcardMismatchM=2, alphabetFN="NULL",
svseqfile=NA, alphafile=NA)}
\arguments{
\item{seqfile}{input sequences file name (FASTA format)}
\item{svmfnprfx}{SVM model file name prefix}
\item{outfile}{output file name}
\item{L}{word length, default=10}
\item{K}{number of informative columns, default=6}
\item{maxnmm}{maximum number of mismatches to consider, default=3}
\item{maxseqlen}{maximum sequence length in the sequence files, default=10000}
\item{maxnumseq}{maximum number of sequences in the sequence files, default=1000000}
\item{useTgkm}{filter type: 0(use full filter), 1(use truncated filter: this guarantees non-negative counts for all L-mers), 2(use h[m], gkm count vector), 3(wildcard), 4(mismatch), default=1}
\item{alg}{algorithm type: 0(auto), 1(XOR Hashtable), 2(tree), default=0}
\item{addRC}{adds reverse complement sequences, default=TRUE}
\item{usePseudocnt}{adds a constant to count estimates, default=FALSE}
\item{batchSize}{number of sequences to compute scores for in batch, default=100000}
\item{wildcardLambda}{lambda for wildcard kernel, default=1.0}
\item{wildcardMismatchM}{max mismatch for Mismatch kernel or wildcard kernel, default=2}
\item{alphabetFN}{alphabets file name, if not specified, it is assumed the inputs are DNA sequences}
\item{svseqfile}{SVM support vectors sequence file name (not needed if svmfnprfx is provided)}
\item{alphafile}{SVM support vectors weights file name (not needed if svmfnprfx is provided)}

}
\details{ classification using SVM:
gkmsvm_classify can be used to score any set of sequences. Note that the same
set of parameters (L, K, maxnmm) used in the gkmsvm_kernel should be specified for
optimal classification.

gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences }
\author{Mahmoud Ghandi}
\examples{
#Input file names:
posfn= 'test_positives.fa' #positive set (FASTA format)
negfn= 'test_negatives.fa' #negative set (FASTA format)
testfn= 'test_testset.fa' #test set (FASTA format)

#Output file names:
kernelfn= 'test_kernel.txt' #kernel matrix
svmfnprfx= 'test_svmtrain' #SVM files
outfn = 'output.txt' #output scores for sequences in the test set

# gkmsvm_kernel(posfn, negfn, kernelfn); #computes kernel
# gkmsvm_train(kernelfn,posfn, negfn, svmfnprfx); #trains SVM
# gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences
}

% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
%\keyword{gkmsvm_classify}
Loading

0 comments on commit 841b197

Please sign in to comment.