-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 841b197
Showing
73 changed files
with
14,167 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Package: gkmSVM | ||
Type: Package | ||
Title: Gapped-Kmer Support Vector Machine | ||
Version: 0.55 | ||
Date: 2015-06-29 | ||
Author: Mahmoud Ghandi | ||
Maintainer: Mahmoud Ghandi <mghandi@gmail.com> | ||
Description: Imports the 'gkmSVM' v2.0 functionalities into R (www.beerlab.org/gkmsvm). It also uses the 'kernlab' library (separate R package by different authors) for various SVM algorithms. | ||
License: GPL (>= 2) | ||
Imports: Rcpp, kernlab, seqinr, utils | ||
LinkingTo: Rcpp | ||
SystemRequirements: C++11 |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
useDynLib(gkmSVM) | ||
exportPattern("^[[:alpha:]]+") | ||
importFrom(Rcpp, evalCpp) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# This file was generated by Rcpp::compileAttributes | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
#gkmsvm_classify <- function(params) { | ||
# invisible(.Call('gkmSVM_gkmsvm_classify', PACKAGE = 'gkmSVM', params)) | ||
#} | ||
|
||
#gkmsvm_kernel <- function(params) { | ||
# invisible(.Call('gkmSVM_gkmsvm_kernel', PACKAGE = 'gkmSVM', params)) | ||
#} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
|
||
# Scores the sequences in `seqfile` with a trained gkm-SVM model by packing
# all settings into a named list and passing it to the compiled routine
# 'gkmSVM_gkmsvm_classify'.
#
# Arguments:
#   seqfile           input sequences file name (FASTA format)
#   svmfnprfx         SVM model file name prefix; only used to derive
#                     whichever of svseqfile/alphafile is not supplied
#   outfile           output file name
#   L                 word length
#   K                 number of informative columns
#   maxnmm            maximum number of mismatches to consider
#   maxseqlen         maximum sequence length in the sequence files
#   maxnumseq         maximum number of sequences in the sequence files
#   useTgkm           filter type (0 full, 1 truncated, 2 h[m], 3 wildcard,
#                     4 mismatch)
#   alg               algorithm type (0 auto, 1 XOR hashtable, 2 tree)
#   addRC             add reverse complement sequences
#   usePseudocnt      add a constant to count estimates
#   batchSize         number of sequences to score per batch
#   wildcardLambda    lambda for the wildcard kernel
#   wildcardMismatchM max mismatch for the mismatch or wildcard kernel
#   alphabetFN        alphabets file name; the string "NULL" is the default
#   svseqfile         support vector sequence file; defaults to
#                     "<svmfnprfx>_svseq.fa"
#   alphafile         support vector weights file; defaults to
#                     "<svmfnprfx>_svalpha.out"
#
# Returns (invisibly) whatever the compiled routine returns.
gkmsvm_classify <- function(seqfile,
                            svmfnprfx,
                            outfile,
                            L = 10,
                            K = 6,
                            maxnmm = 3,
                            maxseqlen = 10000,
                            maxnumseq = 1000000,
                            useTgkm = 1,
                            alg = 0,
                            addRC = TRUE,
                            usePseudocnt = FALSE,
                            batchSize = 100000,
                            wildcardLambda = 1.0,
                            wildcardMismatchM = 2,
                            alphabetFN = "NULL",
                            svseqfile = NA,
                            alphafile = NA) {

  # A model file counts as "not supplied" when it is NULL or a single NA.
  is_unset <- function(x) {
    is.null(x) || (length(x) == 1L && is.na(x))
  }

  # Resolve the two model files independently: supplying only one of them
  # must neither leave the other as NA nor discard the one that was given.
  if (is_unset(svseqfile)) {
    svseqfile <- paste(svmfnprfx, 'svseq.fa', sep = '_')
  }
  if (is_unset(alphafile)) {
    alphafile <- paste(svmfnprfx, 'svalpha.out', sep = '_')
  }

  params <- list(seqfile = seqfile,
                 svseqfile = svseqfile,
                 alphafile = alphafile,
                 outfile = outfile,
                 L = L,
                 K = K,
                 maxnmm = maxnmm,
                 maxseqlen = maxseqlen,
                 maxnumseq = maxnumseq,
                 useTgkm = useTgkm,
                 alg = alg,
                 addRC = addRC,
                 usePseudocnt = usePseudocnt,
                 batchSize = batchSize,
                 wildcardLambda = wildcardLambda,
                 wildcardMismatchM = wildcardMismatchM,
                 alphabetFN = alphabetFN)

  invisible(.Call('gkmSVM_gkmsvm_classify', PACKAGE = 'gkmSVM', params))
}
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
|
||
# Computes the gkm kernel matrix for a positive and a negative sequence set.
# All settings are gathered into one named list and handed to the compiled
# routine 'gkmSVM_gkmsvm_kernel'; binary output is always switched off
# (OutputBinary = FALSE).
#
# posfile / negfile are the input sequence files and outfile is the kernel
# output file. The remaining arguments carry the same names and defaults as
# the corresponding arguments of gkmsvm_classify.
#
# Returns (invisibly) whatever the compiled routine returns.
gkmsvm_kernel <- function(posfile,
                          negfile,
                          outfile,
                          L = 10,
                          K = 6,
                          maxnmm = 3,
                          maxseqlen = 10000,
                          maxnumseq = 1000000,
                          useTgkm = 1,
                          alg = 0,
                          addRC = TRUE,
                          usePseudocnt = FALSE,
                          wildcardLambda = 1.0,
                          wildcardMismatchM = 2,
                          alphabetFN = "NULL") {
  kernel_args <- list(
    L = L, K = K, maxnmm = maxnmm,
    maxseqlen = maxseqlen, maxnumseq = maxnumseq,
    useTgkm = useTgkm, alg = alg,
    addRC = addRC, usePseudocnt = usePseudocnt,
    OutputBinary = FALSE,
    posfile = posfile, negfile = negfile, outfile = outfile,
    wildcardLambda = wildcardLambda,
    wildcardMismatchM = wildcardMismatchM,
    alphabetFN = alphabetFN
  )

  status <- .Call('gkmSVM_gkmsvm_kernel', PACKAGE = 'gkmSVM', kernel_args)
  invisible(status)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
|
||
# Trains an SVM on a precomputed kernel matrix and writes the model files.
#
# Arguments:
#   kernelfn   kernel matrix file, read with utils::read.table
#   posfn      positive sequences file (FASTA), read with seqinr::read.fasta
#   negfn      negative sequences file (FASTA), read with seqinr::read.fasta
#   svmfnprfx  prefix of the output files "<prefix>_svalpha.out" and
#              "<prefix>_svseq.fa"
#   Type       forwarded to kernlab::ksvm as `type`
#   C          forwarded to kernlab::ksvm as `C`
#   ...        further arguments forwarded to kernlab::ksvm
#
# Side effects: when the fitted model has support vectors, writes their
# names with signed coefficients to "<prefix>_svalpha.out" (tab separated)
# and their sequences to "<prefix>_svseq.fa".
#
# Returns NULL invisibly. Stops with an error when a required package is
# missing, when no sequences were read, or when the kernel matrix size does
# not match the number of sequences.
gkmsvm_train <- function(kernelfn, posfn, negfn, svmfnprfx, Type = "C-svc", C = 1, ...) {
  # TODO: add bootstrapping and cross-validation capabilities, and automatic
  # choice of C (check whether kernlab already provides that).

  # Fail loudly instead of silently skipping training when a dependency is
  # not installed.
  required <- c("seqinr", "utils", "kernlab")
  installed <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  if (!all(installed)) {
    stop("gkmsvm_train needs the following package(s): ",
         paste(required[!installed], collapse = ", "), call. = FALSE)
  }

  pos <- seqinr::read.fasta(posfn)
  npos <- length(pos)
  neg <- seqinr::read.fasta(negfn)
  nneg <- length(neg)
  nseq <- npos + nneg
  if (nseq == 0L) {
    stop("no sequences found in '", posfn, "' or '", negfn, "'", call. = FALSE)
  }

  # The file is read with fill = TRUE and the upper triangle is then copied
  # from the lower one, so a lower-triangular layout is presumably expected.
  mat <- data.matrix(utils::read.table(file = kernelfn, fill = TRUE,
                                       col.names = paste("V", seq_len(nseq))))
  if (nrow(mat) != nseq) {
    stop("kernel matrix in '", kernelfn, "' has ", nrow(mat),
         " row(s) but ", nseq, " sequence(s) were read", call. = FALSE)
  }
  mat[upper.tri(mat)] <- t(mat)[upper.tri(mat)]
  rownames(mat) <- colnames(mat)
  K <- kernlab::as.kernelMatrix(mat)

  # Labels follow the row order: positives first (1), then negatives (0).
  y <- c(rep(1, npos), rep(0, nneg))
  names(y) <- rownames(mat)

  svp <- kernlab::ksvm(K, y, type = Type, C = C, ...)

  seqnames <- c(names(pos), names(neg))

  if (svp@nSV > 0) {
    alpha <- unlist(svp@alpha)
    ii <- unlist(svp@SVindex)
    # Support vectors with an index above npos come from the negative set;
    # their coefficients are written with a negative sign.
    jj <- which(ii > npos)
    alpha[jj] <- -alpha[jj]

    utils::write.table(cbind(seqnames[ii], sprintf("%11.6e", alpha)),
                       file = paste(svmfnprfx, 'svalpha.out', sep = '_'),
                       col.names = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')

    svseqs <- c(pos, neg)[ii]
    seqinr::write.fasta(svseqs, names(svseqs),
                        file.out = paste(svmfnprfx, 'svseq.fa', sep = '_'))
  } else {
    warning("SVM training produced no support vectors; no model files were written",
            call. = FALSE)
  }

  invisible(NULL)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
Version: 1.0 | ||
|
||
RestoreWorkspace: Default | ||
SaveWorkspace: Default | ||
AlwaysSaveHistory: Default | ||
|
||
EnableCodeIndexing: Yes | ||
UseSpacesForTab: Yes | ||
NumSpacesForTab: 2 | ||
Encoding: UTF-8 | ||
|
||
RnwWeave: Sweave | ||
LaTeX: pdfLaTeX | ||
|
||
BuildType: Package | ||
PackageUseDevtools: Yes | ||
PackageInstallArgs: --no-multiarch --with-keep.source |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
\name{gkmSVM-package} | ||
\alias{gkmSVM-package} | ||
\alias{gkmSVM} | ||
\docType{package} | ||
\title{ | ||
Gapped-Kmer Support Vector Machine | ||
} | ||
\description{ | ||
Imports the 'gkmSVM' v2.0 functionalities into R (www.beerlab.org/gkmsvm). It also uses the 'kernlab' library (separate R package by different authors) for various SVM algorithms. | ||
} | ||
\details{ | ||
\tabular{ll}{ | ||
Package: \tab gkmSVM\cr | ||
Type: \tab Package\cr | ||
Version: \tab 0.55\cr | ||
Date: \tab 2015-06-29\cr | ||
License: \tab GPL (>= 2)\cr | ||
} | ||
|
||
The gkm-SVM provides an implementation of a new SVM kernel method using gapped | ||
k-mers as features for DNA or Protein sequences. | ||
|
||
There are three main functions in the gkmSVM package: | ||
|
||
gkmsvm_kernel: computes the kernel matrix | ||
|
||
gkmsvm_train: computes the SVM coefficients | ||
|
||
gkmsvm_classify: scores new sequences using the SVM model | ||
|
||
|
||
Tutorial | ||
|
||
======== | ||
|
||
We introduce the users to the basic workflow of our gkmSVM step-by-step. | ||
Please refer to help messages for more detailed information of each function. | ||
|
||
1) making a kernel matrix | ||
|
||
First of all, we should calculate a full kernel matrix before training SVM | ||
classifiers. In this tutorial, we are going to use test_positives.fa | ||
as a positive set, and test_negatives.fa as a negative set. | ||
|
||
#Input file names: | ||
|
||
posfn= 'test_positives.fa' #positive set (FASTA format) | ||
|
||
negfn= 'test_negatives.fa' #negative set (FASTA format) | ||
|
||
testfn= 'test_testset.fa' #test set (FASTA format) | ||
|
||
|
||
#Output file names: | ||
|
||
kernelfn= 'test_kernel.txt' #kernel matrix | ||
|
||
svmfnprfx= 'test_svmtrain' #SVM files | ||
|
||
outfn = 'output.txt' #output scores for sequences in the test set | ||
|
||
gkmsvm_kernel(posfn, negfn, kernelfn); #computes kernel | ||
|
||
2) training SVM | ||
|
||
We can now train an SVM classifier using the kernel matrix generated above. For that we use the gkmsvm_train function. It takes four arguments: kernel file, positive sequences file, negative sequences file, and prefix of output file names for the SVM model. | ||
|
||
gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx); #trains SVM | ||
|
||
It will generate two files, test_svmtrain_svalpha.out and | ||
test_svmtrain_svseq.fa, which will then be used for classification/scoring | ||
of test sequences as described below. | ||
|
||
3) classification using SVM | ||
|
||
gkmsvm_classify can be used to score any set of sequences. Here, we will | ||
score the test sequences which are given in test_testset.fa. Note that the same | ||
set of parameters used in the gkmsvm_kernel should always be specified for | ||
optimal classification (here we used default parameters). | ||
|
||
gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences | ||
|
||
|
||
In a more advanced example, we set the word length L=18, the number of non-gapped positions K=7, and the maximum number of mismatches maxnmm=4: | ||
|
||
gkmsvm_kernel(posfn, negfn, kernelfn, L=18, K=7, maxnmm=4); #computes kernel | ||
|
||
gkmsvm_train(kernelfn,posfn, negfn, svmfnprfx); #trains SVM | ||
|
||
gkmsvm_classify(testfn, svmfnprfx, outfn, L=18, K=7, maxnmm=4); #scores test sequences | ||
|
||
} | ||
\author{ | ||
Mahmoud Ghandi | ||
|
||
Maintainer: Mahmoud Ghandi <mghandi@gmail.com> | ||
} | ||
\references{ | ||
Ghandi M, Lee D, Mohammad-Noori M, Beer MA. 2014. Enhanced Regulatory Sequence Prediction Using Gapped k-mer Features. PLoS Comput Biol 10: e1003711. | ||
|
||
Ghandi M et al., gkmSVM a package for gapped-kmer SVM in R, Bioinformatics (in prep.) | ||
} | ||
\keyword{package} | ||
\keyword{gkmSVM} | ||
\keyword{kernel} | ||
\keyword{SVM} | ||
\examples{ | ||
#Input file names: | ||
posfn= 'test_positives.fa' #positive set (FASTA format) | ||
negfn= 'test_negatives.fa' #negative set (FASTA format) | ||
testfn= 'test_testset.fa' #test set (FASTA format) | ||
|
||
#Output file names: | ||
kernelfn= 'test_kernel.txt' #kernel matrix | ||
svmfnprfx= 'test_svmtrain' #SVM files | ||
outfn = 'output.txt' #output scores for sequences in the test set | ||
|
||
# gkmsvm_kernel(posfn, negfn, kernelfn); #computes kernel | ||
# gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx); #trains SVM | ||
# gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences | ||
|
||
# using L=18, K=7, maxnmm=4 | ||
|
||
# gkmsvm_kernel(posfn, negfn, kernelfn, L=18, K=7, maxnmm=4); #computes kernel | ||
# gkmsvm_train(kernelfn, posfn, negfn, svmfnprfx); #trains SVM | ||
# gkmsvm_classify(testfn, svmfnprfx, outfn, L=18, K=7, maxnmm=4); #scores test sequences | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
\name{gkmsvm_classify} | ||
\alias{gkmsvm_classify} | ||
%- Also NEED an '\alias' for EACH other topic documented here. | ||
\title{Classifying(/scoring) new sequences using the gkmSVM model} | ||
\description{Given support vectors SVs and corresponding coefficients alphas and a set of | ||
sequences, calculates the SVM scores for the sequences.} | ||
\usage{gkmsvm_classify(seqfile, svmfnprfx, outfile, L=10, K=6, maxnmm=3, | ||
maxseqlen=10000, maxnumseq=1000000, useTgkm=1, alg=0, addRC=TRUE, usePseudocnt=FALSE, | ||
batchSize=100000, wildcardLambda=1.0, wildcardMismatchM=2, alphabetFN="NULL", | ||
svseqfile=NA, alphafile=NA)} | ||
\arguments{ | ||
\item{seqfile}{input sequences file name (FASTA format)} | ||
\item{svmfnprfx}{SVM model file name prefix} | ||
\item{outfile}{output file name} | ||
\item{L}{word length, default=10} | ||
\item{K}{number of informative columns, default=6} | ||
\item{maxnmm}{maximum number of mismatches to consider, default=3} | ||
\item{maxseqlen}{maximum sequence length in the sequence files, default=10000} | ||
\item{maxnumseq}{maximum number of sequences in the sequence files, default=1000000} | ||
\item{useTgkm}{filter type: 0(use full filter), 1(use truncated filter: this guarantees non-negative counts for all L-mers), 2(use h[m], gkm count vector), 3(wildcard), 4(mismatch), default=1} | ||
\item{alg}{algorithm type: 0(auto), 1(XOR Hashtable), 2(tree), default=0} | ||
\item{addRC}{adds reverse complement sequences, default=TRUE} | ||
\item{usePseudocnt}{adds a constant to count estimates, default=FALSE} | ||
\item{batchSize}{number of sequences to compute scores for in batch, default=100000} | ||
\item{wildcardLambda}{lambda for wildcard kernel, default=1.0} | ||
\item{wildcardMismatchM}{max mismatch for Mismatch kernel or wildcard kernel, default=2} | ||
\item{alphabetFN}{alphabets file name, if not specified, it is assumed the inputs are DNA sequences} | ||
\item{svseqfile}{SVM support vectors sequence file name (not needed if svmfnprfx is provided)} | ||
\item{alphafile}{SVM support vectors weights file name (not needed if svmfnprfx is provided)} | ||
|
||
} | ||
\details{ classification using SVM: | ||
gkmsvm_classify can be used to score any set of sequences. Note that the same | ||
set of parameters (L, K, maxnmm) used in the gkmsvm_kernel should be specified for | ||
optimal classification. | ||
|
||
gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences } | ||
\author{Mahmoud Ghandi} | ||
\examples{ | ||
#Input file names: | ||
posfn= 'test_positives.fa' #positive set (FASTA format) | ||
negfn= 'test_negatives.fa' #negative set (FASTA format) | ||
testfn= 'test_testset.fa' #test set (FASTA format) | ||
|
||
#Output file names: | ||
kernelfn= 'test_kernel.txt' #kernel matrix | ||
svmfnprfx= 'test_svmtrain' #SVM files | ||
outfn = 'output.txt' #output scores for sequences in the test set | ||
|
||
# gkmsvm_kernel(posfn, negfn, kernelfn); #computes kernel | ||
# gkmsvm_train(kernelfn,posfn, negfn, svmfnprfx); #trains SVM | ||
# gkmsvm_classify(testfn, svmfnprfx, outfn); #scores test sequences | ||
} | ||
|
||
% Add one or more standard keywords, see file 'KEYWORDS' in the | ||
% R documentation directory. | ||
%\keyword{gkmsvm_classify} |
Oops, something went wrong.