-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathADC_SNP_Data_Extract.R
87 lines (62 loc) · 2.43 KB
/
ADC_SNP_Data_Extract.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
setwd("Z:/Leng/LUAD/broad.mit.edu_LUAD.Genome_Wide_SNP_6.Level_2")
#read in SNP data, check if they are birdseed (SNP genotyping algorithm)
filenames <- list.files()
idx <- NULL
for (i in c(1:length(filenames))){
if ("birdseed" == strsplit(filenames[i],"\\.")[[1]][2]) idx <- c(idx,filenames[i])
}
N <- length(filenames)
file1 <- read.table(idx[1],sep="\t",header=F,skip=2)
M <- nrow(file1)
#Preallocate data matrix
data <- data.frame(matrix(ncol = M, nrow = N))
#Columns of files are: Composite Element REF, Call, Confidence
#SNP IDs are same for each file.
#Extract out SNP IDs and first file's data. Data from remaining files will be added.
trans.file1 <- t(file1[,c(1,2)])
data[1:2,] <- trans.file1
#Make one big 'ole data file.
for (i in 2:N){
file <- read.table(idx[i],sep="\t",header=F,skip=2)
trans.file <- t(file[,2])
data[i+1,] <- trans.file
}
# this takes a real long time.
# write.csv(data,"D:/Phil/SNPs/output/SNP_data.csv", row.names=c("Name",idx))
# extract SNPs listed in genes
# want a vector containing all SNPs for the genes, to cut down the data file
# and list of vectors containing SNPs for each gene.
# setwd("/home/delores/Academic/LRRI2013/Phil/SNPs/output/SNP_Info/")
setwd("D:/Phil/SNPs/output/SNP_Info/")
info.filenames <- list.files()
L <- length(info.filenames)
gene.SNPs.list <- vector(mode = "list", length = L)
all.SNPs <- c()
for (i in 1:L){
filename <- info.filenames[i]
gene <- strsplit(filename, "_")[[1]][1]
gene.file <- read.csv(filename, header=TRUE)
gene.SNPs.list[[i]] <- as.character(gene.file[,1])
names(gene.SNPs.list)[[i]] <- gene
all.SNPs <- c(all.SNPs, as.character(gene.file[,1]))
}
all.SNPs <- unique(all.SNPs)
#cut down data
SNPs.ind <- match(all.SNPs, data[1,])
#Not all SNPs in gene regions have corresponding entry in SNP-array files
SNPs.ind2 <- SNPs.ind[-which(is.na(SNPs.ind) == TRUE)]
data <- data[, SNPs.ind2]
names(data) <- data[1,]
data <- data[-1,]
Name <- idx
data <- cbind(Name,data)
#make a separate file of SNP data for each gene.
setwd("D:/Phil/SNPs/output/ADC_SNP_data/")
O <- length(gene.SNPs.list)
for (i in 1:O){
gene <- names(gene.SNPs.list)[i]
match.idx <- match(gene.SNPs.list[[i]], names(data))
match.idx <- match.idx[-(which(is.na(match.idx) == TRUE))]
filename <- paste(gene, "_ADC_SNP_data.csv", sep="")
write.csv(data[,c(1,match.idx)], filename, row.names=FALSE)
}