-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclusters.R
108 lines (96 loc) · 4.11 KB
/
clusters.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
library(data.table)
library(openxlsx)
args = commandArgs(trailingOnly=TRUE)
chrs <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10",
"chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY")
wb <- loadWorkbook("/Users/mar/BIO/PROJECTS/APOBEC/NONBDNA/Gordenin/Suppl_tables_only_journal.pbio.3000464.s007/TabS5_journal.pbio.3000464.s011.xlsx")
dataClusters <- read.xlsx(wb,sheet=2)
dataClusters <- data.table(dataClusters)
dataClusters[cancer_type == "Bladder-TCC ", cancer := "BLCA"]
dataClusters[cancer_type == "Breast-AdenoCA ", cancer := "BRCA"]
dataClusters[cancer_type == "Head-SCC ", cancer := "HNSC"]
dataClusters[cancer_type == "Cervix-SCC ", cancer := "CESC"]
dataClusters[cancer_type == "Lung-AdenoCA ", cancer := "LUAD"]
dataClusters[cancer_type == "Lung-SCC ", cancer := "LUSC"]
if (length(args)!=0) {
arg1 <- args[1]
dataClusters <- dataClusters[CG_cluster_classification_more_3mutations == arg1]
} else {
arg1 <- "noargs"
}
activity <- read.csv("/Users/mar/BIO/PROJECTS/PCAWG_APOBEC/PCAWG_enrichment_6cancers.txt",sep='\t',header=FALSE,strip.white =TRUE)
activity <- data.table(activity)
setnames(activity,c("project","sample","enrichment"))
activity[,cancer := substr(project,1,4)]
strs <- list()
for(str in c("apr","dr","gq","ir","mr","str","z"))
{
dt <- data.table()
for(chr in chrs){
dttmp <- read.csv(paste0("/Users/mar/BIO/BIODATA/nonBDNADB/human_hg19.tsv/",str,"/tsv/",chr,"_",toupper(str),".tsv"), sep='\t')
dt <- rbind(dt, data.table(dttmp))
}
strs[str] <- list(dt)
}
results <- data.table()
for(canc in c("BRCA","HNSC","CESC","LUAD","LUSC")){
print(canc)
dc <- dataClusters[cancer == canc]
da <- activity[cancer == canc]
samples <- unique(da$sample)
for(s in samples){
print(s)
dcs <- dc[Tumor_Sample_Barcode == s]
if(nrow(dcs) == 0){
print(paste0(canc,': ',s))
}
cntList <- list()
sizeList <- list()
totalsize <- dcs[,sum(Cluster_Length)]
totalsize_check <- 0
for(str in c("apr","dr","gq","ir","mr","str","z")){
dt <- data.table(strs[[str]])
cnt <- 0
sizeIn <- 0
sizeOut <- 0
if(nrow(dcs) > 0) {
for(i in 1:nrow(dcs)){
chr <- dcs[i]$Chromosome
spos <- dcs[i]$Cluster_start
epos <- dcs[i]$Cluster_end
dtt <- copy(dt)
dtt[Sequence_name == chr, maxStart := ifelse(spos > Start, spos, Start)]
dtt[Sequence_name == chr, minEnd := ifelse(epos > Stop, Stop, epos)]
#dtintersect1 <- dt[Sequence_name == chr & ((spos >= Start & Start >= epos) | (spos >= Stop & Stop >= epos))]
dtintersect2 <- dtt[!is.na(maxStart) & ((minEnd - maxStart + 1) > 0)]
if(nrow(dtintersect2) > 0){
curSize <- dtintersect2[,sum(minEnd - maxStart + 1)]
} else {
curSize <- 0
}
sizeIn <- sizeIn + curSize
}
}
sizeList[str] <- sizeIn
}
results <- rbind(results, data.table("cancer"=canc,
"sample"=s,
"apr"=cntList[['apr']],
"aprSize"=sizeList[['apr']],
"dr"=cntList[['dr']],
"drSize"=sizeList[['dr']],
"gq"=cntList[['gq']],
"gqSize"=sizeList[['gq']],
"ir"=cntList[['ir']],
"irSize"=sizeList[['ir']],
"mr"=cntList[['mr']],
"mrSize"=sizeList[['mr']],
"str"=cntList[['str']],
"strSize"=sizeList[['str']],
"z"=cntList[['z']],
"zSize"=sizeList[['z']],
"cnt"=nrow(dcs),
"totalsize"=totalsize))
}
}
write.csv(results,paste0("nonbdna_clusters_",arg1,".txt"))