-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrunDESeq.R
327 lines (236 loc) · 10 KB
/
runDESeq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# Author: Navami Shenoy
# script for differential gene expression analysis of control vs.
# LMP1 knockout lymphoblastoid cell lines
setwd("F:/bulkRNASeq/GM12878")
#load libraries
library(dplyr)
library(tidyverse)
library(tibble)
library(DESeq2)
library(ggplot2)
library(ComplexHeatmap)
library(ggridges)
library(clusterProfiler)
library(EnhancedVolcano)
library("EnsDb.Hsapiens.v86")
library("org.Hs.eg.db")
#-------Read in the counts matrix--------#
counts_matrix <- read.csv("counts/counts_matrix_5_samples.csv", header = TRUE,
row.names = 1, sep = "\t", skip = 1)
#change column names to reflect sample IDs
colnames(counts_matrix) <- c("Control_Rep2", "Control_Rep3",
"LMP1_KO_Rep1", "LMP1_KO_Rep2", "LMP1_KO_Rep3")
head(counts_matrix)
#------------Pre-filtering--------------#
#keep rows with more than 10 reads total
counts_matrix <- counts_matrix[which(rowSums(counts_matrix) > 10), ]
#reduced the number of rows by more than half!
#---------------DESeq2------------------#
#create dataframe containing sample information
condition <- factor(c("control", "control",
"knockout", "knockout", "knockout"))
coldata <- data.frame(row.names = colnames(counts_matrix), condition)
head(coldata)
#construct a DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = counts_matrix,
colData = coldata,
design = ~condition)
#set reference level
dds$condition <- relevel(dds$condition, ref = "control")
#run DESeq2
dds <- DESeq(dds)
#change p-value
res_0.05 <- results(dds, alpha = 0.05, contrast = c("condition", "knockout", "control"))
#drop NAs
sigs <- na.omit(res_0.05)
#get a summary of the dds object
summary(res_0.05)
#------------Collect significant genes-------------#
#get differentially expressed genes with adjusted p-value below 0.05
sigs <- sigs[sigs$padj < 0.05, ]
sigs
#map gene IDs to gene names
sigs.df <- as.data.frame(sigs)
sigs.df$symbol <- ensembldb::mapIds(EnsDb.Hsapiens.v86, keys = row.names(sigs.df),
keytype = "GENEID", column = "SYMBOL")
#use gene ID as gene name for genes that did not
#map to a symbol in the previous step
for(i in seq_along(sigs.df$symbol)){
if(is.na(sigs.df$symbol[i])){
sigs.df$symbol[i] = row.names(sigs.df)[i]
}
}
#make a copy so the original df is available if
#we need it later
sigs.df.copy <- sigs.df
#-----------Visualization------------#
#PCA plot
vsdata <- vst(dds, blind = FALSE)
plotPCA(vsdata, intgroup = "condition")
#Dispersion plot
plotDispEsts(dds)
#MA plot
plotMA(res_0.05)
#Volcano plot
select.genes <- c("EBI3", "CCL22", "CD40", "MDM2","BATF",
"NFKB1", "NFKB2", "IRF4", "CBS", "BCAT1")
keyvals <- ifelse(
sigs.df.copy$log2FoldChange < -1, '#3ebdc8',
ifelse(sigs.df.copy$log2FoldChange > 1, '#F6BE00',
'darkgrey'))
#reorder df
x <- sigs.df.copy$symbol %in% select.genes
sigs.reordered <- rbind(sigs.df.copy[!x,], sigs.df.copy[x,])
#generate key-value pairs to change color and label schemes
keyvals <- ifelse(
sigs.reordered$symbol %in% select.genes,
ifelse(sigs.reordered$log2FoldChange < 0, '#3ebdc8',
ifelse(sigs.reordered$log2FoldChange > 0, '#F6BE00',
'darkgrey')),
'darkgrey'
)
keyvals[is.na(keyvals)] <- 'darkgrey'
names(keyvals)[keyvals == '#F6BE00'] <- 'Upregulated'
names(keyvals)[keyvals == 'darkgrey'] <- 'Not significant'
names(keyvals)[keyvals == '#3ebdc8'] <- 'Downregulated'
EnhancedVolcano(sigs.reordered, x = "log2FoldChange", y = "padj", lab = sigs.reordered$symbol,
pCutoff = 0.005, FCcutoff = 1,
pointSize = (ifelse(sigs.reordered$symbol %in% select.genes, 3,1)),
labSize = 4,
selectLab = select.genes,
drawConnectors = TRUE,
title = NULL,
subtitle = NULL,
caption = NULL,
colCustom = keyvals,
colAlpha = (ifelse(sigs.reordered$symbol %in% select.genes, 15, 0.8)),
axisLabSize = 10,
legendPosition = 'none',
gridlines.major = FALSE,
gridlines.minor = FALSE,
xlim = c(-27,27))
#Heatmap
#get normalized counts for significant genes from the dds object
mat <- counts(dds, normalized = TRUE)[rownames(sigs.df.copy), ]
#get the z-score for each row
mat.z <- t(apply(mat, 1, scale))
colnames(mat.z) <- colnames(mat)
Heatmap(mat.z, cluster_rows = TRUE, cluster_columns = TRUE, column_labels = colnames(mat.z),
name = "Z-score",
column_names_gp = grid::gpar(fontsize = 6),
column_names_side = "top", column_names_rot = 0,
column_names_centered = TRUE,
col = c("#0056b3","white","#ff0055"),
show_row_names = FALSE,
show_row_dend = FALSE,
km = 2)
#-------------Gene Ontology------------#
#over-representated pathway analysis
#get ENSEMBL IDs of genes with highly altered expression
#upregulated genes
upreg.genes <- row.names(sigs.df.copy[sigs.df.copy$log2FoldChange > 1, ])
upreg.pathways <- enrichGO(gene = upreg.genes, OrgDb = "org.Hs.eg.db",
keyType = "ENSEMBL", ont = "BP")
head(as.data.frame(upreg.pathways))
upreg <- upreg.pathways %>% dplyr::arrange(desc(Count))
head(upreg)
#plot upregulated pathways
fit_upreg <- plot(barplot(upreg.pathways, showCategory = 20))
#plot only top 10
ggplot(upreg[1:10,], aes(y = reorder(Description, +Count), x = Count, fill = p.adjust)) +
geom_col() +
scale_fill_gradient(low="blue", high="red") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
#downregulated genes
downreg.genes <- rownames(sigs.df.copy[sigs.df.copy$log2FoldChange < -1, ])
downreg.pathways <- enrichGO(gene = downreg.genes, OrgDb = "org.Hs.eg.db",
keyType = "ENSEMBL", ont = "BP")
head(as.data.frame(downreg.pathways))
downreg <- downreg.pathways %>% dplyr::arrange(desc(Count))
head(downreg)
#plot downregulated pathways
fit_downreg <- plot(barplot(downreg.pathways, showCategory = 20))
#plot only top 10
ggplot(downreg[1:10,], aes(y = reorder(Description, +Count), x = Count, fill = p.adjust)) +
geom_col() +
scale_fill_gradient(low="blue", high="red") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
#-----------Gene set enrichment analysis----------#
#--------GSEA GO---------#
#order genes by log2 fold change
res_gse <- sigs.df.copy[order(-sigs.df.copy$log2FoldChange), ]
genes_list <- res_gse$log2FoldChange
names(genes_list) <- rownames(sigs.df.copy)
#run GSEA
gsea.go <- gseGO(genes_list, ont = "BP", keyType = "ENSEMBL",
OrgDb = "org.Hs.eg.db")
gsea.df <- as.data.frame(gsea.go)
gsea.df.desc <- gsea.df[order(-gsea.df$NES), ]
#GSEA plot
#Downregulated top 3
#(none were upregulated)
n <- nrow(gsea.df)
gseaplot(gsea.go, geneSetID = which(gsea.df$Description == gsea.df.desc$Description[n]),
title = gsea.df.desc$Description[n])
gseaplot(gsea.go, geneSetID = which(gsea.df$Description == gsea.df.desc$Description[n-1]),
title = gsea.df.desc$Description[n-1])
gseaplot(gsea.go, geneSetID = which(gsea.df$Description == gsea.df.desc$Description[n-2]),
title = gsea.df.desc$Description[n-2])
#Ridge plot
ridgeplot(gsea.go) + labs(x = "enrichment distribution")
#--------------GSEA KEGG-----------------#
# Convert gene IDs for gseKEGG function
# We will lose some genes here because not all IDs will be converted
sigs.entrez <- sigs.df.copy
sigs.entrez$entrezId <- ensembldb::mapIds(EnsDb.Hsapiens.v86, keys = row.names(sigs.df.copy),
keytype = "GENEID", column = "ENTREZID")
# omit any NA values
sigs.entrez <- na.omit(sigs.entrez)
#-----deal with duplicates------#
#check for duplicate gene names
#they need to be dealt with for GSEA
nrow(sigs.entrez[duplicated(sigs.entrez$entrezId), ])
#10 ENTREZ IDs with duplicates
#extract ENTREZ IDs of duplicates
dupes <- sigs.entrez[duplicated(sigs.entrez$entrezId), ]$entrezId
#filter out rows with duplicated ENTREZ IDs to view them
y <- sigs.entrez[sigs.entrez$entrezId %in% dupes, ]
y[order(y$entrezId, decreasing = TRUE), ]
#for ENSEMBL IDs that map to the same gene symbol,
#we'll keep the ones with the largest base mean
for(entrez in dupes){
to_drop = min(sigs.entrez$baseMean[c(which(sigs.entrez$entrezId == entrez))])
sigs.entrez <- sigs.entrez[-c(which(sigs.entrez$baseMean == to_drop)), ]
}
# Create a vector of the log2 fold changes
kegg_gene_list <- sigs.entrez$log2FoldChange
# Name vector with ENTREZ IDs
names(kegg_gene_list) <- sigs.entrez$entrezId
# sort the list in decreasing order (required for clusterProfiler)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
#create KEGG object
gsea.kegg <- gseKEGG(kegg_gene_list, organism = "hsa",
minGSSize = 1, maxGSSize = 20000, pvalueCutoff = 0.05,
pAdjustMethod = "BH", keyType = "ncbi-geneid")
kegg.df <- as.data.frame(gsea.kegg)
kegg.df <- kegg.df[order(kegg.df$NES, decreasing = TRUE), ]
#Bar plot
ggplot(kegg.df, aes(y = reorder(Description, -NES), x = NES, fill = p.adjust)) +
geom_col() +
scale_fill_gradient(low="blue", high="red") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
#cnetplot
gsea.kegg2 <- setReadable(gsea.kegg, OrgDb = "org.Hs.eg.db", keyType = "ENTREZID")
kegg2.df <- as.data.frame(gsea.kegg2)
options(ggrepel.max.overlaps = Inf)
cnetplot(gsea.kegg2, categorySize = "p.adjust", color.params = list(foldChange = kegg_gene_list)) # categorySize can be either 'pvalue' or 'geneNum'
#Ridge plot
ridgeplot(gsea.kegg2) + labs(x = "enrichment distribution")
#GSEA plot
#top 3 most negative (downregulated) NES
gseaplot(gsea.kegg, by = "all", title = gsea.kegg$Description[1], geneSetID = 1)
gseaplot(gsea.kegg, by = "all", title = gsea.kegg$Description[2], geneSetID = 2)
gseaplot(gsea.kegg, by = "all", title = gsea.kegg$Description[3], geneSetID = 3)