Added analysis scripts

fraunhofer-izi · Sep 3, 2023 · a46d1be · a46d1be
1 parent 707d86f
commit a46d1be
Show file tree

Hide file tree

Showing 84 changed files with 68,360 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/Grieb-et-al-2023.Rproj b/Grieb-et-al-2023.Rproj
@@ -0,0 +1,15 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+StripTrailingWhitespace: Yes
diff --git a/code/01_cellranger.R b/code/01_cellranger.R
@@ -0,0 +1,129 @@
+library(dplyr)
+library(yaml)
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# scRNA-Seq: CellRanger count
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+nodes = 1
+ntasks = 1
+ttime = "44:00:00"
+mail = "FAIL"
+mem = 190000
+cpu = 25
+
+manifest = yaml.load_file("manifest.yaml")
+
+# FASTQ files: NCBI GEO GSE234261
+fastqs = manifest$grieb$fastq
+
+# The FASTA sequences can be found in the supplementary material of the publication
+ref.gex.cilta = paste0(manifest$base$workdata, "references/GRCh38-CiltaCel/")
+ref.gex.ide = paste0(manifest$base$workdata, "references/GRCh38-IdeCel/")
+
+# VDJ reference: https://support.10xgenomics.com/single-cell-vdj/software/downloads/latest
+ref.vdj = paste0(manifest$base$workdata, "references/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0/")
+
+# ADT reference  can be found in the supplementary material of the publication or in "data/" of the repo
+ref.features = "data/feature_reference.csv"
+out.dir = paste0(manifest$base$workdata, "cohorts/grieb/cellranger/")
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# sbatch
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+write_subscript = function(path, job_id, csv.out){
+  file.create(path)
+
+  write("#!/bin/bash", file = path, append = TRUE)
+  write("", file = path, append = TRUE)
+
+  write(paste("#SBATCH -J", job_id), file = path, append = TRUE)
+  write(paste("#SBATCH --nodes", nodes), file = path, append = TRUE)
+  write(paste("#SBATCH --ntasks", ntasks), file = path, append = TRUE)
+  write(paste("#SBATCH --time", ttime), file = path, append = TRUE)
+  write(paste("#SBATCH --cpus-per-task", cpu), file = path, append = TRUE)
+  write(paste("#SBATCH --mem", mem), file = path, append = TRUE)
+  write(paste("#SBATCH --exclude=ribnode[009,020,006,007]"), file = path, append = TRUE)
+  write(paste("#SBATCH -e", paste0(job_id, ".e")), file = path, append = TRUE)
+  write(paste("#SBATCH -o", paste0(job_id, ".o")), file = path, append = TRUE)
+  write("#SBATCH --mail-type=END,FAIL", file = path, append = TRUE)
+
+  write("", file = path, append = TRUE)
+
+  write(
+    paste0(
+      "cellranger multi",
+      " --id ", job_id,
+      " --csv ", csv.out,
+      " --localcores=", cpu,
+      " --localmem=", mem/1000
+  ), file = path, append = TRUE)
+
+  write("", file = path, append = TRUE)
+}
+
+write_multi_csv = function(sample.paths, csv.out){
+
+  rna = sample.paths[sample.paths$SOURCE == "Gene Expression", , drop = F]
+  adt = sample.paths[sample.paths$SOURCE == "Antibody Capture", , drop = F]
+  tcr = sample.paths[sample.paths$SOURCE == "VDJ-T", , drop = F]
+  bcr = sample.paths[sample.paths$SOURCE == "VDJ-B", , drop = F]
+
+  file.create(csv.out)
+
+  write(paste0("[gene-expression]"), file = csv.out, append = TRUE)
+  write(paste0("reference,", rna$GEX_REF), file = csv.out, append = TRUE)
+  write(paste0("[vdj]"), file = csv.out, append = TRUE)
+  write(paste0("reference,", ref.vdj), file = csv.out, append = TRUE)
+  write(paste0("[feature]"), file = csv.out, append = TRUE)
+  write(paste0("reference,", ref.features), file = csv.out, append = TRUE)
+  write(paste0("[libraries]"), file = csv.out, append = TRUE)
+  write(paste0("fastq_id,fastqs,feature_types"), file = csv.out, append = TRUE)
+  write(paste0(rna$SAMPLE, ",", rna$PATH, ",", rna$SOURCE), file = csv.out, append = TRUE)
+  write(paste0(adt$SAMPLE, ",", adt$PATH, ",", adt$SOURCE), file = csv.out, append = TRUE)
+  write(paste0(tcr$SAMPLE, ",", tcr$PATH, ",", tcr$SOURCE), file = csv.out, append = TRUE)
+  write(paste0(bcr$SAMPLE, ",", bcr$PATH, ",", bcr$SOURCE), file = csv.out, append = TRUE)
+
+}
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# sample paths
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+fastq.files = list.files(path = fastqs, full.names = T, recursive = T)
+df = data.frame(
+  SAMPLE = basename(fastq.files),
+  PATH = dirname(fastq.files)
+)
+df$SAMPLE = gsub("_S.+", "", df$SAMPLE)
+df = df %>% dplyr::mutate(
+  SOURCE = dplyr::case_when(
+    grepl("_R$", SAMPLE) ~ "Gene Expression",
+    grepl("_A$", SAMPLE) ~ "Antibody Capture",
+    grepl("_T$", SAMPLE) ~ "VDJ-T",
+    grepl("_B$", SAMPLE) ~ "VDJ-B"
+  )
+)
+df$SAMPLE_SHORT = gsub("_R|_A|_T|_B", "", df$SAMPLE)
+
+# cilta-cel samples
+cilta.samples = c("MXMERZ002A_03", "MXMERZ002A_19", "MXMERZ002A_20", "MXMERZ002A_04", "MXMERZ002A_08")
+df$GEX_REF = gsub("_R|_A|_T|_B", "", df$SAMPLE_SHORT) %in% cilta.samples
+df$GEX_REF = ifelse(df$GEX_REF == T, ref.gex.cilta, ref.gex.ide)
+
+df = df[!duplicated(df$SAMPLE), ]
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# submit to Slurm
+# There is a link with this script under this path: "out.dir".
+# The script was executed there.
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+for (sample in unique(df$SAMPLE_SHORT)) {
+  job_id = paste0("multi_", sample)
+  print(job_id)
+  csv.out = paste0(out.dir, job_id, ".csv")
+  sample.paths = subset(df, SAMPLE_SHORT == sample)
+  write_multi_csv(sample.paths, csv.out)
+  write_subscript(paste0(out.dir, job_id, ".slurm"), job_id, csv.out)
+
+  cmd = paste0("sbatch ", paste0(out.dir, job_id, ".slurm"))
+  system(cmd)
+}
diff --git a/code/02_cellranger_to_seurat.R b/code/02_cellranger_to_seurat.R
@@ -0,0 +1,99 @@
+print("# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+print("Grieb")
+print("# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# Libraries
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+.cran_packages = c("Seurat", "yaml", "dplyr", "doParallel", "parallel", "data.table")
+.bioc_packages = c("biomaRt", "scDblFinder", "SingleCellExperiment")
+
+# Install CRAN packages (if not already installed)
+.inst = .cran_packages %in% installed.packages()
+if (any(!.inst)) {
+  install.packages(.cran_packages[!.inst], repos = "http://cran.rstudio.com/")
+}
+
+# Install bioconductor packages (if not already installed)
+.inst <- .bioc_packages %in% installed.packages()
+if (any(!.inst)) {
+  library(BiocManager)
+  BiocManager::install(.bioc_packages[!.inst], ask = T)
+}
+
+list.of.packages = c(.cran_packages, .bioc_packages)
+
+## Loading library
+for (pack in list.of.packages) {
+  suppressMessages(library(
+    pack,
+    quietly = TRUE,
+    verbose = FALSE,
+    character.only = TRUE
+  ))
+}
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# Load Rawcounts and create a merged Seurat object
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+manifest = yaml.load_file("manifest.yaml")
+work.path = manifest$grieb$data_dl
+seurat.path = manifest$grieb$seurat
+dir.create(seurat.path, recursive = T)
+
+obj.path = paste0(paste0(seurat.path, "seurat_ori_pub.Rds"))
+obj.sub.path = paste0(paste0(seurat.path, "seurat_sub_ori_pub.Rds"))
+
+cellranger.dirs = list.dirs(path = paste0(work.path, "cellranger"), full.names = T, recursive = T)
+
+fltrd.dirs = cellranger.dirs[grepl("sample_filtered_feature_bc_matrix", cellranger.dirs)]
+tmp2 = gsub(paste0(work.path, "cellranger/"), "", fltrd.dirs)
+tmp2 = gsub("/", "", gsub("out.*", "", tmp2))
+tmp2 = gsub("multi_", "", tmp2)
+names(fltrd.dirs) = tmp2
+fltrd.dirs = fltrd.dirs[!grepl("^bck", names(fltrd.dirs))]
+
+# i = names(fltrd.dirs)[1]
+bpparam = BiocParallel::MulticoreParam(workers = 5)
+seurat.l = BiocParallel::bplapply(names(fltrd.dirs), function(i) {
+
+  id = i
+  fltrd.counts = Read10X(data.dir = fltrd.dirs[names(fltrd.dirs) == id], gene.column = 2)
+
+  seu.obj = CreateSeuratObject(counts = fltrd.counts[[1]], project = id)
+  seu.obj[["ADT"]] = CreateAssayObject(counts = fltrd.counts[[2]])
+  seu.obj@meta.data$orig.ident = id
+  seu.obj@meta.data$orig.ident = factor(seu.obj@meta.data$orig.ident)
+
+  # scDblFinder
+  sce = scDblFinder(GetAssayData(seu.obj, slot="counts"))
+  stopifnot(identical(colnames(seu.obj), colnames(sce)))
+  seu.obj@meta.data$scDblFinder_score = sce$scDblFinder.score
+  seu.obj@meta.data$scDblFinder_class = sce$scDblFinder.class
+
+  seu.obj
+
+}, BPPARAM = bpparam)
+
+seurat = merge(seurat.l[[1]], y = seurat.l[2:length(seurat.l)], add.cell.ids = names(seurat.l), project = "Grieb")
+seurat@meta.data$orig.ident = factor(seurat@meta.data$orig.ident)
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# Assay ADT
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+DefaultAssay(seurat) = "ADT"
+adt.ftrs = gsub("-proteona", "", rownames(seurat))
+
+rownames(seurat@assays$ADT@counts) = toupper(adt.ftrs)
+seurat@assays$ADT@data = seurat@assays$ADT@counts
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+# Save
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+DefaultAssay(seurat) = "RNA"
+Idents(seurat) = "orig.ident"
+
+saveRDS(seurat, file = obj.path)
+saveRDS(subset(x = seurat, downsample = 200), file = obj.sub.path)
+