OpenOmics · chenv3 · Jan 9, 2025 · Aug 5, 2024 · Sep 3, 2024 · Sep 17, 2024
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -32,3 +32,23 @@ jobs:
       run: |
         docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \
         echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'
+  Dry_Run_and_Lint_cellranger:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: docker://snakemake/snakemake:v5.24.2
+    - name: Dry Run with test data
+      run: |
+        docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 \
+        /opt2/cell-seek run --input \
+        /opt2/.tests/WT/ \
+        --output /opt2/output --genome hg38 --pipeline gex --cellranger 8.0.0 --mode local --dry-run
+    - name: View the pipeline config file
+      run: |
+        echo "Generated config file for pipeline...." && cat $PWD/output/config.json
+    - name: Lint Workflow
+      continue-on-error: true
+      run: |
+        docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \
+        echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'
+
diff --git a/.tests/WT/outs/web_summary.html b/.tests/WT/outs/web_summary.html
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.1
+3.0.0
diff --git a/cell-seek b/cell-seek
@@ -314,7 +314,7 @@ def parsed_arguments(name, description):
                 [--aggregate {{mapped, none}}][--libraries LIBRARIES] \\
                 [--features FEATURES] [--cmo-reference CMOREFERENCE] \\
                 [--cmo-sample CMOSAMPLE] [--exclude-introns] [--filter FILTER] \\
-                [--create-bam] [--rename RENAMEFILE] \\
+                [--create-bam] [--rename RENAMEFILE] [--forcecells FORCECELLS] \\
                 --input INPUT [INPUT ...] \\
                 --output OUTPUT \\
                 --pipeline {{gex, ...}} \\
@@ -324,17 +324,25 @@ def parsed_arguments(name, description):
 
         {3}{4}Description:{5}
           To run the cell-seek pipeline with your data raw data, please
-        provide a space seperated list of FastQ (globbing is supported) and an output
+        provide a space separated list of FastQ (globbing is supported) and an output
         directory to store results.
 
         {3}{4}Required arguments:{5}
           --input INPUT [INPUT ...]
-                                Input FastQ file(s) to process. The pipeline does NOT
-                                support single-end data. FastQ files for one or more
-                                samples can be provided. Multiple input FastQ files
-                                should be seperated by a space. Globbing for multiple
-                                file is also supported.
+                                Input FastQ file(s) or Cell Ranger output folders to
+                                process. The pipeline does NOT support single-end data. 
+                                FastQ files for one or more samples can be provided. 
+                                Multiple input FastQ files per sample can be provided. 
+                                Multiple input FastQ files should be separated by a 
+                                space.
+                                Cell Ranger output folders can be provided. It is 
+                                expected that the outs folder is contained within the 
+                                Cell Ranger output folders.
+                                Globbing for multiple files/folders is also supported.
+                                FastQ Input:
                                   Example: --input .tests/*.R?.fastq.gz
+                                Cell Ranger Input:
+                                  Example: --input .tests/*/
           --output OUTPUT
                                 Path to an output directory. This location is where
                                 the pipeline will create all of its output files, also
@@ -359,11 +367,11 @@ def parsed_arguments(name, description):
                                 options: hg38, mm10, hg2024, mm2024.
                                   Example: --genome hg38
         {3}{4}Analysis options:{5}
-          --cellranger {{7.1.0, 7.2.0, 8.0.0}}
+          --cellranger {{7.1.0, 7.2.0, 8.0.0, 9.0.0}}
                                 The version of CellRanger to run. This option specifies
                                 which version of CellRanger to use when running GEX,
                                 CITE, or MULTI. Please select one of the following
-                                options: 7.1.0, 7.2.0, 8.0.0
+                                options: 7.1.0, 7.2.0, 8.0.0, 9.0.0
                                   Example: --cellranger 7.1.0
           --aggregate  {{mapped,none}}
                                 Cell Ranger aggregate. This option defines the
@@ -372,9 +380,11 @@ def parsed_arguments(name, description):
                                 from higher depth samples until each library type has an
                                 equal number of reads per cell that are confidently mapped.
                                 None means to not normalize at all. If this flag is not
-                                used then aggregate will not be run. To run Cell Ranger
-                                aggregate, please select one of the following options:
-                                mapped, none.
+                                used then aggregate will not be run. Aggregate analysis
+                                is generally not needed, but it can be used to generate a
+                                Loupe Browser file for interactive exploration of the data.
+                                To run Cell Ranger aggregate, please select one of the 
+                                following options: mapped, none.
                                   Example: --aggregate mapped
           --libraries LIBRARIES
                                 Libraries file. A CSV file containing information about
@@ -556,16 +566,67 @@ def parsed_arguments(name, description):
                                   Here is an example rename.csv file:
                                     FASTQ,Name
                                     original_name1,new_name1
-                                    original_name2,new_name1
-                                    original_name3,new_name2
-                                    original_name4,new_name3
+                                    original_name2,new_name2
+                                    original_name3,new_name3
+                                    original_name3-2,new_name3
+                                    original_name4,original_name4
+                                  where:
+                                    • FASTQ: The name that is used in the FASTQ file
+                                    • Name: Unique sample ID that is the sample name used for
+                                      Cell Ranger count.
                                 In this example, new_name3 has FASTQ files with two different
                                 names. With this input, both sets of FASTQ files will be used
                                 when processing the sample as new_name3. original_name4 will not
                                 be renamed. Any FASTQ file that does not have the name
                                 original_name1, original_name2, original_name3, or original_name4
                                 will not be run.
                                   Example: --rename rename.csv
+          --forcecells FORCECELLS
+                                Force cells file. A CSV file containing the name of the sample
+                                (the Cell Ranger outputted name) and the number of cells to
+                                force the sample to. This flag is applicable when using the GEX,
+                                CITE, MULTI, and ATAC pipelines. It will generally be used if
+                                the first analysis run appears to do a poor job at estimating
+                                the number of cells, and a re-run is needed to adjust the number
+                                of cells in the sample.
+
+                                This file can be created in two different formats. The first one
+                                can be used for the GEX, CITE, MULTI, and ATAC pipelines. It
+                                will contain the name of the sample and the number of cells
+                                to be forced to.
+                                  Here is an example forcecells.csv file:
+                                    Sample,Cells
+                                    Sample1,3000
+                                    Sample2,5000
+                                  where:
+                                    • Sample: The sample name used as the Cell Ranger output
+                                    • Cells: The number of cells the sample should be forced to
+                                In this example, Sample1 and Sample2 will be run while being forced
+                                to have 3000 and 5000 cells respectively. Any other samples that
+                                are processed will be run without using the force cells flag and
+                                will use the default cell calling algorithm.
+
+                                The second format is only compatible with the MULTI pipeline and
+                                would be used when hashtag multiplexing is used and the number of
+                                cells needs to be forced for a specific hashtagged sample.
+                                  Here is an example forcecells.csv file:
+                                    Name,Sample,Cells
+                                    Library1,HTO_1,3000
+                                    Library1,HTO_2,5000
+                                  where:
+                                    • Name: The name of the library that is provided to Cell
+                                      Ranger when running multi analysis. This should match the
+                                      name that is given in the libraries.csv file.
+                                    • Sample: The sample ID used for the associated hashtag. This
+                                      will have to match the value used in the CMO sample file or
+                                      the CMO reference file that is provided as input. If only a
+                                      CMO reference file is provided, the pipeline default assigns
+                                      each hashtag with the IDs of HTO_1, HTO_2, etc.
+                                    • Cells: The number of cells the sample should be forced to
+                                  In this example, the hashtags HTO_1 and HTO_2 in Library1 will
+                                  be run while being forced to 3000 and 5000 cells respectively.
+                                  Any other libraries or samples that are processed will be run
+                                  without using the force cells flag.
 
         {3}{4}Orchestration options:{5}
           --mode {{slurm,local}}
@@ -836,7 +897,16 @@ def parsed_arguments(name, description):
         type = str.lower,
         required = False,
         default = "",
-        choices = ['7.1.0', '7.2.0', '8.0.0'],
+        choices = ['7.1.0', '7.2.0', '8.0.0', '9.0.0'],
+        help = argparse.SUPPRESS
+    )
+
+    # Number of cells to force samples to when running Cell Ranger analysis
+    subparser_run.add_argument(
+        '--forcecells',
+        # Check if the file exists and if it is readable
+        type = lambda file: permissions(parser, file, os.R_OK),
+        required = False,
         help = argparse.SUPPRESS
     )
 

diff --git a/config/cluster.json b/config/cluster.json
@@ -25,5 +25,11 @@
         "threads": "16",
         "mem": "96g",
         "time": "1-00:00:00"
+    },
+    "seuratIntegrate": {
+        "threads": "8",
+        "mem": "350g",
+	"partition": "largemem",
+        "time": "1-00:00:00"
     }
 }
diff --git a/config/modules.json b/config/modules.json
@@ -1,10 +1,11 @@
 {
     "tools": {
-        "cellranger": {"7.1.0": "cellranger/7.1.0", "7.2.0": "cellranger/7.2.0", "8.0.0": "cellranger/8.0.0"},
+        "cellranger": {"7.1.0": "cellranger/7.1.0", "7.2.0": "cellranger/7.2.0", "8.0.0": "cellranger/8.0.0", "9.0.0": "cellranger/9.0.0"},
 	"cellranger-atac": "cellranger-atac/2.1.0",
 	"cellranger-arc": "cellranger-arc/2.0.1",
         "python2": "python/2.7",
-        "python3": "python/3.8"
+        "python3": "python/3.8",
+	"rversion": "R/4.4.0"
     },
     "r_libs": {
         "ext": "/data/OpenOmics/references/cyte-seek/R/4.1/library"