Fix_dimred (#254)

* Add key in process_datasets
* add nf-tower scripts
* add normalization_id argument to all normalization methods
* update process_openproblems_v1 workflow
* add config_mods
* update to main-script path
* update process_datasets workflow
* update run_benchmark workflow
* add nf_rower_scripts
* fix param typo in nf_tower_scripts
* update opv1 wf
* wip multimodal data
* switch to viash 0.8.0 rc3
* fill in metadata fields
* fix sh
* fix batchint
* fix script
* simplify nf
* update wf
* remove dataset_id from the dataset workflow arguments
* remove id from batch integration benchmark workflow
* wip refactor wfs
* fix scripts
* wip dimred and also make changes to batch int and datasets
* fix dataset workflows
* fix batch int
* fix dimred
* update wfs
* update readme
* bump viash version
* fix workflows
* add back normalization id
* undo workarounds
* Update src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml

Co-authored-by: Kai Waldrant <[email protected]>

---------

Co-authored-by: Kai Waldrant <[email protected]>
openproblems-bio · Oct 11, 2023 · 3da9d9e · 3da9d9e
1 parent e609459
commit 3da9d9e
Show file tree

Hide file tree

Showing 30 changed files with 406 additions and 329 deletions.
diff --git a/_viash.yaml b/_viash.yaml
@@ -1,4 +1,4 @@
-viash_version: 0.8.0-RC4
+viash_version: 0.8.0-RC5
 
 source: src
 target: target

diff --git a/src/datasets/resource_scripts/openproblems_v1.sh b/src/datasets/resource_scripts/openproblems_v1.sh
@@ -22,7 +22,6 @@ param_list:
   - id: allen_brain_atlas
     obs_celltype: label
     layer_counts: counts
-    dataset_id: allen_brain_atlas
     dataset_name: Mouse Brain Atlas
     data_url: http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585
     data_reference: tasic2016adult
@@ -35,7 +34,6 @@ param_list:
     obs_batch: experiment_code
     obs_tissue: tissue
     layer_counts: counts
-    dataset_id: cengen
     dataset_name: CeNGEN
     data_url: https://www.cengen.org
     data_reference: hammarlund2018cengen
@@ -48,7 +46,6 @@ param_list:
     obs_batch: batch
     obs_tissue: tissue
     layer_counts: counts
-    dataset_id: immune_cells
     dataset_name: Human immune
     data_url: https://theislab.github.io/scib-reproducibility/dataset_immune_cell_hum.html
     data_reference: luecken2022benchmarking
@@ -59,7 +56,6 @@ param_list:
   - id: mouse_blood_olsson_labelled
     obs_celltype: celltype
     layer_counts: counts
-    dataset_id: mouse_blood_olsson_labelled
     dataset_name: Mouse myeloid
     data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE70245
     data_reference: olsson2016single
@@ -70,7 +66,6 @@ param_list:
   - id: mouse_hspc_nestorowa2016
     obs_celltype: cell_type_label
     layer_counts: counts
-    dataset_id: mouse_hspc_nestorowa2016
     dataset_name: Mouse HSPC
     data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682
     data_reference: nestorowa2016single
@@ -82,29 +77,27 @@ param_list:
     obs_celltype: celltype
     obs_batch: tech
     layer_counts: counts
-    dataset_id: pancreas
     dataset_name: Human pancreas
     data_url: https://theislab.github.io/scib-reproducibility/dataset_pancreas.html
     data_reference: luecken2022benchmarking
     dataset_summary: Human pancreas cells dataset from the scIB benchmarks
     dataset_description: Human pancreatic islet scRNA-seq data from 6 datasets across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). 
     dataset_organism: homo_sapiens
 
-  - id: tabula_muris_senis_droplet_lung
-    obs_celltype: cell_type
-    obs_batch: donor_id
-    layer_counts: counts
-    dataset_id: tabula_muris_senis_droplet_lung
-    dataset_name: Tabula Muris Senis Lung
-    data_url: https://tabula-muris-senis.ds.czbiohub.org
-    data_reference: tabula2020single
-    dataset_summary: Aging mouse lung cells from Tabula Muris Senis
-    dataset_description: All lung cells from 10x profiles in Tabula Muris Senis, a 500k cell-atlas from 18 organs and tissues across the mouse lifespan.
-    dataset_organism: mus_musculus
+  # disabled as this is not working in openproblemsv1
+  # - id: tabula_muris_senis_droplet_lung
+  #   obs_celltype: cell_type
+  #   obs_batch: donor_id
+  #   layer_counts: counts
+  #   dataset_name: Tabula Muris Senis Lung
+  #   data_url: https://tabula-muris-senis.ds.czbiohub.org
+  #   data_reference: tabula2020single
+  #   dataset_summary: Aging mouse lung cells from Tabula Muris Senis
+  #   dataset_description: All lung cells from 10x profiles in Tabula Muris Senis, a 500k cell-atlas from 18 organs and tissues across the mouse lifespan.
+  #   dataset_organism: mus_musculus
 
   - id: tenx_1k_pbmc
     layer_counts: counts
-    dataset_id: tenx_1k_pbmc
     dataset_name: 1k PBMCs
     data_url: https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0
     data_reference: 10x2018pbmc
@@ -114,7 +107,6 @@ param_list:
 
   - id: tenx_5k_pbmc
     layer_counts: counts
-    dataset_id: tenx_5k_pbmc
     dataset_name: 5k PBMCs
     data_url: https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0
     data_reference: 10x2019pbmc
@@ -125,7 +117,6 @@ param_list:
   - id: tnbc_wu2021
     obs_celltype: celltype_minor
     layer_counts: counts
-    dataset_id: tnbc_wu2021
     dataset_name: Triple-Negative Breast Cancer
     data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118389
     data_reference: wu2021single
@@ -137,15 +128,14 @@ param_list:
     obs_celltype: cell_type
     obs_batch: lab
     layer_counts: counts
-    dataset_id: zebrafish
     dataset_name: Zebrafish embryonic cells
     data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112294
     data_reference: wagner2018single
     dataset_summary: Single-cell mRNA sequencing of zebrafish embryonic cells.
     dataset_description: 90k cells from zebrafish embryos throughout the first day of development, with and without a knockout of chordin, an important developmental gene. 
     dataset_organism: danio_rerio
 
-normalization_id: [log_cp10k, sqrt_cp10k, l1_sqrt]
+normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt]
 output_dataset: '$id/dataset.h5ad'
 output_meta: '$id/dataset_metadata.yaml'
 output_state: '$id/state.yaml'

diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh
@@ -20,7 +20,6 @@ if [ ! -f $params_file ]; then
   cat > "$params_file" << 'HERE'
 param_list:
   - id: citeseq_cbmc
-    dataset_id: citeseq_cbmc
     dataset_name: "CITE-Seq CBMC"
     dataset_summary: "CITE-seq profiles of 8k Cord Blood Mononuclear Cells"
     dataset_description: "8k cord blood mononuclear cells profiled by CITEsequsing a panel of 13 antibodies."
@@ -30,18 +29,16 @@ param_list:
     layer_counts: counts
 
   - id: scicar_cell_lines
-    dataset_id: scicar_cell_lines
     dataset_name: "sci-CAR Cell Lines"
     dataset_summary: "sci-CAR profiles of 5k cell line cells (HEK293T, NIH/3T3, A549) across three treatment conditions (DEX 0h, 1h and 3h)"
     dataset_description: "Single cell RNA-seq and ATAC-seq co-profiling for HEK293T cells, NIH/3T3 cells, A549 cells across three treatment conditions (DEX 0 hour, 1 hour and 3 hour treatment)."
     data_reference: cao2018joint
     data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089
-    dataset_organism: [homo_sapiens, mus_musculus]
+    dataset_organism: "[homo_sapiens, mus_musculus]"
     obs_celltype: cell_name
     layer_counts: counts
 
   - id: scicar_mouse_kidney
-    dataset_id: scicar_mouse_kidney
     dataset_name: "sci-CAR Mouse Kidney"
     dataset_summary: "sci-CAR profiles of 11k mouse kidney cells"
     dataset_description: "Single cell RNA-seq and ATAC-seq co-profiling of 11k mouse kidney cells."
@@ -52,11 +49,11 @@ param_list:
     obs_batch: replicate
     layer_counts: counts
 
-normalization_id: [log_cp10k, sqrt_cp10k, l1_sqrt]
+normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt]
 output_dataset_mod1: '$id/dataset_mod1.h5ad'
-output_dataset_mod1: '$id/dataset_mod2.h5ad'
-output_meta_mod1: '$id/dataset_metadata_mod1.h5ad'
-output_meta_mod1: '$id/dataset_metadata_mod2.h5ad'
+output_dataset_mod2: '$id/dataset_mod2.h5ad'
+output_meta_mod1: '$id/dataset_metadata_mod1.yaml'
+output_meta_mod2: '$id/dataset_metadata_mod2.yaml'
 output_state: '$id/state.yaml'
 HERE
 fi

diff --git a/src/datasets/resource_test_scripts/pancreas.sh b/src/datasets/resource_test_scripts/pancreas.sh
@@ -28,7 +28,6 @@ nextflow run . \
   --obs_celltype "celltype" \
   --obs_batch "tech" \
   --layer_counts "counts" \
-  --dataset_id pancreas \
   --dataset_name "Human pancreas" \
   --data_url "https://theislab.github.io/scib-reproducibility/dataset_pancreas.html" \
   --data_reference "luecken2022benchmarking" \

diff --git a/src/datasets/workflows/process_openproblems_v1/main.nf b/src/datasets/workflows/process_openproblems_v1/main.nf
@@ -22,8 +22,14 @@ workflow run_wf {
       key: "sqrt_cpm",
       args: [normalization_id: "sqrt_cpm", n_cp: 1000000],
     ),
-    l1_sqrt,
-    log_scran_pooling
+    l1_sqrt.run(
+      key: "l1_sqrt",
+      args: [normalization_id: "l1_sqrt"],
+    ),
+    log_scran_pooling.run(
+      key: "log_scran_pooling",
+      args: [normalization_id: "log_scran_pooling"],
+    )
   ]
 
   output_ch = input_ch
@@ -49,14 +55,14 @@ workflow run_wf {
         "dataset_description": "dataset_description",
         "dataset_organism": "dataset_organism",
       ],
-      toState: ["raw": "output"]
+      toState: ["output_raw": "output"]
     )
 
     // subsample if so desired
     | subsample.run(
       runIf: { id, state -> state.do_subsample },
       fromState: [
-        "input": "raw",
+        "input": "output_raw",
         "n_obs": "n_obs",
         "n_vars": "n_vars",
         "keep_features": "keep_features",
@@ -66,7 +72,7 @@ workflow run_wf {
         "seed": "seed"
       ],
       args: [output_mod2: null],
-      toState: [raw: "output"]
+      toState: ["output_raw": "output"]
     )
 
     | runEach(
@@ -81,49 +87,53 @@ workflow run_wf {
       filter: { id, state, comp ->
         comp.name in state.normalization_methods
       },
-      fromState: ["input": "raw"],
-      toState: ["normalized": "output"]
+      fromState: ["input": "output_raw"],
+      toState: ["output_normalized": "output"]
     )
 
     | pca.run(
-      fromState: ["input": "normalized"],
-      toState: ["pca": "output" ]
+      fromState: ["input": "output_normalized"],
+      toState: ["output_pca": "output" ]
     )
 
     | hvg.run(
-      fromState: ["input": "pca"],
-      toState: ["hvg": "output"]
+      fromState: ["input": "output_pca"],
+      toState: ["output_hvg": "output"]
     )
 
     | knn.run(
-      fromState: ["input": "hvg"],
-      toState: ["knn": "output"]
+      fromState: ["input": "output_hvg"],
+      toState: ["output_knn": "output"]
     )
 
     | check_dataset_schema.run(
-      fromState: { id, state ->
-        [
-          input: state.knn,
-          checks: null
-        ]
-      },
-      toState: ["dataset": "output", "meta": "meta"]
+      fromState: ["input": "output_knn"],
+      toState: ["output_dataset": "output", "output_meta": "meta"]
     )
 
-    // only output the files for which an output file was specified
-    | setState{ id, state ->
-      [
-        "output_dataset": state.output_dataset ? state.dataset : null,
-        "output_meta": state.output_meta ? state.meta : null,
-        "output_raw": state.output_raw ? state.raw : null,
-        "output_normalized": state.output_normalized ? state.normalized : null,
-        "output_pca": state.output_pca ? state.pca : null,
-        "output_hvg": state.output_hvg ? state.hvg : null,
-        "output_knn": state.output_knn ? state.knn : null,
-        "_meta": state._meta
-      ]
+    // drop items whose id differs from "<dataset_id>/<normalization_id>" as recorded in the metadata
+    | filter{ id, state ->
+      def uns = (new org.yaml.snakeyaml.Yaml().load(state.output_meta)).uns // NOTE(review): load() parses YAML text; confirm output_meta is not a file path
+      def expected_id = "${uns.dataset_id}/${uns.normalization_id}"
+      def is_ok = id == expected_id
+      if (!is_ok) {
+        println("DETECTED ID MISMATCH: $id != $expected_id.\nState: $state\n")
+      }
+      is_ok // must be the closure's last expression: a trailing `if` yields null, which filter treats as "drop"
+    }
 
+    // only output the files for which an output file was specified
+    | setState([
+      "output_dataset",
+      "output_meta",
+      "output_raw",
+      "output_normalized",
+      "output_pca",
+      "output_hvg",
+      "output_knn",
+      "_meta"
+    ])
+
   emit:
   output_ch
 }
diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf b/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf
@@ -22,8 +22,14 @@ workflow run_wf {
       key: "sqrt_cpm",
       args: [normalization_id: "sqrt_cpm", n_cp: 1000000]
     ),
-    l1_sqrt,
-    log_scran_pooling
+    l1_sqrt.run(
+      key: "l1_sqrt",
+      args: [normalization_id: "l1_sqrt"]
+    ),
+    log_scran_pooling.run(
+      key: "log_scran_pooling",
+      args: [normalization_id: "log_scran_pooling"]
+    )
   ]
 
   output_ch = input_ch

diff --git a/src/tasks/batch_integration/README.md b/src/tasks/batch_integration/README.md
@@ -172,8 +172,7 @@ Arguments:
 
 Unintegrated AnnData HDF5 file.
 
-Example file:
-`resources_test/batch_integration/pancreas/unintegrated.h5ad`
+Example file: `resources_test/batch_integration/pancreas/dataset.h5ad`
 
 Description:
 

diff --git a/src/tasks/batch_integration/resources_scripts/process_datasets.sh b/src/tasks/batch_integration/resources_scripts/process_datasets.sh
@@ -11,18 +11,16 @@ set -e
 COMMON_DATASETS="resources/datasets/openproblems_v1"
 OUTPUT_DIR="resources/batch_integration/datasets/openproblems_v1"
 
-if [ ! -d "$OUTPUT_DIR" ]; then
-  mkdir -p "$OUTPUT_DIR"
-fi
-
 export NXF_VER=22.04.5
+
 nextflow run . \
-  -main-script src/tasks/batch_integration/workflows/process_datasets/main.nf \
+  -main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \
   -profile docker \
   -entry auto \
   -resume \
-  --id resources \
-  --input_states "resources/datasets/openproblems_v1/**/state.yaml" \
+  --input_states "$COMMON_DATASETS/**/state.yaml" \
   --rename_keys 'input:output_dataset' \
-  --settings '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}' \
-  --publish_dir "$OUTPUT_DIR"
+  --settings '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' \
+  --publish_dir "$OUTPUT_DIR" \
+  --output_state '$id/state.yaml'
+# output_state should be moved to settings once workaround is solved
diff --git a/src/tasks/batch_integration/resources_scripts/run_benchmark.sh b/src/tasks/batch_integration/resources_scripts/run_benchmark.sh
@@ -19,12 +19,13 @@ fi
 
 export NXF_VER=22.04.5
 nextflow run . \
-  -main-script src/tasks/batch_integration/workflows/run_benchmark/main.nf \
+  -main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \
   -profile docker \
   -resume \
   -entry auto \
-  --id resources \
   --input_states "$DATASETS_DIR/**/state.yaml" \
   --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \
   --settings '{"output": "scores.tsv"}' \
-  --publish_dir "$OUTPUT_DIR"
+  --publish_dir "$OUTPUT_DIR" \
+  --output_state '$id/state.yaml'
+# output_state should be moved to settings once workaround is solved
diff --git a/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml b/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml
@@ -5,16 +5,15 @@ functionality:
     - name: Inputs
       arguments:
         - name: "--input"
-          required: true
-          example: dataset.h5ad
           __merge__: "/src/tasks/batch_integration/api/file_common_dataset.yaml"
-    - name: Schemas
-      arguments:
+          required: true
+          direction: input
         - name: "--dataset_schema"
           type: "file"
           description: "The schema of the dataset to validate against"
           required: true
           default: "src/tasks/batch_integration/api/file_common_dataset.yaml"
+          direction: input
     - name: Outputs
       arguments:
         - name: "--output_dataset"