add temporary fix for PM task (#307)

openproblems-bio · Dec 13, 2023 · ccb90ca · ccb90ca
1 parent 451f48e
commit ccb90ca
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 7 deletions.
diff --git a/src/datasets/resource_test_scripts/bmmc_x_starter.sh b/src/datasets/resource_test_scripts/bmmc_x_starter.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# TODO: replace this with a run of the correct dataset loader once it is available
+
 NEURIPS2021_URL="https://github.com/openproblems-bio/neurips2021_multimodal_viash/raw/main/resources_test/common"
 DATASET_DIR="resources_test/common"
 
@@ -16,6 +18,32 @@ output_dataset_rna: !file dataset_rna.h5ad
 output_dataset_other_mod: !file dataset_adt.h5ad
 HERE
 
+python - << HERE
+import anndata as ad
+# Temporary fix: add dataset metadata to .uns of both files; SUBDIR is expanded by the shell (unquoted heredoc).
+rna = ad.read_h5ad("$SUBDIR/dataset_rna.h5ad")
+mod2 = ad.read_h5ad("$SUBDIR/dataset_adt.h5ad")
+# url, reference, summary and description below are placeholder values, set identically on both modalities.
+rna.uns["dataset_id"] = "bmmc_cite_starter"
+mod2.uns["dataset_id"] = "bmmc_cite_starter"
+rna.uns["dataset_name"] = "BMMC Cite Starter"
+mod2.uns["dataset_name"] = "BMMC Cite Starter"
+rna.uns["dataset_url"] = "https://foo.bar"
+mod2.uns["dataset_url"] = "https://foo.bar"
+rna.uns["dataset_reference"] = "foo2001bar"
+mod2.uns["dataset_reference"] = "foo2001bar"
+rna.uns["dataset_summary"] = "summary"
+mod2.uns["dataset_summary"] = "summary"
+rna.uns["dataset_description"] = "description"
+mod2.uns["dataset_description"] = "description"
+rna.uns["dataset_organism"] = "homo_sapiens"
+mod2.uns["dataset_organism"] = "homo_sapiens"
+# Write back to the same paths that were read, replacing the files in place.
+rna.write_h5ad("$SUBDIR/dataset_rna.h5ad")
+mod2.write_h5ad("$SUBDIR/dataset_adt.h5ad")
+HERE
+
+
 SUBDIR="$DATASET_DIR/bmmc_multiome_starter"
 mkdir -p "$SUBDIR"
 wget "$NEURIPS2021_URL/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad" \
@@ -29,5 +57,31 @@ output_dataset_rna: !file dataset_rna.h5ad
 output_dataset_other_mod: !file dataset_atac.h5ad
 HERE
 
+
+python - << HERE
+import anndata as ad
+# Temporary fix: add dataset metadata to .uns of both files; SUBDIR is expanded by the shell (unquoted heredoc).
+rna = ad.read_h5ad("$SUBDIR/dataset_rna.h5ad")
+mod2 = ad.read_h5ad("$SUBDIR/dataset_atac.h5ad")
+# Both modalities carry the same dataset_id; url, reference, summary and description are placeholder values.
+rna.uns["dataset_id"] = "bmmc_multiome_starter"
+mod2.uns["dataset_id"] = "bmmc_multiome_starter"
+rna.uns["dataset_name"] = "BMMC Multiome Starter"
+mod2.uns["dataset_name"] = "BMMC Multiome Starter"
+rna.uns["dataset_url"] = "https://foo.bar"
+mod2.uns["dataset_url"] = "https://foo.bar"
+rna.uns["dataset_reference"] = "foo2001bar"
+mod2.uns["dataset_reference"] = "foo2001bar"
+rna.uns["dataset_summary"] = "summary"
+mod2.uns["dataset_summary"] = "summary"
+rna.uns["dataset_description"] = "description"
+mod2.uns["dataset_description"] = "description"
+rna.uns["dataset_organism"] = "homo_sapiens"
+mod2.uns["dataset_organism"] = "homo_sapiens"
+# Write back to the same paths that were read, replacing the files in place.
+rna.write_h5ad("$SUBDIR/dataset_rna.h5ad")
+mod2.write_h5ad("$SUBDIR/dataset_atac.h5ad")
+HERE
+
 # run task process dataset components
 src/tasks/predict_modality/resources_test_scripts/bmmc_x_starter.sh
diff --git a/src/tasks/predict_modality/process_dataset/script.R b/src/tasks/predict_modality/process_dataset/script.R
@@ -40,11 +40,8 @@ ad2_mod <- unique(ad2$var[["feature_types"]])
 new_dataset_id <- paste0(ad1$uns[["dataset_id"]], "_", tolower(ad1_mod), "2", tolower(ad2_mod))
 
 # determine new uns
-ad1_uns <- ad2_uns <- list(
-  dataset_id = new_dataset_id,
-  # TODO: this should already be part of the source dataset
-  dataset_organism = "homo_sapiens"
-)
+uns_vars <- c("dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism")
+ad1_uns <- ad2_uns <- ad1$uns[uns_vars]
 ad1_uns$modality <- ad1_mod
 ad2_uns$modality <- ad2_mod
 

diff --git a/src/tasks/predict_modality/workflows/process_datasets/main.nf b/src/tasks/predict_modality/workflows/process_datasets/main.nf
@@ -19,7 +19,7 @@ workflow run_wf {
             fromState: { id, state ->
         // as a resource
         [
-          "input": state.input,
+          "input": state.input_rna,
           "schema": meta.resources_dir.resolve("file_common_dataset_rna.yaml")
         ]
       },
@@ -37,7 +37,7 @@ workflow run_wf {
             fromState: { id, state ->
         // as a resource
         [
-          "input": state.input,
+          "input": state.input_other_mod,
           "schema": meta.resources_dir.resolve("file_common_dataset_other_mod.yaml")
         ]
       },