Skip to content

Commit

Permalink
Fix_dimred (#254)
Browse files Browse the repository at this point in the history
* Add key in process_datasets

* add nf-tower scripts

* add normalization_id argument to all normalization methods

* update process_openproblems_v1 workflow

* add config_mods

* update to main-script path

* update process_datasets workflow

* update run_benchmark workflow

* add nf_tower_scripts

* fix param typo in nf_tower_scripts

* update opv1 wf

* wip multimodal data

* switch to viash 0.8.0 rc3

* fill in metadata fields

* fix sh

* fix batchint

* fix script

* simplify nf

* update wf

* remove dataset_id from the dataset workflow arguments

* remove id from batch integration benchmark workflow

* wip refactor wfs

* fix scripts

* wip dimred and also make changes to batch int and datasets

* fix dataset workflows

* fix batch int

* fix dimred

* update wfs

* update readme

* bump viash version

* fix workflows

* add back normalization id

* undo workarounds

* Update src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml

Co-authored-by: Kai Waldrant <[email protected]>

---------

Co-authored-by: Kai Waldrant <[email protected]>
  • Loading branch information
rcannood and KaiWaldrant authored Oct 11, 2023
1 parent e609459 commit 3da9d9e
Show file tree
Hide file tree
Showing 30 changed files with 406 additions and 329 deletions.
2 changes: 1 addition & 1 deletion _viash.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
viash_version: 0.8.0-RC4
viash_version: 0.8.0-RC5

source: src
target: target
Expand Down
34 changes: 12 additions & 22 deletions src/datasets/resource_scripts/openproblems_v1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ param_list:
- id: allen_brain_atlas
obs_celltype: label
layer_counts: counts
dataset_id: allen_brain_atlas
dataset_name: Mouse Brain Atlas
data_url: http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585
data_reference: tasic2016adult
Expand All @@ -35,7 +34,6 @@ param_list:
obs_batch: experiment_code
obs_tissue: tissue
layer_counts: counts
dataset_id: cengen
dataset_name: CeNGEN
data_url: https://www.cengen.org
data_reference: hammarlund2018cengen
Expand All @@ -48,7 +46,6 @@ param_list:
obs_batch: batch
obs_tissue: tissue
layer_counts: counts
dataset_id: immune_cells
dataset_name: Human immune
data_url: https://theislab.github.io/scib-reproducibility/dataset_immune_cell_hum.html
data_reference: luecken2022benchmarking
Expand All @@ -59,7 +56,6 @@ param_list:
- id: mouse_blood_olsson_labelled
obs_celltype: celltype
layer_counts: counts
dataset_id: mouse_blood_olsson_labelled
dataset_name: Mouse myeloid
data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE70245
data_reference: olsson2016single
Expand All @@ -70,7 +66,6 @@ param_list:
- id: mouse_hspc_nestorowa2016
obs_celltype: cell_type_label
layer_counts: counts
dataset_id: mouse_hspc_nestorowa2016
dataset_name: Mouse HSPC
data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682
data_reference: nestorowa2016single
Expand All @@ -82,29 +77,27 @@ param_list:
obs_celltype: celltype
obs_batch: tech
layer_counts: counts
dataset_id: pancreas
dataset_name: Human pancreas
data_url: https://theislab.github.io/scib-reproducibility/dataset_pancreas.html
data_reference: luecken2022benchmarking
dataset_summary: Human pancreas cells dataset from the scIB benchmarks
dataset_description: Human pancreatic islet scRNA-seq data from 6 datasets across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq).
dataset_organism: homo_sapiens
- id: tabula_muris_senis_droplet_lung
obs_celltype: cell_type
obs_batch: donor_id
layer_counts: counts
dataset_id: tabula_muris_senis_droplet_lung
dataset_name: Tabula Muris Senis Lung
data_url: https://tabula-muris-senis.ds.czbiohub.org
data_reference: tabula2020single
dataset_summary: Aging mouse lung cells from Tabula Muris Senis
dataset_description: All lung cells from 10x profiles in Tabula Muris Senis, a 500k cell-atlas from 18 organs and tissues across the mouse lifespan.
dataset_organism: mus_musculus
# disabled as this is not working in openproblemsv1
# - id: tabula_muris_senis_droplet_lung
# obs_celltype: cell_type
# obs_batch: donor_id
# layer_counts: counts
# dataset_name: Tabula Muris Senis Lung
# data_url: https://tabula-muris-senis.ds.czbiohub.org
# data_reference: tabula2020single
# dataset_summary: Aging mouse lung cells from Tabula Muris Senis
# dataset_description: All lung cells from 10x profiles in Tabula Muris Senis, a 500k cell-atlas from 18 organs and tissues across the mouse lifespan.
# dataset_organism: mus_musculus
- id: tenx_1k_pbmc
layer_counts: counts
dataset_id: tenx_1k_pbmc
dataset_name: 1k PBMCs
data_url: https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0
data_reference: 10x2018pbmc
Expand All @@ -114,7 +107,6 @@ param_list:
- id: tenx_5k_pbmc
layer_counts: counts
dataset_id: tenx_5k_pbmc
dataset_name: 5k PBMCs
data_url: https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0
data_reference: 10x2019pbmc
Expand All @@ -125,7 +117,6 @@ param_list:
- id: tnbc_wu2021
obs_celltype: celltype_minor
layer_counts: counts
dataset_id: tnbc_wu2021
dataset_name: Triple-Negative Breast Cancer
data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118389
data_reference: wu2021single
Expand All @@ -137,15 +128,14 @@ param_list:
obs_celltype: cell_type
obs_batch: lab
layer_counts: counts
dataset_id: zebrafish
dataset_name: Zebrafish embryonic cells
data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112294
data_reference: wagner2018single
dataset_summary: Single-cell mRNA sequencing of zebrafish embryonic cells.
dataset_description: 90k cells from zebrafish embryos throughout the first day of development, with and without a knockout of chordin, an important developmental gene.
dataset_organism: danio_rerio
normalization_id: [log_cp10k, sqrt_cp10k, l1_sqrt]
normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt]
output_dataset: '$id/dataset.h5ad'
output_meta: '$id/dataset_metadata.yaml'
output_state: '$id/state.yaml'
Expand Down
13 changes: 5 additions & 8 deletions src/datasets/resource_scripts/openproblems_v1_multimodal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ if [ ! -f $params_file ]; then
cat > "$params_file" << 'HERE'
param_list:
- id: citeseq_cbmc
dataset_id: citeseq_cbmc
dataset_name: "CITE-Seq CBMC"
dataset_summary: "CITE-seq profiles of 8k Cord Blood Mononuclear Cells"
dataset_description: "8k cord blood mononuclear cells profiled by CITE-seq using a panel of 13 antibodies."
Expand All @@ -30,18 +29,16 @@ param_list:
layer_counts: counts
- id: scicar_cell_lines
dataset_id: scicar_cell_lines
dataset_name: "sci-CAR Cell Lines"
dataset_summary: "sci-CAR profiles of 5k cell line cells (HEK293T, NIH/3T3, A549) across three treatment conditions (DEX 0h, 1h and 3h)"
dataset_description: "Single cell RNA-seq and ATAC-seq co-profiling for HEK293T cells, NIH/3T3 cells, A549 cells across three treatment conditions (DEX 0 hour, 1 hour and 3 hour treatment)."
data_reference: cao2018joint
data_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089
dataset_organism: [homo_sapiens, mus_musculus]
dataset_organism: "[homo_sapiens, mus_musculus]"
obs_celltype: cell_name
layer_counts: counts
- id: scicar_mouse_kidney
dataset_id: scicar_mouse_kidney
dataset_name: "sci-CAR Mouse Kidney"
dataset_summary: "sci-CAR profiles of 11k mouse kidney cells"
dataset_description: "Single cell RNA-seq and ATAC-seq co-profiling of 11k mouse kidney cells."
Expand All @@ -52,11 +49,11 @@ param_list:
obs_batch: replicate
layer_counts: counts
normalization_id: [log_cp10k, sqrt_cp10k, l1_sqrt]
normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt]
output_dataset_mod1: '$id/dataset_mod1.h5ad'
output_dataset_mod1: '$id/dataset_mod2.h5ad'
output_meta_mod1: '$id/dataset_metadata_mod1.h5ad'
output_meta_mod1: '$id/dataset_metadata_mod2.h5ad'
output_dataset_mod2: '$id/dataset_mod2.h5ad'
output_meta_mod1: '$id/dataset_metadata_mod1.yaml'
output_meta_mod2: '$id/dataset_metadata_mod2.yaml'
output_state: '$id/state.yaml'
HERE
fi
Expand Down
1 change: 0 additions & 1 deletion src/datasets/resource_test_scripts/pancreas.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ nextflow run . \
--obs_celltype "celltype" \
--obs_batch "tech" \
--layer_counts "counts" \
--dataset_id pancreas \
--dataset_name "Human pancreas" \
--data_url "https://theislab.github.io/scib-reproducibility/dataset_pancreas.html" \
--data_reference "luecken2022benchmarking" \
Expand Down
74 changes: 42 additions & 32 deletions src/datasets/workflows/process_openproblems_v1/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ workflow run_wf {
key: "sqrt_cpm",
args: [normalization_id: "sqrt_cpm", n_cp: 1000000],
),
l1_sqrt,
log_scran_pooling
l1_sqrt.run(
key: "l1_sqrt",
args: [normalization_id: "l1_sqrt"],
),
log_scran_pooling.run(
key: "log_scran_pooling",
args: [normalization_id: "log_scran_pooling"],
)
]

output_ch = input_ch
Expand All @@ -49,14 +55,14 @@ workflow run_wf {
"dataset_description": "dataset_description",
"dataset_organism": "dataset_organism",
],
toState: ["raw": "output"]
toState: ["output_raw": "output"]
)

// subsample if so desired
| subsample.run(
runIf: { id, state -> state.do_subsample },
fromState: [
"input": "raw",
"input": "output_raw",
"n_obs": "n_obs",
"n_vars": "n_vars",
"keep_features": "keep_features",
Expand All @@ -66,7 +72,7 @@ workflow run_wf {
"seed": "seed"
],
args: [output_mod2: null],
toState: [raw: "output"]
toState: ["output_raw": "output"]
)

| runEach(
Expand All @@ -81,49 +87,53 @@ workflow run_wf {
filter: { id, state, comp ->
comp.name in state.normalization_methods
},
fromState: ["input": "raw"],
toState: ["normalized": "output"]
fromState: ["input": "output_raw"],
toState: ["output_normalized": "output"]
)

| pca.run(
fromState: ["input": "normalized"],
toState: ["pca": "output" ]
fromState: ["input": "output_normalized"],
toState: ["output_pca": "output" ]
)

| hvg.run(
fromState: ["input": "pca"],
toState: ["hvg": "output"]
fromState: ["input": "output_pca"],
toState: ["output_hvg": "output"]
)

| knn.run(
fromState: ["input": "hvg"],
toState: ["knn": "output"]
fromState: ["input": "output_hvg"],
toState: ["output_knn": "output"]
)

| check_dataset_schema.run(
fromState: { id, state ->
[
input: state.knn,
checks: null
]
},
toState: ["dataset": "output", "meta": "meta"]
fromState: ["input": "output_knn"],
toState: ["output_dataset": "output", "output_meta": "meta"]
)

// only output the files for which an output file was specified
| setState{ id, state ->
[
"output_dataset": state.output_dataset ? state.dataset : null,
"output_meta": state.output_meta ? state.meta : null,
"output_raw": state.output_raw ? state.raw : null,
"output_normalized": state.output_normalized ? state.normalized : null,
"output_pca": state.output_pca ? state.pca : null,
"output_hvg": state.output_hvg ? state.hvg : null,
"output_knn": state.output_knn ? state.knn : null,
"_meta": state._meta
]
| filter{ id, state ->
  // Sanity check: the channel id of every item must equal
  // "<dataset_id>/<normalization_id>" as recorded under `uns` in the metadata.
  // NOTE(review): Yaml.load() parses YAML content; this assumes
  // state.output_meta can be passed to it directly -- confirm it is not a
  // bare file path that needs to be read first.
  def uns = (new org.yaml.snakeyaml.Yaml().load(state.output_meta)).uns
  def expected_id = "${uns.dataset_id}/${uns.normalization_id}"

  def is_ok = id == expected_id

  if (!is_ok) {
    println("DETECTED ID MISMATCH: $id != $expected_id.\nState: $state\n")
  }

  // A Groovy closure returns the value of its last statement. Ending on the
  // `if` above would yield null (falsy) and make `filter` drop every item,
  // including the valid ones, so return the check result explicitly.
  is_ok
}

// only output the files for which an output file was specified
| setState([
"output_dataset",
"output_meta",
"output_raw",
"output_normalized",
"output_pca",
"output_hvg",
"output_knn",
"_meta"
])

emit:
output_ch
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ workflow run_wf {
key: "sqrt_cpm",
args: [normalization_id: "sqrt_cpm", n_cp: 1000000]
),
l1_sqrt,
log_scran_pooling
l1_sqrt.run(
key: "l1_sqrt",
args: [normalization_id: "l1_sqrt"]
),
log_scran_pooling.run(
key: "log_scran_pooling",
args: [normalization_id: "log_scran_pooling"]
)
]

output_ch = input_ch
Expand Down
3 changes: 1 addition & 2 deletions src/tasks/batch_integration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,7 @@ Arguments:

Unintegrated AnnData HDF5 file.

Example file:
`resources_test/batch_integration/pancreas/unintegrated.h5ad`
Example file: `resources_test/batch_integration/pancreas/dataset.h5ad`

Description:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,16 @@ set -e
COMMON_DATASETS="resources/datasets/openproblems_v1"
OUTPUT_DIR="resources/batch_integration/datasets/openproblems_v1"

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi

export NXF_VER=22.04.5

nextflow run . \
-main-script src/tasks/batch_integration/workflows/process_datasets/main.nf \
-main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \
-profile docker \
-entry auto \
-resume \
--id resources \
--input_states "resources/datasets/openproblems_v1/**/state.yaml" \
--input_states "$COMMON_DATASETS/**/state.yaml" \
--rename_keys 'input:output_dataset' \
--settings '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}' \
--publish_dir "$OUTPUT_DIR"
--settings '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' \
--publish_dir "$OUTPUT_DIR" \
--output_state '$id/state.yaml'
# TODO: move output_state into --settings once the workaround is no longer needed
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ fi

export NXF_VER=22.04.5
nextflow run . \
-main-script src/tasks/batch_integration/workflows/run_benchmark/main.nf \
-main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \
-profile docker \
-resume \
-entry auto \
--id resources \
--input_states "$DATASETS_DIR/**/state.yaml" \
--rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \
--settings '{"output": "scores.tsv"}' \
--publish_dir "$OUTPUT_DIR"
--publish_dir "$OUTPUT_DIR" \
--output_state '$id/state.yaml'
# TODO: move output_state into --settings once the workaround is no longer needed
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,15 @@ functionality:
- name: Inputs
arguments:
- name: "--input"
required: true
example: dataset.h5ad
__merge__: "/src/tasks/batch_integration/api/file_common_dataset.yaml"
- name: Schemas
arguments:
required: true
direction: input
- name: "--dataset_schema"
type: "file"
description: "The schema of the dataset to validate against"
required: true
default: "src/tasks/batch_integration/api/file_common_dataset.yaml"
direction: input
- name: Outputs
arguments:
- name: "--output_dataset"
Expand Down
Loading

0 comments on commit 3da9d9e

Please sign in to comment.