Skip to content

Commit

Permalink
Add components for extracting dataset info (#315)
Browse files Browse the repository at this point in the history
* add dataset comp

* add get_dataset_info to workflow

* commit

* undo changes -- will be addressed in #314

* move get_dataset_info component

* remove unnecessary dependencies

* add component for extracting the dataset info

* fix script

* fix typo

* fix script

* update script

* fix get_dataset_info

---------

Co-authored-by: Kai Waldrant <[email protected]>
  • Loading branch information
rcannood and KaiWaldrant authored Dec 19, 2023
1 parent 0a22803 commit 17cc7cf
Show file tree
Hide file tree
Showing 12 changed files with 216 additions and 22 deletions.
20 changes: 20 additions & 0 deletions src/common/process_task_results/get_dataset_info/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
__merge__: ../api/get_info.yaml
functionality:
name: "get_dataset_info"
description: "Extract dataset info and convert to expected format for website results"
resources:
- type: r_script
path: script.R
test_resources:
- type: file
path: /resources_test/common/task_metadata/dataset_info.yaml
dest: test_file.yaml
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_r:1.0.2
setup:
- type: r
cran: [ purrr, dplyr, yaml, rlang, processx ]
- type: nextflow
directives:
label: [lowmem, lowtime, lowcpu]
28 changes: 28 additions & 0 deletions src/common/process_task_results/get_dataset_info/script.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)

## VIASH START
par <- list(
input = "resources_test/common/task_metadata/dataset_info.yaml",
output = "output/metric_info.json"
)
## VIASH END

datasets <- yaml::yaml.load_file(par$input)

df <- map_df(datasets, function(dataset) {
info <- as_tibble(map(dataset, as.data.frame))
}) %>%
rename(
data_url = dataset_url,
data_reference = dataset_reference
)


jsonlite::write_json(
purrr::transpose(df),
par$output,
auto_unbox = TRUE,
pretty = TRUE
)
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ platforms:
setup:
- type: r
cran: [ purrr, dplyr, yaml, rlang, processx ]
- type: apt
packages: [ curl, default-jdk ]
- type: docker
run: "curl -fsSL dl.viash.io | bash && mv viash /usr/bin/viash"
- type: nextflow
directives:
label: [lowmem, lowtime, lowcpu]
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ platforms:
setup:
- type: r
cran: [ purrr, dplyr, yaml, rlang, processx ]
- type: apt
packages: [ curl, default-jdk ]
- type: docker
run: "curl -fsSL dl.viash.io | bash && mv viash /usr/bin/viash"
- type: nextflow
directives:
label: [lowmem, lowtime, lowcpu]
1 change: 1 addition & 0 deletions src/common/process_task_results/run/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ functionality:
- name: common/process_task_results/get_results
- name: common/process_task_results/get_method_info
- name: common/process_task_results/get_metric_info
- name: common/process_task_results/get_dataset_info
- name: common/process_task_results/yaml_to_json
platforms:
- type: nextflow
3 changes: 1 addition & 2 deletions src/common/process_task_results/run/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ workflow run_wf {
}
)

| yaml_to_json.run(
key: "dataset_info",
| get_dataset_info.run(
fromState: [
"input": "input_dataset_info",
"output": "output_dataset_info"
Expand Down
11 changes: 3 additions & 8 deletions src/common/process_task_results/yaml_to_json/script.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
from os import path
import yaml
import json

## VIASH START
par = {
"input" : ".",
"task_id" : "denoising",
"input": ".",
"task_id": "denoising",
"output": "output/task.json",

}
meta = { "functionality" : "foo" }

## VIASH END

with open(par["input"], "r") as f:
yaml_file = yaml.safe_load(f)


with open(par["output"], "w") as out:
json.dump(yaml_file, out, indent=2)
json.dump(yaml_file, out, indent=2)
5 changes: 1 addition & 4 deletions src/common/resources_test_scripts/task_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,6 @@ nextflow run . \
-entry auto \
--input_states "$DATASETS_DIR/**/state.yaml" \
--rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \
--settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml"}' \
--settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \
--publish_dir "$OUTPUT_DIR" \
--output_state "state.yaml"

# Copy task info
cp src/tasks/batch_integration/api/task_info.yaml "$OUTPUT_DIR/task_info.yaml"
40 changes: 40 additions & 0 deletions src/datasets/resource_scripts/dataset_info.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash

cat > "/tmp/params.yaml" << HERE
param_list:
- id: openproblems_v1
input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml"
rename_keys: 'input:output_dataset'
- id: openproblems_v1_multimodal
input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml"
rename_keys: 'input:output_dataset_mod1'
- id: cellxgene_census
input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml"
rename_keys: 'input:output_dataset'
settings: '{"output": "dataset_info.yaml"}'
output_state: state.yaml
publish_dir: "$DATASETS_DIR"
HERE

cat > /tmp/nextflow.config << HERE
process {
executor = 'awsbatch'
withLabel: highmem {
memory = '350GB'
}
withName: '.*publishStatesProc' {
memory = '16GB'
disk = '100GB'
}
}
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
--revision main_build \
--entry-name auto \
--pull-latest \
--main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \
--workspace 53907369739130 \
--compute-env 1pK56PjjzeraOOC2LDZvN2 \
--params-file "/tmp/params.yaml" \
--config /tmp/nextflow.config
34 changes: 34 additions & 0 deletions src/datasets/workflows/extract_dataset_info/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
functionality:
name: "extract_dataset_info"
namespace: "datasets/workflows"
argument_groups:
- name: Inputs
arguments:
- name: "--input"
__merge__: /src/datasets/api/file_raw.yaml
required: true
direction: input
- name: Filter arguments
arguments:
- name: "--filter_normalization_id"
type: string
required: false
direction: input
description: If defined, only the normalization with this ID will be included in the output.
multiple: true
default: [ log_cp10k ]
- name: Outputs
arguments:
- name: "--output"
type: file
required: true
direction: output
example: dataset_uns.yaml
resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
dependencies:
- name: common/check_dataset_schema
platforms:
- type: nextflow
56 changes: 56 additions & 0 deletions src/datasets/workflows/extract_dataset_info/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
workflow auto {
findStates(params, meta.config)
| meta.workflow.run(
auto: [publish: "state"]
)
}

workflow run_wf {
take:
input_ch

main:
output_ch = input_ch

// extract the dataset metadata
| check_dataset_schema.run(
fromState: [input: "input"],
toState: { id, output, state ->
def dataset_uns = (new org.yaml.snakeyaml.Yaml().load(output.meta)).uns
state + [dataset_uns: dataset_uns]
}
)

// only keep one of the normalization methods
| filter{ id, state ->
if (state.filter_normalization_id) {
state.filter_normalization_id.contains(state.dataset_uns.normalization_id)
} else {
true
}
}

| joinStates { ids, states ->
// remove normalization id
def dataset_uns = states.collect{state ->
def uns = state.dataset_uns.clone()
uns.remove("normalization_id")
uns
}

// store data as yaml
def dataset_uns_yaml_blob = toYamlBlob(dataset_uns)
def dataset_uns_file = tempFile("dataset_uns.yaml")
dataset_uns_file.write(dataset_uns_yaml_blob)

def new_state = [
output: dataset_uns_file,
_meta: [join_id: ids[0]]
]
["output", new_state]
}


emit:
output_ch
}
32 changes: 32 additions & 0 deletions src/datasets/workflows/extract_dataset_info/run_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

set -e

# export TOWER_WORKSPACE_ID=53907369739130

OUTPUT_DIR="output/temp"

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi

DATASETS_DIR="resources_test/common"

export NXF_VER=22.04.5
nextflow run . \
-main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \
-profile docker \
-resume \
-c src/wf_utils/labels_ci.config \
-entry auto \
--input_states "$DATASETS_DIR/**/state.yaml" \
--rename_keys 'input:output_dataset' \
--settings '{"output": "dataset_info.yaml"}' \
--publish_dir "$OUTPUT_DIR" \
--output_state "state.yaml"

0 comments on commit 17cc7cf

Please sign in to comment.