From aab07afa0046ed6b1648ffcd6994ffddb481299e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michaela=20M=C3=BCller?= <51025211+mumichae@users.noreply.github.com> Date: Tue, 27 Aug 2024 08:39:04 +0200 Subject: [PATCH] Batch integration: partial reading (#460) * WIP: use partial read function for asw_batch * fix asw_batch with partial reading * update dependencies and add helper script * adapt metrics scripts to use partial reading * use latest base images * update base images for methods * use partial reading function for all integration methods * apply partial reading for control methods * use partial reading for transformers --- .gitignore | 2 + .../helper_functions/read_anndata_partial.py | 77 +++++++++++++++++++ .../batch_embed/config.vsh.yaml | 4 +- .../no_integration/batch_embed/script.py | 16 +++- .../global_embed/config.vsh.yaml | 4 +- .../no_integration/global_embed/script.py | 12 ++- .../global_feature/config.vsh.yaml | 4 +- .../no_integration/global_feature/script.py | 15 +++- .../global_graph/config.vsh.yaml | 4 +- .../no_integration/global_graph/script.py | 8 +- .../celltype_embed/config.vsh.yaml | 4 +- .../celltype_embed/script.py | 7 +- .../celltype_jitter_embed/config.vsh.yaml | 4 +- .../celltype_jitter_embed/script.py | 7 +- .../batch_embed/config.vsh.yaml | 4 +- .../random_integration/batch_embed/script.py | 8 +- .../batch_feature/config.vsh.yaml | 4 +- .../batch_feature/script.py | 12 ++- .../batch_graph/config.vsh.yaml | 4 +- .../random_integration/batch_graph/script.py | 8 +- .../celltype_embed/config.vsh.yaml | 4 +- .../celltype_embed/script.py | 8 +- .../celltype_feature/config.vsh.yaml | 4 +- .../celltype_feature/script.py | 11 ++- .../celltype_graph/config.vsh.yaml | 4 +- .../celltype_graph/script.py | 8 +- .../global_embed/config.vsh.yaml | 4 +- .../random_integration/global_embed/script.py | 8 +- .../global_feature/config.vsh.yaml | 4 +- .../global_feature/script.py | 13 +++- .../global_graph/config.vsh.yaml | 4 +- 
.../random_integration/global_graph/script.py | 8 +- .../methods/bbknn/config.vsh.yaml | 4 +- .../batch_integration/methods/bbknn/script.py | 15 +++- .../methods/combat/config.vsh.yaml | 4 +- .../methods/combat/script.py | 14 +++- .../methods/fastmnn_embedding/config.vsh.yaml | 2 +- .../methods/fastmnn_feature/config.vsh.yaml | 2 +- .../methods/liger/config.vsh.yaml | 2 +- .../methods/mnn_correct/config.vsh.yaml | 2 +- .../methods/mnnpy/config.vsh.yaml | 2 + .../batch_integration/methods/mnnpy/script.py | 14 +++- .../methods/pyliger/config.vsh.yaml | 4 +- .../methods/pyliger/script.py | 22 +++--- .../methods/scalex_embed/config.vsh.yaml | 4 +- .../methods/scalex_embed/script.py | 18 ++++- .../methods/scalex_feature/config.vsh.yaml | 6 +- .../methods/scalex_feature/script.py | 56 -------------- .../methods/scanorama_embed/config.vsh.yaml | 4 +- .../methods/scanorama_embed/script.py | 17 +++- .../methods/scanorama_feature/config.vsh.yaml | 6 +- .../methods/scanorama_feature/script.py | 74 ------------------ .../methods/scanvi/config.vsh.yaml | 4 +- .../methods/scanvi/script.py | 15 +++- .../methods/scvi/config.vsh.yaml | 4 +- .../batch_integration/methods/scvi/script.py | 14 +++- .../metrics/asw_batch/config.vsh.yaml | 4 +- .../metrics/asw_batch/script.py | 19 +++-- .../metrics/asw_label/config.vsh.yaml | 4 +- .../metrics/asw_label/script.py | 19 +++-- .../cell_cycle_conservation/config.vsh.yaml | 4 +- .../metrics/cell_cycle_conservation/script.py | 37 ++++++--- .../clustering_overlap/config.vsh.yaml | 4 +- .../metrics/clustering_overlap/script.py | 28 +++---- .../graph_connectivity/config.vsh.yaml | 4 +- .../metrics/graph_connectivity/script.py | 22 +++--- .../metrics/hvg_overlap/config.vsh.yaml | 4 +- .../metrics/hvg_overlap/script.py | 33 +++++--- .../isolated_label_asw/config.vsh.yaml | 4 +- .../metrics/isolated_label_asw/script.py | 19 +++-- .../metrics/isolated_label_f1/config.vsh.yaml | 4 +- .../metrics/isolated_label_f1/script.py | 22 +++--- 
.../metrics/kbet/config.vsh.yaml | 4 +- .../batch_integration/metrics/kbet/script.py | 19 +++-- .../metrics/lisi/config.vsh.yaml | 6 +- .../batch_integration/metrics/lisi/script.py | 30 ++++---- .../metrics/pcr/config.vsh.yaml | 4 +- .../batch_integration/metrics/pcr/script.py | 33 ++++++-- .../embed_to_graph/config.vsh.yaml | 7 +- .../transformers/embed_to_graph/script.py | 21 ++++- .../feature_to_embed/config.vsh.yaml | 7 +- .../transformers/feature_to_embed/script.py | 24 +++++- 82 files changed, 644 insertions(+), 339 deletions(-) create mode 100644 src/common/helper_functions/read_anndata_partial.py delete mode 100644 src/tasks/batch_integration/methods/scalex_feature/script.py delete mode 100644 src/tasks/batch_integration/methods/scanorama_feature/script.py diff --git a/.gitignore b/.gitignore index b27efa26e7..c19f926ba4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +README.html +README_files/ *.DS_Store *__pycache__ *.h5ad diff --git a/src/common/helper_functions/read_anndata_partial.py b/src/common/helper_functions/read_anndata_partial.py new file mode 100644 index 0000000000..efbea0592d --- /dev/null +++ b/src/common/helper_functions/read_anndata_partial.py @@ -0,0 +1,77 @@ +import warnings +from pathlib import Path +import anndata as ad +import h5py +from scipy.sparse import csr_matrix +from anndata.experimental import read_elem, sparse_dataset + + +def read_anndata( + file: str, + backed: bool = False, + **kwargs +) -> ad.AnnData: + """ + Read anndata file + :param file: path to anndata file in h5ad format + :param kwargs: AnnData parameter to group mapping + """ + assert Path(file).exists(), f'File not found: {file}' + + f = h5py.File(file, 'r') + kwargs = {x: x for x in f} if not kwargs else kwargs + if len(f.keys()) == 0: + return ad.AnnData() + # check if keys are available + for name, slot in kwargs.items(): + if slot not in f: + warnings.warn( + f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"' + ) + adata = 
read_partial(f, backed=backed, **kwargs) + if not backed: + f.close() + + return adata + + +def read_partial( + group: h5py.Group, + backed: bool = False, + force_sparse_types: [str, list] = None, + **kwargs +) -> ad.AnnData: + """ + Partially read h5py groups + :params group: file group + :params force_sparse_types: encoding types to convert to sparse_dataset via csr_matrix + :params backed: read sparse matrix as sparse_dataset + :params **kwargs: dict of slot_name: slot, by default use all available slot for the h5py file + :return: AnnData object + """ + if force_sparse_types is None: + force_sparse_types = [] + elif isinstance(force_sparse_types, str): + force_sparse_types = [force_sparse_types] + slots = {} + if backed: + print('Read as backed sparse matrix...') + + for slot_name, slot in kwargs.items(): + print(f'Read slot "{slot}", store as "{slot_name}"...') + if slot not in group: + warnings.warn(f'Slot "{slot}" not found, skip...') + slots[slot_name] = None + else: + elem = group[slot] + iospec = ad._io.specs.get_spec(elem) + if iospec.encoding_type in ("csr_matrix", "csc_matrix") and backed: + slots[slot_name] = sparse_dataset(elem) + elif iospec.encoding_type in force_sparse_types: + slots[slot_name] = csr_matrix(read_elem(elem)) + if backed: + slots[slot_name] = sparse_dataset(slots[slot_name]) + else: + slots[slot_name] = read_elem(elem) + return ad.AnnData(**slots) + diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml index 67d74ae8ab..9e7a05b56f 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml @@ -14,9 +14,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - 
type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py index 7fbb4a537e..801440ce65 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc import numpy as np @@ -15,9 +16,18 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) -adata.X = adata.layers["normalized"] +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) adata.var["highly_variable"] = adata.var["hvg"] print("Process dataset", flush=True) @@ -27,7 +37,7 @@ n_comps = min(50, np.sum(batch_idx)) solver = "full" if n_comps == np.sum(batch_idx) else "arpack" adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca( - adata[batch_idx], + adata[batch_idx].copy(), n_comps=n_comps, use_highly_variable=True, svd_solver=solver, diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml index 6b2f724ed9..229f5e7352 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml @@ -14,9 +14,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 
+ image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py index 4b16b82525..f45038806b 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc ## VIASH START @@ -15,8 +16,17 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print("process dataset", flush=True) adata.obsm["X_emb"] = adata.obsm["X_pca"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml index 7b1013221e..d480b4bb3e 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml @@ -14,9 +14,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py index 9ddbab0432..2acdbf9b7a 100644 --- 
a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc ## VIASH START @@ -15,12 +16,22 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) # no processing, subset matrix to highly variable genes adata_hvg = adata[:, adata.var["hvg"]].copy() -adata.layers['corrected_counts'] = adata_hvg.layers["normalized"].copy() +adata.layers['corrected_counts'] = adata_hvg.X.copy() print("Store outputs", flush=True) adata.uns['method_id'] = meta['functionality_name'] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml index ead6281806..afaed6346a 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py index 22b39d10d5..4824c8f443 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py +++ 
b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py @@ -19,10 +19,16 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _set_uns +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print("process dataset", flush=True) neighbors_map = adata.uns['knn'] diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml index 9d50f13aaf..4dec4675f4 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py index b15ce33047..ca16a60ab2 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py @@ -16,10 +16,15 @@ ## VIASH END sys.path.append(meta["resources_dir"]) from utils import _perfect_embedding +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + 
par['input'], + obs='obs', + uns='uns' +) print('Process data...', flush=True) adata.obsm["X_emb"] = _perfect_embedding(partition=adata.obs["label"]) diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml index e0af4e4a5b..07ec9dfc68 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml @@ -18,10 +18,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py index 75f5889f8d..8f88f77472 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py @@ -17,10 +17,15 @@ ## VIASH END sys.path.append(meta["resources_dir"]) from utils import _perfect_embedding +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + uns='uns' +) print('Process data...', flush=True) adata.obsm["X_emb"] = _perfect_embedding( diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml 
b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml index 717d14ab42..09e1d2d731 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py index 3cc476b863..175a449a49 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py @@ -19,9 +19,15 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print("process dataset", flush=True) adata.obsm["X_emb"] = _randomize_features( diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml index ad1957b070..c26ac8027e 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: 
resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py index 755f4782f9..630871e780 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py @@ -19,12 +19,20 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) adata.layers['corrected_counts'] = _randomize_features( - adata.layers["normalized"], + adata.X, partition=adata.obs["batch"], ) diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml index 553e7431a8..761cfc6ccd 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: 
ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py index d07e3b339e..d5c20aa185 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py @@ -18,10 +18,16 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_graph +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print('Randomize graph...', flush=True) adata = _randomize_graph( diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml index d591b2a1df..aeef32e5e7 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py index bf793fad75..bf26568079 100644 --- 
a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py @@ -16,10 +16,16 @@ ## VIASH END sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print('Process data...', flush=True) adata.obsm["X_emb"] = _randomize_features( diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml index 2719a68d87..843b45cd36 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py index a06e6c1ab7..9f1302df0d 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py @@ -19,14 +19,21 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import 
read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) print("Process data...", flush=True) adata.layers['corrected_counts'] = _randomize_features( - adata.layers["normalized"], + adata.X, partition=adata.obs["label"] ) diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml index 948bcacf29..13fa6213e7 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py index 7b02353ed4..3634d55dbd 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py @@ -19,9 +19,15 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_graph +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print("Process data...", flush=True) adata = _randomize_graph( diff --git 
a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml index b17174744f..7b8efce8e8 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py index fc7ba6cee5..ca626600b8 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py @@ -19,9 +19,15 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print("process dataset", flush=True) adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_pca"]) diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml index 8dd71aec93..09551d3078 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml +++ 
b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py index 1c7c838b6e..c74c7d2a5e 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py @@ -19,11 +19,18 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) - -adata.layers['corrected_counts'] = _randomize_features(adata.layers["normalized"]) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +adata.layers['corrected_counts'] = _randomize_features(adata.X) print("Store outputs", flush=True) adata.uns['method_id'] = meta['functionality_name'] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml index 9780485e92..fb0db3f66c 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: 
- type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py index c0277c74b7..cd4d64f043 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py @@ -18,10 +18,16 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_graph +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print('Randomize graph...', flush=True) adata = _randomize_graph(adata, neighbors_key="knn") diff --git a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml index 1d1d42aa89..99f8346ed4 100644 --- a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml @@ -37,9 +37,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/bbknn/script.py b/src/tasks/batch_integration/methods/bbknn/script.py index d2a6e464ae..1496fda0bb 100644 --- 
a/src/tasks/batch_integration/methods/bbknn/script.py +++ b/src/tasks/batch_integration/methods/bbknn/script.py @@ -1,4 +1,6 @@ +import sys import anndata as ad +import scanpy as sc import bbknn ## VIASH START @@ -15,13 +17,24 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] adata = adata[:, idx].copy() + sc.pp.pca(adata) print('Run BBKNN', flush=True) kwargs = dict(batch_key='batch', copy=True) diff --git a/src/tasks/batch_integration/methods/combat/config.vsh.yaml b/src/tasks/batch_integration/methods/combat/config.vsh.yaml index dbb4b042ec..198b20dc3d 100644 --- a/src/tasks/batch_integration/methods/combat/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/combat/config.vsh.yaml @@ -32,9 +32,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [midtime, highmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/combat/script.py b/src/tasks/batch_integration/methods/combat/script.py index c5f0ed8dd5..9f282efb9c 100644 --- a/src/tasks/batch_integration/methods/combat/script.py +++ b/src/tasks/batch_integration/methods/combat/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc from scipy.sparse import csr_matrix @@ -15,8 +16,18 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = 
sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -25,7 +36,6 @@ print('Run Combat', flush=True) -adata.X = adata.layers['normalized'] adata.X = sc.pp.combat(adata, key='batch', inplace=False) diff --git a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml index 1fc6910a81..11eb009031 100644 --- a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml @@ -26,7 +26,7 @@ functionality: path: ../fastmnn_feature/script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: ghcr.io/openproblems-bio/base_images/r:1.1.0 setup: - type: r bioc: diff --git a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml index 4336f93c8e..b3036f278c 100644 --- a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: ghcr.io/openproblems-bio/base_images/r:1.1.0 setup: - type: r bioc: batchelor diff --git a/src/tasks/batch_integration/methods/liger/config.vsh.yaml b/src/tasks/batch_integration/methods/liger/config.vsh.yaml index d0db8e2996..bc5ec32a64 100644 --- a/src/tasks/batch_integration/methods/liger/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/liger/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: ghcr.io/openproblems-bio/base_images/r:1.1.0 setup: - type: apt packages: cmake diff --git 
a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml index 7a795fc759..683bc17af9 100644 --- a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml @@ -17,7 +17,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: ghcr.io/openproblems-bio/base_images/r:1.1.0 setup: - type: r bioc: diff --git a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml index de1894ab68..649672dac1 100644 --- a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml @@ -30,6 +30,8 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: # Due to a [ gcc-8 ] dependency in the mnnpy package, we need to use a python:3.8 image - type: docker diff --git a/src/tasks/batch_integration/methods/mnnpy/script.py b/src/tasks/batch_integration/methods/mnnpy/script.py index 34e726133e..56d5cce3a2 100644 --- a/src/tasks/batch_integration/methods/mnnpy/script.py +++ b/src/tasks/batch_integration/methods/mnnpy/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import mnnpy @@ -13,8 +14,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -22,7 +33,6 @@ adata = adata[:, idx].copy() print('Run mnn', flush=True) -adata.X = adata.layers['normalized'] split = [] batch_categories = adata.obs['batch'].cat.categories for i in 
batch_categories: diff --git a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml index 0d8f262620..f0489da337 100644 --- a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml @@ -21,9 +21,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/pyliger/script.py b/src/tasks/batch_integration/methods/pyliger/script.py index aa2250a857..2066e6965b 100644 --- a/src/tasks/batch_integration/methods/pyliger/script.py +++ b/src/tasks/batch_integration/methods/pyliger/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import numpy as np import pyliger @@ -12,21 +13,24 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('>> Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) +adata.layers['norm_data'] = read_anndata(par['input'], X='layers/normalized').X print('>> Prepare data', flush=True) adata_per_batch = [] for batch in adata.obs['batch'].unique(): adb = adata[adata.obs['batch'] == batch].copy() - - # move counts - adb.X = adb.layers['counts'] - del adb.layers['counts'] - - # move normalized data - adb.layers["norm_data"] = adb.layers["normalized"] - del adb.layers["normalized"] # save row sum and sum of squares for further use norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) diff --git a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml index 
179d478412..e448d11006 100644 --- a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml @@ -25,9 +25,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scalex_embed/script.py b/src/tasks/batch_integration/methods/scalex_embed/script.py index 1259fd130a..9974eba4b3 100644 --- a/src/tasks/batch_integration/methods/scalex_embed/script.py +++ b/src/tasks/batch_integration/methods/scalex_embed/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scalex @@ -13,8 +14,19 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -22,7 +34,6 @@ adata = adata[:, idx].copy() print('Run SCALEX', flush=True) -adata.X = adata.layers['normalized'] adata = scalex.SCALEX( adata, batch_key="batch", @@ -42,6 +53,9 @@ output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], + layers={ + 'corrected_counts': adata.layers["impute"], + }, obsm={ 'X_emb': adata.obsm['latent'], }, diff --git a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml index 2d8d05a98f..f25023f730 100644 --- a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml @@ -24,10 +24,12 @@ functionality: description: Number of highly 
variable genes to use. resources: - type: python_script - path: script.py + path: ../scalex_embed/script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scalex_feature/script.py b/src/tasks/batch_integration/methods/scalex_feature/script.py deleted file mode 100644 index ef33ee2a43..0000000000 --- a/src/tasks/batch_integration/methods/scalex_feature/script.py +++ /dev/null @@ -1,56 +0,0 @@ -import anndata as ad -import scanpy as sc -import scalex - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, -} -meta = { - 'functionality_name' : 'foo', - 'config': 'bar' -} -## VIASH END - -print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run SCALEX', flush=True) -adata.X = adata.layers['normalized'] -adata = scalex.SCALEX( - adata, - batch_key="batch", - ignore_umap=True, - impute=adata.obs["batch"].cat.categories[0], - processed=True, - max_iteration=40, - min_features=None, - min_cells=None, - n_top_features=0, - outdir=None, - gpu=0, -) - -print("Store output", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - layers={ - 'corrected_counts': adata.layers["impute"], - }, - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git 
a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml index 387745fc38..68161f0ebc 100644 --- a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml @@ -27,9 +27,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scanorama_embed/script.py b/src/tasks/batch_integration/methods/scanorama_embed/script.py index 950aa3b193..db12b458d5 100644 --- a/src/tasks/batch_integration/methods/scanorama_embed/script.py +++ b/src/tasks/batch_integration/methods/scanorama_embed/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scanorama @@ -13,6 +14,10 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + # based on scib # -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 def merge_adata(*adata_list, **kwargs): @@ -40,7 +45,13 @@ def merge_adata(*adata_list, **kwargs): print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -48,7 +59,6 @@ def merge_adata(*adata_list, **kwargs): adata = adata[:, idx].copy() print('Run scanorama', flush=True) -adata.X = adata.layers['normalized'] split = [] batch_categories = adata.obs['batch'].cat.categories for i in batch_categories: @@ -65,6 +75,9 @@ def merge_adata(*adata_list, **kwargs): 'normalization_id': 
adata.uns['normalization_id'], 'method_id': meta['functionality_name'], }, + layers={ + 'corrected_counts': corrected.X, + }, obsm={ 'X_emb': corrected.obsm["X_scanorama"], } diff --git a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml index 50246875ae..e70e68bb48 100644 --- a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml @@ -26,10 +26,12 @@ functionality: description: Number of highly variable genes to use. resources: - type: python_script - path: script.py + path: ../scanorama_embed/script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scanorama_feature/script.py b/src/tasks/batch_integration/methods/scanorama_feature/script.py deleted file mode 100644 index 614180ec99..0000000000 --- a/src/tasks/batch_integration/methods/scanorama_feature/script.py +++ /dev/null @@ -1,74 +0,0 @@ -import anndata as ad -import scanorama - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'hvg': True, -} -meta = { - 'functionality_name': 'foo', - 'config': 'bar' -} -## VIASH END - -# based on scib -# -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 -def merge_adata(*adata_list, **kwargs): - """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns - - :param adata_list: ``anndata`` objects to be concatenated - :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate`` - """ - - if len(adata_list) == 1: - return adata_list[0] - - # Make sure that adatas do not contain 
duplicate columns - for _adata in adata_list: - for attr in ("obs", "var"): - df = getattr(_adata, attr) - dup_mask = df.columns.duplicated() - if dup_mask.any(): - print( - f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`." - ) - setattr(_adata, attr, df.loc[:, ~dup_mask]) - - return ad.AnnData.concatenate(*adata_list, **kwargs) - - -print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run scanorama', flush=True) -adata.X = adata.layers['normalized'] -split = [] -batch_categories = adata.obs['batch'].cat.categories -for i in batch_categories: - split.append(adata[adata.obs['batch'] == i].copy()) -corrected = scanorama.correct_scanpy(split, return_dimred=True) -corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) - -print("Store output", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - }, - layers={ - 'corrected_counts': corrected.X, - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml index 3801d5bbe7..becf9467c2 100644 --- a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml @@ -44,9 +44,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + 
image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scanvi/script.py b/src/tasks/batch_integration/methods/scanvi/script.py index 9c0886816d..35d5b80f32 100644 --- a/src/tasks/batch_integration/methods/scanvi/script.py +++ b/src/tasks/batch_integration/methods/scanvi/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scvi.model import SCVI, SCANVI @@ -17,8 +18,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) if par["n_hvg"]: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -26,7 +37,7 @@ adata = adata[:, idx].copy() print("Processing data", flush=True) -SCVI.setup_anndata(adata, layer="counts", batch_key="batch") +SCVI.setup_anndata(adata, batch_key="batch") print("Run scVI", flush=True) model_kwargs = { diff --git a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml index 86f9e919b2..68485a188c 100644 --- a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml @@ -42,9 +42,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scvi/script.py b/src/tasks/batch_integration/methods/scvi/script.py index 3c5feb6f9c..26490737a5 100644 --- a/src/tasks/batch_integration/methods/scvi/script.py +++ b/src/tasks/batch_integration/methods/scvi/script.py @@ -1,3 +1,4 @@ +import sys import 
anndata as ad from scvi.model import SCVI @@ -16,8 +17,17 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) if par["n_hvg"]: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -25,7 +35,7 @@ adata = adata[:, idx].copy() print("Processing data", flush=True) -SCVI.setup_anndata(adata, layer="counts", batch_key="batch") +SCVI.setup_anndata(adata, batch_key="batch") print("Run scVI", flush=True) model_kwargs = { diff --git a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml index bb2f7b48c7..c8ac04f374 100644 --- a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml @@ -36,9 +36,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/asw_batch/script.py b/src/tasks/batch_integration/metrics/asw_batch/script.py index fceda260d9..35b110b895 100644 --- a/src/tasks/batch_integration/metrics/asw_batch/script.py +++ b/src/tasks/batch_integration/metrics/asw_batch/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import silhouette_batch @@ -11,14 +12,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.obsm["X_emb"] = 
input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = silhouette_batch( - input_solution, + adata, batch_key='batch', label_key='label', embed='X_emb', @@ -27,9 +32,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml index 4fd0d7ac32..21e40aad42 100644 --- a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml @@ -24,9 +24,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/asw_label/script.py b/src/tasks/batch_integration/metrics/asw_label/script.py index 938efef5ac..01a7a2ad41 100644 --- a/src/tasks/batch_integration/metrics/asw_label/script.py +++ b/src/tasks/batch_integration/metrics/asw_label/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import silhouette @@ -12,14 +13,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + 
print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = silhouette( - input_solution, + adata, label_key='label', embed='X_emb' ) @@ -27,9 +32,9 @@ print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - "method_id": input_integrated.uns['method_id'], + "dataset_id": adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + "method_id": adata.uns['method_id'], "metric_ids": [meta['functionality_name']], "metric_values": [score] } diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml index 1e8edd5ee7..42fe1c8fa8 100644 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml @@ -33,9 +33,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py index 6114defd81..fa432a21c6 100644 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py +++ 
b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import cell_cycle import numpy as np @@ -12,15 +13,27 @@ 'functionality_name': 'foo' } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.X = input_solution.layers['normalized'] +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + obs='obs', + obsm='obsm', + uns='uns' +) print('Use gene symbols for features', flush=True) -input_solution.var_names = input_solution.var['feature_name'] -input_integrated.var_names = input_integrated.var['feature_name'] +adata_solution.var_names = adata_solution.var['feature_name'] translator = { "homo_sapiens": "human", @@ -28,13 +41,13 @@ } print('Compute score', flush=True) -if input_solution.uns['dataset_organism'] not in translator: +if adata_solution.uns['dataset_organism'] not in translator: score = np.nan else: - organism = translator[input_solution.uns['dataset_organism']] + organism = translator[adata_solution.uns['dataset_organism']] score = cell_cycle( - input_solution, - input_integrated, + adata_solution, + adata_integrated, batch_key='batch', embed='X_emb', organism=organism, @@ -43,9 +56,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + 'method_id': adata_integrated.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': 
[ score ] } diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml index 6fa7b9c9a9..ee74fb410b 100644 --- a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml @@ -47,9 +47,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/script.py b/src/tasks/batch_integration/metrics/clustering_overlap/script.py index b92ecd66cb..7bb9e533c8 100644 --- a/src/tasks/batch_integration/metrics/clustering_overlap/script.py +++ b/src/tasks/batch_integration/metrics/clustering_overlap/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scanpy as sc from scib.metrics.clustering import cluster_optimal_resolution @@ -5,7 +6,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_graph.h5ad', + 'adata_integrated': 'resources_test/batch_integration/pancreas/integrated_graph.h5ad', 'output': 'output.h5ad', } @@ -14,36 +15,35 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = 
read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('Run optimal Leiden clustering', flush=True) cluster_optimal_resolution( - adata=input_solution, + adata=adata, label_key='label', cluster_key='cluster', cluster_function=sc.tl.leiden, ) print('Compute ARI score', flush=True) -ari_score = ari(input_solution, group1='cluster', group2='label') +ari_score = ari(adata, cluster_key='cluster', label_key='label') print('Compute NMI score', flush=True) -nmi_score = nmi(input_solution, group1='cluster', group2='label') +nmi_score = nmi(adata, cluster_key='cluster', label_key='label') print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - "method_id": input_integrated.uns['method_id'], + "dataset_id": adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + "method_id": adata.uns['method_id'], "metric_ids": [ "ari", "nmi" ], "metric_values": [ ari_score, nmi_score ] } diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml index 627f480e4c..77e5191d33 100644 --- a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml @@ -33,9 +33,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/script.py 
b/src/tasks/batch_integration/metrics/graph_connectivity/script.py index 35a1b2367c..ead8f146bc 100644 --- a/src/tasks/batch_integration/metrics/graph_connectivity/script.py +++ b/src/tasks/batch_integration/metrics/graph_connectivity/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scib @@ -11,28 +12,27 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = scib.metrics.graph_connectivity( - input_solution, + adata, label_key='label' ) print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml index 1076f03619..803c535420 100644 --- a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml @@ 
-32,9 +32,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/script.py b/src/tasks/batch_integration/metrics/hvg_overlap/script.py index e3221765fd..b7d177e991 100644 --- a/src/tasks/batch_integration/metrics/hvg_overlap/script.py +++ b/src/tasks/batch_integration/metrics/hvg_overlap/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import hvg_overlap @@ -12,25 +13,39 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.X = input_solution.layers["normalized"] -input_integrated.X = input_integrated.layers["corrected_counts"] +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + X='layers/corrected_counts', + obs='obs', + var='var', + uns='uns' +) print('compute score', flush=True) score = hvg_overlap( - input_solution, - input_integrated, + adata_solution, + adata_integrated, batch_key="batch" ) print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - "method_id": input_integrated.uns['method_id'], + "dataset_id": adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + "method_id": adata_integrated.uns['method_id'], "metric_ids": [meta['functionality_name']], "metric_values": [score] } diff --git 
a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml index cf1702fb93..d3d62877b5 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml @@ -26,9 +26,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py index 176239665b..094937e687 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py +++ b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import isolated_labels_asw @@ -12,15 +13,19 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = isolated_labels_asw( - input_solution, + adata, label_key='label', batch_key='batch', embed='X_emb', @@ -32,9 +37,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': 
input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml index 4208e502ec..f36550ca4b 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml @@ -38,9 +38,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py index 8c89b98f8f..30fe25bccf 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py +++ b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import isolated_labels_f1 @@ -12,19 +13,18 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') 
+adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = isolated_labels_f1( - input_solution, + adata, label_key='label', batch_key='batch', embed=None, @@ -36,9 +36,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml index 39bd895680..f2d83d8030 100644 --- a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml @@ -38,9 +38,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: ghcr.io/openproblems-bio/base_images/r:1.1.0 setup: - type: r github: theislab/kBET diff --git a/src/tasks/batch_integration/metrics/kbet/script.py b/src/tasks/batch_integration/metrics/kbet/script.py index 24cf8bdf69..9834f525d5 100644 --- a/src/tasks/batch_integration/metrics/kbet/script.py +++ b/src/tasks/batch_integration/metrics/kbet/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import kBET @@ -12,14 +13,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) 
-input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = kBET( - input_solution, + adata, batch_key="batch", label_key="label", type_="embed", @@ -32,9 +37,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml index 1687dc5c1c..dc87c769e4 100644 --- a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml @@ -42,13 +42,15 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: - - git+https://github.com/theislab/scib.git@v1.1.4 + - git+https://github.com/theislab/scib.git@v1.1.5 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/lisi/script.py b/src/tasks/batch_integration/metrics/lisi/script.py index bdc9ed4e1a..44181dab71 100644 --- a/src/tasks/batch_integration/metrics/lisi/script.py +++ b/src/tasks/batch_integration/metrics/lisi/script.py @@ -1,6 +1,7 @@ +import sys import numpy as np import anndata as ad -from 
scib.metrics.lisi import recompute_knn, lisi_graph_py +from scib.metrics.lisi import lisi_graph_py ## VIASH START par = { @@ -12,19 +13,18 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute iLISI score...', flush=True) ilisi_scores = lisi_graph_py( - adata=input_solution, + adata=adata, obs_key='batch', n_neighbors=90, perplexity=None, @@ -33,11 +33,11 @@ verbose=False, ) ilisi = np.nanmedian(ilisi_scores) -ilisi = (ilisi - 1) / (input_solution.obs['batch'].nunique() - 1) +ilisi = (ilisi - 1) / (adata.obs['batch'].nunique() - 1) print('compute cLISI scores...', flush=True) clisi_scores = lisi_graph_py( - adata=input_solution, + adata=adata, obs_key='label', n_neighbors=90, perplexity=None, @@ -46,15 +46,15 @@ verbose=False, ) clisi = np.nanmedian(clisi_scores) -nlabs = input_solution.obs['label'].nunique() +nlabs = adata.obs['label'].nunique() clisi = (nlabs - clisi) / (nlabs - 1) print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': 
adata.uns['method_id'], 'metric_ids': [ 'ilisi', 'clisi' ], 'metric_values': [ ilisi, clisi ] } diff --git a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml index 8644120657..4eb6c0f854 100644 --- a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml @@ -30,9 +30,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/pcr/script.py b/src/tasks/batch_integration/metrics/pcr/script.py index 392332963c..512b3dff6b 100644 --- a/src/tasks/batch_integration/metrics/pcr/script.py +++ b/src/tasks/batch_integration/metrics/pcr/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import pcr_comparison @@ -12,15 +13,31 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.X = input_solution.layers['normalized'] +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + # obsm='obsm', + # varm='varm', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + obs='obs', + obsm='obsm', + uns='uns' +) print('compute score', flush=True) score = pcr_comparison( - input_solution, - input_integrated, + adata_solution, + adata_integrated, embed='X_emb', covariate='batch', verbose=False @@ -29,9 +46,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 
'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + 'method_id': adata_integrated.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml index 7f881da214..0bbaf29256 100644 --- a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml @@ -9,12 +9,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - pypi: scanpy + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/script.py b/src/tasks/batch_integration/transformers/embed_to_graph/script.py index 1731e82066..74166eb77c 100644 --- a/src/tasks/batch_integration/transformers/embed_to_graph/script.py +++ b/src/tasks/batch_integration/transformers/embed_to_graph/script.py @@ -1,4 +1,4 @@ -import yaml +import sys import scanpy as sc ## VIASH START @@ -6,12 +6,27 @@ 'input': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', 'ouput': 'output.h5ad' } + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) + -print('Run kNN', flush=True) +print('Run kNN...', 
flush=True) sc.pp.neighbors(adata, use_rep='X_emb') print("Store outputs", flush=True) diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml index 8ec4da8170..ca576977b6 100644 --- a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml @@ -10,12 +10,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - pypi: scanpy + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/script.py b/src/tasks/batch_integration/transformers/feature_to_embed/script.py index f7793bb153..0e022db8b1 100644 --- a/src/tasks/batch_integration/transformers/feature_to_embed/script.py +++ b/src/tasks/batch_integration/transformers/feature_to_embed/script.py @@ -1,22 +1,38 @@ +import sys import scanpy as sc -import yaml ## VIASH START par = { 'input': 'resources_test/batch_integration/pancreas/integrated_feature.h5ad', 'ouput': 'output.h5ad' } + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata= sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/corrected_counts', + obs='obs', + var='var', + uns='uns' +) print('Run PCA', flush=True) adata.obsm['X_emb'] = sc.pp.pca( - adata.layers["corrected_counts"], + adata.X, n_comps=50, - use_highly_variable=False, + use_highly_variable=False, # Do we want to set this to True? svd_solver='arpack', return_info=False )