@@ -96,12 +96,13 @@
# ChromatinHD defines a set of regions of interest, typically surrounding transcription start sites of a gene. Since we typically do not know which transcription start sites are used, we can either use the canonical ones (as determined by e.g. ENCODE) or use the ATAC-seq data to select the one that is most open. We will use the latter option here.
# %% [markdown]
-# We first get the transcripts for each gene.
+# We first get the transcripts for each gene. We extract this from biomart using the ensembl gene ids, which in this case are used as the index of the `transcriptome.var`.
# %%
transcripts = chd.biomart.get_transcripts(chd.biomart.Dataset.from_genome("GRCh38"), gene_ids=transcriptome.var.index)
fragments_file = dataset_folder / "fragments.tsv.gz"
transcripts = chd.data.regions.select_tss_from_fragments(transcripts, fragments_file)
+transcripts.head()
# %% [markdown]
# Now we can define the regions around the TSS. In this case we choose -10kb and +10kb around a TSS, although in real situations this will typically be much bigger (e.g. -100kb - +100kb)
@@ -110,11 +111,9 @@
regions = chd.data.Regions.from_transcripts(
transcripts,
path=dataset_folder / "regions",
- window=[-10000, 10000],
+ window=[-100000, 100000],
)
-
-# %%
-# !ls {dataset_folder}/*
+regions
# %% [markdown]
#
@@ -140,14 +139,11 @@
# %%
if not (dataset_folder / "fragments.tsv.gz.tbi").exists():
- import subprocess
+ import pysam
+ pysam.tabix_index(str(dataset_folder / "fragments.tsv.gz"), preset = "bed")
- subprocess.run(
- [
- "tabix",
- dataset_folder / "fragments.tsv.gz",
- ]
- )
+# %% [markdown]
+# We now create a ChromatinHD fragments object using the `fragments.tsv` file and a set of regions. This will populate a set of tensors on disk containing information on where each fragment lies within the region and to which cell and region it belongs:
# %%
fragments = chd.data.Fragments.from_fragments_tsv(
@@ -156,12 +152,22 @@
obs=transcriptome.obs,
path=dataset_folder / "fragments",
)
+fragments
-# %%
-fragments.create_cellxgene_indptr()
+# %% [markdown]
+# During training or inference of any models, we often require fast access to all fragments belong to a particular cell and region. This can be sped up by precalculating pointers to each region and cell combination:
# %%
-# !ls {dataset_folder}/*
+fragments.create_regionxcell_indptr()
+
+# %% [markdown]
+#
+#
+# How data is stored in ChromatinHD
+# We use zarr format to store data, and either TensorStore or Xarray to load data as needed.
+#
+#
+#
# %% [markdown]
# ### Training folds
@@ -171,6 +177,11 @@
# %%
folds = chd.data.folds.Folds(dataset_folder / "folds" / "5x1").sample_cells(fragments, 5, 1)
+folds
+
+# %%
+folds = chd.data.folds.Folds(dataset_folder / "folds" / "5x5").sample_cells(fragments, 5, 5)
+folds
# %% [markdown]
# ## Optional data¶
@@ -183,67 +194,20 @@
# %%
clustering = chd.data.Clustering.from_labels(adata.obs["celltype"], path=dataset_folder / "clustering")
-
-# %%
-# !ls {clustering.path}
+clustering
# %% [markdown]
# ### Motif scan¶
# %% [markdown]
-#
-# This functionality is still under construction and is only used for visualization/interpretation use cases for now.
-#
+# We can also scan for motifs and store it on disk, to be used to link transcription factors to particular regions of interest. Models that use this data directly are forthcoming.
# %% [markdown]
-# Let's first download the HOCOMOCO motif data:
+# Let's first download the HOCOMOCO motif data. This is a simple wrapper function that downloads and processes relevant motif data from the HOCOMOCO website.
# %%
-motifs_folder = dataset_folder / "motifs"
-motifs_folder.mkdir(exist_ok=True, parents=True)
-
-# download cutoffs, pwms and annotations
-# !wget https://hocomoco11.autosome.org/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_standard_thresholds_HUMAN_mono.txt -O {motifs_folder}/pwm_cutoffs.txt
-# !wget https://hocomoco11.autosome.org/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_pwms_HUMAN_mono.txt -O {motifs_folder}/pwms.txt
-# !wget https://hocomoco11.autosome.org/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_annotation_HUMAN_mono.tsv -O {motifs_folder}/annot.txt
-
-# %%
-import numpy as np
-import pandas as pd
-
-# %% [markdown]
-# We read in all PWMs:
-
-# %%
-pwms = {}
-motif = None
-for line in (motifs_folder / "pwms.txt").open():
- if line.startswith(">"):
- if motif is not None:
- pwms[motif_id] = motif
- motif_id = line[1:].strip("\n")
- motif = []
- else:
- motif.append([float(x) for x in line.split("\t")])
-pwms = {motif_id: np.array(pwm) for motif_id, pwm in pwms.items()}
-
-# %% [markdown]
-# And gather some metadata about each motif
-
-# %%
-motifs = pd.DataFrame({"motif": pwms.keys()}).set_index("motif")
-motif_cutoffs = pd.read_table(
- motifs_folder / "pwm_cutoffs.txt",
- names=["motif", "cutoff_001", "cutoff_0005", "cutoff_0001"],
- skiprows=1,
-).set_index("motif")
-motifs = motifs.join(motif_cutoffs)
-annot = (
- pd.read_table(motifs_folder / "annot.txt")
- .rename(columns={"Model": "motif", "Transcription factor": "gene_label"})
- .set_index("motif")
-)
-motifs = motifs.join(annot)
+import chromatinhd.data.motifscan.download
+pwms, motifs = chd.data.motifscan.download.get_hocomoco(dataset_folder / "motifs", organism = "human")
# %% [markdown]
# You also need to provide the location where the genome fasta file is stored. In our case this is located at /data/genome/GRCh38/, which was installed using `genomepy.install_genome("GRCh38", genomes_dir = "/data/genome/")`.
@@ -263,10 +227,8 @@
pwms,
regions,
motifs=motifs,
- cutoff_col="cutoff_0001",
+ cutoff_col="cutoff_0.0001",
fasta_file=fasta_file,
path=dataset_folder / "motifscan",
)
-
-# %%
-# !ls {motifscan.path}
+motifscan
diff --git a/docs/source/quickstart/2_pred.py b/docs/source/quickstart/2_pred.py
index c08d65f0..454d8d37 100644
--- a/docs/source/quickstart/2_pred.py
+++ b/docs/source/quickstart/2_pred.py
@@ -7,7 +7,7 @@
# format_version: '1.3'
# jupytext_version: 1.14.7
# kernelspec:
-# display_name: Python 3
+# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
@@ -48,10 +48,12 @@
# The basic ChromatinHD-*pred* model
# %%
-models = chd.models.pred.model.additive.Models(dataset_folder / "models" / "additive", reset=True)
+models = chd.models.pred.model.multiscale.Models(dataset_folder / "models", reset=True)
# %% tags=["hide_output"]
-models.train_models(fragments, transcriptome, folds)
+models.train_models(
+ fragments=fragments, transcriptome=transcriptome, folds=folds, regions_oi=transcriptome.gene_id(["CCL4", "IRF1"])
+)
# %% [markdown]
# ## Some quality checks
@@ -60,11 +62,11 @@
# We will first check whether the model learned something, by comparing the predictive performance with a baseline
# %%
-gene_cors = models.get_gene_cors(fragments, transcriptome, folds)
+gene_cors = models.get_region_cors(fragments, transcriptome, folds)
gene_cors["symbol"] = gene_cors.index.map(transcriptome.symbol)
# %%
-gene_cors.sort_values("deltacor", ascending=False).head(10)
+gene_cors
# %%
import pandas as pd
@@ -73,9 +75,9 @@
fig, ax = plt.subplots(figsize=(4, 4))
for name, group in gene_cors.iterrows():
- ax.plot([0, 1], group[["cor_n_fragments", "cor_predicted"]], color="#3338", zorder=0, marker="o", markersize=2)
+ ax.plot([0, 1], group[["cor_n_fragments", "cor"]], color="#3338", zorder=0, marker="o", markersize=2)
ax.boxplot(
- gene_cors[["cor_n_fragments", "cor_predicted"]].values,
+ gene_cors[["cor_n_fragments", "cor"]].values,
positions=[0, 1],
widths=0.1,
showfliers=False,
@@ -84,60 +86,58 @@
meanprops={"color": "red", "linewidth": 2},
)
ax.set_xticks([0, 1])
+ax.set_ylim(0)
ax.set_xticklabels(["# fragments", "ChromatinHD-pred"])
ax.set_ylabel("$cor$")
-# %% [markdown]
-# Note that every gene gains from the ChromatinHD model, even if some only gain a little. The genes with a low $\Delta cor$ are often those with only a few fragments:
-
-# %%
-fig, ax = plt.subplots(figsize=(4, 4))
-ax.scatter(gene_cors["n_fragments"], gene_cors["deltacor"])
-ax.set_ylabel("$\\Delta$ cor")
-ax.set_xlabel("# fragments")
-ax.set_xscale("log")
-
# %% [markdown]
# ## Predictivity per position
# %% [markdown]
-# To determine which regions were important for the model to predict gene expression, we will censor fragments from windows of various sizes, and then check whether the model performance on a set of test cells decreased. This functionality is implemented in the `RegionMultiWindow` class. This will only run the censoring for a subset of genes to speed up interpretation.
+# To determine which regions were important for the model to predict gene expression, we will censor fragments from windows of various sizes, and then check whether the model performance on a set of test cells decreased. This functionality is implemented in the `GeneMultiWindow` class. This will only run the censoring for a subset of genes to speed up interpretation.
# %%
censorer = chd.models.pred.interpret.MultiWindowCensorer(fragments.regions.window)
-regionmultiwindow = chd.models.pred.interpret.RegionMultiWindow(models.path / "interpret" / "regionmultiwindow")
+import chromatinhd
+censorer.__class__ = chromatinhd.models.pred.interpret.censorers.MultiWindowCensorer
+regionmultiwindow = chd.models.pred.interpret.RegionMultiWindow.create(
+ path = models.path / "interpret" / "regionmultiwindow",
+ folds = folds,
+ transcriptome = transcriptome,
+ censorer = censorer,
+ fragments = fragments,
+)
# %%
regionmultiwindow.score(
- fragments,
- transcriptome,
- models,
- folds,
- transcriptome.gene_id(
+ models = models,
+ regions = transcriptome.gene_id(
[
"CCL4",
- "IL1B",
- "EBF1",
- "PAX5",
- "CD79A",
- "RHEX",
+ "IRF1",
]
),
- censorer=censorer,
+ folds = folds,
)
# %%
regionmultiwindow.interpolate()
+# %% [markdown]
+# ### Visualizing predictivity
+
+# %% [markdown]
+# We can visualize the predictivity as follows. This shows which regions of the genome are positively and negatively associated with gene expression.
+
# %%
-symbol = "EBF1"
+symbol = "IRF1"
fig = chd.grid.Figure(chd.grid.Grid(padding_height=0.05))
width = 10
region = fragments.regions.coordinates.loc[transcriptome.gene_id(symbol)]
-panel_genes = chd.plot.genome.genes.Genes.from_region(region, width=width)
+panel_genes = chd.plot.genome.genes.Genes.from_region(region, width=width, genome = "GRCh38")
fig.main.add_under(panel_genes)
panel_pileup = chd.models.pred.plot.Pileup.from_regionmultiwindow(
@@ -153,49 +153,106 @@
fig.plot()
# %% [markdown]
-# ## Co-predictivity per position
+# Given that accessibility can be sparse, we often simply visualize the predictivity in regions with at least a minimum of accessibility.
# %% [markdown]
-# In a similar fashion we can determine the co-predictivity per position.
+# Let's first select regions based on the number of fragments. Regions that are close together will be merged.
# %%
-censorer = chd.models.pred.interpret.WindowCensorer(fragments.regions.window)
-regionpairwindow = chd.models.pred.interpret.RegionPairWindow(
- models.path / "interpret" / "regionpairwindow", reset=True
+symbol = "IRF1"
+# symbol = "CCL4"
+gene_id = transcriptome.gene_id(symbol)
+
+# %%
+# decrease the lost_cutoff to see more regions
+regions = regionmultiwindow.select_regions(gene_id, lost_cutoff = 0.15)
+breaking = chd.grid.Breaking(regions)
+
+# %%
+fig = chd.grid.Figure(chd.grid.Grid(padding_height=0.05))
+
+region = fragments.regions.coordinates.loc[transcriptome.gene_id(symbol)]
+panel_genes = chd.plot.genome.genes.GenesBroken.from_region(region, breaking=breaking, genome = "GRCh38")
+fig.main.add_under(panel_genes)
+
+panel_pileup = chd.models.pred.plot.PileupBroken.from_regionmultiwindow(
+ regionmultiwindow, gene_id, breaking=breaking
)
-regionpairwindow.score(
- fragments, transcriptome, models, folds, censorer=censorer, genes=transcriptome.gene_id(["CCL4"])
+fig.main.add_under(panel_pileup)
+
+panel_predictivity = chd.models.pred.plot.PredictivityBroken.from_regionmultiwindow(
+ regionmultiwindow, gene_id, breaking=breaking, ymax = -0.1
)
+fig.main.add_under(panel_predictivity)
+
+fig.plot()
+
+# %% [markdown] vscode={"languageId": "markdown"}
+# ## Co-predictivity
+
+# %% [markdown]
+# In a similar fashion we can determine the co-predictivity per position.
+
+# %%
+censorer = chd.models.pred.interpret.WindowCensorer(fragments.regions.window)
+regionpairwindow = chd.models.pred.interpret.RegionPairWindow(models.path / "interpret" / "regionpairwindow", reset=True)
+regionpairwindow.score(models, censorer = censorer, folds = folds, fragments = fragments)
+
+# %% [markdown]
+# ### Visualization of co-predictivity
# %%
-symbol = "CCL4"
+symbol = "IRF1"
+# symbol = "CCL4"
+gene_id = transcriptome.gene_id(symbol)
+# %%
+windows = regionmultiwindow.select_regions(gene_id, lost_cutoff = 0.2)
+breaking = chd.grid.Breaking(windows)
+
+# %%
fig = chd.grid.Figure(chd.grid.Grid(padding_height=0.05))
width = 10
# genes
-region = fragments.regions.coordinates.loc[transcriptome.gene_id(symbol)]
-panel_genes = chd.plot.genome.genes.Genes.from_region(region, width=width)
+region = fragments.regions.coordinates.loc[gene_id]
+panel_genes = chd.plot.genome.genes.GenesBroken.from_region(region, breaking = breaking)
fig.main.add_under(panel_genes)
# pileup
-panel_pileup = chd.models.pred.plot.Pileup.from_RegionMultiWindow(
- RegionMultiWindow, transcriptome.gene_id(symbol), width=width
+panel_pileup = chd.models.pred.plot.PileupBroken.from_regionmultiwindow(
+ regionmultiwindow, gene_id, breaking = breaking,
)
fig.main.add_under(panel_pileup)
# predictivity
-panel_predictivity = chd.models.pred.plot.Predictivity.from_RegionMultiWindow(
- RegionMultiWindow, transcriptome.gene_id(symbol), width=width
+panel_predictivity = chd.models.pred.plot.PredictivityBroken.from_regionmultiwindow(
+ regionmultiwindow, gene_id, breaking=breaking
)
fig.main.add_under(panel_predictivity)
# copredictivity
-panel_copredictivity = chd.models.pred.plot.Copredictivity.from_regionpairwindow(
- regionpairwindow, transcriptome.gene_id(symbol), width=width
+panel_copredictivity = chd.models.pred.plot.CopredictivityBroken.from_regionpairwindow(
+ regionpairwindow, gene_id, breaking = breaking
)
-fig.main.add_under(panel_copredictivity)
+fig.main.add_under(panel_copredictivity, padding = 0.)
fig.plot()
# %%
+plotdata = regionpairwindow.get_plotdata(gene_id, windows = windows).sort_values("cor")
+plotdata["deltacor_min"] = plotdata[["deltacor1", "deltacor2"]].values.min(1)
+plotdata["deltacor_max"] = plotdata[["deltacor1", "deltacor2"]].values.max(1)
+plotdata["deltacor_prod"] = plotdata["deltacor1"] * plotdata["deltacor2"]
+plotdata["deltacor_sum"] = plotdata["deltacor1"] + plotdata["deltacor2"]
+
+fig, ax = plt.subplots()
+ax.scatter(plotdata_oi["deltacor_prod"].abs(), plotdata_oi["cor"].abs())
+
+# %% [markdown]
+# ## Extract predictive regions
+
+# %%
+regionmultiwindow.extract_predictive_regions()
+
+# %%
diff --git a/docs/source/quickstart/3_diff.py b/docs/source/quickstart/3_diff.py
index b54a025e..216eae6f 100644
--- a/docs/source/quickstart/3_diff.py
+++ b/docs/source/quickstart/3_diff.py
@@ -7,7 +7,7 @@
# format_version: '1.3'
# jupytext_version: 1.14.7
# kernelspec:
-# display_name: Python 3
+# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
@@ -49,10 +49,10 @@
# The basic ChromatinHD-*diff* model
# %%
-models = chd.models.diff.model.cutnf.Models(dataset_folder / "models" / "cutnf", reset=True)
+models = chd.models.diff.model.binary.Models(dataset_folder / "models" / "cutnf", reset=True)
# %% tags=["hide_output"]
-models.train_models(fragments, clustering, folds)
+models.train_models(fragments = fragments, clustering = clustering, folds = folds)
# %% [markdown]
# ## Interpret positionally
@@ -61,35 +61,72 @@
# Currently, the ChromatinHD-model is purely positional, i.e. it only looks whether Tn5 insertion sites increase or decrease within a region. As such, we can only interpret it positionally:
# %%
-import chromatinhd.models.diff.interpret.genepositional
+import chromatinhd.models.diff.interpret.regionpositional
# %%
clustering.cluster_info.index.name = "cluster"
# %%
-genepositional = chromatinhd.models.diff.interpret.genepositional.GenePositional(
- path=models.path / "interpret" / "genepositional"
+regionpositional = chromatinhd.models.diff.interpret.regionpositional.RegionPositional(
+ path=models.path / "interpret" / "regionpositional"
)
-genepositional.score(
- fragments,
- clustering,
- models,
+regionpositional.score(
+ fragments = fragments,
+ clustering = clustering,
+ models = models,
force=True,
)
+# %% [markdown]
+# ### Plot differential accessibility at a specific window
+
+# %% [markdown]
+# To avoid overplotting, we will plot only the window close to the TSS.
+
+# %%
+symbol = "IRF1"
+gene_id = transcriptome.gene_id(symbol)
+
+# %%
+window = [-10000, 10000]
+
+# %%
+fig = chd.grid.Figure(chd.grid.Grid(padding_height=0.05, padding_width=0.05))
+width = 10
+
+region = fragments.regions.coordinates.loc[transcriptome.gene_id(symbol)]
+panel_genes = chd.plot.genome.genes.Genes.from_region(region, width=width, window = window)
+fig.main.add_under(panel_genes)
+
+panel_differential = chd.models.diff.plot.Differential.from_regionpositional(
+ transcriptome.gene_id(symbol), regionpositional, cluster_info=clustering.cluster_info, panel_height=0.5, width=width, window = window
+)
+fig.main.add_under(panel_differential)
+
+panel_expression = chd.models.diff.plot.DifferentialExpression.from_transcriptome(
+ transcriptome=transcriptome, clustering=clustering, gene=transcriptome.gene_id(symbol), panel_height=0.5
+)
+fig.main.add_right(panel_expression, row=panel_differential)
+
+fig.plot()
+
+# %% [markdown]
+# ### Plot differential accessibility for all windows
+
# %%
-symbol = "EBF1"
+windows = regionpositional.select_windows(gene_id)
+breaking = chd.grid.Breaking(windows)
+# %%
fig = chd.grid.Figure(chd.grid.Grid(padding_height=0.05, padding_width=0.05))
width = 10
region = fragments.regions.coordinates.loc[transcriptome.gene_id(symbol)]
-panel_genes = chd.plot.genome.genes.Genes.from_region(region, width=width)
+panel_genes = chd.plot.genome.genes.GenesBroken.from_region(region, breaking=breaking)
fig.main.add_under(panel_genes)
-plotdata, plotdata_mean = genepositional.get_plotdata(transcriptome.gene_id(symbol))
-panel_differential = chd.models.diff.plot.Differential(
- plotdata, plotdata_mean, cluster_info=clustering.cluster_info, panel_height=0.5, width=width
+panel_differential = chd.models.diff.plot.DifferentialBroken.from_regionpositional(
+ transcriptome.gene_id(symbol), regionpositional, cluster_info=clustering.cluster_info, panel_height=0.5, breaking=breaking, window = window
)
fig.main.add_under(panel_differential)
@@ -100,4 +137,7 @@
fig.plot()
+# %% [markdown]
+# ## Differential TFBSs
+
# %%