update encoders
zouter committed Apr 3, 2024
1 parent d69e466 commit 825717a
Showing 6 changed files with 89 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/chromatinhd/biomart/__init__.py
@@ -1,5 +1,5 @@
from .dataset import Dataset
-from .tss import get_canonical_transcripts, get_exons, get_transcripts, map_symbols
+from .tss import get_canonical_transcripts, get_exons, get_transcripts, map_symbols, get_genes
from . import tss
from .homology import get_orthologs

2 changes: 1 addition & 1 deletion src/chromatinhd/biomart/dataset.py
@@ -245,7 +245,7 @@ def from_genome(self, genome):
        elif genome in ["GRCm39"]:
            return Dataset(
                "mmusculus_gene_ensembl",
-                "https://nov2020.archive.ensembl.org/biomart/martservice?",
+                "http://www.ensembl.org/biomart/martservice?",
                "ENSEMBL_MART_ENSEMBL",
            )
        else:
16 changes: 16 additions & 0 deletions src/chromatinhd/biomart/tss.py
@@ -19,6 +19,22 @@ def map_symbols(biomart_dataset: Dataset, symbols):
    return mapping


def get_genes(
    biomart_dataset: Dataset,
) -> pd.DataFrame:
    """
    Get all genes with their Ensembl gene id and external gene name
    """
    genes = biomart_dataset.get(
        [
            biomart_dataset.attribute("ensembl_gene_id"),
            biomart_dataset.attribute("external_gene_name"),
        ],
    )

    return genes


def get_transcripts(
    biomart_dataset: Dataset,
    gene_ids=None,
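A minimal usage sketch for the new helper, assuming the Dataset constructor arguments shown in the dataset.py hunk above and that the returned DataFrame carries the two requested attributes (ensembl_gene_id, external_gene_name); the names below are illustrative, not part of the commit:

import chromatinhd.biomart as biomart

# Hypothetical example: build the mouse Ensembl dataset the same way from_genome does,
# then fetch the gene id -> symbol table added in this commit.
dataset = biomart.Dataset(
    "mmusculus_gene_ensembl",
    "http://www.ensembl.org/biomart/martservice?",
    "ENSEMBL_MART_ENSEMBL",
)
genes = biomart.get_genes(dataset)
print(genes.head())  # columns assumed to correspond to ensembl_gene_id / external_gene_name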
1 change: 1 addition & 0 deletions src/chromatinhd/models/diff/interpret/regionpositional.py
@@ -574,6 +574,7 @@ def spread_true(arr, width=5):

        plotdata, plotdata_mean = self.get_plotdata(region_id)
        selection = pd.DataFrame({"chosen": (plotdata["prob"].unstack() > prob_cutoff).any()})
        print(selection.max())

        # add padding
        step = plotdata.index.get_level_values("coord")[1] - plotdata.index.get_level_values("coord")[0]
25 changes: 25 additions & 0 deletions src/chromatinhd/models/pred/model/encoders.py
@@ -274,6 +274,31 @@ def forward(self, coordinates):
        return out


class SplineBinaryFullEncoding(torch.nn.Module):
    def __init__(self, binwidths=(100, 200, 500, 1000, 2000, 5000), window=(-100000, 100000)):
        super().__init__()
        self.register_buffer("binwidths", torch.tensor(binwidths))
        self.register_buffer("binshifts", window[0] // self.binwidths)
        self.nbins = torch.tensor([(window[1] - window[0]) // binwidth + 1 for binwidth in self.binwidths])
        self.register_buffer(
            "bincumstarts", torch.cat([torch.zeros(1, dtype=torch.int64), torch.cumsum(self.nbins, 0)[:-1]])
        )
        self.window = window

        self.register_buffer(
            "binpositions", torch.concatenate([torch.arange(window[0], window[1] + 1, bw) for bw in binwidths])
        )
        self.register_buffer(
            "binscales",
            torch.concatenate([torch.tensor([bw] * math.ceil((window[1] - window[0] + 1) / bw)) for bw in binwidths]),
        )

    def forward(self, coordinates):
        coordinates = coordinates[..., None]
        embedding = torch.clamp(1 - torch.abs((self.binpositions - coordinates) / self.binscales), 0, 1).flatten(-2)
        return embedding


import time


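For orientation, the new encoder is a multi-resolution linear ("hat") basis: for a coordinate x, each output feature is clamp(1 - |x - p| / w, 0, 1), i.e. 1 when x lies exactly on bin position p and falling linearly to 0 one binwidth w away, computed for every bin position at every resolution and concatenated. With the default window (-100000, 100000) and binwidths (100, 200, 500, 1000, 2000, 5000) that gives 2001 + 1001 + 401 + 201 + 101 + 41 = 3746 features per coordinate. A small shape check, assuming coordinates arrive as (fragments, 2) left/right cut sites as elsewhere in chromatinhd (an assumption, not stated in this diff):

import torch
from chromatinhd.models.pred.model.encoders import SplineBinaryFullEncoding

# Hypothetical check of the encoder added above, using its default arguments.
encoder = SplineBinaryFullEncoding()
coordinates = torch.tensor([[-500, 750],          # two fragments, two cut sites each (assumed layout)
                            [10000, 10450]])
embedding = encoder(coordinates)
print(embedding.shape)  # expected (2, 2 * 3746) = (2, 7492): flatten(-2) merges cut-site and feature dims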
45 changes: 45 additions & 0 deletions src/chromatinhd/models/pred/model/peakcounts.py
@@ -188,6 +188,51 @@ def _score_gene(self, gene_oi, predictor, layer):
        end = time.time()
        self.scores["time"][gene_oi] = end - start

    def get_prediction(self, gene_oi, predictor, layer, fold_ix):
        peaks, x = self.peakcounts.get_peak_counts(gene_oi)

        fold_ix, fold = fold_ix, self.folds[fold_ix]

        if layer is None:
            layer = list(self.transcriptome.layers.keys())[0]
        y = self.transcriptome.layers[layer][:, self.transcriptome.var.index == gene_oi][:, 0]
        if x.shape[1] > 0:
            cells_train = np.hstack([fold["cells_train"]])

            x_train = x[cells_train]
            x_validation = x[fold["cells_validation"]]
            x_test = x[fold["cells_test"]]

            y_train = y[cells_train]
            y_validation = y[fold["cells_validation"]]
            y_test = y[fold["cells_test"]]

            if predictor == "linear":
                lm = sklearn.linear_model.LinearRegression()
                lm.fit(x_train, y_train)
            else:
                if predictor == "lasso":
                    lm = lasso_cv(x_train, y_train, x_validation, y_validation)
                elif predictor == "rf":
                    lm = rf_cv(x_train, y_train, x_validation, y_validation)
                elif predictor == "ridge":
                    lm = sklearn.linear_model.RidgeCV(alphas=10)
                elif predictor == "xgboost":
                    import xgboost

                    lm = xgboost_cv(x_train, y_train, x_validation, y_validation)
                elif predictor == "xgboost_gpu":
                    import xgboost

                    lm = xgboost_cv_gpu(x_train, y_train, x_validation, y_validation)
                else:
                    raise ValueError(f"predictor {predictor} not recognized")

            cors = []

            y_predicted = lm.predict(x_test)
            return y_predicted, y_test


class PredictionTest(Flow):
    scores = chd.flow.SparseDataset()
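A sketch of how the new method might be called, assuming an object of this class with peakcounts, transcriptome and folds attached; the object name, gene id and predictor choice below are placeholders, not part of the commit:

import numpy as np

# Hypothetical usage: predict expression of one gene from peak counts on one fold
# and correlate the test-set predictions with the observed values.
y_predicted, y_test = prediction.get_prediction(
    gene_oi="ENSG00000141510",  # placeholder gene id
    predictor="lasso",
    layer=None,                 # falls back to the first transcriptome layer
    fold_ix=0,
)
cor = np.corrcoef(y_predicted, y_test)[0, 1]
print(f"test-set correlation: {cor:.3f}")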
