diff --git a/.github/workflows/tox-lint-and-pytest.yml b/.github/workflows/tox-lint-and-pytest.yml new file mode 100644 index 0000000..00a4c19 --- /dev/null +++ b/.github/workflows/tox-lint-and-pytest.yml @@ -0,0 +1,26 @@ +name: Tests on hdbo (conda, python 3.10) + +on: [push] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + python -m pip install tox + - name: Check linting and tests with tox + run: | + tox \ No newline at end of file diff --git a/README.md b/README.md index 778bc96..08bf027 100644 --- a/README.md +++ b/README.md @@ -61,14 +61,14 @@ After implementing a solver in `poli-baselines`, you can **register it** in `src The scripts used to run the benchmarks can be found in `src/hdbo_benchmark/experiments`. To run e.g. `albuterol_similarity` [of the PMO benchmark](https://openreview.net/forum?id=yCZRdI0Y7G) you can run: ```bash -conda run -n hdbo python src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py \ - --function_name=albuterol_similarity \ - --solver_name=your_solver_name \ - --latent_dim=128 \ +conda run -n hdbo_benchmark python src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py \ + --function-name=albuterol_similarity \ + --solver-name=line_bo \ + --latent-dim=128 \ --max-iter=300 \ ``` -assuming `hdbo` is an environment in which you can run your solver, and in which this package is installed. Examples of environments where solvers have been tested to run can be found in `poli-baselines`. +assuming `hdbo_benchmark` is an environment in which you can run your solver, and in which this package is installed. 
Examples of environments where solvers have been tested to run can be found in `poli-baselines`. ## Replicating the data preprocessing for downloading zinc250k diff --git a/environment.yml b/environment.yml index e69de29..2c19d8c 100644 --- a/environment.yml +++ b/environment.yml @@ -0,0 +1,15 @@ +name: hdbo_benchmark +channels: + - defaults +dependencies: + - python=3.10 + - pip + - pip: + - botorch + - seaborn + - CairoSVG + - wandb + - click + - "git+https://github.com/MachineLearningLifeScience/poli.git@dev" + - "git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main" + - -e . \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 08c1977..2c901a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,8 +6,13 @@ build-backend = "setuptools.build_meta" name = "hdbo_benchmark" version = "0.0.1" dependencies = [ - "numpy", - "torch", + "botorch", + "seaborn", + "CairoSVG", + "wandb", + "click", + "poli@git+https://github.com/MachineLearningLifeScience/poli.git@dev", + "poli-baselines@git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main" ] [tool.mypy] diff --git a/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py b/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py index e4c9883..ef87533 100644 --- a/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py +++ b/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py @@ -1,4 +1,5 @@ """Loads the processed datasets, and saves them as a csv.""" + import pickle from pathlib import Path diff --git a/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py b/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py index 50d00c5..6822785 100644 --- a/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py +++ b/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py @@ -17,7 +17,6 @@ import hdbo_benchmark from hdbo_benchmark.generative_models.vae_factory import VAEFactory, VAESelfies, VAE -from 
hdbo_benchmark.generative_models.vae_molopt import VAEMolOpt from hdbo_benchmark.utils.experiments.load_solvers import load_solver, SOLVER_NAMES from hdbo_benchmark.utils.experiments.load_metadata_for_vaes import ( load_alphabet_for_pmo, @@ -39,8 +38,6 @@ def in_latent_space( ) -> Callable[[np.ndarray], np.ndarray]: def _latent_f(z: np.ndarray) -> np.ndarray: selfies_strings = vae.decode_to_string_array(z) - if isinstance(vae, VAEMolOpt): - selfies_strings = np.array(["".join(selfies_strings)]) val: np.ndarray = f(np.array(selfies_strings)) return val diff --git a/src/hdbo_benchmark/generative_models/vae_factory.py b/src/hdbo_benchmark/generative_models/vae_factory.py index 200dc3c..9335f28 100644 --- a/src/hdbo_benchmark/generative_models/vae_factory.py +++ b/src/hdbo_benchmark/generative_models/vae_factory.py @@ -8,7 +8,6 @@ import torch from hdbo_benchmark.generative_models.vae import VAE -from hdbo_benchmark.generative_models.vae_molopt import VAEMolOpt from hdbo_benchmark.generative_models.vae_selfies import VAESelfies from hdbo_benchmark.generative_models.vae_rnn_selfies import VAERNNSelfies from hdbo_benchmark.generative_models.vae_mario import VAEMario @@ -77,7 +76,7 @@ def _create_vae_on_mario(self, latent_dim: int) -> VAEMario: opt_vae.load_state_dict(torch.load(weights_path, map_location=DEVICE)) return opt_vae - def _create_vae_on_molecules(self, latent_dim: int) -> VAESelfies | VAEMolOpt: + def _create_vae_on_molecules(self, latent_dim: int) -> VAESelfies: match latent_dim: case 2: weights_path = ( diff --git a/src/hdbo_benchmark/generative_models/vae_mario.py b/src/hdbo_benchmark/generative_models/vae_mario.py index b499869..f902119 100644 --- a/src/hdbo_benchmark/generative_models/vae_mario.py +++ b/src/hdbo_benchmark/generative_models/vae_mario.py @@ -133,8 +133,7 @@ def _from_level_to_onehot(self, level: str): return onehot - def decode_to_string_array(self, z: np.ndarray) -> np.ndarray: - ... 
+ def decode_to_string_array(self, z: np.ndarray) -> np.ndarray: ... def plot_grid( self, @@ -180,9 +179,9 @@ def plot_grid( pixels = 16 * 14 final_img = np.zeros((n_cols * pixels, n_rows * pixels, 3)) for z, (i, j) in positions.items(): - final_img[ - i * pixels : (i + 1) * pixels, j * pixels : (j + 1) * pixels - ] = img_dict[z] + final_img[i * pixels : (i + 1) * pixels, j * pixels : (j + 1) * pixels] = ( + img_dict[z] + ) final_img = final_img.astype(int) diff --git a/src/hdbo_benchmark/generative_models/vae_molopt.py b/src/hdbo_benchmark/generative_models/vae_molopt.py deleted file mode 100644 index 8a69962..0000000 --- a/src/hdbo_benchmark/generative_models/vae_molopt.py +++ /dev/null @@ -1,198 +0,0 @@ -""" -Implements a Variational Autoencoder that loads MolOpt VAE model. -It is originally designed to handle the -SELFIES. -""" - -from typing import Tuple, Dict, Optional -from pathlib import Path -import sys - -import numpy as np - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch.distributions import Normal, Categorical - -from hdbo_benchmark.utils.selfies.tokens import from_selfie_to_tensor -from hdbo_benchmark.generative_models.vae import VAE - -ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve() - -# add model, utils, etc of molopt to the namespace -sys.path.append( - str( - ROOT_DIR - / "src" - / "hdbo_benchmark" - / "generative_models" - / "molopt" - / "selfies_vae" - ) -) -import models -import utils - - -class VAEMolOpt(VAE): - def __init__( - self, - max_len: int = 100, - model_path: Path = None, - vocab_path: Path = None, - config_path: Path = None, - device: torch.device = torch.device("cpu"), - ) -> None: - molopt_assets_path = ( - ROOT_DIR - / "src" - / "hdbo_benchmark" - / "generative_models" - / "molopt" - / "selfies_vae" - / "checkpoint" - ) - if not model_path: - model_path = molopt_assets_path / "selfies_vae_model_020.pt" - if not vocab_path: - vocab_path = molopt_assets_path / 
"selfies_vae_vocab.txt" - if not config_path: - config_path = molopt_assets_path / "selfies_vae_config.pt" - self.vocab = torch.load(vocab_path) - for ss in ("bos", "eos", "unk", "pad"): - setattr(self, ss, getattr(self.vocab, ss)) - self._config = torch.load(config_path) # TODO: check if latent dim is d_z - alphabet_s_to_i = self.vocab.c2i # TODO: assess if that is the same object - self.max_len = max_len - - super().__init__( - latent_dim=self._config.d_z, - alphabet_s_to_i=alphabet_s_to_i, - device=device, - ) - self._vae = torch.load(model_path) - # Moves to device - self.to(device) - - def _forward(self, x: torch.Tensor) -> torch.TensorType: - """ - NOTE: molopt builtin forward returns z and kl_loss, NOT mu or logvar. - Custom shorter forward_encoder implementation below. - """ - x = [self._vae.x_emb(i_x) for i_x in x] - x = nn.utils.rnn.pack_sequence(x) - - _, h = self._vae.encoder_rnn(x, None) - - h = h[-(1 + int(self._vae.encoder_rnn.bidirectional)) :] - h = torch.cat(h.split(1), dim=-1).squeeze(0) - - mu, logvar = self._vae.q_mu(h), self._vae.q_logvar(h) - return mu, logvar - - def encode(self, x: torch.Tensor) -> Normal: - """ - Computes the approximate posterior q(z|x) over - the latent variable z. 
- """ - mu, log_var = self._forward(x) - - return Normal(loc=mu, scale=torch.exp(0.5 * log_var)) - - def decode(self, z: torch.Tensor, temp: float = 1.0) -> Categorical: - """ - Returns a categorical likelihood over the vocabulary - """ - n_batch = 2 - z = torch.cat([z, z], dim=0) - probits = [] - with torch.no_grad(): - z = z.to(self.device) - z_0 = z.unsqueeze(1) - # Initial values - h = self._vae.decoder_lat(z) - # print('decode', h.shape) - h = h.unsqueeze(0).repeat(self._vae.decoder_rnn.num_layers, 1, 1) - w = torch.tensor(self.bos, device=self.device).repeat(n_batch) - x = torch.tensor([self.pad], device=self.device).repeat( - n_batch, self.max_len - ) - x[:, 0] = self.bos - - # Generating cycle - for i in range(0, self.max_len): - x_emb = self._vae.x_emb(w).unsqueeze(1) - x_input = torch.cat([x_emb, z_0], dim=-1) - o, h = self._vae.decoder_rnn(x_input, h) - y = self._vae.decoder_fc(o.squeeze(1)) - y = F.softmax(y / temp, dim=-1) - probits.append(y) - - probs = torch.stack(probits, axis=1)[ - 0 - ] # select first entry of the n_batch=2, see original implementation - # The categorical distribution expects (batch_size, ..., num_classes) - return Categorical(probs=probs) - - def forward(self, x: torch.Tensor) -> Tuple[Normal, Categorical]: - """ - Computes a forward pass through the VAE, returning - the distributions q_z_given_x and p_x_given_z. - """ - q_z_given_x = self.encode(x) - z = q_z_given_x.rsample() - - p_x_given_z = self.decode(z) - - return q_z_given_x, p_x_given_z - - def loss_function(self, x: torch.Tensor) -> torch.Tensor: - """ - Computes the ELBO loss for a given batch {x}. 
- # TODO: use reference loss implementation - """ - q_z_given_x, p_x_given_z = self.forward(x) - - # Computes the KL divergence between q(z|x) and p(z) - kl_div = torch.distributions.kl_divergence(q_z_given_x, self.p_z).sum(dim=-1) - - # Computes the reconstruction loss - recon_loss = -p_x_given_z.log_prob(x.argmax(dim=-1).to(self.device)).sum(dim=-1) - - # Computes the ELBO loss - loss: torch.Tensor = (kl_div + recon_loss).mean() - - return loss - - def encode_from_string_array(self, x: np.ndarray) -> np.ndarray: - # Assuming x is an array of strings [b, L] or [b,] - # TODO: assumes tokenization - # selfies_strings: list[str] = ["".join(x_i) for x_i in x] - encoded_selfies = np.array( - [[self.vocab.c2i.get(s) for s in selfie] for selfie in x] - ) - enc_x = torch.from_numpy(encoded_selfies) - - z_dist = self.encode(enc_x) - z_ = z_dist.mean - z: np.ndarray = z_.cpu().detach().numpy() - - return z - - def decode_to_string_array(self, z: np.ndarray, sample=False) -> np.ndarray: - selfie_strs = [] - for z_i in z: - decoder_cat = self.decode( - torch.from_numpy(np.atleast_1d(z_i.reshape(1, -1))).to( - torch.get_default_dtype() - ) - ) - if not sample: - id_seqs = decoder_cat.probs.argmax(0).detach().numpy() - else: - id_seqs = decoder_cat.sample().detach().numpy() - selfie_str = np.array([self.vocab.i2c.get(s) for s in id_seqs]) - selfie_strs.append("".join(selfie_str)) - return np.array(selfie_strs) diff --git a/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py b/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py index 8c0cb0b..93889d4 100644 --- a/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py +++ b/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py @@ -106,7 +106,6 @@ def summary_per_function( def plot_heatmap(df, normalized: bool = True): - summary_avg, _ = summary_per_function(df, normalized_per_row=normalized) # We keep the 
columns in solver_name_but_pretty order
diff --git a/src/hdbo_benchmark/tests/__init__.py b/src/hdbo_benchmark/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/hdbo_benchmark/tests/benchmark_on_pmo/__init__.py b/src/hdbo_benchmark/tests/benchmark_on_pmo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/hdbo_benchmark/tests/benchmark_on_pmo/test_run.py b/src/hdbo_benchmark/tests/benchmark_on_pmo/test_run.py
new file mode 100644
index 0000000..6b614aa
--- /dev/null
+++ b/src/hdbo_benchmark/tests/benchmark_on_pmo/test_run.py
@@ -0,0 +1,40 @@
+import os
+
+import pytest
+
+from click.testing import CliRunner
+from hdbo_benchmark.experiments.benchmark_on_pmo.run import main
+
+
+@pytest.mark.parametrize("function_name", ["albuterol_similarity", "valsartan_smarts"])
+@pytest.mark.parametrize("solver_name", ["random_mutation", "line_bo", "turbo"])
+@pytest.mark.parametrize("latent_dim", [2, 128])
+def test_main_run(function_name, solver_name, latent_dim):
+    """Smoke-test the benchmark_on_pmo CLI with a tiny iteration budget."""
+    os.environ["WANDB_MODE"] = "disabled"
+
+    runner = CliRunner()
+    result = runner.invoke(
+        main,
+        [
+            "--solver-name",
+            solver_name,
+            "--function-name",
+            function_name,
+            "--latent-dim",
+            str(latent_dim),
+            "--max-iter",
+            "3",
+            "--n-initial-points",
+            "2",
+            "--no-strict-on-hash",
+            "--force-run",
+            "--solve-in-discrete-space",
+            "--tag",
+            "test",
+        ],
+    )
+
+    # CliRunner captures exceptions and the exit status instead of raising,
+    # so without this check the test passes even when the run crashes.
+    assert result.exit_code == 0, result.output
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..d202d14
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,26 @@
+[tox]
+env_list =
+    hdbo-base-py310
+minversion = 4.10.0
+
+[testenv]
+description = run the tests with pytest
+package = wheel
+wheel_build_env = .pkg
+allowlist_externals =
+    sh
+deps =
+    pytest>=6
+commands =
+    pytest {tty:--color=yes} -v {posargs}
+
+[testenv:hdbo-base-py310]
+description = run the tests with pytest on the hdbo_benchmark env
+basepython = python3.10
+wheel_build_env = .pkg
+deps=
+    {[testenv]deps}
+    black
+commands=
+    black --check --diff .
+    {[testenv]commands}