diff --git a/.github/workflows/tox-lint-and-pytest.yml b/.github/workflows/tox-lint-and-pytest.yml
new file mode 100644
index 0000000..00a4c19
--- /dev/null
+++ b/.github/workflows/tox-lint-and-pytest.yml
@@ -0,0 +1,26 @@
+name: Tests on hdbo (conda, python 3.10)
+
+on: [push]  # triggered by every push
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5  # no matrix is defined here, so a single job runs
+
+    steps:
+    - uses: actions/checkout@v4  # v3 runs on the deprecated Node 16 runtime
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5  # v3 runs on the deprecated Node 16 runtime
+      with:
+        python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
+    - name: Install dependencies
+      run: |
+        python -m pip install tox
+    - name: Check linting and tests with tox
+      run: |
+        tox
\ No newline at end of file
diff --git a/README.md b/README.md
index 778bc96..08bf027 100644
--- a/README.md
+++ b/README.md
@@ -61,14 +61,14 @@ After implementing a solver in `poli-baselines`, you can **register it** in `src
 The scripts used to run the benchmarks can be found in `src/hdbo_benchmark/experiments`. To run e.g. `albuterol_similarity` [of the PMO benchmark](https://openreview.net/forum?id=yCZRdI0Y7G) you can run:
 
 ```bash
-conda run -n hdbo python src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py \
-    --function_name=albuterol_similarity \
-    --solver_name=your_solver_name \
-    --latent_dim=128 \
+conda run -n hdbo_benchmark python src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py \
+    --function-name=albuterol_similarity \
+    --solver-name=line_bo \
+    --latent-dim=128 \
-    --max-iter=300 \
+    --max-iter=300
 ```
 
-assuming `hdbo` is an environment in which you can run your solver, and in which this package is installed. Examples of environments where solvers have been tested to run can be found in `poli-baselines`.
+assuming `hdbo_benchmark` is an environment in which you can run your solver, and in which this package is installed. Examples of environments where solvers have been tested to run can be found in `poli-baselines`.
 
 ## Replicating the data preprocessing for downloading zinc250k
 
diff --git a/environment.yml b/environment.yml
index e69de29..2c19d8c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -0,0 +1,15 @@
+name: hdbo_benchmark 
+channels:
+  - defaults
+dependencies:
+  - python=3.10
+  - pip
+  - pip:
+    - botorch
+    - seaborn
+    - CairoSVG
+    - wandb
+    - click
+    - "git+https://github.com/MachineLearningLifeScience/poli.git@dev"
+    - "git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main"
+    - -e .
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 08c1977..2c901a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,8 +6,13 @@ build-backend = "setuptools.build_meta"
 name = "hdbo_benchmark"
 version = "0.0.1"
 dependencies = [
-    "numpy",
-    "torch",
+    "botorch",
+    "seaborn",
+    "CairoSVG",
+    "wandb",
+    "click",
+    "poli@git+https://github.com/MachineLearningLifeScience/poli.git@dev",
+    "poli-baselines@git+https://github.com/MachineLearningLifeScience/poli-baselines.git@main"
 ]
 
 [tool.mypy]
diff --git a/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py b/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py
index e4c9883..ef87533 100644
--- a/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py
+++ b/src/hdbo_benchmark/data_preprocessing/zinc250k/02_save_as_csv.py
@@ -1,4 +1,5 @@
 """Loads the processed datasets, and saves them as a csv."""
+
 import pickle
 from pathlib import Path
 
diff --git a/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py b/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py
index 50d00c5..6822785 100644
--- a/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py
+++ b/src/hdbo_benchmark/experiments/benchmark_on_pmo/run.py
@@ -17,7 +17,6 @@
 
 import hdbo_benchmark
 from hdbo_benchmark.generative_models.vae_factory import VAEFactory, VAESelfies, VAE
-from hdbo_benchmark.generative_models.vae_molopt import VAEMolOpt
 from hdbo_benchmark.utils.experiments.load_solvers import load_solver, SOLVER_NAMES
 from hdbo_benchmark.utils.experiments.load_metadata_for_vaes import (
     load_alphabet_for_pmo,
@@ -39,8 +38,6 @@ def in_latent_space(
 ) -> Callable[[np.ndarray], np.ndarray]:
     def _latent_f(z: np.ndarray) -> np.ndarray:
         selfies_strings = vae.decode_to_string_array(z)
-        if isinstance(vae, VAEMolOpt):
-            selfies_strings = np.array(["".join(selfies_strings)])
         val: np.ndarray = f(np.array(selfies_strings))
         return val
 
diff --git a/src/hdbo_benchmark/generative_models/vae_factory.py b/src/hdbo_benchmark/generative_models/vae_factory.py
index 200dc3c..9335f28 100644
--- a/src/hdbo_benchmark/generative_models/vae_factory.py
+++ b/src/hdbo_benchmark/generative_models/vae_factory.py
@@ -8,7 +8,6 @@
 import torch
 
 from hdbo_benchmark.generative_models.vae import VAE
-from hdbo_benchmark.generative_models.vae_molopt import VAEMolOpt
 from hdbo_benchmark.generative_models.vae_selfies import VAESelfies
 from hdbo_benchmark.generative_models.vae_rnn_selfies import VAERNNSelfies
 from hdbo_benchmark.generative_models.vae_mario import VAEMario
@@ -77,7 +76,7 @@ def _create_vae_on_mario(self, latent_dim: int) -> VAEMario:
         opt_vae.load_state_dict(torch.load(weights_path, map_location=DEVICE))
         return opt_vae
 
-    def _create_vae_on_molecules(self, latent_dim: int) -> VAESelfies | VAEMolOpt:
+    def _create_vae_on_molecules(self, latent_dim: int) -> VAESelfies:
         match latent_dim:
             case 2:
                 weights_path = (
diff --git a/src/hdbo_benchmark/generative_models/vae_mario.py b/src/hdbo_benchmark/generative_models/vae_mario.py
index b499869..f902119 100644
--- a/src/hdbo_benchmark/generative_models/vae_mario.py
+++ b/src/hdbo_benchmark/generative_models/vae_mario.py
@@ -133,8 +133,7 @@ def _from_level_to_onehot(self, level: str):
 
         return onehot
 
-    def decode_to_string_array(self, z: np.ndarray) -> np.ndarray:
-        ...
+    def decode_to_string_array(self, z: np.ndarray) -> np.ndarray: ...
 
     def plot_grid(
         self,
@@ -180,9 +179,9 @@ def plot_grid(
         pixels = 16 * 14
         final_img = np.zeros((n_cols * pixels, n_rows * pixels, 3))
         for z, (i, j) in positions.items():
-            final_img[
-                i * pixels : (i + 1) * pixels, j * pixels : (j + 1) * pixels
-            ] = img_dict[z]
+            final_img[i * pixels : (i + 1) * pixels, j * pixels : (j + 1) * pixels] = (
+                img_dict[z]
+            )
 
         final_img = final_img.astype(int)
 
diff --git a/src/hdbo_benchmark/generative_models/vae_molopt.py b/src/hdbo_benchmark/generative_models/vae_molopt.py
deleted file mode 100644
index 8a69962..0000000
--- a/src/hdbo_benchmark/generative_models/vae_molopt.py
+++ /dev/null
@@ -1,198 +0,0 @@
-"""
-Implements a Variational Autoencoder that loads MolOpt VAE model. 
-It is originally designed to handle the
-SELFIES.
-"""
-
-from typing import Tuple, Dict, Optional
-from pathlib import Path
-import sys
-
-import numpy as np
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from torch.distributions import Normal, Categorical
-
-from hdbo_benchmark.utils.selfies.tokens import from_selfie_to_tensor
-from hdbo_benchmark.generative_models.vae import VAE
-
-ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
-
-# add model, utils, etc of molopt to the namespace
-sys.path.append(
-    str(
-        ROOT_DIR
-        / "src"
-        / "hdbo_benchmark"
-        / "generative_models"
-        / "molopt"
-        / "selfies_vae"
-    )
-)
-import models
-import utils
-
-
-class VAEMolOpt(VAE):
-    def __init__(
-        self,
-        max_len: int = 100,
-        model_path: Path = None,
-        vocab_path: Path = None,
-        config_path: Path = None,
-        device: torch.device = torch.device("cpu"),
-    ) -> None:
-        molopt_assets_path = (
-            ROOT_DIR
-            / "src"
-            / "hdbo_benchmark"
-            / "generative_models"
-            / "molopt"
-            / "selfies_vae"
-            / "checkpoint"
-        )
-        if not model_path:
-            model_path = molopt_assets_path / "selfies_vae_model_020.pt"
-        if not vocab_path:
-            vocab_path = molopt_assets_path / "selfies_vae_vocab.txt"
-        if not config_path:
-            config_path = molopt_assets_path / "selfies_vae_config.pt"
-        self.vocab = torch.load(vocab_path)
-        for ss in ("bos", "eos", "unk", "pad"):
-            setattr(self, ss, getattr(self.vocab, ss))
-        self._config = torch.load(config_path)  # TODO: check if latent dim is d_z
-        alphabet_s_to_i = self.vocab.c2i  # TODO: assess if that is the same object
-        self.max_len = max_len
-
-        super().__init__(
-            latent_dim=self._config.d_z,
-            alphabet_s_to_i=alphabet_s_to_i,
-            device=device,
-        )
-        self._vae = torch.load(model_path)
-        # Moves to device
-        self.to(device)
-
-    def _forward(self, x: torch.Tensor) -> torch.TensorType:
-        """
-        NOTE: molopt builtin forward returns z and kl_loss, NOT mu or logvar.
-        Custom shorter forward_encoder implementation below.
-        """
-        x = [self._vae.x_emb(i_x) for i_x in x]
-        x = nn.utils.rnn.pack_sequence(x)
-
-        _, h = self._vae.encoder_rnn(x, None)
-
-        h = h[-(1 + int(self._vae.encoder_rnn.bidirectional)) :]
-        h = torch.cat(h.split(1), dim=-1).squeeze(0)
-
-        mu, logvar = self._vae.q_mu(h), self._vae.q_logvar(h)
-        return mu, logvar
-
-    def encode(self, x: torch.Tensor) -> Normal:
-        """
-        Computes the approximate posterior q(z|x) over
-        the latent variable z.
-        """
-        mu, log_var = self._forward(x)
-
-        return Normal(loc=mu, scale=torch.exp(0.5 * log_var))
-
-    def decode(self, z: torch.Tensor, temp: float = 1.0) -> Categorical:
-        """
-        Returns a categorical likelihood over the vocabulary
-        """
-        n_batch = 2
-        z = torch.cat([z, z], dim=0)
-        probits = []
-        with torch.no_grad():
-            z = z.to(self.device)
-            z_0 = z.unsqueeze(1)
-            # Initial values
-            h = self._vae.decoder_lat(z)
-            # print('decode', h.shape)
-            h = h.unsqueeze(0).repeat(self._vae.decoder_rnn.num_layers, 1, 1)
-            w = torch.tensor(self.bos, device=self.device).repeat(n_batch)
-            x = torch.tensor([self.pad], device=self.device).repeat(
-                n_batch, self.max_len
-            )
-            x[:, 0] = self.bos
-
-            # Generating cycle
-            for i in range(0, self.max_len):
-                x_emb = self._vae.x_emb(w).unsqueeze(1)
-                x_input = torch.cat([x_emb, z_0], dim=-1)
-                o, h = self._vae.decoder_rnn(x_input, h)
-                y = self._vae.decoder_fc(o.squeeze(1))
-                y = F.softmax(y / temp, dim=-1)
-                probits.append(y)
-
-        probs = torch.stack(probits, axis=1)[
-            0
-        ]  # select first entry of the n_batch=2, see original implementation
-        # The categorical distribution expects (batch_size, ..., num_classes)
-        return Categorical(probs=probs)
-
-    def forward(self, x: torch.Tensor) -> Tuple[Normal, Categorical]:
-        """
-        Computes a forward pass through the VAE, returning
-        the distributions q_z_given_x and p_x_given_z.
-        """
-        q_z_given_x = self.encode(x)
-        z = q_z_given_x.rsample()
-
-        p_x_given_z = self.decode(z)
-
-        return q_z_given_x, p_x_given_z
-
-    def loss_function(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Computes the ELBO loss for a given batch {x}.
-        # TODO: use reference loss implementation
-        """
-        q_z_given_x, p_x_given_z = self.forward(x)
-
-        # Computes the KL divergence between q(z|x) and p(z)
-        kl_div = torch.distributions.kl_divergence(q_z_given_x, self.p_z).sum(dim=-1)
-
-        # Computes the reconstruction loss
-        recon_loss = -p_x_given_z.log_prob(x.argmax(dim=-1).to(self.device)).sum(dim=-1)
-
-        # Computes the ELBO loss
-        loss: torch.Tensor = (kl_div + recon_loss).mean()
-
-        return loss
-
-    def encode_from_string_array(self, x: np.ndarray) -> np.ndarray:
-        # Assuming x is an array of strings [b, L] or [b,]
-        # TODO: assumes tokenization
-        # selfies_strings: list[str] = ["".join(x_i) for x_i in x]
-        encoded_selfies = np.array(
-            [[self.vocab.c2i.get(s) for s in selfie] for selfie in x]
-        )
-        enc_x = torch.from_numpy(encoded_selfies)
-
-        z_dist = self.encode(enc_x)
-        z_ = z_dist.mean
-        z: np.ndarray = z_.cpu().detach().numpy()
-
-        return z
-
-    def decode_to_string_array(self, z: np.ndarray, sample=False) -> np.ndarray:
-        selfie_strs = []
-        for z_i in z:
-            decoder_cat = self.decode(
-                torch.from_numpy(np.atleast_1d(z_i.reshape(1, -1))).to(
-                    torch.get_default_dtype()
-                )
-            )
-            if not sample:
-                id_seqs = decoder_cat.probs.argmax(0).detach().numpy()
-            else:
-                id_seqs = decoder_cat.sample().detach().numpy()
-            selfie_str = np.array([self.vocab.i2c.get(s) for s in id_seqs])
-            selfie_strs.append("".join(selfie_str))
-        return np.array(selfie_strs)
diff --git a/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py b/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py
index 8c0cb0b..93889d4 100644
--- a/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py
+++ b/src/hdbo_benchmark/results/benchmark_on_pmo/visualize_function_vs_solver_heatmap.py
@@ -106,7 +106,6 @@ def summary_per_function(
 
 
 def plot_heatmap(df, normalized: bool = True):
-
     summary_avg, _ = summary_per_function(df, normalized_per_row=normalized)
 
     # We keep the columns in solver_name_but_pretty order
diff --git a/src/hdbo_benchmark/tests/__init__.py b/src/hdbo_benchmark/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/hdbo_benchmark/tests/benchmark_on_pmo/__init__.py b/src/hdbo_benchmark/tests/benchmark_on_pmo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/hdbo_benchmark/tests/benchmark_on_pmo/test_run.py b/src/hdbo_benchmark/tests/benchmark_on_pmo/test_run.py
new file mode 100644
index 0000000..6b614aa
--- /dev/null
+++ b/src/hdbo_benchmark/tests/benchmark_on_pmo/test_run.py
@@ -0,0 +1,35 @@
+import os
+
+import pytest
+
+from click.testing import CliRunner
+from hdbo_benchmark.experiments.benchmark_on_pmo.run import main
+
+
+@pytest.mark.parametrize("function_name", ["albuterol_similarity", "valsartan_smarts"])
+@pytest.mark.parametrize("solver_name", ["random_mutation", "line_bo", "turbo"])
+@pytest.mark.parametrize("latent_dim", [2, 128])
+def test_main_run(function_name, solver_name, latent_dim):
+    """Smoke-test the PMO benchmark CLI: a tiny run must exit with code 0."""
+    os.environ["WANDB_MODE"] = "disabled"  # turn wandb into a no-op for the test
+    result = CliRunner().invoke(
+        main,
+        [
+            "--solver-name",
+            solver_name,
+            "--function-name",
+            function_name,
+            "--latent-dim",
+            str(latent_dim),
+            "--max-iter",
+            "3",
+            "--n-initial-points",
+            "2",
+            "--no-strict-on-hash",
+            "--force-run",
+            "--solve-in-discrete-space",
+            "--tag",
+            "test",
+        ],
+    )  # CliRunner captures exceptions instead of raising, so inspect the result
+    assert result.exit_code == 0, f"{result.exception!r}\n{result.output}"
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..d202d14
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,26 @@
+[tox]
+env_list =
+    hdbo-base-py310
+minversion = 4.10.0
+
+[testenv]
+description = run the tests with pytest
+package = wheel
+wheel_build_env = .pkg
+allowlist_externals =
+    sh
+deps =
+    pytest>=6
+commands =
+    pytest {tty:--color=yes} -v {posargs}
+
+[testenv:hdbo-base-py310]
+description = run the tests with pytest on the hdbo_benchmark env
+basepython = python3.10
+wheel_build_env = .pkg
+deps=
+    {[testenv]deps}
+    black
+commands=
+    black --check --diff .
+    {[testenv]commands}