Replicating the data preprocessing for VAE training (#2)
* Updates the preprocessing steps, adds replicability

* Adds an action for the preprocessing env

* Updates the README's instructions

* Updates the dependencies on numpy
miguelgondu authored Jun 17, 2024
1 parent d7e5c53 commit 6633a44
Showing 12 changed files with 93 additions and 7 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/preprocessing-env-builds.yml
@@ -0,0 +1,23 @@
name: Data preprocessing env builds (conda, python 3.8)

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v3
        with:
          python-version: '3.10'
      - name: Add conda to system path
        run: |
          # $CONDA is an environment variable pointing to the root of the miniconda directory
          echo $CONDA/bin >> $GITHUB_PATH
      - name: Preprocessing env builds
        run: |
          conda env create -f environment.data_preprocessing.yml
2 changes: 2 additions & 0 deletions .gitignore
@@ -30,6 +30,8 @@ wandb/
data/small_molecule_datasets/raw/
data/small_molecule_datasets/processed/*.npz
data/small_molecule_datasets/processed/*.pkl
data/small_molecule_datasets/processed/*.old.*
data/small_molecule_datasets/processed/zinc250k.csv
data/trained_models/zinc_250k
data/trained_models/vae_mario
4 changes: 2 additions & 2 deletions README.md
@@ -79,10 +79,10 @@ assuming `hdbo_benchmark` is an environment in which you can run your solver, an
We use [torchdrug](https://torchdrug.ai/docs/installation.html) to download the dataset. Its dependencies are quite picky, but you should be able to install it by running

```bash
-conda env create --file environment.vae_training.yml
+conda env create --file environment.data_preprocessing.yml
```

-and following the scripts in `src/hdbo_benchmark/data_preprocessing/zinc250k`.
+and following the scripts in `src/hdbo_benchmark/data_preprocessing/zinc250k` inside that env (`conda activate hdbo__data_preprocessing`).

## Citing all the relevant work

15 changes: 15 additions & 0 deletions environment.data_preprocessing.yml
@@ -0,0 +1,15 @@
name: hdbo__data_preprocessing
channels:
  - conda-forge
  - defaults
  - milagraph
  - pytorch
  - pyg
dependencies:
  - python=3.8
  - torchdrug
  - pip
  - pip:
      - pandas
      - selfies
      - torchdrug==0.2.1
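
Since `torchdrug` appears both as a conda dependency and as a pip pin (`torchdrug==0.2.1`), a quick sanity check after building the env is to confirm which version actually resolved. A minimal sketch, assuming `torchdrug` exposes `__version__` as recent releases do:

```python
# Run inside the activated env: conda activate hdbo__data_preprocessing
import torchdrug

print(torchdrug.__version__)  # expected: 0.2.1, matching the pip pin above
```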
Empty file removed environment.vae_training.yml
1 change: 1 addition & 0 deletions environment.yml
@@ -5,6 +5,7 @@ dependencies:
  - python=3.10
  - pip
  - pip:
      - numpy<2
      - botorch
      - seaborn
      - CairoSVG
1 change: 1 addition & 0 deletions pyproject.toml
@@ -6,6 +6,7 @@ build-backend = "setuptools.build_meta"
name = "hdbo_benchmark"
version = "0.0.1"
dependencies = [
"numpy<2",
"botorch",
"seaborn",
"CairoSVG",
@@ -5,10 +5,51 @@
molecules in the ZINC dataset to SELFIES.
"""

from typing import List, Optional
import pickle
from pathlib import Path

-from poli.core.util.chemistry.string_to_molecule import translate_smiles_to_selfies  # type: ignore
+import selfies as sf


def translate_smiles_to_selfies(
    smiles_strings: List[str], strict: bool = False
) -> List[Optional[str]]:
    """Translates a list of SMILES strings to SELFIES strings.

    Given a list of SMILES strings, returns the translation
    into SELFIES strings. If strict is True, it raises an error
    if a SMILES string in the list cannot be parsed. Else, it
    returns None for those.

    This function uses the `selfies` package from Aspuru-Guzik's
    lab. See https://github.com/aspuru-guzik-group/selfies

    Parameters
    ----------
    smiles_strings : List[str]
        A list of SMILES strings.
    strict : bool, optional
        If True, raise an error if a SMILES string in the list cannot be parsed.

    Returns
    -------
    List[Optional[str]]
        A list of SELFIES strings (None for molecules that failed to parse).
    """
    selfies_strings = []
    for smile in smiles_strings:
        try:
            selfies_strings.append(sf.encoder(smile))
        except sf.EncoderError:
            if strict:
                raise ValueError(f"Failed to encode SMILES to SELFIES: {smile}")
            else:
                selfies_strings.append(None)

    return selfies_strings


if __name__ == "__main__":
# We get the path to the ZINC dataset
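As a quick illustration of the new helper, a minimal usage sketch (the invalid SMILES string is made up for illustration; the SELFIES output is what `selfies` 2.x produces for ethanol):

```python
# Assumes translate_smiles_to_selfies as defined in the diff above.
smiles = ["CCO", "this-is-not-smiles"]

print(translate_smiles_to_selfies(smiles))
# ['[C][C][O]', None]  <- the unparseable entry becomes None by default

translate_smiles_to_selfies(smiles, strict=True)
# raises ValueError: Failed to encode SMILES to SELFIES: this-is-not-smiles
```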
@@ -1,5 +1,6 @@
"""Computes the alphabet by counting the tokens in the dataset."""

from __future__ import annotations
from collections import defaultdict
from pathlib import Path
import json
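The body of this script is elided above, but the idea its docstring names (counting tokens to build the alphabet) can be sketched as follows; the `<pad>` entry and the sorted ordering are assumptions for illustration, not taken from the diff:

```python
from __future__ import annotations

import json
from collections import defaultdict

import selfies as sf

# Stand-in for the real dataset of SELFIES strings.
selfies_strings = ["[C][C][O]", "[C][O]"]

# Count how often each token appears in the dataset.
token_counts: dict[str, int] = defaultdict(int)
for s in selfies_strings:
    for token in sf.split_selfies(s):
        token_counts[token] += 1

# Build the string-to-index ("stoi") mapping; reserving id 0 for a
# padding token is a hypothetical choice, not something shown in the diff.
alphabet_stoi = {"<pad>": 0}
for i, token in enumerate(sorted(token_counts), start=1):
    alphabet_stoi[token] = i

print(json.dumps(alphabet_stoi, indent=4))
```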
@@ -21,7 +21,7 @@
PROCESSED_DIR = ROOT_DIR / "data" / "small_molecule_datasets" / "processed"

dataset_path = PROCESSED_DIR / "zinc250k.csv"
-alphabet_path = PROCESSED_DIR / "alphabet_stoi.json"
+alphabet_path = PROCESSED_DIR / "zinc250k_alphabet_stoi.json"

assert (
    dataset_path.exists()
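This script's body is likewise elided; for context, a sketch of what a stoi alphabet is typically used for downstream, turning a SELFIES string into integer ids (the path and usage here are illustrative, not taken from the diff):

```python
import json

import selfies as sf

# Illustrative: load the token-to-id mapping produced by the alphabet script.
with open("data/small_molecule_datasets/processed/zinc250k_alphabet_stoi.json") as fin:
    stoi = json.load(fin)

tokens = list(sf.split_selfies("[C][C][O]"))
token_ids = [stoi[token] for token in tokens]
print(token_ids)
```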
@@ -15,11 +15,13 @@
PROCESSED_DIR = ROOT_DIR / "data" / "small_molecule_datasets" / "processed"

# We load the sequence lengths
sequence_lengths = pd.read_csv(PROCESSED_DIR / "sequence_lengths.csv")["SELFIES"]
sequence_lengths = pd.read_csv(PROCESSED_DIR / "zinc250k_sequence_lengths.csv")[
"SELFIES"
]
max_sequence_length = max(sequence_lengths)

# We compute the length of the alphabet
with open(PROCESSED_DIR / "alphabet_stoi.json", "r") as fin:
with open(PROCESSED_DIR / "zinc250k_alphabet_stoi.json", "r") as fin:
alphabet = json.load(fin)

alphabet_length = len(alphabet)
@@ -30,5 +32,5 @@
}

# We save the metadata
with open(PROCESSED_DIR / "metadata.json", "w") as fout:
with open(PROCESSED_DIR / "zinc250k_metadata.json", "w") as fout:
json.dump(metadata, fout, indent=4)
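
Downstream consumers (for instance, the VAE training code) can read this metadata back. A minimal sketch, assuming the same processed-data layout as in the scripts above:

```python
import json
from pathlib import Path

# Stand-in for the repo-root resolution used in the scripts above.
PROCESSED_DIR = Path("data") / "small_molecule_datasets" / "processed"

with open(PROCESSED_DIR / "zinc250k_metadata.json", "r") as fin:
    metadata = json.load(fin)

print(metadata)  # e.g. the max sequence length and alphabet size computed above
```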
