From d983fcfe411d8c38f148c5542b21ce57daf39d48 Mon Sep 17 00:00:00 2001
From: Martin Glauer <martinglauer89@gmail.com>
Date: Thu, 8 Aug 2024 10:46:17 +0200
Subject: [PATCH 001/112] Add actions for unittests

---
 .github/workflows/test.yml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..a687fdda
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,20 @@
+name: Unittests
+
+on: [pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Display Python version
+        run: python -m unittest
\ No newline at end of file

From 491428d6a445265fad45450c8212b8af9b3a3285 Mon Sep 17 00:00:00 2001
From: Martin Glauer <martinglauer89@gmail.com>
Date: Thu, 8 Aug 2024 10:52:59 +0200
Subject: [PATCH 002/112] Add dependencies to test scipt

---
 .github/workflows/test.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a687fdda..28d9b4cb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,5 +16,10 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade pip setuptools wheel
+          pip install -e .
       - name: Display Python version
         run: python -m unittest
\ No newline at end of file

From 04615d91769002ca9e6b092cfe3699a293909d0c Mon Sep 17 00:00:00 2001
From: Martin Glauer <martinglauer89@gmail.com>
Date: Thu, 8 Aug 2024 11:01:08 +0200
Subject: [PATCH 003/112] Install cpu-based version of torch

---
 .github/workflows/test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 28d9b4cb..f2143ff2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,6 +20,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           python -m pip install --upgrade pip setuptools wheel
-          pip install -e .
+          python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          python -m pip install -e .
       - name: Display Python version
         run: python -m unittest
\ No newline at end of file

From 605064425dd220c8f19627c2386663b4665fd015 Mon Sep 17 00:00:00 2001
From: Martin Glauer <martinglauer89@gmail.com>
Date: Thu, 8 Aug 2024 11:17:13 +0200
Subject: [PATCH 004/112] Disable fail-fast

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f2143ff2..a75533f8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -7,6 +7,7 @@ jobs:
 
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11"]
 

From 6bb1a85ae3503e39b379ccddaa6e0e6358d514da Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 26 Aug 2024 19:45:34 +0200
Subject: [PATCH 005/112] Create data_exploration.ipynb

---
 data_exploration.ipynb | 637 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 637 insertions(+)
 create mode 100644 data_exploration.ipynb

diff --git a/data_exploration.ipynb b/data_exploration.ipynb
new file mode 100644
index 00000000..6f1045a4
--- /dev/null
+++ b/data_exploration.ipynb
@@ -0,0 +1,637 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "81559360-c8b8-462d-bfa1-6ae22bed1615",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "# Ignore all warnings\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "\n",
+    "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n",
+    "\n",
+    "---\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33275d3c-cdbf-4c1f-aa04-f135511f3643",
+   "metadata": {},
+   "source": [
+    "# 1. Instantiation of a Data Class\r\n",
+    "\r\n",
+    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n",
+    "### Inheritance Hierarchy\n",
+    "\n",
+    "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n",
+    "\n",
+    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
+    "\n",
+    "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
+    "\n",
+    "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
+    "\r\n",
+    "\r\n",
+    "### Explanation\r\n",
+    "A ChEBI data class can be configured with the following main parameters:\r\n",
+    "\r\n",
+    "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\r\n",
+    "\r\n",
+    "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\r\n",
+    "\r\n",
+    "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\r\n",
+    "\r\n",
+    "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\r\n",
+    "\r\n",
+    "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\r\n",
+    "\r\n",
+    "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n",
+    "\r\n",
+    "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\r\n",
+    "\r\n",
+    "### Additional Input Parameters\r\n",
+    "\r\n",
+    "The `XYBaseDataModule` class, from which ChEBI data classes inherit (via `_DynamicDataset`), includes several important parameters for data loading and processing:\r\n",
+    "\r\n",
+    "- **batch_size (int)**: The batch size for data loading. Default is `1`.\r\n",
+    "\r\n",
+    "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\r\n",
+    "\r\n",
+    "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\r\n",
+    "\r\n",
+    "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\r\n",
+    "\r\n",
+    "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\r\n",
+    "\r\n",
+    "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\r\n",
+    "\r\n",
+    "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\r\n",
+    "\r\n",
+    "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\r\n",
+    "\r\n",
+    "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\r\n",
+    "\r\n",
+    "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\r\n",
+    "\r\n",
+    "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\r\n",
+    "\r\n",
+    "- **kwargs**: Additional keyword arguments.\r\n",
+    "\r\n",
+    "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\r\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
+   "metadata": {},
+   "source": [
+    "# Available ChEBI Data Classes\n",
+    "\n",
+    "## `ChEBIOver100`\n",
+    "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
+    "\n",
+    "## `ChEBIOver50`\n",
+    "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
+    "\n",
+    "## `ChEBIOver100DeepSMILES`\n",
+    "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n",
+    "\n",
+    "## `ChEBIOver100SELFIES`\n",
+    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n",
+    "\n",
+    "## `ChEBIOver50SELFIES`\n",
+    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n",
+    "\n",
+    "## `ChEBIOver50Partial`\n",
+    "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chebai.preprocessing.datasets.chebi import ChEBIOver50"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chebi_class = ChEBIOver50(chebi_version=231)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1655d489-25fe-46de-9feb-eeca5d36936f",
+   "metadata": {},
+   "source": [
+    "# 2. Preparation / Setup Methods\r\n",
+    "\r\n",
+    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\r\n",
+    "\r\n",
+    "### Why is Preparation Needed?\r\n",
+    "\r\n",
+    "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\r\n",
+    "- **Data Integrity**: It ensures that the data files are up-to-date and compatible with the specified ChEBI version.\r\n",
+    "\r\n",
+    "### Main Methods for Data Preprocessing\r\n",
+    "\r\n",
+    "The data preprocessing in a data class involves two main methods:\r\n",
+    "\r\n",
+    "1. **`prepare_data` Method**:\r\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\r\n",
+    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\r\n",
+    "\r\n",
+    "2. **`setup` Method**:\r\n",
+    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\r\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\r\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\r\n",
+    "\r\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes.\r\n",
+    "",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "f2df4bd1-cf34-4414-bce4-54379ffac006",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n",
+      "Cross-validation enabled: False\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n",
+      "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n",
+      "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n"
+     ]
+    }
+   ],
+   "source": [
+    "chebi_class.prepare_data()\n",
+    "chebi_class.setup()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e",
+   "metadata": {},
+   "source": [
+    "# 3. Different Data Files Created and their Structure\n",
+    "\r\n",
+    "\r\n",
+    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\r\n",
+    "\r\n",
+    "### Data Files\r\n",
+    "\r\n",
+    "1. **`Raw Data Files`**: (e.g., `.obo` file)\r\n",
+    "   - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n",
+    "\r\n",
+    "2. **`data.pkl`**\r\n",
+    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n",
+    "\r\n",
+    "3. **`data.pt`**\r\n",
+    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n",
+    "\r\n",
+    "4. **`classes.txt`**\r\n",
+    "   - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\r\n",
+    "\r\n",
+    "5. **`splits.csv`**\r\n",
+    "   - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\r\n",
+    "\r\n",
+    "### File Structure and Preprocessing Stages\r\n",
+    "\r\n",
+    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\r\n",
+    "\r\n",
+    "1. **Raw Data Stage**:\r\n",
+    "   - **File**: `chebi.obo`\r\n",
+    "   - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n",
+    "\r\n",
+    "2. **Processed Data Stage 1**:\r\n",
+    "   - **File**: `data.pkl`\r\n",
+    "   - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n",
+    "   - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\r\n",
+    "\r\n",
+    "3. **Processed Data Stage 2**:\r\n",
+    "   - **File**: `data.pt`\r\n",
+    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\r\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n",
+    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\r\n",
+    "\r\n",
+    "### Data Splits\r\n",
+    "\r\n",
+    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\r\n",
+    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\r\n",
+    "\r\n",
+    "### Summary of File Paths\r\n",
+    "\r\n",
+    "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\r\n",
+    "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\r\n",
+    "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\r\n",
+    "\r\n",
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\r\n",
+    "",
+    ""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74adb549-9e02-472d-a535-78a584853b52",
+   "metadata": {},
+   "source": [
+    "# 4. Information Stored in the Files\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "fd490270-59b8-4c1c-8b09-204defddf592",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "322bc926-69ff-4b93-9e95-5e8b85869c38",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## data.pkl\n",
+    "\n",
+    "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n",
+    "\n",
+    "\n",
+    "\n",
+    "### Structure of `data.pkl`\n",
+    "`data.pkl` has the following structure:\n",
+    "- **Column 0**: Contains the ID of each ChEBI data instance.\n",
+    "- **Column 1**: Contains the name of each ChEBI data instance.\n",
+    "- **Column 2**: Contains the SMILES representation of the chemical.\n",
+    "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n",
+    "\n",
+    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "d7d16247-092c-4e8d-96c2-ab23931cf766",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the data (rows x columns):  (129184, 1335)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>SMILES</th>\n",
+       "      <th>1722</th>\n",
+       "      <th>2468</th>\n",
+       "      <th>2571</th>\n",
+       "      <th>2580</th>\n",
+       "      <th>2634</th>\n",
+       "      <th>3098</th>\n",
+       "      <th>3992</th>\n",
+       "      <th>...</th>\n",
+       "      <th>143017</th>\n",
+       "      <th>143212</th>\n",
+       "      <th>143813</th>\n",
+       "      <th>146180</th>\n",
+       "      <th>147334</th>\n",
+       "      <th>156473</th>\n",
+       "      <th>166828</th>\n",
+       "      <th>166904</th>\n",
+       "      <th>167497</th>\n",
+       "      <th>167559</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>33429</td>\n",
+       "      <td>monoatomic monoanion</td>\n",
+       "      <td>[*-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>30151</td>\n",
+       "      <td>aluminide(1-)</td>\n",
+       "      <td>[Al-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>16042</td>\n",
+       "      <td>halide anion</td>\n",
+       "      <td>[*-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>17051</td>\n",
+       "      <td>fluoride</td>\n",
+       "      <td>[F-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>28741</td>\n",
+       "      <td>sodium fluoride</td>\n",
+       "      <td>[F-].[Na+]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 1335 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      id                  name      SMILES   1722   2468   2571   2580   2634  \\\n",
+       "0  33429  monoatomic monoanion        [*-]  False  False  False  False  False   \n",
+       "1  30151         aluminide(1-)       [Al-]  False  False  False  False  False   \n",
+       "2  16042          halide anion        [*-]  False  False  False  False  False   \n",
+       "3  17051              fluoride        [F-]  False  False  False  False  False   \n",
+       "4  28741       sodium fluoride  [F-].[Na+]  False  False  False  False  False   \n",
+       "\n",
+       "    3098   3992  ...  143017  143212  143813  146180  147334  156473  166828  \\\n",
+       "0  False  False  ...   False   False   False   False   False   False   False   \n",
+       "1  False  False  ...   False   False   False   False   False   False   False   \n",
+       "2  False  False  ...   False   False   False   False   False   False   False   \n",
+       "3  False  False  ...   False   False   False   False   False   False   False   \n",
+       "4  False  False  ...   False   False   False   False   False   False   False   \n",
+       "\n",
+       "   166904  167497  167559  \n",
+       "0   False   False   False  \n",
+       "1   False   False   False  \n",
+       "2   False   False   False  \n",
+       "3   False   False   False  \n",
+       "4   False   False   False  \n",
+       "\n",
+       "[5 rows x 1335 columns]"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n",
+    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
+    "pkl_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
+   "metadata": {},
+   "source": [
+    "# 6. Example Molecule: Different Encodings\n",
+    "\n",
+    "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
+    "\n",
+    "### Explanation:\n",
+    "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
+    "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b0f7974-f262-429c-b064-4207277e22ad",
+   "metadata": {},
+   "source": [
+    "# 7. Additional Useful Features\n",
+    "\n",
+    "- **Substructure Search**: `chebai` allows you to perform substructure searches within the ChEBI database.\n",
+    "- **Property Filters**: You can filter molecules based on specific properties, such as molecular weight or charge.\n",
+    "- **Visualization**: `chebai` provides tools for visualizing molecular structures directly within the notebook.\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "314801c7-9a1c-4247-9809-497f8481ac90",
+   "metadata": {},
+   "source": [
+    "# Conclusion\n",
+    "\n",
+    "This notebook provided an introduction to the `chebai` package, focusing on how data is structured and utilized. With this knowledge, you can start exploring chemical data more effectively using `chebai`."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (env_chebai)",
+   "language": "python",
+   "name": "env_chebai"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 830184f6886a42f293c2ff702c0509aff29ca9cb Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 27 Aug 2024 00:04:40 +0200
Subject: [PATCH 006/112] added information stored in files

---
 data_exploration.ipynb | 289 +++++++++++++++++++++++++++++++++++------
 1 file changed, 251 insertions(+), 38 deletions(-)

diff --git a/data_exploration.ipynb b/data_exploration.ipynb
index 6f1045a4..c4d60ab2 100644
--- a/data_exploration.ipynb
+++ b/data_exploration.ipynb
@@ -1,18 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "81559360-c8b8-462d-bfa1-6ae22bed1615",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import warnings\n",
-    "\n",
-    "# Ignore all warnings\n",
-    "warnings.filterwarnings(\"ignore\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b",
@@ -314,13 +301,51 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "fd490270-59b8-4c1c-8b09-204defddf592",
+   "cell_type": "markdown",
+   "id": "43329709-5134-4ce5-88e7-edd2176bf84d",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "import pandas as pd"
+    "## chebi.obo\n",
+    "\n",
+    "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n",
+    "\n",
+    "### Structure of `chebi.obo`\n",
+    "\n",
+    "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n",
+    "\n",
+    "#### Example of a Term Document\n",
+    "\n",
+    "```plaintext\n",
+    "[Term]\n",
+    "id: CHEBI:24867\n",
+    "name: monoatomic ion\n",
+    "subset: 3_STAR\n",
+    "synonym: \"monoatomic ions\" RELATED [ChEBI]\n",
+    "is_a: CHEBI:24870\n",
+    "is_a: CHEBI:33238\n",
+    "```\r\n",
+    "\r\n",
+    "\r\n",
+    "\r\n",
+    "### Breakdown of Attributes\r\n",
+    "\r\n",
+    "Each term document in the `chebi.obo` file consists of the following key attributes:\r\n",
+    "\r\n",
+    "- **`[Term]`**: \r\n",
+    "  - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\r\n",
+    "\r\n",
+    "- **`id: CHEBI:24867`**: \r\n",
+    "  - **Description**: A unique identifier for the chemical entity within the ChEBI database.\r\n",
+    "  - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\r\n",
+    "\r\n",
+    "- **`name: monoatomic ion`**: \r\n",
+    "  - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\r\n",
+    "  - **Example**: \"monoatomic ion\" is the name of `CHEBI:24867`; its `synonym` line lists \"monoatomic ions\" as a RELATED synonym, indicating a related term within the ChEBI ontology.\r\n",
+    "\r\n",
+    "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \r\n",
+    "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\r\n",
+    "  - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`,\r\n",
+    "    meaning that it is a more specific kind of each of those parent terms."
    ]
   },
   {
@@ -345,6 +370,16 @@
     "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "fd490270-59b8-4c1c-8b09-204defddf592",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 53,
@@ -566,50 +601,228 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",
+   "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07",
    "metadata": {},
    "source": [
-    "---"
+    "## `data.pt` File\n",
+    "\n",
+    "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n",
+    "\n",
+    "### Structure of `data.pt`\n",
+    "\n",
+    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
+    "\n",
+    "- **`features`**: \n",
+    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "\n",
+    "- **`labels`**: \n",
+    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type of loaded data: <class 'list'>\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n",
+    "print(\"Type of loaded data:\", type(data_pt))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n",
+      "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n",
+      "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 16042, 'group': None}\n",
+      "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n",
+      "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(5):\n",
+    "    print(data_pt[i])"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
+   "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
    "metadata": {},
    "source": [
-    "# 6. Example Molecule: Different Encodings\n",
+    "## `classes.txt` File\n",
     "\n",
-    "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
+    "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n",
     "\n",
-    "### Explanation:\n",
-    "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
-    "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
-    "\n",
-    "---"
+    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1722\n",
+      "2468\n",
+      "2571\n",
+      "2580\n",
+      "2634\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n",
+    "    for i in range(5):\n",
+    "        line = file.readline()\n",
+    "        print(line.strip())"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "5b0f7974-f262-429c-b064-4207277e22ad",
+   "id": "b058714f-e434-4367-89b9-74c129ac727f",
+   "metadata": {},
+   "source": [
+    "## `splits.csv`\r\n",
+    "\r\n",
+    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>split</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>33429</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>30151</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>17051</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>32129</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>30340</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      id  split\n",
+       "0  33429  train\n",
+       "1  30151  train\n",
+       "2  17051  train\n",
+       "3  32129  train\n",
+       "4  30340  train"
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n",
+    "csv_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",
    "metadata": {},
    "source": [
-    "# 7. Additional Useful Features\n",
-    "\n",
-    "- **Substructure Search**: `chebai` allows you to perform substructure searches within the ChEBI database.\n",
-    "- **Property Filters**: You can filter molecules based on specific properties, such as molecular weight or charge.\n",
-    "- **Visualization**: `chebai` provides tools for visualizing molecular structures directly within the notebook.\n",
-    "\n",
     "---"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "314801c7-9a1c-4247-9809-497f8481ac90",
+   "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
    "metadata": {},
    "source": [
-    "# Conclusion\n",
+    "# 6. Example Molecule: Different Encodings\n",
+    "\n",
+    "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
     "\n",
-    "This notebook provided an introduction to the `chebai` package, focusing on how data is structured and utilized. With this knowledge, you can start exploring chemical data more effectively using `chebai`."
+    "### Explanation:\n",
+    "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
+    "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
+    "\n",
+    "---"
    ]
   }
  ],

From 7005a69c420b95cfe4e0ad4a23414ccc90858199 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 27 Aug 2024 00:29:31 +0200
Subject: [PATCH 007/112] Molecule: Different Encodings

---
 data_exploration.ipynb | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/data_exploration.ipynb b/data_exploration.ipynb
index c4d60ab2..e36fc1fe 100644
--- a/data_exploration.ipynb
+++ b/data_exploration.ipynb
@@ -353,8 +353,7 @@
    "id": "322bc926-69ff-4b93-9e95-5e8b85869c38",
    "metadata": {},
    "source": [
-    "\n",
-    "## data.pkl\n",
+    "## `data.pkl` File\n",
     "\n",
     "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n",
     "\n",
@@ -716,7 +715,7 @@
    "id": "b058714f-e434-4367-89b9-74c129ac727f",
    "metadata": {},
    "source": [
-    "## `splits.csv`\r\n",
+    "## `splits.csv` File\r\n",
     "\r\n",
     "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n"
    ]
@@ -814,7 +813,7 @@
    "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
    "metadata": {},
    "source": [
-    "# 6. Example Molecule: Different Encodings\n",
+    "# 5. Example Molecule: Different Encodings\n",
     "\n",
     "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
     "\n",
@@ -822,7 +821,40 @@
     "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
     "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
     "\n",
-    "---"
+    "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\r\n",
+    "\r\n",
+    "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\r\n",
+    "   - **Benzene SMILES**: `c1ccccc1`\r\n",
+    "   - **Explanation**: \r\n",
+    "     - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\r\n",
+    "\r\n",
+    "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\r\n",
+    "   - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\r\n",
+    "   - **Explanation**: \r\n",
+    "     - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\r\n",
+    "     - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\r\n",
+    "\r\n",
+    "### 3. **InChI (IUPAC International Chemical Identifier)**\r\n",
+    "   - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\r\n",
+    "   - **Explanation**: \r\n",
+    "     - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\r\n",
+    "\r\n",
+    "### 4. **InChIKey**\r\n",
+    "   - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\r\n",
+    "   - **Explanation**: \r\n",
+    "     - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\r\n",
+    "\r\n",
+    "### 5. **Canonical SMILES**\r\n",
+    "   - **Benzene Canonical SMILES**: `c1ccccc1`\r\n",
+    "   - **Explanation**:\r\n",
+    "     - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\r\n",
+    "\r\n",
+    "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\r\n",
+    "   - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\r\n",
+    "   - **Explanation**: \r\n",
+    "     - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\r\n",
+    "\r\n",
+    "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics.d by different computational tools."
    ]
   }
  ],

From 13aa945938079e265aa28947e9509a5484d03a2d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 27 Aug 2024 11:24:05 +0200
Subject: [PATCH 008/112] add info related to protein dataset

---
 data_exploration.ipynb | 418 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 418 insertions(+)

diff --git a/data_exploration.ipynb b/data_exploration.ipynb
index e36fc1fe..b0c9e78f 100644
--- a/data_exploration.ipynb
+++ b/data_exploration.ipynb
@@ -856,6 +856,424 @@
     "\r\n",
     "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics.d by different computational tools."
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "93e328cf-09f9-4694-b175-28320590937d",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde",
+   "metadata": {},
+   "source": [
+    "# Information for Protein Dataset\r\n",
+    "\r\n",
+    "The protein dataset follows the same file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\r\n",
+    "\r\n",
+    "### Configuration Parameters\r\n",
+    "\r\n",
+    "Data classes related to proteins can be configured using the following main parameters:\r\n",
+    "\r\n",
+    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\r\n",
+    "\r\n",
+    "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\r\n",
+    "\r\n",
+    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\r\n",
+    "\r\n",
+    "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n",
+    "\r\n",
+    "### Available GOUniProt Data Classes\r\n",
+    "\r\n",
+    "#### `GOUniProtOver250`\r\n",
+    "\r\n",
+    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\r\n",
+    "\r\n",
+    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n",
+    "\r\n",
+    "#### `GOUniProtOver50`\r\n",
+    "\r\n",
+    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\r\n",
+    "\r\n",
+    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n",
+    "\r\n",
+    "### Instantiation Example\r\n",
+    "```python\r\n",
+    "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\r\n",
+    "go_class = GOUniProtOver250()\r\n",
+    "```\r\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ffca830-bc0b-421c-8054-0860c95c10f2",
+   "metadata": {},
+   "source": [
+    "## GOUniProt Data File Structure\r\n",
+    "\r\n",
+    "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\r\n",
+    "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\r\n",
+    "   - **File Paths**:\r\n",
+    "     - `data/GO_UniProt/raw/${filename}.obo`\r\n",
+    "     - `data/GO_UniProt/raw/${filename}.dat`\r\n",
+    "\r\n",
+    "2. **`data.pkl`**\r\n",
+    "   - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (amino acid sequences), and class columns with boolean values.\r\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\r\n",
+    "\r\n",
+    "3. **`data.pt`**\r\n",
+    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\r\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\r\n",
+    "\r\n",
+    "4. **`classes.txt`**\r\n",
+    "   - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\r\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\r\n",
+    "\r\n",
+    "5. **`splits.csv`**\r\n",
+    "   - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\r\n",
+    "\r\n",
+    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`.\r\n",
+    "Otherwise, it will just be `${dataset_name}`.\r\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "61bc261e-2328-4968-aca6-14c48bb24348",
+   "metadata": {},
+   "source": [
+    "## data.pkl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "31df4ee7-4c03-4ea2-9798-5e5082a74c2b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the data (rows x columns):  (27459, 1050)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>swiss_id</th>\n",
+       "      <th>accession</th>\n",
+       "      <th>go_ids</th>\n",
+       "      <th>sequence</th>\n",
+       "      <th>41</th>\n",
+       "      <th>75</th>\n",
+       "      <th>122</th>\n",
+       "      <th>165</th>\n",
+       "      <th>209</th>\n",
+       "      <th>226</th>\n",
+       "      <th>...</th>\n",
+       "      <th>2000145</th>\n",
+       "      <th>2000146</th>\n",
+       "      <th>2000147</th>\n",
+       "      <th>2000241</th>\n",
+       "      <th>2000243</th>\n",
+       "      <th>2000377</th>\n",
+       "      <th>2001020</th>\n",
+       "      <th>2001141</th>\n",
+       "      <th>2001233</th>\n",
+       "      <th>2001234</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>14331_ARATH</td>\n",
+       "      <td>P42643,Q945M2,Q9M0S7</td>\n",
+       "      <td>[19222]</td>\n",
+       "      <td>MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>14331_CAEEL</td>\n",
+       "      <td>P41932,Q21537</td>\n",
+       "      <td>[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...</td>\n",
+       "      <td>MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>14331_MAIZE</td>\n",
+       "      <td>P49106</td>\n",
+       "      <td>[3677, 5634, 10468, 44877]</td>\n",
+       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>14332_MAIZE</td>\n",
+       "      <td>Q01526</td>\n",
+       "      <td>[3677, 5634, 10468, 44877]</td>\n",
+       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>14333_ARATH</td>\n",
+       "      <td>P42644,F4KBI7,Q945L2</td>\n",
+       "      <td>[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...</td>\n",
+       "      <td>MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 1050 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       swiss_id             accession  \\\n",
+       "8   14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
+       "9   14331_CAEEL         P41932,Q21537   \n",
+       "10  14331_MAIZE                P49106   \n",
+       "13  14332_MAIZE                Q01526   \n",
+       "14  14333_ARATH  P42644,F4KBI7,Q945L2   \n",
+       "\n",
+       "                                               go_ids  \\\n",
+       "8                                             [19222]   \n",
+       "9   [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...   \n",
+       "10                         [3677, 5634, 10468, 44877]   \n",
+       "13                         [3677, 5634, 10468, 44877]   \n",
+       "14  [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...   \n",
+       "\n",
+       "                                             sequence     41     75    122  \\\n",
+       "8   MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
+       "9   MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
+       "10  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
+       "13  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
+       "14  MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...  False  False  False   \n",
+       "\n",
+       "      165    209    226  ...  2000145  2000146  2000147  2000241  2000243  \\\n",
+       "8   False  False  False  ...    False    False    False    False    False   \n",
+       "9   False  False  False  ...    False    False    False    False    False   \n",
+       "10  False  False  False  ...    False    False    False    False    False   \n",
+       "13  False  False  False  ...    False    False    False    False    False   \n",
+       "14  False  False  False  ...    False    False    False    False    False   \n",
+       "\n",
+       "    2000377  2001020  2001141  2001233  2001234  \n",
+       "8     False    False    False    False    False  \n",
+       "9     False    False    False    False    False  \n",
+       "10    False    False    False    False    False  \n",
+       "13    False    False    False    False    False  \n",
+       "14    False    False    False    False    False  \n",
+       "\n",
+       "[5 rows x 1050 columns]"
+      ]
+     },
+     "execution_count": 123,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n",
+    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
+    "pkl_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "be0078fd-bcf1-4d4c-b8c6-c84e3aeac99c",
+   "metadata": {},
+   "source": [
+    "## data.pt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "id": "a70f9c35-daca-4728-a9ea-b1212866f421",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type of loaded data: <class 'list'>\n",
+      "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n",
+    "print(\"Type of loaded data:\", type(data_pt))\n",
+    "for i in range(1):\n",
+    "    print(data_pt[i])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "380049c1-2963-4223-b698-a7b59b9fe595",
+   "metadata": {},
+   "source": [
+    "## Protein Representation Using Amino Acid Sequence Notation\n",
+    "\n",
+    "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n",
+    "\n",
+    "### Example Protein Sequence\n",
+    "\n",
+    "Protein: **Lysozyme C** from **Gallus gallus** (Chicken).  \n",
+    "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n",
+    "\n",
+    "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n",
+    "- **Sequence Length**: 147\n",
+    "\n",
+    "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n",
+    "\n",
+    "### The 20 Amino Acids and Their One-Letter Notations\n",
+    "\n",
+    "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n",
+    "\n",
+    "| One-Letter Notation | Amino Acid Name      | Description                                             |\n",
+    "|---------------------|----------------------|---------------------------------------------------------|\n",
+    "| **A**               | Alanine              | Non-polar, aliphatic amino acid.                        |\n",
+    "| **C**               | Cysteine             | Polar, contains a thiol group, forms disulfide bonds.   |\n",
+    "| **D**               | Aspartic Acid        | Acidic, negatively charged at physiological pH.         |\n",
+    "| **E**               | Glutamic Acid        | Acidic, negatively charged at physiological pH.         |\n",
+    "| **F**               | Phenylalanine        | Aromatic, non-polar.                                    |\n",
+    "| **G**               | Glycine              | Smallest amino acid, non-polar.                         |\n",
+    "| **H**               | Histidine            | Polar, positively charged, can participate in enzyme active sites. |\n",
+    "| **I**               | Isoleucine           | Non-polar, aliphatic.                                   |\n",
+    "| **K**               | Lysine               | Basic, positively charged at physiological pH.          |\n",
+    "| **L**               | Leucine              | Non-polar, aliphatic.                                   |\n",
+    "| **M**               | Methionine           | Non-polar, contains sulfur, encoded by the start codon (AUG) in mRNA translation. |\n",
+    "| **N**               | Asparagine           | Polar, uncharged.                                       |\n",
+    "| **P**               | Proline              | Non-polar, introduces kinks in protein chains.          |\n",
+    "| **Q**               | Glutamine            | Polar, uncharged.                                       |\n",
+    "| **R**               | Arginine             | Basic, positively charged, involved in binding phosphate groups. |\n",
+    "| **S**               | Serine               | Polar, can be phosphorylated.                           |\n",
+    "| **T**               | Threonine            | Polar, can be phosphorylated.                           |\n",
+    "| **V**               | Valine               | Non-polar, aliphatic.                                   |\n",
+    "| **W**               | Tryptophan           | Aromatic, non-polar, largest amino acid.                |\n",
+    "| **Y**               | Tyrosine             | Aromatic, polar, can be phosphorylated.                 |\n",
+    "\n",
+    "### Understanding Protein Sequences\n",
+    "\n",
+    "In the example sequence above, each letter represents one of the amino acids listed in the table. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n",
+    "\n",
+    "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n",
+    "\n",
+    "\n",
+    "_Note_: For more background on amino acid sequences, see https://en.wikipedia.org/wiki/Protein_primary_structure"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "702359d6-5338-4391-b196-2328ba5676a1",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
   }
  ],
  "metadata": {

From 0e4814fde3f5b365587912729eba6ef5aba131c6 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 27 Aug 2024 12:33:37 +0200
Subject: [PATCH 009/112] fix - jupyter markdown cells formatting issue

- https://github.com/jupyter/notebook/issues/7002
- Fix using notebook formatter provided by pycharm professional
---
 data_exploration.ipynb | 512 ++++++++++++++++++++---------------------
 1 file changed, 252 insertions(+), 260 deletions(-)

diff --git a/data_exploration.ipynb b/data_exploration.ipynb
index b0c9e78f..8cd834b1 100644
--- a/data_exploration.ipynb
+++ b/data_exploration.ipynb
@@ -14,11 +14,11 @@
   },
   {
    "cell_type": "markdown",
-   "id": "33275d3c-cdbf-4c1f-aa04-f135511f3643",
+   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d",
    "metadata": {},
    "source": [
-    "# 1. Instantiation of a Data Class\r\n",
-    "\r\n",
+    "# 1. Instantiation of a Data Class\n",
+    "\n",
     "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n",
     "### Inheritance Hierarchy\n",
     "\n",
@@ -29,55 +29,54 @@
     "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
     "\n",
     "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
-    ".\r\n",
-    "\r\n",
-    "### Explanation\r\n",
-    "a ChEBI data classiData` class can be configured with the following main parameters:\r\n",
-    "\r\n",
-    "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\r\n",
-    "\r\n",
-    "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\r\n",
-    "\r\n",
-    "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\r\n",
-    "\r\n",
-    "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\r\n",
-    "\r\n",
-    "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\r\n",
-    "\r\n",
-    "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n",
-    "\r\n",
-    "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\r\n",
-    "\r\n",
-    "### Additional Input Parameters\r\n",
-    "\r\n",
-    "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\r\n",
-    "\r\n",
-    "- **batch_size (int)**: The batch size for data loading. Default is `1`.\r\n",
-    "\r\n",
-    "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\r\n",
-    "\r\n",
-    "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\r\n",
-    "\r\n",
-    "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\r\n",
-    "\r\n",
-    "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\r\n",
-    "\r\n",
-    "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\r\n",
-    "\r\n",
-    "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\r\n",
-    "\r\n",
-    "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\r\n",
-    "\r\n",
-    "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\r\n",
-    "\r\n",
-    "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\r\n",
-    "\r\n",
-    "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\r\n",
-    "\r\n",
-    "- **kwargs**: Additional keyword arguments.\r\n",
-    "\r\n",
-    "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\r\n",
-    "ining and validation.\r\n"
+    "\n",
+    "\n",
+    "### Explanation\n",
+    "A ChEBI data class can be configured with the following main parameters:\n",
+    "\n",
+    "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
+    "\n",
+    "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n",
+    "\n",
+    "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\n",
+    "\n",
+    "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\n",
+    "\n",
+    "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\n",
+    "\n",
+    "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\n",
+    "\n",
+    "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\n",
+    "\n",
+    "### Additional Input Parameters\n",
+    "\n",
+    "The `XYBaseDataModule` class, which ChEBI data classes build on, includes several important parameters for data loading and processing:\n",
+    "\n",
+    "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n",
+    "\n",
+    "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\n",
+    "\n",
+    "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\n",
+    "\n",
+    "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\n",
+    "\n",
+    "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\n",
+    "\n",
+    "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\n",
+    "\n",
+    "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\n",
+    "\n",
+    "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\n",
+    "\n",
+    "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\n",
+    "\n",
+    "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\n",
+    "\n",
+    "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\n",
+    "\n",
+    "- **kwargs**: Additional keyword arguments.\n",
+    "\n",
+    "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\n"
    ]
   },
   {
@@ -151,31 +150,29 @@
    "id": "1655d489-25fe-46de-9feb-eeca5d36936f",
    "metadata": {},
    "source": [
-    "# 2. Preparation / Setup Methods\r\n",
-    "\r\n",
-    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\r\n",
-    "\r\n",
-    "### Why is Preparation Needed?\r\n",
-    "\r\n",
-    "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\r\n",
-    "- **Data Integrity**: It ensures that the data files are up-to-date and compatible with the specified ChEBI version.\r\n",
-    "\r\n",
-    "### Main Methods for Data Preprocessing\r\n",
-    "\r\n",
-    "The data preprocessing in a data class involves two main methods:\r\n",
-    "\r\n",
-    "1. **`prepare_data` Method**:\r\n",
-    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\r\n",
-    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\r\n",
-    "\r\n",
-    "2. **`setup` Method**:\r\n",
-    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\r\n",
-    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\r\n",
-    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\r\n",
-    "\r\n",
-    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes.\r\n",
-    "alidation processes.\r\n",
-    "processed(data_df, processed_name)\r\n"
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
+    "\n",
+    "### Why is Preparation Needed?\n",
+    "\n",
+    "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n",
+    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
+    "\n",
+    "### Main Methods for Data Preprocessing\n",
+    "\n",
+    "The data preprocessing in a data class involves two main methods:\n",
+    "\n",
+    "1. **`prepare_data` Method**:\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
+    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
+    "\n",
+    "2. **`setup` Method**:\n",
+    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
    ]
   },
   {
@@ -221,67 +218,65 @@
    "metadata": {},
    "source": [
     "# 3. Different Data Files Created and their Structure\n",
-    "\r\n",
-    "\r\n",
-    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\r\n",
-    "\r\n",
-    "### Data Files\r\n",
-    "\r\n",
-    "1. **`Raw Data Files`**: (e.g., `.obo` file)\r\n",
-    "   - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n",
-    "\r\n",
-    "2. **`data.pkl`**\r\n",
-    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n",
-    "\r\n",
-    "3. **`data.pt`**\r\n",
-    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n",
-    "\r\n",
-    "4. **`classes.txt`**\r\n",
-    "   - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\r\n",
-    "\r\n",
-    "5. **`splits.csv`**\r\n",
-    "   - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\r\n",
-    "\r\n",
-    "### File Structure and Preprocessing Stages\r\n",
-    "\r\n",
-    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\r\n",
-    "\r\n",
-    "1. **Raw Data Stage**:\r\n",
-    "   - **File**: `chebi.obo`\r\n",
-    "   - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n",
-    "\r\n",
-    "2. **Processed Data Stage 1**:\r\n",
-    "   - **File**: `data.pkl`\r\n",
-    "   - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n",
-    "   - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\r\n",
-    "\r\n",
-    "3. **Processed Data Stage 2**:\r\n",
-    "   - **File**: `data.pt`\r\n",
-    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\r\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n",
-    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\r\n",
-    "\r\n",
-    "### Data Splits\r\n",
-    "\r\n",
-    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\r\n",
-    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\r\n",
-    "\r\n",
-    "### Summary of File Paths\r\n",
-    "\r\n",
-    "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\r\n",
-    "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\r\n",
-    "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\r\n",
-    "\r\n",
-    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\r\n",
-    "that each step is well-documented and reproducible.\r\n",
-    "sing, from raw input to model-ready formats.\r\n"
+    "\n",
+    "\n",
+    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n",
+    "\n",
+    "### Data Files\n",
+    "\n",
+    "1. **`Raw Data Files`**: (e.g., `.obo` file)\n",
+    "   - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
+    "\n",
+    "2. **`data.pkl`**\n",
+    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
+    "\n",
+    "3. **`data.pt`**\n",
+    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "\n",
+    "4. **`classes.txt`**\n",
+    "   - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "5. **`splits.csv`**\n",
+    "   - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n",
+    "\n",
+    "### File Structure and Preprocessing Stages\n",
+    "\n",
+    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
+    "\n",
+    "1. **Raw Data Stage**:\n",
+    "   - **File**: `chebi.obo`\n",
+    "   - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
+    "\n",
+    "2. **Processed Data Stage 1**:\n",
+    "   - **File**: `data.pkl`\n",
+    "   - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
+    "   - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n",
+    "\n",
+    "3. **Processed Data Stage 2**:\n",
+    "   - **File**: `data.pt`\n",
+    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
+    "\n",
+    "### Data Splits\n",
+    "\n",
+    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
+    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n",
+    "\n",
+    "### Summary of File Paths\n",
+    "\n",
+    "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n",
+    "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n",
+    "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n",
+    "\n",
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments."
    ]
   },
   {
@@ -323,29 +318,27 @@
     "synonym: \"monoatomic ions\" RELATED [ChEBI]\n",
     "is_a: CHEBI:24870\n",
     "is_a: CHEBI:33238\n",
-    "```0\r\n",
-    "is_a: CHEBI:3323Relevant 8\r\n",
-    "```\r\n",
-    "\r\n",
-    "### Breakdown of Attributes\r\n",
-    "\r\n",
-    "Each term document in the `chebi.obo` file consists of the following key attributes:\r\n",
-    "\r\n",
-    "- **`[Term]`**: \r\n",
-    "  - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\r\n",
-    "\r\n",
-    "- **`id: CHEBI:24867`**: \r\n",
-    "  - **Description**: A unique identifier for the chemical entity within the ChEBI database.\r\n",
-    "  - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\r\n",
-    "\r\n",
-    "- **`name: monoatomic ion`**: \r\n",
-    "  - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\r\n",
-    "  - **Example**: \"monoatomic ion\" is the namcating a related term within the ChEBI ontology.\r\n",
-    "\r\n",
-    "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \r\n",
-    "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\r\n",
-    "  - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaent stages of preprocessing, from raw input files to processed, model-ready formats.\r\n",
-    "```"
+    "\n",
+    "```\n",
+    "\n",
+    "### Breakdown of Attributes\n",
+    "\n",
+    "Each term document in the `chebi.obo` file consists of the following key attributes:\n",
+    "\n",
+    "- **`[Term]`**: \n",
+    "  - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\n",
+    "\n",
+    "- **`id: CHEBI:24867`**: \n",
+    "  - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n",
+    "  - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n",
+    "\n",
+    "- **`name: monoatomic ion`**: \n",
+    "  - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n",
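+    "  - **Example**: \"monoatomic ion\" is the name of this entity; the `synonym: \"monoatomic ions\" RELATED [ChEBI]` line gives an alternative name, indicating a related term within the ChEBI ontology.\n",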
+    "\n",
+    "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n",
+    "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n",
+    "  - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning both of those terms are its parents in the ontology hierarchy."
    ]
   },
   {
@@ -715,9 +708,9 @@
    "id": "b058714f-e434-4367-89b9-74c129ac727f",
    "metadata": {},
    "source": [
-    "## `splits.csv` File\r\n",
-    "\r\n",
-    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n"
+    "## `splits.csv` File\n",
+    "\n",
+    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n"
    ]
   },
   {
@@ -821,40 +814,40 @@
     "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
     "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
     "\n",
-    "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\r\n",
-    "\r\n",
-    "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\r\n",
-    "   - **Benzene SMILES**: `c1ccccc1`\r\n",
-    "   - **Explanation**: \r\n",
-    "     - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\r\n",
-    "\r\n",
-    "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\r\n",
-    "   - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\r\n",
-    "   - **Explanation**: \r\n",
-    "     - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\r\n",
-    "     - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\r\n",
-    "\r\n",
-    "### 3. **InChI (IUPAC International Chemical Identifier)**\r\n",
-    "   - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\r\n",
-    "   - **Explanation**: \r\n",
-    "     - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\r\n",
-    "\r\n",
-    "### 4. **InChIKey**\r\n",
-    "   - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\r\n",
-    "   - **Explanation**: \r\n",
-    "     - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\r\n",
-    "\r\n",
-    "### 5. **Canonical SMILES**\r\n",
-    "   - **Benzene Canonical SMILES**: `c1ccccc1`\r\n",
-    "   - **Explanation**:\r\n",
-    "     - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\r\n",
-    "\r\n",
-    "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\r\n",
-    "   - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\r\n",
-    "   - **Explanation**: \r\n",
-    "     - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\r\n",
-    "\r\n",
-    "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics.d by different computational tools."
+    "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n",
+    "\n",
+    "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n",
+    "   - **Benzene SMILES**: `c1ccccc1`\n",
+    "   - **Explanation**: \n",
+    "     - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n",
+    "\n",
+    "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n",
+    "   - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n",
+    "   - **Explanation**: \n",
+    "     - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n",
+    "     - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n",
+    "\n",
+    "### 3. **InChI (IUPAC International Chemical Identifier)**\n",
+    "   - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\n",
+    "   - **Explanation**: \n",
+    "     - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\n",
+    "\n",
+    "### 4. **InChIKey**\n",
+    "   - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\n",
+    "   - **Explanation**: \n",
+    "     - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\n",
+    "\n",
+    "### 5. **Canonical SMILES**\n",
+    "   - **Benzene Canonical SMILES**: `c1ccccc1`\n",
+    "   - **Explanation**:\n",
+    "     - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\n",
+    "\n",
+    "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\n",
+    "   - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\n",
+    "   - **Explanation**: \n",
+    "     - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\n",
+    "\n",
+    "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics."
    ]
   },
   {
@@ -870,41 +863,41 @@
    "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde",
    "metadata": {},
    "source": [
-    "# Information for Protein Dataset\r\n",
-    "\r\n",
-    "The protein dataset follows thsimilarme file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\r\n",
-    "\r\n",
-    "### Configuration Parameters\r\n",
-    "\r\n",
-    "Data classes related to proteins can be configured using the following main parameters:\r\n",
-    "\r\n",
-    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\r\n",
-    "\r\n",
-    "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\r\n",
-    "\r\n",
-    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\r\n",
-    "\r\n",
-    "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n",
-    "\r\n",
-    "### Available GOUniProt Data Classes\r\n",
-    "\r\n",
-    "#### `GOUniProtOver250`\r\n",
-    "\r\n",
-    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\r\n",
-    "\r\n",
-    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n",
-    "\r\n",
-    "#### `GOUniProtOver50`\r\n",
-    "\r\n",
-    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\r\n",
-    "\r\n",
-    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n",
-    "\r\n",
-    "### Instantiation Example\r\n",
-    "\r\n",
-    "```python\r\n",
-    "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\r\n",
-    "go_class = GOUniProtOver250()\r\n"
+    "# Information for Protein Dataset\n",
+    "\n",
+    "The protein dataset follows the same file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\n",
+    "\n",
+    "### Configuration Parameters\n",
+    "\n",
+    "Data classes related to proteins can be configured using the following main parameters:\n",
+    "\n",
+    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n",
+    "\n",
+    "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\n",
+    "\n",
+    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
+    "\n",
+    "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\n",
+    "\n",
+    "### Available GOUniProt Data Classes\n",
+    "\n",
+    "#### `GOUniProtOver250`\n",
+    "\n",
+    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n",
+    "\n",
+    "#### `GOUniProtOver50`\n",
+    "\n",
+    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n",
+    "\n",
+    "### Instantiation Example\n",
+    "\n",
+    "```python\n",
+    "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\n",
+    "go_class = GOUniProtOver250()\n"
    ]
   },
   {
@@ -912,32 +905,31 @@
    "id": "2ffca830-bc0b-421c-8054-0860c95c10f2",
    "metadata": {},
    "source": [
-    "## GOUniProt Data File Structure\r\n",
-    "\r\n",
-    "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\r\n",
-    "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\r\n",
-    "   - **File Paths**:\r\n",
-    "     - `data/GO_UniProt/raw/${filename}.obo`\r\n",
-    "     - `data/GO_UniProt/raw/${filename}.dat`\r\n",
-    "\r\n",
-    "2. **`data.pkl`**\r\n",
-    "   - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\r\n",
-    "\r\n",
-    "3. **`data.pt`**\r\n",
-    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\r\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\r\n",
-    "\r\n",
-    "4. **`classes.txt`**\r\n",
-    "   - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\r\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\r\n",
-    "\r\n",
-    "5. **`splits.csv`**\r\n",
-    "   - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\r\n",
-    "\r\n",
-    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\r\n",
-    "}/processed/splits.csv`\r\n"
+    "## GOUniProt Data File Structure\n",
+    "\n",
+    "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n",
+    "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n",
+    "   - **File Paths**:\n",
+    "     - `data/GO_UniProt/raw/${filename}.obo`\n",
+    "     - `data/GO_UniProt/raw/${filename}.dat`\n",
+    "\n",
+    "2. **`data.pkl`**\n",
+    "   - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
+    "\n",
+    "3. **`data.pt`**\n",
+    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "\n",
+    "4. **`classes.txt`**\n",
+    "   - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "5. **`splits.csv`**\n",
+    "   - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n",
+    "\n",
+    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n"
    ]
   },
   {
@@ -1259,7 +1251,7 @@
     "\n",
     "### Understanding Protein Sequences\n",
     "\n",
-    "In the example sequence `MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQGQL`, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n",
+    "In the example sequence, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n",
     "\n",
     "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n",
     "\n",

From 8539f3bc3f1376dcf98eecfa06de6258f7a0b77a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 27 Aug 2024 12:34:47 +0200
Subject: [PATCH 010/112] move to tutorials dir

---
 data_exploration.ipynb => tutorials/data_exploration.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename data_exploration.ipynb => tutorials/data_exploration.ipynb (100%)

diff --git a/data_exploration.ipynb b/tutorials/data_exploration.ipynb
similarity index 100%
rename from data_exploration.ipynb
rename to tutorials/data_exploration.ipynb

From cc5bc08d31ca7bbd1731144f96e44647ace78f82 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 29 Aug 2024 21:07:45 +0200
Subject: [PATCH 011/112] move previous tests to integration dir

---
 tests/integration/__init__.py                       |   3 +++
 tests/{ => integration}/testChebiData.py            |   0
 .../{ => integration}/testChebiDynamicDataSplits.py |   0
 .../testCustomBalancedAccuracyMetric.py             |   0
 tests/{ => integration}/testCustomMacroF1Metric.py  |   0
 tests/{ => integration}/testPubChemData.py          |   0
 tests/{ => integration}/testTox21MolNetData.py      |   0
 .../test_data/ChEBIOver100_test/labels000.pt        | Bin
 .../test_data/ChEBIOver100_test/labels001.pt        | Bin
 .../test_data/ChEBIOver100_test/labels002.pt        | Bin
 .../test_data/ChEBIOver100_test/labels003.pt        | Bin
 .../test_data/ChEBIOver100_test/labels004.pt        | Bin
 .../test_data/ChEBIOver100_test/labels005.pt        | Bin
 .../test_data/ChEBIOver100_test/labels006.pt        | Bin
 .../test_data/ChEBIOver100_test/labels007.pt        | Bin
 .../test_data/ChEBIOver100_test/labels008.pt        | Bin
 .../test_data/ChEBIOver100_test/labels009.pt        | Bin
 .../test_data/ChEBIOver100_test/labels010.pt        | Bin
 .../test_data/ChEBIOver100_test/labels011.pt        | Bin
 .../test_data/ChEBIOver100_test/labels012.pt        | Bin
 .../test_data/ChEBIOver100_test/labels013.pt        | Bin
 .../test_data/ChEBIOver100_test/labels014.pt        | Bin
 .../test_data/ChEBIOver100_test/labels015.pt        | Bin
 .../test_data/ChEBIOver100_test/labels016.pt        | Bin
 .../test_data/ChEBIOver100_test/labels017.pt        | Bin
 .../test_data/ChEBIOver100_test/labels018.pt        | Bin
 .../test_data/ChEBIOver100_test/labels019.pt        | Bin
 .../test_data/ChEBIOver100_test/preds000.pt         | Bin
 .../test_data/ChEBIOver100_test/preds001.pt         | Bin
 .../test_data/ChEBIOver100_test/preds002.pt         | Bin
 .../test_data/ChEBIOver100_test/preds003.pt         | Bin
 .../test_data/ChEBIOver100_test/preds004.pt         | Bin
 .../test_data/ChEBIOver100_test/preds005.pt         | Bin
 .../test_data/ChEBIOver100_test/preds006.pt         | Bin
 .../test_data/ChEBIOver100_test/preds007.pt         | Bin
 .../test_data/ChEBIOver100_test/preds008.pt         | Bin
 .../test_data/ChEBIOver100_test/preds009.pt         | Bin
 .../test_data/ChEBIOver100_test/preds010.pt         | Bin
 .../test_data/ChEBIOver100_test/preds011.pt         | Bin
 .../test_data/ChEBIOver100_test/preds012.pt         | Bin
 .../test_data/ChEBIOver100_test/preds013.pt         | Bin
 .../test_data/ChEBIOver100_test/preds014.pt         | Bin
 .../test_data/ChEBIOver100_test/preds015.pt         | Bin
 .../test_data/ChEBIOver100_test/preds016.pt         | Bin
 .../test_data/ChEBIOver100_test/preds017.pt         | Bin
 .../test_data/ChEBIOver100_test/preds018.pt         | Bin
 .../test_data/ChEBIOver100_test/preds019.pt         | Bin
 47 files changed, 3 insertions(+)
 create mode 100644 tests/integration/__init__.py
 rename tests/{ => integration}/testChebiData.py (100%)
 rename tests/{ => integration}/testChebiDynamicDataSplits.py (100%)
 rename tests/{ => integration}/testCustomBalancedAccuracyMetric.py (100%)
 rename tests/{ => integration}/testCustomMacroF1Metric.py (100%)
 rename tests/{ => integration}/testPubChemData.py (100%)
 rename tests/{ => integration}/testTox21MolNetData.py (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels000.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels001.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels002.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels003.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels004.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels005.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels006.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels007.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels008.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels009.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels010.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels011.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels012.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels013.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels014.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels015.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels016.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels017.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels018.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/labels019.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds000.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds001.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds002.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds003.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds004.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds005.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds006.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds007.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds008.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds009.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds010.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds011.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds012.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds013.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds014.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds015.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds016.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds017.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds018.pt (100%)
 rename tests/{ => integration}/test_data/ChEBIOver100_test/preds019.pt (100%)

diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..caa8759f
--- /dev/null
+++ b/tests/integration/__init__.py
@@ -0,0 +1,3 @@
+"""
+This directory contains integration tests that cover the overall behavior of the data preprocessing tool.
+"""
diff --git a/tests/testChebiData.py b/tests/integration/testChebiData.py
similarity index 100%
rename from tests/testChebiData.py
rename to tests/integration/testChebiData.py
diff --git a/tests/testChebiDynamicDataSplits.py b/tests/integration/testChebiDynamicDataSplits.py
similarity index 100%
rename from tests/testChebiDynamicDataSplits.py
rename to tests/integration/testChebiDynamicDataSplits.py
diff --git a/tests/testCustomBalancedAccuracyMetric.py b/tests/integration/testCustomBalancedAccuracyMetric.py
similarity index 100%
rename from tests/testCustomBalancedAccuracyMetric.py
rename to tests/integration/testCustomBalancedAccuracyMetric.py
diff --git a/tests/testCustomMacroF1Metric.py b/tests/integration/testCustomMacroF1Metric.py
similarity index 100%
rename from tests/testCustomMacroF1Metric.py
rename to tests/integration/testCustomMacroF1Metric.py
diff --git a/tests/testPubChemData.py b/tests/integration/testPubChemData.py
similarity index 100%
rename from tests/testPubChemData.py
rename to tests/integration/testPubChemData.py
diff --git a/tests/testTox21MolNetData.py b/tests/integration/testTox21MolNetData.py
similarity index 100%
rename from tests/testTox21MolNetData.py
rename to tests/integration/testTox21MolNetData.py
diff --git a/tests/test_data/ChEBIOver100_test/labels000.pt b/tests/integration/test_data/ChEBIOver100_test/labels000.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels000.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels000.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels001.pt b/tests/integration/test_data/ChEBIOver100_test/labels001.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels001.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels001.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels002.pt b/tests/integration/test_data/ChEBIOver100_test/labels002.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels002.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels002.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels003.pt b/tests/integration/test_data/ChEBIOver100_test/labels003.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels003.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels003.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels004.pt b/tests/integration/test_data/ChEBIOver100_test/labels004.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels004.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels004.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels005.pt b/tests/integration/test_data/ChEBIOver100_test/labels005.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels005.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels005.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels006.pt b/tests/integration/test_data/ChEBIOver100_test/labels006.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels006.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels006.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels007.pt b/tests/integration/test_data/ChEBIOver100_test/labels007.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels007.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels007.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels008.pt b/tests/integration/test_data/ChEBIOver100_test/labels008.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels008.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels008.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels009.pt b/tests/integration/test_data/ChEBIOver100_test/labels009.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels009.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels009.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels010.pt b/tests/integration/test_data/ChEBIOver100_test/labels010.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels010.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels010.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels011.pt b/tests/integration/test_data/ChEBIOver100_test/labels011.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels011.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels011.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels012.pt b/tests/integration/test_data/ChEBIOver100_test/labels012.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels012.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels012.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels013.pt b/tests/integration/test_data/ChEBIOver100_test/labels013.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels013.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels013.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels014.pt b/tests/integration/test_data/ChEBIOver100_test/labels014.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels014.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels014.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels015.pt b/tests/integration/test_data/ChEBIOver100_test/labels015.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels015.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels015.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels016.pt b/tests/integration/test_data/ChEBIOver100_test/labels016.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels016.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels016.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels017.pt b/tests/integration/test_data/ChEBIOver100_test/labels017.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels017.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels017.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels018.pt b/tests/integration/test_data/ChEBIOver100_test/labels018.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels018.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels018.pt
diff --git a/tests/test_data/ChEBIOver100_test/labels019.pt b/tests/integration/test_data/ChEBIOver100_test/labels019.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/labels019.pt
rename to tests/integration/test_data/ChEBIOver100_test/labels019.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds000.pt b/tests/integration/test_data/ChEBIOver100_test/preds000.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds000.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds000.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds001.pt b/tests/integration/test_data/ChEBIOver100_test/preds001.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds001.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds001.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds002.pt b/tests/integration/test_data/ChEBIOver100_test/preds002.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds002.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds002.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds003.pt b/tests/integration/test_data/ChEBIOver100_test/preds003.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds003.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds003.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds004.pt b/tests/integration/test_data/ChEBIOver100_test/preds004.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds004.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds004.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds005.pt b/tests/integration/test_data/ChEBIOver100_test/preds005.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds005.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds005.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds006.pt b/tests/integration/test_data/ChEBIOver100_test/preds006.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds006.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds006.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds007.pt b/tests/integration/test_data/ChEBIOver100_test/preds007.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds007.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds007.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds008.pt b/tests/integration/test_data/ChEBIOver100_test/preds008.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds008.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds008.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds009.pt b/tests/integration/test_data/ChEBIOver100_test/preds009.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds009.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds009.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds010.pt b/tests/integration/test_data/ChEBIOver100_test/preds010.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds010.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds010.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds011.pt b/tests/integration/test_data/ChEBIOver100_test/preds011.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds011.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds011.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds012.pt b/tests/integration/test_data/ChEBIOver100_test/preds012.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds012.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds012.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds013.pt b/tests/integration/test_data/ChEBIOver100_test/preds013.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds013.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds013.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds014.pt b/tests/integration/test_data/ChEBIOver100_test/preds014.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds014.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds014.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds015.pt b/tests/integration/test_data/ChEBIOver100_test/preds015.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds015.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds015.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds016.pt b/tests/integration/test_data/ChEBIOver100_test/preds016.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds016.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds016.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds017.pt b/tests/integration/test_data/ChEBIOver100_test/preds017.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds017.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds017.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds018.pt b/tests/integration/test_data/ChEBIOver100_test/preds018.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds018.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds018.pt
diff --git a/tests/test_data/ChEBIOver100_test/preds019.pt b/tests/integration/test_data/ChEBIOver100_test/preds019.pt
similarity index 100%
rename from tests/test_data/ChEBIOver100_test/preds019.pt
rename to tests/integration/test_data/ChEBIOver100_test/preds019.pt

From 5af03512863cb7b68193eb0698c899b762de721b Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 29 Aug 2024 21:13:07 +0200
Subject: [PATCH 012/112] unit dir + test for ChemDataReader

---
 tests/unit/__init__.py                        |  4 ++
 tests/unit/collators/__init__.py              |  0
 tests/unit/data_readers/__init__.py           |  0
 tests/unit/data_readers/testChemDataReader.py | 71 +++++++++++++++++++
 tests/unit/dataset_classes/__init__.py        |  0
 5 files changed, 75 insertions(+)
 create mode 100644 tests/unit/__init__.py
 create mode 100644 tests/unit/collators/__init__.py
 create mode 100644 tests/unit/data_readers/__init__.py
 create mode 100644 tests/unit/data_readers/testChemDataReader.py
 create mode 100644 tests/unit/dataset_classes/__init__.py

diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..6640a696
--- /dev/null
+++ b/tests/unit/__init__.py
@@ -0,0 +1,4 @@
+"""
+This directory contains unit tests, which focus on individual functions and methods, ensuring they work as
+expected in isolation.
+"""
diff --git a/tests/unit/collators/__init__.py b/tests/unit/collators/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/data_readers/__init__.py b/tests/unit/data_readers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/data_readers/testChemDataReader.py b/tests/unit/data_readers/testChemDataReader.py
new file mode 100644
index 00000000..bf3dea6e
--- /dev/null
+++ b/tests/unit/data_readers/testChemDataReader.py
@@ -0,0 +1,71 @@
+import unittest
+from typing import List
+from unittest.mock import mock_open, patch
+
+from chebai.preprocessing.reader import EMBEDDING_OFFSET, ChemDataReader
+
+
+class TestChemDataReader(unittest.TestCase):
+    """
+    Unit tests for the ChemDataReader class.
+    """
+
+    @patch(
+        "chebai.preprocessing.reader.open",
+        new_callable=mock_open,
+        read_data="C\nO\nN\n=\n1\n(",
+    )
+    def setUp(self, mock_file: mock_open) -> None:
+        """
+        Set up the test environment by initializing a ChemDataReader instance with a mocked token file.
+
+        Args:
+            mock_file: Mock object for file operations.
+        """
+        self.reader = ChemDataReader(token_path="/mock/path")
+        # After initializing, self.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '(']
+        self.assertEqual(self.reader.cache, ["C", "O", "N", "=", "1", "("])
+
+    def test_read_data(self) -> None:
+        """
+        Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string.
+        """
+        raw_data = "CC(=O)NC1"
+        # Expected output as per the tokens already in the cache, and ")" getting added to it.
+        expected_output: List[int] = [
+            EMBEDDING_OFFSET + 0,  # C
+            EMBEDDING_OFFSET + 0,  # C
+            EMBEDDING_OFFSET + 5,  # =
+            EMBEDDING_OFFSET + 3,  # O
+            EMBEDDING_OFFSET + 1,  # N
+            EMBEDDING_OFFSET + 6,  # (
+            EMBEDDING_OFFSET + 2,  # C
+            EMBEDDING_OFFSET + 0,  # C
+            EMBEDDING_OFFSET + 4,  # 1
+        ]
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+    def test_read_data_with_new_token(self) -> None:
+        """
+        Test the _read_data method with a SMILES string that includes a new token.
+        Ensure that the new token is added to the cache and processed correctly.
+        """
+        raw_data = "[H-]"
+
+        # Note: test methods within a TestCase class are not guaranteed to be executed in any specific order.
+        # Determine the index for the new token based on the current size of the cache.
+        index_for_last_token = len(self.reader.cache)
+        expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+        # Verify that '[H-]' was added to the cache
+        self.assertIn("[H-]", self.reader.cache)
+        # Ensure it's at the correct index
+        self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/dataset_classes/__init__.py b/tests/unit/dataset_classes/__init__.py
new file mode 100644
index 00000000..e69de29b

From a0810a233dd319c7fcb18bb3684eacd3047796ef Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 29 Aug 2024 21:15:49 +0200
Subject: [PATCH 013/112] Test for DataReader

---
 tests/unit/data_readers/testDataReader.py | 51 +++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 tests/unit/data_readers/testDataReader.py

diff --git a/tests/unit/data_readers/testDataReader.py b/tests/unit/data_readers/testDataReader.py
new file mode 100644
index 00000000..1a511b26
--- /dev/null
+++ b/tests/unit/data_readers/testDataReader.py
@@ -0,0 +1,51 @@
+import unittest
+from typing import Any, Dict, List
+
+from chebai.preprocessing.reader import DataReader
+
+
+class TestDataReader(unittest.TestCase):
+    """
+    Unit tests for the DataReader class.
+    """
+
+    def setUp(self) -> None:
+        """
+        Set up the test environment by initializing a DataReader instance.
+        """
+        self.reader = DataReader()
+
+    def test_to_data(self) -> None:
+        """
+        Test the to_data method to ensure it correctly processes the input row
+        and formats it according to the expected output.
+
+        This method tests the conversion of raw data into a processed format,
+        including extracting features, labels, ident, group, and additional
+        keyword arguments.
+        """
+        features_list: List[int] = [10, 20, 30]
+        labels_list: List[bool] = [True, False, True]
+        ident_no: int = 123
+
+        row: Dict[str, Any] = {
+            "features": features_list,
+            "labels": labels_list,
+            "ident": ident_no,
+            "group": "group_data",
+            "additional_kwargs": {"extra_key": "extra_value"},
+        }
+
+        expected: Dict[str, Any] = {
+            "features": features_list,
+            "labels": labels_list,
+            "ident": ident_no,
+            "group": "group_data",
+            "extra_key": "extra_value",
+        }
+
+        self.assertEqual(self.reader.to_data(row), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 1b3836d5c103a1455f41245b757a94acc0b3d5f5 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 29 Aug 2024 23:09:53 +0200
Subject: [PATCH 014/112] tests for DeepChemDataReader

---
 .../{data_readers => readers}/__init__.py     |  0
 .../testChemDataReader.py                     |  8 +-
 .../testDataReader.py                         |  0
 tests/unit/readers/testDeepChemDataReader.py  | 80 +++++++++++++++++++
 4 files changed, 85 insertions(+), 3 deletions(-)
 rename tests/unit/{data_readers => readers}/__init__.py (100%)
 rename tests/unit/{data_readers => readers}/testChemDataReader.py (90%)
 rename tests/unit/{data_readers => readers}/testDataReader.py (100%)
 create mode 100644 tests/unit/readers/testDeepChemDataReader.py

diff --git a/tests/unit/data_readers/__init__.py b/tests/unit/readers/__init__.py
similarity index 100%
rename from tests/unit/data_readers/__init__.py
rename to tests/unit/readers/__init__.py
diff --git a/tests/unit/data_readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py
similarity index 90%
rename from tests/unit/data_readers/testChemDataReader.py
rename to tests/unit/readers/testChemDataReader.py
index bf3dea6e..2bc525e1 100644
--- a/tests/unit/data_readers/testChemDataReader.py
+++ b/tests/unit/readers/testChemDataReader.py
@@ -8,6 +8,8 @@
 class TestChemDataReader(unittest.TestCase):
     """
     Unit tests for the ChemDataReader class.
+
+    Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order.
     """
 
     @patch(
@@ -30,7 +32,7 @@ def test_read_data(self) -> None:
         """
         Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string.
         """
-        raw_data = "CC(=O)NC1"
+        raw_data = "CC(=O)NC1[Mg-2]"
         # Expected output as per the tokens already in the cache, and ")" getting added to it.
         expected_output: List[int] = [
             EMBEDDING_OFFSET + 0,  # C
@@ -38,10 +40,11 @@ def test_read_data(self) -> None:
             EMBEDDING_OFFSET + 5,  # =
             EMBEDDING_OFFSET + 3,  # O
             EMBEDDING_OFFSET + 1,  # N
-            EMBEDDING_OFFSET + 6,  # (
+            EMBEDDING_OFFSET + len(self.reader.cache),  # (
             EMBEDDING_OFFSET + 2,  # C
             EMBEDDING_OFFSET + 0,  # C
             EMBEDDING_OFFSET + 4,  # 1
+            EMBEDDING_OFFSET + len(self.reader.cache) + 1,  # [Mg-2]
         ]
         result = self.reader._read_data(raw_data)
         self.assertEqual(result, expected_output)
@@ -53,7 +56,6 @@ def test_read_data_with_new_token(self) -> None:
         """
         raw_data = "[H-]"
 
-        # Note: test methods within a TestCase class are not guaranteed to be executed in any specific order.
         # Determine the index for the new token based on the current size of the cache.
         index_for_last_token = len(self.reader.cache)
         expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
diff --git a/tests/unit/data_readers/testDataReader.py b/tests/unit/readers/testDataReader.py
similarity index 100%
rename from tests/unit/data_readers/testDataReader.py
rename to tests/unit/readers/testDataReader.py
diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py
new file mode 100644
index 00000000..c93e2592
--- /dev/null
+++ b/tests/unit/readers/testDeepChemDataReader.py
@@ -0,0 +1,80 @@
+import unittest
+from typing import List
+from unittest.mock import mock_open, patch
+
+from chebai.preprocessing.reader import EMBEDDING_OFFSET, DeepChemDataReader
+
+
+class TestDeepChemDataReader(unittest.TestCase):
+    """
+    Unit tests for the DeepChemDataReader class.
+
+    Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order.
+    """
+
+    @patch(
+        "chebai.preprocessing.reader.open",
+        new_callable=mock_open,
+        read_data="C\nO\nc\n)",
+    )
+    def setUp(self, mock_file: mock_open) -> None:
+        """
+        Set up the test environment by initializing a DeepChemDataReader instance with a mocked token file.
+
+        Args:
+            mock_file: Mock object for file operations.
+        """
+        self.reader = DeepChemDataReader(token_path="/mock/path")
+        # After initializing, self.reader.cache should now be set to ['C', 'O', 'c', ')']
+        self.assertEqual(self.reader.cache, ["C", "O", "c", ")"])
+
+    def test_read_data(self) -> None:
+        """
+        Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string.
+        """
+        raw_data = "c1ccccc1C(Br)(OC)I[Ni-2]"
+
+        # Expected output as per the tokens already in the cache, and new tokens getting added to it.
+        expected_output: List[int] = [
+            EMBEDDING_OFFSET + 2,  # c
+            EMBEDDING_OFFSET + 2,  # c
+            EMBEDDING_OFFSET + 2,  # c
+            EMBEDDING_OFFSET + 2,  # c
+            EMBEDDING_OFFSET + 2,  # c
+            EMBEDDING_OFFSET + 2,  # c
+            EMBEDDING_OFFSET + len(self.reader.cache),  # 6 (new token)
+            EMBEDDING_OFFSET + 0,  # C
+            EMBEDDING_OFFSET + len(self.reader.cache) + 1,  # Br (new token)
+            EMBEDDING_OFFSET + 3,  # )
+            EMBEDDING_OFFSET + 1,  # O
+            EMBEDDING_OFFSET + 0,  # C
+            EMBEDDING_OFFSET + 3,  # )
+            EMBEDDING_OFFSET + 3,  # )
+            EMBEDDING_OFFSET + len(self.reader.cache) + 2,  # I (new token)
+            EMBEDDING_OFFSET + len(self.reader.cache) + 3,  # [Ni-2] (new token)
+        ]
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+    def test_read_data_with_new_token(self) -> None:
+        """
+        Test the _read_data method with a SMILES string that includes a new token.
+        Ensure that the new token is added to the cache and processed correctly.
+        """
+        raw_data = "[H-]"
+
+        # Determine the index for the new token based on the current size of the cache.
+        index_for_last_token = len(self.reader.cache)
+        expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+        # Verify that '[H-]' was added to the cache
+        self.assertIn("[H-]", self.reader.cache)
+        # Ensure it's at the correct index
+        self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token)
+
+
+if __name__ == "__main__":
+    unittest.main()

From aa467c6fde67a9545b23c79132c128d0a837b69e Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 30 Aug 2024 00:06:39 +0200
Subject: [PATCH 015/112] Test for SelfiesReader

---
 tests/unit/readers/testDeepChemDataReader.py |   3 +
 tests/unit/readers/testSelfiesReader.py      | 106 +++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 tests/unit/readers/testSelfiesReader.py

diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py
index c93e2592..ac1a50b7 100644
--- a/tests/unit/readers/testDeepChemDataReader.py
+++ b/tests/unit/readers/testDeepChemDataReader.py
@@ -34,6 +34,9 @@ def test_read_data(self) -> None:
         """
         raw_data = "c1ccccc1C(Br)(OC)I[Ni-2]"
 
+        # benzene is c1ccccc1 in SMILES but cccccc6 in DeepSMILES
+        # SMILES C(Br)(OC)I can be converted to the DeepSMILES CBr)OC))I.
+        # Resultant String: "cccccc6CBr)OC))I[Ni-2]"
         # Expected output as per the tokens already in the cache, and new tokens getting added to it.
         expected_output: List[int] = [
             EMBEDDING_OFFSET + 2,  # c
diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py
new file mode 100644
index 00000000..41202757
--- /dev/null
+++ b/tests/unit/readers/testSelfiesReader.py
@@ -0,0 +1,106 @@
+import unittest
+from typing import List
+from unittest.mock import mock_open, patch
+
+from chebai.preprocessing.reader import EMBEDDING_OFFSET, SelfiesReader
+
+
+class TestSelfiesReader(unittest.TestCase):
+    """
+    Unit tests for the SelfiesReader class.
+
+    Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order.
+    """
+
+    @patch(
+        "chebai.preprocessing.reader.open",
+        new_callable=mock_open,
+        read_data="[C]\n[O]\n[=C]",
+    )
+    def setUp(self, mock_file: mock_open) -> None:
+        """
+        Set up the test environment by initializing a SelfiesReader instance with a mocked token file.
+
+        Args:
+            mock_file: Mock object for file operations.
+        """
+        self.reader = SelfiesReader(token_path="/mock/path")
+        # After initializing, self.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[(']
+        self.assertEqual(
+            self.reader.cache,
+            [
+                "[C]",
+                "[O]",
+                "[=C]",
+            ],
+        )
+
+    def test_read_data(self) -> None:
+        """
+        Test the _read_data method with a SELFIES string to ensure it correctly tokenizes the string.
+        """
+        raw_data = "c1ccccc1C(Br)(OC)I[Ni-2]"
+
+        # benzene is "c1ccccc1" in SMILES is translated to "[C][=C][C][=C][C][=C][Ring1][=Branch1]" in SELFIES
+        # SELFIES translation of SMILES "c1ccccc1C(Br)(OC)I[Ni-2]":
+        # "[C][=C][C][=C][C][=C][Ring1][=Branch1][C][Branch1][C][Br][Branch1][Ring1][O][C][I][Ni-2]"
+        expected_output: List[int] = [
+            EMBEDDING_OFFSET + 0,  # [C] (already in cache)
+            EMBEDDING_OFFSET + 2,  # [=C] (already in cache)
+            EMBEDDING_OFFSET + 0,  # [C] (already in cache)
+            EMBEDDING_OFFSET + 2,  # [=C] (already in cache)
+            EMBEDDING_OFFSET + 0,  # [C] (already in cache)
+            EMBEDDING_OFFSET + 2,  # [=C] (already in cache)
+            EMBEDDING_OFFSET + len(self.reader.cache),  # [Ring1] (new token)
+            EMBEDDING_OFFSET + len(self.reader.cache) + 1,  # [=Branch1] (new token)
+            EMBEDDING_OFFSET + 0,  # [C] (already in cache)
+            EMBEDDING_OFFSET + len(self.reader.cache) + 2,  # [Branch1] (new token)
+            EMBEDDING_OFFSET + 0,  # [C] (already in cache)
+            EMBEDDING_OFFSET + len(self.reader.cache) + 3,  # [Br] (new token)
+            EMBEDDING_OFFSET
+            + len(self.reader.cache)
+            + 2,  # [Branch1] (reused new token)
+            EMBEDDING_OFFSET + len(self.reader.cache),  # [Ring1] (reused new token)
+            EMBEDDING_OFFSET + 1,  # [O] (already in cache)
+            EMBEDDING_OFFSET + 0,  # [C] (already in cache)
+            EMBEDDING_OFFSET + len(self.reader.cache) + 4,  # [I] (new token)
+            EMBEDDING_OFFSET + len(self.reader.cache) + 5,  # [Ni-2] (new token)
+        ]
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+    def test_read_data_with_new_token(self) -> None:
+        """
+        Test the _read_data method with a SELFIES string that includes a new token.
+        Ensure that the new token is added to the cache and processed correctly.
+        """
+        raw_data = "[H-]"
+
+        # Determine the index for the new token based on the current size of the cache.
+        index_for_last_token = len(self.reader.cache)
+        expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+        # Verify that '[H-1]' was added to the cache, "[H-]" translated to "[H-1]" in SELFIES
+        self.assertIn("[H-1]", self.reader.cache)
+        # Ensure it's at the correct index
+        self.assertEqual(self.reader.cache.index("[H-1]"), index_for_last_token)
+
+    def test_read_data_with_invalid_selfies(self) -> None:
+        """
+        Test the _read_data method with an invalid SELFIES string to ensure error handling works.
+        """
+        raw_data = "[C][O][INVALID][N]"
+
+        result = self.reader._read_data(raw_data)
+        self.assertIsNone(result)
+
+        # Verify that the error count was incremented
+        self.assertEqual(self.reader.error_count, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

From b6f5e5162d22359a67fa212c288b13715fd51356 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 30 Aug 2024 23:14:49 +0200
Subject: [PATCH 016/112] test for ProteinDataReader

---
 tests/unit/readers/testProteinDataReader.py | 105 ++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 tests/unit/readers/testProteinDataReader.py

diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py
new file mode 100644
index 00000000..5f828e75
--- /dev/null
+++ b/tests/unit/readers/testProteinDataReader.py
@@ -0,0 +1,105 @@
+import unittest
+from typing import List
+from unittest.mock import mock_open, patch
+
+from chebai.preprocessing.reader import EMBEDDING_OFFSET, ProteinDataReader
+
+
+class TestProteinDataReader(unittest.TestCase):
+    """
+    Unit tests for the ProteinDataReader class.
+    """
+
+    @patch(
+        "chebai.preprocessing.reader.open",
+        new_callable=mock_open,
+        read_data="M\nK\nT\nF\nR\nN",
+    )
+    def setUp(self, mock_file: mock_open) -> None:
+        """
+        Set up the test environment by initializing a ProteinDataReader instance with a mocked token file.
+
+        Args:
+            mock_file: Mock object for file operations.
+        """
+        self.reader = ProteinDataReader(token_path="/mock/path")
+        # After initializing, self.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N']
+        self.assertEqual(self.reader.cache, ["M", "K", "T", "F", "R", "N"])
+
+    def test_read_data(self) -> None:
+        """
+        Test the _read_data method with a protein sequence to ensure it correctly tokenizes the sequence.
+        """
+        raw_data = "MKTFFRN"
+
+        # Expected output based on the cached tokens
+        expected_output: List[int] = [
+            EMBEDDING_OFFSET + 0,  # M
+            EMBEDDING_OFFSET + 1,  # K
+            EMBEDDING_OFFSET + 2,  # T
+            EMBEDDING_OFFSET + 3,  # F
+            EMBEDDING_OFFSET + 3,  # F (repeated token)
+            EMBEDDING_OFFSET + 4,  # R
+            EMBEDDING_OFFSET + 5,  # N
+        ]
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+    def test_read_data_with_new_token(self) -> None:
+        """
+        Test the _read_data method with a protein sequence that includes a new token.
+        Ensure that the new token is added to the cache and processed correctly.
+        """
+        raw_data = "MKTFY"
+
+        # 'Y' is not in the initial cache and should be added.
+        expected_output: List[int] = [
+            EMBEDDING_OFFSET + 0,  # M
+            EMBEDDING_OFFSET + 1,  # K
+            EMBEDDING_OFFSET + 2,  # T
+            EMBEDDING_OFFSET + 3,  # F
+            EMBEDDING_OFFSET + len(self.reader.cache),  # Y (new token)
+        ]
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+        # Verify that 'Y' was added to the cache
+        self.assertIn("Y", self.reader.cache)
+        # Ensure it's at the correct index
+        self.assertEqual(self.reader.cache.index("Y"), len(self.reader.cache) - 1)
+
+    def test_read_data_with_invalid_token(self) -> None:
+        """
+        Test the _read_data method with an invalid amino acid token to ensure it raises a KeyError.
+        """
+        raw_data = "MKTFZ"  # 'Z' is not a valid amino acid token
+
+        with self.assertRaises(KeyError) as context:
+            self.reader._read_data(raw_data)
+
+        self.assertIn("Invalid token 'Z' encountered", str(context.exception))
+
+    def test_read_data_with_empty_sequence(self) -> None:
+        """
+        Test the _read_data method with an empty protein sequence to ensure it returns an empty list.
+        """
+        raw_data = ""
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, [])
+
+    def test_read_data_with_repeated_tokens(self) -> None:
+        """
+        Test the _read_data method with repeated amino acid tokens to ensure it handles them correctly.
+        """
+        raw_data = "MMMMM"
+
+        expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5  # All tokens are 'M'
+
+        result = self.reader._read_data(raw_data)
+        self.assertEqual(result, expected_output)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 73f05c01f81c90107eccb61c638529755b05df15 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 31 Aug 2024 00:03:21 +0200
Subject: [PATCH 017/112] test for DefaultCollator

---
 tests/unit/collators/testDefaultCollator.py | 52 +++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 tests/unit/collators/testDefaultCollator.py

diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py
new file mode 100644
index 00000000..6362d0a6
--- /dev/null
+++ b/tests/unit/collators/testDefaultCollator.py
@@ -0,0 +1,52 @@
+import unittest
+from typing import Dict, List
+
+from chebai.preprocessing.collate import DefaultCollator
+from chebai.preprocessing.structures import XYData
+
+
+class TestDefaultCollator(unittest.TestCase):
+    """
+    Unit tests for the DefaultCollator class.
+    """
+
+    def setUp(self) -> None:
+        """
+        Set up the test environment by initializing a DefaultCollator instance.
+        """
+        self.collator = DefaultCollator()
+
+    def test_call_with_valid_data(self) -> None:
+        """
+        Test the __call__ method with valid data to ensure features and labels are correctly extracted.
+        """
+        data: List[Dict] = [
+            {"features": [1.0, 2.0], "labels": 0},
+            {"features": [3.0, 4.0], "labels": 1},
+        ]
+
+        result: XYData = self.collator(data)
+        self.assertIsInstance(result, XYData)
+
+        expected_x = ([1.0, 2.0], [3.0, 4.0])
+        expected_y = (0, 1)
+
+        self.assertEqual(result.x, expected_x)
+        self.assertEqual(result.y, expected_y)
+
+    def test_call_with_empty_data(self) -> None:
+        """
+        Test the __call__ method with an empty list to ensure it handles the edge case correctly.
+        """
+        data: List[Dict] = []
+
+        with self.assertRaises(ValueError) as context:
+            self.collator(data)
+
+        self.assertEqual(
+            str(context.exception), "not enough values to unpack (expected 2, got 0)"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 8007f37f7622168fa3db1837e5b7fafcb8307a5e Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 31 Aug 2024 22:05:16 +0200
Subject: [PATCH 018/112] test for RaggedCollator

---
 tests/unit/collators/testRaggedCollator.py | 150 +++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 tests/unit/collators/testRaggedCollator.py

diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py
new file mode 100644
index 00000000..97e1c08f
--- /dev/null
+++ b/tests/unit/collators/testRaggedCollator.py
@@ -0,0 +1,150 @@
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+
+from chebai.preprocessing.collate import RaggedCollator
+from chebai.preprocessing.structures import XYData
+
+
+class TestRaggedCollator(unittest.TestCase):
+    """
+    Unit tests for the RaggedCollator class.
+    """
+
+    def setUp(self) -> None:
+        """
+        Set up the test environment by initializing a RaggedCollator instance.
+        """
+        self.collator = RaggedCollator()
+
+    def test_call_with_valid_data(self) -> None:
+        """
+        Test the __call__ method with valid ragged data to ensure features, labels, and masks are correctly handled.
+        """
+        data: List[Dict] = [
+            {"features": [1, 2], "labels": [1, 0], "ident": "sample1"},
+            {"features": [3, 4, 5], "labels": [0, 1, 1], "ident": "sample2"},
+            {"features": [6], "labels": [1], "ident": "sample3"},
+        ]
+
+        result: XYData = self.collator(data)
+
+        expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
+        expected_y = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]])
+        expected_mask_for_x = torch.tensor(
+            [[True, True, False], [True, True, True], [True, False, False]]
+        )
+        expected_lens_for_x = torch.tensor([2, 3, 1])
+
+        self.assertTrue(torch.equal(result.x, expected_x))
+        self.assertTrue(torch.equal(result.y, expected_y))
+        self.assertTrue(
+            torch.equal(
+                result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
+            )
+        )
+        self.assertTrue(
+            torch.equal(
+                result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
+            )
+        )
+        self.assertEqual(
+            result.additional_fields["idents"], ("sample1", "sample2", "sample3")
+        )
+
+    def test_call_with_missing_entire_labels(self) -> None:
+        """
+        Test the __call__ method with data where some samples are missing labels.
+        """
+        data: List[Dict] = [
+            {"features": [1, 2], "labels": [1, 0], "ident": "sample1"},
+            {"features": [3, 4, 5], "labels": None, "ident": "sample2"},
+            {"features": [6], "labels": [1], "ident": "sample3"},
+        ]
+
+        result: XYData = self.collator(data)
+
+        expected_x = torch.tensor([[1, 2], [6, 0]])
+        expected_y = torch.tensor([[1, 0], [1, 0]])
+        expected_mask_for_x = torch.tensor([[True, True], [True, False]])
+        expected_lens_for_x = torch.tensor([2, 1])
+
+        self.assertTrue(torch.equal(result.x, expected_x))
+        self.assertTrue(torch.equal(result.y, expected_y))
+        self.assertTrue(
+            torch.equal(
+                result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
+            )
+        )
+        self.assertTrue(
+            torch.equal(
+                result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
+            )
+        )
+        self.assertEqual(
+            result.additional_fields["loss_kwargs"]["non_null_labels"], [0, 2]
+        )
+        self.assertEqual(
+            result.additional_fields["idents"], ("sample1", "sample2", "sample3")
+        )
+
+    def test_call_with_none_in_labels(self) -> None:
+        """
+        Test the __call__ method with data where one of the elements in the labels is None.
+        """
+        data: List[Dict] = [
+            {"features": [1, 2], "labels": [None, 1], "ident": "sample1"},
+            {"features": [3, 4, 5], "labels": [1, 0], "ident": "sample2"},
+            {"features": [6], "labels": [1], "ident": "sample3"},
+        ]
+
+        result: XYData = self.collator(data)
+
+        expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
+        expected_y = torch.tensor([[0, 1], [1, 0], [1, 0]])  # None is replaced by 0
+        expected_mask_for_x = torch.tensor(
+            [[True, True, False], [True, True, True], [True, False, False]]
+        )
+        expected_lens_for_x = torch.tensor([2, 3, 1])
+
+        self.assertTrue(torch.equal(result.x, expected_x))
+        self.assertTrue(torch.equal(result.y, expected_y))
+        self.assertTrue(
+            torch.equal(
+                result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
+            )
+        )
+        self.assertTrue(
+            torch.equal(
+                result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
+            )
+        )
+        self.assertEqual(
+            result.additional_fields["idents"], ("sample1", "sample2", "sample3")
+        )
+
+    def test_call_with_empty_data(self) -> None:
+        """
+        Test the __call__ method with an empty list to ensure it raises an error.
+        """
+        data: List[Dict] = []
+
+        with self.assertRaises(Exception):
+            self.collator(data)
+
+    def test_process_label_rows(self) -> None:
+        """
+        Test the process_label_rows method to ensure it pads label sequences correctly.
+        """
+        labels: Tuple = ([1, 0], [0, 1, 1], [1])
+
+        result: torch.Tensor = self.collator.process_label_rows(labels)
+
+        expected_output = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]])
+
+        self.assertTrue(torch.equal(result, expected_output))
+
+
+if __name__ == "__main__":
+    unittest.main()

From 248eaa7034ac2aa204d578c2c249096ee07dbd83 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 31 Aug 2024 23:55:52 +0200
Subject: [PATCH 019/112] modify tests to use `setUpClass` class method instead
 of `setUp` instance method

---
 tests/unit/collators/testDefaultCollator.py  |  5 +++--
 tests/unit/collators/testRaggedCollator.py   |  5 +++--
 tests/unit/readers/testChemDataReader.py     |  9 +++++----
 tests/unit/readers/testDataReader.py         |  5 +++--
 tests/unit/readers/testDeepChemDataReader.py |  9 +++++----
 tests/unit/readers/testProteinDataReader.py  |  9 +++++----
 tests/unit/readers/testSelfiesReader.py      | 16 +++++-----------
 7 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py
index 6362d0a6..287cadcd 100644
--- a/tests/unit/collators/testDefaultCollator.py
+++ b/tests/unit/collators/testDefaultCollator.py
@@ -10,11 +10,12 @@ class TestDefaultCollator(unittest.TestCase):
     Unit tests for the DefaultCollator class.
     """
 
-    def setUp(self) -> None:
+    @classmethod
+    def setUpClass(cls) -> None:
         """
         Set up the test environment by initializing a DefaultCollator instance.
         """
-        self.collator = DefaultCollator()
+        cls.collator = DefaultCollator()
 
     def test_call_with_valid_data(self) -> None:
         """
diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py
index 97e1c08f..a3126314 100644
--- a/tests/unit/collators/testRaggedCollator.py
+++ b/tests/unit/collators/testRaggedCollator.py
@@ -12,11 +12,12 @@ class TestRaggedCollator(unittest.TestCase):
     Unit tests for the RaggedCollator class.
     """
 
-    def setUp(self) -> None:
+    @classmethod
+    def setUpClass(cls) -> None:
         """
         Set up the test environment by initializing a RaggedCollator instance.
         """
-        self.collator = RaggedCollator()
+        cls.collator = RaggedCollator()
 
     def test_call_with_valid_data(self) -> None:
         """
diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py
index 2bc525e1..3d7b5e6f 100644
--- a/tests/unit/readers/testChemDataReader.py
+++ b/tests/unit/readers/testChemDataReader.py
@@ -12,21 +12,22 @@ class TestChemDataReader(unittest.TestCase):
     Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order.
     """
 
+    @classmethod
     @patch(
         "chebai.preprocessing.reader.open",
         new_callable=mock_open,
         read_data="C\nO\nN\n=\n1\n(",
     )
-    def setUp(self, mock_file: mock_open) -> None:
+    def setUpClass(cls, mock_file: mock_open) -> None:
         """
         Set up the test environment by initializing a ChemDataReader instance with a mocked token file.
 
         Args:
             mock_file: Mock object for file operations.
         """
-        self.reader = ChemDataReader(token_path="/mock/path")
-        # After initializing, self.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '(']
-        self.assertEqual(self.reader.cache, ["C", "O", "N", "=", "1", "("])
+        cls.reader = ChemDataReader(token_path="/mock/path")
+        # After initializing, cls.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '(']
+        assert cls.reader.cache == ["C", "O", "N", "=", "1", "("]
 
     def test_read_data(self) -> None:
         """
diff --git a/tests/unit/readers/testDataReader.py b/tests/unit/readers/testDataReader.py
index 1a511b26..8a8af053 100644
--- a/tests/unit/readers/testDataReader.py
+++ b/tests/unit/readers/testDataReader.py
@@ -9,11 +9,12 @@ class TestDataReader(unittest.TestCase):
     Unit tests for the DataReader class.
     """
 
-    def setUp(self) -> None:
+    @classmethod
+    def setUpClass(cls) -> None:
         """
         Set up the test environment by initializing a DataReader instance.
         """
-        self.reader = DataReader()
+        cls.reader = DataReader()
 
     def test_to_data(self) -> None:
         """
diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py
index ac1a50b7..23ac35d5 100644
--- a/tests/unit/readers/testDeepChemDataReader.py
+++ b/tests/unit/readers/testDeepChemDataReader.py
@@ -12,21 +12,22 @@ class TestDeepChemDataReader(unittest.TestCase):
     Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order.
     """
 
+    @classmethod
     @patch(
         "chebai.preprocessing.reader.open",
         new_callable=mock_open,
         read_data="C\nO\nc\n)",
     )
-    def setUp(self, mock_file: mock_open) -> None:
+    def setUpClass(cls, mock_file: mock_open) -> None:
         """
         Set up the test environment by initializing a DeepChemDataReader instance with a mocked token file.
 
         Args:
             mock_file: Mock object for file operations.
         """
-        self.reader = DeepChemDataReader(token_path="/mock/path")
-        # After initializing, self.reader.cache should now be set to ['C', 'O', 'c', ')']
-        self.assertEqual(self.reader.cache, ["C", "O", "c", ")"])
+        cls.reader = DeepChemDataReader(token_path="/mock/path")
+        # After initializing, cls.reader.cache should now be set to ['C', 'O', 'c', ')']
+        assert cls.reader.cache == ["C", "O", "c", ")"]
 
     def test_read_data(self) -> None:
         """
diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py
index 5f828e75..6e5f325c 100644
--- a/tests/unit/readers/testProteinDataReader.py
+++ b/tests/unit/readers/testProteinDataReader.py
@@ -10,21 +10,22 @@ class TestProteinDataReader(unittest.TestCase):
     Unit tests for the ProteinDataReader class.
     """
 
+    @classmethod
     @patch(
         "chebai.preprocessing.reader.open",
         new_callable=mock_open,
         read_data="M\nK\nT\nF\nR\nN",
     )
-    def setUp(self, mock_file: mock_open) -> None:
+    def setUpClass(cls, mock_file: mock_open) -> None:
         """
         Set up the test environment by initializing a ProteinDataReader instance with a mocked token file.
 
         Args:
             mock_file: Mock object for file operations.
         """
-        self.reader = ProteinDataReader(token_path="/mock/path")
-        # After initializing, self.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N']
-        self.assertEqual(self.reader.cache, ["M", "K", "T", "F", "R", "N"])
+        cls.reader = ProteinDataReader(token_path="/mock/path")
+        # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N']
+        assert cls.reader.cache == ["M", "K", "T", "F", "R", "N"]
 
     def test_read_data(self) -> None:
         """
diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py
index 41202757..019a0f59 100644
--- a/tests/unit/readers/testSelfiesReader.py
+++ b/tests/unit/readers/testSelfiesReader.py
@@ -12,28 +12,22 @@ class TestSelfiesReader(unittest.TestCase):
     Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order.
     """
 
+    @classmethod
     @patch(
         "chebai.preprocessing.reader.open",
         new_callable=mock_open,
         read_data="[C]\n[O]\n[=C]",
     )
-    def setUp(self, mock_file: mock_open) -> None:
+    def setUpClass(cls, mock_file: mock_open) -> None:
         """
         Set up the test environment by initializing a SelfiesReader instance with a mocked token file.
 
         Args:
             mock_file: Mock object for file operations.
         """
-        self.reader = SelfiesReader(token_path="/mock/path")
-        # After initializing, self.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[(']
-        self.assertEqual(
-            self.reader.cache,
-            [
-                "[C]",
-                "[O]",
-                "[=C]",
-            ],
-        )
+        cls.reader = SelfiesReader(token_path="/mock/path")
+        # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[(']
+        assert cls.reader.cache == ["[C]", "[O]", "[=C]"]
 
     def test_read_data(self) -> None:
         """

From 3e57d78420ec8b1076b5e5842c535b03b212da8a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 1 Sep 2024 13:33:18 +0200
Subject: [PATCH 020/112] bool labels instead of numeric, for realistic data

---
 tests/unit/collators/testDefaultCollator.py |  6 ++--
 tests/unit/collators/testRaggedCollator.py  | 34 +++++++++++++--------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py
index 287cadcd..29b1cc91 100644
--- a/tests/unit/collators/testDefaultCollator.py
+++ b/tests/unit/collators/testDefaultCollator.py
@@ -22,15 +22,15 @@ def test_call_with_valid_data(self) -> None:
         Test the __call__ method with valid data to ensure features and labels are correctly extracted.
         """
         data: List[Dict] = [
-            {"features": [1.0, 2.0], "labels": 0},
-            {"features": [3.0, 4.0], "labels": 1},
+            {"features": [1.0, 2.0], "labels": [True, False, True]},
+            {"features": [3.0, 4.0], "labels": [False, False, True]},
         ]
 
         result: XYData = self.collator(data)
         self.assertIsInstance(result, XYData)
 
         expected_x = ([1.0, 2.0], [3.0, 4.0])
-        expected_y = (0, 1)
+        expected_y = ([True, False, True], [False, False, True])
 
         self.assertEqual(result.x, expected_x)
         self.assertEqual(result.y, expected_y)
diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py
index a3126314..81947b47 100644
--- a/tests/unit/collators/testRaggedCollator.py
+++ b/tests/unit/collators/testRaggedCollator.py
@@ -24,15 +24,17 @@ def test_call_with_valid_data(self) -> None:
         Test the __call__ method with valid ragged data to ensure features, labels, and masks are correctly handled.
         """
         data: List[Dict] = [
-            {"features": [1, 2], "labels": [1, 0], "ident": "sample1"},
-            {"features": [3, 4, 5], "labels": [0, 1, 1], "ident": "sample2"},
-            {"features": [6], "labels": [1], "ident": "sample3"},
+            {"features": [1, 2], "labels": [True, False], "ident": "sample1"},
+            {"features": [3, 4, 5], "labels": [False, True, True], "ident": "sample2"},
+            {"features": [6], "labels": [True], "ident": "sample3"},
         ]
 
         result: XYData = self.collator(data)
 
         expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
-        expected_y = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]])
+        expected_y = torch.tensor(
+            [[True, False, False], [False, True, True], [True, False, False]]
+        )
         expected_mask_for_x = torch.tensor(
             [[True, True, False], [True, True, True], [True, False, False]]
         )
@@ -59,15 +61,17 @@ def test_call_with_missing_entire_labels(self) -> None:
         Test the __call__ method with data where some samples are missing labels.
         """
         data: List[Dict] = [
-            {"features": [1, 2], "labels": [1, 0], "ident": "sample1"},
+            {"features": [1, 2], "labels": [True, False], "ident": "sample1"},
             {"features": [3, 4, 5], "labels": None, "ident": "sample2"},
-            {"features": [6], "labels": [1], "ident": "sample3"},
+            {"features": [6], "labels": [True], "ident": "sample3"},
         ]
 
         result: XYData = self.collator(data)
 
         expected_x = torch.tensor([[1, 2], [6, 0]])
-        expected_y = torch.tensor([[1, 0], [1, 0]])
+        expected_y = torch.tensor(
+            [[True, False], [True, False]]
+        )  # True -> 1, False -> 0
         expected_mask_for_x = torch.tensor([[True, True], [True, False]])
         expected_lens_for_x = torch.tensor([2, 1])
 
@@ -95,15 +99,17 @@ def test_call_with_none_in_labels(self) -> None:
         Test the __call__ method with data where one of the elements in the labels is None.
         """
         data: List[Dict] = [
-            {"features": [1, 2], "labels": [None, 1], "ident": "sample1"},
-            {"features": [3, 4, 5], "labels": [1, 0], "ident": "sample2"},
-            {"features": [6], "labels": [1], "ident": "sample3"},
+            {"features": [1, 2], "labels": [None, True], "ident": "sample1"},
+            {"features": [3, 4, 5], "labels": [True, False], "ident": "sample2"},
+            {"features": [6], "labels": [True], "ident": "sample3"},
         ]
 
         result: XYData = self.collator(data)
 
         expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
-        expected_y = torch.tensor([[0, 1], [1, 0], [1, 0]])  # None is replaced by 0
+        expected_y = torch.tensor(
+            [[False, True], [True, False], [True, False]]
+        )  # None -> False
         expected_mask_for_x = torch.tensor(
             [[True, True, False], [True, True, True], [True, False, False]]
         )
@@ -138,11 +144,13 @@ def test_process_label_rows(self) -> None:
         """
         Test the process_label_rows method to ensure it pads label sequences correctly.
         """
-        labels: Tuple = ([1, 0], [0, 1, 1], [1])
+        labels: Tuple = ([True, False], [False, True, True], [True])
 
         result: torch.Tensor = self.collator.process_label_rows(labels)
 
-        expected_output = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]])
+        expected_output = torch.tensor(
+            [[True, False, False], [False, True, True], [True, False, False]]
+        )
 
         self.assertTrue(torch.equal(result, expected_output))
 

From f9ca653d76b9a8434b1a1a487ee57b796156b40a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 1 Sep 2024 13:33:51 +0200
Subject: [PATCH 021/112] test for XYBaseDataModule

---
 .../dataset_classes/testXYBaseDataModule.py   | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 tests/unit/dataset_classes/testXYBaseDataModule.py

diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py
new file mode 100644
index 00000000..d8aabc67
--- /dev/null
+++ b/tests/unit/dataset_classes/testXYBaseDataModule.py
@@ -0,0 +1,76 @@
+import unittest
+from unittest.mock import PropertyMock, patch
+
+from chebai.preprocessing.datasets.base import XYBaseDataModule
+from chebai.preprocessing.reader import ProteinDataReader
+
+
+class TestXYBaseDataModule(unittest.TestCase):
+    """
+    Unit tests for the methods of the XYBaseDataModule class.
+    """
+
+    @classmethod
+    @patch.object(XYBaseDataModule, "_name", new_callable=PropertyMock)
+    def setUpClass(cls, mock_name_property) -> None:
+        """
+        Set up a base instance of XYBaseDataModule for testing.
+        """
+
+        # Mock the _name property of XYBaseDataModule
+        mock_name_property.return_value = "MockedXYBaseDataModule"
+
+        # Assign a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
+        XYBaseDataModule.READER = ProteinDataReader
+
+        # Initialize the module with a label_filter
+        cls.module = XYBaseDataModule(
+            label_filter=1,  # Provide a label_filter
+            balance_after_filter=1.0,  # Balance ratio
+        )
+
+    def test_filter_labels_valid_index(self) -> None:
+        """
+        Test the _filter_labels method with a valid label_filter index.
+        """
+        self.module.label_filter = 1
+        row = {
+            "features": ["feature1", "feature2"],
+            "labels": [0, 3, 1, 2],  # List of labels
+        }
+        filtered_row = self.module._filter_labels(row)
+        expected_labels = [3]  # Only the label at index 1 should be kept
+
+        self.assertEqual(filtered_row["labels"], expected_labels)
+
+        row = {
+            "features": ["feature1", "feature2"],
+            "labels": [True, False, True, True],
+        }
+        self.assertEqual(self.module._filter_labels(row)["labels"], [False])
+
+    def test_filter_labels_no_filter(self) -> None:
+        """
+        Test the _filter_labels method with no label_filter index.
+        """
+        # Update the module to have no label filter
+        self.module.label_filter = None
+        row = {"features": ["feature1", "feature2"], "labels": [False, True]}
+        # Handle the case where the index is out of bounds
+        with self.assertRaises(TypeError):
+            self.module._filter_labels(row)
+
+    def test_filter_labels_invalid_index(self) -> None:
+        """
+        Test the _filter_labels method with an invalid label_filter index.
+        """
+        # Set an invalid label filter index (e.g., greater than the number of labels)
+        self.module.label_filter = 10
+        row = {"features": ["feature1", "feature2"], "labels": [False, True]}
+        # Handle the case where the index is out of bounds
+        with self.assertRaises(IndexError):
+            self.module._filter_labels(row)
+
+
+if __name__ == "__main__":
+    unittest.main()

From d8016aa6459548f8981c43473706a80c9748fca9 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 2 Sep 2024 00:25:58 +0200
Subject: [PATCH 022/112] test for DynamicDataset

---
 .../dataset_classes/testDynamicDataset.py     | 231 ++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 tests/unit/dataset_classes/testDynamicDataset.py

diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py
new file mode 100644
index 00000000..ae69952a
--- /dev/null
+++ b/tests/unit/dataset_classes/testDynamicDataset.py
@@ -0,0 +1,231 @@
+import unittest
+from typing import Tuple
+from unittest.mock import PropertyMock, patch
+
+import pandas as pd
+
+from chebai.preprocessing.datasets.base import _DynamicDataset
+from chebai.preprocessing.reader import ProteinDataReader
+
+
+class TestDynamicDataset(unittest.TestCase):
+    """
+    Test case for _DynamicDataset functionality, ensuring correct data splits and integrity
+    of train, validation, and test datasets.
+    """
+
+    @classmethod
+    @patch.multiple(_DynamicDataset, __abstractmethods__=frozenset())
+    @patch.object(_DynamicDataset, "base_dir", new_callable=PropertyMock)
+    @patch.object(_DynamicDataset, "_name", new_callable=PropertyMock)
+    def setUpClass(
+        cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock
+    ) -> None:
+        """
+        Set up a _DynamicDataset with mocked properties (mocks are injected bottom-up).
+        """
+
+        # Mocking properties
+        mock_base_dir_property.return_value = "MockedBaseDirProperty"
+        mock_name_property.return_value = "MockedNameProperty"
+
+        # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
+        _DynamicDataset.READER = ProteinDataReader
+
+        # Creating an instance of the dataset
+        cls.dataset: _DynamicDataset = _DynamicDataset()
+
+        # Dataset with a balanced distribution of labels
+        X = [
+            [1, 2],
+            [3, 4],
+            [5, 6],
+            [7, 8],
+            [9, 10],
+            [11, 12],
+            [13, 14],
+            [15, 16],
+            [17, 18],
+            [19, 20],
+            [21, 22],
+            [23, 24],
+            [25, 26],
+            [27, 28],
+            [29, 30],
+            [31, 32],
+        ]
+        y = [
+            [False, False],
+            [False, True],
+            [True, False],
+            [True, True],
+            [False, False],
+            [False, True],
+            [True, False],
+            [True, True],
+            [False, False],
+            [False, True],
+            [True, False],
+            [True, True],
+            [False, False],
+            [False, True],
+            [True, False],
+            [True, True],
+        ]
+        cls.df = pd.DataFrame(
+            {"ident": [f"id{i + 1}" for i in range(len(X))], "features": X, "labels": y}
+        )
+
+    def test_get_test_split_valid(self) -> None:
+        """
+        Test splitting the dataset into train and test sets and verify balance and non-overlap.
+        """
+        self.dataset.train_split = 0.5
+        # Test size will be 0.25 * 16 = 4
+        train_df, test_df = self.dataset.get_test_split(self.df, seed=0)
+
+        # Assert the correct number of rows in train and test sets
+        self.assertEqual(len(train_df), 12, "Train set should contain 12 samples.")
+        self.assertEqual(len(test_df), 4, "Test set should contain 4 samples.")
+
+        # Check positive and negative label counts in train and test sets
+        train_pos_count, train_neg_count = self.get_positive_negative_labels_counts(
+            train_df
+        )
+        test_pos_count, test_neg_count = self.get_positive_negative_labels_counts(
+            test_df
+        )
+
+        # Ensure that the train and test sets have balanced positives and negatives
+        self.assertEqual(
+            train_pos_count, train_neg_count, "Train set labels should be balanced."
+        )
+        self.assertEqual(
+            test_pos_count, test_neg_count, "Test set labels should be balanced."
+        )
+
+        # Assert there is no overlap between train and test sets
+        train_idents = set(train_df["ident"])
+        test_idents = set(test_df["ident"])
+        self.assertEqual(
+            len(train_idents.intersection(test_idents)),
+            0,
+            "Train and test sets should not overlap.",
+        )
+
+    def test_get_test_split_missing_labels(self) -> None:
+        """
+        Test the behavior when the 'labels' column is missing in the dataset.
+        """
+        df_missing_labels = pd.DataFrame({"ident": ["id1", "id2"]})
+        with self.assertRaises(
+            KeyError, msg="Expected KeyError when 'labels' column is missing."
+        ):
+            self.dataset.get_test_split(df_missing_labels)
+
+    def test_get_test_split_seed_consistency(self) -> None:
+        """
+        Test that splitting the dataset with the same seed produces consistent results.
+        """
+        train_df1, test_df1 = self.dataset.get_test_split(self.df, seed=42)
+        train_df2, test_df2 = self.dataset.get_test_split(self.df, seed=42)
+
+        pd.testing.assert_frame_equal(
+            train_df1,
+            train_df2,
+            obj="Train sets should be identical for the same seed.",
+        )
+        pd.testing.assert_frame_equal(
+            test_df1, test_df2, obj="Test sets should be identical for the same seed."
+        )
+
+    def test_get_train_val_splits_given_test(self) -> None:
+        """
+        Test splitting the dataset into train and validation sets and verify balance and non-overlap.
+        """
+        self.dataset.use_inner_cross_validation = False
+        self.dataset.train_split = 0.5
+        df_train_main, test_df = self.dataset.get_test_split(self.df, seed=0)
+        train_df, val_df = self.dataset.get_train_val_splits_given_test(
+            df_train_main, test_df, seed=42
+        )
+
+        # Ensure there is no overlap between train and test sets
+        train_idents = set(train_df["ident"])
+        test_idents = set(test_df["ident"])
+        self.assertEqual(
+            len(train_idents.intersection(test_idents)),
+            0,
+            "Train and test sets should not overlap.",
+        )
+
+        # Ensure there is no overlap between validation and test sets
+        val_idents = set(val_df["ident"])
+        self.assertEqual(
+            len(val_idents.intersection(test_idents)),
+            0,
+            "Validation and test sets should not overlap.",
+        )
+
+        # Ensure there is no overlap between train and validation sets
+        self.assertEqual(
+            len(train_idents.intersection(val_idents)),
+            0,
+            "Train and validation sets should not overlap.",
+        )
+
+        # Check positive and negative label counts in train and validation sets
+        train_pos_count, train_neg_count = self.get_positive_negative_labels_counts(
+            train_df
+        )
+        val_pos_count, val_neg_count = self.get_positive_negative_labels_counts(val_df)
+
+        # Ensure that the train and validation sets have balanced positives and negatives
+        self.assertEqual(
+            train_pos_count, train_neg_count, "Train set labels should be balanced."
+        )
+        self.assertEqual(
+            val_pos_count, val_neg_count, "Validation set labels should be balanced."
+        )
+
+    def test_get_train_val_splits_given_test_consistency(self) -> None:
+        """
+        Test that splitting the dataset into train and validation sets with the same seed produces consistent results.
+        """
+        test_df = self.df.iloc[12:]  # Assume rows 12 onward are for testing
+        train_df1, val_df1 = self.dataset.get_train_val_splits_given_test(
+            self.df, test_df, seed=42
+        )
+        train_df2, val_df2 = self.dataset.get_train_val_splits_given_test(
+            self.df, test_df, seed=42
+        )
+
+        pd.testing.assert_frame_equal(
+            train_df1,
+            train_df2,
+            obj="Train sets should be identical for the same seed.",
+        )
+        pd.testing.assert_frame_equal(
+            val_df1,
+            val_df2,
+            obj="Validation sets should be identical for the same seed.",
+        )
+
+    @staticmethod
+    def get_positive_negative_labels_counts(df: pd.DataFrame) -> Tuple[int, int]:
+        """
+        Count the number of True and False values within the labels column.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the 'labels' column.
+
+        Returns:
+            Tuple[int, int]: A tuple containing the counts of True and False values, respectively.
+        """
+        true_count = sum(sum(label) for label in df["labels"])
+        false_count = sum(len(label) - sum(label) for label in df["labels"])
+        return true_count, false_count
+
+
+if __name__ == "__main__":
+    unittest.main()

From 0c7c5b8fab7612bbcfc8c7feba8d07d7b797a3d9 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 2 Sep 2024 00:53:59 +0200
Subject: [PATCH 023/112] add relevant msg to each assert statement

---
 tests/unit/collators/testDefaultCollator.py   | 20 ++++-
 tests/unit/collators/testRaggedCollator.py    | 73 ++++++++++++++-----
 .../dataset_classes/testDynamicDataset.py     |  4 +-
 .../dataset_classes/testXYBaseDataModule.py   | 25 +++++--
 tests/unit/readers/testChemDataReader.py      | 33 +++++++--
 tests/unit/readers/testDataReader.py          |  6 +-
 tests/unit/readers/testDeepChemDataReader.py  | 31 ++++++--
 tests/unit/readers/testProteinDataReader.py   | 49 +++++++++++--
 tests/unit/readers/testSelfiesReader.py       | 43 +++++++++--
 9 files changed, 227 insertions(+), 57 deletions(-)

diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py
index 29b1cc91..73f09c75 100644
--- a/tests/unit/collators/testDefaultCollator.py
+++ b/tests/unit/collators/testDefaultCollator.py
@@ -27,13 +27,23 @@ def test_call_with_valid_data(self) -> None:
         ]
 
         result: XYData = self.collator(data)
-        self.assertIsInstance(result, XYData)
+        self.assertIsInstance(
+            result, XYData, "The result should be an instance of XYData."
+        )
 
         expected_x = ([1.0, 2.0], [3.0, 4.0])
         expected_y = ([True, False, True], [False, False, True])
 
-        self.assertEqual(result.x, expected_x)
-        self.assertEqual(result.y, expected_y)
+        self.assertEqual(
+            result.x,
+            expected_x,
+            "The feature data 'x' does not match the expected output.",
+        )
+        self.assertEqual(
+            result.y,
+            expected_y,
+            "The label data 'y' does not match the expected output.",
+        )
 
     def test_call_with_empty_data(self) -> None:
         """
@@ -45,7 +55,9 @@ def test_call_with_empty_data(self) -> None:
             self.collator(data)
 
         self.assertEqual(
-            str(context.exception), "not enough values to unpack (expected 2, got 0)"
+            str(context.exception),
+            "not enough values to unpack (expected 2, got 0)",
+            "The exception message for empty data is not as expected.",
         )
 
 
diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py
index 81947b47..d31776a6 100644
--- a/tests/unit/collators/testRaggedCollator.py
+++ b/tests/unit/collators/testRaggedCollator.py
@@ -40,20 +40,30 @@ def test_call_with_valid_data(self) -> None:
         )
         expected_lens_for_x = torch.tensor([2, 3, 1])
 
-        self.assertTrue(torch.equal(result.x, expected_x))
-        self.assertTrue(torch.equal(result.y, expected_y))
+        self.assertTrue(
+            torch.equal(result.x, expected_x),
+            "The feature tensor 'x' does not match the expected output.",
+        )
+        self.assertTrue(
+            torch.equal(result.y, expected_y),
+            "The label tensor 'y' does not match the expected output.",
+        )
         self.assertTrue(
             torch.equal(
                 result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
-            )
+            ),
+            "The mask tensor does not match the expected output.",
         )
         self.assertTrue(
             torch.equal(
                 result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
-            )
+            ),
+            "The lens tensor does not match the expected output.",
         )
         self.assertEqual(
-            result.additional_fields["idents"], ("sample1", "sample2", "sample3")
+            result.additional_fields["idents"],
+            ("sample1", "sample2", "sample3"),
+            "The identifiers do not match the expected output.",
         )
 
     def test_call_with_missing_entire_labels(self) -> None:
@@ -75,23 +85,35 @@ def test_call_with_missing_entire_labels(self) -> None:
         expected_mask_for_x = torch.tensor([[True, True], [True, False]])
         expected_lens_for_x = torch.tensor([2, 1])
 
-        self.assertTrue(torch.equal(result.x, expected_x))
-        self.assertTrue(torch.equal(result.y, expected_y))
+        self.assertTrue(
+            torch.equal(result.x, expected_x),
+            "The feature tensor 'x' does not match the expected output when labels are missing.",
+        )
+        self.assertTrue(
+            torch.equal(result.y, expected_y),
+            "The label tensor 'y' does not match the expected output when labels are missing.",
+        )
         self.assertTrue(
             torch.equal(
                 result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
-            )
+            ),
+            "The mask tensor does not match the expected output when labels are missing.",
         )
         self.assertTrue(
             torch.equal(
                 result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
-            )
+            ),
+            "The lens tensor does not match the expected output when labels are missing.",
         )
         self.assertEqual(
-            result.additional_fields["loss_kwargs"]["non_null_labels"], [0, 2]
+            result.additional_fields["loss_kwargs"]["non_null_labels"],
+            [0, 2],
+            "The non-null labels list does not match the expected output.",
         )
         self.assertEqual(
-            result.additional_fields["idents"], ("sample1", "sample2", "sample3")
+            result.additional_fields["idents"],
+            ("sample1", "sample2", "sample3"),
+            "The identifiers do not match the expected output when labels are missing.",
         )
 
     def test_call_with_none_in_labels(self) -> None:
@@ -115,20 +137,30 @@ def test_call_with_none_in_labels(self) -> None:
         )
         expected_lens_for_x = torch.tensor([2, 3, 1])
 
-        self.assertTrue(torch.equal(result.x, expected_x))
-        self.assertTrue(torch.equal(result.y, expected_y))
+        self.assertTrue(
+            torch.equal(result.x, expected_x),
+            "The feature tensor 'x' does not match the expected output when labels contain None.",
+        )
+        self.assertTrue(
+            torch.equal(result.y, expected_y),
+            "The label tensor 'y' does not match the expected output when labels contain None.",
+        )
         self.assertTrue(
             torch.equal(
                 result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
-            )
+            ),
+            "The mask tensor does not match the expected output when labels contain None.",
         )
         self.assertTrue(
             torch.equal(
                 result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
-            )
+            ),
+            "The lens tensor does not match the expected output when labels contain None.",
         )
         self.assertEqual(
-            result.additional_fields["idents"], ("sample1", "sample2", "sample3")
+            result.additional_fields["idents"],
+            ("sample1", "sample2", "sample3"),
+            "The identifiers do not match the expected output when labels contain None.",
         )
 
     def test_call_with_empty_data(self) -> None:
@@ -137,7 +169,9 @@ def test_call_with_empty_data(self) -> None:
         """
         data: List[Dict] = []
 
-        with self.assertRaises(Exception):
+        with self.assertRaises(
+            Exception, msg="Expected an Error when no data is provided"
+        ):
             self.collator(data)
 
     def test_process_label_rows(self) -> None:
@@ -152,7 +186,10 @@ def test_process_label_rows(self) -> None:
             [[True, False, False], [False, True, True], [True, False, False]]
         )
 
-        self.assertTrue(torch.equal(result, expected_output))
+        self.assertTrue(
+            torch.equal(result, expected_output),
+            "The processed label rows tensor does not match the expected output.",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py
index ae69952a..50b9287a 100644
--- a/tests/unit/dataset_classes/testDynamicDataset.py
+++ b/tests/unit/dataset_classes/testDynamicDataset.py
@@ -26,8 +26,8 @@ def setUpClass(
         """
 
         # Mocking properties
-        mock_base_dir_property.return_value = "MockedBaseDirProperty"
-        mock_name_property.return_value = "MockedNameProperty"
+        mock_base_dir_property.return_value = "MockedBaseDirPropertyDynamicDataset"
+        mock_name_property.return_value = "MockedNamePropertyDynamicDataset"
 
         # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
         _DynamicDataset.READER = ProteinDataReader
diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py
index d8aabc67..4c2d21dc 100644
--- a/tests/unit/dataset_classes/testXYBaseDataModule.py
+++ b/tests/unit/dataset_classes/testXYBaseDataModule.py
@@ -12,13 +12,13 @@ class TestXYBaseDataModule(unittest.TestCase):
 
     @classmethod
     @patch.object(XYBaseDataModule, "_name", new_callable=PropertyMock)
-    def setUpClass(cls, mock_name_property) -> None:
+    def setUpClass(cls, mock_name_property: PropertyMock) -> None:
         """
         Set up a base instance of XYBaseDataModule for testing.
         """
 
         # Mock the _name property of XYBaseDataModule
-        mock_name_property.return_value = "MockedXYBaseDataModule"
+        mock_name_property.return_value = "MockedNamePropXYBaseDataModule"
 
         # Assign a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
         XYBaseDataModule.READER = ProteinDataReader
@@ -41,13 +41,21 @@ def test_filter_labels_valid_index(self) -> None:
         filtered_row = self.module._filter_labels(row)
         expected_labels = [3]  # Only the label at index 1 should be kept
 
-        self.assertEqual(filtered_row["labels"], expected_labels)
+        self.assertEqual(
+            filtered_row["labels"],
+            expected_labels,
+            "The filtered labels do not match the expected labels.",
+        )
 
         row = {
             "features": ["feature1", "feature2"],
             "labels": [True, False, True, True],
         }
-        self.assertEqual(self.module._filter_labels(row)["labels"], [False])
+        self.assertEqual(
+            self.module._filter_labels(row)["labels"],
+            [False],
+            "The filtered labels for the boolean case do not match the expected labels.",
+        )
 
     def test_filter_labels_no_filter(self) -> None:
         """
@@ -57,7 +65,9 @@ def test_filter_labels_no_filter(self) -> None:
         self.module.label_filter = None
         row = {"features": ["feature1", "feature2"], "labels": [False, True]}
         # Handle the case where the index is out of bounds
-        with self.assertRaises(TypeError):
+        with self.assertRaises(
+            TypeError, msg="Expected a TypeError when no label filter is provided."
+        ):
             self.module._filter_labels(row)
 
     def test_filter_labels_invalid_index(self) -> None:
@@ -68,7 +78,10 @@ def test_filter_labels_invalid_index(self) -> None:
         self.module.label_filter = 10
         row = {"features": ["feature1", "feature2"], "labels": [False, True]}
         # Handle the case where the index is out of bounds
-        with self.assertRaises(IndexError):
+        with self.assertRaises(
+            IndexError,
+            msg="Expected an IndexError when the label filter index is out of bounds.",
+        ):
             self.module._filter_labels(row)
 
 
diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py
index 3d7b5e6f..fde8604f 100644
--- a/tests/unit/readers/testChemDataReader.py
+++ b/tests/unit/readers/testChemDataReader.py
@@ -27,7 +27,14 @@ def setUpClass(cls, mock_file: mock_open) -> None:
         """
         cls.reader = ChemDataReader(token_path="/mock/path")
         # After initializing, cls.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '(']
-        assert cls.reader.cache == ["C", "O", "N", "=", "1", "("]
+        assert cls.reader.cache == [
+            "C",
+            "O",
+            "N",
+            "=",
+            "1",
+            "(",
+        ], "Initial cache does not match expected values."
 
     def test_read_data(self) -> None:
         """
@@ -48,7 +55,11 @@ def test_read_data(self) -> None:
             EMBEDDING_OFFSET + len(self.reader.cache) + 1,  # [Mg-2]
         ]
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The output of _read_data does not match the expected tokenized values.",
+        )
 
     def test_read_data_with_new_token(self) -> None:
         """
@@ -62,12 +73,24 @@ def test_read_data_with_new_token(self) -> None:
         expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The output for new token '[H-]' does not match the expected values.",
+        )
 
         # Verify that '[H-]' was added to the cache
-        self.assertIn("[H-]", self.reader.cache)
+        self.assertIn(
+            "[H-]",
+            self.reader.cache,
+            "The new token '[H-]' was not added to the cache.",
+        )
         # Ensure it's at the correct index
-        self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token)
+        self.assertEqual(
+            self.reader.cache.index("[H-]"),
+            index_for_last_token,
+            "The new token '[H-]' was not added at the correct index in the cache.",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/readers/testDataReader.py b/tests/unit/readers/testDataReader.py
index 8a8af053..745c0ace 100644
--- a/tests/unit/readers/testDataReader.py
+++ b/tests/unit/readers/testDataReader.py
@@ -45,7 +45,11 @@ def test_to_data(self) -> None:
             "extra_key": "extra_value",
         }
 
-        self.assertEqual(self.reader.to_data(row), expected)
+        self.assertEqual(
+            self.reader.to_data(row),
+            expected,
+            "The to_data method did not process the input row as expected.",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py
index 23ac35d5..31a63dd1 100644
--- a/tests/unit/readers/testDeepChemDataReader.py
+++ b/tests/unit/readers/testDeepChemDataReader.py
@@ -27,7 +27,12 @@ def setUpClass(cls, mock_file: mock_open) -> None:
         """
         cls.reader = DeepChemDataReader(token_path="/mock/path")
         # After initializing, cls.reader.cache should now be set to ['C', 'O', 'c', ')']
-        assert cls.reader.cache == ["C", "O", "c", ")"]
+        assert cls.reader.cache == [
+            "C",
+            "O",
+            "c",
+            ")",
+        ], "Cache initialization did not match expected tokens."
 
     def test_read_data(self) -> None:
         """
@@ -58,7 +63,11 @@ def test_read_data(self) -> None:
             EMBEDDING_OFFSET + len(self.reader.cache) + 3,  # [Ni-2] (new token)
         ]
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not produce the expected tokenized output for the SMILES string.",
+        )
 
     def test_read_data_with_new_token(self) -> None:
         """
@@ -72,12 +81,24 @@ def test_read_data_with_new_token(self) -> None:
         expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not produce the expected output for a SMILES string with a new token.",
+        )
 
         # Verify that '[H-]' was added to the cache
-        self.assertIn("[H-]", self.reader.cache)
+        self.assertIn(
+            "[H-]",
+            self.reader.cache,
+            "The new token '[H-]' was not added to the cache as expected.",
+        )
         # Ensure it's at the correct index
-        self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token)
+        self.assertEqual(
+            self.reader.cache.index("[H-]"),
+            index_for_last_token,
+            "The new token '[H-]' was not added to the correct index in the cache.",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py
index 6e5f325c..c5bc5e9a 100644
--- a/tests/unit/readers/testProteinDataReader.py
+++ b/tests/unit/readers/testProteinDataReader.py
@@ -25,7 +25,14 @@ def setUpClass(cls, mock_file: mock_open) -> None:
         """
         cls.reader = ProteinDataReader(token_path="/mock/path")
         # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N']
-        assert cls.reader.cache == ["M", "K", "T", "F", "R", "N"]
+        assert cls.reader.cache == [
+            "M",
+            "K",
+            "T",
+            "F",
+            "R",
+            "N",
+        ], "Cache initialization did not match expected tokens."
 
     def test_read_data(self) -> None:
         """
@@ -44,7 +51,11 @@ def test_read_data(self) -> None:
             EMBEDDING_OFFSET + 5,  # N
         ]
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not produce the expected tokenized output.",
+        )
 
     def test_read_data_with_new_token(self) -> None:
         """
@@ -63,12 +74,22 @@ def test_read_data_with_new_token(self) -> None:
         ]
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not correctly handle a new token.",
+        )
 
         # Verify that 'Y' was added to the cache
-        self.assertIn("Y", self.reader.cache)
+        self.assertIn(
+            "Y", self.reader.cache, "The new token 'Y' was not added to the cache."
+        )
         # Ensure it's at the correct index
-        self.assertEqual(self.reader.cache.index("Y"), len(self.reader.cache) - 1)
+        self.assertEqual(
+            self.reader.cache.index("Y"),
+            len(self.reader.cache) - 1,
+            "The new token 'Y' was not added at the correct index in the cache.",
+        )
 
     def test_read_data_with_invalid_token(self) -> None:
         """
@@ -79,7 +100,11 @@ def test_read_data_with_invalid_token(self) -> None:
         with self.assertRaises(KeyError) as context:
             self.reader._read_data(raw_data)
 
-        self.assertIn("Invalid token 'Z' encountered", str(context.exception))
+        self.assertIn(
+            "Invalid token 'Z' encountered",
+            str(context.exception),
+            "The KeyError did not contain the expected message for an invalid token.",
+        )
 
     def test_read_data_with_empty_sequence(self) -> None:
         """
@@ -88,7 +113,11 @@ def test_read_data_with_empty_sequence(self) -> None:
         raw_data = ""
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, [])
+        self.assertEqual(
+            result,
+            [],
+            "The _read_data method did not return an empty list for an empty input sequence.",
+        )
 
     def test_read_data_with_repeated_tokens(self) -> None:
         """
@@ -99,7 +128,11 @@ def test_read_data_with_repeated_tokens(self) -> None:
         expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5  # All tokens are 'M'
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not correctly handle repeated tokens.",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py
index 019a0f59..411fc63b 100644
--- a/tests/unit/readers/testSelfiesReader.py
+++ b/tests/unit/readers/testSelfiesReader.py
@@ -26,8 +26,12 @@ def setUpClass(cls, mock_file: mock_open) -> None:
             mock_file: Mock object for file operations.
         """
         cls.reader = SelfiesReader(token_path="/mock/path")
-        # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[(']
-        assert cls.reader.cache == ["[C]", "[O]", "[=C]"]
+        # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[=C]']
+        assert cls.reader.cache == [
+            "[C]",
+            "[O]",
+            "[=C]",
+        ], "Cache initialization did not match expected tokens."
 
     def test_read_data(self) -> None:
         """
@@ -62,7 +66,11 @@ def test_read_data(self) -> None:
         ]
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not produce the expected tokenized output.",
+        )
 
     def test_read_data_with_new_token(self) -> None:
         """
@@ -76,12 +84,24 @@ def test_read_data_with_new_token(self) -> None:
         expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token]
 
         result = self.reader._read_data(raw_data)
-        self.assertEqual(result, expected_output)
+        self.assertEqual(
+            result,
+            expected_output,
+            "The _read_data method did not correctly handle a new token.",
+        )
 
         # Verify that '[H-1]' was added to the cache, "[H-]" translated to "[H-1]" in SELFIES
-        self.assertIn("[H-1]", self.reader.cache)
+        self.assertIn(
+            "[H-1]",
+            self.reader.cache,
+            "The new token '[H-1]' was not added to the cache.",
+        )
         # Ensure it's at the correct index
-        self.assertEqual(self.reader.cache.index("[H-1]"), index_for_last_token)
+        self.assertEqual(
+            self.reader.cache.index("[H-1]"),
+            index_for_last_token,
+            "The new token '[H-1]' was not added at the correct index in the cache.",
+        )
 
     def test_read_data_with_invalid_selfies(self) -> None:
         """
@@ -90,10 +110,17 @@ def test_read_data_with_invalid_selfies(self) -> None:
         raw_data = "[C][O][INVALID][N]"
 
         result = self.reader._read_data(raw_data)
-        self.assertIsNone(result)
+        self.assertIsNone(
+            result,
+            "The _read_data method did not return None for an invalid SELFIES string.",
+        )
 
         # Verify that the error count was incremented
-        self.assertEqual(self.reader.error_count, 1)
+        self.assertEqual(
+            self.reader.error_count,
+            1,
+            "The error count was not incremented for an invalid SELFIES string.",
+        )
 
 
 if __name__ == "__main__":

From c0aaeeaef84efa06b0a68879ddf3e0874c749138 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Wed, 4 Sep 2024 17:34:03 +0200
Subject: [PATCH 024/112] test data class for chebi ontology

---
 tests/unit/mock_data/ontology_mock_data.py | 146 +++++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 tests/unit/mock_data/ontology_mock_data.py

diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
new file mode 100644
index 00000000..27fd511f
--- /dev/null
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -0,0 +1,146 @@
+class ChebiMockOntology:
+    """
+    Nodes:
+    - CHEBI:12345 (Compound A)
+    - CHEBI:54321 (Compound B)
+    - CHEBI:67890 (Compound C)
+    - CHEBI:11111 (Compound D)
+    - CHEBI:22222 (Compound E)
+    - CHEBI:99999 (Compound F)
+    - CHEBI:77533 (Compound F, Obsolete node)
+    - CHEBI:77564 (Compound H, Obsolete node)
+    - CHEBI:88888 (Compound I)
+
+    Valid Edges:
+    - CHEBI:54321 -> CHEBI:12345
+    - CHEBI:67890 -> CHEBI:12345
+    - CHEBI:67890 -> CHEBI:88888
+    - CHEBI:11111 -> CHEBI:54321
+    - CHEBI:77564 -> CHEBI:54321 (Ignored due to obsolete status)
+    - CHEBI:22222 -> CHEBI:67890
+    - CHEBI:12345 -> CHEBI:99999
+    - CHEBI:77533 -> CHEBI:99999 (Ignored due to obsolete status)
+    """
+
+    @staticmethod
+    def get_nodes():
+        return {12345, 54321, 67890, 11111, 22222, 99999, 88888}
+
+    @staticmethod
+    def get_number_of_nodes():
+        return len(ChebiMockOntology.get_nodes())
+
+    @staticmethod
+    def get_edges_of_transitive_closure_graph():
+        return {
+            (54321, 12345),
+            (54321, 99999),
+            (67890, 12345),
+            (67890, 99999),
+            (67890, 88888),
+            (11111, 54321),
+            (11111, 12345),
+            (11111, 99999),
+            (22222, 67890),
+            (22222, 12345),
+            (22222, 99999),
+            (22222, 88888),
+            (12345, 99999),
+        }
+
+    @staticmethod
+    def get_number_of_transitive_edges():
+        return len(ChebiMockOntology.get_edges_of_transitive_closure_graph())
+
+    @staticmethod
+    def get_edges():
+        return {
+            (54321, 12345),
+            (67890, 12345),
+            (67890, 88888),
+            (11111, 54321),
+            (22222, 67890),
+            (12345, 99999),
+        }
+
+    @staticmethod
+    def get_number_of_edges():
+        return len(ChebiMockOntology.get_edges())
+
+    @staticmethod
+    def get_obsolete_nodes_ids():
+        return {77533, 77564}
+
+    @staticmethod
+    def get_raw_data():
+        # Create mock terms with a complex hierarchy, names, and SMILES strings
+        return """
+        [Term]
+        id: CHEBI:12345
+        name: Compound A
+        subset: 2_STAR
+        property_value: http://purl.obolibrary.org/obo/chebi/formula "C26H35ClN4O6S" xsd:string
+        property_value: http://purl.obolibrary.org/obo/chebi/charge "0" xsd:string
+        property_value: http://purl.obolibrary.org/obo/chebi/monoisotopicmass "566.19658" xsd:string
+        property_value: http://purl.obolibrary.org/obo/chebi/mass "567.099" xsd:string
+        property_value: http://purl.obolibrary.org/obo/chebi/inchikey "ROXPMFGZZQEKHB-IUKKYPGJSA-N" xsd:string
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1" xsd:string
+        property_value: http://purl.obolibrary.org/obo/chebi/inchi "InChI=1S/C26H35ClN4O6S/c1-16(2)28-26(34)30(5)14-23-17(3)13-31(18(4)15-32)25(33)21-7-6-8-22(24(21)37-23)29-38(35,36)20-11-9-19(27)10-12-20/h6-12,16-18,23,29,32H,13-15H2,1-5H3,(H,28,34)/t17-,18-,23+/m0/s1" xsd:string
+        xref: LINCS:LSM-20139
+        is_a: CHEBI:54321
+        is_a: CHEBI:67890
+
+        [Term]
+        id: CHEBI:54321
+        name: Compound B
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1O" xsd:string
+        is_a: CHEBI:11111
+        is_a: CHEBI:77564
+
+        [Term]
+        id: CHEBI:67890
+        name: Compound C
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1N" xsd:string
+        is_a: CHEBI:22222
+
+        [Term]
+        id: CHEBI:11111
+        name: Compound D
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1F" xsd:string
+
+        [Term]
+        id: CHEBI:22222
+        name: Compound E
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1Cl" xsd:string
+
+        [Term]
+        id: CHEBI:99999
+        name: Compound F
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1Br" xsd:string
+        is_a: CHEBI:12345
+
+        [Term]
+        id: CHEBI:77533
+        name: Compound G
+        is_a: CHEBI:99999
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=C1Br" xsd:string
+        is_obsolete: true
+
+        [Term]
+        id: CHEBI:77564
+        name: Compound H
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "CC=C1Br" xsd:string
+        is_obsolete: true
+
+        [Typedef]
+        id: has_major_microspecies_at_pH_7_3
+        name: has major microspecies at pH 7.3
+        is_cyclic: true
+        is_transitive: false
+
+        [Term]
+        id: CHEBI:88888
+        name: Compound I
+        property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1[Mg+]" xsd:string
+        is_a: CHEBI:67890
+        """

From 764216e91e032693b90b9044eccc2fb411fcfad5 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Wed, 4 Sep 2024 17:38:13 +0200
Subject: [PATCH 025/112] test for term callback + mock data changes

---
 .../dataset_classes/testChebiTermCallback.py  | 67 +++++++++++++
 tests/unit/mock_data/__init__.py              |  0
 tests/unit/mock_data/ontology_mock_data.py    | 98 +++++++++++++++----
 3 files changed, 144 insertions(+), 21 deletions(-)
 create mode 100644 tests/unit/dataset_classes/testChebiTermCallback.py
 create mode 100644 tests/unit/mock_data/__init__.py

diff --git a/tests/unit/dataset_classes/testChebiTermCallback.py b/tests/unit/dataset_classes/testChebiTermCallback.py
new file mode 100644
index 00000000..7b22d1a2
--- /dev/null
+++ b/tests/unit/dataset_classes/testChebiTermCallback.py
@@ -0,0 +1,67 @@
+import unittest
+from typing import Any, Dict
+
+import fastobo
+from fastobo.term import TermFrame
+
+from chebai.preprocessing.datasets.chebi import term_callback
+from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology
+
+
+class TestChebiTermCallback(unittest.TestCase):
+    """
+    Unit tests for the `term_callback` function used in processing ChEBI ontology terms.
+    """
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        """
+        Set up the test class by loading ChEBI term data and storing it in a dictionary
+        where keys are the term IDs and values are TermFrame instances.
+        """
+        cls.callback_input_data: Dict[int, TermFrame] = {
+            int(term_doc.id.local): term_doc
+            for term_doc in fastobo.loads(ChebiMockOntology.get_raw_data())
+            if term_doc and ":" in str(term_doc.id)
+        }
+
+    def test_process_valid_terms(self) -> None:
+        """
+        Test that `term_callback` correctly processes valid ChEBI terms.
+        """
+
+        expected_result: Dict[str, Any] = {
+            "id": 12345,
+            "parents": [54321, 67890],
+            "has_part": set(),
+            "name": "Compound A",
+            "smiles": "C1=CC=CC=C1",
+        }
+
+        actual_dict: Dict[str, Any] = term_callback(
+            self.callback_input_data.get(expected_result["id"])
+        )
+        self.assertEqual(
+            expected_result,
+            actual_dict,
+            msg="term_callback should correctly extract information from valid ChEBI terms.",
+        )
+
+    def test_skip_obsolete_terms(self) -> None:
+        """
+        Test that `term_callback` yields no term data for obsolete ChEBI terms.
+        """
+        # Each call contributes exactly one entry, so the list is never empty;
+        # assert instead that none of the collected results is truthy.
+        term_callback_output = [
+            term_callback(self.callback_input_data.get(ident))
+            for ident in ChebiMockOntology.get_obsolete_nodes_ids()
+        ]
+        self.assertFalse(
+            any(term_callback_output),
+            msg="The term_callback function should skip obsolete terms and not return any term data.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/mock_data/__init__.py b/tests/unit/mock_data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 27fd511f..11d5c9ce 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -1,5 +1,12 @@
+from typing import Set, Tuple
+
+
 class ChebiMockOntology:
     """
+    A mock ontology representing a simplified ChEBI (Chemical Entities of Biological Interest) structure.
+    This class is used for testing purposes and includes nodes and edges representing chemical compounds
+    and their relationships in a graph structure.
+
     Nodes:
     - CHEBI:12345 (Compound A)
     - CHEBI:54321 (Compound B)
@@ -7,7 +14,7 @@ class ChebiMockOntology:
     - CHEBI:11111 (Compound D)
     - CHEBI:22222 (Compound E)
     - CHEBI:99999 (Compound F)
-    - CHEBI:77533 (Compound F, Obsolete node)
+    - CHEBI:77533 (Compound G, Obsolete node)
     - CHEBI:77564 (Compound H, Obsolete node)
     - CHEBI:88888 (Compound I)
 
@@ -16,64 +23,113 @@ class ChebiMockOntology:
     - CHEBI:67890 -> CHEBI:12345
     - CHEBI:67890 -> CHEBI:88888
     - CHEBI:11111 -> CHEBI:54321
-    - CHEBI:77564 -> CHEBI:54321 (Ignored due to obsolete status)
     - CHEBI:22222 -> CHEBI:67890
     - CHEBI:12345 -> CHEBI:99999
-    - CHEBI:77533 -> CHEBI:99999 (Ignored due to obsolete status)
+
+    The class also includes methods to retrieve nodes, edges, and transitive closure of the graph.
     """
 
     @staticmethod
-    def get_nodes():
+    def get_nodes() -> Set[int]:
+        """
+        Get the set of valid node IDs in the mock ontology.
+
+        Returns:
+        - Set[int]: A set of integers representing the valid ChEBI node IDs.
+        """
         return {12345, 54321, 67890, 11111, 22222, 99999, 88888}
 
     @staticmethod
-    def get_number_of_nodes():
+    def get_number_of_nodes() -> int:
+        """
+        Get the number of valid nodes in the mock ontology.
+
+        Returns:
+        - int: The number of valid nodes.
+        """
         return len(ChebiMockOntology.get_nodes())
 
     @staticmethod
-    def get_edges_of_transitive_closure_graph():
+    def get_edges() -> Set[Tuple[int, int]]:
+        """
+        Get the set of valid edges in the mock ontology.
+
+        Returns:
+        - Set[Tuple[int, int]]: A set of tuples representing the directed edges
+          between ChEBI nodes.
+        """
         return {
             (54321, 12345),
-            (54321, 99999),
             (67890, 12345),
-            (67890, 99999),
             (67890, 88888),
             (11111, 54321),
-            (11111, 12345),
-            (11111, 99999),
             (22222, 67890),
-            (22222, 12345),
-            (22222, 99999),
-            (22222, 88888),
             (12345, 99999),
         }
 
     @staticmethod
-    def get_number_of_transitive_edges():
-        return len(ChebiMockOntology.get_edges_of_transitive_closure_graph())
+    def get_number_of_edges() -> int:
+        """
+        Get the number of valid edges in the mock ontology.
+
+        Returns:
+        - int: The number of valid edges.
+        """
+        return len(ChebiMockOntology.get_edges())
 
     @staticmethod
-    def get_edges():
+    def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]:
+        """
+        Get the set of edges derived from the transitive closure of the mock ontology graph.
+
+        Returns:
+        - Set[Tuple[int, int]]: A set of tuples representing the directed edges
+          in the transitive closure of the ChEBI graph.
+        """
         return {
             (54321, 12345),
+            (54321, 99999),
             (67890, 12345),
+            (67890, 99999),
             (67890, 88888),
             (11111, 54321),
+            (11111, 12345),
+            (11111, 99999),
             (22222, 67890),
+            (22222, 12345),
+            (22222, 99999),
+            (22222, 88888),
             (12345, 99999),
         }
 
     @staticmethod
-    def get_number_of_edges():
-        return len(ChebiMockOntology.get_edges())
+    def get_number_of_transitive_edges() -> int:
+        """
+        Get the number of edges in the transitive closure of the mock ontology graph.
+
+        Returns:
+        - int: The number of edges in the transitive closure graph.
+        """
+        return len(ChebiMockOntology.get_edges_of_transitive_closure_graph())
 
     @staticmethod
-    def get_obsolete_nodes_ids():
+    def get_obsolete_nodes_ids() -> Set[int]:
+        """
+        Get the set of obsolete node IDs in the mock ontology.
+
+        Returns:
+        - Set[int]: A set of integers representing the obsolete ChEBI node IDs.
+        """
         return {77533, 77564}
 
     @staticmethod
-    def get_raw_data():
-        # Create mock terms with a complex hierarchy, names, and SMILES strings
+    def get_raw_data() -> str:
+        """
+        Get the raw data representing the mock ontology in OBO format.
+
+        Returns:
+        - str: A string containing the raw OBO data for the mock ChEBI terms.
+        """
         return """
         [Term]
         id: CHEBI:12345

From 1dd8428bbfc46ebf5aa445cc542851cfd8df4f5a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 5 Sep 2024 20:10:35 +0200
Subject: [PATCH 026/112] test for chebidataextractor + changes in mock data

---
 .../dataset_classes/testChebiDataExtractor.py | 214 ++++++++++++++++++
 tests/unit/mock_data/ontology_mock_data.py    |  80 ++++++-
 2 files changed, 291 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit/dataset_classes/testChebiDataExtractor.py

diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py
new file mode 100644
index 00000000..cb52e68f
--- /dev/null
+++ b/tests/unit/dataset_classes/testChebiDataExtractor.py
@@ -0,0 +1,214 @@
+import unittest
+from unittest.mock import PropertyMock, mock_open, patch
+
+import networkx as nx
+import pandas as pd
+
+from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
+from chebai.preprocessing.reader import ChemDataReader
+from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology
+
+
+class TestChEBIDataExtractor(unittest.TestCase):
+
+    @classmethod
+    @patch.multiple(_ChEBIDataExtractor, __abstractmethods__=frozenset())
+    @patch.object(_ChEBIDataExtractor, "base_dir", new_callable=PropertyMock)
+    @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock)
+    def setUpClass(
+        cls, mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock
+    ) -> None:
+        """
+        Set up a base instance of _DynamicDataset for testing with mocked properties.
+        """
+
+        # Mocking properties
+        mock_base_dir_property.return_value = "MockedBaseDirPropertyChebiDataExtractor"
+        mock_name_property.return_value = "MockedNamePropertyChebiDataExtractor"
+
+        # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
+        _ChEBIDataExtractor.READER = ChemDataReader
+
+        # Creating an instance of the dataset
+        cls.extractor: _ChEBIDataExtractor = _ChEBIDataExtractor(
+            chebi_version=231, chebi_version_train=200
+        )
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=ChebiMockOntology.get_raw_data(),
+    )
+    def test_extract_class_hierarchy(self, mock_open):
+        # Mock the output of fastobo.loads
+        graph = self.extractor._extract_class_hierarchy("fake_path")
+
+        # Validate the graph structure
+        self.assertIsInstance(
+            graph, nx.DiGraph, "The result should be a directed graph."
+        )
+
+        # Check nodes
+        actual_nodes = set(graph.nodes)
+        self.assertEqual(
+            set(ChebiMockOntology.get_nodes()),
+            actual_nodes,
+            "The graph nodes do not match the expected nodes.",
+        )
+
+        # Check edges
+        actual_edges = set(graph.edges)
+        self.assertEqual(
+            ChebiMockOntology.get_edges_of_transitive_closure_graph(),
+            actual_edges,
+            "The graph edges do not match the expected edges.",
+        )
+
+        # Check number of nodes and edges
+        self.assertEqual(
+            ChebiMockOntology.get_number_of_nodes(),
+            len(actual_nodes),
+            "The number of nodes should match the actual number of nodes in the graph.",
+        )
+
+        self.assertEqual(
+            ChebiMockOntology.get_number_of_transitive_edges(),
+            len(actual_edges),
+            "The number of transitive edges should match the actual number of transitive edges in the graph.",
+        )
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=ChebiMockOntology.get_raw_data(),
+    )
+    @patch.object(
+        _ChEBIDataExtractor,
+        "select_classes",
+        return_value=ChebiMockOntology.get_nodes(),
+    )
+    def test_graph_to_raw_dataset(self, mock_open, mock_select_classes):
+        graph = self.extractor._extract_class_hierarchy("fake_path")
+        data_df = self.extractor._graph_to_raw_dataset(graph)
+
+        pd.testing.assert_frame_equal(
+            data_df,
+            ChebiMockOntology.get_data_in_dataframe(),
+            obj="The DataFrame should match the expected structure",
+        )
+
+    @patch(
+        "builtins.open", new_callable=mock_open, read_data=b"Mocktestdata"
+    )  # Mocking open as a binary file
+    @patch("pandas.read_pickle")
+    def test_load_dict(self, mock_open, mock_read_pickle):
+
+        # Mock the DataFrame returned by read_pickle
+        mock_df = pd.DataFrame(
+            {
+                "id": [12345, 67890, 11111, 54321],  # Corrected ID
+                "name": ["A", "B", "C", "D"],
+                "SMILES": ["C1CCCCC1", "O=C=O", "C1CC=CC1", "C[Mg+]"],
+                12345: [True, False, False, True],
+                67890: [False, True, True, False],
+                11111: [True, False, True, False],
+            }
+        )
+        mock_read_pickle.return_value = mock_df  # Mock the return value of read_pickle
+
+        # Call the actual function (with open correctly mocked)
+        generator = self.extractor._load_dict("data/tests")
+        result = list(generator)  # Collect all output from the generator
+
+        # Expected output for comparison
+        expected_result = [
+            {"features": "C1CCCCC1", "labels": [True, False, True], "ident": 12345},
+            {"features": "O=C=O", "labels": [False, True, False], "ident": 67890},
+            {"features": "C1CC=CC1", "labels": [False, True, True], "ident": 11111},
+            {
+                "features": "C[Mg+]",
+                "labels": [True, False, False],
+                "ident": 54321,
+            },  # Corrected ID
+        ]
+
+        # Assert if the result matches the expected output
+        self.assertEqual(
+            result,
+            expected_result,
+            "The loaded dictionary should match the expected structure.",
+        )
+
+    @patch("builtins.open", new_callable=mock_open)
+    @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock)
+    @patch.object(_ChEBIDataExtractor, "processed_dir_main", new_callable=PropertyMock)
+    @patch.object(
+        _ChEBIDataExtractor, "_chebi_version_train_obj", new_callable=PropertyMock
+    )
+    def test_setup_pruned_test_set(
+        self,
+        mock_chebi_version_train_obj,
+        mock_processed_dir_main,
+        mock_name_property,
+        mock_open_file,
+    ):
+        # Mock the content for the two open calls (original classes and new classes)
+        mock_orig_classes = "12345\n67890\n88888\n54321\n77777\n"
+        mock_new_classes = "12345\n67890\n99999\n77777\n"
+
+        # Use side_effect to simulate the two different file reads
+        mock_open_file.side_effect = [
+            mock_open(
+                read_data=mock_orig_classes
+            ).return_value,  # First open() for orig_classes
+            mock_open(
+                read_data=mock_new_classes
+            ).return_value,  # Second open() for new_classes
+        ]
+
+        # Mock the attributes used in the method
+        mock_processed_dir_main.return_value = "/mock/path/to/current"
+        mock_chebi_version_train_obj.return_value.processed_dir_main = (
+            "/mock/path/to/train"
+        )
+
+        # Mock DataFrame to simulate the test dataset
+        mock_df = pd.DataFrame(
+            {
+                "labels": [
+                    [
+                        True,
+                        False,
+                        True,
+                        False,
+                        True,
+                    ],  # First test instance labels (match orig_classes)
+                    [False, True, False, True, False],
+                ]  # Second test instance labels
+            }
+        )
+
+        # Call the method under test
+        pruned_df = self.extractor._setup_pruned_test_set(mock_df)
+
+        # Expected DataFrame labels after pruning (only "12345", "67890", "77777",and "99999" remain)
+        expected_labels = [[True, False, False, True], [False, True, False, False]]
+
+        # Check if the pruned DataFrame still has the same number of rows
+        self.assertEqual(
+            len(pruned_df),
+            len(mock_df),
+            "The pruned DataFrame should have the same number of rows.",
+        )
+
+        # Check that the labels are correctly pruned
+        for i in range(len(pruned_df)):
+            self.assertEqual(
+                pruned_df.iloc[i]["labels"],
+                expected_labels[i],
+                f"Row {i}'s labels should be pruned correctly.",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 11d5c9ce..61b4462a 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -1,4 +1,7 @@
-from typing import Set, Tuple
+from collections import OrderedDict
+from typing import List, Set, Tuple
+
+import pandas as pd
 
 
 class ChebiMockOntology:
@@ -30,14 +33,14 @@ class ChebiMockOntology:
     """
 
     @staticmethod
-    def get_nodes() -> Set[int]:
+    def get_nodes() -> List[int]:
         """
         Get the set of valid node IDs in the mock ontology.
 
         Returns:
         - Set[int]: A set of integers representing the valid ChEBI node IDs.
         """
-        return {12345, 54321, 67890, 11111, 22222, 99999, 88888}
+        return [11111, 12345, 22222, 54321, 67890, 88888, 99999]
 
     @staticmethod
     def get_number_of_nodes() -> int:
@@ -200,3 +203,74 @@ def get_raw_data() -> str:
         property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1[Mg+]" xsd:string
         is_a: CHEBI:67890
         """
+
+    @staticmethod
+    def get_data_in_dataframe():
+        data = OrderedDict(
+            id=[
+                12345,
+                54321,
+                67890,
+                11111,
+                22222,
+                99999,
+                88888,
+            ],
+            name=[
+                "Compound A",
+                "Compound B",
+                "Compound C",
+                "Compound D",
+                "Compound E",
+                "Compound F",
+                "Compound I",
+            ],
+            SMILES=[
+                "C1=CC=CC=C1",
+                "C1=CC=CC=C1O",
+                "C1=CC=CC=C1N",
+                "C1=CC=CC=C1F",
+                "C1=CC=CC=C1Cl",
+                "C1=CC=CC=C1Br",
+                "C1=CC=CC=C1[Mg+]",
+            ],
+            # Relationships {
+            #  12345: [11111, 54321, 22222, 67890],
+            #  67890: [22222],
+            #  99999: [67890, 11111, 54321, 22222, 12345],
+            #  54321: [11111],
+            #  88888: [22222, 67890]
+            #  11111: []
+            #  22222: []
+            # }
+            **{
+                # -row- [11111, 12345, 22222, 54321, 67890, 88888, 99999]
+                11111: [False, False, False, False, False, False, False],
+                12345: [True, True, True, True, True, False, False],
+                22222: [False, False, False, False, False, False, False],
+                54321: [True, False, False, True, False, False, False],
+                67890: [False, False, True, False, True, False, False],
+                88888: [False, False, True, False, True, True, False],
+                99999: [True, True, True, True, True, False, True],
+            }
+        )
+
+        data_df = pd.DataFrame(data)
+
+        # ------------- Code Approach -------
+        # ancestors_of_nodes = {}
+        # for parent, child in ChebiMockOntology.get_edges_of_transitive_closure_graph():
+        #     if child not in ancestors_of_nodes:
+        #         ancestors_of_nodes[child] = set()
+        #     if parent not in ancestors_of_nodes:
+        #         ancestors_of_nodes[parent] = set()
+        #     ancestors_of_nodes[child].add(parent)
+        #     ancestors_of_nodes[child].add(child)
+        #
+        # # For each node in the ontology, create a column to check if it's an ancestor of any other node or itself
+        # for node in ChebiMockOntology.get_nodes():
+        #     data_df[node] = data_df['id'].apply(
+        #         lambda x: (x == node) or (node in ancestors_of_nodes[x])
+        #     )
+
+        return data_df

From f3519b566410ef1d20f9020258bceabe57199f74 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 5 Sep 2024 21:13:59 +0200
Subject: [PATCH 027/112] mock reader for all + test_setup_pruned_test_set
 changes

---
 .../dataset_classes/testChebiDataExtractor.py | 78 +++++++++++--------
 .../dataset_classes/testDynamicDataset.py     | 25 +++---
 .../dataset_classes/testXYBaseDataModule.py   |  8 +-
 3 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py
index cb52e68f..0559e090 100644
--- a/tests/unit/dataset_classes/testChebiDataExtractor.py
+++ b/tests/unit/dataset_classes/testChebiDataExtractor.py
@@ -1,11 +1,10 @@
 import unittest
-from unittest.mock import PropertyMock, mock_open, patch
+from unittest.mock import MagicMock, PropertyMock, mock_open, patch
 
 import networkx as nx
 import pandas as pd
 
 from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
-from chebai.preprocessing.reader import ChemDataReader
 from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology
 
 
@@ -16,30 +15,39 @@ class TestChEBIDataExtractor(unittest.TestCase):
     @patch.object(_ChEBIDataExtractor, "base_dir", new_callable=PropertyMock)
     @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock)
     def setUpClass(
-        cls, mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock
+        cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock
     ) -> None:
         """
-        Set up a base instance of _DynamicDataset for testing with mocked properties.
+        Set up a base instance of _ChEBIDataExtractor for testing with mocked properties.
         """
-
         # Mocking properties
         mock_base_dir_property.return_value = "MockedBaseDirPropertyChebiDataExtractor"
         mock_name_property.return_value = "MockedNamePropertyChebiDataExtractor"
 
-        # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
-        _ChEBIDataExtractor.READER = ChemDataReader
+        # Mock Data Reader
+        ReaderMock = MagicMock()
+        ReaderMock.name.return_value = "MockedReader"
+        _ChEBIDataExtractor.READER = ReaderMock
 
-        # Creating an instance of the dataset
+        # Create an instance of the dataset
         cls.extractor: _ChEBIDataExtractor = _ChEBIDataExtractor(
             chebi_version=231, chebi_version_train=200
         )
 
+        # Mock instance for _chebi_version_train_obj
+        mock_train_obj = MagicMock()
+        mock_train_obj.processed_dir_main = "/mock/path/to/train"
+        cls.extractor._chebi_version_train_obj = mock_train_obj
+
     @patch(
         "builtins.open",
         new_callable=mock_open,
         read_data=ChebiMockOntology.get_raw_data(),
     )
-    def test_extract_class_hierarchy(self, mock_open):
+    def test_extract_class_hierarchy(self, mock_open: mock_open) -> None:
+        """
+        Test the extraction of class hierarchy and validate the structure of the resulting graph.
+        """
         # Mock the output of fastobo.loads
         graph = self.extractor._extract_class_hierarchy("fake_path")
 
@@ -87,22 +95,31 @@ def test_extract_class_hierarchy(self, mock_open):
         "select_classes",
         return_value=ChebiMockOntology.get_nodes(),
     )
-    def test_graph_to_raw_dataset(self, mock_open, mock_select_classes):
+    def test_graph_to_raw_dataset(
+        self, mock_select_classes: PropertyMock, mock_open: mock_open
+    ) -> None:
+        """
+        Test conversion of a graph to a raw dataset and compare it with the expected DataFrame.
+        """
         graph = self.extractor._extract_class_hierarchy("fake_path")
         data_df = self.extractor._graph_to_raw_dataset(graph)
 
         pd.testing.assert_frame_equal(
             data_df,
             ChebiMockOntology.get_data_in_dataframe(),
-            obj="The DataFrame should match the expected structure",
+            obj="The DataFrame should match the expected structure.",
         )
 
     @patch(
         "builtins.open", new_callable=mock_open, read_data=b"Mocktestdata"
     )  # Mocking open as a binary file
     @patch("pandas.read_pickle")
-    def test_load_dict(self, mock_open, mock_read_pickle):
-
+    def test_load_dict(
+        self, mock_read_pickle: PropertyMock, mock_open: mock_open
+    ) -> None:
+        """
+        Test loading data from a pickled file and verify the generator output.
+        """
         # Mock the DataFrame returned by read_pickle
         mock_df = pd.DataFrame(
             {
@@ -114,22 +131,21 @@ def test_load_dict(self, mock_open, mock_read_pickle):
                 11111: [True, False, True, False],
             }
         )
-        mock_read_pickle.return_value = mock_df  # Mock the return value of read_pickle
+        mock_read_pickle.return_value = mock_df
 
-        # Call the actual function (with open correctly mocked)
         generator = self.extractor._load_dict("data/tests")
-        result = list(generator)  # Collect all output from the generator
+        result = list(generator)
+
+        # Convert NumPy arrays to lists for comparison
+        for item in result:
+            item["labels"] = list(item["labels"])
 
         # Expected output for comparison
         expected_result = [
             {"features": "C1CCCCC1", "labels": [True, False, True], "ident": 12345},
             {"features": "O=C=O", "labels": [False, True, False], "ident": 67890},
             {"features": "C1CC=CC1", "labels": [False, True, True], "ident": 11111},
-            {
-                "features": "C[Mg+]",
-                "labels": [True, False, False],
-                "ident": 54321,
-            },  # Corrected ID
+            {"features": "C[Mg+]", "labels": [True, False, False], "ident": 54321},
         ]
 
         # Assert if the result matches the expected output
@@ -140,18 +156,15 @@ def test_load_dict(self, mock_open, mock_read_pickle):
         )
 
     @patch("builtins.open", new_callable=mock_open)
-    @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock)
     @patch.object(_ChEBIDataExtractor, "processed_dir_main", new_callable=PropertyMock)
-    @patch.object(
-        _ChEBIDataExtractor, "_chebi_version_train_obj", new_callable=PropertyMock
-    )
     def test_setup_pruned_test_set(
         self,
-        mock_chebi_version_train_obj,
-        mock_processed_dir_main,
-        mock_name_property,
-        mock_open_file,
-    ):
+        mock_processed_dir_main: PropertyMock,
+        mock_open_file: mock_open,
+    ) -> None:
+        """
+        Test the pruning of the test set to match classes in the training set.
+        """
         # Mock the content for the two open calls (original classes and new classes)
         mock_orig_classes = "12345\n67890\n88888\n54321\n77777\n"
         mock_new_classes = "12345\n67890\n99999\n77777\n"
@@ -168,9 +181,6 @@ def test_setup_pruned_test_set(
 
         # Mock the attributes used in the method
         mock_processed_dir_main.return_value = "/mock/path/to/current"
-        mock_chebi_version_train_obj.return_value.processed_dir_main = (
-            "/mock/path/to/train"
-        )
 
         # Mock DataFrame to simulate the test dataset
         mock_df = pd.DataFrame(
@@ -191,7 +201,7 @@ def test_setup_pruned_test_set(
         # Call the method under test
         pruned_df = self.extractor._setup_pruned_test_set(mock_df)
 
-        # Expected DataFrame labels after pruning (only "12345", "67890", "77777",and "99999" remain)
+        # Expected DataFrame labels after pruning (only "12345", "67890", "77777", and "99999" remain)
         expected_labels = [[True, False, False, True], [False, True, False, False]]
 
         # Check if the pruned DataFrame still has the same number of rows
diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py
index 50b9287a..1ff6c26d 100644
--- a/tests/unit/dataset_classes/testDynamicDataset.py
+++ b/tests/unit/dataset_classes/testDynamicDataset.py
@@ -1,11 +1,10 @@
 import unittest
 from typing import Tuple
-from unittest.mock import PropertyMock, patch
+from unittest.mock import MagicMock, PropertyMock, patch
 
 import pandas as pd
 
 from chebai.preprocessing.datasets.base import _DynamicDataset
-from chebai.preprocessing.reader import ProteinDataReader
 
 
 class TestDynamicDataset(unittest.TestCase):
@@ -29,8 +28,10 @@ def setUpClass(
         mock_base_dir_property.return_value = "MockedBaseDirPropertyDynamicDataset"
         mock_name_property.return_value = "MockedNamePropertyDynamicDataset"
 
-        # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
-        _DynamicDataset.READER = ProteinDataReader
+        # Mock Data Reader
+        ReaderMock = MagicMock()
+        ReaderMock.name.return_value = "MockedReader"
+        _DynamicDataset.READER = ReaderMock
 
         # Creating an instance of the dataset
         cls.dataset: _DynamicDataset = _DynamicDataset()
@@ -72,7 +73,7 @@ def setUpClass(
             [True, False],
             [True, True],
         ]
-        cls.df = pd.DataFrame(
+        cls.data_df = pd.DataFrame(
             {"ident": [f"id{i + 1}" for i in range(len(X))], "features": X, "labels": y}
         )
 
@@ -82,7 +83,7 @@ def test_get_test_split_valid(self) -> None:
         """
         self.dataset.train_split = 0.5
         # Test size will be 0.25 * 16 = 4
-        train_df, test_df = self.dataset.get_test_split(self.df, seed=0)
+        train_df, test_df = self.dataset.get_test_split(self.data_df, seed=0)
 
         # Assert the correct number of rows in train and test sets
         self.assertEqual(len(train_df), 12, "Train set should contain 12 samples.")
@@ -127,8 +128,8 @@ def test_get_test_split_seed_consistency(self) -> None:
         """
         Test that splitting the dataset with the same seed produces consistent results.
         """
-        train_df1, test_df1 = self.dataset.get_test_split(self.df, seed=42)
-        train_df2, test_df2 = self.dataset.get_test_split(self.df, seed=42)
+        train_df1, test_df1 = self.dataset.get_test_split(self.data_df, seed=42)
+        train_df2, test_df2 = self.dataset.get_test_split(self.data_df, seed=42)
 
         pd.testing.assert_frame_equal(
             train_df1,
@@ -145,7 +146,7 @@ def test_get_train_val_splits_given_test(self) -> None:
         """
         self.dataset.use_inner_cross_validation = False
         self.dataset.train_split = 0.5
-        df_train_main, test_df = self.dataset.get_test_split(self.df, seed=0)
+        df_train_main, test_df = self.dataset.get_test_split(self.data_df, seed=0)
         train_df, val_df = self.dataset.get_train_val_splits_given_test(
             df_train_main, test_df, seed=42
         )
@@ -192,12 +193,12 @@ def test_get_train_val_splits_given_test_consistency(self) -> None:
         """
         Test that splitting the dataset into train and validation sets with the same seed produces consistent results.
         """
-        test_df = self.df.iloc[12:]  # Assume rows 12 onward are for testing
+        test_df = self.data_df.iloc[12:]  # Assume rows 12 onward are for testing
         train_df1, val_df1 = self.dataset.get_train_val_splits_given_test(
-            self.df, test_df, seed=42
+            self.data_df, test_df, seed=42
         )
         train_df2, val_df2 = self.dataset.get_train_val_splits_given_test(
-            self.df, test_df, seed=42
+            self.data_df, test_df, seed=42
         )
 
         pd.testing.assert_frame_equal(
diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py
index 4c2d21dc..8e3575ab 100644
--- a/tests/unit/dataset_classes/testXYBaseDataModule.py
+++ b/tests/unit/dataset_classes/testXYBaseDataModule.py
@@ -1,8 +1,7 @@
 import unittest
-from unittest.mock import PropertyMock, patch
+from unittest.mock import MagicMock, PropertyMock, patch
 
 from chebai.preprocessing.datasets.base import XYBaseDataModule
-from chebai.preprocessing.reader import ProteinDataReader
 
 
 class TestXYBaseDataModule(unittest.TestCase):
@@ -21,7 +20,10 @@ def setUpClass(cls, mock_name_property: PropertyMock) -> None:
         mock_name_property.return_value = "MockedNamePropXYBaseDataModule"
 
         # Assign a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader)
-        XYBaseDataModule.READER = ProteinDataReader
+        # Mock Data Reader
+        ReaderMock = MagicMock()
+        ReaderMock.name.return_value = "MockedReader"
+        XYBaseDataModule.READER = ReaderMock
 
         # Initialize the module with a label_filter
         cls.module = XYBaseDataModule(

From fc0fd47389ea60a7573b4de7645c1a133816245d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 5 Sep 2024 21:43:10 +0200
Subject: [PATCH 028/112] fix for misalignment between x and y in RaggedCollator

- https://github.com/ChEB-AI/python-chebai/pull/48#issuecomment-2324393829
---
 tests/unit/collators/testRaggedCollator.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py
index d31776a6..d9ab2b1d 100644
--- a/tests/unit/collators/testRaggedCollator.py
+++ b/tests/unit/collators/testRaggedCollator.py
@@ -78,12 +78,15 @@ def test_call_with_missing_entire_labels(self) -> None:
 
         result: XYData = self.collator(data)
 
-        expected_x = torch.tensor([[1, 2], [6, 0]])
+        # https://github.com/ChEB-AI/python-chebai/pull/48#issuecomment-2324393829
+        expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
         expected_y = torch.tensor(
             [[True, False], [True, False]]
         )  # True -> 1, False -> 0
-        expected_mask_for_x = torch.tensor([[True, True], [True, False]])
-        expected_lens_for_x = torch.tensor([2, 1])
+        expected_mask_for_x = torch.tensor(
+            [[True, True, False], [True, True, True], [True, False, False]]
+        )
+        expected_lens_for_x = torch.tensor([2, 3, 1])
 
         self.assertTrue(
             torch.equal(result.x, expected_x),
@@ -110,6 +113,11 @@ def test_call_with_missing_entire_labels(self) -> None:
             [0, 2],
             "The non-null labels list does not match the expected output.",
         )
+        self.assertEqual(
+            len(result.additional_fields["loss_kwargs"]["non_null_labels"]),
+            result.y.shape[1],
+            "The length of non null labels list must match with target label variable size",
+        )
         self.assertEqual(
             result.additional_fields["idents"],
             ("sample1", "sample2", "sample3"),

From f7f163142c86480c08d31d9b686baba2eabcc81a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 6 Sep 2024 12:24:58 +0200
Subject: [PATCH 029/112] test for ChebiOverX

---
 tests/unit/dataset_classes/testChEBIOverX.py | 123 +++++++++++++++++++
 tests/unit/mock_data/ontology_mock_data.py   |  34 ++++-
 2 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/dataset_classes/testChEBIOverX.py

diff --git a/tests/unit/dataset_classes/testChEBIOverX.py b/tests/unit/dataset_classes/testChEBIOverX.py
new file mode 100644
index 00000000..78d85dd4
--- /dev/null
+++ b/tests/unit/dataset_classes/testChEBIOverX.py
@@ -0,0 +1,123 @@
+import unittest
+from unittest.mock import PropertyMock, mock_open, patch
+
+from chebai.preprocessing.datasets.chebi import ChEBIOverX
+from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology
+
+
+class TestChEBIOverX(unittest.TestCase):
+    @classmethod
+    @patch.multiple(ChEBIOverX, __abstractmethods__=frozenset())
+    @patch.object(ChEBIOverX, "processed_dir_main", new_callable=PropertyMock)
+    def setUpClass(cls, mock_processed_dir_main: PropertyMock) -> None:
+        """
+        Set up the ChEBIOverX instance with a mock processed directory path and a test graph.
+
+        Args:
+            mock_processed_dir_main (PropertyMock): Mocked property for the processed directory path.
+        """
+        mock_processed_dir_main.return_value = "/mock/processed_dir"
+        cls.chebi_extractor = ChEBIOverX(chebi_version=231)
+        cls.test_graph = ChebiMockOntology.get_transitively_closed_graph()
+
+    @patch("builtins.open", new_callable=mock_open)
+    def test_select_classes(self, mock_open_file: mock_open) -> None:
+        """
+        Test the select_classes method to ensure it correctly selects nodes based on the threshold.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to intercept file operations.
+        """
+        self.chebi_extractor.THRESHOLD = 3
+        selected_classes = self.chebi_extractor.select_classes(self.test_graph)
+
+        # Check if the returned selected classes match the expected list
+        expected_classes = sorted([11111, 22222, 67890])
+        self.assertListEqual(
+            selected_classes,
+            expected_classes,
+            "The selected classes do not match the expected output for the given threshold of 3.",
+        )
+
+        # Expected data as string
+        expected_lines = "\n".join(map(str, expected_classes)) + "\n"
+
+        # Extract the generator passed to writelines
+        written_generator = mock_open_file().writelines.call_args[0][0]
+        written_lines = "".join(written_generator)
+
+        # Ensure the data matches
+        self.assertEqual(
+            written_lines,
+            expected_lines,
+            "The written lines do not match the expected lines for the given threshold of 3.",
+        )
+
+    @patch("builtins.open", new_callable=mock_open)
+    def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None:
+        """
+        Test the select_classes method when no nodes meet the successor threshold.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to intercept file operations.
+        """
+        self.chebi_extractor.THRESHOLD = 5
+        selected_classes = self.chebi_extractor.select_classes(self.test_graph)
+
+        # Expected empty result
+        self.assertEqual(
+            selected_classes,
+            [],
+            "The selected classes list should be empty when no nodes meet the threshold of 5.",
+        )
+
+        # Expected data as string
+        expected_lines = ""
+
+        # Extract the generator passed to writelines
+        written_generator = mock_open_file().writelines.call_args[0][0]
+        written_lines = "".join(written_generator)
+
+        # Ensure the data matches
+        self.assertEqual(
+            written_lines,
+            expected_lines,
+            "The written lines do not match the expected lines when no nodes meet the threshold of 5.",
+        )
+
+    @patch("builtins.open", new_callable=mock_open)
+    def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None:
+        """
+        Test the select_classes method when all nodes meet the successor threshold.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to intercept file operations.
+        """
+        self.chebi_extractor.THRESHOLD = 0
+        selected_classes = self.chebi_extractor.select_classes(self.test_graph)
+
+        expected_classes = sorted(ChebiMockOntology.get_nodes())
+        # Check if the returned selected classes match the expected list
+        self.assertListEqual(
+            selected_classes,
+            expected_classes,
+            "The selected classes do not match the expected output when all nodes meet the threshold of 0.",
+        )
+
+        # Expected data as string
+        expected_lines = "\n".join(map(str, expected_classes)) + "\n"
+
+        # Extract the generator passed to writelines
+        written_generator = mock_open_file().writelines.call_args[0][0]
+        written_lines = "".join(written_generator)
+
+        # Ensure the data matches
+        self.assertEqual(
+            written_lines,
+            expected_lines,
+            "The written lines do not match the expected lines when all nodes meet the threshold of 0.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 61b4462a..e6c14a93 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -1,6 +1,7 @@
 from collections import OrderedDict
-from typing import List, Set, Tuple
+from typing import Dict, List, Set, Tuple
 
+import networkx as nx
 import pandas as pd
 
 
@@ -30,6 +31,18 @@ class ChebiMockOntology:
     - CHEBI:12345 -> CHEBI:99999
 
     The class also includes methods to retrieve nodes, edges, and transitive closure of the graph.
+
+    Visual Representation Graph with Valid Nodes and Edges:
+
+                                       22222
+                                        /
+                       11111         67890
+                         \\         /  \\
+                        54321     /    88888
+                           \\   /
+                           12345
+                             \\
+                            99999
     """
 
     @staticmethod
@@ -205,7 +218,7 @@ def get_raw_data() -> str:
         """
 
     @staticmethod
-    def get_data_in_dataframe():
+    def get_data_in_dataframe() -> pd.DataFrame:
         data = OrderedDict(
             id=[
                 12345,
@@ -274,3 +287,20 @@ def get_data_in_dataframe():
         #     )
 
         return data_df
+
+    @staticmethod
+    def get_transitively_closed_graph() -> nx.DiGraph:
+        """
+        Create a directed graph, compute its transitive closure, and return it.
+
+        Returns:
+            g (nx.DiGraph): A transitively closed directed graph.
+        """
+        g = nx.DiGraph()
+
+        for node in ChebiMockOntology.get_nodes():
+            g.add_node(node, **{"smiles": "test_smiles_placeholder"})
+
+        g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph())
+
+        return g

From bf45bb5360eceadf7f8fb7c651a42d8208de20ec Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 6 Sep 2024 13:52:12 +0200
Subject: [PATCH 030/112] test for ChEBIOverXPartial

---
 .../dataset_classes/testChebiOverXPartial.py  | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 tests/unit/dataset_classes/testChebiOverXPartial.py

diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py
new file mode 100644
index 00000000..c2515d75
--- /dev/null
+++ b/tests/unit/dataset_classes/testChebiOverXPartial.py
@@ -0,0 +1,108 @@
+import unittest
+from unittest.mock import mock_open, patch
+
+import networkx as nx
+
+from chebai.preprocessing.datasets.chebi import ChEBIOverXPartial
+from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology
+
+
+class TestChEBIOverX(unittest.TestCase):
+
+    @classmethod
+    @patch.multiple(ChEBIOverXPartial, __abstractmethods__=frozenset())
+    def setUpClass(cls) -> None:
+        """
+        Set up the ChEBIOverXPartial instance with a mock processed directory path and a test graph.
+        """
+        cls.chebi_extractor = ChEBIOverXPartial(top_class_id=11111, chebi_version=231)
+        cls.test_graph = ChebiMockOntology.get_transitively_closed_graph()
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=ChebiMockOntology.get_raw_data(),
+    )
+    def test_extract_class_hierarchy(self, mock_open: mock_open) -> None:
+        """
+        Test the extraction of class hierarchy and validate the structure of the resulting graph.
+        """
+        # Mock the output of fastobo.loads
+        self.chebi_extractor.top_class_id = 11111
+        graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path")
+
+        # Validate the graph structure
+        self.assertIsInstance(
+            graph, nx.DiGraph, "The result should be a directed graph."
+        )
+
+        # Check nodes
+        expected_nodes = {11111, 54321, 12345, 99999}
+        expected_edges = {
+            (54321, 12345),
+            (54321, 99999),
+            (11111, 54321),
+            (11111, 12345),
+            (11111, 99999),
+            (12345, 99999),
+        }
+        self.assertEqual(
+            set(graph.nodes),
+            expected_nodes,
+            f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.",
+        )
+
+        # Check edges
+        self.assertEqual(
+            expected_edges,
+            set(graph.edges),
+            "The graph edges do not match the expected edges.",
+        )
+
+        # Check number of nodes and edges
+        self.assertEqual(
+            len(graph.nodes),
+            len(expected_nodes),
+            "The number of nodes should match the actual number of nodes in the graph.",
+        )
+
+        self.assertEqual(
+            len(expected_edges),
+            len(graph.edges),
+            "The number of transitive edges should match the actual number of transitive edges in the graph.",
+        )
+
+        self.chebi_extractor.top_class_id = 22222
+        graph = self.chebi_extractor.extract_class_hierarchy("fake_path")
+
+        # Check nodes with top class as 22222
+        self.assertEqual(
+            set(graph.nodes),
+            {67890, 88888, 12345, 99999, 22222},
+            f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.",
+        )
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=ChebiMockOntology.get_raw_data(),
+    )
+    def test_extract_class_hierarchy_with_bottom_cls(
+        self, mock_open: mock_open
+    ) -> None:
+        """
+        Test the extraction of class hierarchy and validate the structure of the resulting graph.
+        """
+        self.chebi_extractor.top_class_id = 88888
+        graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path")
+
+        # Check nodes with top class as 88888
+        self.assertEqual(
+            set(graph.nodes),
+            {self.chebi_extractor.top_class_id},
+            f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 17bf5843df4ade5dde7264ee926cb7123cb97289 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 9 Sep 2024 11:26:58 +0200
Subject: [PATCH 031/112] Mock data for GOUniProt

---
 tests/unit/mock_data/ontology_mock_data.py | 459 ++++++++++++++++++++-
 1 file changed, 457 insertions(+), 2 deletions(-)

diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index e6c14a93..dbce56d2 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from collections import OrderedDict
 from typing import Dict, List, Set, Tuple
 
@@ -5,7 +6,115 @@
 import pandas as pd
 
 
-class ChebiMockOntology:
+class MockOntologyGraphData(ABC):
+    """
+    Abstract base class for mocking ontology graph data.
+
+    This class provides a set of static methods that must be implemented by subclasses
+    to return various elements of an ontology graph such as nodes, edges, and dataframes.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def get_nodes() -> List[int]:
+        """
+        Get a list of node IDs in the ontology graph.
+
+        Returns:
+            List[int]: A list of node IDs.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_number_of_nodes() -> int:
+        """
+        Get the number of nodes in the ontology graph.
+
+        Returns:
+            int: The total number of nodes.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_edges() -> Set[Tuple[int, int]]:
+        """
+        Get the set of edges in the ontology graph.
+
+        Returns:
+            Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_number_of_edges() -> int:
+        """
+        Get the number of edges in the ontology graph.
+
+        Returns:
+            int: The total number of edges.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]:
+        """
+        Get the set of edges in the transitive closure of the ontology graph.
+
+        Returns:
+            Set[Tuple[int, int]]: A set of tuples representing the transitive closure edges.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_number_of_transitive_edges() -> int:
+        """
+        Get the number of edges in the transitive closure of the ontology graph.
+
+        Returns:
+            int: The total number of transitive edges.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_obsolete_nodes_ids() -> Set[int]:
+        """
+        Get the set of obsolete node IDs in the ontology graph.
+
+        Returns:
+            Set[int]: A set of obsolete node IDs.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_transitively_closed_graph() -> nx.DiGraph:
+        """
+        Get the transitive closure of the ontology graph.
+
+        Returns:
+            nx.DiGraph: A directed graph representing the transitive closure of the ontology graph.
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_data_in_dataframe() -> pd.DataFrame:
+        """
+        Get the ontology data as a Pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing ontology data.
+        """
+        pass
+
+
+class ChebiMockOntology(MockOntologyGraphData):
     """
     A mock ontology representing a simplified ChEBI (Chemical Entities of Biological Interest) structure.
     This class is used for testing purposes and includes nodes and edges representing chemical compounds
@@ -265,7 +374,7 @@ def get_data_in_dataframe() -> pd.DataFrame:
                 67890: [False, False, True, False, True, False, False],
                 88888: [False, False, True, False, True, True, False],
                 99999: [True, True, True, True, True, False, True],
-            }
+            },
         )
 
         data_df = pd.DataFrame(data)
@@ -304,3 +413,349 @@ def get_transitively_closed_graph() -> nx.DiGraph:
         g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph())
 
         return g
+
+
+class GOUniProtMockData(MockOntologyGraphData):
+    """
+    A mock ontology representing a simplified version of the Gene Ontology (GO) structure with nodes and edges
+    representing GO terms and their relationships in a directed acyclic graph (DAG).
+
+    Nodes:
+        - GO_1
+        - GO_2
+        - GO_3
+        - GO_4
+        - GO_5
+        - GO_6
+
+    Edges (Parent-Child Relationships):
+        - GO_1 -> GO_2
+        - GO_1 -> GO_3
+        - GO_2 -> GO_4
+        - GO_2 -> GO_5
+        - GO_3 -> GO_4
+        - GO_4 -> GO_6
+
+    This mock ontology structure is useful for testing methods related to GO hierarchy, graph extraction, and transitive
+    closure operations.
+
+    The class also includes methods to retrieve nodes, edges, and transitive closure of the graph.
+
+    Visual Representation Graph with Valid Nodes and Edges:
+
+                                GO_1
+                               /    \\
+                             GO_2   GO_3
+                            /  \\    /
+                         GO_5   GO_4
+                                   \\
+                                   GO_6
+
+    Valid Swiss Proteins with mapping to valid GO ids
+    Swiss_Prot_1 -> GO_2, GO_3, GO_5
+    Swiss_Prot_2 -> GO_2, GO_5
+    """
+
+    @staticmethod
+    def get_nodes() -> List[int]:
+        """
+        Get a sorted list of node IDs.
+
+        Returns:
+            List[int]: A sorted list of node IDs in the ontology graph.
+        """
+        return sorted([1, 2, 3, 4, 5, 6])
+
+    @staticmethod
+    def get_number_of_nodes() -> int:
+        """
+        Get the total number of nodes in the ontology graph.
+
+        Returns:
+            int: The number of nodes.
+        """
+        return len(GOUniProtMockData.get_nodes())
+
+    @staticmethod
+    def get_edges() -> Set[Tuple[int, int]]:
+        """
+        Get the set of edges in the ontology graph.
+
+        Returns:
+            Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes.
+        """
+        return {(1, 2), (1, 3), (2, 4), (2, 5), (3, 4), (4, 6)}
+
+    @staticmethod
+    def get_number_of_edges() -> int:
+        """
+        Get the total number of edges in the ontology graph.
+
+        Returns:
+            int: The number of edges.
+        """
+        return len(GOUniProtMockData.get_edges())
+
+    @staticmethod
+    def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]:
+        """
+        Get the set of edges in the transitive closure of the ontology graph.
+
+        Returns:
+            Set[Tuple[int, int]]: A set of tuples representing edges in the transitive closure graph.
+        """
+        return {
+            (1, 2),
+            (1, 3),
+            (1, 4),
+            (1, 5),
+            (1, 6),
+            (2, 4),
+            (2, 5),
+            (2, 6),
+            (3, 4),
+            (3, 6),
+            (4, 6),
+        }
+
+    @staticmethod
+    def get_number_of_transitive_edges() -> int:
+        """
+        Get the total number of edges in the transitive closure graph.
+
+        Returns:
+            int: The number of transitive edges.
+        """
+        return len(GOUniProtMockData.get_edges_of_transitive_closure_graph())
+
+    @staticmethod
+    def get_obsolete_nodes_ids() -> Set[int]:
+        """
+        Get the set of obsolete node IDs in the ontology graph.
+
+        Returns:
+            Set[int]: A set of node IDs representing obsolete nodes.
+        """
+        return {7, 8}
+
+    @staticmethod
+    def get_GO_raw_data() -> str:
+        """
+        Get raw data in string format for GO ontology.
+
+        This data simulates a basic GO ontology in a format typically used for testing.
+
+        Returns:
+            str: The raw GO data in string format.
+        """
+        return """
+        [Term]
+        id: GO:0000001
+        name: GO_1
+        namespace: molecular_function
+        def: "OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function." [GOC:jl, PMID:12150913]
+        comment: This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function.
+        synonym: "ribosomal chaperone activity" EXACT []
+        xref: MetaCyc:BETAGALACTOSID-RXN
+        xref: Reactome:R-HSA-189062 "lactose + H2O => D-glucose + D-galactose"
+        xref: Reactome:R-HSA-5658001 "Defective LCT does not hydrolyze Lac"
+        xref: RHEA:10076
+
+        [Term]
+        id: GO:0000002
+        name: GO_2
+        namespace: biological_process
+        is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds
+
+        [Term]
+        id: GO:0000003
+        name: GO_3
+        namespace: cellular_component
+        is_a: GO:0000001 ! regulation of DNA recombination
+
+        [Term]
+        id: GO:0000004
+        name: GO_4
+        namespace: biological_process
+        is_a: GO:0000003 ! regulation of DNA recombination
+        is_a: GO:0000002 ! hydrolase activity, hydrolyzing O-glycosyl compounds
+
+        [Term]
+        id: GO:0000005
+        name: GO_5
+        namespace: molecular_function
+        is_a: GO:0000002 ! regulation of DNA recombination
+
+        [Term]
+        id: GO:0000006
+        name: GO_6
+        namespace: cellular_component
+        is_a: GO:0000004 ! glucoside transport
+
+        [Term]
+        id: GO:0000007
+        name: GO_7
+        namespace: biological_process
+        is_a: GO:0000003 ! glucoside transport
+        is_obsolete: true
+
+        [Term]
+        id: GO:0000008
+        name: GO_8
+        namespace: molecular_function
+        is_a: GO:0000001 ! glucoside transport
+        is_obsolete: true
+
+        [Typedef]
+        id: term_tracker_item
+        name: term tracker item
+        namespace: external
+        xref: IAO:0000233
+        is_metadata_tag: true
+        is_class_level: true
+        """
+
+    @staticmethod
+    def protein_sequences() -> Dict[str, str]:
+        """
+        Get the protein sequences for Swiss-Prot proteins.
+
+        Returns:
+            Dict[str, str]: A dictionary where keys are Swiss-Prot IDs and values are their respective sequences.
+        """
+        return {
+            "Swiss_Prot_1": "MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK".replace(
+                " ", ""
+            ),
+            "Swiss_Prot_2": "EKGLIVGHFS GIKYKGEKAQ ASEVDVNKMC CWVSKFKDAM RRYQGIQTCK".replace(
+                " ", ""
+            ),
+        }
+
+    @staticmethod
+    def get_UniProt_raw_data() -> str:
+        """
+        Get raw data in string format for UniProt proteins.
+
+        This mock data contains six Swiss-Prot proteins with different properties:
+        - Swiss_Prot_1 and Swiss_Prot_2 are valid proteins.
+        - Swiss_Prot_3 has a sequence length greater than 1002.
+        - Swiss_Prot_4 contains "X", a non-valid amino acid in its sequence.
+        - Swiss_Prot_5 has no GO IDs mapped to it.
+        - Swiss_Prot_6 has GO IDs mapped, but no evidence codes.
+
+        Returns:
+            str: The raw UniProt data in string format.
+        """
+        protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"]
+        protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"]
+        raw_str = (
+            f"ID   Swiss_Prot_1              Reviewed;         {len(protein_sq_1)} AA. \n"
+            + "AC   Q6GZX4;\n"
+            + "DR   GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n"
+            + "DR   GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n"
+            + "DR   GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n"
+            + "DR   GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n"
+            + f"SQ   SEQUENCE   {len(protein_sq_1)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            + f"     {protein_sq_1}\n"
+            + "//\n"
+            + f"ID   Swiss_Prot_2              Reviewed;         {len(protein_sq_2)} AA.\n"
+            + "AC   DCGZX4;\n"
+            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            + "DR   GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n"
+            + "DR   GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n"
+            + "DR   GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n"
+            + f"SQ   SEQUENCE   {len(protein_sq_2)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            + f"     {protein_sq_2}\n"
+            + "//\n"
+            + "ID   Swiss_Prot_3              Reviewed;         1165 AA.\n"
+            + "AC   Q6GZX4;\n"
+            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            + "DR   GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n"
+            + "DR   GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n"
+            + "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
+            + "SQ   SEQUENCE   1165 AA;  129118 MW;  FE2984658CED53A8 CRC64;\n"
+            + "     MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n"
+            + "     GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n"
+            + "     DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n"
+            + "     DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n"
+            + "     KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n"
+            + "     EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n"
+            + "     SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n"
+            + "     KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n"
+            + "     KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n"
+            + "     WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n"
+            + "     NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n"
+            + "     NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n"
+            + "     SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n"
+            + "     GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n"
+            + "     YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n"
+            + "     AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n"
+            + "     AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n"
+            + "     NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n"
+            + "     ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n"
+            + "     DVRVSGVMIY RASEGKPMQY VSLLM\n"
+            + "//\n"
+            + "ID   Swiss_Prot_4              Reviewed;         60 AA.\n"
+            + "AC   Q6GZX4;\n"
+            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            + "DR   GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n"
+            + "DR   GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n"
+            + "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
+            + "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            + "     XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            + "//\n"
+            + "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
+            + "AC   Q6GZX4;\n"
+            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            + "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            + "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            + "//\n"
+            + "ID   Swiss_Prot_6              Reviewed;         60 AA.\n"
+            + "AC   Q6GZX4;\n"
+            + "DR   GO; GO:0000005; P:regulation of viral transcription;\n"
+            + "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            + "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            + "//"
+        )
+
+        return raw_str
+
+    @staticmethod
+    def get_data_in_dataframe() -> pd.DataFrame:
+        """
+        Get a mock DataFrame representing UniProt data.
+
+        The DataFrame contains Swiss-Prot protein data, including identifiers, accessions, GO terms, sequences,
+        and binary label columns representing whether each protein is associated with certain GO classes.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing mock UniProt data with columns for 'swiss_id', 'accession', 'go_ids', 'sequence',
+                          and binary labels for GO classes.
+        """
+        expected_data = OrderedDict(
+            swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"],
+            accession=["Q6GZX4", "DCGZX4"],
+            go_ids=[[2, 3, 5], [2, 5]],
+            sequence=list(GOUniProtMockData.protein_sequences().values()),
+            **{
+                #   SP_1,  SP_2
+                1: [False, False],
+                2: [True, True],
+                3: [True, False],
+                4: [False, False],
+                5: [True, True],
+                6: [False, False],
+            },
+        )
+        return pd.DataFrame(expected_data)
+
+    @staticmethod
+    def get_transitively_closed_graph() -> nx.DiGraph:
+        """
+        Get the transitive closure of the ontology graph.
+
+        Returns:
+            nx.DiGraph: A directed graph representing the transitive closure of the ontology graph.
+        """
+        pass

From c6c5a59990b6933d785898d6001595a94a5396be Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 9 Sep 2024 11:27:26 +0200
Subject: [PATCH 032/112] test for GOUniProtDataExtractor

---
 .../testGOUniProDataExtractor.py              | 217 ++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 tests/unit/dataset_classes/testGOUniProDataExtractor.py

diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
new file mode 100644
index 00000000..7394405d
--- /dev/null
+++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
@@ -0,0 +1,217 @@
+import unittest
+from unittest.mock import MagicMock, PropertyMock, mock_open, patch
+
+import fastobo
+import networkx as nx
+import pandas as pd
+
+from chebai.preprocessing.datasets.go_uniprot import _GOUniProtDataExtractor
+from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData
+
+
+class TestGOUniProtDataExtractor(unittest.TestCase):
+    """
+    Unit tests for the _GOUniProtDataExtractor class.
+    """
+
+    @classmethod
+    @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset())
+    @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock)
+    @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock)
+    def setUpClass(
+        cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock
+    ) -> None:
+        """
+        Class setup for mocking abstract properties of _GOUniProtDataExtractor.
+        """
+        mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor"
+        mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor"
+        ReaderMock = MagicMock()
+        ReaderMock.name.return_value = "MockedReader"
+        _GOUniProtDataExtractor.READER = ReaderMock
+
+        cls.extractor = _GOUniProtDataExtractor()
+
+    def test_term_callback(self) -> None:
+        """
+        Test the term_callback method for correct parsing and filtering of GO terms.
+        """
+        self.extractor.go_branch = "all"
+        term_mapping = {}
+        for term in fastobo.loads(GOUniProtMockData.get_GO_raw_data()):
+            if isinstance(term, fastobo.typedef.TypedefFrame):
+                continue
+            term_mapping[self.extractor._parse_go_id(term.id)] = term
+
+        # Test individual term callback
+        term_dict = self.extractor.term_callback(term_mapping[4])
+        expected_dict = {"go_id": 4, "parents": [3, 2], "name": "GO_4"}
+        self.assertEqual(
+            term_dict,
+            expected_dict,
+            "The term_callback did not return the expected dictionary.",
+        )
+
+        # Test filtering valid terms
+        valid_terms_docs = set()
+        for term_id, term_doc in term_mapping.items():
+            if self.extractor.term_callback(term_doc):
+                valid_terms_docs.add(term_id)
+
+        self.assertEqual(
+            valid_terms_docs,
+            set(GOUniProtMockData.get_nodes()),
+            "The valid terms do not match expected nodes.",
+        )
+
+        # Test that obsolete terms are filtered out
+        self.assertFalse(
+            any(
+                self.extractor.term_callback(term_mapping[obs_id])
+                for obs_id in GOUniProtMockData.get_obsolete_nodes_ids()
+            ),
+            "Obsolete terms should not be present.",
+        )
+
+        # Test filtering by GO branch (e.g., BP)
+        self.extractor.go_branch = "BP"
+        BP_terms = {
+            term_id
+            for term_id, term in term_mapping.items()
+            if self.extractor.term_callback(term)
+        }
+        self.assertEqual(
+            BP_terms, {2, 4}, "The BP terms do not match the expected set."
+        )
+
+    @patch(
+        "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data())
+    )
+    def test_extract_class_hierarchy(self, mock_load) -> None:
+        """
+        Test the extraction of the class hierarchy from the ontology.
+        """
+        graph = self.extractor._extract_class_hierarchy("fake_path")
+
+        # Validate the graph structure
+        self.assertIsInstance(
+            graph, nx.DiGraph, "The result should be a directed graph."
+        )
+
+        # Check nodes
+        actual_nodes = set(graph.nodes)
+        self.assertEqual(
+            set(GOUniProtMockData.get_nodes()),
+            actual_nodes,
+            "The graph nodes do not match the expected nodes.",
+        )
+
+        # Check edges
+        actual_edges = set(graph.edges)
+        self.assertEqual(
+            GOUniProtMockData.get_edges_of_transitive_closure_graph(),
+            actual_edges,
+            "The graph edges do not match the expected edges.",
+        )
+
+        # Check number of nodes and edges
+        self.assertEqual(
+            GOUniProtMockData.get_number_of_nodes(),
+            len(actual_nodes),
+            "The number of nodes should match the actual number of nodes in the graph.",
+        )
+
+        self.assertEqual(
+            GOUniProtMockData.get_number_of_transitive_edges(),
+            len(actual_edges),
+            "The number of transitive edges should match the actual number of transitive edges in the graph.",
+        )
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=GOUniProtMockData.get_UniProt_raw_data(),
+    )
+    def test_get_swiss_to_go_mapping(self, mock_open) -> None:
+        """
+        Test the extraction of SwissProt to GO term mapping.
+        """
+        mapping_df = self.extractor._get_swiss_to_go_mapping()
+        expected_df = GOUniProtMockData.get_data_in_dataframe().iloc[:, :4]
+
+        pd.testing.assert_frame_equal(
+            mapping_df,
+            expected_df,
+            obj="The SwissProt to GO mapping DataFrame does not match the expected DataFrame.",
+        )
+
+    @patch(
+        "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data())
+    )
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=GOUniProtMockData.get_UniProt_raw_data(),
+    )
+    @patch.object(
+        _GOUniProtDataExtractor,
+        "select_classes",
+        return_value=GOUniProtMockData.get_nodes(),
+    )
+    def test_graph_to_raw_dataset(
+        self, mock_select_classes, mock_open, mock_load
+    ) -> None:
+        """
+        Test the conversion of the class hierarchy graph to a raw dataset.
+        """
+        graph = self.extractor._extract_class_hierarchy("fake_path")
+        actual_df = self.extractor._graph_to_raw_dataset(graph)
+        expected_df = GOUniProtMockData.get_data_in_dataframe()
+
+        pd.testing.assert_frame_equal(
+            actual_df,
+            expected_df,
+            obj="The raw dataset DataFrame does not match the expected DataFrame.",
+        )
+
+    @patch("builtins.open", new_callable=mock_open, read_data=b"Mocktestdata")
+    @patch("pandas.read_pickle")
+    def test_load_dict(
+        self, mock_read_pickle: PropertyMock, mock_open: mock_open
+    ) -> None:
+        """
+        Test the loading of the dictionary from a DataFrame.
+        """
+        mock_df = GOUniProtMockData.get_data_in_dataframe()
+        mock_read_pickle.return_value = mock_df
+
+        generator = self.extractor._load_dict("data/tests")
+        result = list(generator)
+
+        # Convert NumPy arrays to lists for comparison
+        for item in result:
+            item["labels"] = list(item["labels"])
+
+        # Expected output for comparison
+        expected_result = [
+            {
+                "features": mock_df["sequence"][0],
+                "labels": mock_df.iloc[0, 4:].to_list(),
+                "ident": mock_df["swiss_id"][0],
+            },
+            {
+                "features": mock_df["sequence"][1],
+                "labels": mock_df.iloc[1, 4:].to_list(),
+                "ident": mock_df["swiss_id"][1],
+            },
+        ]
+
+        self.assertEqual(
+            result,
+            expected_result,
+            "The loaded dictionary does not match the expected structure.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 427bc60a1e6d6d33a7fbfd7a7707224f3922a894 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 9 Sep 2024 12:29:32 +0200
Subject: [PATCH 033/112] update test to new method name
 _extract_class_hierarchy

---
 tests/unit/dataset_classes/testChebiOverXPartial.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py
index c2515d75..a8c53408 100644
--- a/tests/unit/dataset_classes/testChebiOverXPartial.py
+++ b/tests/unit/dataset_classes/testChebiOverXPartial.py
@@ -29,7 +29,7 @@ def test_extract_class_hierarchy(self, mock_open: mock_open) -> None:
         """
         # Mock the output of fastobo.loads
         self.chebi_extractor.top_class_id = 11111
-        graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path")
+        graph: nx.DiGraph = self.chebi_extractor._extract_class_hierarchy("fake_path")
 
         # Validate the graph structure
         self.assertIsInstance(
@@ -73,7 +73,7 @@ def test_extract_class_hierarchy(self, mock_open: mock_open) -> None:
         )
 
         self.chebi_extractor.top_class_id = 22222
-        graph = self.chebi_extractor.extract_class_hierarchy("fake_path")
+        graph = self.chebi_extractor._extract_class_hierarchy("fake_path")
 
         # Check nodes with top class as 22222
         self.assertEqual(
@@ -94,7 +94,7 @@ def test_extract_class_hierarchy_with_bottom_cls(
         Test the extraction of class hierarchy and validate the structure of the resulting graph.
         """
         self.chebi_extractor.top_class_id = 88888
-        graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path")
+        graph: nx.DiGraph = self.chebi_extractor._extract_class_hierarchy("fake_path")
 
         # Check nodes with top class as 88888
         self.assertEqual(

From c01ecde837227eb4c4e99afb95063aa58d7cb9cb Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 9 Sep 2024 13:12:22 +0200
Subject: [PATCH 034/112] test for GOUniProtOverX

---
 .../dataset_classes/testGoUniProtOverX.py     | 139 ++++++++++++++++++
 tests/unit/mock_data/ontology_mock_data.py    |   5 +-
 2 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/dataset_classes/testGoUniProtOverX.py

diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py
new file mode 100644
index 00000000..282091b5
--- /dev/null
+++ b/tests/unit/dataset_classes/testGoUniProtOverX.py
@@ -0,0 +1,139 @@
+import unittest
+from typing import List
+from unittest.mock import mock_open, patch
+
+import networkx as nx
+import pandas as pd
+
+from chebai.preprocessing.datasets.go_uniprot import _GOUniProtOverX
+from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData
+
+
+class TestGOUniProtOverX(unittest.TestCase):
+    @classmethod
+    @patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset())
+    def setUpClass(cls) -> None:
+        """
+        Set up the class for tests by initializing the extractor, graph, and input DataFrame.
+        """
+        cls.extractor = _GOUniProtOverX()
+        cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph()
+        cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[
+            :, :4
+        ]
+
+    @patch("builtins.open", new_callable=mock_open)
+    def test_select_classes(self, mock_open_file: mock_open) -> None:
+        """
+        Test the `select_classes` method to ensure it selects classes based on the threshold.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to intercept file operations.
+        """
+        # Set threshold for testing
+        self.extractor.THRESHOLD = 2
+        selected_classes: List[int] = self.extractor.select_classes(
+            self.test_graph, data_df=self.input_df
+        )
+
+        # Expected result: GO terms 1, 2, and 5 should be selected based on the threshold
+        expected_selected_classes: List[int] = sorted([1, 2, 5])
+
+        # Check if the selected classes are as expected
+        self.assertEqual(
+            selected_classes,
+            expected_selected_classes,
+            msg="The selected classes do not match the expected output for threshold 2.",
+        )
+
+        # Expected data as string
+        expected_lines: str = "\n".join(map(str, expected_selected_classes)) + "\n"
+
+        # Extract the generator passed to writelines
+        written_generator = mock_open_file().writelines.call_args[0][0]
+        written_lines: str = "".join(written_generator)
+
+        # Ensure the data matches
+        self.assertEqual(
+            written_lines,
+            expected_lines,
+            msg="The written lines do not match the expected lines for the given threshold of 2.",
+        )
+
+    @patch("builtins.open", new_callable=mock_open)
+    def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None:
+        """
+        Test the `select_classes` method when no nodes meet the successor threshold.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to intercept file operations.
+        """
+        self.extractor.THRESHOLD = 5
+        selected_classes: List[int] = self.extractor.select_classes(
+            self.test_graph, data_df=self.input_df
+        )
+
+        # Expected result: No classes should meet the threshold of 5
+        expected_selected_classes: List[int] = []
+
+        # Check if the selected classes are as expected
+        self.assertEqual(
+            selected_classes,
+            expected_selected_classes,
+            msg="The selected classes list should be empty when no nodes meet the threshold of 5.",
+        )
+
+        # Expected data as string
+        expected_lines: str = ""
+
+        # Extract the generator passed to writelines
+        written_generator = mock_open_file().writelines.call_args[0][0]
+        written_lines: str = "".join(written_generator)
+
+        # Ensure the data matches
+        self.assertEqual(
+            written_lines,
+            expected_lines,
+            msg="The written lines do not match the expected lines when no nodes meet the threshold of 5.",
+        )
+
+    @patch("builtins.open", new_callable=mock_open)
+    def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None:
+        """
+        Test the `select_classes` method when all nodes meet the successor threshold.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to intercept file operations.
+        """
+        self.extractor.THRESHOLD = 0
+        selected_classes: List[int] = self.extractor.select_classes(
+            self.test_graph, data_df=self.input_df
+        )
+
+        # Expected result: All nodes except those not referenced by any protein (4 and 6) should be selected
+        expected_classes: List[int] = sorted([1, 2, 3, 5])
+
+        # Check if the returned selected classes match the expected list
+        self.assertListEqual(
+            selected_classes,
+            expected_classes,
+            msg="The selected classes do not match the expected output when all nodes meet the threshold of 0.",
+        )
+
+        # Expected data as string
+        expected_lines: str = "\n".join(map(str, expected_classes)) + "\n"
+
+        # Extract the generator passed to writelines
+        written_generator = mock_open_file().writelines.call_args[0][0]
+        written_lines: str = "".join(written_generator)
+
+        # Ensure the data matches
+        self.assertEqual(
+            written_lines,
+            expected_lines,
+            msg="The written lines do not match the expected lines when all nodes meet the threshold of 0.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index dbce56d2..d516a7a0 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -758,4 +758,7 @@ def get_transitively_closed_graph() -> nx.DiGraph:
         Returns:
             nx.DiGraph: A directed graph representing the transitive closure of the ontology graph.
         """
-        pass
+        g = nx.DiGraph()
+        g.add_nodes_from(node for node in ChebiMockOntology.get_nodes())
+        g.add_edges_from(GOUniProtMockData.get_edges_of_transitive_closure_graph())
+        return g

From dfd084e6c49ef10d1f4c22388fe2c01217c8cde6 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 10 Sep 2024 15:21:24 +0200
Subject: [PATCH 035/112] test for _load_data_from_file for Tox21MolNet

---
 .../testGOUniProDataExtractor.py              |   2 +-
 tests/unit/dataset_classes/testTox21MolNet.py | 115 ++++++++++
 tests/unit/mock_data/tox_mock_data.py         | 201 ++++++++++++++++++
 3 files changed, 317 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/dataset_classes/testTox21MolNet.py
 create mode 100644 tests/unit/mock_data/tox_mock_data.py

diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
index 7394405d..1b60aa97 100644
--- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py
+++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
@@ -27,7 +27,7 @@ def setUpClass(
         mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor"
         mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor"
         ReaderMock = MagicMock()
-        ReaderMock.name.return_value = "MockedReader"
+        ReaderMock.name.return_value = "MockedReaderGOUniProtDataExtractor"
         _GOUniProtDataExtractor.READER = ReaderMock
 
         cls.extractor = _GOUniProtDataExtractor()
diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
new file mode 100644
index 00000000..3639f5d1
--- /dev/null
+++ b/tests/unit/dataset_classes/testTox21MolNet.py
@@ -0,0 +1,115 @@
+import os
+import unittest
+from typing import Dict, List
+from unittest.mock import MagicMock, mock_open, patch
+
+import torch
+from sklearn.model_selection import GroupShuffleSplit
+
+from chebai.preprocessing.datasets.tox21 import Tox21MolNet
+from tests.unit.mock_data.tox_mock_data import Tox21MockData
+
+
+class TestTox21MolNet(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        """Initialize a Tox21MolNet instance for testing."""
+        ReaderMock = MagicMock()
+        ReaderMock.name.return_value = "MockedReaderTox21MolNet"
+        Tox21MolNet.READER = ReaderMock
+        cls.data_module = Tox21MolNet()
+        # cls.data_module.raw_dir = "/mock/raw_dir"
+        # cls.data_module.processed_dir = "/mock/processed_dir"
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=Tox21MockData.get_raw_data(),
+    )
+    def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
+        """
+        Test the `_load_data_from_file` method for correct CSV parsing.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to simulate file reading.
+        """
+        expected_data = Tox21MockData.get_processed_data()
+        actual_data = self.data_module._load_data_from_file("fake/file/path.csv")
+
+        self.assertEqual(
+            list(actual_data),
+            expected_data,
+            "The loaded data does not match the expected output.",
+        )
+
+    @patch.object(
+        Tox21MolNet,
+        "_load_data_from_file",
+        return_value=Tox21MockData.get_processed_data(),
+    )
+    @patch("torch.save")
+    def test_setup_processed_simple_split(
+        self, mock_load_data: MagicMock, mock_torch_save: MagicMock
+    ) -> None:
+        """
+        Test the `setup_processed` method for basic data splitting and saving.
+
+        Args:
+            mock_load_data (MagicMock): Despite its name, receives the `torch.save` mock (the innermost patch is injected first).
+            mock_torch_save (MagicMock): Despite its name, receives the `_load_data_from_file` mock (the outermost patch is injected last).
+        """
+        self.data_module.setup_processed()
+
+        # # Check that torch.save was called for train, test, and validation splits
+        # self.assertEqual(
+        #     mock_torch_save.call_count,
+        #     3,
+        #     "torch.save should have been called exactly three times for train, test, and validation splits."
+        # )
+
+    # @patch("os.path.isfile", return_value=False)
+    # @patch.object(Tox21MolNet,
+    #               "_load_data_from_file",
+    #               return_value= Tox21MockData.get_processed_grouped_data())
+    # @patch("torch.save")
+    # @patch("torch.load")
+    # @patch("chebai.preprocessing.datasets.tox21.GroupShuffleSplit")
+    # def test_setup_processed_group_split(
+    #         self,
+    #         mock_group_split: MagicMock,
+    #         mock_torch_load: MagicMock,
+    #         mock_save: MagicMock,
+    #         mock_load_data: MagicMock,
+    #         mock_isfile: MagicMock
+    # ) -> None:
+    #     """
+    #     Test the `setup_processed` method for group-based data splitting and saving.
+    #
+    #     Args:
+    #         mock_save (MagicMock): Mocked `torch.save` function to avoid file writes.
+    #         mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data.
+    #         mock_isfile (MagicMock): Mocked `os.path.isfile` function to simulate file presence.
+    #         mock_group_split (MagicMock): Mocked `GroupShuffleSplit` to control data splitting behavior.
+    #     """
+    #     mock_group_split.return_value = GroupShuffleSplit(n_splits=1, train_size=0.7)
+    #     self.data_module.setup_processed()
+    #
+    #     # Load the test split
+    #     test_split_path = os.path.join(self.data_module.processed_dir, "test.pt")
+    #     test_split = torch.load(test_split_path)
+    #
+    #     # Check if torch.save was called with correct arguments
+    #     mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/test.pt")
+    #     mock_save.assert_any_call([mock_data[0]], "/mock/processed_dir/train.pt")
+    #     mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/validation.pt")
+    #     # Check that torch.save was called for train, test, and validation splits
+    #     self.assertEqual(
+    #         mock_torch_save.call_count,
+    #         3,
+    #         "torch.save should have been called exactly three times for train, test, and validation splits."
+    #     )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py
new file mode 100644
index 00000000..912d172c
--- /dev/null
+++ b/tests/unit/mock_data/tox_mock_data.py
@@ -0,0 +1,201 @@
+class Tox21MockData:
+    """
+    A utility class providing mock data for testing the Tox21MolNet dataset.
+
+    This class includes static methods that return mock data in various formats, simulating
+    the raw and processed data of the Tox21MolNet dataset. The mock data is used for unit tests
+    to verify the functionality of methods within the Tox21MolNet class without relying on actual
+    data files.
+    """
+
+    @staticmethod
+    def get_raw_data() -> str:
+        """
+        Returns a raw CSV string that simulates the raw data of the Tox21MolNet dataset.
+        """
+        return (
+            "NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,"
+            + "mol_id,smiles\n"
+            + "0,0,1,0,1,1,0,1,0,,1,0,TOX958,Nc1ccc([N+](=O)[O-])cc1N\n"
+            + ",,,,,,,,,1,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S\n"
+            + "0,0,0,0,0,0,0,,0,0,0,0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C\n"
+            + "0,0,0,0,0,0,0,0,0,0,0,0,TOX6619,O=S(=O)(Cl)c1ccccc1\n"
+            + "0,0,0,,0,0,,,0,,1,,TOX27679,CCCCCc1ccco1\n"
+            + "0,,1,,,,0,,1,1,1,1,TOX2801,Oc1c(Cl)cc(Cl)c2cccnc12\n"
+            + "0,0,0,0,,0,,,0,0,,1,TOX2808,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21\n"
+            + "0,,0,1,,,,1,0,,1,,TOX29085,CCCCCCCCCCCCCCn1cc[n+](C)c1\n"
+        )
+
+    @staticmethod
+    def get_processed_data() -> list:
+        """
+        Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset.
+        Each dictionary contains 'ident', 'features', and 'labels'.
+        """
+        return [
+            {
+                "ident": "TOX958",
+                "features": "Nc1ccc([N+](=O)[O-])cc1N",
+                "labels": [
+                    False,
+                    False,
+                    True,
+                    False,
+                    True,
+                    True,
+                    False,
+                    True,
+                    False,
+                    None,
+                    True,
+                    False,
+                ],
+            },
+            {
+                "ident": "TOX31681",
+                "features": "Nc1cc(C(F)(F)F)ccc1S",
+                "labels": [
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    True,
+                    None,
+                    None,
+                ],
+            },
+            {
+                "ident": "TOX5110",
+                "features": "CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C",
+                "labels": [
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    None,
+                    False,
+                    False,
+                    False,
+                    False,
+                ],
+            },
+            {
+                "ident": "TOX6619",
+                "features": "O=S(=O)(Cl)c1ccccc1",
+                "labels": [
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                    False,
+                ],
+            },
+            {
+                "ident": "TOX27679",
+                "features": "CCCCCc1ccco1",
+                "labels": [
+                    False,
+                    False,
+                    False,
+                    None,
+                    False,
+                    False,
+                    None,
+                    None,
+                    False,
+                    None,
+                    True,
+                    None,
+                ],
+            },
+            {
+                "ident": "TOX2801",
+                "features": "Oc1c(Cl)cc(Cl)c2cccnc12",
+                "labels": [
+                    False,
+                    None,
+                    True,
+                    None,
+                    None,
+                    None,
+                    False,
+                    None,
+                    True,
+                    True,
+                    True,
+                    True,
+                ],
+            },
+            {
+                "ident": "TOX2808",
+                "features": "CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21",
+                "labels": [
+                    False,
+                    False,
+                    False,
+                    False,
+                    None,
+                    False,
+                    None,
+                    None,
+                    False,
+                    False,
+                    None,
+                    True,
+                ],
+            },
+            {
+                "ident": "TOX29085",
+                "features": "CCCCCCCCCCCCCCn1cc[n+](C)c1",
+                "labels": [
+                    False,
+                    None,
+                    False,
+                    True,
+                    None,
+                    None,
+                    None,
+                    True,
+                    False,
+                    None,
+                    True,
+                    None,
+                ],
+            },
+        ]
+
+    @staticmethod
+    def get_processed_grouped_data():
+        """
+        Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset.
+        Each dictionary contains 'ident', 'features', and 'labels'.
+        """
+        processed_data = Tox21MockData.get_processed_data()
+        groups = ["A", "A", "B", "B", "C", "C", "C", "C"]
+
+        assert len(processed_data) == len(
+            groups
+        ), "The number of processed data entries does not match the number of groups."
+
+        # Combine processed data with their corresponding groups
+        grouped_data = [
+            {**data, "group": group, "original": True}
+            for data, group in zip(processed_data, groups)
+        ]
+
+        return grouped_data

From 77956d473b88f71cc0fa7b262da9b595849fa92e Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 16 Sep 2024 13:06:47 +0200
Subject: [PATCH 036/112] _load_data_from_file test case Tox21Challenge

---
 .../dataset_classes/testTox21Challenge.py     |  43 ++++
 tests/unit/dataset_classes/testTox21MolNet.py |  10 +-
 tests/unit/mock_data/ontology_mock_data.py    | 132 +++++------
 tests/unit/mock_data/tox_mock_data.py         | 214 +++++++++++++++++-
 4 files changed, 317 insertions(+), 82 deletions(-)
 create mode 100644 tests/unit/dataset_classes/testTox21Challenge.py

diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py
new file mode 100644
index 00000000..4b23c487
--- /dev/null
+++ b/tests/unit/dataset_classes/testTox21Challenge.py
@@ -0,0 +1,43 @@
+import os
+import unittest
+from unittest.mock import MagicMock, mock_open, patch
+
+from rdkit import Chem
+
+from chebai.preprocessing.datasets.tox21 import Tox21Challenge
+from chebai.preprocessing.reader import ChemDataReader
+from tests.unit.mock_data.tox_mock_data import Tox21ChallengeMockData
+
+
+class TestTox21Challenge(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        """
+        Set up the Tox21Challenge instance and mock data for testing.
+        """
+        Tox21Challenge.READER = ChemDataReader
+        cls.tox21 = Tox21Challenge()
+
+    @patch("rdkit.Chem.SDMolSupplier")
+    def test_load_data_from_file(self, mock_sdmol_supplier) -> None:
+        """
+        Test the _load_data_from_file method to ensure it correctly loads data from an SDF file.
+        """
+        # Use ForwardSDMolSupplier to read the mock data from the binary string
+        mock_file = mock_open(read_data=Tox21ChallengeMockData.get_raw_train_data())
+        with patch("builtins.open", mock_file):
+            with open(
+                r"G:\github-aditya0by0\chebai_data\tox21_challenge\tox21_10k_data_all.sdf\tox21_10k_data_all.sdf",
+                "rb",
+            ) as f:
+                suppl = Chem.ForwardSDMolSupplier(f)
+
+        mock_sdmol_supplier.return_value = suppl
+
+        actual_data = self.tox21._load_data_from_file("fake/path")
+        self.assertEqual(Tox21ChallengeMockData.data_in_dict_format(), actual_data)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
index 3639f5d1..0a2d67b1 100644
--- a/tests/unit/dataset_classes/testTox21MolNet.py
+++ b/tests/unit/dataset_classes/testTox21MolNet.py
@@ -7,7 +7,7 @@
 from sklearn.model_selection import GroupShuffleSplit
 
 from chebai.preprocessing.datasets.tox21 import Tox21MolNet
-from tests.unit.mock_data.tox_mock_data import Tox21MockData
+from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData
 
 
 class TestTox21MolNet(unittest.TestCase):
@@ -25,7 +25,7 @@ def setUpClass(cls) -> None:
     @patch(
         "builtins.open",
         new_callable=mock_open,
-        read_data=Tox21MockData.get_raw_data(),
+        read_data=Tox21MolNetMockData.get_raw_data(),
     )
     def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
         """
@@ -34,7 +34,7 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
         Args:
             mock_open_file (mock_open): Mocked open function to simulate file reading.
         """
-        expected_data = Tox21MockData.get_processed_data()
+        expected_data = Tox21MolNetMockData.get_processed_data()
         actual_data = self.data_module._load_data_from_file("fake/file/path.csv")
 
         self.assertEqual(
@@ -46,7 +46,7 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
     @patch.object(
         Tox21MolNet,
         "_load_data_from_file",
-        return_value=Tox21MockData.get_processed_data(),
+        return_value=Tox21MolNetMockData.get_processed_data(),
     )
     @patch("torch.save")
     def test_setup_processed_simple_split(
@@ -71,7 +71,7 @@ def test_setup_processed_simple_split(
     # @patch("os.path.isfile", return_value=False)
     # @patch.object(Tox21MolNet,
     #               "_load_data_from_file",
-    #               return_value= Tox21MockData.get_processed_grouped_data())
+    #               return_value= Tox21MolNetMockData.get_processed_grouped_data())
     # @patch("torch.save")
     # @patch("torch.load")
     # @patch("chebai.preprocessing.datasets.tox21.GroupShuffleSplit")
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index d516a7a0..478a2bbb 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -651,72 +651,72 @@ def get_UniProt_raw_data() -> str:
         protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"]
         raw_str = (
             f"ID   Swiss_Prot_1              Reviewed;         {len(protein_sq_1)} AA. \n"
-            + "AC   Q6GZX4;\n"
-            + "DR   GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n"
-            + "DR   GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n"
-            + "DR   GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n"
-            + "DR   GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n"
-            + f"SQ   SEQUENCE   {len(protein_sq_1)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            + f"     {protein_sq_1}\n"
-            + "//\n"
-            + f"ID   Swiss_Prot_2              Reviewed;         {len(protein_sq_2)} AA.\n"
-            + "AC   DCGZX4;\n"
-            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
-            + "DR   GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n"
-            + "DR   GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n"
-            + "DR   GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n"
-            + f"SQ   SEQUENCE   {len(protein_sq_2)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            + f"     {protein_sq_2}\n"
-            + "//\n"
-            + "ID   Swiss_Prot_3              Reviewed;         1165 AA.\n"
-            + "AC   Q6GZX4;\n"
-            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
-            + "DR   GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n"
-            + "DR   GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n"
-            + "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
-            + "SQ   SEQUENCE   1165 AA;  129118 MW;  FE2984658CED53A8 CRC64;\n"
-            + "     MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n"
-            + "     GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n"
-            + "     DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n"
-            + "     DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n"
-            + "     KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n"
-            + "     EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n"
-            + "     SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n"
-            + "     KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n"
-            + "     KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n"
-            + "     WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n"
-            + "     NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n"
-            + "     NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n"
-            + "     SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n"
-            + "     GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n"
-            + "     YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n"
-            + "     AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n"
-            + "     AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n"
-            + "     NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n"
-            + "     ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n"
-            + "     DVRVSGVMIY RASEGKPMQY VSLLM\n"
-            + "//\n"
-            + "ID   Swiss_Prot_4              Reviewed;         60 AA.\n"
-            + "AC   Q6GZX4;\n"
-            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
-            + "DR   GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n"
-            + "DR   GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n"
-            + "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
-            + "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            + "     XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
-            + "//\n"
-            + "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
-            + "AC   Q6GZX4;\n"
-            + "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
-            + "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            + "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
-            + "//\n"
-            + "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
-            + "AC   Q6GZX4;\n"
-            + "DR   GO; GO:0000005; P:regulation of viral transcription;\n"
-            + "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            + "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
-            + "//"
+            "AC   Q6GZX4;\n"
+            "DR   GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n"
+            "DR   GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n"
+            "DR   GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n"
+            "DR   GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n"
+            f"SQ   SEQUENCE   {len(protein_sq_1)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            f"     {protein_sq_1}\n"
+            "//\n"
+            f"ID   Swiss_Prot_2              Reviewed;         {len(protein_sq_2)} AA.\n"
+            "AC   DCGZX4;\n"
+            "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            "DR   GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n"
+            "DR   GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n"
+            "DR   GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n"
+            f"SQ   SEQUENCE   {len(protein_sq_2)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            f"     {protein_sq_2}\n"
+            "//\n"
+            "ID   Swiss_Prot_3              Reviewed;         1165 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            "DR   GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n"
+            "DR   GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n"
+            "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
+            "SQ   SEQUENCE   1165 AA;  129118 MW;  FE2984658CED53A8 CRC64;\n"
+            "     MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n"
+            "     GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n"
+            "     DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n"
+            "     DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n"
+            "     KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n"
+            "     EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n"
+            "     SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n"
+            "     KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n"
+            "     KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n"
+            "     WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n"
+            "     NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n"
+            "     NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n"
+            "     SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n"
+            "     GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n"
+            "     YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n"
+            "     AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n"
+            "     AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n"
+            "     NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n"
+            "     ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n"
+            "     DVRVSGVMIY RASEGKPMQY VSLLM\n"
+            "//\n"
+            "ID   Swiss_Prot_4              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            "DR   GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n"
+            "DR   GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n"
+            "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            "//\n"
+            "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            "//\n"
+            "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   GO; GO:0000005; P:regulation of viral transcription;\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            "//"
         )
 
         return raw_str
diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py
index 912d172c..96b31a91 100644
--- a/tests/unit/mock_data/tox_mock_data.py
+++ b/tests/unit/mock_data/tox_mock_data.py
@@ -1,4 +1,4 @@
-class Tox21MockData:
+class Tox21MolNetMockData:
     """
     A utility class providing mock data for testing the Tox21MolNet dataset.
 
@@ -15,15 +15,15 @@ def get_raw_data() -> str:
         """
         return (
             "NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,"
-            + "mol_id,smiles\n"
-            + "0,0,1,0,1,1,0,1,0,,1,0,TOX958,Nc1ccc([N+](=O)[O-])cc1N\n"
-            + ",,,,,,,,,1,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S\n"
-            + "0,0,0,0,0,0,0,,0,0,0,0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C\n"
-            + "0,0,0,0,0,0,0,0,0,0,0,0,TOX6619,O=S(=O)(Cl)c1ccccc1\n"
-            + "0,0,0,,0,0,,,0,,1,,TOX27679,CCCCCc1ccco1\n"
-            + "0,,1,,,,0,,1,1,1,1,TOX2801,Oc1c(Cl)cc(Cl)c2cccnc12\n"
-            + "0,0,0,0,,0,,,0,0,,1,TOX2808,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21\n"
-            + "0,,0,1,,,,1,0,,1,,TOX29085,CCCCCCCCCCCCCCn1cc[n+](C)c1\n"
+            "mol_id,smiles\n"
+            "0,0,1,0,1,1,0,1,0,,1,0,TOX958,Nc1ccc([N+](=O)[O-])cc1N\n"
+            ",,,,,,,,,1,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S\n"
+            "0,0,0,0,0,0,0,,0,0,0,0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C\n"
+            "0,0,0,0,0,0,0,0,0,0,0,0,TOX6619,O=S(=O)(Cl)c1ccccc1\n"
+            "0,0,0,,0,0,,,0,,1,,TOX27679,CCCCCc1ccco1\n"
+            "0,,1,,,,0,,1,1,1,1,TOX2801,Oc1c(Cl)cc(Cl)c2cccnc12\n"
+            "0,0,0,0,,0,,,0,0,,1,TOX2808,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21\n"
+            "0,,0,1,,,,1,0,,1,,TOX29085,CCCCCCCCCCCCCCn1cc[n+](C)c1\n"
         )
 
     @staticmethod
@@ -185,7 +185,7 @@ def get_processed_grouped_data():
         Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset.
         Each dictionary contains 'ident', 'features', and 'labels'.
         """
-        processed_data = Tox21MockData.get_processed_data()
+        processed_data = Tox21MolNetMockData.get_processed_data()
         groups = ["A", "A", "B", "B", "C", "C", "C", "C"]
 
         assert len(processed_data) == len(
@@ -199,3 +199,195 @@ def get_processed_grouped_data():
         ]
 
         return grouped_data
+
+
+class Tox21ChallengeMockData:
+
+    MOL_BINARY_STR = (
+        b"cyclobutane\n"
+        b"     RDKit          2D\n\n"
+        b"  4  4  0  0  0  0  0  0  0  0999 V2000\n"
+        b"    1.0607   -0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n"
+        b"   -0.0000   -1.0607    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n"
+        b"   -1.0607    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n"
+        b"    0.0000    1.0607    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n"
+        b"  1  2  1  0\n"
+        b"  2  3  1  0\n"
+        b"  3  4  1  0\n"
+        b"  4  1  1  0\n"
+        b"M  END\n\n"
+    )
+
+    SMILES_OF_MOL = "C1CCC1"
+    # Feature encoding of SMILES as per chebai/preprocessing/bin/smiles_token/tokens.txt
+    FEATURE_OF_SMILES = [19, 42, 19, 19, 19, 42]
+
+    @staticmethod
+    def get_raw_train_data():
+        raw_str = (
+            Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
+            b"25848\n\n"
+            b">  <SR-HSE>\n"
+            b"0\n\n"
+            b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
+            b"2384\n\n"
+            b">  <NR-Aromatase>\n"
+            b"1\n\n"
+            b">  <NR-AR>\n"
+            b"0\n\n"
+            b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
+            b"27102\n\n"
+            b">  <NR-AR>\n"
+            b"0\n\n"
+            b">  <NR-AhR>\n"
+            b"0\n\n"
+            b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
+            b"26792\n\n"
+            b">  <NR-AR>\n"
+            b"1\n\n"
+            b">  <NR-AR-LBD>\n"
+            b"1\n\n"
+            b">  <NR-AhR>\n"
+            b"1\n\n"
+            b">  <NR-Aromatase>\n"
+            b"1\n\n"
+            b">  <NR-ER>\n"
+            b"1\n\n"
+            b">  <NR-ER-LBD>\n"
+            b"1\n\n"
+            b">  <NR-PPAR-gamma>\n"
+            b"1\n\n"
+            b">  <SR-ARE>\n"
+            b"1\n\n"
+            b">  <SR-ATAD5>\n"
+            b"1\n\n"
+            b">  <SR-HSE>\n"
+            b"1\n\n"
+            b">  <SR-MMP>\n"
+            b"1\n\n"
+            b">  <SR-p53>\n"
+            b"1\n\n"
+            b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
+            b"26401\n\n"
+            b">  <SR-ARE>\n"
+            b"1\n\n"
+            b">  <SR-HSE>\n"
+            b"1\n\n"
+            b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
+            b"25973\n\n"
+            b"$$$$\n"
+        )
+        return raw_str
+
+    @staticmethod
+    def data_in_dict_format():
+        data_list = [
+            {
+                "labels": [
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    0,
+                    None,
+                    None,
+                ],
+                "ident": "25848",
+            },
+            {
+                "labels": [
+                    0,
+                    None,
+                    None,
+                    1,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                ],
+                "ident": "2384",
+            },
+            {
+                "labels": [
+                    0,
+                    None,
+                    0,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                ],
+                "ident": "27102",
+            },
+            {
+                "labels": [
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                ],
+                "ident": "26792",
+            },
+            {
+                "labels": [
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    1,
+                    None,
+                    1,
+                    None,
+                    None,
+                ],
+                "ident": "26401",
+            },
+            {
+                "labels": [
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                ],
+                "ident": "25973",
+            },
+        ]
+
+        for dict_ in data_list:
+            dict_["features"] = Tox21ChallengeMockData.FEATURE_OF_SMILES
+            dict_["group"] = None
+
+        return data_list

From a3670b0ca2a73ebb417bb4d45dea8e87d61937ac Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 17 Sep 2024 12:23:24 +0200
Subject: [PATCH 037/112] test for Tox21Challenge

---
 .../dataset_classes/testTox21Challenge.py     |  95 +++++++++++++-
 tests/unit/mock_data/tox_mock_data.py         | 122 +++++++++++++++++-
 2 files changed, 206 insertions(+), 11 deletions(-)

diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py
index 4b23c487..9986c82f 100644
--- a/tests/unit/dataset_classes/testTox21Challenge.py
+++ b/tests/unit/dataset_classes/testTox21Challenge.py
@@ -1,28 +1,37 @@
-import os
 import unittest
-from unittest.mock import MagicMock, mock_open, patch
+from unittest.mock import mock_open, patch
 
 from rdkit import Chem
 
 from chebai.preprocessing.datasets.tox21 import Tox21Challenge
 from chebai.preprocessing.reader import ChemDataReader
-from tests.unit.mock_data.tox_mock_data import Tox21ChallengeMockData
+from tests.unit.mock_data.tox_mock_data import (
+    Tox21ChallengeMockData,
+    Tox21MolNetMockData,
+)
 
 
 class TestTox21Challenge(unittest.TestCase):
+    """
+    Unit tests for the Tox21Challenge class.
+    """
 
     @classmethod
-    def setUpClass(cls):
+    def setUpClass(cls) -> None:
         """
         Set up the Tox21Challenge instance and mock data for testing.
+        This is run once for the test class.
         """
         Tox21Challenge.READER = ChemDataReader
         cls.tox21 = Tox21Challenge()
 
     @patch("rdkit.Chem.SDMolSupplier")
-    def test_load_data_from_file(self, mock_sdmol_supplier) -> None:
+    def test_load_data_from_file(self, mock_sdmol_supplier: patch) -> None:
         """
-        Test the _load_data_from_file method to ensure it correctly loads data from an SDF file.
+        Test the `_load_data_from_file` method to ensure it correctly loads data from an SDF file.
+
+        Args:
+            mock_sdmol_supplier (patch): A mock of the RDKit SDMolSupplier.
         """
         # Use ForwardSDMolSupplier to read the mock data from the binary string
         mock_file = mock_open(read_data=Tox21ChallengeMockData.get_raw_train_data())
@@ -36,7 +45,79 @@ def test_load_data_from_file(self, mock_sdmol_supplier) -> None:
         mock_sdmol_supplier.return_value = suppl
 
         actual_data = self.tox21._load_data_from_file("fake/path")
-        self.assertEqual(Tox21ChallengeMockData.data_in_dict_format(), actual_data)
+        expected_data = Tox21ChallengeMockData.data_in_dict_format()
+
+        self.assertEqual(
+            actual_data,
+            expected_data,
+            "The loaded data from file does not match the expected data.",
+        )
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=Tox21MolNetMockData.get_raw_data(),
+    )
+    def test_load_dict(self, mock_open_file: mock_open) -> None:
+        """
+        Test the `_load_dict` method to ensure correct CSV parsing.
+
+        Args:
+            mock_open_file (mock_open): Mocked open function to simulate file reading.
+        """
+        expected_data = Tox21MolNetMockData.get_processed_data()
+        actual_data = self.tox21._load_dict("fake/file/path.csv")
+
+        self.assertEqual(
+            list(actual_data),
+            expected_data,
+            "The loaded data from CSV does not match the expected processed data.",
+        )
+
+    @patch.object(Tox21Challenge, "_load_data_from_file", return_value="test")
+    @patch("builtins.open", new_callable=mock_open)
+    @patch("torch.save")
+    @patch("os.path.join")
+    def test_setup_processed(
+        self,
+        mock_join: patch,
+        mock_torch_save: patch,
+        mock_open_file: mock_open,
+        mock_load_file: patch,
+    ) -> None:
+        """
+        Test the `setup_processed` method to ensure it processes and saves data correctly.
+
+        Args:
+            mock_join (patch): Mock of os.path.join to simulate file path joining.
+            mock_torch_save (patch): Mock of torch.save to simulate saving processed data.
+            mock_open_file (mock_open): Mocked open function to simulate file reading.
+            mock_load_file (patch): Mocked data loading method.
+        """
+        # Simulated raw and processed directories
+        path_str = "fake/test/path"
+        mock_join.return_value = path_str
+
+        # Mock the file content for test.smiles and score.txt
+        mock_open_file.side_effect = [
+            mock_open(
+                read_data=Tox21ChallengeMockData.get_raw_smiles_data()
+            ).return_value,
+            mock_open(
+                read_data=Tox21ChallengeMockData.get_raw_score_txt_data()
+            ).return_value,
+        ]
+
+        # Call setup_processed to simulate the data processing workflow
+        self.tox21.setup_processed()
+
+        # Assert that torch.save was called with the correct processed data
+        expected_test_data = Tox21ChallengeMockData.get_setup_processed_output_data()
+        mock_torch_save.assert_called_with(expected_test_data, path_str)
+
+        self.assertTrue(
+            mock_torch_save.called, "The processed data was not saved as expected."
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py
index 96b31a91..32745c38 100644
--- a/tests/unit/mock_data/tox_mock_data.py
+++ b/tests/unit/mock_data/tox_mock_data.py
@@ -1,3 +1,6 @@
+from typing import Dict, List
+
+
 class Tox21MolNetMockData:
     """
     A utility class providing mock data for testing the Tox21MolNet dataset.
@@ -27,7 +30,7 @@ def get_raw_data() -> str:
         )
 
     @staticmethod
-    def get_processed_data() -> list:
+    def get_processed_data() -> List[Dict]:
         """
         Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset.
         Each dictionary contains 'ident', 'features', and 'labels'.
@@ -180,7 +183,7 @@ def get_processed_data() -> list:
         ]
 
     @staticmethod
-    def get_processed_grouped_data():
+    def get_processed_grouped_data() -> List[Dict]:
         """
         Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset.
         Each dictionary contains 'ident', 'features', and 'labels'.
@@ -223,7 +226,7 @@ class Tox21ChallengeMockData:
     FEATURE_OF_SMILES = [19, 42, 19, 19, 19, 42]
 
     @staticmethod
-    def get_raw_train_data():
+    def get_raw_train_data() -> bytes:
         raw_str = (
             Tox21ChallengeMockData.MOL_BINARY_STR + b">  <DSSTox_CID>\n"
             b"25848\n\n"
@@ -280,7 +283,7 @@ def get_raw_train_data():
         return raw_str
 
     @staticmethod
-    def data_in_dict_format():
+    def data_in_dict_format() -> List[Dict]:
         data_list = [
             {
                 "labels": [
@@ -391,3 +394,114 @@ def data_in_dict_format():
             dict_["group"] = None
 
         return data_list
+
+    @staticmethod
+    def get_raw_smiles_data() -> str:
+        """
+        Returns mock SMILES data in a tab-delimited format (mocks test.smiles file).
+
+        The data represents molecules and their associated sample IDs.
+
+        Returns:
+            str: A string containing SMILES representations and corresponding sample IDs.
+        """
+        return (
+            "#SMILES\tSample ID\n"
+            f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00260869-01\n"
+            f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261776-01\n"
+            f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261380-01\n"
+            f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261842-01\n"
+            f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261662-01\n"
+            f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261190-01\n"
+        )
+
+    @staticmethod
+    def get_raw_score_txt_data() -> str:
+        """
+        Returns mock score data in a tab-delimited format (mocks test_results.txt file).
+
+        The data represents toxicity test results for different molecular samples, including several toxicity endpoints.
+
+        Returns:
+            str: A string containing toxicity scores for each molecular sample and corresponding toxicity endpoints.
+        """
+        return (
+            "Sample ID\tNR-AhR\tNR-AR\tNR-AR-LBD\tNR-Aromatase\tNR-ER\tNR-ER-LBD\tNR-PPAR-gamma\t"
+            "SR-ARE\tSR-ATAD5\tSR-HSE\tSR-MMP\tSR-p53\n"
+            "NCGC00260869-01\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\n"
+            "NCGC00261776-01\t1\t1\t1\t1\t1\t1\t1\t1\t1\t1\t1\t1\n"
+            "NCGC00261380-01\tx\tx\tx\tx\tx\tx\tx\tx\tx\tx\tx\tx\n"
+            "NCGC00261842-01\t0\t0\t0\tx\t0\t0\t0\t0\t0\t0\tx\t1\n"
+            "NCGC00261662-01\t1\t0\t0\tx\t1\t1\t1\tx\t1\t1\tx\t1\n"
+            "NCGC00261190-01\tx\t0\t0\tx\t1\t0\t0\t1\t0\t0\t1\t1\n"
+        )
+
+    @staticmethod
+    def get_setup_processed_output_data() -> List[Dict]:
+        """
+        Returns mock processed data used for testing the `setup_processed` method.
+
+        The data contains molecule identifiers and their corresponding toxicity labels for multiple endpoints.
+        Each dictionary in the list represents a molecule with its associated labels, features, and group information.
+
+        Returns:
+            List[Dict]: A list of dictionaries where each dictionary contains:
+                        - "features": The SMILES features of the molecule.
+                        - "labels": A list of toxicity endpoint labels (0, 1, or None).
+                        - "ident": The sample identifier.
+                        - "group": None (default value for the group key).
+        """
+
+        # "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5",
+        # "SR-HSE", "SR-MMP", "SR-p53",
+        data_list = [
+            {
+                "labels": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                "ident": "NCGC00260869-01",
+            },
+            {
+                "labels": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                "ident": "NCGC00261776-01",
+            },
+            {
+                "labels": [
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                ],
+                "ident": "NCGC00261380-01",
+            },
+            {
+                "labels": [0, 0, 0, None, 0, 0, 0, 0, 0, 0, None, 1],
+                "ident": "NCGC00261842-01",
+            },
+            {
+                "labels": [0, 0, 1, None, 1, 1, 1, None, 1, 1, None, 1],
+                "ident": "NCGC00261662-01",
+            },
+            {
+                "labels": [0, 0, None, None, 1, 0, 0, 1, 0, 0, 1, 1],
+                "ident": "NCGC00261190-01",
+            },
+        ]
+
+        complete_list = []
+        for dict_ in data_list:
+            complete_list.append(
+                {
+                    "features": Tox21ChallengeMockData.FEATURE_OF_SMILES,
+                    **dict_,
+                    "group": None,
+                }
+            )
+
+        return complete_list

From ac3ac19deed760fb422a60f8f8b2e84bc45540cb Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 17 Sep 2024 13:12:35 +0200
Subject: [PATCH 038/112] patch `os.makedirs` in tests to avoid creating
 directories

---
 tests/unit/dataset_classes/testChEBIOverX.py  |  4 +-
 .../dataset_classes/testChebiDataExtractor.py |  6 +-
 .../dataset_classes/testChebiOverXPartial.py  |  3 +-
 .../dataset_classes/testDynamicDataset.py     |  6 +-
 .../testGOUniProDataExtractor.py              |  6 +-
 .../dataset_classes/testGoUniProtOverX.py     |  3 +-
 .../dataset_classes/testTox21Challenge.py     |  3 +-
 tests/unit/dataset_classes/testTox21MolNet.py | 55 +------------------
 .../dataset_classes/testXYBaseDataModule.py   |  3 +-
 9 files changed, 29 insertions(+), 60 deletions(-)

diff --git a/tests/unit/dataset_classes/testChEBIOverX.py b/tests/unit/dataset_classes/testChEBIOverX.py
index 78d85dd4..270b868c 100644
--- a/tests/unit/dataset_classes/testChEBIOverX.py
+++ b/tests/unit/dataset_classes/testChEBIOverX.py
@@ -9,11 +9,13 @@ class TestChEBIOverX(unittest.TestCase):
     @classmethod
     @patch.multiple(ChEBIOverX, __abstractmethods__=frozenset())
     @patch.object(ChEBIOverX, "processed_dir_main", new_callable=PropertyMock)
-    def setUpClass(cls, mock_processed_dir_main: PropertyMock) -> None:
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(cls, mock_makedirs, mock_processed_dir_main: PropertyMock) -> None:
         """
         Set up the ChEBIOverX instance with a mock processed directory path and a test graph.
 
         Args:
+            mock_makedirs: This patches os.makedirs to do nothing
             mock_processed_dir_main (PropertyMock): Mocked property for the processed directory path.
         """
         mock_processed_dir_main.return_value = "/mock/processed_dir"
diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py
index 0559e090..8da900da 100644
--- a/tests/unit/dataset_classes/testChebiDataExtractor.py
+++ b/tests/unit/dataset_classes/testChebiDataExtractor.py
@@ -14,8 +14,12 @@ class TestChEBIDataExtractor(unittest.TestCase):
     @patch.multiple(_ChEBIDataExtractor, __abstractmethods__=frozenset())
     @patch.object(_ChEBIDataExtractor, "base_dir", new_callable=PropertyMock)
     @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock)
+    @patch("os.makedirs", return_value=None)
     def setUpClass(
-        cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock
+        cls,
+        mock_makedirs,
+        mock_name_property: PropertyMock,
+        mock_base_dir_property: PropertyMock,
     ) -> None:
         """
         Set up a base instance of _ChEBIDataExtractor for testing with mocked properties.
diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py
index a8c53408..7720d301 100644
--- a/tests/unit/dataset_classes/testChebiOverXPartial.py
+++ b/tests/unit/dataset_classes/testChebiOverXPartial.py
@@ -11,7 +11,8 @@ class TestChEBIOverX(unittest.TestCase):
 
     @classmethod
     @patch.multiple(ChEBIOverXPartial, __abstractmethods__=frozenset())
-    def setUpClass(cls) -> None:
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(cls, mock_makedirs) -> None:
         """
         Set up the ChEBIOverXPartial instance with a mock processed directory path and a test graph.
         """
diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py
index 1ff6c26d..e42c3e7e 100644
--- a/tests/unit/dataset_classes/testDynamicDataset.py
+++ b/tests/unit/dataset_classes/testDynamicDataset.py
@@ -17,8 +17,12 @@ class TestDynamicDataset(unittest.TestCase):
     @patch.multiple(_DynamicDataset, __abstractmethods__=frozenset())
     @patch.object(_DynamicDataset, "base_dir", new_callable=PropertyMock)
     @patch.object(_DynamicDataset, "_name", new_callable=PropertyMock)
+    @patch("os.makedirs", return_value=None)
     def setUpClass(
-        cls, mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock
+        cls,
+        mock_makedirs,
+        mock_base_dir_property: PropertyMock,
+        mock_name_property: PropertyMock,
     ) -> None:
         """
         Set up a base instance of _DynamicDataset for testing with mocked properties.
diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
index 1b60aa97..976334f0 100644
--- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py
+++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
@@ -18,8 +18,12 @@ class TestGOUniProtDataExtractor(unittest.TestCase):
     @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset())
     @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock)
     @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock)
+    @patch("os.makedirs", return_value=None)
     def setUpClass(
-        cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock
+        cls,
+        mock_makedirs,
+        mock_name_property: PropertyMock,
+        mock_base_dir_property: PropertyMock,
     ) -> None:
         """
         Class setup for mocking abstract properties of _GOUniProtDataExtractor.
diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py
index 282091b5..d4157770 100644
--- a/tests/unit/dataset_classes/testGoUniProtOverX.py
+++ b/tests/unit/dataset_classes/testGoUniProtOverX.py
@@ -12,7 +12,8 @@
 class TestGOUniProtOverX(unittest.TestCase):
     @classmethod
     @patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset())
-    def setUpClass(cls) -> None:
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(cls, mock_makedirs) -> None:
         """
         Set up the class for tests by initializing the extractor, graph, and input DataFrame.
         """
diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py
index 9986c82f..b94c8ca4 100644
--- a/tests/unit/dataset_classes/testTox21Challenge.py
+++ b/tests/unit/dataset_classes/testTox21Challenge.py
@@ -17,7 +17,8 @@ class TestTox21Challenge(unittest.TestCase):
     """
 
     @classmethod
-    def setUpClass(cls) -> None:
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(cls, mock_makedirs) -> None:
         """
         Set up the Tox21Challenge instance and mock data for testing.
         This is run once for the test class.
diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
index 0a2d67b1..c995e701 100644
--- a/tests/unit/dataset_classes/testTox21MolNet.py
+++ b/tests/unit/dataset_classes/testTox21MolNet.py
@@ -13,14 +13,13 @@
 class TestTox21MolNet(unittest.TestCase):
 
     @classmethod
-    def setUpClass(cls) -> None:
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(cls, mock_makedirs) -> None:
         """Initialize a Tox21MolNet instance for testing."""
         ReaderMock = MagicMock()
         ReaderMock.name.return_value = "MockedReaderTox21MolNet"
         Tox21MolNet.READER = ReaderMock
         cls.data_module = Tox21MolNet()
-        # cls.data_module.raw_dir = "/mock/raw_dir"
-        # cls.data_module.processed_dir = "/mock/processed_dir"
 
     @patch(
         "builtins.open",
@@ -59,57 +58,9 @@ def test_setup_processed_simple_split(
             mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data.
             mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
         """
+        # Facing technical error here
         self.data_module.setup_processed()
 
-        # # Check that torch.save was called for train, test, and validation splits
-        # self.assertEqual(
-        #     mock_torch_save.call_count,
-        #     3,
-        #     "torch.save should have been called exactly three times for train, test, and validation splits."
-        # )
-
-    # @patch("os.path.isfile", return_value=False)
-    # @patch.object(Tox21MolNet,
-    #               "_load_data_from_file",
-    #               return_value= Tox21MolNetMockData.get_processed_grouped_data())
-    # @patch("torch.save")
-    # @patch("torch.load")
-    # @patch("chebai.preprocessing.datasets.tox21.GroupShuffleSplit")
-    # def test_setup_processed_group_split(
-    #         self,
-    #         mock_group_split: MagicMock,
-    #         mock_torch_load: MagicMock,
-    #         mock_save: MagicMock,
-    #         mock_load_data: MagicMock,
-    #         mock_isfile: MagicMock
-    # ) -> None:
-    #     """
-    #     Test the `setup_processed` method for group-based data splitting and saving.
-    #
-    #     Args:
-    #         mock_save (MagicMock): Mocked `torch.save` function to avoid file writes.
-    #         mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data.
-    #         mock_isfile (MagicMock): Mocked `os.path.isfile` function to simulate file presence.
-    #         mock_group_split (MagicMock): Mocked `GroupShuffleSplit` to control data splitting behavior.
-    #     """
-    #     mock_group_split.return_value = GroupShuffleSplit(n_splits=1, train_size=0.7)
-    #     self.data_module.setup_processed()
-    #
-    #     # Load the test split
-    #     test_split_path = os.path.join(self.data_module.processed_dir, "test.pt")
-    #     test_split = torch.load(test_split_path)
-    #
-    #     # Check if torch.save was called with correct arguments
-    #     mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/test.pt")
-    #     mock_save.assert_any_call([mock_data[0]], "/mock/processed_dir/train.pt")
-    #     mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/validation.pt")
-    #     # Check that torch.save was called for train, test, and validation splits
-    #     self.assertEqual(
-    #         mock_torch_save.call_count,
-    #         3,
-    #         "torch.save should have been called exactly three times for train, test, and validation splits."
-    #     )
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py
index 8e3575ab..64dfbe40 100644
--- a/tests/unit/dataset_classes/testXYBaseDataModule.py
+++ b/tests/unit/dataset_classes/testXYBaseDataModule.py
@@ -11,7 +11,8 @@ class TestXYBaseDataModule(unittest.TestCase):
 
     @classmethod
     @patch.object(XYBaseDataModule, "_name", new_callable=PropertyMock)
-    def setUpClass(cls, mock_name_property: PropertyMock) -> None:
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(cls, mock_makedirs, mock_name_property: PropertyMock) -> None:
         """
         Set up a base instance of XYBaseDataModule for testing.
         """

From 44a1dfda8f92627b3bab97f62ab9101452a2754e Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 22 Sep 2024 12:39:42 +0200
Subject: [PATCH 039/112] add test case for invalid token/input to read_data

---
 tests/unit/readers/testChemDataReader.py     | 10 ++++++++++
 tests/unit/readers/testDeepChemDataReader.py | 10 ++++++++++
 2 files changed, 20 insertions(+)

diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py
index fde8604f..0c1c4d6f 100644
--- a/tests/unit/readers/testChemDataReader.py
+++ b/tests/unit/readers/testChemDataReader.py
@@ -92,6 +92,16 @@ def test_read_data_with_new_token(self) -> None:
             "The new token '[H-]' was not added at the correct index in the cache.",
         )
 
+    def test_read_data_with_invalid_input(self) -> None:
+        """
+        Test the _read_data method with an invalid input.
+        The invalid input is expected to raise a ValueError.
+        """
+        raw_data = "%INVALID%"
+
+        with self.assertRaises(ValueError):
+            self.reader._read_data(raw_data)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py
index 31a63dd1..dc29c9a6 100644
--- a/tests/unit/readers/testDeepChemDataReader.py
+++ b/tests/unit/readers/testDeepChemDataReader.py
@@ -100,6 +100,16 @@ def test_read_data_with_new_token(self) -> None:
             "The new token '[H-]' was not added to the correct index in the cache.",
         )
 
+    def test_read_data_with_invalid_input(self) -> None:
+        """
+        Test the _read_data method with an invalid input string.
+        The invalid token should raise an error or be handled appropriately.
+        """
+        raw_data = "CBr))(OCI"
+
+        with self.assertRaises(Exception):
+            self.reader._read_data(raw_data)
+
 
 if __name__ == "__main__":
     unittest.main()

From 03bf4cd4a1c3f2de4e93cedfb2d1b7096ac4454c Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 24 Sep 2024 12:58:34 +0200
Subject: [PATCH 040/112] refactor _extract_class_hierarchy

---
 chebai/preprocessing/datasets/chebi.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 1c0cb2f9..1f305d4b 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -758,27 +758,18 @@ def _extract_class_hierarchy(self, chebi_path: str) -> nx.DiGraph:
         """
         Extracts a subset of ChEBI based on subclasses of the top class ID.
 
+        This method calls the superclass method to extract the full class hierarchy,
+        then extracts the subgraph containing only the descendants of the top class ID, including itself.
+
         Args:
             chebi_path (str): The file path to the ChEBI ontology file.
 
         Returns:
-            nx.DiGraph: The extracted class hierarchy as a directed graph.
+            nx.DiGraph: The extracted class hierarchy as a directed graph, limited to the
+            descendants of the top class ID.
         """
-        with open(chebi_path, encoding="utf-8") as chebi:
-            chebi = "\n".join(l for l in chebi if not l.startswith("xref:"))
-        elements = [
-            term_callback(clause)
-            for clause in fastobo.loads(chebi)
-            if clause and ":" in str(clause.id)
-        ]
-        g = nx.DiGraph()
-        for n in elements:
-            g.add_node(n["id"], **n)
-        g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]])
-
-        g = nx.transitive_closure_dag(g)
-        g = g.subgraph(list(nx.descendants(g, self.top_class_id)) + [self.top_class_id])
-        print("Compute transitive closure")
+        g = super()._extract_class_hierarchy(chebi_path)
+        g = g.subgraph(list(g.successors(self.top_class_id)) + [self.top_class_id])
         return g
 
 

From 96d2097a9853e8e00a6067bd19ba3d35c49af317 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 24 Sep 2024 13:50:27 +0200
Subject: [PATCH 041/112] add top_class_id to kwargs

- add top_class_id to kwargs, which is needed in the _ChEBIDataExtractor class to create another chebi class object for the train version

self._chebi_version_train_obj = self.__class__(single_class=self.single_class, **_init_kwargs,)
---
 chebai/preprocessing/datasets/chebi.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 1f305d4b..7316a39b 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -736,6 +736,9 @@ def __init__(self, top_class_id: int, **kwargs):
             top_class_id (int): The ID of the top class from which to extract subclasses.
             **kwargs: Additional keyword arguments passed to the superclass initializer.
         """
+        if "top_class_id" not in kwargs:
+            kwargs["top_class_id"] = top_class_id
+
         self.top_class_id: int = top_class_id
         super().__init__(**kwargs)
 

From 6b9024b088e244c13bf74f3be797fcb2154077d9 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Tue, 24 Sep 2024 18:10:06 +0200
Subject: [PATCH 042/112] minor changes to texts

---
 tutorials/data_exploration.ipynb | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tutorials/data_exploration.ipynb b/tutorials/data_exploration.ipynb
index 8cd834b1..fce3a9f7 100644
--- a/tutorials/data_exploration.ipynb
+++ b/tutorials/data_exploration.ipynb
@@ -19,14 +19,16 @@
    "source": [
     "# 1. Instantiation of a Data Class\n",
     "\n",
-    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n",
+    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n",
     "### Inheritance Hierarchy\n",
     "\n",
     "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n",
     "\n",
-    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
+    "- **`XYBaseDataModule`**: This is the base class for all data modules in `chebai`, providing foundational properties and methods for handling and processing datasets, including loading a stored dataset and creating a `DataLoader`.\n",
+    "\n",
+    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for some datasets (e.g. the ChEBI and Gene Ontology datasets). The defining feature is the dynamically created data split into training, validation and test sets. It inherits from `XYBaseDataModule`.\n",
+    "\n",
     "\n",
-    "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
     "\n",
     "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
     "\n",
@@ -34,7 +36,7 @@
     "### Explanation\n",
     "A ChEBI data class can be configured with the following main parameters:\n",
     "\n",
-    "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
+    "- **chebi_version (int)**: Specifies the version of the ChEBI dataset to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
     "\n",
     "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n",
     "\n",
@@ -50,7 +52,7 @@
     "\n",
     "### Additional Input Parameters\n",
     "\n",
-    "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n",
+    "The `XYBaseDa ChEBI data class, which `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n",
     "\n",
     "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n",
     "\n",
@@ -225,11 +227,11 @@
     "### Data Files\n",
     "\n",
     "1. **`Raw Data Files`**: (e.g., `.obo` file)\n",
-    "   - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
+    "   - **Description**: Contains the raw ChEBI ontology data in OBO format, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
     "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
     "\n",
     "2. **`data.pkl`**\n",
-    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
+    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a Pandas dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
     "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
     "\n",
     "3. **`data.pt`**\n",
@@ -261,7 +263,7 @@
     "\n",
     "3. **Processed Data Stage 2**:\n",
     "   - **File**: `data.pt`\n",
-    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
+    "   - **Description**: This final stage includes the tokenized data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
     "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
     "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
     "\n",

From b62c93119e34098f02919f9c4bf840def518b4b8 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 24 Sep 2024 21:48:36 +0200
Subject: [PATCH 043/112] update term_callback to handle obsolete terms

---
 chebai/preprocessing/datasets/chebi.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 1c0cb2f9..616a3408 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -13,7 +13,7 @@
 import pickle
 from abc import ABC
 from collections import OrderedDict
-from typing import Any, Dict, Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple, Union
 
 import fastobo
 import networkx as nx
@@ -244,11 +244,16 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
         with open(data_path, encoding="utf-8") as chebi:
             chebi = "\n".join(l for l in chebi if not l.startswith("xref:"))
 
-        elements = [
-            term_callback(clause)
-            for clause in fastobo.loads(chebi)
-            if clause and ":" in str(clause.id)
-        ]
+        elements = []
+        for term_doc in fastobo.loads(chebi):
+            if (
+                term_doc
+                and isinstance(term_doc.id, fastobo.id.PrefixedIdent)
+                and term_doc.id.prefix == "CHEBI"
+            ):
+                term_dict = term_callback(term_doc)
+                if term_dict:
+                    elements.append(term_dict)
 
         g = nx.DiGraph()
         for n in elements:
@@ -818,7 +823,7 @@ def chebi_to_int(s: str) -> int:
     return int(s[s.index(":") + 1 :])
 
 
-def term_callback(doc) -> dict:
+def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]:
     """
     Extracts information from a ChEBI term document.
     This function takes a ChEBI term document as input and extracts relevant information such as the term ID, parents,
@@ -858,6 +863,12 @@ def term_callback(doc) -> dict:
             parents.append(chebi_to_int(str(clause.term)))
         elif isinstance(clause, fastobo.term.NameClause):
             name = str(clause.name)
+
+        if isinstance(clause, fastobo.term.IsObsoleteClause):
+            if clause.obsolete:
+                # if the term document contains clause as obsolete as true, skips this document.
+                return False
+
     return {
         "id": chebi_to_int(str(doc.id)),
         "parents": parents,

From aab0fea1df5801b047e0f1ba9e3d2bce9f928f91 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Wed, 25 Sep 2024 13:57:54 +0200
Subject: [PATCH 044/112] test case for `Tox21MolNet.setup_processed` simple
 split

---
 tests/unit/dataset_classes/testTox21MolNet.py | 43 +++++++++++++++----
 tests/unit/mock_data/tox_mock_data.py         |  5 ++-
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
index c995e701..042a6ae4 100644
--- a/tests/unit/dataset_classes/testTox21MolNet.py
+++ b/tests/unit/dataset_classes/testTox21MolNet.py
@@ -42,25 +42,52 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
             "The loaded data does not match the expected output.",
         )
 
-    @patch.object(
-        Tox21MolNet,
-        "_load_data_from_file",
-        return_value=Tox21MolNetMockData.get_processed_data(),
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=Tox21MolNetMockData.get_raw_data(),
     )
     @patch("torch.save")
     def test_setup_processed_simple_split(
-        self, mock_load_data: MagicMock, mock_torch_save: MagicMock
+        self,
+        mock_torch_save,
+        mock_open_file: mock_open,
     ) -> None:
         """
         Test the `setup_processed` method for basic data splitting and saving.
 
         Args:
-            mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data.
-            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
+            mock_torch_save : Mocked `torch.save` function to avoid actual file writes.
+            mock_open_file (mock_open): Mocked `open` builtin-method to provide custom data.
         """
-        # Facing technical error here
         self.data_module.setup_processed()
 
+        # Verify if torch.save was called for each split
+        self.assertEqual(mock_torch_save.call_count, 3)
+        call_args_list = mock_torch_save.call_args_list
+        self.assertIn("test", call_args_list[0][0][1])
+        self.assertIn("train", call_args_list[1][0][1])
+        self.assertIn("validation", call_args_list[2][0][1])
+
+        # Check for non-overlap between train, test, and validation
+        test_split = [d["ident"] for d in call_args_list[0][0][0]]
+        train_split = [d["ident"] for d in call_args_list[1][0][0]]
+        validation_split = [d["ident"] for d in call_args_list[2][0][0]]
+
+        # Assert no overlap between splits
+        self.assertTrue(
+            set(train_split).isdisjoint(test_split),
+            "There is an overlap between the train and test splits.",
+        )
+        self.assertTrue(
+            set(train_split).isdisjoint(validation_split),
+            "There is an overlap between the train and validation splits.",
+        )
+        self.assertTrue(
+            set(test_split).isdisjoint(validation_split),
+            "There is an overlap between the test and validation splits.",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py
index 32745c38..b5f85bda 100644
--- a/tests/unit/mock_data/tox_mock_data.py
+++ b/tests/unit/mock_data/tox_mock_data.py
@@ -35,7 +35,7 @@ def get_processed_data() -> List[Dict]:
         Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset.
         Each dictionary contains 'ident', 'features', and 'labels'.
         """
-        return [
+        data_list = [
             {
                 "ident": "TOX958",
                 "features": "Nc1ccc([N+](=O)[O-])cc1N",
@@ -182,6 +182,9 @@ def get_processed_data() -> List[Dict]:
             },
         ]
 
+        data_with_group = [{**data, "group": None} for data in data_list]
+        return data_with_group
+
     @staticmethod
     def get_processed_grouped_data() -> List[Dict]:
         """

From fc8182e0cc80187fcdf6ce8d9b0e783030378c5e Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Wed, 25 Sep 2024 19:11:35 +0200
Subject: [PATCH 045/112] test case for `Tox21MolNet.setup_processed` group
 split

---
 tests/unit/dataset_classes/testTox21MolNet.py | 117 ++++++++++++++----
 1 file changed, 93 insertions(+), 24 deletions(-)

diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
index 042a6ae4..5d5f3497 100644
--- a/tests/unit/dataset_classes/testTox21MolNet.py
+++ b/tests/unit/dataset_classes/testTox21MolNet.py
@@ -1,21 +1,21 @@
-import os
 import unittest
-from typing import Dict, List
+from typing import List
 from unittest.mock import MagicMock, mock_open, patch
 
-import torch
-from sklearn.model_selection import GroupShuffleSplit
-
 from chebai.preprocessing.datasets.tox21 import Tox21MolNet
 from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData
 
 
 class TestTox21MolNet(unittest.TestCase):
-
     @classmethod
     @patch("os.makedirs", return_value=None)
-    def setUpClass(cls, mock_makedirs) -> None:
-        """Initialize a Tox21MolNet instance for testing."""
+    def setUpClass(cls, mock_makedirs: MagicMock) -> None:
+        """
+        Initialize a Tox21MolNet instance for testing.
+
+        Args:
+            mock_makedirs (MagicMock): Mocked `os.makedirs` function.
+        """
         ReaderMock = MagicMock()
         ReaderMock.name.return_value = "MockedReaderTox21MolNet"
         Tox21MolNet.READER = ReaderMock
@@ -39,7 +39,7 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
         self.assertEqual(
             list(actual_data),
             expected_data,
-            "The loaded data does not match the expected output.",
+            "The loaded data does not match the expected output from the file.",
         )
 
     @patch(
@@ -50,42 +50,111 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
     @patch("torch.save")
     def test_setup_processed_simple_split(
         self,
-        mock_torch_save,
+        mock_torch_save: MagicMock,
         mock_open_file: mock_open,
     ) -> None:
         """
         Test the `setup_processed` method for basic data splitting and saving.
 
         Args:
-            mock_torch_save : Mocked `torch.save` function to avoid actual file writes.
-            mock_open_file (mock_open): Mocked `open` builtin-method to provide custom data.
+            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
+            mock_open_file (mock_open): Mocked `open` function to simulate file reading.
+        """
+        self.data_module.setup_processed()
+
+        # Verify if torch.save was called for each split (train, test, validation)
+        self.assertEqual(
+            mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times."
+        )
+        call_args_list = mock_torch_save.call_args_list
+        self.assertIn("test", call_args_list[0][0][1], "Missing 'test' split.")
+        self.assertIn("train", call_args_list[1][0][1], "Missing 'train' split.")
+        self.assertIn(
+            "validation", call_args_list[2][0][1], "Missing 'validation' split."
+        )
+
+        # Check for non-overlap between train, test, and validation splits
+        test_split: List[str] = [d["ident"] for d in call_args_list[0][0][0]]
+        train_split: List[str] = [d["ident"] for d in call_args_list[1][0][0]]
+        validation_split: List[str] = [d["ident"] for d in call_args_list[2][0][0]]
+
+        self.assertTrue(
+            set(train_split).isdisjoint(test_split),
+            "Overlap detected between the train and test splits.",
+        )
+        self.assertTrue(
+            set(train_split).isdisjoint(validation_split),
+            "Overlap detected between the train and validation splits.",
+        )
+        self.assertTrue(
+            set(test_split).isdisjoint(validation_split),
+            "Overlap detected between the test and validation splits.",
+        )
+
+    @patch.object(
+        Tox21MolNet,
+        "_load_data_from_file",
+        return_value=Tox21MolNetMockData.get_processed_grouped_data(),
+    )
+    @patch("torch.save")
+    def test_setup_processed_with_group_split(
+        self, mock_torch_save: MagicMock, mock_load_file: MagicMock
+    ) -> None:
+        """
+        Test the `setup_processed` method for group-based splitting and saving.
+
+        Args:
+            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
+            mock_load_file (MagicMock): Mocked `_load_data_from_file` to provide custom data.
         """
+        self.data_module.train_split = 0.5
         self.data_module.setup_processed()
 
         # Verify if torch.save was called for each split
-        self.assertEqual(mock_torch_save.call_count, 3)
+        self.assertEqual(
+            mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times."
+        )
         call_args_list = mock_torch_save.call_args_list
-        self.assertIn("test", call_args_list[0][0][1])
-        self.assertIn("train", call_args_list[1][0][1])
-        self.assertIn("validation", call_args_list[2][0][1])
+        self.assertIn("test", call_args_list[0][0][1], "Missing 'test' split.")
+        self.assertIn("train", call_args_list[1][0][1], "Missing 'train' split.")
+        self.assertIn(
+            "validation", call_args_list[2][0][1], "Missing 'validation' split."
+        )
 
-        # Check for non-overlap between train, test, and validation
-        test_split = [d["ident"] for d in call_args_list[0][0][0]]
-        train_split = [d["ident"] for d in call_args_list[1][0][0]]
-        validation_split = [d["ident"] for d in call_args_list[2][0][0]]
+        # Check for non-overlap between train, test, and validation splits (based on 'ident')
+        test_split: List[str] = [d["ident"] for d in call_args_list[0][0][0]]
+        train_split: List[str] = [d["ident"] for d in call_args_list[1][0][0]]
+        validation_split: List[str] = [d["ident"] for d in call_args_list[2][0][0]]
 
-        # Assert no overlap between splits
         self.assertTrue(
             set(train_split).isdisjoint(test_split),
-            "There is an overlap between the train and test splits.",
+            "Overlap detected between the train and test splits (based on 'ident').",
         )
         self.assertTrue(
             set(train_split).isdisjoint(validation_split),
-            "There is an overlap between the train and validation splits.",
+            "Overlap detected between the train and validation splits (based on 'ident').",
         )
         self.assertTrue(
             set(test_split).isdisjoint(validation_split),
-            "There is an overlap between the test and validation splits.",
+            "Overlap detected between the test and validation splits (based on 'ident').",
+        )
+
+        # Check for non-overlap between train, test, and validation splits (based on 'group')
+        test_split_grp: List[str] = [d["group"] for d in call_args_list[0][0][0]]
+        train_split_grp: List[str] = [d["group"] for d in call_args_list[1][0][0]]
+        validation_split_grp: List[str] = [d["group"] for d in call_args_list[2][0][0]]
+
+        self.assertTrue(
+            set(train_split_grp).isdisjoint(test_split_grp),
+            "Overlap detected between the train and test splits (based on 'group').",
+        )
+        self.assertTrue(
+            set(train_split_grp).isdisjoint(validation_split_grp),
+            "Overlap detected between the train and validation splits (based on 'group').",
+        )
+        self.assertTrue(
+            set(test_split_grp).isdisjoint(validation_split_grp),
+            "Overlap detected between the test and validation splits (based on 'group').",
         )
 
 

From 1d3ecbe327b63324c52347ccc806a25c51471d40 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 26 Sep 2024 00:17:14 +0200
Subject: [PATCH 046/112] update chebi test as per modified term_callback

---
 .../dataset_classes/testChebiTermCallback.py  | 10 +++++---
 tests/unit/mock_data/ontology_mock_data.py    | 25 ++++++-------------
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/tests/unit/dataset_classes/testChebiTermCallback.py b/tests/unit/dataset_classes/testChebiTermCallback.py
index 7b22d1a2..8680760e 100644
--- a/tests/unit/dataset_classes/testChebiTermCallback.py
+++ b/tests/unit/dataset_classes/testChebiTermCallback.py
@@ -51,11 +51,13 @@ def test_skip_obsolete_terms(self) -> None:
         """
         Test that `term_callback` correctly skips obsolete ChEBI terms.
         """
+        term_callback_output = []
+        for ident in ChebiMockOntology.get_obsolete_nodes_ids():
+            raw_term = self.callback_input_data.get(ident)
+            term_dict = term_callback(raw_term)
+            if term_dict:
+                term_callback_output.append(term_dict)
 
-        term_callback_output = [
-            term_callback(self.callback_input_data.get(ident))
-            for ident in ChebiMockOntology.get_obsolete_nodes_ids()
-        ]
         self.assertEqual(
             term_callback_output,
             [],
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 478a2bbb..40d9674e 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -356,24 +356,15 @@ def get_data_in_dataframe() -> pd.DataFrame:
                 "C1=CC=CC=C1Br",
                 "C1=CC=CC=C1[Mg+]",
             ],
-            # Relationships {
-            #  12345: [11111, 54321, 22222, 67890],
-            #  67890: [22222],
-            #  99999: [67890, 11111, 54321, 22222, 12345],
-            #  54321: [11111],
-            #  88888: [22222, 67890]
-            #  11111: []
-            #  22222: []
-            # }
             **{
-                # -row- [11111, 12345, 22222, 54321, 67890, 88888, 99999]
-                11111: [False, False, False, False, False, False, False],
-                12345: [True, True, True, True, True, False, False],
-                22222: [False, False, False, False, False, False, False],
-                54321: [True, False, False, True, False, False, False],
-                67890: [False, False, True, False, True, False, False],
-                88888: [False, False, True, False, True, True, False],
-                99999: [True, True, True, True, True, False, True],
+                # -row- [12345, 54321, 67890, 11111, 22222, 99999, 88888]
+                11111: [True, True, False, True, False, True, False],
+                12345: [True, False, False, False, False, True, False],
+                22222: [True, False, True, False, True, True, True],
+                54321: [True, True, False, False, False, True, False],
+                67890: [True, False, True, False, False, True, True],
+                88888: [False, False, False, False, False, False, True],
+                99999: [False, False, False, False, False, True, False],
             },
         )
 

From 630add7a78a24277cfddf2b97fe10a450ad9f069 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 26 Sep 2024 00:19:29 +0200
Subject: [PATCH 047/112] update input to add_edges_from

 - Modified input to add_edges_from to only take the edges which connect existing nodes, to avoid implicitly creating obsolete nodes
---
 chebai/preprocessing/datasets/chebi.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 616a3408..c5aac3ae 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -258,7 +258,16 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
         g = nx.DiGraph()
         for n in elements:
             g.add_node(n["id"], **n)
-        g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]])
+
+        # Only take the edges which connects the existing nodes, to avoid internal creation of obsolete nodes
+        g.add_edges_from(
+            [
+                (p, q["id"])
+                for q in elements
+                for p in q["parents"]
+                if g.has_node(p) and g.has_node(q["id"])
+            ]
+        )
 
         print("Compute transitive closure")
         return nx.transitive_closure_dag(g)

From 35a621cee6cfd3c732d6e851ba2bc320defa760d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 26 Sep 2024 00:30:49 +0200
Subject: [PATCH 048/112] group key not needed for Tox21Chal._load_dict

 - group key needed in Tox21MolNet but not needed for Tox21Chal._load_dict
---
 tests/unit/dataset_classes/testTox21Challenge.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py
index b94c8ca4..fedde8e5 100644
--- a/tests/unit/dataset_classes/testTox21Challenge.py
+++ b/tests/unit/dataset_classes/testTox21Challenge.py
@@ -67,6 +67,9 @@ def test_load_dict(self, mock_open_file: mock_open) -> None:
             mock_open_file (mock_open): Mocked open function to simulate file reading.
         """
         expected_data = Tox21MolNetMockData.get_processed_data()
+        for item in expected_data:
+            item.pop("group", None)
+
         actual_data = self.tox21._load_dict("fake/file/path.csv")
 
         self.assertEqual(

From 19b194aead6923f6b1f866447f9b5cbfd5cad1ec Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 30 Sep 2024 11:23:06 +0200
Subject: [PATCH 049/112] fix - if only one class surpasses the given selection
 threshold

- https://github.com/ChEB-AI/python-chebai/pull/54#issuecomment-2371843170
---
 chebai/preprocessing/datasets/base.py | 29 +++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
index 02877ad3..a2997699 100644
--- a/chebai/preprocessing/datasets/base.py
+++ b/chebai/preprocessing/datasets/base.py
@@ -14,6 +14,7 @@
 )
 from lightning.pytorch.core.datamodule import LightningDataModule
 from lightning_utilities.core.rank_zero import rank_zero_info
+from sklearn.model_selection import StratifiedShuffleSplit
 from torch.utils.data import DataLoader
 
 from chebai.preprocessing import reader as dr
@@ -929,11 +930,17 @@ def get_test_split(
         labels_list = df["labels"].tolist()
 
         test_size = 1 - self.train_split - (1 - self.train_split) ** 2
-        msss = MultilabelStratifiedShuffleSplit(
-            n_splits=1, test_size=test_size, random_state=seed
-        )
 
-        train_indices, test_indices = next(msss.split(labels_list, labels_list))
+        if len(labels_list[0]) > 1:
+            splitter = MultilabelStratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
+        else:
+            splitter = StratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
+
+        train_indices, test_indices = next(splitter.split(labels_list, labels_list))
 
         df_train = df.iloc[train_indices]
         df_test = df.iloc[test_indices]
@@ -985,12 +992,18 @@ def get_train_val_splits_given_test(
 
         # scale val set size by 1/self.train_split to compensate for (hypothetical) test set size (1-self.train_split)
         test_size = ((1 - self.train_split) ** 2) / self.train_split
-        msss = MultilabelStratifiedShuffleSplit(
-            n_splits=1, test_size=test_size, random_state=seed
-        )
+
+        if len(labels_list_trainval[0]) > 1:
+            splitter = MultilabelStratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
+        else:
+            splitter = StratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
 
         train_indices, validation_indices = next(
-            msss.split(labels_list_trainval, labels_list_trainval)
+            splitter.split(labels_list_trainval, labels_list_trainval)
         )
 
         df_validation = df_trainval.iloc[validation_indices]

From 4fc31dab7716a54f05666f9bd0d5fe51d066e647 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 30 Sep 2024 16:13:40 +0200
Subject: [PATCH 050/112] chebi notebook : suggested changes

- https://github.com/ChEB-AI/python-chebai/pull/46#pullrequestreview-2325741708
---
 tutorials/data_exploration_chebi.ipynb | 836 +++++++++++++++++++++++++
 1 file changed, 836 insertions(+)
 create mode 100644 tutorials/data_exploration_chebi.ipynb

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
new file mode 100644
index 00000000..17c3ae33
--- /dev/null
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -0,0 +1,836 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "\n",
+    "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n",
+    "\n",
+    "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n",
+    "\n",
+    "---\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d",
+   "metadata": {},
+   "source": [
+    "# 1. Instantiation of a Data Class\n",
+    "\n",
+    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n",
+    "### Inheritance Hierarchy\n",
+    "\n",
+    "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n",
+    "\n",
+    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
+    "\n",
+    "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
+    "\n",
+    "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
+    "\n",
+    "\n",
+    "### Explanation\n",
+    "A ChEBI data class can be configured with the following main parameters:\n",
+    "\n",
+    "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
+    "\n",
+    "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n",
+    "\n",
+    "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\n",
+    "\n",
+    "### Additional Input Parameters\n",
+    "\n",
+    "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
+   "metadata": {},
+   "source": [
+    "# Available ChEBI Data Classes\n",
+    "\n",
+    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py):\n",
+    "\n",
+    "## `ChEBIOver100`\n",
+    "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
+    "\n",
+    "## `ChEBIOver50`\n",
+    "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
+    "\n",
+    "## `ChEBIOver100DeepSMILES`\n",
+    "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n",
+    "\n",
+    "## `ChEBIOver100SELFIES`\n",
+    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n",
+    "\n",
+    "## `ChEBIOver50SELFIES`\n",
+    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n",
+    "\n",
+    "## `ChEBIOver50Partial`\n",
+    "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chebai.preprocessing.datasets.chebi import ChEBIOver50"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chebi_class = ChEBIOver50(chebi_version=231)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1655d489-25fe-46de-9feb-eeca5d36936f",
+   "metadata": {},
+   "source": [
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
+    "### Automatic Execution: \n",
+    "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
+    "\n",
+    "\n",
+    "### Why is Preparation Needed?\n",
+    "\n",
+    "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n",
+    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
+    "\n",
+    "### Main Methods for Data Preprocessing\n",
+    "\n",
+    "The data preprocessing in a data class involves two main methods:\n",
+    "\n",
+    "1. **`prepare_data` Method**:\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
+    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
+    "\n",
+    "2. **`setup` Method**:\n",
+    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "f2df4bd1-cf34-4414-bce4-54379ffac006",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n",
+      "Cross-validation enabled: False\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n",
+      "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n",
+      "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n"
+     ]
+    }
+   ],
+   "source": [
+    "chebi_class.prepare_data()\n",
+    "chebi_class.setup()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e",
+   "metadata": {},
+   "source": [
+    "# 3. Different Data Files Created and their Structure\n",
+    "\n",
+    "\n",
+    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n",
+    "\n",
+    "### Data Files\n",
+    "\n",
+    "1. **`Raw Data Files`**: (e.g., `.obo` file)\n",
+    "   - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
+    "\n",
+    "2. **`data.pkl`**\n",
+    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
+    "\n",
+    "3. **`data.pt`**\n",
+    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "\n",
+    "4. **`classes.txt`**\n",
+    "   - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "5. **`splits.csv`**\n",
+    "   - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n",
+    "\n",
+    "### File Structure and Preprocessing Stages\n",
+    "\n",
+    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
+    "\n",
+    "1. **Raw Data Stage**:\n",
+    "   - **File**: `chebi.obo`\n",
+    "   - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
+    "\n",
+    "2. **Processed Data Stage 1**:\n",
+    "   - **File**: `data.pkl`\n",
+    "   - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
+    "   - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n",
+    "\n",
+    "3. **Processed Data Stage 2**:\n",
+    "   - **File**: `data.pt`\n",
+    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
+    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
+    "\n",
+    "### Data Splits\n",
+    "\n",
+    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
+    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n",
+    "\n",
+    "### Summary of File Paths\n",
+    "\n",
+    "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n",
+    "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n",
+    "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n",
+    "\n",
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74adb549-9e02-472d-a535-78a584853b52",
+   "metadata": {},
+   "source": [
+    "# 4. Information Stored in the Files\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43329709-5134-4ce5-88e7-edd2176bf84d",
+   "metadata": {},
+   "source": [
+    "## chebi.obo\n",
+    "\n",
+    "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n",
+    "\n",
+    "### Structure of `chebi.obo`\n",
+    "\n",
+    "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n",
+    "\n",
+    "#### Example of a Term Document\n",
+    "\n",
+    "```plaintext\n",
+    "[Term]\n",
+    "id: CHEBI:24867\n",
+    "name: monoatomic ion\n",
+    "subset: 3_STAR\n",
+    "synonym: \"monoatomic ions\" RELATED [ChEBI]\n",
+    "is_a: CHEBI:24870\n",
+    "is_a: CHEBI:33238\n",
+    "is_a: CHEBI:33238\n",
+    "```\n",
+    "\n",
+    "### Breakdown of Attributes\n",
+    "\n",
+    "Each term document in the `chebi.obo` file consists of the following key attributes:\n",
+    "\n",
+    "- **`[Term]`**: \n",
+    "  - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\n",
+    "\n",
+    "- **`id: CHEBI:24867`**: \n",
+    "  - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n",
+    "  - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n",
+    "\n",
+    "- **`name: monoatomic ion`**: \n",
+    "  - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n",
+    "  - **Example**: \"monoatomic ion\" is the name of this entity; its `synonym` line lists \"monoatomic ions\" as a RELATED synonym, indicating a related term within the ChEBI ontology.\n",
+    "\n",
+    "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n",
+    "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n",
+    "  - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning it inherits from both parent terms. These raw terms are the starting point for the different stages of preprocessing, from raw input files to processed, model-ready formats."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "322bc926-69ff-4b93-9e95-5e8b85869c38",
+   "metadata": {},
+   "source": [
+    "## `data.pkl` File\n",
+    "\n",
+    "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n",
+    "\n",
+    "\n",
+    "\n",
+    "### Structure of `data.pkl`\n",
+    "`data.pkl` has the following structure:\n",
+    "- **Column 0**: Contains the ID of each ChEBI data instance.\n",
+    "- **Column 1**: Contains the name of each ChEBI data instance.\n",
+    "- **Column 2**: Contains the SMILES representation of the chemical.\n",
+    "- **Column 3 and onwards**: Contain the labels, one boolean column per selected ChEBI class.\n",
+    "\n",
+    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "fd490270-59b8-4c1c-8b09-204defddf592",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "d7d16247-092c-4e8d-96c2-ab23931cf766",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the data (rows x columns):  (129184, 1335)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>SMILES</th>\n",
+       "      <th>1722</th>\n",
+       "      <th>2468</th>\n",
+       "      <th>2571</th>\n",
+       "      <th>2580</th>\n",
+       "      <th>2634</th>\n",
+       "      <th>3098</th>\n",
+       "      <th>3992</th>\n",
+       "      <th>...</th>\n",
+       "      <th>143017</th>\n",
+       "      <th>143212</th>\n",
+       "      <th>143813</th>\n",
+       "      <th>146180</th>\n",
+       "      <th>147334</th>\n",
+       "      <th>156473</th>\n",
+       "      <th>166828</th>\n",
+       "      <th>166904</th>\n",
+       "      <th>167497</th>\n",
+       "      <th>167559</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>33429</td>\n",
+       "      <td>monoatomic monoanion</td>\n",
+       "      <td>[*-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>30151</td>\n",
+       "      <td>aluminide(1-)</td>\n",
+       "      <td>[Al-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>16042</td>\n",
+       "      <td>halide anion</td>\n",
+       "      <td>[*-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>17051</td>\n",
+       "      <td>fluoride</td>\n",
+       "      <td>[F-]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>28741</td>\n",
+       "      <td>sodium fluoride</td>\n",
+       "      <td>[F-].[Na+]</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 1335 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      id                  name      SMILES   1722   2468   2571   2580   2634  \\\n",
+       "0  33429  monoatomic monoanion        [*-]  False  False  False  False  False   \n",
+       "1  30151         aluminide(1-)       [Al-]  False  False  False  False  False   \n",
+       "2  16042          halide anion        [*-]  False  False  False  False  False   \n",
+       "3  17051              fluoride        [F-]  False  False  False  False  False   \n",
+       "4  28741       sodium fluoride  [F-].[Na+]  False  False  False  False  False   \n",
+       "\n",
+       "    3098   3992  ...  143017  143212  143813  146180  147334  156473  166828  \\\n",
+       "0  False  False  ...   False   False   False   False   False   False   False   \n",
+       "1  False  False  ...   False   False   False   False   False   False   False   \n",
+       "2  False  False  ...   False   False   False   False   False   False   False   \n",
+       "3  False  False  ...   False   False   False   False   False   False   False   \n",
+       "4  False  False  ...   False   False   False   False   False   False   False   \n",
+       "\n",
+       "   166904  167497  167559  \n",
+       "0   False   False   False  \n",
+       "1   False   False   False  \n",
+       "2   False   False   False  \n",
+       "3   False   False   False  \n",
+       "4   False   False   False  \n",
+       "\n",
+       "[5 rows x 1335 columns]"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n",
+    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
+    "pkl_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07",
+   "metadata": {},
+   "source": [
+    "## `data.pt` File\n",
+    "\n",
+    "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n",
+    "\n",
+    "### Structure of `data.pt`\n",
+    "\n",
+    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
+    "\n",
+    "- **`features`**: \n",
+    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "\n",
+    "- **`labels`**: \n",
+    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type of loaded data: <class 'list'>\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n",
+    "print(\"Type of loaded data:\", type(data_pt))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n",
+      "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n",
+      "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 16042, 'group': None}\n",
+      "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n",
+      "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(5):\n",
+    "    print(data_pt[i])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
+   "metadata": {},
+   "source": [
+    "## `classes.txt` File\n",
+    "\n",
+    "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n",
+    "\n",
+    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1722\n",
+      "2468\n",
+      "2571\n",
+      "2580\n",
+      "2634\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n",
+    "    for i in range(5):\n",
+    "        line = file.readline()\n",
+    "        print(line.strip())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b058714f-e434-4367-89b9-74c129ac727f",
+   "metadata": {},
+   "source": [
+    "## `splits.csv` File\n",
+    "\n",
+    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>split</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>33429</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>30151</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>17051</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>32129</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>30340</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      id  split\n",
+       "0  33429  train\n",
+       "1  30151  train\n",
+       "2  17051  train\n",
+       "3  32129  train\n",
+       "4  30340  train"
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n",
+    "csv_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
+   "metadata": {},
+   "source": [
+    "# 5. Example Molecule: Different Encodings\n",
+    "\n",
+    "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
+    "\n",
+    "### Explanation:\n",
+    "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
+    "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
+    "\n",
+    "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n",
+    "\n",
+    "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n",
+    "   - **Benzene SMILES**: `c1ccccc1`\n",
+    "   - **Explanation**: \n",
+    "     - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n",
+    "\n",
+    "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n",
+    "   - **Benzene SELFIES**: `[C][=C][C][=C][C][=C][Ring1][=Branch1]`\n",
+    "   - **Explanation**: \n",
+    "     - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom joined to the previous atom by a double bond.\n",
+    "     - SELFIES encodes the alternating single and double bonds of benzene's ring in kekulized form; the trailing `[Ring1][=Branch1]` tokens close the six-membered ring.\n",
+    "\n",
+    "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "93e328cf-09f9-4694-b175-28320590937d",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (env_chebai)",
+   "language": "python",
+   "name": "env_chebai"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 587c0264b6a9c79a7d2b6be490c03486acc197f8 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 30 Sep 2024 16:14:31 +0200
Subject: [PATCH 051/112] go_notebook: data exploration

---
 tutorials/data_exploration_go.ipynb | 551 ++++++++++++++++++++++++++++
 1 file changed, 551 insertions(+)
 create mode 100644 tutorials/data_exploration_go.ipynb

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
new file mode 100644
index 00000000..391192a1
--- /dev/null
+++ b/tutorials/data_exploration_go.ipynb
@@ -0,0 +1,551 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Introduction\n",
+    "\n",
+    "This notebook serves as a guide for new users of the `chebai` package, focusing on its support for Gene Ontology (GO) and Swiss UniProt protein data. This notebook will explain how to instantiate the main data class, how the data files are structured, and how protein sequences are represented.\n",
+    "\n",
+    "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n",
+    "\n",
+    "---"
+   ],
+   "id": "da687d32ba48b188"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Information for Protein Dataset\n",
+    "\n",
+    "# 1. Instantiation of a Data Class\n",
+    "\n",
+    "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data.\n",
+    "### Inheritance Hierarchy\n",
+    "\n",
+    "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n",
+    "\n",
+    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
+    "\n",
+    "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
+    "\n",
+    "In summary, GO_UniProt data classes are designed to manage and preprocess protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
+    "\n",
+    "\n",
+    "### Configuration Parameters\n",
+    "\n",
+    "Data classes related to proteins can be configured using the following main parameters:\n",
+    "\n",
+    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n",
+    "\n",
+    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
+    "\n",
+    "### Additional Input Parameters\n",
+    "\n",
+    "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n",
+    "\n",
+    "### Available GOUniProt Data Classes\n",
+    "\n",
+    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n",
+    "\n",
+    "#### `GOUniProtOver250`\n",
+    "\n",
+    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n",
+    "\n",
+    "#### `GOUniProtOver50`\n",
+    "\n",
+    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n",
+    "\n",
+    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n"
+   ],
+   "id": "64585012b0d7f66f"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### Instantiation Example",
+   "id": "605bbca601037df2"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250",
+   "id": "440f203ceaf7e4b7",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T14:08:21.236447Z",
+     "start_time": "2024-09-30T14:08:21.130242Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "go_class = GOUniProtOver250()",
+   "id": "a648346d81d0dc5e",
+   "outputs": [],
+   "execution_count": 2
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## GOUniProt Data File Structure\n",
+    "\n",
+    "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n",
+    "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n",
+    "   - **File Paths**:\n",
+    "     - `data/GO_UniProt/raw/${filename}.obo`\n",
+    "     - `data/GO_UniProt/raw/${filename}.dat`\n",
+    "\n",
+    "2. **`data.pkl`**\n",
+    "   - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (amino acid sequences), and class columns with boolean values.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
+    "\n",
+    "3. **`data.pt`**\n",
+    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "\n",
+    "4. **`classes.txt`**\n",
+    "   - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "5. **`splits.csv`**\n",
+    "   - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n",
+    "\n",
+    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n"
+   ],
+   "id": "ee174b61b36c71aa"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Once a GO_UniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
+    "### Automatic Execution: \n",
+    "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
+    "\n",
+    "\n",
+    "### Why is Preparation Needed?\n",
+    "\n",
+    "- **Data Availability**: The preparation step ensures that the required GO and UniProt data files are downloaded or loaded, which are essential for analysis.\n",
+    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
+    "\n",
+    "### Main Methods for Data Preprocessing\n",
+    "\n",
+    "The data preprocessing in a data class involves two main methods:\n",
+    "\n",
+    "1. **`prepare_data` Method**:\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
+    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
+    "\n",
+    "2. **`setup` Method**:\n",
+    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
+   ],
+   "id": "2328e824c4dafb2d"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "go_class.prepare_data()\n",
+    "go_class.setup()"
+   ],
+   "id": "9f77351090560bc4"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## data.pkl",
+   "id": "735844f0b2474ad6"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T14:08:33.990378Z",
+     "start_time": "2024-09-30T14:08:33.959459Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "import pandas as pd",
+   "id": "b4da7e73e251e1d1",
+   "outputs": [],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T14:10:12.796911Z",
+     "start_time": "2024-09-30T14:10:06.052276Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n",
+    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
+    "pkl_df.head()"
+   ],
+   "id": "b66fbb9b720d053c",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the data (rows x columns):  (27459, 1050)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "       swiss_id             accession  \\\n",
+       "8   14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
+       "9   14331_CAEEL         P41932,Q21537   \n",
+       "10  14331_MAIZE                P49106   \n",
+       "13  14332_MAIZE                Q01526   \n",
+       "14  14333_ARATH  P42644,F4KBI7,Q945L2   \n",
+       "\n",
+       "                                               go_ids  \\\n",
+       "8                                             [19222]   \n",
+       "9   [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...   \n",
+       "10                         [3677, 5634, 10468, 44877]   \n",
+       "13                         [3677, 5634, 10468, 44877]   \n",
+       "14  [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...   \n",
+       "\n",
+       "                                             sequence     41     75    122  \\\n",
+       "8   MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
+       "9   MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
+       "10  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
+       "13  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
+       "14  MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...  False  False  False   \n",
+       "\n",
+       "      165    209    226  ...  2000145  2000146  2000147  2000241  2000243  \\\n",
+       "8   False  False  False  ...    False    False    False    False    False   \n",
+       "9   False  False  False  ...    False    False    False    False    False   \n",
+       "10  False  False  False  ...    False    False    False    False    False   \n",
+       "13  False  False  False  ...    False    False    False    False    False   \n",
+       "14  False  False  False  ...    False    False    False    False    False   \n",
+       "\n",
+       "    2000377  2001020  2001141  2001233  2001234  \n",
+       "8     False    False    False    False    False  \n",
+       "9     False    False    False    False    False  \n",
+       "10    False    False    False    False    False  \n",
+       "13    False    False    False    False    False  \n",
+       "14    False    False    False    False    False  \n",
+       "\n",
+       "[5 rows x 1050 columns]"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>swiss_id</th>\n",
+       "      <th>accession</th>\n",
+       "      <th>go_ids</th>\n",
+       "      <th>sequence</th>\n",
+       "      <th>41</th>\n",
+       "      <th>75</th>\n",
+       "      <th>122</th>\n",
+       "      <th>165</th>\n",
+       "      <th>209</th>\n",
+       "      <th>226</th>\n",
+       "      <th>...</th>\n",
+       "      <th>2000145</th>\n",
+       "      <th>2000146</th>\n",
+       "      <th>2000147</th>\n",
+       "      <th>2000241</th>\n",
+       "      <th>2000243</th>\n",
+       "      <th>2000377</th>\n",
+       "      <th>2001020</th>\n",
+       "      <th>2001141</th>\n",
+       "      <th>2001233</th>\n",
+       "      <th>2001234</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>14331_ARATH</td>\n",
+       "      <td>P42643,Q945M2,Q9M0S7</td>\n",
+       "      <td>[19222]</td>\n",
+       "      <td>MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>14331_CAEEL</td>\n",
+       "      <td>P41932,Q21537</td>\n",
+       "      <td>[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...</td>\n",
+       "      <td>MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>14331_MAIZE</td>\n",
+       "      <td>P49106</td>\n",
+       "      <td>[3677, 5634, 10468, 44877]</td>\n",
+       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>14332_MAIZE</td>\n",
+       "      <td>Q01526</td>\n",
+       "      <td>[3677, 5634, 10468, 44877]</td>\n",
+       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>14333_ARATH</td>\n",
+       "      <td>P42644,F4KBI7,Q945L2</td>\n",
+       "      <td>[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...</td>\n",
+       "      <td>MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 1050 columns</p>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 7
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## data.pt",
+   "id": "2c9f23883c66b48d"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T14:10:35.034002Z",
+     "start_time": "2024-09-30T14:10:35.018342Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "import torch",
+   "id": "85b097601fb242d6",
+   "outputs": [],
+   "execution_count": 8
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T14:11:36.443693Z",
+     "start_time": "2024-09-30T14:11:34.199285Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n",
+    "print(\"Type of loaded data:\", type(data_pt))\n",
+    "for i in range(1):\n",
+    "    print(data_pt[i])"
+   ],
+   "id": "289a54a71dec20fb",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type of loaded data: <class 'list'>\n",
+      "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n"
+     ]
+    }
+   ],
+   "execution_count": 11
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## Protein Representation Using Amino Acid Sequence Notation\n",
+    "\n",
+    "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n",
+    "\n",
+    "### Example Protein Sequence\n",
+    "\n",
+    "Protein: **Lysozyme C** from **Gallus gallus** (Chicken).  \n",
+    "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n",
+    "\n",
+    "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n",
+    "- **Sequence Length**: 147\n",
+    "\n",
+    "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n",
+    "\n",
+    "### The 20 Amino Acids and Their One-Letter Notations\n",
+    "\n",
+    "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n",
+    "\n",
+    "| One-Letter Notation | Amino Acid Name      | Description                                             |\n",
+    "|---------------------|----------------------|---------------------------------------------------------|\n",
+    "| **A**               | Alanine              | Non-polar, aliphatic amino acid.                        |\n",
+    "| **C**               | Cysteine             | Polar, contains a thiol group, forms disulfide bonds.   |\n",
+    "| **D**               | Aspartic Acid        | Acidic, negatively charged at physiological pH.         |\n",
+    "| **E**               | Glutamic Acid        | Acidic, negatively charged at physiological pH.         |\n",
+    "| **F**               | Phenylalanine        | Aromatic, non-polar.                                    |\n",
+    "| **G**               | Glycine              | Smallest amino acid, non-polar.                         |\n",
+    "| **H**               | Histidine            | Polar, positively charged, can participate in enzyme active sites. |\n",
+    "| **I**               | Isoleucine           | Non-polar, aliphatic.                                   |\n",
+    "| **K**               | Lysine               | Basic, positively charged at physiological pH.          |\n",
+    "| **L**               | Leucine              | Non-polar, aliphatic.                                   |\n",
+    "| **M**               | Methionine           | Non-polar, contains sulfur, encoded by the start codon (AUG). |\n",
+    "| **N**               | Asparagine           | Polar, uncharged.                                       |\n",
+    "| **P**               | Proline              | Non-polar, introduces kinks in protein chains.          |\n",
+    "| **Q**               | Glutamine            | Polar, uncharged.                                       |\n",
+    "| **R**               | Arginine             | Basic, positively charged, involved in binding phosphate groups. |\n",
+    "| **S**               | Serine               | Polar, can be phosphorylated.                           |\n",
+    "| **T**               | Threonine            | Polar, can be phosphorylated.                           |\n",
+    "| **V**               | Valine               | Non-polar, aliphatic.                                   |\n",
+    "| **W**               | Tryptophan           | Aromatic, non-polar, largest amino acid.                |\n",
+    "| **Y**               | Tyrosine             | Aromatic, polar, can be phosphorylated.                 |\n",
+    "\n",
+    "### Understanding Protein Sequences\n",
+    "\n",
+    "In the example sequence, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n",
+    "\n",
+    "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n",
+    "\n",
+    "\n",
+    "_Note_: For more information on amino acid sequences, see https://en.wikipedia.org/wiki/Protein_primary_structure"
+   ],
+   "id": "481b8c0271ec9636"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 71e9888d54276413f4d145c031ea56cd60d0f228 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 30 Sep 2024 20:29:32 +0200
Subject: [PATCH 052/112] Delete data_exploration.ipynb

---
 tutorials/data_exploration.ipynb | 1294 ------------------------------
 1 file changed, 1294 deletions(-)
 delete mode 100644 tutorials/data_exploration.ipynb

diff --git a/tutorials/data_exploration.ipynb b/tutorials/data_exploration.ipynb
deleted file mode 100644
index fce3a9f7..00000000
--- a/tutorials/data_exploration.ipynb
+++ /dev/null
@@ -1,1294 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b",
-   "metadata": {},
-   "source": [
-    "# Introduction\n",
-    "\n",
-    "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n",
-    "\n",
-    "---\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d",
-   "metadata": {},
-   "source": [
-    "# 1. Instantiation of a Data Class\n",
-    "\n",
-    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n",
-    "### Inheritance Hierarchy\n",
-    "\n",
-    "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n",
-    "\n",
-    "- **`XYBaseDataModule`**: This is the base class for all data modules in `chebai`, providing foundational properties and methods for handling and processing datasets, including loading a stored dataset and creating a `DataLoader`.\n",
-    "\n",
-    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for some datasets (e.g. the ChEBI and Gene Ontology datasets). The defining feature is the dynamically created data split into training, validation and test sets. It inherits from `XYBaseDataModule`.\n",
-    "\n",
-    "\n",
-    "\n",
-    "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
-    "\n",
-    "\n",
-    "### Explanation\n",
-    "A ChEBI data class can be configured with the following main parameters:\n",
-    "\n",
-    "- **chebi_version (int)**: Specifies the version of the ChEBI dataset to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
-    "\n",
-    "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n",
-    "\n",
-    "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\n",
-    "\n",
-    "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\n",
-    "\n",
-    "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\n",
-    "\n",
-    "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\n",
-    "\n",
-    "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\n",
-    "\n",
-    "### Additional Input Parameters\n",
-    "\n",
-    "The `XYBaseDa ChEBI data class, which `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n",
-    "\n",
-    "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n",
-    "\n",
-    "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\n",
-    "\n",
-    "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\n",
-    "\n",
-    "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\n",
-    "\n",
-    "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\n",
-    "\n",
-    "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\n",
-    "\n",
-    "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\n",
-    "\n",
-    "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\n",
-    "\n",
-    "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\n",
-    "\n",
-    "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\n",
-    "\n",
-    "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\n",
-    "\n",
-    "- **kwargs**: Additional keyword arguments.\n",
-    "\n",
-    "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
-   "metadata": {},
-   "source": [
-    "# Available ChEBI Data Classes\n",
-    "\n",
-    "## `ChEBIOver100`\n",
-    "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
-    "\n",
-    "## `ChEBIOver50`\n",
-    "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
-    "\n",
-    "## `ChEBIOver100DeepSMILES`\n",
-    "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n",
-    "\n",
-    "## `ChEBIOver100SELFIES`\n",
-    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n",
-    "\n",
-    "## `ChEBIOver50SELFIES`\n",
-    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n",
-    "\n",
-    "## `ChEBIOver50Partial`\n",
-    "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from chebai.preprocessing.datasets.chebi import ChEBIOver50"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chebi_class = ChEBIOver50(chebi_version=231)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1655d489-25fe-46de-9feb-eeca5d36936f",
-   "metadata": {},
-   "source": [
-    "# 2. Preparation / Setup Methods\n",
-    "\n",
-    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
-    "\n",
-    "### Why is Preparation Needed?\n",
-    "\n",
-    "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n",
-    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
-    "\n",
-    "### Main Methods for Data Preprocessing\n",
-    "\n",
-    "The data preprocessing in a data class involves two main methods:\n",
-    "\n",
-    "1. **`prepare_data` Method**:\n",
-    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
-    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
-    "\n",
-    "2. **`setup` Method**:\n",
-    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
-    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
-    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
-    "\n",
-    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "f2df4bd1-cf34-4414-bce4-54379ffac006",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n",
-      "Cross-validation enabled: False\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n",
-      "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n",
-      "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n"
-     ]
-    }
-   ],
-   "source": [
-    "chebi_class.prepare_data()\n",
-    "chebi_class.setup()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e",
-   "metadata": {},
-   "source": [
-    "# 3. Different Data Files Created and their Structure\n",
-    "\n",
-    "\n",
-    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n",
-    "\n",
-    "### Data Files\n",
-    "\n",
-    "1. **`Raw Data Files`**: (e.g., `.obo` file)\n",
-    "   - **Description**: Contains the raw ChEBI ontology data in OBO format, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
-    "\n",
-    "2. **`data.pkl`**\n",
-    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a Pandas dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
-    "\n",
-    "3. **`data.pt`**\n",
-    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
-    "\n",
-    "4. **`classes.txt`**\n",
-    "   - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n",
-    "\n",
-    "5. **`splits.csv`**\n",
-    "   - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n",
-    "\n",
-    "### File Structure and Preprocessing Stages\n",
-    "\n",
-    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
-    "\n",
-    "1. **Raw Data Stage**:\n",
-    "   - **File**: `chebi.obo`\n",
-    "   - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
-    "\n",
-    "2. **Processed Data Stage 1**:\n",
-    "   - **File**: `data.pkl`\n",
-    "   - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
-    "   - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n",
-    "\n",
-    "3. **Processed Data Stage 2**:\n",
-    "   - **File**: `data.pt`\n",
-    "   - **Description**: This final stage includes the tokenized data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
-    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
-    "\n",
-    "### Data Splits\n",
-    "\n",
-    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
-    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n",
-    "\n",
-    "### Summary of File Paths\n",
-    "\n",
-    "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n",
-    "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n",
-    "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n",
-    "\n",
-    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "74adb549-9e02-472d-a535-78a584853b52",
-   "metadata": {},
-   "source": [
-    "# 4. Information Stored in the Files\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "43329709-5134-4ce5-88e7-edd2176bf84d",
-   "metadata": {},
-   "source": [
-    "## chebi.obo\n",
-    "\n",
-    "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n",
-    "\n",
-    "### Structure of `chebi.obo`\n",
-    "\n",
-    "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n",
-    "\n",
-    "#### Example of a Term Document\n",
-    "\n",
-    "```plaintext\n",
-    "[Term]\n",
-    "id: CHEBI:24867\n",
-    "name: monoatomic ion\n",
-    "subset: 3_STAR\n",
-    "synonym: \"monoatomic ions\" RELATED [ChEBI]\n",
-    "is_a: CHEBI:24870\n",
-    "is_a: CHEBI:33238\n",
-    "is_a: CHEBI:3323Relevant 8\n",
-    "```\n",
-    "\n",
-    "### Breakdown of Attributes\n",
-    "\n",
-    "Each term document in the `chebi.obo` file consists of the following key attributes:\n",
-    "\n",
-    "- **`[Term]`**: \n",
-    "  - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\n",
-    "\n",
-    "- **`id: CHEBI:24867`**: \n",
-    "  - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n",
-    "  - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n",
-    "\n",
-    "- **`name: monoatomic ion`**: \n",
-    "  - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n",
-    "  - **Example**: \"monoatomic ion\" is the namcating a related term within the ChEBI ontology.\n",
-    "\n",
-    "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n",
-    "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n",
-    "  - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaent stages of preprocessing, from raw input files to processed, model-ready formats."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "322bc926-69ff-4b93-9e95-5e8b85869c38",
-   "metadata": {},
-   "source": [
-    "## `data.pkl` File\n",
-    "\n",
-    "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n",
-    "\n",
-    "\n",
-    "\n",
-    "### Structure of `data.pkl`\n",
-    "`data.pkl` as following structure: \n",
-    "- **Column 0**: Contains the ID of each ChEBI data instance.\n",
-    "- **Column 1**: Contains the name of each ChEBI data instance.\n",
-    "- **Column 2**: Contains the SMILES representation of the chemical.\n",
-    "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n",
-    "\n",
-    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "fd490270-59b8-4c1c-8b09-204defddf592",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 53,
-   "id": "d7d16247-092c-4e8d-96c2-ab23931cf766",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Size of the data (rows x columns):  (129184, 1335)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>SMILES</th>\n",
-       "      <th>1722</th>\n",
-       "      <th>2468</th>\n",
-       "      <th>2571</th>\n",
-       "      <th>2580</th>\n",
-       "      <th>2634</th>\n",
-       "      <th>3098</th>\n",
-       "      <th>3992</th>\n",
-       "      <th>...</th>\n",
-       "      <th>143017</th>\n",
-       "      <th>143212</th>\n",
-       "      <th>143813</th>\n",
-       "      <th>146180</th>\n",
-       "      <th>147334</th>\n",
-       "      <th>156473</th>\n",
-       "      <th>166828</th>\n",
-       "      <th>166904</th>\n",
-       "      <th>167497</th>\n",
-       "      <th>167559</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>33429</td>\n",
-       "      <td>monoatomic monoanion</td>\n",
-       "      <td>[*-]</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>30151</td>\n",
-       "      <td>aluminide(1-)</td>\n",
-       "      <td>[Al-]</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>16042</td>\n",
-       "      <td>halide anion</td>\n",
-       "      <td>[*-]</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>17051</td>\n",
-       "      <td>fluoride</td>\n",
-       "      <td>[F-]</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>28741</td>\n",
-       "      <td>sodium fluoride</td>\n",
-       "      <td>[F-].[Na+]</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5 rows × 1335 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      id                  name      SMILES   1722   2468   2571   2580   2634  \\\n",
-       "0  33429  monoatomic monoanion        [*-]  False  False  False  False  False   \n",
-       "1  30151         aluminide(1-)       [Al-]  False  False  False  False  False   \n",
-       "2  16042          halide anion        [*-]  False  False  False  False  False   \n",
-       "3  17051              fluoride        [F-]  False  False  False  False  False   \n",
-       "4  28741       sodium fluoride  [F-].[Na+]  False  False  False  False  False   \n",
-       "\n",
-       "    3098   3992  ...  143017  143212  143813  146180  147334  156473  166828  \\\n",
-       "0  False  False  ...   False   False   False   False   False   False   False   \n",
-       "1  False  False  ...   False   False   False   False   False   False   False   \n",
-       "2  False  False  ...   False   False   False   False   False   False   False   \n",
-       "3  False  False  ...   False   False   False   False   False   False   False   \n",
-       "4  False  False  ...   False   False   False   False   False   False   False   \n",
-       "\n",
-       "   166904  167497  167559  \n",
-       "0   False   False   False  \n",
-       "1   False   False   False  \n",
-       "2   False   False   False  \n",
-       "3   False   False   False  \n",
-       "4   False   False   False  \n",
-       "\n",
-       "[5 rows x 1335 columns]"
-      ]
-     },
-     "execution_count": 53,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n",
-    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
-    "pkl_df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07",
-   "metadata": {},
-   "source": [
-    "## `data.pt` File\n",
-    "\n",
-    "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n",
-    "\n",
-    "### Structure of `data.pt`\n",
-    "\n",
-    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
-    "\n",
-    "- **`features`**: \n",
-    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
-    "\n",
-    "- **`labels`**: \n",
-    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
-    "\n",
-    "- **`ident`**: \n",
-    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 75,
-   "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Type of loaded data: <class 'list'>\n"
-     ]
-    }
-   ],
-   "source": [
-    "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n",
-    "print(\"Type of loaded data:\", type(data_pt))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 81,
-   "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n",
-      "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n",
-      "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 16042, 'group': None}\n",
-      "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n",
-      "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i in range(5):\n",
-    "    print(data_pt[i])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
-   "metadata": {},
-   "source": [
-    "## `classes.txt` File\n",
-    "\n",
-    "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n",
-    "\n",
-    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1722\n",
-      "2468\n",
-      "2571\n",
-      "2580\n",
-      "2634\n"
-     ]
-    }
-   ],
-   "source": [
-    "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n",
-    "    for i in range(5):\n",
-    "        line = file.readline()\n",
-    "        print(line.strip())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b058714f-e434-4367-89b9-74c129ac727f",
-   "metadata": {},
-   "source": [
-    "## `splits.csv` File\n",
-    "\n",
-    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 98,
-   "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>split</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>33429</td>\n",
-       "      <td>train</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>30151</td>\n",
-       "      <td>train</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>17051</td>\n",
-       "      <td>train</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>32129</td>\n",
-       "      <td>train</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>30340</td>\n",
-       "      <td>train</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      id  split\n",
-       "0  33429  train\n",
-       "1  30151  train\n",
-       "2  17051  train\n",
-       "3  32129  train\n",
-       "4  30340  train"
-      ]
-     },
-     "execution_count": 98,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n",
-    "csv_df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
-   "metadata": {},
-   "source": [
-    "# 5. Example Molecule: Different Encodings\n",
-    "\n",
-    "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
-    "\n",
-    "### Explanation:\n",
-    "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
-    "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
-    "\n",
-    "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n",
-    "\n",
-    "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n",
-    "   - **Benzene SMILES**: `c1ccccc1`\n",
-    "   - **Explanation**: \n",
-    "     - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n",
-    "\n",
-    "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n",
-    "   - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n",
-    "   - **Explanation**: \n",
-    "     - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n",
-    "     - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n",
-    "\n",
-    "### 3. **InChI (IUPAC International Chemical Identifier)**\n",
-    "   - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\n",
-    "   - **Explanation**: \n",
-    "     - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\n",
-    "\n",
-    "### 4. **InChIKey**\n",
-    "   - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\n",
-    "   - **Explanation**: \n",
-    "     - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\n",
-    "\n",
-    "### 5. **Canonical SMILES**\n",
-    "   - **Benzene Canonical SMILES**: `c1ccccc1`\n",
-    "   - **Explanation**:\n",
-    "     - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\n",
-    "\n",
-    "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\n",
-    "   - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\n",
-    "   - **Explanation**: \n",
-    "     - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\n",
-    "\n",
-    "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "93e328cf-09f9-4694-b175-28320590937d",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde",
-   "metadata": {},
-   "source": [
-    "# Information for Protein Dataset\n",
-    "\n",
-    "The protein dataset follows thsimilarme file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\n",
-    "\n",
-    "### Configuration Parameters\n",
-    "\n",
-    "Data classes related to proteins can be configured using the following main parameters:\n",
-    "\n",
-    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n",
-    "\n",
-    "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\n",
-    "\n",
-    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
-    "\n",
-    "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\n",
-    "\n",
-    "### Available GOUniProt Data Classes\n",
-    "\n",
-    "#### `GOUniProtOver250`\n",
-    "\n",
-    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n",
-    "\n",
-    "#### `GOUniProtOver50`\n",
-    "\n",
-    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n",
-    "\n",
-    "### Instantiation Example\n",
-    "\n",
-    "```python\n",
-    "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\n",
-    "go_class = GOUniProtOver250()\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2ffca830-bc0b-421c-8054-0860c95c10f2",
-   "metadata": {},
-   "source": [
-    "## GOUniProt Data File Structure\n",
-    "\n",
-    "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n",
-    "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n",
-    "   - **File Paths**:\n",
-    "     - `data/GO_UniProt/raw/${filename}.obo`\n",
-    "     - `data/GO_UniProt/raw/${filename}.dat`\n",
-    "\n",
-    "2. **`data.pkl`**\n",
-    "   - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
-    "\n",
-    "3. **`data.pt`**\n",
-    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
-    "\n",
-    "4. **`classes.txt`**\n",
-    "   - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
-    "\n",
-    "5. **`splits.csv`**\n",
-    "   - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n",
-    "\n",
-    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "61bc261e-2328-4968-aca6-14c48bb24348",
-   "metadata": {},
-   "source": [
-    "## data.pkl"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "id": "31df4ee7-4c03-4ea2-9798-5e5082a74c2b",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Size of the data (rows x columns):  (27459, 1050)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>swiss_id</th>\n",
-       "      <th>accession</th>\n",
-       "      <th>go_ids</th>\n",
-       "      <th>sequence</th>\n",
-       "      <th>41</th>\n",
-       "      <th>75</th>\n",
-       "      <th>122</th>\n",
-       "      <th>165</th>\n",
-       "      <th>209</th>\n",
-       "      <th>226</th>\n",
-       "      <th>...</th>\n",
-       "      <th>2000145</th>\n",
-       "      <th>2000146</th>\n",
-       "      <th>2000147</th>\n",
-       "      <th>2000241</th>\n",
-       "      <th>2000243</th>\n",
-       "      <th>2000377</th>\n",
-       "      <th>2001020</th>\n",
-       "      <th>2001141</th>\n",
-       "      <th>2001233</th>\n",
-       "      <th>2001234</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>14331_ARATH</td>\n",
-       "      <td>P42643,Q945M2,Q9M0S7</td>\n",
-       "      <td>[19222]</td>\n",
-       "      <td>MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>14331_CAEEL</td>\n",
-       "      <td>P41932,Q21537</td>\n",
-       "      <td>[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...</td>\n",
-       "      <td>MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>14331_MAIZE</td>\n",
-       "      <td>P49106</td>\n",
-       "      <td>[3677, 5634, 10468, 44877]</td>\n",
-       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>14332_MAIZE</td>\n",
-       "      <td>Q01526</td>\n",
-       "      <td>[3677, 5634, 10468, 44877]</td>\n",
-       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>14333_ARATH</td>\n",
-       "      <td>P42644,F4KBI7,Q945L2</td>\n",
-       "      <td>[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...</td>\n",
-       "      <td>MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5 rows × 1050 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       swiss_id             accession  \\\n",
-       "8   14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
-       "9   14331_CAEEL         P41932,Q21537   \n",
-       "10  14331_MAIZE                P49106   \n",
-       "13  14332_MAIZE                Q01526   \n",
-       "14  14333_ARATH  P42644,F4KBI7,Q945L2   \n",
-       "\n",
-       "                                               go_ids  \\\n",
-       "8                                             [19222]   \n",
-       "9   [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...   \n",
-       "10                         [3677, 5634, 10468, 44877]   \n",
-       "13                         [3677, 5634, 10468, 44877]   \n",
-       "14  [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...   \n",
-       "\n",
-       "                                             sequence     41     75    122  \\\n",
-       "8   MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
-       "9   MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
-       "10  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
-       "13  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
-       "14  MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...  False  False  False   \n",
-       "\n",
-       "      165    209    226  ...  2000145  2000146  2000147  2000241  2000243  \\\n",
-       "8   False  False  False  ...    False    False    False    False    False   \n",
-       "9   False  False  False  ...    False    False    False    False    False   \n",
-       "10  False  False  False  ...    False    False    False    False    False   \n",
-       "13  False  False  False  ...    False    False    False    False    False   \n",
-       "14  False  False  False  ...    False    False    False    False    False   \n",
-       "\n",
-       "    2000377  2001020  2001141  2001233  2001234  \n",
-       "8     False    False    False    False    False  \n",
-       "9     False    False    False    False    False  \n",
-       "10    False    False    False    False    False  \n",
-       "13    False    False    False    False    False  \n",
-       "14    False    False    False    False    False  \n",
-       "\n",
-       "[5 rows x 1050 columns]"
-      ]
-     },
-     "execution_count": 123,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n",
-    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
-    "pkl_df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "be0078fd-bcf1-4d4c-b8c6-c84e3aeac99c",
-   "metadata": {},
-   "source": [
-    "## data.pt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 127,
-   "id": "a70f9c35-daca-4728-a9ea-b1212866f421",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Type of loaded data: <class 'list'>\n",
-      "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n"
-     ]
-    }
-   ],
-   "source": [
-    "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n",
-    "print(\"Type of loaded data:\", type(data_pt))\n",
-    "for i in range(1):\n",
-    "    print(data_pt[i])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "380049c1-2963-4223-b698-a7b59b9fe595",
-   "metadata": {},
-   "source": [
-    "## Protein Representation Using Amino Acid Sequence Notation\n",
-    "\n",
-    "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n",
-    "\n",
-    "### Example Protein Sequence\n",
-    "\n",
-    "Protein: **Lysozyme C** from **Gallus gallus** (Chicken).  \n",
-    "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n",
-    "\n",
-    "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n",
-    "- **Sequence Length**: 147\n",
-    "\n",
-    "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n",
-    "\n",
-    "### The 20 Amino Acids and Their One-Letter Notations\n",
-    "\n",
-    "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n",
-    "\n",
-    "| One-Letter Notation | Amino Acid Name      | Description                                             |\n",
-    "|---------------------|----------------------|---------------------------------------------------------|\n",
-    "| **A**               | Alanine              | Non-polar, aliphatic amino acid.                        |\n",
-    "| **C**               | Cysteine             | Polar, contains a thiol group, forms disulfide bonds.   |\n",
-    "| **D**               | Aspartic Acid        | Acidic, negatively charged at physiological pH.         |\n",
-    "| **E**               | Glutamic Acid        | Acidic, negatively charged at physiological pH.         |\n",
-    "| **F**               | Phenylalanine        | Aromatic, non-polar.                                    |\n",
-    "| **G**               | Glycine              | Smallest amino acid, non-polar.                         |\n",
-    "| **H**               | Histidine            | Polar, positively charged, can participate in enzyme active sites. |\n",
-    "| **I**               | Isoleucine           | Non-polar, aliphatic.                                   |\n",
-    "| **K**               | Lysine               | Basic, positively charged at physiological pH.          |\n",
-    "| **L**               | Leucine              | Non-polar, aliphatic.                                   |\n",
-    "| **M**               | Methionine           | Non-polar, contains sulfur, start codon in mRNA translation. |\n",
-    "| **N**               | Asparagine           | Polar, uncharged.                                       |\n",
-    "| **P**               | Proline              | Non-polar, introduces kinks in protein chains.          |\n",
-    "| **Q**               | Glutamine            | Polar, uncharged.                                       |\n",
-    "| **R**               | Arginine             | Basic, positively charged, involved in binding phosphate groups. |\n",
-    "| **S**               | Serine               | Polar, can be phosphorylated.                           |\n",
-    "| **T**               | Threonine            | Polar, can be phosphorylated.                           |\n",
-    "| **V**               | Valine               | Non-polar, aliphatic.                                   |\n",
-    "| **W**               | Tryptophan           | Aromatic, non-polar, largest amino acid.                |\n",
-    "| **Y**               | Tyrosine             | Aromatic, polar, can be phosphorylated.                 |\n",
-    "\n",
-    "### Understanding Protein Sequences\n",
-    "\n",
-    "In the example sequence, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n",
-    "\n",
-    "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n",
-    "\n",
-    "\n",
-    "_Note_:  Refer for amino acid sequence:  https://en.wikipedia.org/wiki/Protein_primary_structure"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "702359d6-5338-4391-b196-2328ba5676a1",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python (env_chebai)",
-   "language": "python",
-   "name": "env_chebai"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

From c6b8d5071b16e99c9b379304ddb22829af9840cf Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 30 Sep 2024 23:35:07 +0200
Subject: [PATCH 053/112] add info on evidence codes + uniprot.data file +
 changes

---
 tutorials/data_exploration_go.ipynb | 436 +++++++++++++++++++++++++---
 1 file changed, 402 insertions(+), 34 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 391192a1..2c789ae6 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -18,8 +18,6 @@
    "metadata": {},
    "cell_type": "markdown",
    "source": [
-    "# Information for Protein Dataset\n",
-    "\n",
     "# 1. Instantiation of a Data Class\n",
     "\n",
     "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n",
@@ -71,31 +69,80 @@
    "id": "605bbca601037df2"
   },
   {
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:25:03.920610Z",
+     "start_time": "2024-09-30T21:25:03.622407Z"
+    }
+   },
    "cell_type": "code",
    "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250",
    "id": "440f203ceaf7e4b7",
    "outputs": [],
-   "execution_count": null
+   "execution_count": 12
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-09-30T14:08:21.236447Z",
-     "start_time": "2024-09-30T14:08:21.130242Z"
+     "end_time": "2024-09-30T21:25:08.863132Z",
+     "start_time": "2024-09-30T21:25:08.387739Z"
     }
    },
    "cell_type": "code",
    "source": "go_class = GOUniProtOver250()",
    "id": "a648346d81d0dc5e",
    "outputs": [],
-   "execution_count": 2
+   "execution_count": 13
   },
   {
    "metadata": {},
    "cell_type": "markdown",
    "source": [
-    "## GOUniProt Data File Structure\n",
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
+    "### Automatic Execution\n",
+    "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
+    "\n",
+    "\n",
+    "### Why is Preparation Needed?\n",
+    "\n",
+    "- **Data Availability**: The preparation step ensures that the required GOUniProt data files are downloaded or loaded, which are essential for analysis.\n",
+    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
+    "\n",
+    "### Main Methods for Data Preprocessing\n",
+    "\n",
+    "The data preprocessing in a data class involves two main methods:\n",
+    "\n",
+    "1. **`prepare_data` Method**:\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
+    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
+    "\n",
+    "2. **`setup` Method**:\n",
+    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
+    "   - **Description**: Transforms `data.pkl` into the model input format (`data.pt`), ensuring that the data is compatible with the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
+   ],
+   "id": "2328e824c4dafb2d"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "go_class.prepare_data()\n",
+    "go_class.setup()"
+   ],
+   "id": "9f77351090560bc4",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# 3. GOUniProt Data File Structure\n",
     "\n",
     "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n",
     "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n",
@@ -123,54 +170,225 @@
    ],
    "id": "ee174b61b36c71aa"
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# 4. Information Stored in the Files",
+   "id": "3f92b58e460c08fd"
+  },
   {
    "metadata": {},
    "cell_type": "markdown",
    "source": [
-    "# 2. Preparation / Setup Methods\n",
+    "## go-basic.obo\n",
     "\n",
-    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
-    "### Automatic Execution: \n",
-    "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
+    "The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n",
     "\n",
+    "### Structure of `go-basic.obo`\n",
     "\n",
-    "### Why is Preparation Needed?\n",
+    "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n",
     "\n",
-    "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n",
-    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
+    "#### Example of a Term Document\n",
     "\n",
-    "### Main Methods for Data Preprocessing\n",
+    "```plaintext\n",
+    "[Term]\n",
+    "id: GO:0000032\n",
+    "name: cell wall mannoprotein biosynthetic process\n",
+    "namespace: biological_process\n",
+    "def: \"The chemical reactions and pathways resulting in the formation of cell wall mannoproteins, any cell wall protein that contains covalently bound mannose residues.\" [GOC:ai]\n",
+    "synonym: \"cell wall mannoprotein anabolism\" EXACT []\n",
+    "is_a: GO:0006057 ! mannoprotein biosynthetic process\n",
+    "is_a: GO:0031506 ! cell wall glycoprotein biosynthetic process\n",
+    "```\n",
     "\n",
-    "The data preprocessing in a data class involves two main methods:\n",
+    "### Breakdown of Attributes\n",
     "\n",
-    "1. **`prepare_data` Method**:\n",
-    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
-    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
+    "Each term document in the `go-basic.obo` file consists of the following key attributes:\n",
     "\n",
-    "2. **`setup` Method**:\n",
-    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
-    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
-    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "- **`[Term]`**: \n",
+    "  - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct biological process, molecular function, or cellular component.\n",
     "\n",
-    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
+    "- **`id: GO:0000032`**: \n",
+    "  - **Description**: A unique identifier for the biological term within the GO ontology.\n",
+    "  - **Example**: `GO:0000032` refers to the term \"cell wall mannoprotein biosynthetic process.\"\n",
+    "\n",
+    "- **`name: cell wall mannoprotein biosynthetic process`**: \n",
+    "  - **Description**: The name of the biological process, molecular function, or cellular component being described.\n",
+    "  - **Example**: The name \"cell wall mannoprotein biosynthetic process\" is a descriptive label for the GO term with the identifier `GO:0000032`.\n",
+    "\n",
+    "- **`namespace: biological_process`**: \n",
+    "  - **Description**: Specifies which ontology the term belongs to. The main namespaces are `biological_process`, `molecular_function`, and `cellular_component`.\n",
+    "\n",
+    "- **`is_a: GO:0006057`**: \n",
+    "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n",
+    "  - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of both `GO:0006057` and `GO:0031506`.\n"
    ],
-   "id": "2328e824c4dafb2d"
+   "id": "cca75d881cb8bade"
   },
   {
    "metadata": {},
-   "cell_type": "code",
-   "outputs": [],
-   "execution_count": null,
+   "cell_type": "markdown",
    "source": [
-    "go_class.prepare_data()\n",
-    "go_class.setup()"
+    "## uniprot_sprot.dat\n",
+    "\n",
+    "The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n",
+    "\n",
+    "\n",
+    "## Structure of `uniprot_sprot.dat`\n",
+    "\n",
+    "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n",
+    "\n",
+    "### Example of a Protein Entry\n",
+    "\n",
+    "```plaintext\n",
+    "ID   002L_FRG3G              Reviewed;         320 AA.\n",
+    "AC   Q6GZX3;\n",
+    "DT   28-JUN-2011, integrated into UniProtKB/Swiss-Prot.\n",
+    "DT   19-JUL-2004, sequence version 1.\n",
+    "DT   08-NOV-2023, entry version 46.\n",
+    "DE   RecName: Full=Uncharacterized protein 002L;\n",
+    "GN   ORFNames=FV3-002L;\n",
+    "OS   Frog virus 3 (isolate Goorha) (FV-3).\n",
+    "OC   Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes;\n",
+    "OX   NCBI_TaxID=654924;\n",
+    "OH   NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).\n",
+    "RN   [1]\n",
+    "RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n",
+    "RX   PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;\n",
+    "RA   Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;\n",
+    "RT   \"Comparative genomic analyses of frog virus 3, type species of the genus\n",
+    "RT   Ranavirus (family Iridoviridae).\";\n",
+    "RL   Virology 323:70-84(2004).\n",
+    "CC   -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane\n",
+    "CC       protein {ECO:0000305}.\n",
+    "DR   EMBL; AY548484; AAT09661.1; -; Genomic_DNA.\n",
+    "DR   RefSeq; YP_031580.1; NC_005946.1.\n",
+    "DR   GeneID; 2947774; -.\n",
+    "DR   KEGG; vg:2947774; -.\n",
+    "DR   Proteomes; UP000008770; Segment.\n",
+    "DR   GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n",
+    "DR   GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.\n",
+    "PE   4: Predicted;\n",
+    "KW   Host membrane; Membrane; Reference proteome; Transmembrane;\n",
+    "KW   Transmembrane helix.\n",
+    "FT   CHAIN           1..320\n",
+    "FT                   /note=\"Uncharacterized protein 002L\"\n",
+    "FT                   /id=\"PRO_0000410509\"\n",
+    "SQ   SEQUENCE   320 AA;  34642 MW;  9E110808B6E328E0 CRC64;\n",
+    "     MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR\n",
+    "     IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL\n",
+    "     AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC\n",
+    "     KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML\n",
+    "     DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK\n",
+    "     VMFFVAGAVL VAILISTVRW\n",
+    "//\n",
+    "```\n",
+    "\n",
+    "### Breakdown of Attributes\n",
+    "\n",
+    "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. Here's a breakdown of the key attributes:\n",
+    "\n",
+    "- **`ID`**: \n",
+    "  - **Description**: Contains the unique identifier for the protein and its status (e.g., `Reviewed` indicates the sequence has been manually curated).\n",
+    "  - **Example**: `002L_FRG3G` is the identifier for the protein from Frog virus 3.\n",
+    "\n",
+    "- **`AC`**: \n",
+    "  - **Description**: Accession number, a unique identifier for the protein sequence.\n",
+    "  - **Example**: `Q6GZX3` is the accession number for this entry.\n",
+    "\n",
+    "- **`DR`**: \n",
+    "  - **Description**: Cross-references to other databases like EMBL, RefSeq, KEGG, and GeneID.\n",
+    "  - **Example**: This entry is cross-referenced with the EMBL database, RefSeq, GO, etc.\n",
+    "\n",
+    "- **`GO`**: \n",
+    "  - **Description**: Gene Ontology annotations that describe the cellular component, biological process, or molecular function associated with the protein.\n",
+    "  - **Example**: The protein is associated with the GO terms `GO:0033644` (host cell membrane) and `GO:0016020` (membrane).\n",
+    "\n",
+    "- **`SQ`**: \n",
+    "  - **Description**: The amino acid sequence of the protein.\n",
+    "  - **Example**: The sequence consists of 320 amino acids.\n",
+    "\n",
+    "The `uniprot_sprot.dat` file is an extensively curated resource, containing comprehensive protein data used for various bioinformatics applications.\n",
+    "\n",
+    "__Note__: For more detailed information, refer to [this document](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt).\n",
+    "\n",
+    "\n",
+    "Consider the below line from above example: \n",
+    "```plaintext\n",
+    "DR   GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n",
+    "```\n",
+    "\n",
+    "The line contains a **Gene Ontology (GO) annotation** describing the protein's subcellular location. Here's a detailed breakdown:\n",
+    "\n",
+    "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n",
+    "\n",
+    "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n",
+    "\n",
+    "### More on GO Evidence Codes\n",
+    "\n",
+    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n",
+    "\n",
+    "| **Evidence Code** | **Description** |\n",
+    "|-------------------|-----------------|\n",
+    "| **EXP**           | Inferred from Experiment |\n",
+    "| **IDA**           | Inferred from Direct Assay |\n",
+    "| **IPI**           | Inferred from Physical Interaction |\n",
+    "| **IMP**           | Inferred from Mutant Phenotype |\n",
+    "| **IGI**           | Inferred from Genetic Interaction |\n",
+    "| **IEP**           | Inferred from Expression Pattern |\n",
+    "| **TAS**           | Traceable Author Statement |\n",
+    "| **IC**            | Inferred by Curator |\n",
+    "| **IEA**           | Inferred from Electronic Annotation (Computational) |\n",
+    "| **ISS**           | Inferred from Sequence or Structural Similarity |\n",
+    "| **ISA**           | Inferred from Sequence Alignment |\n",
+    "| **ISM**           | Inferred from Sequence Model |\n",
+    "| **ISO**           | Inferred from Sequence Orthology |\n",
+    "| **ISA**           | Inferred from Sequence Alignment |\n",
+    "| **RCA**           | Inferred from Reviewed Computational Analysis |\n",
+    "| **NAS**           | Non-traceable Author Statement |\n",
+    "| **ND**            | No Biological Data Available (placeholder) |\n",
+    "| **NR**            | Not Recorded |\n",
+    "\n",
+    "\n",
+    "### Grouping of Codes:\n",
+    "\n",
+    "- **Experimental Evidence Codes**: \n",
+    "  - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n",
+    "  \n",
+    "- **Author/Curator Inferred Codes**:\n",
+    "  - **TAS**, **IC**, **NAS**\n",
+    "\n",
+    "- **Computational Evidence Codes**:\n",
+    "  - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n",
+    "\n",
+    "- **Others**:\n",
+    "  - **ND** (No Data), **NR** (Not Recorded)\n",
+    "\n",
+    "\n",
+    "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation."
    ],
-   "id": "9f77351090560bc4"
+   "id": "87c841de7d80beef"
   },
   {
    "metadata": {},
    "cell_type": "markdown",
-   "source": "## data.pkl",
+   "source": [
+    "## data.pkl\n",
+    "\n",
+    "The `data.pkl` file, generated during the preprocessing stage, contains the processed GO data in a dataframe format. Below is an example of how this data is structured:\n",
+    "\n",
+    "\n",
+    "\n",
+    "### Structure of `data.pkl`\n",
+    "`data.pkl` has the following structure:\n",
+    "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n",
+    "- **Column 1**: Contains the accession of each Protein data instance.\n",
+    "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n",
+    "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n",
+    "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n",
+    "\n",
+    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
+   ],
    "id": "735844f0b2474ad6"
   },
   {
@@ -427,7 +645,20 @@
   {
    "metadata": {},
    "cell_type": "markdown",
-   "source": "## data.pt",
+   "source": [
+    "## data.pt\n",
+    "\n",
+    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
+    "\n",
+    "- **`features`**: \n",
+    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "\n",
+    "- **`labels`**: \n",
+    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ],
    "id": "2c9f23883c66b48d"
   },
   {
@@ -470,6 +701,143 @@
    ],
    "execution_count": 11
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## `classes.txt` File\n",
+    "\n",
+    "The `classes.txt` file lists the selected Swiss Protein classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique Swiss Protein class ID, identifying a specific protein from the Swiss-UniProt dataset.\n",
+    "\n",
+    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
+   ],
+   "id": "f69012b3540fd1b6"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:30:34.344202Z",
+     "start_time": "2024-09-30T21:30:34.328318Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "with open(r\"data/GO_UniProt/GO250_BP/processed/classes.txt\", \"r\") as file:\n",
+    "    for i in range(5):\n",
+    "        line = file.readline()\n",
+    "        print(line.strip())"
+   ],
+   "id": "19200f7ff9a6ebba",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "41\n",
+      "75\n",
+      "122\n",
+      "165\n",
+      "209\n"
+     ]
+    }
+   ],
+   "execution_count": 15
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## `splits.csv` File\n",
+    "\n",
+    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs."
+   ],
+   "id": "6661dc11247e9753"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:30:41.586616Z",
+     "start_time": "2024-09-30T21:30:39.318598Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "csv_df = pd.read_csv(r\"data/GO_UniProt/GO250_BP/processed/splits.csv\")\n",
+    "csv_df.head()"
+   ],
+   "id": "88c3ea8f01ba9fac",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "            id  split\n",
+       "0  14331_ARATH  train\n",
+       "1  14331_CAEEL  train\n",
+       "2  14331_MAIZE  train\n",
+       "3  14332_MAIZE  train\n",
+       "4  14333_ARATH  train"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>split</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>14331_ARATH</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>14331_CAEEL</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>14331_MAIZE</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>14332_MAIZE</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>14333_ARATH</td>\n",
+       "      <td>train</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 16
+  },
   {
    "metadata": {},
    "cell_type": "markdown",

From 4c55b04890861c063370345d3f7f0cc169ec88c5 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Mon, 30 Sep 2024 23:51:36 +0200
Subject: [PATCH 054/112] minor formatting changes

---
 tutorials/data_exploration_chebi.ipynb |   1 -
 tutorials/data_exploration_go.ipynb    | 129 ++++++++++++++++---------
 2 files changed, 86 insertions(+), 44 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 17c3ae33..6ddd3238 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -291,7 +291,6 @@
     "synonym: \"monoatomic ions\" RELATED [ChEBI]\n",
     "is_a: CHEBI:24870\n",
     "is_a: CHEBI:33238\n",
-    "is_a: CHEBI:3323Relevant 8\n",
     "```\n",
     "\n",
     "### Breakdown of Attributes\n",
diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 2c789ae6..8dc4cb44 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -94,6 +94,12 @@
    "outputs": [],
    "execution_count": 13
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "---",
+   "id": "651ab5c39833bd2c"
+  },
   {
    "metadata": {},
    "cell_type": "markdown",
@@ -138,6 +144,12 @@
    "outputs": [],
    "execution_count": null
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "---",
+   "id": "db5b58f2d96823fc"
+  },
   {
    "metadata": {},
    "cell_type": "markdown",
@@ -170,6 +182,12 @@
    ],
    "id": "ee174b61b36c71aa"
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "---",
+   "id": "a927ad484c930960"
+  },
   {
    "metadata": {},
    "cell_type": "markdown",
@@ -323,49 +341,7 @@
     "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n",
     "\n",
     "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n",
-    "\n",
-    "### More on GO Evidence Codes\n",
-    "\n",
-    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n",
-    "\n",
-    "| **Evidence Code** | **Description** |\n",
-    "|-------------------|-----------------|\n",
-    "| **EXP**           | Inferred from Experiment |\n",
-    "| **IDA**           | Inferred from Direct Assay |\n",
-    "| **IPI**           | Inferred from Physical Interaction |\n",
-    "| **IMP**           | Inferred from Mutant Phenotype |\n",
-    "| **IGI**           | Inferred from Genetic Interaction |\n",
-    "| **IEP**           | Inferred from Expression Pattern |\n",
-    "| **TAS**           | Traceable Author Statement |\n",
-    "| **IC**            | Inferred by Curator |\n",
-    "| **IEA**           | Inferred from Electronic Annotation (Computational) |\n",
-    "| **ISS**           | Inferred from Sequence or Structural Similarity |\n",
-    "| **ISA**           | Inferred from Sequence Alignment |\n",
-    "| **ISM**           | Inferred from Sequence Model |\n",
-    "| **ISO**           | Inferred from Sequence Orthology |\n",
-    "| **ISA**           | Inferred from Sequence Alignment |\n",
-    "| **RCA**           | Inferred from Reviewed Computational Analysis |\n",
-    "| **NAS**           | Non-traceable Author Statement |\n",
-    "| **ND**            | No Biological Data Available (placeholder) |\n",
-    "| **NR**            | Not Recorded |\n",
-    "\n",
-    "\n",
-    "### Grouping of Codes:\n",
-    "\n",
-    "- **Experimental Evidence Codes**: \n",
-    "  - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n",
-    "  \n",
-    "- **Author/Curator Inferred Codes**:\n",
-    "  - **TAS**, **IC**, **NAS**\n",
-    "\n",
-    "- **Computational Evidence Codes**:\n",
-    "  - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n",
-    "\n",
-    "- **Others**:\n",
-    "  - **ND** (No Data), **NR** (Not Recorded)\n",
-    "\n",
-    "\n",
-    "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation."
+    "\n"
    ],
    "id": "87c841de7d80beef"
   },
@@ -838,6 +814,12 @@
    ],
    "execution_count": 16
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "---",
+   "id": "e6b1f184a5091b83"
+  },
   {
    "metadata": {},
    "cell_type": "markdown",
@@ -893,6 +875,67 @@
     "_Note_:  Refer for amino acid sequence:  https://en.wikipedia.org/wiki/Protein_primary_structure"
    ],
    "id": "481b8c0271ec9636"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "---",
+   "id": "db6d7f2cc446e6f9"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## More on GO Evidence Codes\n",
+    "\n",
+    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n",
+    "\n",
+    "| **Evidence Code** | **Description** |\n",
+    "|-------------------|-----------------|\n",
+    "| **EXP**           | Inferred from Experiment |\n",
+    "| **IDA**           | Inferred from Direct Assay |\n",
+    "| **IPI**           | Inferred from Physical Interaction |\n",
+    "| **IMP**           | Inferred from Mutant Phenotype |\n",
+    "| **IGI**           | Inferred from Genetic Interaction |\n",
+    "| **IEP**           | Inferred from Expression Pattern |\n",
+    "| **TAS**           | Traceable Author Statement |\n",
+    "| **IC**            | Inferred by Curator |\n",
+    "| **IEA**           | Inferred from Electronic Annotation (Computational) |\n",
+    "| **ISS**           | Inferred from Sequence or Structural Similarity |\n",
+    "| **ISA**           | Inferred from Sequence Alignment |\n",
+    "| **ISM**           | Inferred from Sequence Model |\n",
+    "| **ISO**           | Inferred from Sequence Orthology |\n",
+    "| **IGC**           | Inferred from Genomic Context |\n",
+    "| **RCA**           | Inferred from Reviewed Computational Analysis |\n",
+    "| **NAS**           | Non-traceable Author Statement |\n",
+    "| **ND**            | No Biological Data Available (placeholder) |\n",
+    "| **NR**            | Not Recorded |\n",
+    "\n",
+    "\n",
+    "### Grouping of Codes:\n",
+    "\n",
+    "- **Experimental Evidence Codes**: \n",
+    "  - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n",
+    "  \n",
+    "- **Author/Curator Inferred Codes**:\n",
+    "  - **TAS**, **IC**, **NAS**\n",
+    "\n",
+    "- **Computational Evidence Codes**:\n",
+    "  - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **IGC**, **RCA**\n",
+    "\n",
+    "- **Others**:\n",
+    "  - **ND** (No Data), **NR** (Not Recorded)\n",
+    "\n",
+    "\n",
+    "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation."
+   ],
+   "id": "7f42b928364e5cd1"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "---",
+   "id": "1c11d6f520b02434"
   }
  ],
  "metadata": {

From 1a32757addfd29185d504dd6d56d4ae869b3e1dc Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 1 Oct 2024 11:03:26 +0200
Subject: [PATCH 055/112] Separate tokens.txt files for each n-gram

---
 chebai/preprocessing/datasets/go_uniprot.py | 10 ----------
 chebai/preprocessing/reader.py              |  6 ++++--
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index 574ecdbd..c59b3d4a 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -563,16 +563,6 @@ def base_dir(self) -> str:
         """
         return os.path.join("data", f"GO_UniProt")
 
-    @property
-    def identifier(self) -> tuple:
-        """Identifier for the dataset."""
-        # overriding identifier instead of reader.name to keep same tokens.txt file, but different processed_dir folder
-        if not isinstance(self.reader, dr.ProteinDataReader):
-            raise ValueError("Need Protein DataReader for identifier")
-        if self.reader.n_gram is not None:
-            return (f"{self.reader.name()}_{self.reader.n_gram}_gram",)
-        return (self.reader.name(),)
-
     @property
     def raw_file_names_dict(self) -> dict:
         """
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
index 46cd558a..e220e1e4 100644
--- a/chebai/preprocessing/reader.py
+++ b/chebai/preprocessing/reader.py
@@ -372,14 +372,16 @@ class ProteinDataReader(DataReader):
         "V",
     ]
 
-    @classmethod
-    def name(cls) -> str:
+    def name(self) -> str:
         """
         Returns the name of the data reader. This method identifies the specific type of data reader.
 
         Returns:
             str: The name of the data reader, which is "protein_token".
         """
+        if self.n_gram is not None:
+            return f"protein_token_{self.n_gram}_gram"
+
         return "protein_token"
 
     def __init__(self, *args, n_gram: Optional[int] = None, **kwargs):

From 33a5e64a1a904b00eec1df3f1bce93f499e4fa2c Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Tue, 1 Oct 2024 14:43:21 +0200
Subject: [PATCH 056/112] move commands to the top, restructure section 2

---
 tutorials/data_exploration_chebi.ipynb | 162 +++++++++++--------------
 1 file changed, 69 insertions(+), 93 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 6ddd3238..6a7e25ed 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -1,30 +1,58 @@
 {
  "cells": [
   {
-   "cell_type": "markdown",
-   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b",
    "metadata": {},
+   "cell_type": "markdown",
    "source": [
     "# Introduction\n",
     "\n",
-    "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n",
+    "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the ChEBI dataset. It covers\n",
+    "- how to instantiate a data class and generate data\n",
+    "- how the data is processed and stored\n",
+    "- and how to work with different molecule encodings.\n",
     "\n",
-    "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n",
+    "The chebai package simplifies the handling of ChEBI datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can, however, provide your own data files, for instance if you want to replicate a specific experiment.\n",
     "\n",
     "---\n"
-   ]
+   ],
+   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b"
   },
   {
-   "cell_type": "markdown",
-   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d",
    "metadata": {},
+   "cell_type": "markdown",
    "source": [
     "# 1. Instantiation of a Data Class\n",
     "\n",
-    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n",
+    "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data."
+   ],
+   "id": "4550d01fc7af5ae4"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": 18,
+   "source": "from chebai.preprocessing.datasets.chebi import ChEBIOver50",
+   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chebi_class = ChEBIOver50(chebi_version=231)"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "\n",
     "### Inheritance Hierarchy\n",
     "\n",
-    "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n",
+    "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L598), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L23). Specifically:\n",
     "\n",
     "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
     "\n",
@@ -33,8 +61,8 @@
     "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
     "\n",
     "\n",
-    "### Explanation\n",
-    "A ChEBI data class can be configured with the following main parameters:\n",
+    "### Input parameters\n",
+    "A ChEBI data class can be configured with a range of parameters, including:\n",
     "\n",
     "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
     "\n",
@@ -45,87 +73,64 @@
     "### Additional Input Parameters\n",
     "\n",
     "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n"
-   ]
+   ],
+   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d"
   },
   {
-   "cell_type": "markdown",
-   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
    "metadata": {},
+   "cell_type": "markdown",
    "source": [
     "# Available ChEBI Data Classes\n",
     "\n",
     "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py):\n",
     "\n",
-    "## `ChEBIOver100`\n",
-    "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
-    "\n",
-    "## `ChEBIOver50`\n",
-    "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n",
+    "There is a range of available dataset classes for ChEBI. Usually, you want to use `ChEBIOver100` or `ChEBIOver50`. The number indicates the threshold for selecting label classes: ChEBI classes which have at least 100 / 50 SMILES-annotated subclasses will be used as labels.\n",
     "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverX`.\n",
+    "Both inherit from `ChEBIOverX`. If you need a different threshold, you can create your own subclass. By default, `ChEBIOverX` uses the SMILES encoding (see Section 5). The other implemented encodings are SELFIES and DeepSMILES, used by the classes `ChEBIOverXSELFIES` and `ChEBIOverXDeepSMILES`, respectively. \n",
+    "They also have subclasses for different thresholds (`ChEBIOver50SELFIES`, `ChEBIOver100SELFIES`, `ChEBIOver100DeepSMILES`).\n",
     "\n",
-    "## `ChEBIOver100DeepSMILES`\n",
-    "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n",
-    "\n",
-    "## `ChEBIOver100SELFIES`\n",
-    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n",
-    "\n",
-    "## `ChEBIOver50SELFIES`\n",
-    "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n",
-    "\n",
-    "## `ChEBIOver50Partial`\n",
-    "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n",
-    "\n",
-    "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n"
-   ]
+    "Finally, `ChEBIOver50Partial` extracts a part of ChEBI based on a given top class, with a threshold of 50 for selecting labels.\n",
+    "This class inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n"
+   ],
+   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a"
   },
   {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22",
+   "cell_type": "markdown",
+   "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from chebai.preprocessing.datasets.chebi import ChEBIOver50"
+    "---"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
    "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
    "source": [
-    "chebi_class = ChEBIOver50(chebi_version=231)"
-   ]
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Now we have a ChEBI data class with all the relevant parameters. Next, we need to generate the actual dataset."
+   ],
+   "id": "ed973fb59df11849"
   },
   {
-   "cell_type": "markdown",
-   "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
    "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
    "source": [
-    "---"
-   ]
+    "chebi_class.prepare_data()\n",
+    "chebi_class.setup()"
+   ],
+   "id": "d0a58e2bd9c0e6d9"
   },
   {
    "cell_type": "markdown",
    "id": "1655d489-25fe-46de-9feb-eeca5d36936f",
    "metadata": {},
    "source": [
-    "# 2. Preparation / Setup Methods\n",
     "\n",
-    "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
     "### Automatic Execution: \n",
-    "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
-    "\n",
+    "These methods are executed automatically when using the training command `chebai fit`. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
     "\n",
     "### Why is Preparation Needed?\n",
     "\n",
@@ -137,46 +142,17 @@
     "The data preprocessing in a data class involves two main methods:\n",
     "\n",
     "1. **`prepare_data` Method**:\n",
-    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels. This step is independent of input encodings and all chemicals are stored as SMILES strings.\n",
     "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
     "\n",
     "2. **`setup` Method**:\n",
     "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
-    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), tokenizing the input according to the specified encoding. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the tokenization.\n",
     "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
     "\n",
     "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "f2df4bd1-cf34-4414-bce4-54379ffac006",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n",
-      "Cross-validation enabled: False\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n",
-      "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n",
-      "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n"
-     ]
-    }
-   ],
-   "source": [
-    "chebi_class.prepare_data()\n",
-    "chebi_class.setup()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf",
@@ -202,7 +178,7 @@
     "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
     "\n",
     "2. **`data.pkl`**\n",
-    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
+    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the ChEBI IDs, chemical representations (SMILES strings), and one column per label with boolean values.\n",
     "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
     "\n",
     "3. **`data.pt`**\n",

From 016134f815c810f989566b94759514588cd09e02 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 1 Oct 2024 20:33:02 +0200
Subject: [PATCH 057/112] Obsolete terms being the parent of valid terms

---
 tests/unit/mock_data/ontology_mock_data.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 40d9674e..0c713334 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -532,12 +532,21 @@ def get_obsolete_nodes_ids() -> Set[int]:
     @staticmethod
     def get_GO_raw_data() -> str:
         """
-        Get raw data in string format for GO ontology.
+        Get raw data in string format for a basic Gene Ontology (GO) structure.
 
-        This data simulates a basic GO ontology in a format typically used for testing.
+        This data simulates a basic GO ontology format typically used for testing purposes.
+        The data will include valid and obsolete GO terms with various relationships between them.
+
+        Scenarios covered:
+            - Obsolete terms being the parent of valid terms.
+            - Valid terms being the parent of obsolete terms.
+            - Both direct and indirect hierarchical relationships between terms.
+
+        The data is designed to help test the proper handling of obsolete and valid GO terms,
+        ensuring that the ontology parser can correctly manage both cases.
 
         Returns:
-            str: The raw GO data in string format.
+            str: The raw GO data in string format, structured as test input.
         """
         return """
         [Term]
@@ -557,6 +566,7 @@ def get_GO_raw_data() -> str:
         name: GO_2
         namespace: biological_process
         is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds
+        is_a: GO:0000008 ! hydrolase activity, hydrolyzing O-glycosyl compounds
 
         [Term]
         id: GO:0000003
@@ -594,7 +604,6 @@ def get_GO_raw_data() -> str:
         id: GO:0000008
         name: GO_8
         namespace: molecular_function
-        is_a: GO:0000001 ! glucoside transport
         is_obsolete: true
 
         [Typedef]

From 582b528cb950344892d4959acc66b126c950ab6c Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 1 Oct 2024 20:40:17 +0200
Subject: [PATCH 058/112] remove `g.has_node(q["id"])`

- https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142
---
 chebai/preprocessing/datasets/chebi.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 7d53e831..727f9f64 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -260,13 +260,9 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
             g.add_node(n["id"], **n)
 
         # Only take the edges which connects the existing nodes, to avoid internal creation of obsolete nodes
+        # https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142
         g.add_edges_from(
-            [
-                (p, q["id"])
-                for q in elements
-                for p in q["parents"]
-                if g.has_node(p) and g.has_node(q["id"])
-            ]
+            [(p, q["id"]) for q in elements for p in q["parents"] if g.has_node(p)]
         )
 
         print("Compute transitive closure")

From 4b39bbbcee268099ea393ee692c1a8d10b70a630 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 1 Oct 2024 20:54:16 +0200
Subject: [PATCH 059/112] for ngram, truncate sequence to adhere to max no of
 AA

---
 chebai/preprocessing/datasets/go_uniprot.py | 22 ++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index c59b3d4a..fd55d45d 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -80,6 +80,12 @@ def __init__(self, **kwargs):
             self.max_sequence_length >= 1
         ), "Max sequence length should be greater than or equal to 1."
 
+        if self.reader.n_gram is not None:
+            assert self.max_sequence_length >= self.reader.n_gram, (
+                f"max_sequence_length ({self.max_sequence_length}) must be greater than "
+                f"or equal to n_gram ({self.reader.n_gram})."
+            )
+
     @classmethod
     def _get_go_branch(cls, **kwargs) -> str:
         """
@@ -536,7 +542,8 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
 
         This method overrides the dataloader method from the superclass. After fetching the dataset from the
         superclass, it truncates the 'features' of each data instance to a maximum length specified by
-        `self.max_sequence_length`.
+        `self.max_sequence_length`. The truncation is adjusted based on the value of `n_gram` to ensure that
+        the correct number of amino acids is preserved in the truncated sequences.
 
         Args:
             kind (str): The kind of data to load (e.g., 'train', 'val', 'test').
@@ -547,9 +554,18 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
         """
         dataloader = super().dataloader(kind, **kwargs)
 
-        # Truncate the 'features' to max_sequence_length for each instance
+        if self.reader.n_gram is None:
+            # Truncate the 'features' to max_sequence_length for each instance
+            truncate_index = self.max_sequence_length
+        else:
+            # If n_gram is given, adjust the truncation so that max_sequence_length refers to the maximum number of
+            # amino acids in the sequence rather than the number of n-grams. E.g., "ABCDEFGHIJ" forms 8 trigrams;
+            # with a max length of 5, only the first 3 trigrams are kept, as they are formed by the first 5 letters.
+            truncate_index = self.max_sequence_length - (self.reader.n_gram - 1)
+
         for instance in dataloader.dataset:
-            instance["features"] = instance["features"][: self.max_sequence_length]
+            instance["features"] = instance["features"][:truncate_index]
+
         return dataloader
 
     # ------------------------------ Phase: Raw Properties -----------------------------------

From d7e80970bd0db90017141101e8e62a0f6876388a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 1 Oct 2024 20:54:49 +0200
Subject: [PATCH 060/112] 3-gram token.txt

---
 .../bin/protein_token_3_gram/tokens.txt       | 8000 +++++++++++++++++
 1 file changed, 8000 insertions(+)
 create mode 100644 chebai/preprocessing/bin/protein_token_3_gram/tokens.txt

diff --git a/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt b/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt
new file mode 100644
index 00000000..69dca126
--- /dev/null
+++ b/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt
@@ -0,0 +1,8000 @@
+MAT
+ATP
+TPG
+PGA
+GAS
+ASS
+SSA
+SAR
+ARD
+RDE
+DEF
+EFV
+FVY
+VYM
+YMA
+MAK
+AKL
+KLA
+LAE
+AEQ
+EQA
+QAE
+AER
+ERY
+RYE
+YEE
+EEM
+EMV
+MVE
+VEF
+EFM
+FME
+MEK
+EKV
+KVA
+VAK
+AKA
+KAV
+AVD
+VDK
+DKD
+KDE
+DEL
+ELT
+LTV
+TVE
+VEE
+EER
+ERN
+RNL
+NLL
+LLS
+LSV
+SVA
+VAY
+AYK
+YKN
+KNV
+NVI
+VIG
+IGA
+GAR
+ARR
+RRA
+RAS
+ASW
+SWR
+WRI
+RII
+IIS
+ISS
+SSI
+SIE
+IEQ
+EQK
+QKE
+KEE
+EES
+ESR
+SRG
+RGN
+GND
+NDD
+DDH
+DHV
+HVS
+VSL
+SLI
+LIR
+IRD
+RDY
+DYR
+YRS
+RSK
+SKI
+KIE
+IET
+ETE
+TEL
+ELS
+LSD
+SDI
+DIC
+ICD
+CDG
+DGI
+GIL
+ILK
+LKL
+KLL
+LLD
+LDT
+DTI
+TIL
+ILV
+LVP
+VPA
+PAA
+AAA
+AAS
+ASG
+SGD
+GDS
+DSK
+SKV
+KVF
+VFY
+FYL
+YLK
+LKM
+KMK
+MKG
+KGD
+GDY
+DYH
+YHR
+HRY
+RYL
+YLA
+AEF
+EFK
+FKS
+KSG
+SGQ
+GQE
+QER
+ERK
+RKD
+KDA
+DAA
+AAE
+AEH
+EHT
+HTL
+TLT
+LTA
+TAY
+YKA
+KAA
+AAQ
+AQD
+QDI
+DIA
+IAN
+ANS
+NSE
+SEL
+ELA
+LAP
+APT
+PTH
+THP
+HPI
+PIR
+IRL
+RLG
+LGL
+GLA
+LAL
+ALN
+LNF
+NFS
+FSV
+SVF
+FYY
+YYE
+YEI
+EIL
+ILN
+LNS
+NSP
+SPD
+PDR
+DRA
+RAC
+ACN
+CNL
+NLA
+LAK
+AKQ
+KQA
+QAF
+AFD
+FDE
+DEA
+EAI
+AIA
+IAE
+AEL
+ELD
+DTL
+TLG
+LGE
+GEE
+ESY
+SYK
+YKD
+KDS
+DST
+STL
+TLI
+LIM
+IMQ
+MQL
+QLL
+LLR
+LRD
+RDN
+DNL
+NLT
+LTL
+TLW
+LWT
+WTS
+TSD
+SDM
+DMQ
+MQD
+QDD
+DDV
+DVA
+VAD
+ADD
+DDI
+DIK
+IKE
+KEA
+EAA
+AAP
+APA
+AAK
+AKP
+KPA
+PAD
+ADE
+DEQ
+EQQ
+QQS
+MSD
+SDT
+DTV
+EEL
+ELV
+LVQ
+VQR
+QRA
+RAK
+RYD
+YDD
+DDM
+DMA
+MAA
+AAM
+AMK
+MKK
+KKV
+KVT
+VTE
+TEQ
+EQG
+QGQ
+QEL
+LSN
+SNE
+NEE
+NVV
+VVG
+VGA
+RRS
+RSS
+SSW
+WRV
+RVI
+VIS
+QKT
+KTE
+TEG
+EGS
+GSE
+SEK
+EKK
+KKQ
+KQQ
+QQL
+QLA
+AKE
+KEY
+EYR
+YRV
+RVK
+VKV
+KVE
+VEQ
+EQE
+ELN
+LND
+NDI
+ICQ
+CQD
+QDV
+DVL
+VLK
+LDE
+EFL
+FLI
+LIV
+IVK
+VKA
+KAG
+AGA
+GAA
+AES
+ESK
+DYY
+YYR
+YRY
+AEV
+EVA
+VAS
+ASE
+SED
+EDR
+RAA
+AAV
+AVV
+VVE
+VEK
+EKS
+KSQ
+SQK
+QKA
+KAY
+AYQ
+YQE
+QEA
+EAL
+ALD
+LDI
+IAK
+AKD
+KDK
+DKM
+KMQ
+MQP
+QPT
+LNT
+NTP
+TPE
+PEH
+EHA
+HAC
+ACQ
+CQL
+FDD
+DDA
+DAI
+TLN
+LNE
+NED
+EDS
+DSY
+SDV
+DVG
+GAE
+AED
+EDQ
+DQE
+QEQ
+QEG
+EGN
+GNQ
+NQE
+EAG
+AGN
+MAS
+ASA
+SAE
+LSR
+SRE
+REE
+EEN
+ENV
+NVY
+AKT
+KTV
+TVD
+VDS
+DSE
+SEE
+EEG
+EGR
+GRG
+GNE
+DRV
+RVT
+VTL
+LIK
+IKD
+KDY
+YRG
+RGK
+GKI
+LTK
+TKI
+KIC
+LLE
+LET
+ETH
+THL
+HLV
+VPS
+PSS
+SST
+STA
+TAP
+APE
+PES
+FKT
+KTG
+TGA
+AEN
+ENT
+NTM
+TMV
+MVA
+IAL
+ALA
+ACS
+CSL
+SLA
+AIS
+ISE
+TLS
+LSE
+DIS
+EDP
+DPA
+PAE
+AEE
+EEI
+EIR
+IRE
+REA
+EAP
+APK
+PKR
+KRD
+RDS
+DSS
+SSE
+SEG
+EGQ
+LES
+ESH
+SHL
+LLH
+LHD
+HDN
+PKH
+KHD
+HDL
+DLS
+MST
+STR
+TRE
+VDV
+DVE
+SVE
+SKG
+KGN
+EDH
+HVA
+VAI
+AII
+IIK
+IES
+ESE
+LSK
+LNV
+NVL
+VLE
+LEA
+EAH
+AHL
+HLI
+LIP
+IPS
+PSA
+SAS
+ASP
+SPA
+FKA
+RKE
+EST
+TLV
+LVA
+YKS
+KSA
+ASD
+IAT
+ATA
+TAE
+DMT
+MTD
+TDE
+AGD
+GDE
+DEI
+EIK
+EAS
+ASK
+SKP
+KPD
+PDG
+DGA
+MAE
+RED
+EDC
+DCV
+CVF
+VFL
+FLS
+SKL
+EQS
+QSE
+SER
+YDE
+DEM
+MVQ
+VQY
+QYM
+YMK
+MKQ
+KQV
+QVA
+VAA
+AAL
+NTE
+IGS
+GSR
+SRR
+IIT
+ITS
+TSL
+SLE
+LEQ
+KEQ
+QAK
+AKG
+NDK
+DKH
+KHV
+HVE
+VEI
+EII
+IKG
+KGY
+GYR
+YRA
+AKI
+IED
+EDE
+AKY
+KYC
+YCD
+CDD
+LKV
+KVI
+VIK
+KEN
+ENL
+LLP
+LPN
+PNA
+NAS
+AST
+STS
+TSE
+SES
+FYK
+YKK
+KKM
+KME
+MEG
+EGD
+RYY
+YYA
+YAE
+EFT
+FTV
+VDE
+DEK
+EKR
+KRQ
+RQE
+QEV
+ADK
+DKS
+KSL
+LAA
+AAY
+AYT
+YTE
+TEA
+EAT
+ATE
+TEI
+EIS
+ISN
+SNA
+NAD
+ADL
+DLA
+EIM
+IMN
+MND
+NDA
+DAD
+DKA
+KAC
+DDS
+DSI
+SIA
+KLD
+DEV
+EVP
+VPE
+ESS
+SSY
+DTA
+TAD
+DEE
+AAT
+ATL
+LGR
+GRD
+RDQ
+DQY
+QYV
+YVY
+VQF
+QFM
+MEQ
+EQL
+QLV
+LVT
+VTG
+GAT
+TPA
+GSL
+SLR
+LRA
+AAW
+AWR
+RIV
+IVS
+VSS
+SRK
+RKN
+KND
+NDE
+DEH
+EHV
+SLV
+LVK
+VKD
+VES
+LSS
+SSV
+SVC
+VCS
+CSG
+SGI
+LDS
+DSH
+SAG
+RYM
+DER
+RKT
+KTA
+TAA
+EDT
+DTM
+TML
+MLA
+LAY
+IAA
+AAD
+ADM
+MAP
+NSS
+SSD
+SDK
+CNM
+NMA
+AFE
+FEE
+EEA
+MQE
+EQM
+QMD
+MDE
+ATT
+TTL
+SRD
+LVS
+VSG
+SGA
+PAG
+AGE
+GEL
+KNE
+EEH
+VET
+SIC
+ICS
+ILR
+LRL
+RLL
+SAT
+TAS
+TMI
+MIA
+IAY
+VAV
+AVA
+EKA
+CSM
+SMA
+MTM
+TMD
+MDK
+KSE
+VQK
+KAK
+MKA
+AVT
+QGH
+GHE
+HEL
+TER
+RNE
+NEK
+QQM
+QMG
+MGK
+GKE
+YRE
+REK
+EKI
+IEA
+EAE
+ELQ
+LQD
+ICN
+CND
+NDV
+LEL
+ELL
+LDK
+DKY
+KYL
+YLI
+IPN
+NAT
+ATQ
+TQP
+QPE
+DYF
+YFR
+FRY
+YLS
+SEV
+GDN
+DNK
+NKQ
+KQT
+QTT
+TTV
+TVS
+VSN
+SNS
+NSQ
+SQQ
+QQA
+QAY
+EAF
+FEI
+ISK
+SKK
+KKE
+KEM
+EMQ
+SPE
+PEK
+TAF
+SEN
+ENQ
+NQG
+QGD
+DEG
+GDA
+DAG
+GEG
+EGE
+GEN
+LIL
+LNA
+TQA
+SGE
+ENK
+CSD
+ATH
+THA
+HAE
+MTE
+ERE
+REN
+ENN
+NNV
+VYK
+VEA
+EAM
+ASM
+SMD
+MDV
+VEL
+TSI
+NKG
+KGA
+EEK
+EKL
+KLE
+LEM
+EMI
+MIK
+IKT
+KTY
+TYR
+RGQ
+GQV
+QVE
+EKE
+KEL
+ELR
+RDI
+DIL
+LEK
+EKH
+KHL
+IPC
+PCA
+CAT
+ATS
+TSG
+GES
+YYK
+YKM
+EFA
+FAT
+ATG
+TGS
+GSD
+SDR
+DRK
+ENS
+NSL
+LIA
+IAM
+AMN
+NDL
+DLP
+LPP
+PPT
+ACR
+CRL
+RLA
+AAF
+MQA
+EEV
+EVD
+VDP
+DPN
+NAG
+GDG
+DGE
+GEP
+EPK
+PKE
+EQI
+QIQ
+IQD
+VED
+DQD
+DVS
+MDD
+DDR
+DRE
+EDL
+DLV
+LVY
+VYQ
+YQA
+ESM
+SMK
+VAG
+AGM
+GMD
+KGG
+GGE
+GED
+EDK
+DKL
+KLK
+KMI
+MIR
+REY
+YRQ
+RQM
+QMV
+ELK
+KLI
+LIC
+ICC
+CCD
+CDI
+ILD
+LDV
+VLD
+IPA
+AAN
+ANT
+NTG
+TGE
+TGN
+NDR
+AMT
+ELP
+MQG
+EEQ
+EQN
+QNK
+NKE
+ALQ
+DEN
+MGD
+GDR
+REQ
+LLQ
+LQR
+RAR
+ARL
+SAM
+NEP
+EPL
+PLS
+DRN
+KTM
+TMA
+MAD
+ADG
+DGN
+KKL
+KVK
+AYR
+IEK
+ELE
+ETV
+TVC
+VCN
+VLS
+LSL
+SLL
+DKF
+KFL
+IKN
+KNC
+NCN
+NDF
+DFQ
+FQY
+QYE
+YES
+GEK
+KKN
+KNS
+NSV
+SVV
+SEA
+YKE
+SKE
+QMQ
+EIQ
+IQN
+QNA
+NAP
+PEQ
+QAC
+ACL
+CLL
+LLA
+SDQ
+DQQ
+QQD
+QDE
+VLA
+ALL
+KEH
+EHM
+HMQ
+MVD
+VDR
+KAR
+MKN
+NVT
+KTS
+TSA
+SAD
+KKI
+IEM
+MVR
+VRA
+RAY
+EAV
+AVC
+VCQ
+LDN
+DNY
+NYL
+NCS
+CSE
+SET
+ETQ
+TQY
+VAT
+KRA
+RAT
+ATV
+TVV
+AYS
+YSE
+AHE
+HEI
+LNY
+NYS
+YSV
+ACH
+CHL
+HLA
+DDD
+DDG
+DGG
+GNN
+MER
+ERA
+ASL
+LIQ
+IQK
+YED
+EDM
+AFM
+FMK
+MKS
+SAV
+AVE
+EKG
+KGE
+LSC
+SCE
+CEE
+VGG
+GGQ
+GQR
+RVL
+QKS
+KSN
+KGP
+GPE
+PEV
+EVK
+VKE
+LRG
+RGV
+GVC
+VCD
+CDT
+TVL
+VLG
+GLL
+GAG
+DAE
+SRV
+RVF
+TGD
+GDD
+DDK
+DKK
+KKR
+KRI
+IID
+IDS
+DSA
+ARS
+RSA
+SAY
+AMD
+MDI
+EMP
+MPP
+PTN
+TNP
+NPI
+VFH
+FHY
+HYE
+EIA
+PEE
+ISL
+KTT
+TTF
+TFD
+AMA
+DLH
+LHT
+WTA
+ADS
+EGG
+GEA
+EEP
+EPQ
+PQS
+EKT
+ELI
+ATC
+TCM
+CMK
+QGA
+GGR
+GRR
+SAW
+KTD
+TDT
+DTS
+KLQ
+LQL
+QLI
+LRS
+RSI
+ICT
+CTT
+ANA
+ATN
+NPE
+VAC
+ACG
+CGD
+RKQ
+QTI
+TID
+IDN
+DNS
+SQG
+GAY
+FDI
+LNN
+NNP
+PEL
+LAC
+ACT
+CTL
+TLA
+SDS
+EEC
+ECD
+CDA
+AEG
+EGA
+TIE
+IEN
+STV
+DKE
+MAQ
+AQA
+QAM
+KSV
+SVT
+TET
+ETG
+TGV
+GVE
+ARK
+LAR
+ARE
+RER
+ERV
+RVE
+LRE
+REI
+EIC
+ICY
+CYE
+YEV
+EVL
+IPK
+PKA
+KAS
+ASN
+SNP
+DAR
+ARN
+RNT
+NTV
+VVD
+VDD
+DSQ
+SQT
+QTA
+YQD
+QDA
+DAF
+KGK
+GKM
+PDK
+DTQ
+TQG
+AEP
+PQE
+GGD
+DKN
+NEL
+AAC
+ACM
+RVV
+VVS
+AEK
+QMA
+MAR
+EKF
+ASQ
+SQA
+AAG
+KKG
+KGI
+GIV
+IVD
+VDQ
+DQS
+QSQ
+AEA
+SQP
+MPA
+PAS
+ASR
+DSV
+SVY
+VYL
+VEN
+ENM
+NMK
+SSG
+EAK
+NES
+ESQ
+SQV
+VAL
+ALI
+ICE
+CED
+EDI
+ILS
+SVL
+SDH
+DHL
+LIT
+SAQ
+AQT
+QTG
+FAI
+KRK
+EAY
+DAV
+DLE
+ETL
+WTD
+TDL
+TEE
+QQQ
+QSS
+SSQ
+QAP
+AQP
+PTE
+EGK
+GKA
+KAD
+ADQ
+MTR
+VAE
+NEN
+ENH
+NHV
+HVK
+VKK
+KIK
+EYK
+YKC
+KCK
+CKV
+LTD
+TDI
+ILE
+LEV
+GNP
+NPR
+PRK
+SSL
+IAV
+DVH
+VHN
+HNM
+NME
+EKN
+KNQ
+NQD
+QDG
+DGD
+DDQ
+DQN
+QNE
+EPG
+PGM
+AFT
+FTR
+EDY
+DYV
+YVF
+VFM
+FMA
+AQL
+QLN
+ENA
+NAE
+ETM
+TMR
+MRK
+RKI
+KIS
+ISG
+SGM
+GME
+KER
+IGP
+GPR
+PRR
+KEK
+KGR
+GRQ
+RQK
+QKP
+KPN
+NAK
+AKR
+RIE
+QIR
+IRV
+RVY
+VYR
+QKI
+LQE
+EQF
+QFV
+FVP
+VPR
+PRS
+RST
+STN
+TNA
+ADA
+DAK
+AKV
+AEY
+EYS
+YSS
+KIA
+IAG
+AGS
+GSA
+SAL
+NAY
+AYN
+YNS
+NSA
+SAF
+ISQ
+QLP
+ILA
+LAS
+ACE
+CEL
+RKA
+KAF
+FDA
+AAI
+AIT
+ITD
+DLD
+KLT
+LTE
+NLN
+LNL
+NLW
+LWV
+WVT
+VTD
+TDS
+DDN
+DNA
+NEA
+ALS
+VLN
+DNF
+NFL
+NCG
+CGE
+GET
+TQH
+QHE
+HES
+KSY
+SYS
+DDE
+MVS
+VSQ
+QVV
+VVA
+EKP
+KPQ
+PQL
+KKA
+AGC
+GCN
+CNS
+NSH
+SHG
+HGQ
+GQD
+QDS
+SYF
+YFL
+FLG
+LGW
+GWQ
+WQE
+QEY
+EYE
+YEK
+KNP
+NPF
+PFD
+FDP
+DPV
+PVS
+NPS
+PSG
+GII
+IIQ
+IQM
+MGL
+NQL
+QLS
+LSF
+SFD
+FDL
+DLL
+LEE
+EEW
+EWL
+WLE
+NPH
+PHA
+HAL
+ALG
+GLR
+LRR
+RRE
+REG
+GGG
+GGA
+ASV
+VFR
+FRE
+REL
+ALF
+LFQ
+FQD
+QDY
+YHG
+HGL
+GLP
+LPA
+PAF
+AFK
+FKN
+KNA
+NAL
+ARF
+RFM
+FMS
+MSE
+SEQ
+EQR
+QRG
+RGY
+GYK
+YKV
+KVV
+VVF
+VFD
+DPS
+PSN
+SNI
+NIV
+IVL
+VLT
+TAG
+SAN
+ANE
+ALM
+LMF
+MFC
+FCL
+CLA
+LAD
+ADH
+DHG
+HGD
+AFL
+IPT
+PTP
+TPY
+PYY
+YYP
+YPG
+PGF
+GFD
+FDR
+DRD
+RDL
+DLK
+LKW
+KWR
+WRT
+RTG
+AEI
+EIV
+IVP
+VPV
+PVH
+VHC
+HCA
+CAS
+ANG
+NGF
+GFR
+FRV
+VTR
+TRP
+RPA
+PAL
+LDD
+DAY
+YRR
+RAQ
+AQK
+QKR
+KRR
+RRL
+RLR
+LRV
+VKG
+KGV
+GVL
+VLI
+ITN
+NPL
+PLG
+LGT
+GTA
+SPR
+PRA
+RAD
+ETI
+TIV
+VDF
+DFV
+FVA
+GIH
+IHL
+LIS
+ISD
+SDE
+EIY
+IYA
+YAG
+AGT
+AFA
+FAE
+EPP
+PPA
+AGF
+GFV
+FVS
+VSA
+ALE
+EVV
+AGR
+RDG
+GAD
+ADV
+VSD
+RVH
+VHV
+HVV
+VVY
+VYS
+YSL
+SLS
+SKD
+KDL
+DLG
+LPG
+RVG
+GAI
+AIY
+IYS
+YSA
+NAA
+SAA
+ATK
+TKM
+KMS
+MSS
+SSF
+SFG
+FGL
+GLV
+QTQ
+QYL
+YLL
+LLG
+LGD
+RDF
+DFT
+TRS
+RSY
+SYV
+YVA
+NKR
+RRI
+RIK
+ERH
+RHD
+HDQ
+DQL
+LVD
+VDG
+DGL
+EIG
+IGI
+GIG
+IGC
+GCL
+CLP
+LPS
+AGL
+GLF
+LFC
+FCW
+CWV
+WVD
+VDM
+DMS
+MSH
+HLM
+LMR
+MRS
+RSR
+SRS
+RSF
+SFA
+FAG
+GEM
+EME
+MEL
+ELW
+LWK
+WKK
+VFE
+FEV
+EVG
+VGL
+GLN
+LNI
+NIS
+ISP
+SPG
+PGS
+GSS
+SSC
+SCH
+CHC
+HCR
+CRE
+REP
+PGW
+GWF
+WFR
+RVC
+VCF
+CFA
+FAN
+ANM
+NMS
+MSA
+SAK
+KTL
+TLD
+VAM
+AMQ
+MQR
+QRL
+SFV
+FVD
+TGG
+ALR
+AVP
+PVR
+VRS
+RSV
+SVS
+VSC
+SCP
+CPL
+PLA
+LAI
+AIK
+IKW
+KWA
+WAL
+RLT
+LTP
+TPS
+PSI
+IAD
+ADR
+KAE
+MAY
+YQG
+QGI
+GID
+IDL
+LST
+STK
+TKA
+HGE
+YFD
+FDG
+DGW
+GWK
+WKA
+AYD
+YDT
+DTN
+DLR
+LRH
+RHN
+HNR
+NRG
+RGG
+GGV
+GVI
+VIQ
+SLD
+LDL
+DLI
+LIE
+IEE
+EWS
+WSK
+SKN
+KNH
+NHP
+HPE
+PEA
+ASI
+CTP
+PEG
+EGV
+GVS
+SQF
+QFK
+FKR
+RIA
+ANF
+NFQ
+LPE
+PEF
+EFR
+FRK
+KAM
+AQF
+FMG
+MGQ
+QVR
+VRG
+GGK
+KAT
+ATF
+DPD
+VVM
+VMS
+MSG
+SGG
+GAQ
+AQE
+QET
+LAF
+AFC
+LAN
+ANP
+NPG
+PGE
+FLV
+VPT
+YPA
+RDC
+DCC
+CCW
+CWR
+WRS
+RSG
+GIK
+IKL
+LPI
+PIE
+IEC
+ECH
+CHS
+HSF
+SFN
+FND
+DFR
+FRL
+TKE
+ALV
+YDG
+RRQ
+RQG
+GIS
+ISV
+SVK
+ILI
+GTI
+TIT
+TDR
+RDT
+LAM
+AML
+LAT
+TFA
+TEH
+EHR
+HRV
+VHL
+LVC
+CDE
+GSV
+VFA
+PEY
+EYV
+YVS
+VSI
+EVI
+VIE
+IER
+ERD
+RDV
+DVP
+VPW
+PWC
+WCN
+CNR
+NRD
+LIH
+IHV
+KDF
+DFG
+VGI
+IIY
+YSY
+SYN
+YND
+AAR
+RRM
+RMS
+QYF
+FLA
+ARM
+RML
+MLS
+EEF
+EFI
+FIG
+IGR
+GRF
+RFL
+FLQ
+QES
+SKC
+KCR
+RLV
+VAR
+ARH
+RHE
+HER
+ERF
+RFT
+FTS
+SGL
+REV
+CLR
+GNA
+LFS
+FSW
+SWM
+WMD
+MDL
+MLR
+LWR
+VIV
+IVH
+VHQ
+HQV
+QVK
+VKL
+KLN
+NVS
+VSP
+PGT
+GTS
+TSF
+SFH
+FHC
+VCH
+CHA
+HAN
+NMD
+DET
+TME
+MEV
+GRI
+RIH
+IHD
+HDF
+FVR
+VRQ
+RQH
+QHQ
+HQQ
+QQR
+QRR
+RRV
+ERW
+RWA
+WAA
+ANR
+NRQ
+RQL
+QLR
+RLS
+SLP
+LPH
+PHH
+HHH
+HHL
+HLS
+LSP
+PAH
+SSP
+SPL
+SPQ
+QSP
+SPM
+PMV
+KQL
+TKV
+VTS
+TSN
+SNG
+NGH
+GHG
+GWE
+WEE
+EEY
+NPY
+PYD
+NPN
+PNG
+NGM
+GMI
+MIQ
+QLC
+LCF
+CFD
+ESW
+SWL
+WLT
+TKN
+NPD
+PDA
+SLK
+LKR
+KRN
+RNG
+NGQ
+GQS
+QSI
+SIF
+IFR
+HGM
+GMP
+MPE
+FKK
+MEE
+IRG
+GNR
+NRV
+VTF
+DPK
+PKK
+KIV
+GST
+NET
+TLM
+PGD
+FLL
+LPT
+VPI
+PIH
+IHC
+HCS
+CSS
+SSS
+SSN
+GFQ
+FQI
+QIT
+ITE
+TES
+ESA
+LQQ
+YQQ
+QAQ
+QKL
+VLV
+VTN
+TAL
+ALT
+LTR
+TRR
+LLV
+DFI
+FIT
+TSK
+KNI
+NIH
+YSG
+SGT
+GTM
+TMF
+MFG
+FGF
+GFE
+FEQ
+QFI
+FIS
+SVM
+VMD
+LKD
+LED
+DTE
+TEV
+EVS
+VSK
+SKR
+KRV
+YSN
+SND
+MIV
+LSA
+KKF
+KFT
+TSQ
+SQY
+YLE
+NQK
+KRL
+RLK
+LKS
+KSR
+SRQ
+RQR
+GLE
+AGI
+GIT
+ITC
+TCL
+RSN
+DMR
+MRH
+RHL
+HLL
+TNT
+NTF
+TFE
+FEA
+DLW
+IVY
+VYN
+YNV
+NVK
+HCT
+CTE
+TEP
+ALK
+LKT
+KTF
+TFV
+FVE
+STD
+TDC
+DCG
+CGR
+GRM
+RMI
+MIS
+ISR
+SSH
+SHE
+ERL
+LRK
+RKK
+KKT
+SNW
+NWV
+WVF
+RVS
+VSW
+SWT
+RVP
+VPD
+PDE
+VAF
+TEK
+KQD
+QDL
+DLN
+IAS
+DGH
+AYE
+ENP
+PFH
+FHP
+PID
+IDR
+DRP
+RPD
+DGV
+LCG
+GDL
+DLM
+RKW
+KWV
+WVL
+LKH
+KHP
+CTS
+GVN
+VNQ
+NQF
+QFS
+FSD
+IAI
+AIF
+IFQ
+FRQ
+RQA
+QAV
+AKF
+KFM
+KTR
+TRN
+RNN
+NNK
+NKV
+VKF
+KFD
+DRI
+IVM
+GAH
+HET
+TVA
+DGF
+GFL
+LRW
+RWR
+VNL
+NLV
+PVT
+VTC
+TCH
+HSS
+GFK
+FKI
+KIT
+ITV
+YEN
+NAR
+RKS
+NIP
+IPV
+PVK
+KGL
+GTT
+LDR
+REC
+ECL
+CLK
+LVN
+VNF
+NFT
+FTN
+TND
+DKG
+YAA
+TFG
+FGQ
+SEF
+EIE
+DCN
+IHI
+HIV
+KDM
+DMG
+PGL
+VVQ
+VQI
+QIA
+IAR
+RKM
+QHL
+AKM
+KML
+FIR
+RES
+KLR
+RHA
+EIT
+ITT
+TTG
+TGL
+GLD
+LDG
+GLG
+LGI
+IGW
+GWL
+WLK
+LKA
+LFL
+FLW
+LWM
+LRN
+LLK
+TAT
+FDS
+PGG
+GGS
+GSF
+HCH
+CHE
+HEP
+MDH
+DHK
+HKT
+MET
+ETA
+LER
+ERI
+RIR
+VFT
+SQL
+QLE
+EEE
+EET
+ETK
+TKP
+KPM
+PMA
+TTM
+TMM
+MMA
+AKK
+KKK
+KKC
+KCW
+CWQ
+WQS
+QSN
+SNL
+NLR
+SFS
+DTR
+RRF
+RFD
+GFF
+FFS
+FSP
+SPH
+PHS
+HSP
+SPV
+PVP
+VPP
+PPS
+PSP
+PLV
+LVR
+RKV
+NAH
+AHG
+NGI
+ETW
+TWL
+WLA
+AKN
+GLK
+LKK
+KKD
+KDG
+DGQ
+IFK
+FKE
+KAL
+PSK
+MLT
+GTV
+TVF
+VFG
+VSV
+KNL
+NLE
+LEN
+VHI
+MVV
+TST
+STY
+TYL
+YLD
+LKI
+KIR
+IRQ
+QKK
+KLV
+VYD
+YDV
+DVK
+MKR
+LKE
+YVE
+DSR
+SKS
+KSS
+SHD
+HDR
+IKS
+RKR
+KRT
+RTV
+MHG
+HGS
+GSG
+SGH
+GHS
+HSL
+SLT
+LTG
+GAP
+APH
+PHQ
+HQI
+QIP
+IPP
+PPP
+PPR
+PRT
+RTQ
+GQQ
+TAN
+ANQ
+DKI
+KID
+IDP
+DPF
+FHN
+HNK
+KRG
+RGT
+TSR
+LRI
+RIN
+INN
+NNS
+SSR
+SRY
+RYN
+NVD
+VQL
+KDT
+NEQ
+EQP
+QPA
+LVI
+VQC
+QCQ
+CQH
+QHV
+HVF
+FDF
+DFY
+FYD
+YDP
+PVA
+VAQ
+QLK
+LKC
+CKE
+KEI
+IKR
+LID
+IDH
+DHI
+HIT
+TKG
+AIV
+IVE
+TIY
+IYP
+PAV
+AVI
+IKM
+KMV
+NIF
+VLP
+PSE
+ENC
+NCE
+CEF
+EFD
+DPE
+EED
+DEP
+EPT
+PTL
+TLE
+SWP
+WPH
+PHL
+HLQ
+VYE
+YEL
+ELF
+FLR
+LRF
+FLE
+ESP
+PDF
+FQA
+QAS
+SIG
+IGK
+GKK
+KKY
+KYI
+YID
+IDQ
+DQR
+QRF
+RFV
+FVL
+DLF
+LFD
+DPR
+PRE
+DFL
+FLK
+VLH
+LHR
+HRI
+RIY
+IYG
+YGK
+GKF
+RAF
+AFI
+IRK
+RKH
+KHI
+HIN
+NNM
+NMF
+MFL
+YET
+ETD
+DSF
+FNG
+NGV
+GVG
+VGE
+LEI
+ILG
+LGS
+GSI
+SII
+IIN
+ING
+GFA
+FAL
+ALP
+LPL
+PLK
+LKQ
+KQE
+QEH
+EHK
+HKV
+KVL
+VLL
+PLH
+LHK
+HKP
+KPK
+PKC
+KCL
+CLS
+SLY
+LYH
+YHA
+HAQ
+AYC
+YCV
+CVV
+FIE
+EKD
+TPQ
+PQV
+QVF
+LKF
+KFW
+FWP
+WPR
+RTC
+TCS
+SSK
+KEV
+EVM
+VMF
+GEV
+EVE
+DII
+IIE
+IEP
+EPE
+KII
+DPL
+PLF
+LFR
+AKC
+KCV
+CVS
+PHF
+HFQ
+FQV
+RAL
+ALY
+LYF
+YFW
+FWN
+WNN
+NNE
+NEY
+EYI
+YIL
+TSS
+LVM
+VMP
+MPI
+PIM
+IMF
+MFP
+FPA
+LYR
+YRI
+RIS
+EHW
+HWN
+WNQ
+NQT
+IVA
+TFM
+MEM
+EMN
+MNG
+NGK
+GKL
+KLF
+LTS
+TYK
+YKG
+GER
+EKQ
+KQR
+QRE
+KDR
+RDA
+AFW
+FWK
+MEA
+LNP
+NPP
+EVT
+VTP
+PSL
+SLF
+LFP
+FPE
+TDY
+DYL
+DGP
+GPN
+PNM
+NMT
+MTP
+TPL
+PLP
+LPV
+AGG
+GDK
+KSP
+SPS
+PSV
+VVK
+KKS
+STG
+ETT
+TTT
+TTP
+PAK
+TKL
+KLP
+STP
+TPT
+PTS
+TSP
+GLS
+PPD
+DKV
+KVD
+GFS
+FSR
+RSL
+ARP
+RPR
+RSH
+SHS
+QFR
+RYQ
+YQS
+SNQ
+NQQ
+QQE
+PLL
+KDV
+ELH
+LHE
+RKL
+LAQ
+AQC
+QCG
+CGV
+GVM
+MFD
+FLD
+LDC
+CVA
+LKG
+VKR
+LVE
+VEC
+ECV
+CVG
+VGS
+TRG
+EPV
+PVY
+VYP
+YPD
+PDI
+IIR
+IRM
+SVN
+VNI
+FRT
+RTL
+TLP
+EPN
+PNL
+LEP
+EPS
+PSW
+YEF
+EFF
+FFL
+FQP
+QPS
+KRY
+RYV
+YVD
+DQK
+QKF
+KFV
+VLM
+LML
+MLL
+EYL
+KTI
+ILH
+VYG
+AYI
+YIR
+KQC
+QCN
+CNH
+NHI
+HIF
+IFL
+RFI
+FIY
+IYE
+LEH
+EHF
+HFN
+GVA
+HKQ
+KQF
+QFL
+VRV
+IPL
+LHS
+HSV
+VKS
+FHA
+DAT
+HVI
+VIR
+RGL
+LKY
+KYW
+YWP
+WPK
+PKT
+KTC
+TCT
+CTQ
+TQK
+DVI
+PSQ
+FVK
+VKI
+KIQ
+IQE
+QEP
+LFK
+FKQ
+ARC
+RCV
+EDN
+DNC
+NCH
+CHT
+HTV
+AVF
+FGT
+GTL
+TLY
+LYQ
+YQV
+QVS
+LIY
+IYN
+ASY
+YKL
+QQK
+KAQ
+ERQ
+WRG
+RLQ
+LQG
+QGT
+GTQ
+GAK
+APV
+PRP
+RPT
+MPY
+PYK
+KEP
+PPK
+PKV
+KCT
+CTA
+TAK
+KPS
+SGK
+GKD
+EAQ
+QPQ
+PQP
+PQA
+AQS
+QPP
+SNK
+KRP
+RPS
+NST
+TPP
+PTQ
+TQL
+IKY
+KYS
+GGP
+GPQ
+PQI
+QIV
+ERR
+RQS
+SRF
+RFN
+FNL
+NLS
+KNR
+NRE
+LQK
+DSP
+SPT
+TQE
+LFI
+FIQ
+LRQ
+RQC
+QCC
+CCV
+CVL
+VLF
+SDP
+SDL
+KFK
+RAG
+NEM
+VEY
+YIT
+ITH
+THS
+HSR
+DVV
+VVT
+YPE
+VTM
+MFS
+NLF
+NPT
+PTG
+AWP
+QPN
+PNI
+NIA
+IRR
+RQI
+QIN
+INH
+IFY
+FYR
+YRF
+EHH
+HHN
+HNG
+GIA
+HKM
+KMF
+VYH
+YHP
+HPQ
+KES
+PVI
+IVG
+KTH
+SPK
+FLN
+EFS
+FSK
+KVM
+VME
+MEP
+LYY
+YYW
+YWN
+YIM
+IMS
+MSL
+SDN
+ARV
+YRN
+RNS
+NSK
+KSH
+SHW
+WNK
+NKT
+TIH
+IHG
+GLI
+YNA
+LFM
+MNQ
+DDC
+DCT
+TQQ
+QQY
+QYK
+KQK
+QKG
+RFR
+FRM
+RMK
+MKE
+EMW
+MWQ
+WQK
+RLN
+NPQ
+PQY
+QYP
+YPM
+PMF
+MFR
+FRA
+RAP
+APP
+PPL
+PPV
+YSM
+SME
+ETP
+PTA
+DIQ
+IQL
+AVQ
+VQM
+QML
+MLK
+KDI
+IKK
+RRK
+LPQ
+PQD
+DVY
+VYT
+YTI
+TIK
+IKA
+AHK
+HKR
+RAE
+FLT
+SQE
+MMR
+MRG
+RGF
+RLI
+STT
+TTS
+KKP
+HGT
+TTH
+GSK
+KST
+TTE
+GKQ
+KQS
+QSG
+SGS
+SVP
+QGK
+GKH
+KHH
+HHS
+SKT
+KTK
+TKT
+VSR
+TKK
+RKG
+KGQ
+QSK
+SKQ
+QQP
+SQS
+QKQ
+KQG
+QGS
+AIM
+MNP
+TPV
+PVL
+TVT
+VTK
+TKD
+KDD
+DHA
+HAH
+AHP
+HPT
+TLL
+LGA
+GAV
+AVS
+SPI
+PIS
+TAV
+ENG
+NGN
+GNS
+NSN
+SNN
+NNN
+NMN
+MNI
+NIN
+INT
+NTS
+SNT
+NTQ
+TQD
+DAN
+ANH
+NHA
+HAS
+SID
+IDI
+DIP
+IPR
+SFE
+FER
+RLP
+PTK
+PDT
+DTD
+KTP
+PQR
+QRH
+RHS
+RFE
+FEP
+PSR
+RYT
+YTP
+PLT
+PNF
+NFN
+FNE
+NEV
+RIP
+FIA
+DQC
+CNT
+DFN
+NDP
+PSF
+IQG
+KRS
+IEF
+TNR
+NRF
+FTY
+TYT
+YTN
+TNE
+EMY
+MYA
+YAH
+AHV
+VVN
+VNM
+MFK
+KIN
+INL
+FRP
+RPI
+PIP
+PVN
+VNP
+NPV
+PVG
+VGD
+GDI
+DIY
+IYD
+DED
+VNE
+LAW
+PHM
+AVY
+FNH
+NHQ
+KQY
+QYI
+QDF
+FIL
+DIR
+DCL
+TLH
+SFI
+RSM
+SMN
+MNN
+NNI
+LQF
+KFN
+VRI
+RIL
+KVR
+VRC
+RCL
+YCI
+CIV
+IVQ
+KDP
+LLT
+VMG
+LRY
+RYW
+PKI
+INS
+NEI
+DIF
+IFE
+PLE
+LEF
+FIK
+IKV
+VEV
+VPL
+LFV
+FVQ
+KCI
+CIS
+LSY
+SYW
+EYF
+NLC
+LCI
+CIE
+VIL
+ILP
+PII
+IIF
+IFP
+LYE
+NGE
+SIS
+DPY
+PYM
+YML
+MLV
+QAI
+AIN
+NSG
+GSW
+SWN
+WNR
+NRA
+RAI
+AIH
+IHA
+HAM
+MAF
+KIF
+ETN
+VLY
+CNA
+LYL
+KET
+QRK
+KVQ
+ENW
+NWS
+YVK
+VKN
+NND
+KDQ
+QYT
+NSF
+FNT
+NTA
+NNT
+NTL
+ENE
+END
+NDC
+DCD
+CDS
+SEI
+IKQ
+KQI
+QIF
+IFG
+FGK
+LPR
+RKP
+SHN
+HND
+NDS
+DSN
+VNS
+NSY
+SYY
+YYI
+YIP
+PNS
+NGA
+GAN
+NGT
+TVI
+VIA
+IAP
+APS
+SNR
+NRT
+RTN
+TNQ
+NQV
+QVN
+VNG
+GVY
+YEA
+SFR
+FRD
+KLS
+LSM
+SMC
+MCC
+RQT
+QTL
+VDY
+DYI
+YIA
+VST
+SDA
+QEI
+RTF
+TFP
+FPS
+NHE
+KIL
+DVD
+EPA
+PAW
+LQV
+LLL
+PMT
+TDA
+RYI
+DHS
+FMV
+MVH
+VHR
+HRP
+RPF
+PFI
+KAI
+FIF
+FET
+KHN
+HKL
+IRA
+RPK
+KCA
+AYH
+YHQ
+SYC
+DFK
+FKL
+ADT
+WPV
+TNS
+QAA
+EFQ
+FQR
+QRC
+RCM
+CMV
+MVP
+CLN
+SHF
+LWN
+NDH
+HIR
+IRN
+NLI
+ITQ
+TQN
+QNH
+NHK
+VIM
+IMP
+PIV
+IVF
+VFP
+PAM
+AME
+NTR
+RGH
+GHW
+NQA
+VQS
+QSL
+NVR
+VRK
+VMA
+AET
+TDQ
+DQI
+QIL
+ILF
+DEC
+KFQ
+FQE
+QED
+EAN
+KRE
+ATW
+TWK
+WKL
+AVL
+PRF
+RFS
+FSS
+TGK
+GKT
+LTC
+TCN
+CNK
+NKA
+SRM
+RMV
+VDA
+NGP
+GPF
+PFQ
+QPV
+PVV
+VVL
+LHI
+QEK
+KWK
+WKE
+SEM
+THN
+NRN
+RNV
+VIT
+EPI
+PIY
+VVH
+VHM
+HMF
+MFA
+FAV
+AVN
+VLQ
+HKI
+MAL
+KIM
+IME
+THW
+QQF
+EAW
+AWV
+WVK
+KAN
+YTV
+TVY
+YSQ
+STM
+TMS
+MSI
+SIP
+TDG
+GPL
+LFE
+FED
+EDV
+DVQ
+TVK
+AHQ
+HQA
+QKD
+RPL
+QDP
+DPH
+PHT
+HTK
+AHC
+CRA
+SQD
+DGR
+MSV
+ATD
+TDD
+DAL
+LYP
+YPI
+PIA
+IDE
+DVT
+TLR
+NSI
+SIR
+STI
+TIA
+LGV
+VER
+ERT
+RTR
+IQF
+LVL
+QLG
+LGN
+GNF
+FTP
+LVG
+GPD
+PDH
+HVH
+HCL
+VVR
+VRD
+RDK
+ESL
+KHS
+HFV
+VPM
+PML
+GDW
+DWF
+WFT
+SRT
+RTS
+SAC
+CGL
+YPR
+PRV
+PAI
+KSM
+SMF
+TLC
+LCR
+CRD
+RDD
+DDT
+DTP
+TPM
+VRR
+KLG
+GEF
+FAK
+FEK
+IEG
+EGL
+GLH
+LHV
+HVD
+EQD
+SVR
+VRL
+SAI
+IAF
+AFG
+ANK
+NKK
+PIL
+IEL
+KSW
+RVR
+VRY
+YMV
+IEI
+QNV
+DMD
+MDT
+DTT
+NMY
+MYT
+TNL
+EVR
+RCA
+CAA
+TQR
+QEF
+NLP
+PED
+DKR
+RQN
+QNI
+NII
+IIC
+LLN
+NVA
+LAG
+AGV
+IMG
+APL
+PLI
+LIG
+EQT
+QTV
+VSE
+IYM
+YMQ
+NDQ
+DQT
+QTP
+KVN
+EDG
+DGK
+GKW
+FMP
+MPL
+LGQ
+FFD
+PLC
+LCL
+LNW
+NWL
+TDH
+VFS
+FSI
+IMK
+LTQ
+KFG
+FGG
+GQW
+QWA
+WAS
+TNI
+VPK
+PKM
+MQK
+TNY
+YLQ
+QRM
+RMT
+MTC
+CLF
+MTQ
+EDD
+VPN
+PNV
+VRF
+FNA
+AKS
+RIG
+GKN
+PST
+VKP
+KPL
+LGK
+DSD
+SDF
+DFD
+FDV
+DVR
+RYF
+YFS
+FSE
+SLG
+SVD
+DSL
+LKN
+SIK
+RSE
+IPF
+PFL
+FAM
+AMY
+MYL
+LRT
+EHS
+HSA
+EIH
+VVP
+TLQ
+VCY
+CYP
+VTQ
+RAN
+NFR
+KLC
+LCQ
+NKL
+TEY
+KSD
+NFV
+LAV
+EAC
+ACV
+IAQ
+VEH
+EHL
+QCA
+VDL
+DLQ
+AVG
+VGP
+PEI
+ITR
+TRV
+RVD
+AFQ
+DFC
+FCA
+CAN
+ANL
+NLD
+QVQ
+QII
+IIL
+SIL
+LPY
+PYV
+YVR
+PNP
+PHV
+SVI
+MLG
+YQT
+ECP
+CPE
+CVN
+VND
+GIQ
+IQQ
+LSQ
+SKW
+IEY
+EYM
+YMP
+AGQ
+GQL
+FDQ
+GLC
+LCM
+CMG
+MGW
+WLN
+HVY
+VYA
+YAI
+AIR
+LNM
+QFG
+FGA
+APW
+PWA
+WAE
+IIP
+IPM
+PMI
+MIL
+MSR
+SRN
+RNK
+NKN
+KNY
+YLH
+HRM
+EVC
+VCG
+CGT
+GTD
+DIT
+TTK
+PTV
+ADP
+VAN
+ANV
+FNV
+SPF
+VID
+IDA
+DAQ
+AQV
+KPT
+NTD
+TDV
+VKH
+KHF
+HFA
+FAA
+LPF
+GTF
+TFT
+FTT
+YVH
+ISH
+HEH
+PSD
+AHF
+AVK
+RQY
+FRN
+LCS
+SDD
+DNV
+FSN
+MPT
+FTE
+ITK
+FQN
+QNL
+NLM
+LMK
+MKD
+KDC
+DCE
+CEA
+ASH
+SHK
+KEF
+EFC
+FCE
+CEN
+ADC
+DCR
+MSQ
+SQI
+LPC
+PCI
+CIK
+NQH
+KDN
+DNT
+NTI
+IEH
+GIR
+EDA
+AKW
+SLC
+CMA
+MAW
+AWL
+WLV
+VDH
+NLK
+KEW
+EWA
+WAH
+AHA
+HAT
+ATI
+TII
+AMS
+GDP
+PNY
+MTT
+TLF
+FCI
+CIN
+INV
+CGQ
+TKH
+KHM
+HML
+MLP
+VLR
+LRM
+RMA
+MAG
+SLQ
+KIG
+GPI
+LQS
+KPI
+QDQ
+VKY
+KYF
+YFA
+FAQ
+TTA
+YPL
+LLM
+LMD
+HDD
+LGP
+PER
+EVF
+VPY
+PYI
+YIG
+IGG
+QYA
+YAT
+ILL
+VRE
+SLN
+QLF
+ADW
+WFS
+KVS
+IVR
+NIL
+MVK
+RAV
+VGK
+NLG
+EDW
+DWD
+WDY
+YIS
+FQK
+IND
+NDN
+DNQ
+VDC
+CLI
+ISI
+KFF
+FFN
+DES
+SHT
+HTQ
+IGD
+DRF
+VQP
+QPF
+LCE
+DNE
+NEG
+GDV
+SGF
+LNK
+NKI
+VQN
+TVR
+NKD
+DQV
+QVI
+VIN
+NNF
+FLP
+NML
+EFP
+FPD
+PDV
+IIA
+GIE
+DVN
+VNW
+NWR
+VRM
+MAI
+IPI
+LGM
+GMQ
+MQF
+QFF
+DLC
+LSW
+WLW
+LWD
+WDT
+YSI
+VNN
+NNL
+EIF
+FGS
+SDW
+DWC
+WCR
+SRL
+ENF
+FTI
+LTT
+GVP
+NIR
+IRF
+SYA
+YAV
+KYD
+YDA
+KNT
+LQT
+AEC
+ECQ
+CQE
+MVM
+SQN
+QNQ
+NQP
+AND
+FDM
+EGP
+ETF
+PVD
+INW
+NWK
+WKF
+FNQ
+GNI
+NID
+VHT
+HTE
+EAD
+ISC
+SCV
+CVE
+FSH
+HDG
+GEY
+GRV
+VVI
+VIF
+QRD
+GKY
+KYV
+GVR
+EYN
+YST
+STF
+TFQ
+FQS
+QSH
+FDY
+EID
+INQ
+NQI
+IRW
+RWL
+NFI
+DKT
+KLW
+WKI
+DAW
+AWN
+WNL
+NRI
+FRG
+RGR
+GRL
+LQI
+SIV
+PME
+YGN
+AHT
+HTY
+TYH
+YHV
+HVN
+NSD
+TFL
+DDL
+RVN
+ESF
+FNI
+VDI
+IKP
+PAN
+ITA
+EFH
+TQC
+CNW
+NWF
+WFV
+KGS
+RLC
+LCD
+CDM
+MRD
+RDR
+ALC
+AYA
+YAK
+DPQ
+QSR
+SFF
+KFS
+NGR
+GRY
+TRD
+YLT
+KVW
+VWD
+WDL
+MES
+PVE
+ETY
+TYP
+YPV
+HNY
+YLR
+RTK
+LCA
+CAL
+IFD
+FDK
+KFE
+FEC
+CDW
+DWS
+WSG
+HIL
+ILT
+GSY
+SYH
+YHN
+HNL
+FRS
+YAR
+ARG
+NNQ
+KTW
+TWE
+WEA
+EAR
+RPQ
+EPH
+HSQ
+FVV
+QLQ
+QFD
+HTA
+TAW
+AWH
+WHP
+HPK
+PKD
+DNI
+TNN
+NLY
+LYI
+YIF
+IFS
+MGR
+GRW
+RWG
+WGR
+PDP
+PQM
+MQT
+FMR
+MRQ
+SIT
+IGN
+GNM
+MLN
+TAI
+INI
+SWC
+WCF
+CFS
+FSQ
+QIK
+GAL
+ADI
+EFN
+NHD
+RDP
+SKA
+RRG
+RGE
+INK
+WLQ
+QKN
+VHF
+HFL
+WKV
+KSF
+GGY
+GYN
+YNT
+NTK
+NGL
+PQN
+VTA
+VKQ
+RRT
+YHI
+LWH
+WHL
+HLE
+NQS
+QSY
+YNI
+TNM
+TEC
+ECN
+CNV
+NVF
+VFV
+KGT
+TIR
+CDR
+DRH
+HSK
+QFE
+PEN
+NRS
+SGR
+YMI
+LSI
+LHM
+HME
+VHE
+HEY
+DCI
+CIF
+ECC
+CWN
+WNG
+SIM
+IMT
+MTG
+YNN
+NFF
+FFR
+LKP
+KPR
+KVC
+VCT
+CTG
+GKR
+CLD
+LDF
+FNK
+ENI
+QDK
+DID
+IDT
+TRK
+SFL
+RDH
+HSY
+IST
+NHT
+HTG
+QVH
+HRR
+WLP
+PQQ
+QQN
+AYF
+RPE
+EGY
+YNL
+PAT
+LRP
+RPM
+PMD
+LMV
+TPR
+SDY
+DYE
+TYM
+YMS
+WNF
+NFE
+QSF
+HPH
+HHC
+HCN
+MRA
+RHT
+TKF
+FFE
+HSG
+MEN
+ENR
+NRP
+RPV
+TYQ
+VHD
+HDY
+CVW
+VWN
+NGS
+RMF
+TKR
+AIL
+VCV
+DFS
+HPS
+MRF
+RFC
+FCV
+AWF
+WFF
+FFP
+FPN
+NTT
+TTR
+VFW
+FWD
+WDA
+AFS
+SNF
+FTG
+TGC
+GCH
+CHH
+HHG
+GQN
+GLY
+YFQ
+RFG
+FGY
+GYI
+IPE
+PET
+TFS
+FSG
+SGN
+FTD
+DDF
+ELY
+QTN
+TNF
+LDA
+LTI
+TIQ
+IQH
+QHI
+IVI
+VIP
+PRC
+RCG
+CGN
+SLM
+LMH
+HGG
+EVN
+RTH
+HLH
+LHA
+HAV
+YTL
+FPG
+EPR
+PRW
+RWP
+PRN
+RNR
+NRR
+RRD
+DLT
+LTY
+TYA
+YAF
+PKN
+SRA
+FGR
+RWS
+WSD
+FTL
+FST
+ITI
+TIG
+IGF
+GFY
+FYT
+YTG
+GDH
+EPF
+LAH
+HAF
+SPP
+KFH
+FHL
+HLD
+WVV
+ESV
+AVH
+IGH
+GHL
+LGH
+ESI
+IMY
+MYP
+YPT
+PTI
+LTN
+VEG
+EGI
+IQY
+YLY
+LYG
+YGA
+KHQ
+HQR
+DTG
+GGF
+FSA
+RID
+IDG
+DGS
+TVG
+VLW
+LWF
+WFL
+MGS
+PLR
+KPG
+TSW
+WNS
+VRT
+TQV
+EYG
+YGC
+GCF
+CFE
+KGH
+LNG
+GNK
+NKP
+KPE
+EYD
+GFT
+EGM
+GMG
+MGV
+VGR
+RIT
+LMW
+MWP
+WPE
+CET
+SYG
+KRM
+KMM
+MMV
+MVF
+FES
+FGM
+HFD
+SFC
+CES
+LHF
+HFM
+MRY
+QPG
+PGK
+GRS
+RSP
+SLH
+HKD
+KSI
+IVN
+NQN
+QND
+EFE
+GEW
+EWI
+WIL
+ADN
+DNH
+GDC
+DCF
+CFM
+AWS
+WSN
+RLH
+QAR
+FSF
+SFP
+FPK
+EHP
+HPL
+LLF
+LFN
+FNP
+PFE
+YCF
+CFT
+FTK
+KEG
+CDL
+PAQ
+PFR
+FRI
+QGP
+ERP
+RQQ
+QQC
+QCS
+CSQ
+SQR
+QRI
+RIQ
+QGE
+NQC
+QCR
+CRS
+RSQ
+SQM
+QSC
+SCC
+CCQ
+LQN
+NVE
+EQC
+CQC
+MPG
+GWS
+WSC
+SCL
+CLV
+FVG
+VGQ
+VQE
+QTK
+MLE
+LEG
+AQY
+CQG
+VIH
+IHT
+IDV
+VSH
+SHV
+HVL
+PRQ
+IYC
+YCS
+CST
+AGP
+HEE
+HHE
+STW
+TWS
+AYP
+YPY
+PYS
+YSK
+KNG
+NGG
+GGT
+HTC
+TCA
+PMY
+MYI
+YIY
+YGE
+ERS
+VMI
+KNK
+VYV
+YVG
+VGN
+GNV
+VAW
+AWA
+AHI
+NVQ
+VQG
+GQF
+QFY
+TPH
+HQS
+SYD
+LNC
+NCT
+EWG
+WGL
+RLD
+SWS
+WSL
+LLY
+LYW
+YWL
+VSF
+PFY
+FYN
+YNY
+NYR
+YRP
+RPP
+PPF
+PFN
+FNC
+SKF
+FTF
+FSY
+AQR
+LGY
+GYV
+YVP
+SWE
+SEW
+WIG
+IGT
+EQH
+QHR
+HRE
+RET
+DTK
+TKS
+GGL
+AFR
+QNR
+TAC
+ACI
+CII
+DVF
+FGV
+GVT
+VTH
+THR
+MNV
+NVN
+VNV
+CVQ
+VQA
+PVF
+VFI
+IYT
+YTS
+IEV
+QNG
+NTW
+TWP
+WPT
+PYP
+NGW
+GWN
+NGD
+GDT
+LYT
+YTC
+PTY
+TYI
+SIN
+INE
+NNG
+SVG
+TVN
+KAP
+YDN
+NYI
+EFG
+SRW
+LMY
+MYW
+YWI
+SYQ
+YQP
+FNR
+NRH
+YKP
+PLY
+LYS
+YSW
+VEW
+EWV
+WVG
+RHK
+HKE
+TLK
+KSK
+KTQ
+YRT
+KHK
+VTV
+RGD
+DIV
+QGM
+GMS
+VII
+IIH
+DAC
+TFH
+FHT
+MVN
+VNR
+KNN
+KRH
+SIQ
+NYT
+WGF
+GFC
+MVT
+VTI
+TIS
+ISY
+GYE
+YEP
+QVP
+YLV
+GGC
+GCG
+CGF
+GEH
+EHI
+LEW
+EWE
+WEP
+PRL
+LHL
+TGP
+GPV
+PVQ
+VQV
+QVT
+AIQ
+QAH
+HEV
+GSH
+IHK
+VQT
+TGT
+GTR
+TRL
+SSM
+GHP
+HPF
+PYE
+IHR
+HRH
+RHP
+HPY
+YPC
+PCS
+CSK
+GRK
+RLF
+AIP
+EHG
+HGR
+AWM
+WMH
+MHI
+LMG
+MGG
+QVY
+VYF
+YFC
+FCY
+CYD
+YDK
+SPY
+SYE
+EDF
+FNM
+MEF
+SPC
+PCG
+GTH
+PYW
+WLL
+LQW
+QWL
+PYT
+TNK
+RHF
+HFG
+ART
+RTI
+IHW
+HWV
+WVQ
+RMG
+DAS
+ELG
+VTT
+DRG
+WVR
+DVC
+VCA
+TIF
+IFH
+ELM
+DEY
+QRS
+NVG
+GTE
+TEN
+HAG
+GVQ
+YTD
+DLY
+AQN
+GVD
+DGM
+GML
+CAI
+IRP
+GIW
+IWG
+WGN
+GNG
+GDQ
+QTM
+GHV
+HGF
+GFI
+AAH
+DGT
+APG
+PGQ
+GQA
+YFI
+FIN
+PIN
+INM
+MFE
+FEF
+FAR
+QRW
+KMR
+MRI
+SGP
+GPA
+AVR
+VRW
+RWV
+WVM
+VMT
+TGW
+WQR
+HFR
+FRF
+GFP
+PAP
+RLY
+NYF
+LFT
+TTQ
+QAL
+YYV
+QMK
+ARA
+MMK
+QLH
+RMR
+GRT
+RTP
+RLE
+AHN
+HNI
+LQA
+CLQ
+PLM
+LMA
+SFK
+LDP
+PDS
+SMG
+EMS
+MSC
+SCA
+ARI
+FEM
+EMT
+MTL
+LQP
+QPL
+HKK
+DWN
+WNT
+QAT
+QGL
+LGG
+GSP
+HSH
+HTT
+MAN
+YHF
+FVT
+KED
+YAN
+ANY
+IQA
+QAD
+ADY
+NHG
+PSM
+SMT
+MTA
+THF
+HFP
+FPR
+YGV
+GRE
+CVM
+VMM
+MML
+GMK
+FCS
+SYL
+PEP
+LMT
+MTF
+LYD
+DDW
+DWM
+WMR
+CSR
+PPE
+YLM
+MKF
+VNK
+NKM
+KMT
+LLW
+LWP
+WPP
+DQA
+QLD
+IQV
+VGV
+GVV
+IQS
+QSA
+DIN
+INF
+QDT
+DRL
+RTE
+PAR
+PTM
+TMP
+PPQ
+PPG
+GTP
+TVP
+PGP
+NPA
+QVD
+SGV
+QPR
+HNV
+NVH
+VHK
+TAM
+PLN
+LNR
+NRL
+HTH
+THM
+HMA
+QCK
+CKD
+HFS
+YFT
+FTH
+HRK
+NHS
+APF
+PFS
+QEE
+MTS
+ALH
+HDV
+QEN
+FNN
+GIF
+APQ
+QQV
+MTV
+LPK
+PKP
+PTD
+VGT
+PCP
+CPA
+SNM
+NMP
+DQG
+TED
+GGH
+HPP
+PRG
+EMH
+MHW
+HWP
+PMK
+AIG
+LTM
+AGY
+GYL
+KWP
+WPL
+FVI
+KRC
+CVY
+VYY
+YYF
+YFK
+PQG
+GAF
+FSL
+LSG
+SGY
+YNR
+RVM
+VMR
+FPF
+PFK
+HIS
+KKH
+KHR
+HRT
+RTW
+TWF
+WMA
+GHF
+HFH
+FHE
+HEK
+PLD
+SFY
+FYG
+TDN
+YEH
+EHD
+EPD
+PGR
+MHP
+PAY
+YPP
+DMP
+MPR
+RAH
+AHS
+SFT
+GPG
+KHG
+LPD
+LCP
+CPR
+EPC
+DPP
+KPP
+PPC
+PCF
+CFR
+EPW
+PWT
+WTP
+PGH
+HGA
+GAC
+IMA
+RNC
+NCD
+CDK
+RGP
+GPP
+SEP
+PKF
+AMP
+VAP
+APR
+RQP
+KVP
+FVN
+VNT
+ESC
+CEV
+LYC
+CIR
+GKV
+LVV
+VVW
+WDE
+ETS
+VRN
+RNY
+RIF
+KFY
+GSM
+SMV
+EHY
+HYH
+YHT
+THV
+PSH
+SHQ
+PYG
+YGY
+GYT
+IQI
+QIE
+EIN
+TFR
+GNC
+NCI
+RPY
+AQI
+CQK
+HAA
+MSN
+HEW
+EWQ
+WQF
+FDN
+NAW
+AWQ
+QEM
+EML
+LNH
+QKV
+MDA
+DCH
+EHQ
+FRR
+NKS
+SRP
+PYF
+YFE
+QVC
+TYS
+DIH
+HRQ
+GDF
+DFP
+FPT
+PGV
+FQL
+EKC
+KCD
+CDY
+DYP
+YPS
+GSQ
+QMS
+ACD
+DYD
+VRP
+DVW
+VWE
+WEH
+EHE
+LDH
+LMM
+QQT
+STE
+QRP
+RHC
+HCD
+CDV
+TSC
+HHQ
+HQL
+NHL
+TPI
+PIK
+VSM
+SMR
+MRE
+DRS
+RRR
+PRI
+LNQ
+QST
+INR
+ARQ
+KFR
+KPY
+YWE
+RVA
+RQF
+QRV
+LVH
+ARY
+AMG
+FEL
+KYY
+YVQ
+KMA
+IHE
+MGP
+RGC
+TSV
+DSC
+SCS
+CSN
+TQS
+QSV
+GPT
+MPD
+PDQ
+DQF
+QFP
+RPG
+GMM
+MMF
+FPV
+SEC
+ECS
+PEC
+ECE
+ERG
+ANN
+NNR
+NRM
+LQC
+QIG
+ISA
+REH
+HKA
+LQM
+GKS
+TRM
+GCD
+GVK
+YHS
+HSN
+WDD
+YGD
+HAD
+IGE
+IFN
+FNS
+QLW
+WMV
+VDN
+FQT
+QTE
+YWS
+WSE
+LGF
+LHG
+HGY
+FEH
+HFK
+FKD
+DQM
+QFT
+FTA
+NDT
+QTR
+VFN
+AFP
+KFA
+AYL
+YRW
+RWH
+WHS
+SYI
+TPD
+FHS
+QCL
+CLW
+WRW
+RWW
+WWK
+WGC
+GCP
+LTF
+TFI
+IRH
+RHR
+EFY
+IDM
+DMV
+VKT
+DMY
+MYD
+DTF
+KRW
+RWD
+WDP
+MVL
+EMA
+QGR
+AEW
+WIA
+TGY
+PTF
+FEN
+GHR
+QPI
+PFP
+FPH
+HHI
+ILQ
+IDF
+NDY
+DYA
+YAC
+CSI
+TRC
+RCY
+CYK
+ASC
+SCT
+SCY
+CYM
+STQ
+MIE
+NWE
+WEF
+PDN
+DNN
+NNA
+API
+KHA
+AFN
+LHH
+HHF
+HFY
+YRD
+DGY
+GYS
+LDY
+QFA
+SVQ
+VQQ
+CVK
+AQW
+QWI
+SCI
+DNP
+DMI
+YMR
+LIN
+CLG
+GSC
+SCN
+DFA
+CGY
+GYA
+IVC
+CFW
+HSD
+GQK
+III
+GGI
+RGA
+YER
+GLQ
+GPH
+PHG
+HGW
+GWR
+WRM
+SWG
+LDQ
+IVV
+YLP
+FQQ
+QQH
+QHY
+HYG
+YGG
+HRS
+RSD
+KLH
+LHN
+DIE
+IHS
+DAP
+AEM
+EMK
+IGY
+HFI
+QRY
+RTA
+DWG
+YNH
+NHC
+CDP
+QDR
+WRN
+NNW
+NWW
+WWQ
+WQM
+HAP
+PLQ
+LQY
+AVM
+MAM
+MED
+LFA
+GNL
+LDW
+DWE
+RRP
+RCS
+SRI
+IQT
+RFW
+FWG
+WGE
+WHV
+EGT
+TAR
+WFI
+YAD
+DWL
+LWG
+WGY
+GYD
+HIA
+MPQ
+EWR
+WRY
+RYA
+YAL
+NWQ
+WQP
+PPY
+YDW
+WSW
+WML
+IPD
+CNP
+PGC
+GCV
+CVD
+QGV
+QLY
+YIC
+ICF
+CFP
+LPM
+MTI
+TIP
+IPG
+MKT
+QTF
+PGI
+RWT
+RGW
+WQA
+PDD
+DDY
+RFP
+GMT
+RRY
+RWK
+WKP
+KPW
+PWR
+HIW
+IWY
+WYT
+EGW
+QPD
+RIC
+ICV
+LFF
+FFA
+FAP
+RNA
+NPW
+PWN
+AGK
+LYM
+FQH
+QHF
+NAV
+VEM
+MYQ
+YQR
+QRN
+RNF
+TMH
+MHS
+RFH
+KHY
+HYS
+YSF
+TRW
+RWE
+FYS
+GPM
+PMR
+MRT
+TGH
+NWI
+WIV
+IRT
+TGR
+TTD
+DSG
+SDG
+QYY
+FWI
+WII
+FLY
+YDL
+ACW
+CWA
+WAP
+LFG
+IWI
+WIP
+NYD
+YDQ
+GYM
+CVR
+RGM
+GMA
+AYV
+SKM
+GIP
+IPY
+PYR
+RAM
+KYA
+YPH
+PHI
+HIE
+RTM
+MDP
+MRP
+PGN
+HSM
+SML
+GIM
+IML
+YPW
+DRR
+MWC
+VQD
+QRQ
+QQI
+INA
+RNQ
+EMR
+YLN
+PTR
+NPC
+QYG
+DAH
+AHR
+HRA
+QAW
+GRA
+AHH
+HGC
+GCS
+SRH
+GVH
+VHG
+AWI
+ASF
+QNP
+NPM
+PMG
+LMP
+VYW
+YWK
+WKG
+RRW
+KIW
+IWR
+WRA
+EYA
+GGN
+DRY
+YYG
+FYA
+YAM
+AMR
+MRL
+RLW
+WPG
+GEI
+GTK
+FAF
+MVG
+GKP
+MFY
+FYM
+YMT
+TGQ
+VVV
+GMV
+HQG
+PHY
+GVW
+VWI
+PNN
+RKY
+HAI
+IIG
+DTY
+PEM
+LCW
+WVP
+VPG
+PGY
+YSD
+VEP
+KPF
+PDL
+PMN
+MNM
+NMV
+VMQ
+MQQ
+HPR
+KVG
+TWG
+WGK
+VGM
+IGL
+LYV
+GIY
+IYV
+RHG
+HGV
+EHN
+HNE
+QMR
+MRV
+KYQ
+PIT
+TEW
+EWT
+WTV
+LME
+AWW
+WWG
+WGP
+PWF
+WFA
+IIV
+KRF
+FMN
+MNE
+SMP
+HHM
+HMY
+MYG
+GQY
+YGQ
+GQG
+WLI
+LIF
+QYR
+IFA
+KWL
+ESG
+DFH
+FHR
+HRG
+YDR
+DPT
+IKH
+HGP
+RTD
+LYA
+PVM
+MGH
+GHT
+TVQ
+RTY
+HGI
+KHT
+HTP
+KMC
+MCW
+GRP
+AYG
+MKV
+TMW
+MWA
+WAK
+HEA
+CGG
+LVF
+RYR
+WLD
+NAF
+VGH
+SAP
+QAG
+QDW
+DWT
+YTA
+AQG
+GLT
+TTI
+SIW
+IWL
+RQD
+NIE
+PDY
+RMD
+INP
+DIG
+GRC
+CTK
+DRM
+MIG
+QNF
+NFA
+PRY
+MHA
+FEG
+AIW
+IWS
+WSM
+GPS
+ATR
+RRN
+VPQ
+TSH
+CSP
+DNG
+SFM
+FMI
+MIF
+DCP
+CPP
+AQH
+QHC
+CRK
+RCR
+AFF
+FFC
+FCP
+PPN
+AIE
+AID
+GNT
+FYP
+AMV
+SYR
+QDM
+MIC
+CYN
+YNQ
+PTT
+GQC
+QCY
+DHR
+GCA
+CAC
+ACP
+CPN
+CCS
+KCN
+YKT
+TCP
+LCY
+MFM
+GCI
+CID
+CPK
+YVC
+VCC
+CCN
+DRC
+RCN
+VCL
+KCY
+CYV
+TQT
+QTC
+CEK
+EKY
+VSY
+YFH
+FHD
+YEC
+ECT
+CHR
+GPY
+PYN
+NVC
+LCN
+MGE
+THT
+HTI
+HTS
+HLN
+KFI
+ITY
+EIP
+NAN
+LII
+DFF
+FCN
+TSM
+TYF
+LLC
+LCT
+CTF
+FLH
+HHP
+LHQ
+HQT
+FPL
+PMS
+LFY
+YRK
+KTN
+TNV
+YKH
+NMR
+YGP
+LSH
+PHD
+HDT
+HEC
+FLC
+CFG
+AQQ
+SGC
+GCR
+CRF
+LWL
+EMD
+EGF
+VGF
+TWV
+PQK
+HDA
+THC
+HCG
+CGW
+WSS
+GWP
+MPM
+IYI
+HLP
+RPC
+PCL
+NNH
+HIY
+YTY
+TIM
+IMI
+FVF
+MGA
+YLG
+ACF
+CFV
+VIC
+ICI
+EGC
+CIH
+IHF
+HDI
+QSD
+PKG
+VML
+LTH
+THK
+HKG
+YMH
+HSE
+LMC
+MCV
+LFH
+FHI
+QFC
+KYK
+PFV
+PPI
+TVM
+IKF
+KFP
+QGY
+GYG
+YGM
+AMC
+MCL
+MKI
+QIM
+TRT
+IDK
+WLH
+DND
+FIV
+KGF
+HPN
+AFV
+YKR
+VFF
+FFV
+PKS
+TKQ
+PNH
+QIY
+NSR
+IQP
+QPK
+IVT
+CHV
+CLH
+QAN
+NEH
+YIH
+DVM
+MLC
+LCV
+IQR
+RYK
+WLY
+IDD
+TFY
+FID
+SPN
+VVC
+TTN
+EMM
+TGM
+MSK
+SHR
+HRN
+GEQ
+FIC
+CTV
+MFH
+YGL
+YGS
+MHE
+HEM
+MMS
+SMH
+MHT
+VLC
+KYP
+TGI
+RYG
+YGT
+GQI
+GPK
+PKQ
+GYF
+WLR
+CYI
+IFV
+GYQ
+FPM
+YVV
+APY
+IKI
+MDS
+HAR
+PNT
+HVT
+VNH
+HPD
+NIK
+ESD
+FHV
+TFW
+WPD
+PDM
+DMK
+KYN
+TWY
+IHQ
+SHP
+EYP
+YPK
+PKL
+IRS
+RSC
+CSA
+LMS
+PHK
+KPV
+VCI
+FGW
+WFH
+FDT
+KYG
+INC
+NCA
+CAV
+FCK
+CKK
+FKV
+DYS
+TRI
+RKF
+FLM
+MEC
+ECR
+CRN
+PRD
+PPM
+HLR
+GHQ
+HQP
+DYC
+YCT
+PCH
+MIT
+DPI
+PIQ
+QMP
+EVY
+RGS
+SNV
+NVP
+PSC
+TPF
+RKC
+CVP
+QFQ
+MDR
+KCP
+CPH
+PHR
+YTK
+YDS
+EKW
+KWH
+WHA
+KDH
+HRL
+REF
+FGD
+FGE
+RND
+SYT
+PEW
+EWF
+TGF
+CNG
+NEF
+VPC
+SMI
+RWF
+PHE
+QNS
+GNY
+MLQ
+PFM
+GDM
+TMK
+MSP
+GQP
+GLM
+VFQ
+TRA
+PIG
+FQG
+GMR
+TAQ
+AQM
+NFY
+FYQ
+GFG
+DRT
+KMY
+MYE
+NRY
+VPH
+HVP
+LHP
+HPG
+VHP
+PQH
+SHA
+HMH
+KWF
+WFG
+LEY
+DYK
+APM
+PMH
+NPK
+QTS
+THQ
+MPH
+SHC
+HCV
+SDC
+CVT
+KQP
+QPM
+MNA
+GWV
+LFW
+FWL
+WLG
+QKW
+KWW
+WWH
+WHT
+HKN
+QTD
+QID
+NFG
+TPN
+NSW
+SWF
+VDT
+EFW
+WQN
+NIT
+LLI
+GTN
+ESN
+NRW
+RWC
+WCS
+CSW
+YQL
+QLM
+MLF
+MLW
+DPG
+RHW
+HWD
+WDQ
+NER
+HEG
+FPY
+PYA
+QMN
+MNL
+KLY
+FAD
+TKC
+KCH
+QKH
+YKI
+NDM
+MVI
+SHI
+HIQ
+ECK
+CKY
+KYE
+RQV
+KLM
+MKL
+YVT
+VKM
+DHY
+HYA
+DME
+VFC
+CIT
+PIF
+IFF
+FFF
+KIP
+WFK
+KSC
+SCK
+CKG
+CAY
+CKS
+LQH
+QHP
+PWV
+WVE
+MRM
+MLH
+SHM
+NSM
+QGN
+GYY
+YYD
+KGW
+RYP
+YSP
+PND
+ITP
+IFC
+NAC
+QVL
+NKW
+KWT
+WTL
+TCD
+LCC
+CCT
+HLC
+YWA
+WAI
+TDP
+IDY
+YVN
+LTW
+CTI
+AFY
+FYI
+YGR
+TRH
+RNW
+WRL
+EVH
+TPC
+CAP
+IIM
+MGT
+ILC
+CWL
+PFF
+FFI
+PFC
+CHM
+HMP
+VIY
+YAY
+YFN
+IKC
+CKF
+KFC
+FCR
+CRQ
+WNI
+WRR
+RRC
+RCP
+CPV
+YQI
+FGN
+CVI
+IFI
+ITW
+TWI
+CRI
+ILM
+DTC
+VHH
+HHY
+HYV
+LHC
+HCK
+CKP
+ETC
+IQC
+HNC
+IYQ
+LPW
+PWK
+ITL
+TMY
+CDF
+DFW
+WLS
+TCC
+IMH
+MHL
+TPK
+LVW
+VWV
+FFW
+FWR
+WRQ
+PNK
+VCW
+FII
+PIC
+ICK
+CWF
+FHM
+FNW
+YTM
+AFH
+FHK
+RFK
+FKC
+PNQ
+GAW
+AWD
+YTT
+TWN
+DIW
+IWV
+WVS
+AGH
+GHA
+AMI
+AVW
+TAH
+QIS
+STC
+TCG
+CGA
+ILY
+ITG
+ICW
+ICR
+SCW
+CWI
+WIH
+IHP
+HPA
+FFT
+FTW
+TNC
+EKM
+MLI
+ICM
+CMT
+YIV
+DRW
+EVW
+VWL
+CTC
+NAI
+LMI
+TVW
+VWT
+WTI
+ISM
+SQC
+QCT
+QHD
+HDH
+IYH
+YQK
+FAS
+CKL
+TFC
+TEF
+IRI
+DHP
+SIY
+ADF
+IRC
+MPS
+NWT
+CEG
+KNW
+WSA
+MAV
+DML
+MPV
+WIY
+IYL
+IHH
+VFK
+FIP
+IMV
+SIH
+TMQ
+MQS
+ACK
+FLF
+VMW
+WCP
+CPF
+NIM
+CNE
+FVW
+YIQ
+CQY
+KQH
+QHS
+LMQ
+DCS
+MEI
+PTC
+NRK
+QVG
+CTD
+FVH
+LHW
+HWA
+FVM
+AMW
+WLF
+NQY
+QYN
+TCV
+DFM
+FML
+VTY
+MLD
+MRR
+CNQ
+CNY
+KIY
+IYF
+RNP
+FFK
+GIN
+EQV
+SEH
+CDH
+TVH
+VHW
+IWP
+PHN
+TCE
+HRD
+NDG
+RTT
+VNA
+VGY
+NYQ
+KCS
+YFG
+MFQ
+NWA
+VMV
+DWP
+CPI
+PIW
+CIA
+NYP
+PHP
+ITF
+SNH
+IMM
+MMI
+MII
+KVY
+SYP
+TMG
+SMQ
+CGP
+GPC
+PCD
+ANI
+RLM
+SWV
+TRY
+THI
+LGC
+NCK
+CKQ
+VHS
+VWQ
+FKF
+QNW
+NWP
+WPA
+HNA
+YDY
+YVW
+VWP
+YLC
+PVW
+WIS
+IVW
+VWA
+IGV
+TTC
+TYC
+YCL
+CLT
+YVL
+HGH
+KCC
+CCK
+CKR
+IMW
+IYR
+SNY
+LRC
+NYK
+NIY
+YRH
+HTN
+MQV
+TFN
+FQF
+NCC
+CCC
+APN
+CGK
+SKY
+RCD
+GKG
+HNS
+RDW
+DWR
+WRK
+TTY
+YIW
+WYR
+QFW
+FWT
+QWN
+WNP
+VRH
+RHQ
+EVQ
+QNY
+YNF
+NFP
+QNC
+SLW
+WEL
+FYV
+YFV
+VCM
+PLW
+QLT
+MGN
+GNH
+MCG
+ETR
+LWS
+WSV
+SVW
+VWH
+WHY
+QYW
+YWT
+VYI
+FSM
+NYY
+IAW
+CSH
+HMG
+DFE
+WSI
+IWQ
+WQY
+CIP
+IPQ
+HST
+QWT
+GVF
+PDW
+MAH
+HVG
+IWH
+LWA
+WAC
+CIL
+DTH
+THH
+YHL
+QKY
+KYH
+YHK
+YNW
+WTK
+VHA
+VWY
+WYQ
+WND
+IWA
+APD
+ENY
+TFK
+HDK
+QFN
+RRH
+PNC
+NCR
+HFF
+TIC
+HDE
+VWS
+WSQ
+YLF
+ALW
+WGG
+TRQ
+QYH
+CVH
+SMW
+MWY
+RNH
+QTY
+WQL
+GCT
+APC
+CAE
+FWF
+WFQ
+LCH
+CHF
+WSR
+GGW
+TIN
+DQH
+QHG
+PFT
+ISF
+WDN
+NWN
+KEC
+DKP
+FYF
+YFP
+DSM
+WEI
+MSM
+YII
+MMN
+PMP
+RCC
+CCP
+CPT
+SGW
+GWT
+NCP
+CPG
+GQH
+IWN
+WNC
+NCY
+CYS
+YSR
+HTF
+HHA
+RPW
+PWH
+WHN
+HNQ
+VQW
+ECG
+SMS
+ANW
+RAW
+SFQ
+QIH
+HHR
+REW
+YEQ
+CRP
+FKM
+GQM
+WTR
+ICL
+MVW
+ISW
+IRY
+LDM
+RNI
+QTH
+THG
+YVI
+VPF
+DAM
+CWD
+WDR
+DRQ
+MPF
+PFG
+CCI
+AIC
+CQP
+GCW
+WVI
+QGW
+NIG
+CSV
+AYY
+STH
+MGC
+CFC
+CLC
+DYT
+QVW
+YIN
+GQT
+QWE
+WES
+QCH
+CHP
+VCR
+WAY
+EMF
+MFI
+TWC
+WCV
+FCF
+SRC
+RCH
+HLT
+TFF
+IWF
+WFY
+NHN
+PQT
+QDN
+TNH
+FAW
+RWQ
+LWI
+IAC
+WNV
+RHM
+MEY
+EYT
+NVM
+TWA
+GWG
+CQV
+PSY
+IYK
+DTW
+TWR
+WRE
+CSC
+SCD
+IWK
+WKS
+NYN
+KNF
+PNR
+SQH
+WFP
+SWD
+WDI
+AWG
+WVA
+VMC
+GWH
+WHE
+YCR
+GMF
+MFF
+VTW
+CDC
+GYC
+YCN
+TMN
+PCV
+HCP
+CCL
+TQI
+FCC
+CLE
+PRH
+WST
+MMD
+YGH
+SWA
+PTW
+TWD
+INY
+NYG
+NCL
+KWI
+WIF
+FGH
+GKC
+KCM
+GWA
+WAQ
+FMY
+MYY
+YYQ
+AKH
+HKF
+ECA
+KHE
+ICG
+GMH
+CTR
+DHH
+HNW
+MIH
+HPV
+WMP
+RVQ
+DHC
+GHD
+AWE
+WET
+CHI
+HIG
+NTY
+QNN
+GAM
+HLY
+GWM
+WMI
+TCI
+RYS
+HNT
+NQW
+QWS
+DPW
+YSC
+MPK
+EWK
+YAQ
+AYW
+MLY
+CAR
+KYM
+YME
+EYQ
+MTN
+NHY
+ATM
+FFQ
+RPH
+QMF
+YGI
+IFT
+MMP
+TDF
+CHN
+EWY
+HSI
+NFM
+FMD
+MDF
+FKG
+NKH
+YMD
+YDI
+MCP
+QYQ
+TIW
+VAH
+HEF
+FPI
+YPF
+HGN
+SCR
+NMG
+MIN
+HTD
+MFW
+WNH
+MCI
+WEN
+SVH
+CRV
+KMD
+FQM
+QMI
+HDS
+CHG
+WAV
+PKW
+PQW
+QWP
+TSY
+QWR
+TCR
+SEY
+SHY
+HYQ
+SFW
+FWA
+WAM
+GFN
+VMK
+DYW
+YWY
+WYS
+HWQ
+WQT
+VMH
+MHF
+PKY
+VKW
+WWS
+HMM
+MMT
+MTY
+KWD
+TKW
+KWS
+HPM
+MKP
+KPH
+HNH
+QHA
+MVY
+YYT
+EYY
+WDF
+GYH
+WVY
+YGF
+GFH
+HDP
+NEW
+WYE
+YKY
+YIE
+EWN
+GTG
+YAS
+CSY
+GMN
+RIW
+IMD
+TYE
+DHF
+HIK
+PCC
+CCE
+CEW
+GHY
+DIM
+TDK
+KWN
+QGF
+GFM
+ATY
+TYG
+YGW
+MWR
+NKF
+WTG
+PYL
+YQN
+QNT
+RGI
+GHI
+HID
+REM
+YRC
+QYS
+DYG
+CRY
+GWD
+GMC
+MCA
+CAF
+EMC
+MCK
+DMM
+MID
+MFV
+VGC
+FGP
+KAW
+HLK
+DWV
+KYR
+WTC
+TCF
+DSW
+GTW
+WVH
+TNW
+ICA
+CAK
+HIM
+YCA
+MLM
+FRW
+WGI
+QFH
+FHH
+HHT
+SCF
+CFL
+NHH
+IDW
+DWA
+WAR
+ARW
+WHF
+FWV
+NTN
+KKW
+SAH
+CGS
+VWF
+LWW
+WWR
+WRP
+HRF
+SHH
+HGK
+NYE
+YEG
+FSC
+NAM
+MKM
+KMG
+IGM
+TWT
+WTM
+SWQ
+WQD
+PMM
+FPP
+FIH
+HHD
+RFA
+NQM
+CIG
+IGQ
+YSH
+IIW
+IPW
+PWY
+WYL
+HKY
+LWE
+WEG
+YDF
+FGI
+KGM
+HSC
+GTC
+WPF
+WWL
+HQD
+WGD
+FKP
+NYV
+YMM
+WGH
+HQF
+GGM
+YDM
+QGG
+QDC
+CDN
+FDC
+EQY
+MQM
+GYP
+WLM
+FEW
+FAY
+HLG
+DGC
+HLF
+AHD
+GFW
+FWW
+YTH
+THY
+HYK
+TQF
+FMH
+DMF
+YFF
+VWG
+WGV
+TEM
+RMP
+FNF
+RHY
+HYC
+YCE
+MQI
+FFG
+YCK
+CKH
+YYN
+PPH
+CPS
+GHH
+KQN
+FTC
+GCK
+CRG
+QHH
+QYC
+MGM
+DPM
+MGI
+EPY
+QCM
+CMQ
+NIC
+EYC
+CRT
+YPN
+TRF
+RDM
+WPY
+LYK
+MNC
+NCV
+SCQ
+CQA
+GHN
+HNN
+AMF
+LYN
+NIW
+ECF
+FAC
+ACA
+RHV
+EWM
+WME
+FEY
+DDP
+TMT
+MTW
+IPH
+CNI
+NIQ
+NVW
+VWK
+DHE
+RWN
+FCT
+PCW
+DMW
+YWF
+FHG
+YMG
+DHW
+HWK
+VCK
+SWK
+RFF
+CAG
+MFT
+VMY
+MYN
+PRM
+RMC
+CPM
+THD
+CKI
+FWQ
+WQV
+MDY
+RME
+WGA
+AYM
+YYL
+FFM
+KNM
+YCP
+CPD
+WDG
+HNF
+MGY
+LEC
+CLY
+SWI
+NLQ
+GSN
+PQC
+QCV
+PNW
+IYY
+NMI
+EGH
+HIP
+WQG
+MWS
+HII
+YNG
+RMH
+WKR
+RHI
+YNK
+HQE
+RWM
+DHT
+WEV
+KCE
+IWT
+YRL
+KCG
+WSP
+KGC
+CKA
+HDM
+MPN
+DKC
+DNR
+LCK
+YRM
+RFQ
+FMM
+WMS
+QIW
+HVC
+NWD
+MDG
+FIW
+DPC
+HED
+KWQ
+WQQ
+PVC
+HLW
+TYN
+GHK
+EWD
+WDS
+PHC
+HCI
+CIQ
+IWD
+WDV
+AWC
+IDC
+DCA
+CRR
+WPM
+QYD
+HYR
+HCW
+CWS
+WHI
+WRD
+KMP
+VIW
+AWK
+WKH
+HYP
+FFY
+QCI
+CIY
+FAH
+NWG
+FYE
+TDW
+DWK
+CEP
+HMV
+DKW
+YFY
+VDW
+HKS
+MQH
+FDH
+CCM
+MMM
+DYN
+WKQ
+WCL
+IHN
+NNY
+NYH
+VNY
+HTR
+MNR
+DMH
+MHY
+GTY
+IFM
+QRT
+GCE
+CEI
+YEM
+RMM
+YTR
+YAP
+YMN
+WGQ
+WNM
+GHM
+WQI
+NFD
+WEY
+FKY
+HYD
+HVW
+AMM
+RMY
+QWV
+MMQ
+MSF
+HFE
+WER
+HQM
+VQH
+YPQ
+PQF
+GHC
+HMN
+MNT
+FFH
+MMH
+FIM
+MIY
+IYW
+YWH
+FMC
+MCS
+DWY
+WYA
+MWK
+CMR
+HYN
+GWI
+WIW
+KWC
+PWQ
+RYC
+THE
+YQF
+ERM
+EWP
+SWY
+WYN
+WKY
+WEC
+ECM
+CME
+RVW
+VWC
+WCK
+RFY
+NHM
+KHC
+GWC
+HRW
+RWI
+WIK
+FRC
+HIH
+RCW
+CWP
+CGI
+FVC
+VMN
+KDW
+AMH
+MHQ
+NQR
+TCY
+CYT
+YTQ
+HHV
+AHY
+QSM
+LMN
+FMT
+MTH
+GRN
+NMQ
+NGY
+AWT
+FCG
+NMH
+MHM
+YCQ
+NWC
+CKT
+VCE
+HWH
+NLH
+VWW
+PCR
+RWY
+WYF
+YCM
+QVM
+QHT
+HVR
+RMN
+QPW
+FRH
+HQK
+YKF
+MQN
+KWE
+TYV
+HMR
+ICH
+KYT
+TDM
+CEY
+CVC
+PAC
+NFK
+KCF
+YNC
+QSW
+WEW
+WPW
+YQH
+NFH
+MSY
+YNP
+DQP
+HKH
+MTK
+KAH
+VKC
+YKW
+GWW
+WWP
+MWG
+VYC
+YCG
+HSW
+WNE
+CFI
+CLM
+CHK
+RCQ
+TCQ
+PFA
+NNC
+QGC
+MNY
+NYM
+KQM
+QME
+NCF
+PDC
+WAN
+RPN
+VCP
+WIN
+PPW
+PWL
+CRH
+PWD
+SYM
+FGC
+YIK
+VNC
+YTF
+SNC
+QHM
+MEH
+CQT
+ITM
+EYH
+CQF
+DYM
+SMM
+QMH
+CYA
+MAC
+WVN
+WAT
+FWM
+WMT
+CCG
+CYG
+WAF
+EPM
+MVC
+HWG
+ELC
+RCI
+WQH
+FWH
+QWQ
+AGW
+NWY
+WYC
+CRW
+CQS
+LIW
+CAQ
+QMW
+MWT
+CER
+ERC
+VGW
+IAH
+NAQ
+WIM
+MKC
+FQC
+MWE
+TQM
+YHW
+HWS
+NYA
+WMM
+MMW
+MWN
+WNW
+NWM
+YEY
+PCQ
+HFW
+FNY
+NHR
+NSC
+TNG
+HVM
+HQW
+EYW
+IWE
+HCE
+PYH
+YHD
+YKQ
+SWH
+HAY
+QMY
+KIH
+WFN
+CSF
+RCE
+YCH
+GRH
+YNE
+HQN
+QPH
+HYL
+MHV
+WIT
+SCG
+SPW
+FHF
+CIW
+WAG
+CTW
+YAW
+RHH
+NFW
+MNK
+GEC
+AHM
+CYY
+HEQ
+MWV
+IMR
+FCD
+HQC
+CYF
+MHC
+PMC
+HQY
+WTH
+QKC
+HRC
+HYF
+CYL
+HKC
+WPS
+WDC
+FMQ
+QHK
+CFK
+NEC
+DNM
+CQM
+QMT
+MDN
+DCK
+WDW
+LHY
+TKY
+FPC
+MDM
+QWF
+MDW
+DWW
+WWE
+GLW
+TWM
+MSW
+WEQ
+WKN
+PMQ
+WAW
+WMQ
+DCY
+CYR
+CFH
+HMS
+IWW
+WWI
+PFW
+WVC
+ACY
+MNS
+CGC
+GCM
+TYY
+YYS
+MIM
+MKW
+HMI
+FWE
+MKH
+MEW
+SMY
+MYH
+HYI
+CKN
+NMM
+RIM
+SKH
+YEW
+CQR
+RYH
+HTM
+WKT
+KMN
+FKH
+TCK
+WYI
+HNP
+NGC
+MRN
+FHW
+EIW
+KVH
+WFE
+YCY
+AHW
+TYW
+YWR
+WNA
+EMG
+CFF
+HYT
+FHQ
+NKY
+HHK
+PCE
+FCM
+CMY
+DHM
+QQW
+QWY
+WYM
+MRW
+FPQ
+MME
+MYR
+LWQ
+GWY
+WYD
+HPW
+YWD
+CAH
+EQW
+QWK
+WSH
+NMC
+PNE
+FYH
+QKM
+HWE
+WHD
+RQW
+SWW
+WWA
+MYS
+KQW
+WWT
+CPQ
+WIE
+ACC
+CCH
+WEK
+GMY
+HFT
+WTY
+MMG
+WTN
+YYM
+NTH
+YCC
+CCF
+DYQ
+WEM
+WGT
+NHF
+CMS
+WGS
+MIW
+YQM
+IHM
+QDH
+TWQ
+CAD
+GNW
+NWH
+YYH
+YYY
+YFM
+TPW
+WED
+MCR
+YNM
+WWD
+MYV
+YWM
+SCM
+CMM
+NRC
+RCT
+CTN
+YHM
+QWC
+WCT
+TTW
+TWW
+WWY
+WMG
+YYC
+WID
+YVM
+WIR
+FYC
+FWS
+FYW
+WTW
+RCF
+QQG
+HMD
+HEN
+CKM
+MKY
+HCF
+SQW
+TYD
+GIC
+FQW
+IFW
+YQY
+CCY
+WAD
+WSF
+MYK
+NDW
+MIP
+QWG
+TCW
+CWW
+YLW
+TQW
+IHY
+MQC
+QCD
+WTQ
+MWW
+VWM
+WMK
+GMW
+MQW
+NCQ
+CQI
+MRC
+PWP
+WTF
+HVQ
+HMC
+DWQ
+ILW
+PWS
+YHH
+CPC
+YHE
+HAK
+RNM
+CEH
+CMF
+QHN
+QCE
+MDQ
+DHQ
+YTW
+WLC
+MCF
+WFC
+CFQ
+YCW
+CWE
+MPW
+WYK
+MGF
+FTM
+CWK
+HWF
+PCT
+MHN
+HKW
+WYV
+DCW
+CYQ
+CAW
+HWC
+HWR
+RSW
+PYC
+FKW
+WFW
+FMF
+YMY
+DCM
+YDH
+LWY
+WKD
+WRF
+DKQ
+QEC
+WTE
+CEM
+GCY
+MNH
+CEQ
+HYY
+PYQ
+QIC
+GPW
+PWW
+MCD
+WHR
+NYW
+QWM
+CQQ
+YHC
+FCH
+CHQ
+QCF
+NFC
+PCN
+PWG
+CMI
+CTM
+QCP
+WWN
+TMC
+CYW
+EHC
+CCR
+FTQ
+CNF
+FDW
+DWI
+PWM
+YWG
+KMH
+PWE
+KWG
+WGM
+WHM
+WPQ
+CHY
+VWR
+WRH
+CYC
+AWY
+DHN
+CIC
+CPW
+ICP
+QWD
+CQW
+CTY
+WRC
+WYW
+MWL
+CGH
+HPC
+PCY
+EWH
+QNM
+PCM
+QMM
+WMY
+WPN
+WCE
+HQH
+CNN
+CMW
+PCK
+QWH
+NTC
+HIC
+CMC
+MCQ
+KHW
+KCQ
+MHK
+CWG
+HMT
+WFM
+IWC
+CML
+HWT
+MHR
+DQW
+IQW
+WVW
+WPC
+WHG
+WYH
+IEW
+VHY
+YQW
+WDH
+CHD
+QPY
+WKC
+YDC
+NHW
+WDM
+QPC
+CKW
+KWY
+NCM
+CQN
+MYF
+YMW
+MMC
+KMW
+MWI
+MHD
+ECI
+CMD
+WCI
+CGM
+GCQ
+MCE
+WWF
+WTT
+HDC
+FCQ
+DMN
+PWI
+RMQ
+WGW
+WYP
+MYM
+HCC
+CDQ
+MNW
+CMP
+RCK
+MWD
+FPW
+QTW
+WNY
+MCT
+MHH
+IWM
+CFY
+HYW
+PHW
+HWW
+CFN
+MWF
+HCM
+MWH
+GYW
+HAW
+DWH
+YWV
+NMW
+QEW
+CNC
+WDK
+NKC
+GCC
+MPC
+MCN
+CCA
+KWM
+MCM
+HWL
+WSY
+CKC
+WMF
+CWY
+HCQ
+WCA
+HMK
+DHD
+YHY
+DNW
+WCD
+WPI
+WFD
+WHW
+WHC
+HCY
+WHQ
+IMC
+KPC
+YMC
+CRC
+MCY
+ECY
+MCH
+HWI
+DCQ
+PMW
+LWC
+CRM
+DMC
+MNF
+HWY
+YWW
+YWC
+WYY
+EWC
+FWC
+FWY
+WMN
+WWV
+EWW
+WCM
+CAM
+WKM
+WHH
+YMF
+WCQ
+WIQ
+MFN
+ANC
+ECW
+WCG
+CIM
+WQC
+CMH
+MYC
+CTH
+HHW
+QWW
+WIC
+CPY
+MDC
+NYC
+CMN
+WHK
+MMY
+DEW
+QHW
+WQW
+CEC
+TWH
+HFC
+WKW
+HWM
+MQY
+HDW
+WYG
+CWM
+CYH
+HYM
+QMC
+QCW
+NCW
+YQC
+FMW
+WMC
+WWW
+HMW
+RMW
+CHW
+WCW
+HTW
+CWC
+WCY
+YWQ
+WMW
+CWT
+CWH
+MWM
+WWC
+WCC
+WCH
+WWM

From b479d5aff52adb580346ea70f3736e4ef876ac1a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 5 Oct 2024 12:54:10 +0200
Subject: [PATCH 061/112] remove obsolete path for mocked open func

---
 tests/unit/dataset_classes/testTox21Challenge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py
index fedde8e5..9ad2af21 100644
--- a/tests/unit/dataset_classes/testTox21Challenge.py
+++ b/tests/unit/dataset_classes/testTox21Challenge.py
@@ -38,7 +38,7 @@ def test_load_data_from_file(self, mock_sdmol_supplier: patch) -> None:
         mock_file = mock_open(read_data=Tox21ChallengeMockData.get_raw_train_data())
         with patch("builtins.open", mock_file):
             with open(
-                r"G:\github-aditya0by0\chebai_data\tox21_challenge\tox21_10k_data_all.sdf\tox21_10k_data_all.sdf",
+                r"fake/path",
                 "rb",
             ) as f:
                 suppl = Chem.ForwardSDMolSupplier(f)

From adedc093435a8fb53d5bdb8c0210f65204c4d45d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 5 Oct 2024 16:17:58 +0200
Subject: [PATCH 062/112] test single label split scenario implemented in #54

---
 .../dataset_classes/testChebiOverXPartial.py  | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py
index 7720d301..76584ebf 100644
--- a/tests/unit/dataset_classes/testChebiOverXPartial.py
+++ b/tests/unit/dataset_classes/testChebiOverXPartial.py
@@ -104,6 +104,72 @@ def test_extract_class_hierarchy_with_bottom_cls(
             f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.",
         )
 
+    @patch("pandas.DataFrame.to_csv")
+    @patch("pandas.read_pickle")
+    @patch.object(ChEBIOverXPartial, "_get_data_size", return_value=4.0)
+    @patch("torch.load")
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=ChebiMockOntology.get_raw_data(),
+    )
+    def test_single_label_data_split(
+        self, mock_open, mock_load, mock_get_data_size, mock_read_pickle, mock_to_csv
+    ) -> None:
+        """
+        Test the single-label data splitting functionality of the ChebiExtractor class.
+
+        This test mocks several key methods (file operations, torch loading, and pandas functions)
+        to ensure that the class hierarchy is properly extracted, data is processed into a raw dataset,
+        and the data splitting logic works as intended without actual file I/O.
+
+        It also verifies that there is no overlap between training, validation, and test sets.
+        """
+        self.chebi_extractor.top_class_id = 11111
+        self.chebi_extractor.THRESHOLD = 3
+        self.chebi_extractor.chebi_version_train = None
+
+        graph: nx.DiGraph = self.chebi_extractor._extract_class_hierarchy("fake_path")
+        data_df = self.chebi_extractor._graph_to_raw_dataset(graph)
+
+        mock_read_pickle.return_value = data_df
+        data_pt = self.chebi_extractor._load_data_from_file("fake/path")
+
+        # Verify that the data contains only 1 label
+        self.assertEqual(len(data_pt[0]["labels"]), 1)
+
+        mock_load.return_value = data_pt
+
+        # Retrieve the data splits (train, validation, and test)
+        train_split = self.chebi_extractor.dynamic_split_dfs["train"]
+        validation_split = self.chebi_extractor.dynamic_split_dfs["validation"]
+        test_split = self.chebi_extractor.dynamic_split_dfs["test"]
+
+        train_idents = set(train_split["ident"])
+        val_idents = set(validation_split["ident"])
+        test_idents = set(test_split["ident"])
+
+        # Ensure there is no overlap between train and test sets
+        self.assertEqual(
+            len(train_idents.intersection(test_idents)),
+            0,
+            "Train and test sets should not overlap.",
+        )
+
+        # Ensure there is no overlap between validation and test sets
+        self.assertEqual(
+            len(val_idents.intersection(test_idents)),
+            0,
+            "Validation and test sets should not overlap.",
+        )
+
+        # Ensure there is no overlap between train and validation sets
+        self.assertEqual(
+            len(train_idents.intersection(val_idents)),
+            0,
+            "Train and validation sets should not overlap.",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 65c2d9bd6cdd1241b2b9d2cb0c69bc892f760274 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 5 Oct 2024 17:04:10 +0200
Subject: [PATCH 063/112] test output format for
 Tox21MolNet._load_data_from_file

---
 tests/unit/dataset_classes/testTox21MolNet.py | 37 ++++++++++++++-----
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
index 5d5f3497..86cbb752 100644
--- a/tests/unit/dataset_classes/testTox21MolNet.py
+++ b/tests/unit/dataset_classes/testTox21MolNet.py
@@ -2,7 +2,10 @@
 from typing import List
 from unittest.mock import MagicMock, mock_open, patch
 
+import torch
+
 from chebai.preprocessing.datasets.tox21 import Tox21MolNet
+from chebai.preprocessing.reader import ChemDataReader
 from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData
 
 
@@ -16,9 +19,7 @@ def setUpClass(cls, mock_makedirs: MagicMock) -> None:
         Args:
             mock_makedirs (MagicMock): Mocked `os.makedirs` function.
         """
-        ReaderMock = MagicMock()
-        ReaderMock.name.return_value = "MockedReaderTox21MolNet"
-        Tox21MolNet.READER = ReaderMock
+        Tox21MolNet.READER = ChemDataReader
         cls.data_module = Tox21MolNet()
 
     @patch(
@@ -28,20 +29,38 @@ def setUpClass(cls, mock_makedirs: MagicMock) -> None:
     )
     def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
         """
-        Test the `_load_data_from_file` method for correct CSV parsing.
+        Test the `_load_data_from_file` method for correct output.
 
         Args:
             mock_open_file (mock_open): Mocked open function to simulate file reading.
         """
-        expected_data = Tox21MolNetMockData.get_processed_data()
         actual_data = self.data_module._load_data_from_file("fake/file/path.csv")
 
-        self.assertEqual(
-            list(actual_data),
-            expected_data,
-            "The loaded data does not match the expected output from the file.",
+        first_instance = next(actual_data)
+
+        # Check for required keys
+        required_keys = ["features", "labels", "ident"]
+        for key in required_keys:
+            self.assertIn(
+                key, first_instance, f"'{key}' key is missing in the output data."
+            )
+
+        self.assertTrue(
+            all(isinstance(feature, int) for feature in first_instance["features"]),
+            "Not all elements in 'features' are integers.",
         )
 
+        # Check that 'features' can be converted to a tensor
+        features = first_instance["features"]
+        try:
+            tensor_features = torch.tensor(features)
+            self.assertTrue(
+                tensor_features.ndim > 0,
+                "'features' should be convertible to a non-empty tensor.",
+            )
+        except Exception as e:
+            self.fail(f"'features' cannot be converted to a tensor: {str(e)}")
+
     @patch(
         "builtins.open",
         new_callable=mock_open,

From a63c010f46cce5780d4f4068a01268ecec292e64 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 5 Oct 2024 17:40:10 +0200
Subject: [PATCH 064/112] DynamicDataset: check split stratification

---
 .../dataset_classes/testDynamicDataset.py     | 136 ++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py
index e42c3e7e..c8846273 100644
--- a/tests/unit/dataset_classes/testDynamicDataset.py
+++ b/tests/unit/dataset_classes/testDynamicDataset.py
@@ -216,6 +216,142 @@ def test_get_train_val_splits_given_test_consistency(self) -> None:
             obj="Validation sets should be identical for the same seed.",
         )
 
+    def test_get_test_split_stratification(self) -> None:
+        """
+        Test that the split into train and test sets maintains the stratification of labels.
+        """
+        self.dataset.train_split = 0.5
+        train_df, test_df = self.dataset.get_test_split(self.data_df, seed=0)
+
+        number_of_labels = len(self.data_df["labels"][0])
+
+        # Check the label distribution in the original dataset
+        original_pos_count, original_neg_count = (
+            self.get_positive_negative_labels_counts(self.data_df)
+        )
+        total_count = len(self.data_df) * number_of_labels
+
+        # Calculate the expected proportions
+        original_pos_proportion = original_pos_count / total_count
+        original_neg_proportion = original_neg_count / total_count
+
+        # Check the label distribution in the train set
+        train_pos_count, train_neg_count = self.get_positive_negative_labels_counts(
+            train_df
+        )
+        train_total_count = len(train_df) * number_of_labels
+
+        # Calculate the train set proportions
+        train_pos_proportion = train_pos_count / train_total_count
+        train_neg_proportion = train_neg_count / train_total_count
+
+        # Assert that the proportions are similar to the original dataset
+        self.assertAlmostEqual(
+            train_pos_proportion,
+            original_pos_proportion,
+            places=1,
+            msg="Train set labels should maintain original positive label proportion.",
+        )
+        self.assertAlmostEqual(
+            train_neg_proportion,
+            original_neg_proportion,
+            places=1,
+            msg="Train set labels should maintain original negative label proportion.",
+        )
+
+        # Check the label distribution in the test set
+        test_pos_count, test_neg_count = self.get_positive_negative_labels_counts(
+            test_df
+        )
+        test_total_count = len(test_df) * number_of_labels
+
+        # Calculate the test set proportions
+        test_pos_proportion = test_pos_count / test_total_count
+        test_neg_proportion = test_neg_count / test_total_count
+
+        # Assert that the proportions are similar to the original dataset
+        self.assertAlmostEqual(
+            test_pos_proportion,
+            original_pos_proportion,
+            places=1,
+            msg="Test set labels should maintain original positive label proportion.",
+        )
+        self.assertAlmostEqual(
+            test_neg_proportion,
+            original_neg_proportion,
+            places=1,
+            msg="Test set labels should maintain original negative label proportion.",
+        )
+
+    def test_get_train_val_splits_given_test_stratification(self) -> None:
+        """
+        Test that the split into train and validation sets maintains the stratification of labels.
+        """
+        self.dataset.use_inner_cross_validation = False
+        self.dataset.train_split = 0.5
+        df_train_main, test_df = self.dataset.get_test_split(self.data_df, seed=0)
+        train_df, val_df = self.dataset.get_train_val_splits_given_test(
+            df_train_main, test_df, seed=42
+        )
+
+        number_of_labels = len(self.data_df["labels"][0])
+
+        # Check the label distribution in the original dataset
+        original_pos_count, original_neg_count = (
+            self.get_positive_negative_labels_counts(self.data_df)
+        )
+        total_count = len(self.data_df) * number_of_labels
+
+        # Calculate the expected proportions
+        original_pos_proportion = original_pos_count / total_count
+        original_neg_proportion = original_neg_count / total_count
+
+        # Check the label distribution in the train set
+        train_pos_count, train_neg_count = self.get_positive_negative_labels_counts(
+            train_df
+        )
+        train_total_count = len(train_df) * number_of_labels
+
+        # Calculate the train set proportions
+        train_pos_proportion = train_pos_count / train_total_count
+        train_neg_proportion = train_neg_count / train_total_count
+
+        # Assert that the proportions are similar to the original dataset
+        self.assertAlmostEqual(
+            train_pos_proportion,
+            original_pos_proportion,
+            places=1,
+            msg="Train set labels should maintain original positive label proportion.",
+        )
+        self.assertAlmostEqual(
+            train_neg_proportion,
+            original_neg_proportion,
+            places=1,
+            msg="Train set labels should maintain original negative label proportion.",
+        )
+
+        # Check the label distribution in the validation set
+        val_pos_count, val_neg_count = self.get_positive_negative_labels_counts(val_df)
+        val_total_count = len(val_df) * number_of_labels
+
+        # Calculate the validation set proportions
+        val_pos_proportion = val_pos_count / val_total_count
+        val_neg_proportion = val_neg_count / val_total_count
+
+        # Assert that the proportions are similar to the original dataset
+        self.assertAlmostEqual(
+            val_pos_proportion,
+            original_pos_proportion,
+            places=1,
+            msg="Validation set labels should maintain original positive label proportion.",
+        )
+        self.assertAlmostEqual(
+            val_neg_proportion,
+            original_neg_proportion,
+            places=1,
+            msg="Validation set labels should maintain original negative label proportion.",
+        )
+
     @staticmethod
     def get_positive_negative_labels_counts(df: pd.DataFrame) -> Tuple[int, int]:
         """

From 7fc96a939c839a2f080b9ef42f21847a5ea51a1f Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 5 Oct 2024 19:12:42 +0200
Subject: [PATCH 065/112] set weights_only parameter of torch.load to False

- #48
---
 chebai/models/electra.py                           |  8 ++++++--
 chebai/preprocessing/datasets/base.py              | 14 ++++++++++----
 chebai/preprocessing/datasets/chebi.py             |  7 +++++--
 chebai/preprocessing/datasets/go_uniprot.py        |  4 +++-
 chebai/preprocessing/datasets/pubchem.py           |  4 ++--
 .../migration/chebi_data_migration.py              |  2 +-
 chebai/result/analyse_sem.py                       |  4 +++-
 chebai/result/base.py                              |  2 +-
 chebai/result/pretraining.py                       |  2 +-
 chebai/result/utils.py                             |  2 ++
 tests/testCustomBalancedAccuracyMetric.py          |  6 +++++-
 tests/testCustomMacroF1Metric.py                   |  6 +++++-
 tests/testPubChemData.py                           | 12 +++++++++---
 tests/testTox21MolNetData.py                       | 12 +++++++++---
 tutorials/demo_process_results.ipynb               | 10 +++++-----
 tutorials/process_results_old_chebi.ipynb          |  2 +-
 16 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/chebai/models/electra.py b/chebai/models/electra.py
index 3b2807c8..7009406d 100644
--- a/chebai/models/electra.py
+++ b/chebai/models/electra.py
@@ -256,7 +256,9 @@ def __init__(
         # Load pretrained checkpoint if provided
         if pretrained_checkpoint:
             with open(pretrained_checkpoint, "rb") as fin:
-                model_dict = torch.load(fin, map_location=self.device)
+                model_dict = torch.load(
+                    fin, map_location=self.device, weights_only=False
+                )
                 if load_prefix:
                     state_dict = filter_dict(model_dict["state_dict"], load_prefix)
                 else:
@@ -414,7 +416,9 @@ def __init__(self, cone_dimensions=20, **kwargs):
         model_prefix = kwargs.get("load_prefix", None)
         if pretrained_checkpoint:
             with open(pretrained_checkpoint, "rb") as fin:
-                model_dict = torch.load(fin, map_location=self.device)
+                model_dict = torch.load(
+                    fin, map_location=self.device, weights_only=False
+                )
                 if model_prefix:
                     state_dict = {
                         str(k)[len(model_prefix) :]: v
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
index a2997699..f163a9e6 100644
--- a/chebai/preprocessing/datasets/base.py
+++ b/chebai/preprocessing/datasets/base.py
@@ -200,7 +200,9 @@ def load_processed_data(
                     filename = self.processed_file_names_dict[kind]
             except NotImplementedError:
                 filename = f"{kind}.pt"
-        return torch.load(os.path.join(self.processed_dir, filename))
+        return torch.load(
+            os.path.join(self.processed_dir, filename), weights_only=False
+        )
 
     def dataloader(self, kind: str, **kwargs) -> DataLoader:
         """
@@ -519,7 +521,7 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
             DataLoader: DataLoader object for the specified subset.
         """
         subdatasets = [
-            torch.load(os.path.join(s.processed_dir, f"{kind}.pt"))
+            torch.load(os.path.join(s.processed_dir, f"{kind}.pt"), weights_only=False)
             for s in self.subsets
         ]
         dataset = [
@@ -1022,7 +1024,9 @@ def _retrieve_splits_from_csv(self) -> None:
         splits_df = pd.read_csv(self.splits_file_path)
 
         filename = self.processed_file_names_dict["data"]
-        data = torch.load(os.path.join(self.processed_dir, filename))
+        data = torch.load(
+            os.path.join(self.processed_dir, filename), weights_only=False
+        )
         df_data = pd.DataFrame(data)
 
         train_ids = splits_df[splits_df["split"] == "train"]["id"]
@@ -1081,7 +1085,9 @@ def load_processed_data(
 
         # If filename is provided
         try:
-            return torch.load(os.path.join(self.processed_dir, filename))
+            return torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
         except FileNotFoundError:
             raise FileNotFoundError(f"File {filename} doesn't exist")
 
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 727f9f64..9d80929a 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -407,7 +407,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         try:
             filename = self.processed_file_names_dict["data"]
-            data_chebi_version = torch.load(os.path.join(self.processed_dir, filename))
+            data_chebi_version = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
         except FileNotFoundError:
             raise FileNotFoundError(
                 f"File data.pt doesn't exists. "
@@ -428,7 +430,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
                 data_chebi_train_version = torch.load(
                     os.path.join(
                         self._chebi_version_train_obj.processed_dir, filename_train
-                    )
+                    ),
+                    weights_only=False,
                 )
             except FileNotFoundError:
                 raise FileNotFoundError(
diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index 574ecdbd..dba9940e 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -508,7 +508,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         try:
             filename = self.processed_file_names_dict["data"]
-            data_go = torch.load(os.path.join(self.processed_dir, filename))
+            data_go = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
         except FileNotFoundError:
             raise FileNotFoundError(
                 f"File data.pt doesn't exists. "
diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index 5ba76cc4..c82ea42f 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -891,10 +891,10 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
             DataLoader: DataLoader instance.
         """
         labeled_data = torch.load(
-            os.path.join(self.labeled.processed_dir, f"{kind}.pt")
+            os.path.join(self.labeled.processed_dir, f"{kind}.pt"), weights_only=False
         )
         unlabeled_data = torch.load(
-            os.path.join(self.unlabeled.processed_dir, f"{kind}.pt")
+            os.path.join(self.unlabeled.processed_dir, f"{kind}.pt"), weights_only=False
         )
         if self.data_limit is not None:
             labeled_data = labeled_data[: self.data_limit]
diff --git a/chebai/preprocessing/migration/chebi_data_migration.py b/chebai/preprocessing/migration/chebi_data_migration.py
index 5a438b44..a057326a 100644
--- a/chebai/preprocessing/migration/chebi_data_migration.py
+++ b/chebai/preprocessing/migration/chebi_data_migration.py
@@ -168,7 +168,7 @@ def _combine_pt_splits(
         df_list: List[pd.DataFrame] = []
         for split, file_name in old_splits_file_names.items():
             file_path = os.path.join(old_dir, file_name)
-            file_df = pd.DataFrame(torch.load(file_path))
+            file_df = pd.DataFrame(torch.load(file_path, weights_only=False))
             df_list.append(file_df)
 
         return pd.concat(df_list, ignore_index=True)
diff --git a/chebai/result/analyse_sem.py b/chebai/result/analyse_sem.py
index 64ac87a1..6adb1066 100644
--- a/chebai/result/analyse_sem.py
+++ b/chebai/result/analyse_sem.py
@@ -427,7 +427,9 @@ def run_all(
                     os.path.join(buffer_dir_smoothed, "preds000.pt")
                 ):
                     preds = torch.load(
-                        os.path.join(buffer_dir_smoothed, "preds000.pt"), DEVICE
+                        os.path.join(buffer_dir_smoothed, "preds000.pt"),
+                        DEVICE,
+                        weights_only=False,
                     )
                     labels = None
                 else:
diff --git a/chebai/result/base.py b/chebai/result/base.py
index 487be6ac..9d583a00 100644
--- a/chebai/result/base.py
+++ b/chebai/result/base.py
@@ -54,7 +54,7 @@ def _generate_predictions(self, data_path, raw=False, **kwargs):
         else:
             data_tuples = [
                 (x.get("raw_features", x["ident"]), x["ident"], x)
-                for x in torch.load(data_path)
+                for x in torch.load(data_path, weights_only=False)
             ]
 
         for raw_features, ident, row in tqdm.tqdm(data_tuples):
diff --git a/chebai/result/pretraining.py b/chebai/result/pretraining.py
index 33c212c8..8d712f21 100644
--- a/chebai/result/pretraining.py
+++ b/chebai/result/pretraining.py
@@ -34,7 +34,7 @@ def evaluate_model(logs_base_path, model_filename, data_module):
     collate = data_module.reader.COLLATOR()
     test_file = "test.pt"
     data_path = os.path.join(data_module.processed_dir, test_file)
-    data_list = torch.load(data_path)
+    data_list = torch.load(data_path, weights_only=False)
     preds_list = []
     labels_list = []
 
diff --git a/chebai/result/utils.py b/chebai/result/utils.py
index 31063747..d015bd80 100644
--- a/chebai/result/utils.py
+++ b/chebai/result/utils.py
@@ -182,6 +182,7 @@ def load_results_from_buffer(
             torch.load(
                 os.path.join(buffer_dir, filename),
                 map_location=torch.device(device),
+                weights_only=False,
             )
         )
         i += 1
@@ -194,6 +195,7 @@ def load_results_from_buffer(
             torch.load(
                 os.path.join(buffer_dir, filename),
                 map_location=torch.device(device),
+                weights_only=False,
             )
         )
         i += 1
diff --git a/tests/testCustomBalancedAccuracyMetric.py b/tests/testCustomBalancedAccuracyMetric.py
index 30cbe1d5..033227df 100644
--- a/tests/testCustomBalancedAccuracyMetric.py
+++ b/tests/testCustomBalancedAccuracyMetric.py
@@ -49,7 +49,9 @@ def test_metric_against_realistic_data(self) -> None:
 
         # load single file to get the num of labels for metric class instantiation
         labels = torch.load(
-            f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device)
+            f"{directory_path}/labels{0:03d}.pt",
+            map_location=torch.device(self.device),
+            weights_only=False,
         )
         num_labels = labels.shape[1]
         balanced_acc_custom = BalancedAccuracy(num_labels=num_labels)
@@ -58,10 +60,12 @@ def test_metric_against_realistic_data(self) -> None:
             labels = torch.load(
                 f"{directory_path}/labels{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             preds = torch.load(
                 f"{directory_path}/preds{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             balanced_acc_custom.update(preds, labels)
 
diff --git a/tests/testCustomMacroF1Metric.py b/tests/testCustomMacroF1Metric.py
index a7bbbaa2..1c67d54b 100644
--- a/tests/testCustomMacroF1Metric.py
+++ b/tests/testCustomMacroF1Metric.py
@@ -119,7 +119,9 @@ def test_metric_against_realistic_data(self) -> None:
 
         # Load single file to get the number of labels for metric class instantiation
         labels = torch.load(
-            f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device)
+            f"{directory_path}/labels{0:03d}.pt",
+            map_location=torch.device(self.device),
+            weights_only=False,
         )
         num_labels = labels.shape[1]
         macro_f1_custom = MacroF1(num_labels=num_labels)
@@ -130,10 +132,12 @@ def test_metric_against_realistic_data(self) -> None:
             labels = torch.load(
                 f"{directory_path}/labels{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             preds = torch.load(
                 f"{directory_path}/preds{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             macro_f1_standard.update(preds, labels)
             macro_f1_custom.update(preds, labels)
diff --git a/tests/testPubChemData.py b/tests/testPubChemData.py
index dfc43028..71591f6e 100644
--- a/tests/testPubChemData.py
+++ b/tests/testPubChemData.py
@@ -37,9 +37,15 @@ def getDataSplitsOverlaps(cls) -> None:
         processed_path = os.path.join(os.getcwd(), cls.pubChem.processed_dir)
         print(f"Checking Data from - {processed_path}")
 
-        train_set = torch.load(os.path.join(processed_path, "train.pt"))
-        val_set = torch.load(os.path.join(processed_path, "validation.pt"))
-        test_set = torch.load(os.path.join(processed_path, "test.pt"))
+        train_set = torch.load(
+            os.path.join(processed_path, "train.pt"), weights_only=False
+        )
+        val_set = torch.load(
+            os.path.join(processed_path, "validation.pt"), weights_only=False
+        )
+        test_set = torch.load(
+            os.path.join(processed_path, "test.pt"), weights_only=False
+        )
 
         train_smiles, train_smiles_ids = cls.get_features_ids(train_set)
         val_smiles, val_smiles_ids = cls.get_features_ids(val_set)
diff --git a/tests/testTox21MolNetData.py b/tests/testTox21MolNetData.py
index 99424e83..36fcb431 100644
--- a/tests/testTox21MolNetData.py
+++ b/tests/testTox21MolNetData.py
@@ -37,9 +37,15 @@ def getDataSplitsOverlaps(cls) -> None:
         processed_path = os.path.join(os.getcwd(), cls.tox21.processed_dir)
         print(f"Checking Data from - {processed_path}")
 
-        train_set = torch.load(os.path.join(processed_path, "train.pt"))
-        val_set = torch.load(os.path.join(processed_path, "validation.pt"))
-        test_set = torch.load(os.path.join(processed_path, "test.pt"))
+        train_set = torch.load(
+            os.path.join(processed_path, "train.pt"), weights_only=False
+        )
+        val_set = torch.load(
+            os.path.join(processed_path, "validation.pt"), weights_only=False
+        )
+        test_set = torch.load(
+            os.path.join(processed_path, "test.pt"), weights_only=False
+        )
 
         train_smiles, train_smiles_ids = cls.get_features_ids(train_set)
         val_smiles, val_smiles_ids = cls.get_features_ids(val_set)
diff --git a/tutorials/demo_process_results.ipynb b/tutorials/demo_process_results.ipynb
index ee0c1ec9..bf7810cc 100644
--- a/tutorials/demo_process_results.ipynb
+++ b/tutorials/demo_process_results.ipynb
@@ -248,9 +248,9 @@
     "# check if pretraining datasets overlap\n",
     "dm = PubChemDeepSMILES()\n",
     "processed_path = dm.processed_dir\n",
-    "test_set = torch.load(os.path.join(processed_path, \"test.pt\"))\n",
-    "val_set = torch.load(os.path.join(processed_path, \"validation.pt\"))\n",
-    "train_set = torch.load(os.path.join(processed_path, \"train.pt\"))\n",
+    "test_set = torch.load(os.path.join(processed_path, \"test.pt\"), weights_only=False)\n",
+    "val_set = torch.load(os.path.join(processed_path, \"validation.pt\"), weights_only=False)\n",
+    "train_set = torch.load(os.path.join(processed_path, \"train.pt\"), weights_only=False)\n",
     "print(processed_path)\n",
     "test_smiles = [entry[\"features\"] for entry in test_set]\n",
     "val_smiles = [entry[\"features\"] for entry in val_set]\n",
@@ -320,7 +320,7 @@
     "data_module_v200 = ChEBIOver100()\n",
     "data_module_v148 = ChEBIOver100(chebi_version_train=148)\n",
     "data_module_v227 = ChEBIOver100(chebi_version_train=227)\n",
-    "# dataset = torch.load(data_path)\n",
+    "# dataset = torch.load(data_path, weights_only=False)\n",
     "# processors = [CustomResultsProcessor()]\n",
     "# factory = ResultFactory(model, data_module, processors)\n",
     "# factory.execute(data_path)"
@@ -653,7 +653,7 @@
     "    if test_file is None:\n",
     "        test_file = data_module.processed_file_names_dict[\"test\"]\n",
     "    data_path = os.path.join(data_module.processed_dir, test_file)\n",
-    "    data_list = torch.load(data_path)\n",
+    "    data_list = torch.load(data_path, weights_only=False)\n",
     "    preds_list = []\n",
     "    labels_list = []\n",
     "    # if common_classes_mask is not N\n",
diff --git a/tutorials/process_results_old_chebi.ipynb b/tutorials/process_results_old_chebi.ipynb
index c8af0860..e72baf4c 100644
--- a/tutorials/process_results_old_chebi.ipynb
+++ b/tutorials/process_results_old_chebi.ipynb
@@ -167,7 +167,7 @@
     "    if test_file is None:\n",
     "        test_file = data_module.processed_file_names_dict[\"test\"]\n",
     "    data_path = os.path.join(data_module.processed_dir, test_file)\n",
-    "    data_list = torch.load(data_path)\n",
+    "    data_list = torch.load(data_path, weights_only=False)\n",
     "    preds_list = []\n",
     "    labels_list = []\n",
     "\n",

From 242db56e2331a84a569f154a9215961ca210ad78 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 5 Oct 2024 23:48:13 +0200
Subject: [PATCH 066/112] re-order section 3 and 4 as per suggestion

---
 tutorials/data_exploration_chebi.ipynb | 365 +++++++++++++++----------
 1 file changed, 216 insertions(+), 149 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 6a7e25ed..87818cba 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -1,8 +1,9 @@
 {
  "cells": [
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b",
+   "metadata": {},
    "source": [
     "# Introduction\n",
     "\n",
@@ -14,40 +15,47 @@
     "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n",
     "\n",
     "---\n"
-   ],
-   "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "4550d01fc7af5ae4",
+   "metadata": {},
    "source": [
     "# 1. Instantiation of a Data Class\n",
     "\n",
     "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data."
-   ],
-   "id": "4550d01fc7af5ae4"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
+   "execution_count": 1,
+   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22",
+   "metadata": {},
    "outputs": [],
-   "execution_count": 18,
-   "source": "from chebai.preprocessing.datasets.chebi import ChEBIOver50",
-   "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22"
+   "source": [
+    "from chebai.preprocessing.datasets.chebi import ChEBIOver50"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 2,
    "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:07:26.371796Z",
+     "start_time": "2024-10-05T21:07:26.058728Z"
+    }
+   },
    "outputs": [],
    "source": [
     "chebi_class = ChEBIOver50(chebi_version=231)"
    ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d",
+   "metadata": {},
    "source": [
     "\n",
     "### Inheritance Hierarchy\n",
@@ -73,12 +81,12 @@
     "### Additional Input Parameters\n",
     "\n",
     "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n"
-   ],
-   "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
+   "metadata": {},
    "source": [
     "# Available ChEBI Data Classes\n",
     "\n",
@@ -91,8 +99,7 @@
     "\n",
     "Finally, `ChEBIOver50Partial` selects extracts a part of ChEBI based on a given top class, with a threshold of 50 for selecting labels.\n",
     "This class inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n"
-   ],
-   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -103,25 +110,25 @@
    ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "ed973fb59df11849",
+   "metadata": {},
    "source": [
     "# 2. Preparation / Setup Methods\n",
     "\n",
     "Now we have a ChEBI data class with all the relevant parameters. Next, we need to generate the actual dataset."
-   ],
-   "id": "ed973fb59df11849"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
-   "outputs": [],
    "execution_count": null,
+   "id": "d0a58e2bd9c0e6d9",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "chebi_class.prepare_data()\n",
     "chebi_class.setup()"
-   ],
-   "id": "d0a58e2bd9c0e6d9"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -163,37 +170,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e",
+   "id": "bb6e9a81554368f7",
    "metadata": {},
    "source": [
-    "# 3. Different Data Files Created and their Structure\n",
-    "\n",
-    "\n",
-    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n",
-    "\n",
-    "### Data Files\n",
-    "\n",
-    "1. **`Raw Data Files`**: (e.g., `.obo` file)\n",
-    "   - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
-    "\n",
-    "2. **`data.pkl`**\n",
-    "   - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
-    "\n",
-    "3. **`data.pt`**\n",
-    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
-    "\n",
-    "4. **`classes.txt`**\n",
-    "   - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n",
-    "\n",
-    "5. **`splits.csv`**\n",
-    "   - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
-    "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n",
-    "\n",
-    "### File Structure and Preprocessing Stages\n",
+    "# 3. Overview of the 3 preprocessing stages\n",
     "\n",
     "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
     "\n",
@@ -214,34 +194,28 @@
     "   - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
     "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
     "\n",
-    "### Data Splits\n",
-    "\n",
-    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
-    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n",
-    "\n",
     "### Summary of File Paths\n",
     "\n",
     "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n",
     "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n",
     "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n",
     "\n",
-    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757",
-   "metadata": {},
-   "source": [
-    "---"
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\n",
+    "\n",
+    "### Data Splits\n",
+    "\n",
+    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
+    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "74adb549-9e02-472d-a535-78a584853b52",
+   "id": "7e172c0d1e8bb93f",
    "metadata": {},
    "source": [
-    "# 4. Information Stored in the Files\n"
+    "# 4. Data Files and their structure\n",
+    "\n",
+    "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n"
    ]
   },
   {
@@ -249,13 +223,10 @@
    "id": "43329709-5134-4ce5-88e7-edd2176bf84d",
    "metadata": {},
    "source": [
-    "## chebi.obo\n",
+    "## <u>chebi.obo</u> File\n",
     "\n",
-    "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n",
-    "\n",
-    "### Structure of `chebi.obo`\n",
-    "\n",
-    "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n",
+    "**Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n",
+    "   \n",
     "\n",
     "#### Example of a Term Document\n",
     "\n",
@@ -269,6 +240,14 @@
     "is_a: CHEBI:33238\n",
     "```\n",
     "\n",
+    "**File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n",
+    "\n",
+    "\n",
+    "### Structure of `chebi.obo`\n",
+    "\n",
+    "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n",
+    "\n",
+    "\n",
     "### Breakdown of Attributes\n",
     "\n",
     "Each term document in the `chebi.obo` file consists of the following key attributes:\n",
@@ -291,46 +270,46 @@
   },
   {
    "cell_type": "markdown",
-   "id": "322bc926-69ff-4b93-9e95-5e8b85869c38",
+   "id": "558295e5a7ded456",
    "metadata": {},
    "source": [
-    "## `data.pkl` File\n",
-    "\n",
-    "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n",
-    "\n",
-    "\n",
-    "\n",
-    "### Structure of `data.pkl`\n",
-    "`data.pkl` as following structure: \n",
-    "- **Column 0**: Contains the ID of each ChEBI data instance.\n",
-    "- **Column 1**: Contains the name of each ChEBI data instance.\n",
-    "- **Column 2**: Contains the SMILES representation of the chemical.\n",
-    "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n",
+    "## <u>data.pkl</u> File\n",
     "\n",
-    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
+    "**Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 6,
    "id": "fd490270-59b8-4c1c-8b09-204defddf592",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:09:01.622317Z",
+     "start_time": "2024-10-05T21:09:01.606698Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "import pandas as pd"
+    "import pandas as pd\n",
+    "import os"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 10,
    "id": "d7d16247-092c-4e8d-96c2-ab23931cf766",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:11:51.296162Z",
+     "start_time": "2024-10-05T21:11:44.559304Z"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Size of the data (rows x columns):  (129184, 1335)\n"
+      "Size of the data (rows x columns):  (185007, 1514)\n"
      ]
     },
     {
@@ -358,23 +337,23 @@
        "      <th>name</th>\n",
        "      <th>SMILES</th>\n",
        "      <th>1722</th>\n",
+       "      <th>2440</th>\n",
        "      <th>2468</th>\n",
        "      <th>2571</th>\n",
        "      <th>2580</th>\n",
        "      <th>2634</th>\n",
        "      <th>3098</th>\n",
-       "      <th>3992</th>\n",
        "      <th>...</th>\n",
-       "      <th>143017</th>\n",
-       "      <th>143212</th>\n",
-       "      <th>143813</th>\n",
-       "      <th>146180</th>\n",
-       "      <th>147334</th>\n",
-       "      <th>156473</th>\n",
-       "      <th>166828</th>\n",
-       "      <th>166904</th>\n",
-       "      <th>167497</th>\n",
-       "      <th>167559</th>\n",
+       "      <th>176910</th>\n",
+       "      <th>177333</th>\n",
+       "      <th>183508</th>\n",
+       "      <th>183509</th>\n",
+       "      <th>189832</th>\n",
+       "      <th>189840</th>\n",
+       "      <th>192499</th>\n",
+       "      <th>194321</th>\n",
+       "      <th>197504</th>\n",
+       "      <th>229684</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -500,73 +479,91 @@
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>5 rows × 1335 columns</p>\n",
+       "<p>5 rows × 1514 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "      id                  name      SMILES   1722   2468   2571   2580   2634  \\\n",
+       "      id                  name      SMILES   1722   2440   2468   2571   2580  \\\n",
        "0  33429  monoatomic monoanion        [*-]  False  False  False  False  False   \n",
        "1  30151         aluminide(1-)       [Al-]  False  False  False  False  False   \n",
        "2  16042          halide anion        [*-]  False  False  False  False  False   \n",
        "3  17051              fluoride        [F-]  False  False  False  False  False   \n",
        "4  28741       sodium fluoride  [F-].[Na+]  False  False  False  False  False   \n",
        "\n",
-       "    3098   3992  ...  143017  143212  143813  146180  147334  156473  166828  \\\n",
+       "    2634   3098  ...  176910  177333  183508  183509  189832  189840  192499  \\\n",
        "0  False  False  ...   False   False   False   False   False   False   False   \n",
        "1  False  False  ...   False   False   False   False   False   False   False   \n",
        "2  False  False  ...   False   False   False   False   False   False   False   \n",
        "3  False  False  ...   False   False   False   False   False   False   False   \n",
        "4  False  False  ...   False   False   False   False   False   False   False   \n",
        "\n",
-       "   166904  167497  167559  \n",
+       "   194321  197504  229684  \n",
        "0   False   False   False  \n",
        "1   False   False   False  \n",
        "2   False   False   False  \n",
        "3   False   False   False  \n",
        "4   False   False   False  \n",
        "\n",
-       "[5 rows x 1335 columns]"
+       "[5 rows x 1514 columns]"
       ]
      },
-     "execution_count": 53,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n",
+    "pkl_df = pd.DataFrame(\n",
+    "    pd.read_pickle(\n",
+    "        os.path.join(\n",
+    "            chebi_class.processed_dir_main,\n",
+    "            chebi_class.processed_dir_main_file_names_dict[\"data\"],\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
     "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
     "pkl_df.head()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07",
+   "id": "322bc926-69ff-4b93-9e95-5e8b85869c38",
    "metadata": {},
    "source": [
-    "## `data.pt` File\n",
+    "**File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n",
     "\n",
-    "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n",
     "\n",
-    "### Structure of `data.pt`\n",
-    "\n",
-    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
+    "### Structure of `data.pkl`\n",
+    "`data.pkl` has the following structure: \n",
+    "- **Column 0**: Contains the ID of each ChEBI data instance.\n",
+    "- **Column 1**: Contains the name of each ChEBI data instance.\n",
+    "- **Column 2**: Contains the SMILES representation of the chemical.\n",
+    "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n",
     "\n",
-    "- **`features`**: \n",
-    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba019d2d4324bd0b",
+   "metadata": {},
+   "source": [
+    "## <u>data.pt</u> File\n",
     "\n",
-    "- **`labels`**: \n",
-    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
     "\n",
-    "- **`ident`**: \n",
-    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+    "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library, specifically as a list of dictionaries. Each dictionary in this list includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 11,
    "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:12:49.338943Z",
+     "start_time": "2024-10-05T21:12:49.323319Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch"
@@ -574,9 +571,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 13,
    "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:14:12.892845Z",
+     "start_time": "2024-10-05T21:13:59.859953Z"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -587,15 +589,25 @@
     }
    ],
    "source": [
-    "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n",
+    "data_pt = torch.load(\n",
+    "    os.path.join(\n",
+    "        chebi_class.processed_dir, chebi_class.processed_file_names_dict[\"data\"]\n",
+    "    ),\n",
+    "    weights_only=False,\n",
+    ")\n",
     "print(\"Type of loaded data:\", type(data_pt))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 15,
    "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:14:21.185027Z",
+     "start_time": "2024-10-05T21:14:21.169358Z"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -616,36 +628,61 @@
   },
   {
    "cell_type": "markdown",
-   "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
+   "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07",
    "metadata": {},
    "source": [
-    "## `classes.txt` File\n",
+    "**File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
     "\n",
-    "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n",
     "\n",
-    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
+    "### Structure of `data.pt`\n",
+    "\n",
+    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
+    "\n",
+    "- **`features`**: \n",
+    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "\n",
+    "- **`labels`**: \n",
+    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "186ec6f0eed6ecf7",
+   "metadata": {},
+   "source": [
+    "## <u>classes.txt</u> File\n",
+    "\n",
+    "**Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 16,
    "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:15:19.146285Z",
+     "start_time": "2024-10-05T21:15:18.503284Z"
+    }
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "1722\n",
+      "2440\n",
       "2468\n",
       "2571\n",
-      "2580\n",
-      "2634\n"
+      "2580\n"
      ]
     }
    ],
    "source": [
-    "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n",
+    "with open(os.path.join(chebi_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n",
     "    for i in range(5):\n",
     "        line = file.readline()\n",
     "        print(line.strip())"
@@ -653,19 +690,37 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b058714f-e434-4367-89b9-74c129ac727f",
+   "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
    "metadata": {},
    "source": [
-    "## `splits.csv` File\n",
     "\n",
-    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n"
+    "**File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n",
+    "\n",
+    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fb72be449e52b63f",
+   "metadata": {},
+   "source": [
+    "## <u>splits.csv</u> File\n",
+    "\n",
+    "**Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 98,
+   "execution_count": 17,
    "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:15:54.575116Z",
+     "start_time": "2024-10-05T21:15:53.945139Z"
+    }
+   },
    "outputs": [
     {
      "data": {
@@ -731,16 +786,28 @@
        "4  30340  train"
       ]
      },
-     "execution_count": 98,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n",
+    "csv_df = pd.read_csv(os.path.join(chebi_class.processed_dir_main, \"splits.csv\"))\n",
     "csv_df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b058714f-e434-4367-89b9-74c129ac727f",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "**File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n",
+    "\n",
+    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",

From 748eebedc354f64c84932d3d722a4766e41edae5 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 6 Oct 2024 12:02:21 +0200
Subject: [PATCH 067/112] GO: reformat section 3 and 4 as per suggestion

---
 tutorials/data_exploration_go.ipynb | 625 ++++++++++++++++------------
 1 file changed, 364 insertions(+), 261 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 8dc4cb44..e60e972b 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -1,26 +1,67 @@
 {
  "cells": [
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "da687d32ba48b188",
+   "metadata": {},
    "source": [
     "# Introduction\n",
     "\n",
-    "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on Gene Ontology (GO) and Swiss UniProt Protein data. This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n",
+    "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the GO-UniProt dataset. It covers\n",
+    "- how to instantiate a data class and generate data\n",
+    "- how the data is processed and stored\n",
+    "- and how to work with different molecule encodings.\n",
     "\n",
-    "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n",
+    "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n",
     "\n",
-    "---"
-   ],
-   "id": "da687d32ba48b188"
+    "---\n"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "0bd07c91-bb02-48d4-b759-aa35ecb224bd",
+   "metadata": {},
    "source": [
     "# 1. Instantiation of a Data Class\n",
     "\n",
-    "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n",
+    "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "440f203ceaf7e4b7",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:25:03.920610Z",
+     "start_time": "2024-09-30T21:25:03.622407Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "a648346d81d0dc5e",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:25:08.863132Z",
+     "start_time": "2024-09-30T21:25:08.387739Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "go_class = GOUniProtOver250(go_branch=\"BP\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64585012b0d7f66f",
+   "metadata": {},
+   "source": [
     "### Inheritance Hierarchy\n",
     "\n",
     "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n",
@@ -37,6 +78,11 @@
     "Data classes related to proteins can be configured using the following main parameters:\n",
     "\n",
     "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n",
+    "  - **`\"BP\"`**: Biological Process branch.\n",
+    "  - **`\"MF\"`**: Molecular Function branch.\n",
+    "  - **`\"CC\"`**: Cellular Component branch.\n",
+    "\n",
+    "This allows for more specific datasets focused on a particular aspect of gene function.\n",
     "\n",
     "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
     "\n",
@@ -44,69 +90,52 @@
     "\n",
     "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n",
     "\n",
-    "### Available GOUniProt Data Classes\n",
-    "\n",
-    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n",
-    "\n",
-    "#### `GOUniProtOver250`\n",
     "\n",
-    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n",
+    "# Available GOUniProt Data Classes\n",
     "\n",
-    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n",
+    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n",
     "\n",
-    "#### `GOUniProtOver50`\n",
+    "There is a range of available GOUniProt dataset classes. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. The selection process is based on the annotations of the GO terms, together with their ancestors, across the dataset.\n",
     "\n",
-    "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n",
+    "Refer to the `select_classes` method of `_GOUniProtOverX` for more details on the selection process.\n",
     "\n",
-    "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n"
-   ],
-   "id": "64585012b0d7f66f"
+    "If you need a different threshold, you can create your own subclass."
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "### Instantiation Example",
-   "id": "605bbca601037df2"
+   "id": "651ab5c39833bd2c",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-09-30T21:25:03.920610Z",
-     "start_time": "2024-09-30T21:25:03.622407Z"
-    }
-   },
-   "cell_type": "code",
-   "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250",
-   "id": "440f203ceaf7e4b7",
-   "outputs": [],
-   "execution_count": 12
+   "cell_type": "markdown",
+   "id": "a52b4363-7398-44aa-a4cc-8bba14bdd966",
+   "metadata": {},
+   "source": [
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is to generate the actual dataset."
+   ]
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-09-30T21:25:08.863132Z",
-     "start_time": "2024-09-30T21:25:08.387739Z"
-    }
-   },
    "cell_type": "code",
-   "source": "go_class = GOUniProtOver250()",
-   "id": "a648346d81d0dc5e",
+   "execution_count": null,
+   "id": "9f77351090560bc4",
+   "metadata": {},
    "outputs": [],
-   "execution_count": 13
+   "source": [
+    "go_class.prepare_data()\n",
+    "go_class.setup()"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "---",
-   "id": "651ab5c39833bd2c"
-  },
-  {
+   "id": "2328e824c4dafb2d",
    "metadata": {},
-   "cell_type": "markdown",
    "source": [
-    "# 2. Preparation / Setup Methods\n",
-    "\n",
-    "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n",
     "### Automatic Execution: \n",
     "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
     "\n",
@@ -130,81 +159,86 @@
     "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
     "\n",
     "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
-   ],
-   "id": "2328e824c4dafb2d"
+   ]
   },
   {
+   "cell_type": "markdown",
+   "id": "db5b58f2d96823fc",
    "metadata": {},
-   "cell_type": "code",
    "source": [
-    "go_class.prepare_data()\n",
-    "go_class.setup()"
-   ],
-   "id": "9f77351090560bc4",
-   "outputs": [],
-   "execution_count": null
+    "---"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "---",
-   "id": "db5b58f2d96823fc"
-  },
-  {
+   "id": "ee174b61b36c71aa",
    "metadata": {},
-   "cell_type": "markdown",
    "source": [
-    "# 3. GOUniProt Data File Structure\n",
+    "# 3. Overview of the 3 preprocessing stages\n",
     "\n",
-    "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n",
-    "   - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n",
+    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
+    "\n",
+    "1. **Raw Data Stage**:\n",
+    "   - **File**: `go-basic.obo` and `uniprot_sprot.dat`\n",
+    "   - **Description**: This stage contains the raw GO ontology data and raw Swiss-UniProt data, serving as the initial input for further processing.\n",
     "   - **File Paths**:\n",
-    "     - `data/GO_UniProt/raw/${filename}.obo`\n",
-    "     - `data/GO_UniProt/raw/${filename}.dat`\n",
+    "     - `data/GO_UniProt/raw/go-basic.obo`\n",
+    "     - `data/GO_UniProt/raw/uniprot_sprot.dat`\n",
     "\n",
-    "2. **`data.pkl`**\n",
-    "   - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n",
+    "2. **Processed Data Stage 1**:\n",
+    "   - **File**: `data.pkl`\n",
+    "   - **Description**: This stage includes the data after initial processing. It contains sequence strings, class columns, and metadata but lacks data splits.\n",
     "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
+    "   - **Additional File**: `classes.txt` - A file listing the relevant GO classes.\n",
     "\n",
-    "3. **`data.pt`**\n",
-    "   - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n",
+    "3. **Processed Data Stage 2**:\n",
+    "   - **File**: `data.pt`\n",
+    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
     "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
     "\n",
-    "4. **`classes.txt`**\n",
-    "   - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
+    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n",
     "\n",
-    "5. **`splits.csv`**\n",
-    "   - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n",
+    "### Summary of File Paths\n",
     "\n",
-    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n"
-   ],
-   "id": "ee174b61b36c71aa"
+    "- **Raw Data**: `data/GO_UniProt/raw`\n",
+    "- **Processed Data 1**: `data/GO_UniProt/${dataset_name}/processed`\n",
+    "- **Processed Data 2**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}`\n",
+    "\n",
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\n",
+    "\n",
+    "### Data Splits\n",
+    "\n",
+    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
+    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "---",
-   "id": "a927ad484c930960"
+   "id": "a927ad484c930960",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "# 4. Information Stored in the Files",
-   "id": "3f92b58e460c08fd"
+   "id": "3f92b58e460c08fd",
+   "metadata": {},
+   "source": [
+    "# 4. Data Files and their structure\n",
+    "\n",
+    "`chebai` creates and manages several data files during its operation. These files store various protein and ontology data and metadata essential for different tasks. Let’s explore these files and their content.\n"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "cca75d881cb8bade",
+   "metadata": {},
    "source": [
-    "## go-basic.obo\n",
+    "## <u>go-basic.obo</u> File\n",
     "\n",
-    "The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n",
-    "\n",
-    "### Structure of `go-basic.obo`\n",
-    "\n",
-    "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n",
+    "**Description**: The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n",
     "\n",
     "#### Example of a Term Document\n",
     "\n",
@@ -219,6 +253,14 @@
     "is_a: GO:0031506 ! cell wall glycoprotein biosynthetic process\n",
     "```\n",
     "\n",
+    "**File Path**: `data/GO_UniProt/raw/go-basic.obo`\n",
+    "\n",
+    "### Structure of `go-basic.obo`\n",
+    "\n",
+    "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n",
+    "\n",
+    "\n",
+    "\n",
     "### Breakdown of Attributes\n",
     "\n",
     "Each term document in the `go-basic.obo` file consists of the following key attributes:\n",
@@ -240,22 +282,18 @@
     "- **`is_a: GO:0006057`**: \n",
     "  - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n",
     "  - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n"
-   ],
-   "id": "cca75d881cb8bade"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "87c841de7d80beef",
+   "metadata": {},
    "source": [
-    "## uniprot_sprot.dat\n",
+    "## <u>uniprot_sprot.dat</u> File\n",
     "\n",
-    "The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n",
+    "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n",
     "\n",
     "\n",
-    "## Structure of `uniprot_sprot.dat`\n",
-    "\n",
-    "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n",
-    "\n",
     "### Example of a Protein Entry\n",
     "\n",
     "```plaintext\n",
@@ -302,6 +340,13 @@
     "//\n",
     "```\n",
     "\n",
+    "**File Path**: `data/GO_UniProt/raw/uniprot_sprot.dat`\n",
+    "\n",
+    "\n",
+    "### Structure of `uniprot_sprot.dat`\n",
+    "\n",
+    "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n",
+    "\n",
     "### Breakdown of Attributes\n",
     "\n",
     "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. Here's a breakdown of the key attributes:\n",
@@ -341,107 +386,56 @@
     "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n",
     "\n",
     "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n",
-    "\n"
-   ],
-   "id": "87c841de7d80beef"
+    "\n",
+    "__Note__: For more details on evidence codes check section 5.2"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "b7687078-f6b8-4fbf-afa7-dfda89061a5e",
+   "metadata": {},
    "source": [
-    "## data.pkl\n",
-    "\n",
-    "The `data.pkl` file, generated during the preprocessing stage, contains the processed GO data in a dataframe format. Below is an example of how this data is structured:\n",
-    "\n",
+    "## <u>data.pkl</u> File\n",
     "\n",
-    "\n",
-    "### Structure of `data.pkl`\n",
-    "`data.pkl` as following structure: \n",
-    "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n",
-    "- **Column 1**: Contains the accession of each Protein data instance.\n",
-    "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n",
-    "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n",
-    "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n",
-    "\n",
-    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
-   ],
-   "id": "735844f0b2474ad6"
+    "**Description**: This file is generated by the `prepare_data` method and contains the processed GO data in a dataframe format. It includes protein IDs, data representations (such as sequence strings), and class columns with boolean values."
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b4da7e73e251e1d1",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-09-30T14:08:33.990378Z",
      "start_time": "2024-09-30T14:08:33.959459Z"
     }
    },
-   "cell_type": "code",
-   "source": "import pandas as pd",
-   "id": "b4da7e73e251e1d1",
    "outputs": [],
-   "execution_count": 3
+   "source": [
+    "import pandas as pd\n",
+    "import os"
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b66fbb9b720d053c",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-09-30T14:10:12.796911Z",
      "start_time": "2024-09-30T14:10:06.052276Z"
     }
    },
-   "cell_type": "code",
-   "source": [
-    "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n",
-    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
-    "pkl_df.head()"
-   ],
-   "id": "b66fbb9b720d053c",
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Size of the data (rows x columns):  (27459, 1050)\n"
+      "Size of the data (rows x columns):  (32933, 1049)\n"
      ]
     },
     {
      "data": {
-      "text/plain": [
-       "       swiss_id             accession  \\\n",
-       "8   14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
-       "9   14331_CAEEL         P41932,Q21537   \n",
-       "10  14331_MAIZE                P49106   \n",
-       "13  14332_MAIZE                Q01526   \n",
-       "14  14333_ARATH  P42644,F4KBI7,Q945L2   \n",
-       "\n",
-       "                                               go_ids  \\\n",
-       "8                                             [19222]   \n",
-       "9   [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...   \n",
-       "10                         [3677, 5634, 10468, 44877]   \n",
-       "13                         [3677, 5634, 10468, 44877]   \n",
-       "14  [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...   \n",
-       "\n",
-       "                                             sequence     41     75    122  \\\n",
-       "8   MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
-       "9   MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
-       "10  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
-       "13  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
-       "14  MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...  False  False  False   \n",
-       "\n",
-       "      165    209    226  ...  2000145  2000146  2000147  2000241  2000243  \\\n",
-       "8   False  False  False  ...    False    False    False    False    False   \n",
-       "9   False  False  False  ...    False    False    False    False    False   \n",
-       "10  False  False  False  ...    False    False    False    False    False   \n",
-       "13  False  False  False  ...    False    False    False    False    False   \n",
-       "14  False  False  False  ...    False    False    False    False    False   \n",
-       "\n",
-       "    2000377  2001020  2001141  2001233  2001234  \n",
-       "8     False    False    False    False    False  \n",
-       "9     False    False    False    False    False  \n",
-       "10    False    False    False    False    False  \n",
-       "13    False    False    False    False    False  \n",
-       "14    False    False    False    False    False  \n",
-       "\n",
-       "[5 rows x 1050 columns]"
-      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -476,9 +470,9 @@
        "      <th>2000146</th>\n",
        "      <th>2000147</th>\n",
        "      <th>2000241</th>\n",
+       "      <th>2000242</th>\n",
        "      <th>2000243</th>\n",
        "      <th>2000377</th>\n",
-       "      <th>2001020</th>\n",
        "      <th>2001141</th>\n",
        "      <th>2001233</th>\n",
        "      <th>2001234</th>\n",
@@ -607,103 +601,179 @@
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>5 rows × 1050 columns</p>\n",
+       "<p>5 rows × 1049 columns</p>\n",
        "</div>"
+      ],
+      "text/plain": [
+       "       swiss_id             accession  \\\n",
+       "8   14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
+       "9   14331_CAEEL         P41932,Q21537   \n",
+       "10  14331_MAIZE                P49106   \n",
+       "13  14332_MAIZE                Q01526   \n",
+       "14  14333_ARATH  P42644,F4KBI7,Q945L2   \n",
+       "\n",
+       "                                               go_ids  \\\n",
+       "8                                             [19222]   \n",
+       "9   [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...   \n",
+       "10                         [3677, 5634, 10468, 44877]   \n",
+       "13                         [3677, 5634, 10468, 44877]   \n",
+       "14  [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...   \n",
+       "\n",
+       "                                             sequence     41     75    122  \\\n",
+       "8   MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
+       "9   MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
+       "10  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
+       "13  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
+       "14  MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...  False  False  False   \n",
+       "\n",
+       "      165    209    226  ...  2000145  2000146  2000147  2000241  2000242  \\\n",
+       "8   False  False  False  ...    False    False    False    False    False   \n",
+       "9   False  False  False  ...    False    False    False    False    False   \n",
+       "10  False  False  False  ...    False    False    False    False    False   \n",
+       "13  False  False  False  ...    False    False    False    False    False   \n",
+       "14  False  False  False  ...    False    False    False    False    False   \n",
+       "\n",
+       "    2000243  2000377  2001141  2001233  2001234  \n",
+       "8     False    False    False    False    False  \n",
+       "9     False    False    False    False    False  \n",
+       "10    False    False    False    False    False  \n",
+       "13    False    False    False    False    False  \n",
+       "14    False    False    False    False    False  \n",
+       "\n",
+       "[5 rows x 1049 columns]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 7
+   "source": [
+    "pkl_df = pd.DataFrame(\n",
+    "    pd.read_pickle(\n",
+    "        os.path.join(\n",
+    "            go_class.processed_dir_main,\n",
+    "            go_class.processed_dir_main_file_names_dict[\"data\"],\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n",
+    "pkl_df.head()"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "735844f0b2474ad6",
+   "metadata": {},
    "source": [
-    "## data.pt\n",
+    "**File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
     "\n",
-    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
     "\n",
-    "- **`features`**: \n",
-    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "### Structure of `data.pkl`\n",
+    "`data.pkl` has the following structure: \n",
+    "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n",
+    "- **Column 1**: Contains the accession of each Protein data instance.\n",
+    "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n",
+    "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n",
+    "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n",
     "\n",
-    "- **`labels`**: \n",
-    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c9b17f6-93bd-4cc3-8967-7ab1d2e06e51",
+   "metadata": {},
+   "source": [
+    "## <u>data.pt</u> File\n",
     "\n",
-    "- **`ident`**: \n",
-    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
-   ],
-   "id": "2c9f23883c66b48d"
+    "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input."
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "85b097601fb242d6",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-09-30T14:10:35.034002Z",
      "start_time": "2024-09-30T14:10:35.018342Z"
     }
    },
-   "cell_type": "code",
-   "source": "import torch",
-   "id": "85b097601fb242d6",
    "outputs": [],
-   "execution_count": 8
+   "source": [
+    "import torch"
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "289a54a71dec20fb",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-09-30T14:11:36.443693Z",
      "start_time": "2024-09-30T14:11:34.199285Z"
     }
    },
-   "cell_type": "code",
-   "source": [
-    "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n",
-    "print(\"Type of loaded data:\", type(data_pt))\n",
-    "for i in range(1):\n",
-    "    print(data_pt[i])"
-   ],
-   "id": "289a54a71dec20fb",
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Type of loaded data: <class 'list'>\n",
-      "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n"
+      "Content of the data file: \n",
+      " {'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n"
      ]
     }
    ],
-   "execution_count": 11
+   "source": [
+    "data_pt = torch.load(\n",
+    "    os.path.join(go_class.processed_dir, go_class.processed_file_names_dict[\"data\"]),\n",
+    "    weights_only=False,\n",
+    ")\n",
+    "print(\"Type of loaded data:\", type(data_pt))\n",
+    "print(\"Content of the data file: \\n\", data_pt[0])"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "2c9f23883c66b48d",
+   "metadata": {},
    "source": [
-    "## `classes.txt` File\n",
+    "**File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
     "\n",
-    "The `classes.txt` file lists selected Swiss Proteins classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique Swiss Protein class ID, identifying specific protein from Swiss-UniProt dataset.\n",
+    "The `data.pt` file is a list where each element is a dictionary with the following keys:\n",
     "\n",
-    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
-   ],
-   "id": "f69012b3540fd1b6"
+    "- **`features`**: \n",
+    "  - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "\n",
+    "- **`labels`**: \n",
+    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ]
   },
   {
+   "cell_type": "markdown",
+   "id": "36aed0b8-ab05-428d-8833-2a24deebacc3",
+   "metadata": {},
+   "source": [
+    "## <u>classes.txt</u> File\n",
+    "\n",
+    "**Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "19200f7ff9a6ebba",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-09-30T21:30:34.344202Z",
      "start_time": "2024-09-30T21:30:34.328318Z"
     }
    },
-   "cell_type": "code",
-   "source": [
-    "with open(r\"data/GO_UniProt/GO250_BP/processed/classes.txt\", \"r\") as file:\n",
-    "    for i in range(5):\n",
-    "        line = file.readline()\n",
-    "        print(line.strip())"
-   ],
-   "id": "19200f7ff9a6ebba",
    "outputs": [
     {
      "name": "stdout",
@@ -717,42 +787,48 @@
      ]
     }
    ],
-   "execution_count": 15
+   "source": [
+    "with open(os.path.join(go_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n",
+    "    for i in range(5):\n",
+    "        line = file.readline()\n",
+    "        print(line.strip())"
+   ]
   },
   {
+   "cell_type": "markdown",
+   "id": "f69012b3540fd1b6",
    "metadata": {},
+   "source": [
+    "**File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "The `classes.txt` file lists selected Swiss Proteins classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique Swiss Protein class ID, identifying a specific protein from the Swiss-UniProt dataset.\n",
+    "\n",
+    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks."
+   ]
+  },
+  {
    "cell_type": "markdown",
+   "id": "b81ea34f-cfa8-4ffa-8b88-b54ca96afd84",
+   "metadata": {},
    "source": [
-    "## `splits.csv` File\n",
+    "## <u>splits.csv</u> File\n",
     "\n",
-    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run."
-   ],
-   "id": "6661dc11247e9753"
+    "**Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`."
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "88c3ea8f01ba9fac",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-09-30T21:30:41.586616Z",
      "start_time": "2024-09-30T21:30:39.318598Z"
     }
    },
-   "cell_type": "code",
-   "source": [
-    "csv_df = pd.read_csv(r\"data/GO_UniProt/GO250_BP/processed/splits.csv\")\n",
-    "csv_df.head()"
-   ],
-   "id": "88c3ea8f01ba9fac",
    "outputs": [
     {
      "data": {
-      "text/plain": [
-       "            id  split\n",
-       "0  14331_ARATH  train\n",
-       "1  14331_CAEEL  train\n",
-       "2  14331_MAIZE  train\n",
-       "3  14332_MAIZE  train\n",
-       "4  14333_ARATH  train"
-      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -805,26 +881,50 @@
        "  </tbody>\n",
        "</table>\n",
        "</div>"
+      ],
+      "text/plain": [
+       "            id  split\n",
+       "0  14331_ARATH  train\n",
+       "1  14331_CAEEL  train\n",
+       "2  14331_MAIZE  train\n",
+       "3  14332_MAIZE  train\n",
+       "4  14333_ARATH  train"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 16
+   "source": [
+    "csv_df = pd.read_csv(os.path.join(go_class.processed_dir_main, \"splits.csv\"))\n",
+    "csv_df.head()"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "---",
-   "id": "e6b1f184a5091b83"
+   "id": "6661dc11247e9753",
+   "metadata": {},
+   "source": [
+    "**File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n",
+    "\n",
+    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs."
+   ]
   },
   {
+   "cell_type": "markdown",
+   "id": "e6b1f184a5091b83",
    "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
    "cell_type": "markdown",
+   "id": "481b8c0271ec9636",
+   "metadata": {},
    "source": [
-    "## Protein Representation Using Amino Acid Sequence Notation\n",
+    "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n",
     "\n",
     "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n",
     "\n",
@@ -873,20 +973,22 @@
     "\n",
     "\n",
     "_Note_:  Refer for amino acid sequence:  https://en.wikipedia.org/wiki/Protein_primary_structure"
-   ],
-   "id": "481b8c0271ec9636"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "---",
-   "id": "db6d7f2cc446e6f9"
+   "id": "db6d7f2cc446e6f9",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "7f42b928364e5cd1",
+   "metadata": {},
    "source": [
-    "## More on GO Evidence Codes\n",
+    "## 5.2 More on GO Evidence Codes\n",
     "\n",
     "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n",
     "\n",
@@ -928,33 +1030,34 @@
     "\n",
     "\n",
     "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation."
-   ],
-   "id": "7f42b928364e5cd1"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "---",
-   "id": "1c11d6f520b02434"
+   "id": "1c11d6f520b02434",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,

From 6911d8aa6ac7b37863c9f78d88b221c3a4dd5f80 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 6 Oct 2024 15:30:05 +0200
Subject: [PATCH 068/112] Chebi: reader class explanation

---
 tutorials/data_exploration_chebi.ipynb | 146 ++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 5 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 87818cba..64d0014a 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -823,25 +823,161 @@
    "source": [
     "# 5. Example Molecule: Different Encodings\n",
     "\n",
-    "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n",
+    "The `chebai` library supports various encodings for molecules, such as SMILES and SELFIES. In this section, we'll take the example of **benzene** (C₆H₆) and explore its different encodings.\n",
     "\n",
-    "### Explanation:\n",
+    "### Overview of Encodings:\n",
     "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
-    "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n",
+    "- **SELFIES (SELF-referencIng Embedded Strings)**: A robust encoding capable of representing a broader range of chemical structures.\n",
     "\n",
-    "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n",
+    "In `chebai`, encoding and tokenization are implemented through specific reader classes, mainly:\n",
+    "- **ChemDataReader**: For SMILES encoding.\n",
+    "- **SelfiesReader**: For SELFIES encoding.\n",
     "\n",
+    "There are other implementations too for different encodings, you can check out more in the below link.<br>\n",
+    "You can explore the implementation of these readers in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py).\n",
+    "\n",
+    "> **Note**: The library uses an `EMBEDDING_OFFSET` of 10 for encoding purposes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2fa606c5-4d8f-4ca0-89a6-d60f15afe297",
+   "metadata": {},
+   "source": [
     "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n",
     "   - **Benzene SMILES**: `c1ccccc1`\n",
     "   - **Explanation**: \n",
-    "     - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n",
+    "     - The string `c1ccccc1` represents a six-membered aromatic ring, where lowercase `c` indicates aromatic carbon atoms.\n",
+    "     - This encoding provides a compact, human-readable format for molecular structures.\n",
+    "\n",
+    "The `ChemDataReader` class is used for SMILES encoding. SMILES tokenization is performed using the `_tokenize` function from the [`pysmiles.read_smiles`](https://github.com/pckroon/pysmiles/blob/master/pysmiles/read_smiles.py) module."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "da47d47e-4560-46af-b246-235596f27d82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chebai.preprocessing.reader import ChemDataReader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8bdbf309-29ec-4aab-a6dc-9e09bc6961a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chem_dr = ChemDataReader()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "68e5c87c-79c3-4d5f-91e6-635399a84d3d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[41, 42, 41, 41, 41, 41, 41, 42]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chem_dr._read_data(\"c1ccccc1\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b7211ee-2ccc-46d3-8e8f-790f344726ba",
+   "metadata": {},
+   "source": [
+    "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/smiles_token/tokens.txt) file, which is used by the `ChemDataReader` class. \n",
     "\n",
+    "Each token in the `tokens.txt` file corresponds to a specific symbol or structure in the SMILES encoding, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f79f0ee-a5d7-427b-b4ac-4a848307917b",
+   "metadata": {},
+   "source": [
     "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n",
     "   - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n",
     "   - **Explanation**: \n",
     "     - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n",
     "     - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n",
     "\n",
+    "The `SelfiesReader` class is used for SELFIES encoding. SELFIES encoding and tokenization are performed using the `encoder` and `split_selfies` functions from the [`selfies`](https://github.com/aspuru-guzik-group/selfies) library.\n",
+    "\n",
+    "In the `_read_data` method of `SelfiesReader`, the following steps are carried out:\n",
+    "   1. The `encoder` function converts the SMILES notation into the SELFIES format.\n",
+    "   2. The `split_selfies` function then tokenizes the SELFIES string into individual tokens for further processing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b23a423e-9447-46e1-a08c-ba164c6877d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chebai.preprocessing.reader import SelfiesReader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "7408f7c9-0204-444c-b51e-79dc1fcbf497",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "selfies_dr = SelfiesReader()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b337cef0-f93e-43f8-81ed-def1f5cdeb38",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[25, 29, 25, 29, 25, 29, 30, 32]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "selfies_dr._read_data(\"c1ccccc1\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "850f4557-7a2e-4c86-a81e-3a41f7a57c12",
+   "metadata": {},
+   "source": [
+    "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/selfies/tokens.txt) file, which is used by the `SelfiesReader` class. \n",
+    "\n",
+    "Each token in the `tokens.txt` file corresponds to a specific symbol or structure in the SELFIES encoding, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "680224f6-748e-4ce0-920f-083218a108fd",
+   "metadata": {},
+   "source": [
     "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics."
    ]
   },

From 6d162c7e69487e04b2c1ec2cdd4ee0dd68a6ac5a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 6 Oct 2024 15:51:27 +0200
Subject: [PATCH 069/112] GO: reader class explanation

---
 tutorials/data_exploration_go.ipynb | 63 +++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index e60e972b..bc70b200 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -938,6 +938,69 @@
     "\n",
     "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n",
     "\n",
+    "### Tokenization and Encoding\n",
+    "\n",
+    "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class allows for n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (default is `None`), each amino acid letter is treated as a single token.\n",
+    "\n",
+    "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e0cf4fb6-2ca4-4b85-a4e7-0cfbac5cd6c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chebai.preprocessing.reader import ProteinDataReader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "e8343d83-0be3-44df-9224-bba8d5c32336",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "protein_dr = ProteinDataReader()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "8a18dc27-f308-4dde-b1ae-b03a20fb0d45",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[10, 16, 11, 17, 17, 12, 17, 28, 17, 24, 25, 17, 23, 17, 14, 14, 17, 13, 21]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "protein_dr._read_data(\"MRSLLILVLCFLPLAALGK\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e95738a-0b2d-4c56-ac97-f3b24c1de18f",
+   "metadata": {},
+   "source": [
+    "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file, which is used by the `ProteinDataReader` class. \n",
+    "\n",
+    "Each token in the `tokens.txt` file corresponds to a specific amino-acid letter, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fd54ca4a-743c-496e-9e89-cff2d8226eb2",
+   "metadata": {},
+   "source": [
     "### The 20 Amino Acids and Their One-Letter Notations\n",
     "\n",
     "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n",

From 5c8c185a7f5e1de89b92d83f24cf63cc3e93c306 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 6 Oct 2024 15:55:42 +0200
Subject: [PATCH 070/112] chebi: minor change in tokenization and encoding

---
 tutorials/data_exploration_chebi.ipynb | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 64d0014a..594e786a 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -825,15 +825,17 @@
     "\n",
     "The `chebai` library supports various encodings for molecules, such as SMILES and SELFIES. In this section, we'll take the example of **benzene** (C₆H₆) and explore its different encodings.\n",
     "\n",
-    "### Overview of Encodings:\n",
+    "### Overview of Chemical Encodings:\n",
     "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n",
     "- **SELFIES (SELF-referencIng Embedded Strings)**: A robust encoding capable of representing a broader range of chemical structures.\n",
     "\n",
-    "In `chebai`, encoding and tokenization are implemented through specific reader classes, mainly:\n",
+    "### Tokenization and Encoding\n",
+    "\n",
+    "To tokenize and numerically encode these chemical encodings, we use specific reader classes, mainly:\n",
     "- **ChemDataReader**: For SMILES encoding.\n",
     "- **SelfiesReader**: For SELFIES encoding.\n",
     "\n",
-    "There are other implementations too for different encodings, you can check out more in the below link.<br>\n",
+    "There are other implementations for different variants as well; you can find them at the link below.<br>\n",
     "You can explore the implementation of these readers in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py).\n",
     "\n",
     "> **Note**: The library uses an `EMBEDDING_OFFSET` of 10 for encoding purposes."
@@ -973,14 +975,6 @@
     "Each token in the `tokens.txt` file corresponds to a specific symbol or structure in the SELFIES encoding, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "680224f6-748e-4ce0-920f-083218a108fd",
-   "metadata": {},
-   "source": [
-    "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics."
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "93e328cf-09f9-4694-b175-28320590937d",

From 710d703fd54d4471d28b1b9f9e6f6e02be126a01 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 12 Oct 2024 11:51:39 +0200
Subject: [PATCH 071/112] ignore proteins exceeding max len in preprocessing

---
 chebai/preprocessing/datasets/go_uniprot.py | 44 ++++-----------------
 1 file changed, 7 insertions(+), 37 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index 21b0ad44..a5e804a8 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -415,8 +415,8 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
                 # To consider only manually-annotated swiss data
                 continue
 
-            if not record.sequence:
-                # Consider protein with only sequence representation
+            if not record.sequence or record.sequence > self.max_sequence_length:
+                # Consider protein with only sequence representation and seq. length not greater than max seq. length
                 continue
 
             if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
@@ -537,39 +537,6 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 
         return df_train, df_val, df_test
 
-    # ------------------------------ Phase: DataLoaders -----------------------------------
-    def dataloader(self, kind: str, **kwargs) -> DataLoader:
-        """
-        Returns a DataLoader object with truncated sequences for the specified kind of data (train, val, or test).
-
-        This method overrides the dataloader method from the superclass. After fetching the dataset from the
-        superclass, it truncates the 'features' of each data instance to a maximum length specified by
-        `self.max_sequence_length`. The truncation is adjusted based on the value of `n_gram` to ensure that
-        the correct number of amino acids is preserved in the truncated sequences.
-
-        Args:
-            kind (str): The kind of data to load (e.g., 'train', 'val', 'test').
-            **kwargs: Additional keyword arguments passed to the superclass dataloader method.
-
-        Returns:
-            DataLoader: A DataLoader object with the truncated sequences.
-        """
-        dataloader = super().dataloader(kind, **kwargs)
-
-        if self.reader.n_gram is None:
-            # Truncate the 'features' to max_sequence_length for each instance
-            truncate_index = self.max_sequence_length
-        else:
-            # If n_gram is given, adjust truncation to ensure maximum sequence length refers to the maximum number of
-            # amino acids in sequence rather than number of n-grams. Eg, Sequence "ABCDEFGHIJ" can form 8 trigrams,
-            # if max length is 5, then only first 3 trigrams should be considered as they are formed by first 5 letters.
-            truncate_index = self.max_sequence_length - (self.reader.n_gram - 1)
-
-        for instance in dataloader.dataset:
-            instance["features"] = instance["features"][:truncate_index]
-
-        return dataloader
-
     # ------------------------------ Phase: Raw Properties -----------------------------------
     @property
     def base_dir(self) -> str:
@@ -617,13 +584,16 @@ def _name(self) -> str:
         """
         Returns the name of the dataset.
 
+        'max_sequence_length' in the name indicates that proteins with sequence lengths exceeding this value
+        are ignored in the dataset.
+
         Returns:
             str: The dataset name, formatted with the current threshold value and/or given go_branch.
         """
         if self.go_branch != self._ALL_GO_BRANCHES:
-            return f"GO{self.THRESHOLD}_{self.go_branch}"
+            return f"GO{self.THRESHOLD}_{self.go_branch}_{self.max_sequence_length}"
 
-        return f"GO{self.THRESHOLD}"
+        return f"GO{self.THRESHOLD}_{self.max_sequence_length}"
 
     def select_classes(
         self, g: nx.DiGraph, *args: Any, **kwargs: Dict[str, Any]

From 383b210447942790c6e44219bc6801b2306f9a03 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 12 Oct 2024 13:05:10 +0200
Subject: [PATCH 072/112] fix to access max seq len in name prop

---
 chebai/preprocessing/datasets/go_uniprot.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index a5e804a8..0ba251bb 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -73,13 +73,14 @@ class _GOUniProtDataExtractor(_DynamicDataset, ABC):
 
     def __init__(self, **kwargs):
         self.go_branch: str = self._get_go_branch(**kwargs)
-        super(_GOUniProtDataExtractor, self).__init__(**kwargs)
 
         self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002))
         assert (
             self.max_sequence_length >= 1
         ), "Max sequence length should be greater than or equal to 1."
 
+        super(_GOUniProtDataExtractor, self).__init__(**kwargs)
+
         if self.reader.n_gram is not None:
             assert self.max_sequence_length >= self.reader.n_gram, (
                 f"max_sequence_length ({self.max_sequence_length}) must be greater than "
@@ -415,7 +416,7 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
                 # To consider only manually-annotated swiss data
                 continue
 
-            if not record.sequence or record.sequence > self.max_sequence_length:
+            if not record.sequence or len(record.sequence) > self.max_sequence_length:
                 # Consider protein with only sequence representation and seq. length not greater than max seq. length
                 continue
 

From e3c4b6e2c2ec1b5f30a883b0d53be71532d8adf7 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 12 Oct 2024 15:50:56 +0200
Subject: [PATCH 073/112] fix testcase for GO

---
 tests/unit/dataset_classes/testGOUniProDataExtractor.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
index 976334f0..dcde90bc 100644
--- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py
+++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
@@ -1,11 +1,12 @@
 import unittest
-from unittest.mock import MagicMock, PropertyMock, mock_open, patch
+from unittest.mock import PropertyMock, mock_open, patch
 
 import fastobo
 import networkx as nx
 import pandas as pd
 
 from chebai.preprocessing.datasets.go_uniprot import _GOUniProtDataExtractor
+from chebai.preprocessing.reader import ProteinDataReader
 from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData
 
 
@@ -30,9 +31,8 @@ def setUpClass(
         """
         mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor"
         mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor"
-        ReaderMock = MagicMock()
-        ReaderMock.name.return_value = "MockedReaderGOUniProtDataExtractor"
-        _GOUniProtDataExtractor.READER = ReaderMock
+
+        _GOUniProtDataExtractor.READER = ProteinDataReader
 
         cls.extractor = _GOUniProtDataExtractor()
 

From 651108639c6454398fed7e44d63f180101d2cc3e Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Thu, 17 Oct 2024 17:02:31 +0200
Subject: [PATCH 074/112] fix: add all (including transitive) go-labels to data
 instead of only direct ones

---
 chebai/preprocessing/datasets/go_uniprot.py | 28 ++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index 0ba251bb..e2a4d1a4 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -11,6 +11,7 @@
 __all__ = ["GOUniProtOver250", "GOUniProtOver50"]
 
 import gzip
+import itertools
 import os
 import shutil
 from abc import ABC, abstractmethod
@@ -340,7 +341,18 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:
         print(f"Processing graph")
 
         data_df = self._get_swiss_to_go_mapping()
-
+        # add ancestors to go ids
+        data_df["go_ids"] = data_df["go_ids"].apply(
+            lambda go_ids: list(
+                itertools.chain.from_iterable(
+                    [
+                        [go_id] + list(g.predecessors(go_id))
+                        for go_id in go_ids
+                        if go_id in g.nodes
+                    ]
+                )
+            )
+        )
         # Initialize the GO term labels/columns to False
         selected_classes = self.select_classes(g, data_df=data_df)
         new_label_columns = pd.DataFrame(
@@ -642,20 +654,8 @@ def select_classes(
         # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L59-L77
         go_term_annot: Dict[int, int] = {}
         for idx, row in data_df.iterrows():
-            # Set will contain go terms associated with the protein, along with all the ancestors of those
-            # associated go terms
-            associated_go_ids_with_ancestors = set()
-
-            # Collect all ancestors of the GO terms associated with this protein
-            for go_id in row["go_ids"]:
-                if go_id in g.nodes:
-                    associated_go_ids_with_ancestors.add(go_id)
-                    associated_go_ids_with_ancestors.update(
-                        g.predecessors(go_id)
-                    )  # Add all predecessors (ancestors) of go_id
-
             # Count the annotations for each go_id **`per protein`**
-            for go_id in associated_go_ids_with_ancestors:
+            for go_id in row["go_ids"]:
                 if go_id not in go_term_annot:
                     go_term_annot[go_id] = 0
                 go_term_annot[go_id] += 1

From c1ddd17667c3532b0ca80b1196b2e6c0bb855f7f Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 20 Oct 2024 11:44:56 +0200
Subject: [PATCH 075/112] update testcase as per transitive go ids

---
 .../unit/dataset_classes/testGOUniProDataExtractor.py  | 10 +++++++++-
 tests/unit/mock_data/ontology_mock_data.py             |  4 ++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
index dcde90bc..9da48bee 100644
--- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py
+++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
@@ -1,4 +1,5 @@
 import unittest
+from collections import OrderedDict
 from unittest.mock import PropertyMock, mock_open, patch
 
 import fastobo
@@ -141,7 +142,14 @@ def test_get_swiss_to_go_mapping(self, mock_open) -> None:
         Test the extraction of SwissProt to GO term mapping.
         """
         mapping_df = self.extractor._get_swiss_to_go_mapping()
-        expected_df = GOUniProtMockData.get_data_in_dataframe().iloc[:, :4]
+        expected_df = pd.DataFrame(
+            OrderedDict(
+                swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"],
+                accession=["Q6GZX4", "DCGZX4"],
+                go_ids=[[2, 3, 5], [2, 5]],
+                sequence=list(GOUniProtMockData.protein_sequences().values()),
+            )
+        )
 
         pd.testing.assert_frame_equal(
             mapping_df,
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 0c713334..d6feb33d 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -736,11 +736,11 @@ def get_data_in_dataframe() -> pd.DataFrame:
         expected_data = OrderedDict(
             swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"],
             accession=["Q6GZX4", "DCGZX4"],
-            go_ids=[[2, 3, 5], [2, 5]],
+            go_ids=[[1, 2, 3, 5], [1, 2, 5]],
             sequence=list(GOUniProtMockData.protein_sequences().values()),
             **{
                 #   SP_1,  SP_2
-                1: [False, False],
+                1: [True, True],
                 2: [True, True],
                 3: [True, False],
                 4: [False, False],

From bf6bc4aa14999afdfa3d4ebec017791dc6edad09 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sun, 20 Oct 2024 11:51:14 +0200
Subject: [PATCH 076/112] remove test for tox21mol net

- this test will be added in another branch later once #53 is completed
---
 tests/unit/dataset_classes/testTox21MolNet.py | 181 ------------------
 1 file changed, 181 deletions(-)
 delete mode 100644 tests/unit/dataset_classes/testTox21MolNet.py

diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py
deleted file mode 100644
index 86cbb752..00000000
--- a/tests/unit/dataset_classes/testTox21MolNet.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import unittest
-from typing import List
-from unittest.mock import MagicMock, mock_open, patch
-
-import torch
-
-from chebai.preprocessing.datasets.tox21 import Tox21MolNet
-from chebai.preprocessing.reader import ChemDataReader
-from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData
-
-
-class TestTox21MolNet(unittest.TestCase):
-    @classmethod
-    @patch("os.makedirs", return_value=None)
-    def setUpClass(cls, mock_makedirs: MagicMock) -> None:
-        """
-        Initialize a Tox21MolNet instance for testing.
-
-        Args:
-            mock_makedirs (MagicMock): Mocked `os.makedirs` function.
-        """
-        Tox21MolNet.READER = ChemDataReader
-        cls.data_module = Tox21MolNet()
-
-    @patch(
-        "builtins.open",
-        new_callable=mock_open,
-        read_data=Tox21MolNetMockData.get_raw_data(),
-    )
-    def test_load_data_from_file(self, mock_open_file: mock_open) -> None:
-        """
-        Test the `_load_data_from_file` method for correct output.
-
-        Args:
-            mock_open_file (mock_open): Mocked open function to simulate file reading.
-        """
-        actual_data = self.data_module._load_data_from_file("fake/file/path.csv")
-
-        first_instance = next(actual_data)
-
-        # Check for required keys
-        required_keys = ["features", "labels", "ident"]
-        for key in required_keys:
-            self.assertIn(
-                key, first_instance, f"'{key}' key is missing in the output data."
-            )
-
-        self.assertTrue(
-            all(isinstance(feature, int) for feature in first_instance["features"]),
-            "Not all elements in 'features' are integers.",
-        )
-
-        # Check that 'features' can be converted to a tensor
-        features = first_instance["features"]
-        try:
-            tensor_features = torch.tensor(features)
-            self.assertTrue(
-                tensor_features.ndim > 0,
-                "'features' should be convertible to a non-empty tensor.",
-            )
-        except Exception as e:
-            self.fail(f"'features' cannot be converted to a tensor: {str(e)}")
-
-    @patch(
-        "builtins.open",
-        new_callable=mock_open,
-        read_data=Tox21MolNetMockData.get_raw_data(),
-    )
-    @patch("torch.save")
-    def test_setup_processed_simple_split(
-        self,
-        mock_torch_save: MagicMock,
-        mock_open_file: mock_open,
-    ) -> None:
-        """
-        Test the `setup_processed` method for basic data splitting and saving.
-
-        Args:
-            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
-            mock_open_file (mock_open): Mocked `open` function to simulate file reading.
-        """
-        self.data_module.setup_processed()
-
-        # Verify if torch.save was called for each split (train, test, validation)
-        self.assertEqual(
-            mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times."
-        )
-        call_args_list = mock_torch_save.call_args_list
-        self.assertIn("test", call_args_list[0][0][1], "Missing 'test' split.")
-        self.assertIn("train", call_args_list[1][0][1], "Missing 'train' split.")
-        self.assertIn(
-            "validation", call_args_list[2][0][1], "Missing 'validation' split."
-        )
-
-        # Check for non-overlap between train, test, and validation splits
-        test_split: List[str] = [d["ident"] for d in call_args_list[0][0][0]]
-        train_split: List[str] = [d["ident"] for d in call_args_list[1][0][0]]
-        validation_split: List[str] = [d["ident"] for d in call_args_list[2][0][0]]
-
-        self.assertTrue(
-            set(train_split).isdisjoint(test_split),
-            "Overlap detected between the train and test splits.",
-        )
-        self.assertTrue(
-            set(train_split).isdisjoint(validation_split),
-            "Overlap detected between the train and validation splits.",
-        )
-        self.assertTrue(
-            set(test_split).isdisjoint(validation_split),
-            "Overlap detected between the test and validation splits.",
-        )
-
-    @patch.object(
-        Tox21MolNet,
-        "_load_data_from_file",
-        return_value=Tox21MolNetMockData.get_processed_grouped_data(),
-    )
-    @patch("torch.save")
-    def test_setup_processed_with_group_split(
-        self, mock_torch_save: MagicMock, mock_load_file: MagicMock
-    ) -> None:
-        """
-        Test the `setup_processed` method for group-based splitting and saving.
-
-        Args:
-            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
-            mock_load_file (MagicMock): Mocked `_load_data_from_file` to provide custom data.
-        """
-        self.data_module.train_split = 0.5
-        self.data_module.setup_processed()
-
-        # Verify if torch.save was called for each split
-        self.assertEqual(
-            mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times."
-        )
-        call_args_list = mock_torch_save.call_args_list
-        self.assertIn("test", call_args_list[0][0][1], "Missing 'test' split.")
-        self.assertIn("train", call_args_list[1][0][1], "Missing 'train' split.")
-        self.assertIn(
-            "validation", call_args_list[2][0][1], "Missing 'validation' split."
-        )
-
-        # Check for non-overlap between train, test, and validation splits (based on 'ident')
-        test_split: List[str] = [d["ident"] for d in call_args_list[0][0][0]]
-        train_split: List[str] = [d["ident"] for d in call_args_list[1][0][0]]
-        validation_split: List[str] = [d["ident"] for d in call_args_list[2][0][0]]
-
-        self.assertTrue(
-            set(train_split).isdisjoint(test_split),
-            "Overlap detected between the train and test splits (based on 'ident').",
-        )
-        self.assertTrue(
-            set(train_split).isdisjoint(validation_split),
-            "Overlap detected between the train and validation splits (based on 'ident').",
-        )
-        self.assertTrue(
-            set(test_split).isdisjoint(validation_split),
-            "Overlap detected between the test and validation splits (based on 'ident').",
-        )
-
-        # Check for non-overlap between train, test, and validation splits (based on 'group')
-        test_split_grp: List[str] = [d["group"] for d in call_args_list[0][0][0]]
-        train_split_grp: List[str] = [d["group"] for d in call_args_list[1][0][0]]
-        validation_split_grp: List[str] = [d["group"] for d in call_args_list[2][0][0]]
-
-        self.assertTrue(
-            set(train_split_grp).isdisjoint(test_split_grp),
-            "Overlap detected between the train and test splits (based on 'group').",
-        )
-        self.assertTrue(
-            set(train_split_grp).isdisjoint(validation_split_grp),
-            "Overlap detected between the train and validation splits (based on 'group').",
-        )
-        self.assertTrue(
-            set(test_split_grp).isdisjoint(validation_split_grp),
-            "Overlap detected between the test and validation splits (based on 'group').",
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()

From f3ec9470b2b077a5a93d2a48d6d660d90ae984a5 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Mon, 21 Oct 2024 13:39:38 +0200
Subject: [PATCH 077/112] fix: don't count labels twice

---
 chebai/preprocessing/datasets/go_uniprot.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index e2a4d1a4..1c8c106b 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -343,13 +343,15 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:
         data_df = self._get_swiss_to_go_mapping()
         # add ancestors to go ids
         data_df["go_ids"] = data_df["go_ids"].apply(
-            lambda go_ids: list(
-                itertools.chain.from_iterable(
-                    [
-                        [go_id] + list(g.predecessors(go_id))
-                        for go_id in go_ids
-                        if go_id in g.nodes
-                    ]
+            lambda go_ids: sorted(
+                set(
+                    itertools.chain.from_iterable(
+                        [
+                            [go_id] + list(g.predecessors(go_id))
+                            for go_id in go_ids
+                            if go_id in g.nodes
+                        ]
+                    )
                 )
             )
         )

From b915b0db7f11710e8f5eabb81c070995cab13844 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 22 Oct 2024 12:53:35 +0200
Subject: [PATCH 078/112] Revert "add group key + convert generator to list"

This reverts commit e4caae8c68368bffb9b018d35b1298f3887a5500.
---
 chebai/preprocessing/datasets/tox21.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/chebai/preprocessing/datasets/tox21.py b/chebai/preprocessing/datasets/tox21.py
index 98d78009..4bdfbdee 100644
--- a/chebai/preprocessing/datasets/tox21.py
+++ b/chebai/preprocessing/datasets/tox21.py
@@ -68,7 +68,7 @@ def download(self) -> None:
     def setup_processed(self) -> None:
         """Processes and splits the dataset."""
         print("Create splits")
-        data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"tox21.csv")))
+        data = self._load_data_from_file(os.path.join(self.raw_dir, f"tox21.csv"))
         groups = np.array([d["group"] for d in data])
         if not all(g is None for g in groups):
             split_size = int(len(set(groups)) * self.train_split)
@@ -145,10 +145,7 @@ def _load_data_from_file(self, input_file_path: str) -> List[Dict]:
                 labels = [
                     bool(int(l)) if l else None for l in (row[k] for k in self.HEADERS)
                 ]
-                group = row.get("group", None)
-                yield dict(
-                    features=smiles, labels=labels, ident=row["mol_id"], group=group
-                )
+                yield dict(features=smiles, labels=labels, ident=row["mol_id"])
 
 
 class Tox21Challenge(XYBaseDataModule):

From 18e3253800f534309711c52a45b23ea9f906b47f Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 22 Oct 2024 18:53:08 +0200
Subject: [PATCH 079/112] update GO evidence codes info

---
 tutorials/data_exploration_go.ipynb | 74 ++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index bc70b200..3e8255a2 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -1053,35 +1053,49 @@
    "source": [
     "## 5.2 More on GO Evidence Codes\n",
     "\n",
-    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n",
-    "\n",
-    "| **Evidence Code** | **Description** |\n",
-    "|-------------------|-----------------|\n",
-    "| **EXP**           | Inferred from Experiment |\n",
-    "| **IDA**           | Inferred from Direct Assay |\n",
-    "| **IPI**           | Inferred from Physical Interaction |\n",
-    "| **IMP**           | Inferred from Mutant Phenotype |\n",
-    "| **IGI**           | Inferred from Genetic Interaction |\n",
-    "| **IEP**           | Inferred from Expression Pattern |\n",
-    "| **TAS**           | Traceable Author Statement |\n",
-    "| **IC**            | Inferred by Curator |\n",
-    "| **IEA**           | Inferred from Electronic Annotation (Computational) |\n",
-    "| **ISS**           | Inferred from Sequence or Structural Similarity |\n",
-    "| **ISA**           | Inferred from Sequence Alignment |\n",
-    "| **ISM**           | Inferred from Sequence Model |\n",
-    "| **ISO**           | Inferred from Sequence Orthology |\n",
-    "| **ISA**           | Inferred from Sequence Alignment |\n",
-    "| **RCA**           | Inferred from Reviewed Computational Analysis |\n",
-    "| **NAS**           | Non-traceable Author Statement |\n",
-    "| **ND**            | No Biological Data Available (placeholder) |\n",
-    "| **NR**            | Not Recorded |\n",
-    "\n",
-    "\n",
-    "### Grouping of Codes:\n",
-    "\n",
-    "- **Experimental Evidence Codes**: \n",
+    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both GO evidence codes with brief descriptions:\n",
+    "\n",
+    "| **Evidence Code**     | **Description** |\n",
+    "|-----------------------|-----------------|\n",
+    "| **EXP**               | [Inferred from Experiment (EXP)](http://wiki.geneontology.org/index.php/Inferred_from_Experiment_(EXP)) |\n",
+    "| **IDA**               | [Inferred from Direct Assay (IDA)](http://wiki.geneontology.org/index.php/Inferred_from_Direct_Assay_(IDA)) |\n",
+    "| **IPI**               | [Inferred from Physical Interaction (IPI)](http://wiki.geneontology.org/index.php/Inferred_from_Physical_Interaction_(IPI)) |\n",
+    "| **IMP**               | [Inferred from Mutant Phenotype (IMP)](http://wiki.geneontology.org/index.php/Inferred_from_Mutant_Phenotype_(IMP)) |\n",
+    "| **IGI**               | [Inferred from Genetic Interaction (IGI)](http://wiki.geneontology.org/index.php/Inferred_from_Genetic_Interaction_(IGI)) |\n",
+    "| **IEP**               | [Inferred from Expression Pattern (IEP)](http://wiki.geneontology.org/index.php/Inferred_from_Expression_Pattern_(IEP)) |\n",
+    "| **HTP**               | [Inferred from High Throughput Experiment (HTP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Experiment_(HTP)) |\n",
+    "| **HDA**               | [Inferred from High Throughput Direct Assay (HDA)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Direct_Assay_(HDA)) |\n",
+    "| **HMP**               | [Inferred from High Throughput Mutant Phenotype (HMP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Mutant_Phenotype_(HMP)) |\n",
+    "| **HGI**               | [Inferred from High Throughput Genetic Interaction (HGI)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Genetic_Interaction_(HGI)) |\n",
+    "| **HEP**               | [Inferred from High Throughput Expression Pattern (HEP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Expression_Pattern_(HEP)) |\n",
+    "| **IBA**               | [Inferred from Biological aspect of Ancestor (IBA)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Ancestor_(IBA)) |\n",
+    "| **IBD**               | [Inferred from Biological aspect of Descendant (IBD)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Descendant_(IBD)) |\n",
+    "| **IKR**               | [Inferred from Key Residues (IKR)](http://wiki.geneontology.org/index.php/Inferred_from_Key_Residues_(IKR)) |\n",
+    "| **IRD**               | [Inferred from Rapid Divergence (IRD)](http://wiki.geneontology.org/index.php/Inferred_from_Rapid_Divergence(IRD)) |\n",
+    "| **ISS**               | [Inferred from Sequence or Structural Similarity (ISS)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_or_structural_Similarity_(ISS)) |\n",
+    "| **ISO**               | [Inferred from Sequence Orthology (ISO)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Orthology_(ISO)) |\n",
+    "| **ISA**               | [Inferred from Sequence Alignment (ISA)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Alignment_(ISA)) |\n",
+    "| **ISM**               | [Inferred from Sequence Model (ISM)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Model_(ISM)) |\n",
+    "| **RCA**               | [Inferred from Reviewed Computational Analysis (RCA)](http://wiki.geneontology.org/index.php/Inferred_from_Reviewed_Computational_Analysis_(RCA)) |\n",
+    "| **IEA**               | [Inferred from Electronic Annotation (IEA)](http://wiki.geneontology.org/index.php/Inferred_from_Electronic_Annotation_(IEA)) |\n",
+    "| **TAS**               | [Traceable Author Statement (TAS)](http://wiki.geneontology.org/index.php/Traceable_Author_Statement_(TAS)) |\n",
+    "| **NAS**               | [Non-traceable Author Statement (NAS)](http://wiki.geneontology.org/index.php/Non-traceable_Author_Statement_(NAS)) |\n",
+    "| **IC**                | [Inferred by Curator (IC)](http://wiki.geneontology.org/index.php/Inferred_by_Curator_(IC)) |\n",
+    "| **ND**                | [No Biological Data Available (ND)](http://wiki.geneontology.org/index.php/No_biological_Data_available_(ND)_evidence_code) |\n",
+    "| **NR**                | Not Recorded |\n",
+    "\n",
+    "\n",
+    "### **Grouping of Codes**:\n",
+    "\n",
+    "- **Experimental Evidence Codes**:\n",
     "  - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n",
     "  \n",
+    "- **High-Throughput Experimental Codes**:\n",
+    "  - **HTP**, **HDA**, **HMP**, **HGI**, **HEP**\n",
+    "\n",
+    "- **Phylogenetically-Inferred Codes**:\n",
+    "  - **IBA**, **IBD**, **IKR**, **IRD**\n",
+    "\n",
     "- **Author/Curator Inferred Codes**:\n",
     "  - **TAS**, **IC**, **NAS**\n",
     "\n",
@@ -1089,10 +1103,12 @@
     "  - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n",
     "\n",
     "- **Others**:\n",
-    "  - **ND** (No Data), **NR** (Not Recorded)\n",
+    "  - **ND** (No Biological Data Available), **NR** (Not Recorded)\n",
+    "\n",
     "\n",
+    "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation.\n",
     "\n",
-    "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation."
+    "__Note__ : For more information on GO evidence codes check [here](https://geneontology.org/docs/guide-go-evidence-codes/) "
    ]
   },
   {

From 261e8c1a2686cf1fedff56899f54c0cb76eec7a1 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Tue, 22 Oct 2024 19:08:19 +0200
Subject: [PATCH 080/112] go evidence code minor info change

---
 tutorials/data_exploration_go.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 3e8255a2..97bcfd22 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -1053,7 +1053,7 @@
    "source": [
     "## 5.2 More on GO Evidence Codes\n",
     "\n",
-    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both GO evidence codes with brief descriptions:\n",
+    "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the GO evidence codes with brief descriptions:\n",
     "\n",
     "| **Evidence Code**     | **Description** |\n",
     "|-----------------------|-----------------|\n",
@@ -1063,7 +1063,7 @@
     "| **IMP**               | [Inferred from Mutant Phenotype (IMP)](http://wiki.geneontology.org/index.php/Inferred_from_Mutant_Phenotype_(IMP)) |\n",
     "| **IGI**               | [Inferred from Genetic Interaction (IGI)](http://wiki.geneontology.org/index.php/Inferred_from_Genetic_Interaction_(IGI)) |\n",
     "| **IEP**               | [Inferred from Expression Pattern (IEP)](http://wiki.geneontology.org/index.php/Inferred_from_Expression_Pattern_(IEP)) |\n",
-    "| **HTP**               | [Inferred from High Throughput Experiment (HTP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Experiment_(HTP)) |\n",
+    "| **HTP**               | [Inferred from High Throughput Experiment (HTP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Experiment_(HTP) ) |\n",
     "| **HDA**               | [Inferred from High Throughput Direct Assay (HDA)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Direct_Assay_(HDA)) |\n",
     "| **HMP**               | [Inferred from High Throughput Mutant Phenotype (HMP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Mutant_Phenotype_(HMP)) |\n",
     "| **HGI**               | [Inferred from High Throughput Genetic Interaction (HGI)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Genetic_Interaction_(HGI)) |\n",
@@ -1108,7 +1108,7 @@
     "\n",
     "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation.\n",
     "\n",
-    "__Note__ : For more information on GO evidence codes check [here](https://geneontology.org/docs/guide-go-evidence-codes/) "
+    "__Note__ : For more information on GO evidence codes please check [here](https://geneontology.org/docs/guide-go-evidence-codes/) "
    ]
   },
   {

From e4a9e6c63c42818a85315d0fa49ecfdee11674e3 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 24 Oct 2024 22:41:37 +0200
Subject: [PATCH 081/112] make evidence codes and ambiguous amino acids global constants

---
 chebai/preprocessing/datasets/go_uniprot.py | 28 ++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index 1c8c106b..fe4795c6 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -25,11 +25,24 @@
 import requests
 import torch
 from Bio import SwissProt
-from torch.utils.data import DataLoader
 
 from chebai.preprocessing import reader as dr
 from chebai.preprocessing.datasets.base import _DynamicDataset
 
+EXPERIMENTAL_EVIDENCE_CODES = {
+    "EXP",
+    "IDA",
+    "IPI",
+    "IMP",
+    "IGI",
+    "IEP",
+    "TAS",
+    "IC",
+}
+
+# https://github.com/bio-ontology-research-group/deepgo/blob/d97447a05c108127fee97982fd2c57929b2cf7eb/aaindex.py#L8
+AMBIGUOUS_AMINO_ACIDS = {"B", "O", "J", "U", "X", "Z", "*"}
+
 
 class _GOUniProtDataExtractor(_DynamicDataset, ABC):
     """
@@ -412,19 +425,6 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
             )
         )
 
-        EXPERIMENTAL_EVIDENCE_CODES = {
-            "EXP",
-            "IDA",
-            "IPI",
-            "IMP",
-            "IGI",
-            "IEP",
-            "TAS",
-            "IC",
-        }
-        # https://github.com/bio-ontology-research-group/deepgo/blob/d97447a05c108127fee97982fd2c57929b2cf7eb/aaindex.py#L8
-        AMBIGUOUS_AMINO_ACIDS = {"B", "O", "J", "U", "X", "Z", "*"}
-
         for record in swiss_data:
             if record.data_class != "Reviewed":
                 # To consider only manually-annotated swiss data

From a27e41534d89b6b55159ef2f122ba4f65da467ff Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 24 Oct 2024 22:42:38 +0200
Subject: [PATCH 082/112] protein pretrain data - rough implementation

---
 .../datasets/protein_pretraining.py           | 197 ++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 chebai/preprocessing/datasets/protein_pretraining.py

diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
new file mode 100644
index 00000000..860a89da
--- /dev/null
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -0,0 +1,197 @@
+import os
+from abc import ABC
+from collections import OrderedDict
+from typing import Any, Dict, Generator, List, Tuple
+
+import networkx as nx
+import pandas as pd
+import torch
+from Bio import SwissProt
+from sklearn.model_selection import train_test_split
+
+from chebai.preprocessing.datasets.base import _DynamicDataset
+from chebai.preprocessing.datasets.go_uniprot import (
+    AMBIGUOUS_AMINO_ACIDS,
+    EXPERIMENTAL_EVIDENCE_CODES,
+    GOUniProtOver250,
+)
+
+
+class _ProteinPretrainingData(_DynamicDataset, ABC):
+    _ID_IDX: int = 0
+    _DATA_REPRESENTATION_IDX: int = 1  # here `sequence` column
+
+    def __init__(self, *args, **kwargs):
+        super(_ProteinPretrainingData).__init__(*args, **kwargs)
+        self._go_extractor = GOUniProtOver250()
+        assert self._go_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES
+
+    # ------------------------------ Phase: Prepare data -----------------------------------
+    def prepare_data(self):
+        print("Checking for processed data in", self.processed_dir_main)
+
+        processed_name = self.processed_dir_main_file_names_dict["data"]
+        if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)):
+            print("Missing processed data file (`data.pkl` file)")
+            os.makedirs(self.processed_dir_main, exist_ok=True)
+            self._download_required_data()
+            protein_df = self._parse_protein_data_for_pretraining()
+            self.save_processed(protein_df, processed_name)
+
+    def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
+        pass
+
+    def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
+        pass
+
+    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
+        pass
+
+    def _download_required_data(self) -> str:
+        return self._go_extractor._download_swiss_uni_prot_data()
+
+    def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
+        """
+        Parses the Swiss-Prot data and returns a DataFrame mapping Swiss-Prot records which does not have any valid
+        Gene Ontology(GO) label. A valid GO label is the one which has one of the following evidence code
+        (EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC).
+
+        The DataFrame includes the following columns:
+            - "swiss_id": The unique identifier for each Swiss-Prot record.
+            - "sequence": The protein sequence.
+
+        Note:
+            We ignore proteins with ambiguous amino acid codes (B, O, J, U, X, Z) in their sequence.`
+
+        Returns:
+            pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with its associated GO data.
+        """
+
+        print("Parsing swiss uniprot raw data....")
+
+        swiss_ids, sequences = [], []
+
+        swiss_data = SwissProt.parse(
+            open(
+                os.path.join(self.raw_dir, self.raw_file_names_dict["SwissUniProt"]),
+                "r",
+            )
+        )
+
+        for record in swiss_data:
+            if record.data_class != "Reviewed":
+                # To consider only manually-annotated swiss data
+                continue
+
+            if not record.sequence:
+                # Consider protein with only sequence representation and seq. length not greater than max seq. length
+                continue
+
+            if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
+                # Skip proteins with ambiguous amino acid codes
+                continue
+
+            has_valid_associated_go_label = False
+            for cross_ref in record.cross_references:
+                if cross_ref[0] == self._go_extractor._GO_DATA_INIT:
+
+                    if len(cross_ref) <= 3:
+                        # No evidence code
+                        continue
+
+                    # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L63-L66
+                    evidence_code = cross_ref[3].split(":")[0]
+                    if evidence_code in EXPERIMENTAL_EVIDENCE_CODES:
+                        has_valid_associated_go_label = True
+                        break
+
+            if has_valid_associated_go_label:
+                # Skip proteins which has at least one associated go label
+                continue
+
+            swiss_ids.append(record.entry_name)
+            sequences.append(record.sequence)
+
+        data_dict = OrderedDict(
+            swiss_id=swiss_ids,  # swiss_id column at index 0
+            sequence=sequences,  # Sequence column at index 1
+        )
+
+        return pd.DataFrame(data_dict)
+
+    # ------------------------------ Phase: Setup data -----------------------------------
+    def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
+        """
+        Loads data from a pickled file and yields individual dictionaries for each row.
+
+        The pickled file is expected to contain rows with the following structure:
+            - Data at row index `self._ID_IDX`: ID of go data instance
+            - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein
+
+        This method is used by `_load_data_from_file` to generate dictionaries that are then
+        processed and converted into a list of dictionaries containing the features and labels.
+
+        Args:
+            input_file_path (str): The path to the pickled input file.
+
+        Yields:
+            Dict[str, Any]: A dictionary containing:
+                - `features` (str): The sequence data from the file.
+                - `ident` (Any): The identifier from row index 0.
+        """
+        with open(input_file_path, "rb") as input_file:
+            df = pd.read_pickle(input_file)
+            for row in df.values:
+                # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group
+                # "group" set to None, by default as no such entity for this data
+                yield dict(
+                    features=row[self._DATA_REPRESENTATION_IDX],
+                    ident=row[self._ID_IDX],
+                )
+
+    # ------------------------------ Phase: Dynamic Splits -----------------------------------
+    def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """
+        Loads encoded data and generates training, validation, and test splits.
+
+        This method attempts to load encoded data from a file named `data.pt`. It then splits this data into
+        training, validation, and test sets.
+
+        Raises:
+            FileNotFoundError: If the `data.pt` file does not exist. Ensure that `prepare_data` and/or
+            `setup` methods are called to generate the necessary dataset files.
+
+        Returns:
+            Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames:
+                - Training set
+                - Validation set
+                - Test set
+        """
+        try:
+            filename = self.processed_file_names_dict["data"]
+            data_go = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"File data.pt doesn't exists. "
+                f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
+            )
+
+        df_go_data = pd.DataFrame(data_go)
+        train_df_go, df_test = train_test_split(
+            df_go_data, seed=self.dynamic_data_split_seed
+        )
+
+        # Get all splits
+        df_train, df_val = train_test_split(
+            train_df_go,
+            seed=self.dynamic_data_split_seed,
+        )
+
+        return df_train, df_val, df_test
+
+    # ------------------------------ Phase: Raw Properties -----------------------------------
+    @property
+    def base_dir(self) -> str:
+        return os.path.join(self._go_extractor.base_dir, "Pretraining")

From fa7b37bc4f2e3b228be9ed571aadd779024ae776 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 25 Oct 2024 13:33:06 +0200
Subject: [PATCH 083/112] Final class + fixes

---
 chebai/preprocessing/datasets/go_uniprot.py   |  7 ++++-
 .../datasets/protein_pretraining.py           | 30 +++++++++++++++----
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
index fe4795c6..a2c4ae54 100644
--- a/chebai/preprocessing/datasets/go_uniprot.py
+++ b/chebai/preprocessing/datasets/go_uniprot.py
@@ -8,7 +8,12 @@
 # https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt
 # https://www.uniprot.org/uniprotkb
 
-__all__ = ["GOUniProtOver250", "GOUniProtOver50"]
+__all__ = [
+    "GOUniProtOver250",
+    "GOUniProtOver50",
+    "EXPERIMENTAL_EVIDENCE_CODES",
+    "AMBIGUOUS_AMINO_ACIDS",
+]
 
 import gzip
 import itertools
diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index 860a89da..73d0e40e 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -1,3 +1,5 @@
+__all__ = ["SwissProteinPretrain"]
+
 import os
 from abc import ABC
 from collections import OrderedDict
@@ -15,21 +17,20 @@
     EXPERIMENTAL_EVIDENCE_CODES,
     GOUniProtOver250,
 )
+from chebai.preprocessing.reader import ProteinDataReader
 
 
 class _ProteinPretrainingData(_DynamicDataset, ABC):
     _ID_IDX: int = 0
     _DATA_REPRESENTATION_IDX: int = 1  # here `sequence` column
 
-    def __init__(self, *args, **kwargs):
-        super(_ProteinPretrainingData).__init__(*args, **kwargs)
+    def __init__(self, **kwargs):
         self._go_extractor = GOUniProtOver250()
         assert self._go_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES
+        super(_ProteinPretrainingData, self).__init__(**kwargs)
 
     # ------------------------------ Phase: Prepare data -----------------------------------
-    def prepare_data(self):
-        print("Checking for processed data in", self.processed_dir_main)
-
+    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
         processed_name = self.processed_dir_main_file_names_dict["data"]
         if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)):
             print("Missing processed data file (`data.pkl` file)")
@@ -73,7 +74,10 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
 
         swiss_data = SwissProt.parse(
             open(
-                os.path.join(self.raw_dir, self.raw_file_names_dict["SwissUniProt"]),
+                os.path.join(
+                    self._go_extractor.raw_dir,
+                    self._go_extractor.raw_file_names_dict["SwissUniProt"],
+                ),
                 "r",
             )
         )
@@ -195,3 +199,17 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     @property
     def base_dir(self) -> str:
         return os.path.join(self._go_extractor.base_dir, "Pretraining")
+
+    @property
+    def raw_dir(self) -> str:
+        """Name of the directory where the raw data is stored."""
+        return self._go_extractor.raw_dir
+
+
+class SwissProteinPretrain(_ProteinPretrainingData):
+
+    READER = ProteinDataReader
+
+    @property
+    def _name(self) -> str:
+        return "SwissProteinPretrain"

From 2c446dcb2925b199465bb2472f296a2cd2728d8d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 25 Oct 2024 19:46:31 +0200
Subject: [PATCH 084/112] new data reader for protein pretraining data

---
 .../datasets/protein_pretraining.py             |  4 ++--
 chebai/preprocessing/reader.py                  | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index 73d0e40e..eea5ff57 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -17,7 +17,7 @@
     EXPERIMENTAL_EVIDENCE_CODES,
     GOUniProtOver250,
 )
-from chebai.preprocessing.reader import ProteinDataReader
+from chebai.preprocessing.reader import ProteinPretrainReader
 
 
 class _ProteinPretrainingData(_DynamicDataset, ABC):
@@ -208,7 +208,7 @@ def raw_dir(self) -> str:
 
 class SwissProteinPretrain(_ProteinPretrainingData):
 
-    READER = ProteinDataReader
+    READER = ProteinPretrainReader
 
     @property
     def _name(self) -> str:
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
index e220e1e4..ba16007d 100644
--- a/chebai/preprocessing/reader.py
+++ b/chebai/preprocessing/reader.py
@@ -469,3 +469,20 @@ def on_finish(self) -> None:
             print(f"Saving {len(self.cache)} tokens to {self.token_path}...")
             print(f"First 10 tokens: {self.cache[:10]}")
             pk.writelines([f"{c}\n" for c in self.cache])
+
+
+class ProteinPretrainReader(ProteinDataReader):
+    def _read_components(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        """Read and return components from the row."""
+        return dict(
+            features=self._get_raw_data(row),
+            ident=self._get_raw_id(row),
+        )
+
+    def to_data(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert raw row data to processed data."""
+        d = self._read_components(row)
+        return dict(
+            features=self._read_data(d["features"]),
+            ident=self._read_id(d["ident"]),
+        )

From ad4fc95891525ba571b584d814a6f69a582e1a7a Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 25 Oct 2024 20:49:58 +0200
Subject: [PATCH 085/112] pretrain: add docstrings and typehints

---
 .../datasets/protein_pretraining.py           | 80 +++++++++++++++----
 1 file changed, 63 insertions(+), 17 deletions(-)

diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index eea5ff57..36ddf07b 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -21,16 +21,37 @@
 
 
 class _ProteinPretrainingData(_DynamicDataset, ABC):
+    """
+    Data module for pretraining protein sequences, specifically designed for Swiss-UniProt data. It includes methods for
+    data preparation, loading, and dynamic splitting of protein sequences.
+    The data is parsed and filtered to only select proteins with no associated `valid` Gene Ontology (GO) labels.
+    A valid GO label is the one which has one of evidence codes defined in `EXPERIMENTAL_EVIDENCE_CODES`.
+    """
+
     _ID_IDX: int = 0
-    _DATA_REPRESENTATION_IDX: int = 1  # here `sequence` column
+    _DATA_REPRESENTATION_IDX: int = 1  # Index of `sequence` column
 
     def __init__(self, **kwargs):
-        self._go_extractor = GOUniProtOver250()
-        assert self._go_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES
+        """
+        Initializes the data module together with a `GOUniProtOver250` extractor object.
+
+        Args:
+            **kwargs: Additional arguments for the superclass initialization.
+        """
+        self._go_uniprot_extractor = GOUniProtOver250()
+        assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES
         super(_ProteinPretrainingData, self).__init__(**kwargs)
 
     # ------------------------------ Phase: Prepare data -----------------------------------
     def prepare_data(self, *args: Any, **kwargs: Any) -> None:
+        """
+        Prepares the data by downloading and parsing Swiss-Prot data if not already available. Saves the processed data
+        for further use.
+
+        Args:
+            *args: Additional positional arguments.
+            **kwargs: Additional keyword arguments.
+        """
         processed_name = self.processed_dir_main_file_names_dict["data"]
         if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)):
             print("Missing processed data file (`data.pkl` file)")
@@ -40,20 +61,29 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None:
             self.save_processed(protein_df, processed_name)
 
     def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
+        # method not required as Swiss-UniProt has no ontological data
         pass
 
     def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
+        # method not required as Swiss-UniProt has no ontological data
         pass
 
     def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
+        # method not required as Swiss-UniProt has no ontological data
         pass
 
     def _download_required_data(self) -> str:
-        return self._go_extractor._download_swiss_uni_prot_data()
+        """
+        Downloads the required Swiss-Prot data using the GOUniProt extractor class.
+
+        Returns:
+            str: Path to the downloaded data.
+        """
+        return self._go_uniprot_extractor._download_swiss_uni_prot_data()
 
     def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
         """
-        Parses the Swiss-Prot data and returns a DataFrame mapping Swiss-Prot records which does not have any valid
+        Parses the Swiss-Prot data and returns a DataFrame containing Swiss-Prot proteins which do not have any valid
         Gene Ontology(GO) label. A valid GO label is the one which has one of the following evidence code
         (EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC).
 
@@ -65,9 +95,8 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
             We ignore proteins with ambiguous amino acid codes (B, O, J, U, X, Z) in their sequence.`
 
         Returns:
-            pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with its associated GO data.
+            pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with no associated valid GO label.
         """
-
         print("Parsing swiss uniprot raw data....")
 
         swiss_ids, sequences = [], []
@@ -75,8 +104,8 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
         swiss_data = SwissProt.parse(
             open(
                 os.path.join(
-                    self._go_extractor.raw_dir,
-                    self._go_extractor.raw_file_names_dict["SwissUniProt"],
+                    self._go_uniprot_extractor.raw_dir,
+                    self._go_uniprot_extractor.raw_file_names_dict["SwissUniProt"],
                 ),
                 "r",
             )
@@ -88,7 +117,7 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
                 continue
 
             if not record.sequence:
-                # Consider protein with only sequence representation and seq. length not greater than max seq. length
+                # Consider protein with only sequence representation
                 continue
 
             if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
@@ -97,7 +126,7 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
 
             has_valid_associated_go_label = False
             for cross_ref in record.cross_references:
-                if cross_ref[0] == self._go_extractor._GO_DATA_INIT:
+                if cross_ref[0] == self._go_uniprot_extractor._GO_DATA_INIT:
 
                     if len(cross_ref) <= 3:
                         # No evidence code
@@ -146,8 +175,6 @@ def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, No
         with open(input_file_path, "rb") as input_file:
             df = pd.read_pickle(input_file)
             for row in df.values:
-                # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group
-                # "group" set to None, by default as no such entity for this data
                 yield dict(
                     features=row[self._DATA_REPRESENTATION_IDX],
                     ident=row[self._ID_IDX],
@@ -184,13 +211,16 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 
         df_go_data = pd.DataFrame(data_go)
         train_df_go, df_test = train_test_split(
-            df_go_data, seed=self.dynamic_data_split_seed
+            df_go_data,
+            train_size=self.train_split,
+            random_state=self.dynamic_data_split_seed,
         )
 
         # Get all splits
         df_train, df_val = train_test_split(
             train_df_go,
-            seed=self.dynamic_data_split_seed,
+            train_size=self.train_split,
+            random_state=self.dynamic_data_split_seed,
         )
 
         return df_train, df_val, df_test
@@ -198,18 +228,34 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     # ------------------------------ Phase: Raw Properties -----------------------------------
     @property
     def base_dir(self) -> str:
-        return os.path.join(self._go_extractor.base_dir, "Pretraining")
+        """
+        str: The base directory for pretraining data storage.
+        """
+        return os.path.join(self._go_uniprot_extractor.base_dir, "Pretraining")
 
     @property
     def raw_dir(self) -> str:
         """Name of the directory where the raw data is stored."""
-        return self._go_extractor.raw_dir
+        return self._go_uniprot_extractor.raw_dir
 
 
 class SwissProteinPretrain(_ProteinPretrainingData):
+    """
+    Data module for Swiss-Prot protein pretraining, inheriting from `_ProteinPretrainingData`.
+    This class is specifically designed to handle data processing and loading for Swiss-Prot-based protein datasets.
+
+    Attributes:
+        READER (Type): The data reader class used to load and process protein pretraining data.
+    """
 
     READER = ProteinPretrainReader
 
     @property
     def _name(self) -> str:
+        """
+        The name identifier for this data module.
+
+        Returns:
+            str: A string identifier, "SwissProteinPretrain", representing the name of this data module.
+        """
         return "SwissProteinPretrain"

From d8e2efb97c940b57339f49af5450023c9216e59e Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 25 Oct 2024 22:51:59 +0200
Subject: [PATCH 086/112] Revert "new data reader for protein pretraining data"

This reverts commit 2c446dcb2925b199465bb2472f296a2cd2728d8d.
---
 .../datasets/protein_pretraining.py             |  4 ++--
 chebai/preprocessing/reader.py                  | 17 -----------------
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index 36ddf07b..1ffc17c2 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -17,7 +17,7 @@
     EXPERIMENTAL_EVIDENCE_CODES,
     GOUniProtOver250,
 )
-from chebai.preprocessing.reader import ProteinPretrainReader
+from chebai.preprocessing.reader import ProteinDataReader
 
 
 class _ProteinPretrainingData(_DynamicDataset, ABC):
@@ -248,7 +248,7 @@ class SwissProteinPretrain(_ProteinPretrainingData):
         READER (Type): The data reader class used to load and process protein pretraining data.
     """
 
-    READER = ProteinPretrainReader
+    READER = ProteinDataReader
 
     @property
     def _name(self) -> str:
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
index ba16007d..e220e1e4 100644
--- a/chebai/preprocessing/reader.py
+++ b/chebai/preprocessing/reader.py
@@ -469,20 +469,3 @@ def on_finish(self) -> None:
             print(f"Saving {len(self.cache)} tokens to {self.token_path}...")
             print(f"First 10 tokens: {self.cache[:10]}")
             pk.writelines([f"{c}\n" for c in self.cache])
-
-
-class ProteinPretrainReader(ProteinDataReader):
-    def _read_components(self, row: Dict[str, Any]) -> Dict[str, Any]:
-        """Read and return components from the row."""
-        return dict(
-            features=self._get_raw_data(row),
-            ident=self._get_raw_id(row),
-        )
-
-    def to_data(self, row: Dict[str, Any]) -> Dict[str, Any]:
-        """Convert raw row data to processed data."""
-        d = self._read_components(row)
-        return dict(
-            features=self._read_data(d["features"]),
-            ident=self._read_id(d["ident"]),
-        )

From fc50c3174766d96f37b58ec42283f4d08a8a7605 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Fri, 25 Oct 2024 22:54:37 +0200
Subject: [PATCH 087/112] pretrain: set labels to None instead of using new
 reader

---
 chebai/preprocessing/datasets/protein_pretraining.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index 1ffc17c2..38876887 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -171,6 +171,7 @@ def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, No
             Dict[str, Any]: A dictionary containing:
                 - `features` (str): The sequence data from the file.
                 - `ident` (Any): The identifier from row index 0.
+                - `labels`: Set to None
         """
         with open(input_file_path, "rb") as input_file:
             df = pd.read_pickle(input_file)
@@ -178,6 +179,7 @@ def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, No
                 yield dict(
                     features=row[self._DATA_REPRESENTATION_IDX],
                     ident=row[self._ID_IDX],
+                    labels=None,
                 )
 
     # ------------------------------ Phase: Dynamic Splits -----------------------------------

From 6b0bcf066728d2002ac1708aaf78c18dc5344cc9 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 26 Oct 2024 11:56:03 +0200
Subject: [PATCH 088/112] add workflow for token files

- workflow tested in dummy PR #62
---
 .github/workflows/token_consistency.yaml | 125 +++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 .github/workflows/token_consistency.yaml

diff --git a/.github/workflows/token_consistency.yaml b/.github/workflows/token_consistency.yaml
new file mode 100644
index 00000000..fd36f8e0
--- /dev/null
+++ b/.github/workflows/token_consistency.yaml
@@ -0,0 +1,125 @@
+name: Check consistency of tokens.txt file
+
+# Define the file paths under `paths` to trigger this check only when specific files are modified.
+# This script will then execute checks only on files that have changed, rather than all files listed in `paths`.
+
+# **Note** : To add a new token file for checks, include its path in:
+# - `on` -> `push` and `pull_request` sections
+# - `jobs` -> `check_tokens` -> `steps` -> Set global variable for multiple tokens.txt paths -> `TOKENS_FILES`
+
+on:
+  push:
+    paths:
+      - "chebai/preprocessing/bin/smiles_token/tokens.txt"
+      - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt"
+      - "chebai/preprocessing/bin/selfies/tokens.txt"
+      - "chebai/preprocessing/bin/protein_token/tokens.txt"
+      - "chebai/preprocessing/bin/graph_properties/tokens.txt"
+      - "chebai/preprocessing/bin/graph/tokens.txt"
+      - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
+  pull_request:
+    paths:
+      - "chebai/preprocessing/bin/smiles_token/tokens.txt"
+      - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt"
+      - "chebai/preprocessing/bin/selfies/tokens.txt"
+      - "chebai/preprocessing/bin/protein_token/tokens.txt"
+      - "chebai/preprocessing/bin/graph_properties/tokens.txt"
+      - "chebai/preprocessing/bin/graph/tokens.txt"
+      - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
+
+jobs:
+  check_tokens:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Get list of changed files
+        id: changed_files
+        run: |
+          git fetch origin dev
+
+          # Get the list of changed files compared to origin/dev and save them to a file
+          git diff --name-only origin/dev > changed_files.txt
+
+          # Print the names of changed files on separate lines
+          echo "Changed files:"
+          while read -r line; do
+            echo "Changed File name : $line"
+          done < changed_files.txt
+
+      - name: Set global variable for multiple tokens.txt paths
+        run: |
+          # All token files that need to be checked must be listed here too, same as in `paths`.
+          TOKENS_FILES=(
+            "chebai/preprocessing/bin/smiles_token/tokens.txt"
+            "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt"
+            "chebai/preprocessing/bin/selfies/tokens.txt"
+            "chebai/preprocessing/bin/protein_token/tokens.txt"
+            "chebai/preprocessing/bin/graph_properties/tokens.txt"
+            "chebai/preprocessing/bin/graph/tokens.txt"
+            "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
+          )
+          echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> "$GITHUB_ENV"
+
+      - name: Process only changed tokens.txt files
+        run: |
+          # Convert the space-separated TOKENS_FILES environment variable back into an array
+          TOKENS_FILES=(${TOKENS_FILES})
+
+          # Iterate over each token file path
+          for TOKENS_FILE_PATH in "${TOKENS_FILES[@]}"; do
+            # Check if the path is in the list of changed files (exact, literal whole-line match)
+            if grep -qxF -- "$TOKENS_FILE_PATH" changed_files.txt; then
+              echo "----------------------- Processing $TOKENS_FILE_PATH -----------------------"
+
+              # Diff the checked-out token file against its version on origin/dev
+              git fetch origin dev
+              git diff origin/dev -- "$TOKENS_FILE_PATH" > tokens_diff.txt || echo "No previous tokens.txt found for $TOKENS_FILE_PATH"
+
+              # Check for deleted or added lines in tokens.txt
+              if [ -f tokens_diff.txt ]; then
+
+                # Check for deleted lines (lines starting with '-')
+                deleted_lines=$(grep '^-' tokens_diff.txt | grep -v '^---' | sed 's/^-//' || true)
+                if [ -n "$deleted_lines" ]; then
+                  echo "Error: Lines have been deleted from $TOKENS_FILE_PATH."
+                  echo -e "Deleted Lines: \n$deleted_lines"
+                  exit 1
+                fi
+
+                # Check for added lines (lines starting with '+')
+                added_lines=$(grep '^+' tokens_diff.txt | grep -v '^+++' | sed 's/^+//' || true)
+                if [ -n "$added_lines" ]; then
+
+                  # Count how many lines have been added
+                  num_added_lines=$(echo "$added_lines" | wc -l)
+
+                  # Get last `n` lines (equal to num_added_lines) of tokens.txt
+                  last_lines=$(tail -n "$num_added_lines" "$TOKENS_FILE_PATH")
+
+                  # Check if the added lines are at the end of the file
+                  if [ "$added_lines" != "$last_lines" ]; then
+
+                    # Find lines that were added but not appended at the end of the file
+                    non_appended_lines=$(diff <(echo "$added_lines") <(echo "$last_lines") | grep '^<' | sed 's/^< //')
+
+                    echo "Error: New lines have been added to $TOKENS_FILE_PATH, but they are not at the end of the file."
+                    echo -e "Added lines that are not at the end of the file: \n$non_appended_lines"
+                    exit 1
+                  fi
+                fi
+
+                if [ "$added_lines" == "" ]; then
+                    echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and no new lines were added."
+                else
+                    echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and new lines were correctly appended at the end."
+                fi
+              else
+                echo "No previous version of $TOKENS_FILE_PATH found."
+              fi
+            else
+              echo "$TOKENS_FILE_PATH was not changed, skipping."
+            fi
+          done

From 8e91ca73a69ffa31682aa00720b5825215d98238 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Tue, 29 Oct 2024 18:17:27 +0100
Subject: [PATCH 089/112] minor changes to GO notebook

---
 tutorials/data_exploration_go.ipynb | 55 +++++++++++++----------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index bc70b200..3e1041d5 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "440f203ceaf7e4b7",
    "metadata": {
     "ExecuteTime": {
@@ -38,13 +38,11 @@
     }
    },
    "outputs": [],
-   "source": [
-    "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250"
-   ]
+   "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250"
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "id": "a648346d81d0dc5e",
    "metadata": {
     "ExecuteTime": {
@@ -91,11 +89,11 @@
     "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n",
     "\n",
     "\n",
-    "# Available ChEBI Data Classes\n",
+    "# Available Data Classes\n",
     "\n",
-    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n",
+    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py).\n",
     "\n",
-    "There is a range of available dataset classes for GOUniProt classes. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. The selection process is based on the annotations of the GO terms with its ancestors across the dataset.\n",
+    "There is a range of available dataset classes for GOUniProt classes. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. The selection process is based on the annotations of the GO terms with its ancestors across the dataset. For instance, GOUniProtOver50 will only select labels which have at least 50 samples in the dataset.\n",
     "\n",
     "Refer `select_classes` method of `_GOUniProtOverX` for more details on selection process.\n",
     "\n",
@@ -291,7 +289,7 @@
    "source": [
     "## <u>uniprot_sprot.dat</u> File\n",
     "\n",
-    "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n",
+    "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotations. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n",
     "\n",
     "\n",
     "### Example of a Protein Entry\n",
@@ -371,8 +369,6 @@
     "  - **Description**: The amino acid sequence of the protein.\n",
     "  - **Example**: The sequence consists of 320 amino acids.\n",
     "\n",
-    "The `uniprot_sprot.dat` file is an extensively curated resource, containing comprehensive protein data used for various bioinformatics applications.\n",
-    "\n",
     "__Note__: For more detailed information refer [here](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt\n",
     "). \n",
     "\n",
@@ -761,7 +757,7 @@
    "source": [
     "## <u>classes.txt</u> File\n",
     "\n",
-    "**Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis."
+    "**Description**: This file lists the GO classes that are used as labels. It can be used to match labels in `data.pt` with GO classes: For position `i` in the label-tensor, the GO-ID is in line `i` of `classes.txt`"
    ]
   },
   {
@@ -801,9 +797,7 @@
    "source": [
     "**File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
     "\n",
-    "The `classes.txt` file lists selected Swiss Proteins classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique Swiss Protein class ID, identifying specific protein from Swiss-UniProt dataset.\n",
-    "\n",
-    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks."
+    "The `classes.txt` file lists selected GO classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique GO class ID."
    ]
   },
   {
@@ -908,7 +902,7 @@
    "source": [
     "**File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n",
     "\n",
-    "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run."
+    "To reuse an existing split, you can use the `splits_file_path` argument. This way, you can reuse the same datasplit across several runs."
    ]
   },
   {
@@ -947,7 +941,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 19,
    "id": "e0cf4fb6-2ca4-4b85-a4e7-0cfbac5cd6c1",
    "metadata": {},
    "outputs": [],
@@ -957,33 +951,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 23,
    "id": "e8343d83-0be3-44df-9224-bba8d5c32336",
    "metadata": {},
    "outputs": [],
    "source": [
+    "protein_dr_3gram = ProteinDataReader(n_gram=3)\n",
     "protein_dr = ProteinDataReader()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 24,
    "id": "8a18dc27-f308-4dde-b1ae-b03a20fb0d45",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "[10, 16, 11, 17, 17, 12, 17, 28, 17, 24, 25, 17, 23, 17, 14, 14, 17, 13, 21]"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[10, 16, 11, 17, 17, 12, 17, 28, 17, 24, 25, 17, 23, 17, 14, 14, 17, 13, 21]\n",
+      "[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]\n"
+     ]
     }
    ],
    "source": [
-    "protein_dr._read_data(\"MRSLLILVLCFLPLAALGK\")"
+    "protein = \"MRSLLILVLCFLPLAALGK\"\n",
+    "print(protein_dr._read_data(protein))\n",
+    "print(protein_dr_3gram._read_data(protein))"
    ]
   },
   {
@@ -1106,7 +1101,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -1120,7 +1115,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,

From 66dd504706e1eefa27b41b73798113ef81f56517 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Wed, 30 Oct 2024 14:21:42 +0100
Subject: [PATCH 090/112] Update protein_pretraining.py

---
 .../datasets/protein_pretraining.py            | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index 38876887..8550db2b 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -40,8 +40,20 @@ def __init__(self, **kwargs):
         """
         self._go_uniprot_extractor = GOUniProtOver250()
         assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES
+
+        self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002))
+        assert (
+            self.max_sequence_length >= 1
+        ), "Max sequence length should be greater than or equal to 1."
+
         super(_ProteinPretrainingData, self).__init__(**kwargs)
 
+        if self.reader.n_gram is not None:
+            assert self.max_sequence_length >= self.reader.n_gram, (
+                f"max_sequence_length ({self.max_sequence_length}) must be greater than "
+                f"or equal to n_gram ({self.reader.n_gram})."
+            )
+
     # ------------------------------ Phase: Prepare data -----------------------------------
     def prepare_data(self, *args: Any, **kwargs: Any) -> None:
         """
@@ -120,6 +132,10 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
                 # Consider protein with only sequence representation
                 continue
 
+            if len(record.sequence) > self.max_sequence_length:
+                # Skip proteins whose sequence is longer than max_sequence_length
+                continue
+
             if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
                 # Skip proteins with ambiguous amino acid codes
                 continue
@@ -260,4 +276,4 @@ def _name(self) -> str:
         Returns:
             str: A string identifier, "SwissProteinPretrain", representing the name of this data module.
         """
-        return "SwissProteinPretrain"
+        return f"Swiss_{self.max_sequence_length}"

From ba800d944b4e3e48339631a7430b16460d4d0398 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Wed, 30 Oct 2024 19:37:21 +0100
Subject: [PATCH 091/112] add protein trigram token file to action workflow

---
 .github/workflows/token_consistency.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/token_consistency.yaml b/.github/workflows/token_consistency.yaml
index fd36f8e0..06c3a42e 100644
--- a/.github/workflows/token_consistency.yaml
+++ b/.github/workflows/token_consistency.yaml
@@ -17,6 +17,7 @@ on:
       - "chebai/preprocessing/bin/graph_properties/tokens.txt"
       - "chebai/preprocessing/bin/graph/tokens.txt"
       - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
+      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
   pull_request:
     paths:
       - "chebai/preprocessing/bin/smiles_token/tokens.txt"
@@ -26,6 +27,7 @@ on:
       - "chebai/preprocessing/bin/graph_properties/tokens.txt"
       - "chebai/preprocessing/bin/graph/tokens.txt"
       - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
+      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
 
 jobs:
   check_tokens:
@@ -60,6 +62,7 @@ jobs:
             "chebai/preprocessing/bin/graph_properties/tokens.txt"
             "chebai/preprocessing/bin/graph/tokens.txt"
             "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
+            "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
           )
           echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> $GITHUB_ENV
 

From 32ff64bd4b6f345828cf76221e8a68f3214d33d8 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 31 Oct 2024 14:21:09 +0100
Subject: [PATCH 092/112] add python script to export constants to json

---
 .github/workflows/export_constants.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 .github/workflows/export_constants.py

diff --git a/.github/workflows/export_constants.py b/.github/workflows/export_constants.py
new file mode 100644
index 00000000..6421498a
--- /dev/null
+++ b/.github/workflows/export_constants.py
@@ -0,0 +1,22 @@
+import json
+
+from chebai.preprocessing.reader import (
+    CLS_TOKEN,
+    EMBEDDING_OFFSET,
+    MASK_TOKEN_INDEX,
+    PADDING_TOKEN_INDEX,
+)
+
+# Define the constants you want to export
+# Any change to the key names here must be mirrored in verify_constants.yml
+constants = {
+    "EMBEDDING_OFFSET": EMBEDDING_OFFSET,
+    "CLS_TOKEN": CLS_TOKEN,
+    "PADDING_TOKEN_INDEX": PADDING_TOKEN_INDEX,
+    "MASK_TOKEN_INDEX": MASK_TOKEN_INDEX,
+}
+
+if __name__ == "__main__":
+    # Write constants to a JSON file
+    with open("constants.json", "w") as f:
+        json.dump(constants, f)

From abd3602080bcfbac20bfad9f6733b9f98b32ddb5 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Thu, 31 Oct 2024 14:21:46 +0100
Subject: [PATCH 093/112] add workflow action to check constants

---
 .github/workflows/verify_constants.yml | 116 +++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 .github/workflows/verify_constants.yml

diff --git a/.github/workflows/verify_constants.yml b/.github/workflows/verify_constants.yml
new file mode 100644
index 00000000..3246f64d
--- /dev/null
+++ b/.github/workflows/verify_constants.yml
@@ -0,0 +1,116 @@
+name: Verify Constants
+
+# Define the file paths under `paths` to trigger this check only when specific files are modified.
+# This script will then execute checks only on files that have changed, rather than all files listed in `paths`.
+
+# **Note** : To add a new file for checks, include its path in:
+# - `on` -> `push` and `pull_request` sections
+# - `jobs` -> `verify-constants` -> `steps` -> Verify constants -> Add a new if else for your file, with check logic inside it.
+
+
+on:
+  push:
+    paths:
+      - "chebai/preprocessing/reader.py"
+  pull_request:
+    paths:
+      - "chebai/preprocessing/reader.py"
+
+jobs:
+  verify-constants:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [
+#       Only use 3.10 as of now
+#          "3.9",
+          "3.10",
+#          "3.11"
+        ]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set PYTHONPATH
+        run: echo "PYTHONPATH=$PWD" >> $GITHUB_ENV
+
+      - name: Get list of changed files
+        id: changed_files
+        run: |
+          git fetch origin dev
+
+          # Get the list of changed files compared to origin/dev and save them to a file
+          git diff --name-only origin/dev > changed_files.txt
+
+          # Print the names of changed files on separate lines
+          echo "Changed files:"
+          while read -r line; do
+            echo "Changed File name : $line"
+          done < changed_files.txt
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        # Pin torch to a fixed version because the latest release (2.5.1) fails with:
+        # ImportError: cannot import name 'T_co' from 'torch.utils.data.dataset'
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade pip setuptools wheel
+          python -m pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
+          python -m pip install -e .
+
+      - name: Export constants
+        run: python .github/workflows/export_constants.py
+
+      - name: Load constants into environment variables
+        id: load_constants
+        # "E_" is prepended as a prefix to every constant name, to avoid overwriting other system env variables with the same name
+        run: |
+          constants=$(cat constants.json)
+          echo "$constants" | jq -r 'to_entries|map("E_\(.key)=\(.value|tostring)")|.[]' >> $GITHUB_ENV
+
+      - name: Print all environment variables
+        run: printenv
+
+      - name: Verify constants
+        run: |
+          file_name="chebai/preprocessing/reader.py"
+          if grep -q "$file_name" changed_files.txt; then
+            echo "----------------------- Checking file : $file_name ----------------------- "
+
+            # Define expected values for constants
+            exp_embedding_offset="10"
+            exp_cls_token="2"
+            exp_padding_token_index="0"
+            exp_mask_token_index="1"
+
+            # Debugging output to check environment variables
+            echo "Current Environment Variables:"
+            echo "E_EMBEDDING_OFFSET = $E_EMBEDDING_OFFSET"
+            echo "Expected: $exp_embedding_offset"
+
+            # Verify constants match expected values
+            if [ "$E_EMBEDDING_OFFSET" != "$exp_embedding_offset" ]; then
+              echo "EMBEDDING_OFFSET ($E_EMBEDDING_OFFSET) does not match expected value ($exp_embedding_offset)!"
+              exit 1
+            fi
+            if [ "$E_CLS_TOKEN" != "$exp_cls_token" ]; then
+              echo "CLS_TOKEN ($E_CLS_TOKEN) does not match expected value ($exp_cls_token)!"
+              exit 1
+            fi
+            if [ "$E_PADDING_TOKEN_INDEX" != "$exp_padding_token_index" ]; then
+              echo "PADDING_TOKEN_INDEX ($E_PADDING_TOKEN_INDEX) does not match expected value ($exp_padding_token_index)!"
+              exit 1
+            fi
+            if [ "$E_MASK_TOKEN_INDEX" != "$exp_mask_token_index" ]; then
+              echo "MASK_TOKEN_INDEX ($E_MASK_TOKEN_INDEX) does not match expected value ($exp_mask_token_index)!"
+              exit 1
+            fi
+          else
+            echo "$file_name not found in changed_files.txt; skipping check."
+          fi

From 5082829df4d3e271f696cc0e60d45b9b3b92c25f Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 10:29:24 +0100
Subject: [PATCH 094/112] GO: add cell to change the cwd to project root dir

---
 tutorials/data_exploration_go.ipynb | 34 +++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index eb39e238..f98da5ee 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -27,6 +27,35 @@
     "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a4d590fb-9a83-456e-9cb4-303caa8203e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Already in the project root directory: G:\\github-aditya0by0\\python-chebai\n"
+     ]
+    }
+   ],
+   "source": [
+    "# To run this notebook, you need to change the working directory of the jupyter notebook to root dir of the project.\n",
+    "import os\n",
+    "\n",
+    "# Root directory name of the project\n",
+    "expected_root_dir = \"python-chebai\"\n",
+    "\n",
+    "# Check if the current directory ends with the expected root directory name\n",
+    "if not os.getcwd().endswith(expected_root_dir):\n",
+    "    os.chdir(\"..\")  # Move up one directory level\n",
+    "    if os.getcwd().endswith(expected_root_dir):\n",
+    "        print(\"Changed to project root directory:\", os.getcwd())\n",
+    "    else:\n",
+    "        print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n",
+    "else:\n",
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -38,7 +67,7 @@
     }
    },
    "outputs": [],
-   "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250"
+   ]
   },
   {
    "cell_type": "code",
@@ -1118,6 +1147,7 @@
  "metadata": {
   "kernelspec": {
    "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1131,7 +1161,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,

From 661a78a12614e6a0d05ec84b1410d41ca31d913f Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 10:32:37 +0100
Subject: [PATCH 095/112] chebi: add cell to change the cwd to project root dir

---
 tutorials/data_exploration_chebi.ipynb | 32 ++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 594e786a..a17b914c 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -27,6 +27,38 @@
     "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "990cc6f2-6b4a-4fa7-905f-dda183c3ec4c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Already in the project root directory: G:\\github-aditya0by0\\python-chebai\n"
+     ]
+    }
+   ],
+   "source": [
+    "# To run this notebook, you need to change the working directory of the jupyter notebook to root dir of the project.\n",
+    "import os\n",
+    "\n",
+    "# Root directory name of the project\n",
+    "expected_root_dir = \"python-chebai\"\n",
+    "\n",
+    "# Check if the current directory ends with the expected root directory name\n",
+    "if not os.getcwd().endswith(expected_root_dir):\n",
+    "    os.chdir(\"..\")  # Move up one directory level\n",
+    "    if os.getcwd().endswith(expected_root_dir):\n",
+    "        print(\"Changed to project root directory:\", os.getcwd())\n",
+    "    else:\n",
+    "        print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n",
+    "else:\n",
+    "    print(\"Already in the project root directory:\", os.getcwd())"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,

From eecc96f5cfece2e341258ff0c194c269fdbbe647 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 10:57:07 +0100
Subject: [PATCH 096/112] GO: use split file to create new data class

---
 tutorials/data_exploration_go.ipynb | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index f98da5ee..84eb2e14 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -56,6 +56,9 @@
     "    else:\n",
     "        print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n",
     "else:\n",
+    "    print(\"Already in the project root directory:\", os.getcwd())"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -934,6 +937,22 @@
     "To reuse an existing split, you can use the `splits_file_path` argument. This way, you can reuse the same datasplit across several runs."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "2b02d8b4-c2de-4b8e-b680-ec67b40d9a30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can specify a literal path for the `splits_file_path`, or if another `go_class` instance is already defined, \n",
+    "# you can use its existing `splits_file_path` attribute for consistency.\n",
+    "go_class_with_splits = GOUniProtOver250(\n",
+    "    go_branch=\"BP\", \n",
+    "    # splits_file_path=\"data/GO_UniProt/GO250_BP_1002/processed/splits.csv\",  # Literal path option\n",
+    "    splits_file_path=go_class.splits_file_path  # Use path from an existing `go_class` instance\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e6b1f184a5091b83",

From c05b8684bf1d0aaf5d86aed19360b82e01054b7f Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 11:17:27 +0100
Subject: [PATCH 097/112] GO: fix json parsing

---
 tutorials/data_exploration_go.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 84eb2e14..0078cfb3 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -70,7 +70,7 @@
     }
    },
    "outputs": [],
-   ]
+   "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250"
   },
   {
    "cell_type": "code",

From b83e5cdce49dd9d5c507f0fb112c1b6ba4dd3e0d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 11:23:35 +0100
Subject: [PATCH 098/112] chebi: use split file to create new data class

---
 tutorials/data_exploration_chebi.ipynb | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index a17b914c..e9454986 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -840,6 +840,22 @@
     "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "6dc3fd6c-7cf6-47ef-812f-54319a0cdeb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can specify a literal path for the `splits_file_path`, or if another `chebi_class` instance is already defined, \n",
+    "# you can use its existing `splits_file_path` attribute for consistency.\n",
+    "chebi_class_with_splits = ChEBIOver50(\n",
+    "    chebi_version=231, \n",
+    "    # splits_file_path=\"data/chebi_v231/ChEBI50/processed/splits.csv\",  # Literal path option\n",
+    "    splits_file_path=chebi_class.splits_file_path  # Use path from an existing `chebi_class` instance\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",

From b5abb0af4108131d4d284606b9f2cdea6f08aaa0 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 11:52:37 +0100
Subject: [PATCH 099/112] go: changes to data as per new code change

---
 tutorials/data_exploration_go.ipynb | 285 +++++++++++++++++++++-------
 1 file changed, 215 insertions(+), 70 deletions(-)

diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 0078cfb3..7173bf80 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -112,6 +112,8 @@
     "  - **`\"MF\"`**: Molecular Function branch.\n",
     "  - **`\"CC\"`**: Cellular Component branch.\n",
     "\n",
+    "- **`max_sequence_length (int)`**: Specifies the maximum allowed sequence length for a protein, with a default of `1002`. During data preprocessing, any proteins exceeding this length will be excluded from further processing.\n",
+    "\n",
     "This allows for more specific datasets focused on a particular aspect of gene function.\n",
     "\n",
     "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
@@ -152,10 +154,55 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "9f77351090560bc4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\n",
+      "Missing processed data file (`data.pkl` file)\n",
+      "Extracting class hierarchy...\n",
+      "Compute transitive closure\n",
+      "Processing graph\n",
+      "Parsing swiss uniprot raw data....\n",
+      "Selecting GO terms based on given threshold: 250 ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\\protein_token\n",
+      "Cross-validation enabled: False\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Missing transformed data (`data.pt` file). Transforming data.... \n",
+      "Processing 53604 lines...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|███████████████████████████████████████████████████████████████████████████| 53604/53604 [01:18<00:00, 678.84it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving 20 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\protein_token\\tokens.txt...\n",
+      "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n"
+     ]
+    }
+   ],
    "source": [
     "go_class.prepare_data()\n",
     "go_class.setup()"
@@ -459,7 +506,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Size of the data (rows x columns):  (32933, 1049)\n"
+      "Size of the data (rows x columns):  (53604, 902)\n"
      ]
     },
     {
@@ -494,13 +541,13 @@
        "      <th>209</th>\n",
        "      <th>226</th>\n",
        "      <th>...</th>\n",
+       "      <th>1990778</th>\n",
+       "      <th>2000026</th>\n",
        "      <th>2000145</th>\n",
        "      <th>2000146</th>\n",
        "      <th>2000147</th>\n",
        "      <th>2000241</th>\n",
-       "      <th>2000242</th>\n",
        "      <th>2000243</th>\n",
-       "      <th>2000377</th>\n",
        "      <th>2001141</th>\n",
        "      <th>2001233</th>\n",
        "      <th>2001234</th>\n",
@@ -508,11 +555,11 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>14331_ARATH</td>\n",
-       "      <td>P42643,Q945M2,Q9M0S7</td>\n",
-       "      <td>[19222]</td>\n",
-       "      <td>MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...</td>\n",
+       "      <th>1</th>\n",
+       "      <td>11S1_CARIL</td>\n",
+       "      <td>B5KVH4</td>\n",
+       "      <td>[3006, 8150, 9791, 10431, 21700, 22414, 32501,...</td>\n",
+       "      <td>MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
@@ -532,11 +579,11 @@
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>14331_CAEEL</td>\n",
-       "      <td>P41932,Q21537</td>\n",
-       "      <td>[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...</td>\n",
-       "      <td>MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...</td>\n",
+       "      <th>3</th>\n",
+       "      <td>11S2_SESIN</td>\n",
+       "      <td>Q9XHP0</td>\n",
+       "      <td>[3006, 8150, 10431, 21700, 22414, 32502, 48609]</td>\n",
+       "      <td>MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
@@ -556,11 +603,11 @@
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>14331_MAIZE</td>\n",
-       "      <td>P49106</td>\n",
-       "      <td>[3677, 5634, 10468, 44877]</td>\n",
-       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
+       "      <th>6</th>\n",
+       "      <td>14310_ARATH</td>\n",
+       "      <td>P48347,Q9LME5</td>\n",
+       "      <td>[7165, 8150, 9742, 9755, 9987, 43401, 50789, 5...</td>\n",
+       "      <td>MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
@@ -580,11 +627,11 @@
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>14332_MAIZE</td>\n",
-       "      <td>Q01526</td>\n",
-       "      <td>[3677, 5634, 10468, 44877]</td>\n",
-       "      <td>MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...</td>\n",
+       "      <th>8</th>\n",
+       "      <td>14331_ARATH</td>\n",
+       "      <td>P42643,Q945M2,Q9M0S7</td>\n",
+       "      <td>[8150, 19222, 50789, 65007]</td>\n",
+       "      <td>MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
@@ -604,17 +651,17 @@
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>14333_ARATH</td>\n",
-       "      <td>P42644,F4KBI7,Q945L2</td>\n",
-       "      <td>[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...</td>\n",
-       "      <td>MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...</td>\n",
-       "      <td>False</td>\n",
+       "      <th>9</th>\n",
+       "      <td>14331_CAEEL</td>\n",
+       "      <td>P41932,Q21537</td>\n",
+       "      <td>[132, 226, 1708, 6611, 6810, 6886, 6913, 6950,...</td>\n",
+       "      <td>MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
+       "      <td>True</td>\n",
        "      <td>...</td>\n",
        "      <td>False</td>\n",
        "      <td>False</td>\n",
@@ -629,46 +676,46 @@
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>5 rows × 1049 columns</p>\n",
+       "<p>5 rows × 902 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "       swiss_id             accession  \\\n",
-       "8   14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
-       "9   14331_CAEEL         P41932,Q21537   \n",
-       "10  14331_MAIZE                P49106   \n",
-       "13  14332_MAIZE                Q01526   \n",
-       "14  14333_ARATH  P42644,F4KBI7,Q945L2   \n",
+       "      swiss_id             accession  \\\n",
+       "1   11S1_CARIL                B5KVH4   \n",
+       "3   11S2_SESIN                Q9XHP0   \n",
+       "6  14310_ARATH         P48347,Q9LME5   \n",
+       "8  14331_ARATH  P42643,Q945M2,Q9M0S7   \n",
+       "9  14331_CAEEL         P41932,Q21537   \n",
        "\n",
-       "                                               go_ids  \\\n",
-       "8                                             [19222]   \n",
-       "9   [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...   \n",
-       "10                         [3677, 5634, 10468, 44877]   \n",
-       "13                         [3677, 5634, 10468, 44877]   \n",
-       "14  [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...   \n",
+       "                                              go_ids  \\\n",
+       "1  [3006, 8150, 9791, 10431, 21700, 22414, 32501,...   \n",
+       "3    [3006, 8150, 10431, 21700, 22414, 32502, 48609]   \n",
+       "6  [7165, 8150, 9742, 9755, 9987, 43401, 50789, 5...   \n",
+       "8                        [8150, 19222, 50789, 65007]   \n",
+       "9  [132, 226, 1708, 6611, 6810, 6886, 6913, 6950,...   \n",
        "\n",
-       "                                             sequence     41     75    122  \\\n",
-       "8   MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
-       "9   MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
-       "10  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
-       "13  MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...  False  False  False   \n",
-       "14  MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...  False  False  False   \n",
+       "                                            sequence     41     75    122  \\\n",
+       "1  MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...  False  False  False   \n",
+       "3  MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...  False  False  False   \n",
+       "6  MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...  False  False  False   \n",
+       "8  MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...  False  False  False   \n",
+       "9  MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...  False  False  False   \n",
        "\n",
-       "      165    209    226  ...  2000145  2000146  2000147  2000241  2000242  \\\n",
-       "8   False  False  False  ...    False    False    False    False    False   \n",
-       "9   False  False  False  ...    False    False    False    False    False   \n",
-       "10  False  False  False  ...    False    False    False    False    False   \n",
-       "13  False  False  False  ...    False    False    False    False    False   \n",
-       "14  False  False  False  ...    False    False    False    False    False   \n",
+       "     165    209    226  ...  1990778  2000026  2000145  2000146  2000147  \\\n",
+       "1  False  False  False  ...    False    False    False    False    False   \n",
+       "3  False  False  False  ...    False    False    False    False    False   \n",
+       "6  False  False  False  ...    False    False    False    False    False   \n",
+       "8  False  False  False  ...    False    False    False    False    False   \n",
+       "9  False  False   True  ...    False    False    False    False    False   \n",
        "\n",
-       "    2000243  2000377  2001141  2001233  2001234  \n",
-       "8     False    False    False    False    False  \n",
-       "9     False    False    False    False    False  \n",
-       "10    False    False    False    False    False  \n",
-       "13    False    False    False    False    False  \n",
-       "14    False    False    False    False    False  \n",
+       "   2000241  2000243  2001141  2001233  2001234  \n",
+       "1    False    False    False    False    False  \n",
+       "3    False    False    False    False    False  \n",
+       "6    False    False    False    False    False  \n",
+       "8    False    False    False    False    False  \n",
+       "9    False    False    False    False    False  \n",
        "\n",
-       "[5 rows x 1049 columns]"
+       "[5 rows x 902 columns]"
       ]
      },
      "execution_count": 8,
@@ -720,7 +767,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "85b097601fb242d6",
    "metadata": {
     "ExecuteTime": {
@@ -735,7 +782,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "id": "289a54a71dec20fb",
    "metadata": {
     "ExecuteTime": {
@@ -750,7 +797,106 @@
      "text": [
       "Type of loaded data: <class 'list'>\n",
       "Content of the data file: \n",
-      " {'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n"
+      " {'features': [10, 14, 21, 23, 12, 17, 17, 11, 12, 22, 17, 24, 17, 12, 12, 28, 14, 17, 25, 19, 13, 24, 17, 14, 18, 11, 13, 13, 16, 18, 18, 29, 21, 25, 13, 18, 24, 18, 17, 19, 16, 17, 20, 14, 17, 27, 23, 15, 19, 16, 12, 27, 14, 27, 14, 13, 28, 12, 27, 11, 26, 20, 23, 19, 29, 18, 18, 17, 18, 24, 14, 13, 28, 14, 28, 28, 16, 16, 15, 12, 27, 23, 19, 13, 17, 17, 17, 23, 29, 22, 11, 19, 14, 23, 18, 17, 28, 22, 12, 14, 16, 13, 16, 13, 12, 15, 13, 28, 17, 25, 23, 13, 24, 23, 27, 15, 25, 27, 27, 11, 18, 16, 18, 11, 18, 18, 13, 18, 16, 16, 27, 25, 18, 18, 20, 16, 29, 18, 21, 12, 16, 29, 25, 16, 27, 13, 20, 12, 12, 14, 25, 23, 14, 13, 28, 14, 29, 26, 24, 22, 19, 20, 13, 11, 11, 23, 28, 28, 14, 12, 25, 17, 17, 20, 15, 29, 19, 19, 14, 19, 18, 17, 20, 18, 19, 23, 16, 19, 25, 22, 17, 14, 13, 19, 23, 20, 20, 27, 25, 16, 23, 18, 13, 18, 18, 27, 22, 27, 18, 29, 16, 16, 18, 18, 18, 29, 18, 18, 16, 16, 13, 27, 29, 13, 27, 18, 18, 16, 20, 17, 13, 19, 19, 28, 25, 11, 13, 25, 20, 14, 27, 25, 17, 14, 20, 14, 25, 19, 28, 20, 15, 27, 15, 14, 16, 16, 17, 18, 11, 27, 19, 20, 29, 16, 13, 11, 12, 28, 16, 28, 27, 13, 16, 18, 17, 18, 28, 12, 16, 23, 16, 26, 11, 16, 27, 27, 18, 27, 29, 27, 27, 16, 21, 27, 16, 27, 16, 27, 16, 27, 11, 27, 11, 27, 16, 16, 18, 11, 16, 16, 13, 13, 16, 20, 20, 19, 13, 17, 27, 27, 15, 12, 24, 15, 17, 11, 17, 16, 27, 19, 12, 13, 20, 23, 11, 16, 14, 20, 12, 22, 15, 27, 27, 14, 13, 16, 12, 11, 15, 28, 19, 11, 29, 19, 17, 23, 12, 17, 16, 26, 17, 18, 17, 11, 14, 27, 16, 13, 14, 17, 22, 11, 20, 14, 17, 22, 28, 23, 29, 26, 19, 17, 19, 14, 29, 11, 28, 28, 22, 14, 17, 16, 13, 16, 14, 27, 28, 18, 28, 28, 20, 19, 25, 13, 18, 15, 28, 25, 20, 20, 27, 17, 16, 27, 13, 18, 17, 17, 15, 12, 23, 18, 19, 25, 14, 28, 28, 21, 16, 14, 16, 20, 27, 13, 25, 27, 26, 28, 11, 25, 21, 15, 19, 27, 19, 14, 10, 28, 11, 23, 17, 14, 13, 16, 15, 11, 14, 12, 16, 14, 17, 23, 27, 27, 28, 17, 28, 19, 14, 25, 18, 12, 23, 16, 27, 20, 14, 16, 16, 17, 21, 25, 19, 16, 18, 27, 11, 15, 17, 28, 16, 11, 16, 11, 
16, 11, 11, 16, 11, 27, 16, 16, 14, 27, 28], 'labels': array([False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False,  True, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False,  True, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False,  True, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False,  True, False, False, False, False, False,  True,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False,  True,  True, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False,  True,\n",
+      "        True, False, False, False, False, False, False, False, False,\n",
+      "        True, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False, False, False,\n",
+      "       False, False, False, False, False, False, False]), 'ident': '11S1_CARIL', 'group': None}\n"
      ]
     }
    ],
@@ -794,7 +940,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "id": "19200f7ff9a6ebba",
    "metadata": {
     "ExecuteTime": {
@@ -844,7 +990,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "id": "88c3ea8f01ba9fac",
    "metadata": {
     "ExecuteTime": {
@@ -917,7 +1063,7 @@
        "4  14333_ARATH  train"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1165,7 +1311,6 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"

From f659311a30272a520e56f208c419c4c5a51a740d Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 13:21:05 +0100
Subject: [PATCH 100/112] add output for prepare-setup data cell

---
 tutorials/data_exploration_chebi.ipynb | 47 ++++++++++++++++++++++++--
 tutorials/data_exploration_go.ipynb    |  8 +++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index e9454986..040faf69 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -153,10 +153,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "d0a58e2bd9c0e6d9",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking for processed data in data\\chebi_v231\\ChEBI50\\processed\n",
+      "Missing processed data file (`data.pkl` file)\n",
+      "Missing raw chebi data related to version: v_231, Downloading...\n",
+      "Compute transitive closure\n",
+      "Process graph\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n",
+      "Cross-validation enabled: False\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Missing transformed data (`data.pt` file). Transforming data.... \n",
+      "Processing 185007 lines...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████| 185007/185007 [05:43<00:00, 539.23it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n",
+      "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n"
+     ]
+    }
+   ],
    "source": [
     "chebi_class.prepare_data()\n",
     "chebi_class.setup()"
diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index 7173bf80..aeffe7b9 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -164,6 +164,14 @@
      "text": [
       "Checking for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\n",
       "Missing processed data file (`data.pkl` file)\n",
+	  "Downloading Swiss UniProt data....\n",
+	  "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
+	  "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
+	  "Unzipping the file....\n",
+	  "Unpacked and saved to data\\GO_UniProt\\raw\\uniprot_sprot.dat\n",
+	  "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
+	  "Missing Gene Ontology raw data\n",
+	  "Downloading Gene Ontology data....\n",
       "Extracting class hierarchy...\n",
       "Compute transitive closure\n",
       "Processing graph\n",

From a71b199f3b569ec0d13e5c96e43fdd036308060b Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 16:40:21 +0100
Subject: [PATCH 101/112] update swiss data for pretraining test

---
 tests/unit/mock_data/ontology_mock_data.py | 110 +++++++++++++++------
 1 file changed, 79 insertions(+), 31 deletions(-)

diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index d6feb33d..92a070cb 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -632,17 +632,44 @@ def protein_sequences() -> Dict[str, str]:
             ),
         }
 
+    @staticmethod
+    def proteins_for_pretraining() -> List[str]:
+        """
+        Returns a list of protein IDs which will be used for pretraining based on mock UniProt data.
+
+        Proteins include those with:
+        - No GO classes or invalid GO classes (missing required evidence codes).
+
+        Returns:
+            List[str]: IDs of the mock proteins that have no valid GO class annotation.
+        """
+        return [
+            "Swiss_Prot_5",  # No GO classes associated
+            "Swiss_Prot_6",  # GO class with no evidence code
+            "Swiss_Prot_7",  # GO class with invalid evidence code
+        ]
+
     @staticmethod
     def get_UniProt_raw_data() -> str:
         """
         Get raw data in string format for UniProt proteins.
 
-        This mock data contains six Swiss-Prot proteins with different properties:
-        - Swiss_Prot_1 and Swiss_Prot_2 are valid proteins.
-        - Swiss_Prot_3 has a sequence length greater than 1002.
-        - Swiss_Prot_4 contains "X", a non-valid amino acid in its sequence.
-        - Swiss_Prot_5 has no GO IDs mapped to it.
-        - Swiss_Prot_6 has GO IDs mapped, but no evidence codes.
+        This mock data contains eleven Swiss-Prot proteins with different properties:
+        - **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class.
+        - **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid.
+        - **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002.
+        - **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'X'.
+        - **Swiss_Prot_5**: Has a sequence but no GO classes associated.
+        - **Swiss_Prot_6**: Has GO classes without any associated evidence codes.
+        - **Swiss_Prot_7**: Has a GO class with an invalid evidence code.
+        - **Swiss_Prot_8**: Has a sequence length > 1002 and a single GO class (evidence code IC, listed as valid in the note below).
+        - **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'X', in its sequence.
+        - **Swiss_Prot_10**: Has a valid GO class but lacks a sequence.
+        - **Swiss_Prot_11**: Has only an invalid GO class and lacks a sequence.
+
+        Note:
+        A valid GO label is one that has one of the following evidence codes
+        (EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC).
 
         Returns:
             str: The raw UniProt data in string format.
@@ -650,6 +677,7 @@ def get_UniProt_raw_data() -> str:
         protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"]
         protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"]
         raw_str = (
+            # Below protein with 3 valid associated GO class and one invalid GO class
             f"ID   Swiss_Prot_1              Reviewed;         {len(protein_sq_1)} AA. \n"
             "AC   Q6GZX4;\n"
             "DR   GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n"
@@ -659,6 +687,7 @@ def get_UniProt_raw_data() -> str:
             f"SQ   SEQUENCE   {len(protein_sq_1)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
             f"     {protein_sq_1}\n"
             "//\n"
+            # Below protein with 2 valid associated GO class and one invalid GO class
             f"ID   Swiss_Prot_2              Reviewed;         {len(protein_sq_2)} AA.\n"
             "AC   DCGZX4;\n"
             "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
@@ -668,34 +697,17 @@ def get_UniProt_raw_data() -> str:
             f"SQ   SEQUENCE   {len(protein_sq_2)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
             f"     {protein_sq_2}\n"
             "//\n"
-            "ID   Swiss_Prot_3              Reviewed;         1165 AA.\n"
+            # Below protein with all valid associated GO class but sequence length greater than 1002
+            f"ID   Swiss_Prot_3              Reviewed;         {len(protein_sq_1 * 25)} AA.\n"
             "AC   Q6GZX4;\n"
             "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
             "DR   GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n"
             "DR   GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n"
             "DR   GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
-            "SQ   SEQUENCE   1165 AA;  129118 MW;  FE2984658CED53A8 CRC64;\n"
-            "     MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n"
-            "     GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n"
-            "     DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n"
-            "     DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n"
-            "     KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n"
-            "     EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n"
-            "     SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n"
-            "     KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n"
-            "     KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n"
-            "     WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n"
-            "     NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n"
-            "     NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n"
-            "     SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n"
-            "     GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n"
-            "     YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n"
-            "     AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n"
-            "     AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n"
-            "     NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n"
-            "     ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n"
-            "     DVRVSGVMIY RASEGKPMQY VSLLM\n"
+            f"SQ   SEQUENCE   {len(protein_sq_1 * 25)} AA;  129118 MW;  FE2984658CED53A8 CRC64;\n"
+            f"     {protein_sq_1 * 25}\n"
             "//\n"
+            # Below protein has valid go class association but invalid amino acid `X` in its sequence
             "ID   Swiss_Prot_4              Reviewed;         60 AA.\n"
             "AC   Q6GZX4;\n"
             "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
@@ -705,18 +717,54 @@ def get_UniProt_raw_data() -> str:
             "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
             "     XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
             "//\n"
+            # Below protein with sequence string but has no GO class
             "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
             "AC   Q6GZX4;\n"
             "DR   EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
             "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
             "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
             "//\n"
-            "ID   Swiss_Prot_5              Reviewed;         60 AA.\n"
+            # Below protein with sequence string and with NO `valid` associated GO class (no evidence code)
+            "ID   Swiss_Prot_6              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   GO; GO:0000023; P:regulation of viral transcription;\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            "//\n"
+            # Below protein with sequence string and with NO `valid` associated GO class (invalid evidence code)
+            "ID   Swiss_Prot_7              Reviewed;         60 AA.\n"
             "AC   Q6GZX4;\n"
-            "DR   GO; GO:0000005; P:regulation of viral transcription;\n"
+            "DR   GO; GO:0000024; P:regulation of viral transcription; IEA:SGD.\n"
             "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
             "     MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
-            "//"
+            "//\n"
+            # Below protein with sequence length greater than 1002 and one GO class whose evidence code (`IC`) is in the valid list
+            f"ID   Swiss_Prot_8              Reviewed;         {len(protein_sq_2 * 25)} AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   GO; GO:0000025; P:regulation of viral transcription; IC:Inferred.\n"
+            f"SQ   SEQUENCE   {len(protein_sq_2 * 25)} AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            f"     {protein_sq_2 * 25}\n"
+            "//\n"
+            # Below protein with sequence string but invalid amino acid `X` in its sequence
+            "ID   Swiss_Prot_9              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
+            "//\n"
+            # Below protein with a `valid` associated GO class but without sequence string 
+            "ID   Swiss_Prot_10              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     \n"  
+            "//\n"
+            # Below protein with a `Invalid` associated GO class but without sequence string 
+            "ID   Swiss_Prot_11              Reviewed;         60 AA.\n"
+            "AC   Q6GZX4;\n"
+            "DR   GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n"
+            "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
+            "     \n"  
+            "//\n"
         )
 
         return raw_str

From 8abd14d7e72ec62efd5aba801c8fe547d04a12ea Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 16:41:20 +0100
Subject: [PATCH 102/112] add test for protein pretraining class

---
 .../testProteinPretrainingData.py             | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 tests/unit/dataset_classes/testProteinPretrainingData.py

diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py
new file mode 100644
index 00000000..d3046fdf
--- /dev/null
+++ b/tests/unit/dataset_classes/testProteinPretrainingData.py
@@ -0,0 +1,71 @@
+import unittest
+from unittest.mock import PropertyMock, mock_open, patch
+from chebai.preprocessing.datasets.protein_pretraining import _ProteinPretrainingData
+from chebai.preprocessing.reader import ProteinDataReader
+from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData
+
+
+class TestProteinPretrainingData(unittest.TestCase):
+    """
+    Unit tests for the _ProteinPretrainingData class.
+    Tests focus on data parsing and validation checks for protein pretraining.
+    """
+
+    @classmethod
+    @patch.multiple(_ProteinPretrainingData, __abstractmethods__=frozenset())
+    @patch.object(_ProteinPretrainingData, "base_dir", new_callable=PropertyMock)
+    @patch.object(_ProteinPretrainingData, "_name", new_callable=PropertyMock)
+    @patch("os.makedirs", return_value=None)
+    def setUpClass(
+        cls,
+        mock_makedirs,
+        mock_name_property: PropertyMock,
+        mock_base_dir_property: PropertyMock,
+    ) -> None:
+        """
+        Class setup for mocking abstract properties of _ProteinPretrainingData.
+
+        Mocks the required abstract properties and sets up the data extractor.
+        """
+        mock_base_dir_property.return_value = "MockedBaseDirPropProteinPretrainingData"
+        mock_name_property.return_value = "MockedNameProp_ProteinPretrainingData"
+
+        # Set the READER class for the pretraining data
+        _ProteinPretrainingData.READER = ProteinDataReader
+
+        # Initialize the extractor instance
+        cls.extractor = _ProteinPretrainingData()
+
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=GOUniProtMockData.get_UniProt_raw_data(),
+    )
+    def test_parse_protein_data_for_pretraining(self, mock_open_file: mock_open) -> None:
+        """
+        Tests the _parse_protein_data_for_pretraining method.
+
+        Verifies that:
+        - The parsed DataFrame contains the expected protein IDs.
+        - The protein sequences are not empty.
+        """
+        # Parse the pretraining data
+        pretrain_df = self.extractor._parse_protein_data_for_pretraining()
+        list_of_pretrain_swiss_ids = GOUniProtMockData.proteins_for_pretraining()
+
+        # Assert that all expected Swiss-Prot IDs are present in the DataFrame
+        self.assertEqual(
+            set(pretrain_df['swiss_id']),
+            set(list_of_pretrain_swiss_ids),
+            msg="The parsed DataFrame does not contain the expected Swiss-Prot IDs for pretraining."
+        )
+
+        # Assert that all sequences are not empty
+        self.assertTrue(
+            pretrain_df['sequence'].str.len().gt(0).all(),
+            msg="Some protein sequences in the pretraining DataFrame are empty."
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From aae57d355b03b96600ded3e41c8e88361855d624 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 16:44:12 +0100
Subject: [PATCH 103/112] test : reformat with precommit

---
 .../dataset_classes/testProteinPretrainingData.py   | 13 ++++++++-----
 tests/unit/mock_data/ontology_mock_data.py          |  8 ++++----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py
index d3046fdf..cb6b0688 100644
--- a/tests/unit/dataset_classes/testProteinPretrainingData.py
+++ b/tests/unit/dataset_classes/testProteinPretrainingData.py
@@ -1,5 +1,6 @@
 import unittest
 from unittest.mock import PropertyMock, mock_open, patch
+
 from chebai.preprocessing.datasets.protein_pretraining import _ProteinPretrainingData
 from chebai.preprocessing.reader import ProteinDataReader
 from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData
@@ -41,7 +42,9 @@ def setUpClass(
         new_callable=mock_open,
         read_data=GOUniProtMockData.get_UniProt_raw_data(),
     )
-    def test_parse_protein_data_for_pretraining(self, mock_open_file: mock_open) -> None:
+    def test_parse_protein_data_for_pretraining(
+        self, mock_open_file: mock_open
+    ) -> None:
         """
         Tests the _parse_protein_data_for_pretraining method.
 
@@ -55,15 +58,15 @@ def test_parse_protein_data_for_pretraining(self, mock_open_file: mock_open) ->
 
         # Assert that all expected Swiss-Prot IDs are present in the DataFrame
         self.assertEqual(
-            set(pretrain_df['swiss_id']),
+            set(pretrain_df["swiss_id"]),
             set(list_of_pretrain_swiss_ids),
-            msg="The parsed DataFrame does not contain the expected Swiss-Prot IDs for pretraining."
+            msg="The parsed DataFrame does not contain the expected Swiss-Prot IDs for pretraining.",
         )
 
         # Assert that all sequences are not empty
         self.assertTrue(
-            pretrain_df['sequence'].str.len().gt(0).all(),
-            msg="Some protein sequences in the pretraining DataFrame are empty."
+            pretrain_df["sequence"].str.len().gt(0).all(),
+            msg="Some protein sequences in the pretraining DataFrame are empty.",
         )
 
 
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py
index 92a070cb..a05b89f1 100644
--- a/tests/unit/mock_data/ontology_mock_data.py
+++ b/tests/unit/mock_data/ontology_mock_data.py
@@ -751,19 +751,19 @@ def get_UniProt_raw_data() -> str:
             "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
             "     XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
             "//\n"
-            # Below protein with a `valid` associated GO class but without sequence string 
+            # Below protein with a `valid` associated GO class but without sequence string
             "ID   Swiss_Prot_10              Reviewed;         60 AA.\n"
             "AC   Q6GZX4;\n"
             "DR   GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n"
             "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            "     \n"  
+            "     \n"
             "//\n"
-            # Below protein with a `Invalid` associated GO class but without sequence string 
+            # Below protein with a `Invalid` associated GO class but without sequence string
             "ID   Swiss_Prot_11              Reviewed;         60 AA.\n"
             "AC   Q6GZX4;\n"
             "DR   GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n"
             "SQ   SEQUENCE   60 AA;  29735 MW;  B4840739BF7D4121 CRC64;\n"
-            "     \n"  
+            "     \n"
             "//\n"
         )
 

From 22e864fae9814cc68c0ac2907f768cf773047d04 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 2 Nov 2024 16:53:12 +0100
Subject: [PATCH 104/112] reformat with precommit

---
 tutorials/data_exploration_chebi.ipynb |  6 +++---
 tutorials/data_exploration_go.ipynb    | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb
index 040faf69..81256f4a 100644
--- a/tutorials/data_exploration_chebi.ipynb
+++ b/tutorials/data_exploration_chebi.ipynb
@@ -890,12 +890,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# You can specify a literal path for the `splits_file_path`, or if another `chebi_class` instance is already defined, \n",
+    "# You can specify a literal path for the `splits_file_path`, or if another `chebi_class` instance is already defined,\n",
     "# you can use its existing `splits_file_path` attribute for consistency.\n",
     "chebi_class_with_splits = ChEBIOver50(\n",
-    "    chebi_version=231, \n",
+    "    chebi_version=231,\n",
     "    # splits_file_path=\"data/chebi_v231/ChEBI50/processed/splits.csv\",  # Literal path option\n",
-    "    splits_file_path=chebi_class.splits_file_path  # Use path from an existing `chebi_class` instance\n",
+    "    splits_file_path=chebi_class.splits_file_path,  # Use path from an existing `chebi_class` instance\n",
     ")"
    ]
   },
diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb
index aeffe7b9..6f67c82b 100644
--- a/tutorials/data_exploration_go.ipynb
+++ b/tutorials/data_exploration_go.ipynb
@@ -164,14 +164,14 @@
      "text": [
       "Checking for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\n",
       "Missing processed data file (`data.pkl` file)\n",
-	  "Downloading Swiss UniProt data....\n",
-	  "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
-	  "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
-	  "Unzipping the file....\n",
-	  "Unpacked and saved to data\\GO_UniProt\\raw\\uniprot_sprot.dat\n",
-	  "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
-	  "Missing Gene Ontology raw data\n",
-	  "Downloading Gene Ontology data....\n",
+      "Downloading Swiss UniProt data....\n",
+      "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
+      "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
+      "Unzipping the file....\n",
+      "Unpacked and saved to data\\GO_UniProt\\raw\\uniprot_sprot.dat\n",
+      "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n",
+      "Missing Gene Ontology raw data\n",
+      "Downloading Gene Ontology data....\n",
       "Extracting class hierarchy...\n",
       "Compute transitive closure\n",
       "Processing graph\n",
@@ -1098,12 +1098,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# You can specify a literal path for the `splits_file_path`, or if another `go_class` instance is already defined, \n",
+    "# You can specify a literal path for the `splits_file_path`, or if another `go_class` instance is already defined,\n",
     "# you can use its existing `splits_file_path` attribute for consistency.\n",
     "go_class_with_splits = GOUniProtOver250(\n",
-    "    go_branch=\"BP\", \n",
+    "    go_branch=\"BP\",\n",
     "    # splits_file_path=\"data/GO_UniProt/GO250_BP_1002/processed/splits.csv\",  # Literal path option\n",
-    "    splits_file_path=go_class.splits_file_path  # Use path from an existing `go_class` instance\n",
+    "    splits_file_path=go_class.splits_file_path,  # Use path from an existing `go_class` instance\n",
     ")"
    ]
   },

From 98e3f9cf5b40624c2b5cb2f645608ed350b9ccd9 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 13:29:39 +0100
Subject: [PATCH 105/112] handle predictions and labels that are tuples (e.g.
 atom and bond predictions for gnn pretraining) in evaluation

---
 chebai/result/utils.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/chebai/result/utils.py b/chebai/result/utils.py
index d015bd80..80bf56e2 100644
--- a/chebai/result/utils.py
+++ b/chebai/result/utils.py
@@ -66,6 +66,13 @@ def _run_batch(batch, model, collate):
     return preds, labels
 
 
+def _concat_tuple(l):
+    """Concatenate batch outputs; tuples of tensors are concatenated per component."""
+    if isinstance(l[0], tuple):
+        return tuple([torch.cat([t[i] for t in l]) for i in range(len(l[0]))])
+    return torch.cat(l)
+
+
 def evaluate_model(
     model: ChebaiBaseNet,
     data_module: XYBaseDataModule,
@@ -125,12 +132,12 @@ def evaluate_model(
             if buffer_dir is not None:
                 if n_saved * batch_size >= save_batch_size:
                     torch.save(
-                        torch.cat(preds_list),
+                        _concat_tuple(preds_list),
                         os.path.join(buffer_dir, f"preds{save_ind:03d}.pt"),
                     )
                     if labels_list[0] is not None:
                         torch.save(
-                            torch.cat(labels_list),
+                            _concat_tuple(labels_list),
                             os.path.join(buffer_dir, f"labels{save_ind:03d}.pt"),
                         )
                     preds_list = []
@@ -141,20 +148,20 @@ def evaluate_model(
         n_saved += 1
 
     if buffer_dir is None:
-        test_preds = torch.cat(preds_list)
+        test_preds = _concat_tuple(preds_list)
         if labels_list is not None:
-            test_labels = torch.cat(labels_list)
+            test_labels = _concat_tuple(labels_list)
 
             return test_preds, test_labels
         return test_preds, None
     else:
         torch.save(
-            torch.cat(preds_list),
+            _concat_tuple(preds_list),
             os.path.join(buffer_dir, f"preds{save_ind:03d}.pt"),
         )
         if labels_list[0] is not None:
             torch.save(
-                torch.cat(labels_list),
+                _concat_tuple(labels_list),
                 os.path.join(buffer_dir, f"labels{save_ind:03d}.pt"),
             )
 

From 16013af7f6f021f300c0c4ebdd9596c5fb36a5b6 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 14:48:18 +0100
Subject: [PATCH 106/112] add processed_main file names

---
 chebai/preprocessing/datasets/base.py  | 65 ++++++++++++++------------
 chebai/preprocessing/datasets/chebi.py |  8 ++--
 2 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
index f163a9e6..bb0f50d2 100644
--- a/chebai/preprocessing/datasets/base.py
+++ b/chebai/preprocessing/datasets/base.py
@@ -394,45 +394,61 @@ def setup_processed(self):
         raise NotImplementedError
 
     @property
-    def processed_file_names(self) -> List[str]:
+    def processed_dir_main_file_names_dict(self) -> dict:
         """
-        Returns the list of processed file names.
-
-        This property should be implemented by subclasses to provide the list of processed file names.
+        Returns a dictionary mapping processed data file names.
 
         Returns:
-            List[str]: The list of processed file names.
+            dict: A dictionary mapping dataset keys to their respective file names.
+                  For example, {"data": "data.pkl"}.
         """
         raise NotImplementedError
 
     @property
-    def raw_file_names(self) -> List[str]:
+    def processed_dir_main_file_names(self) -> List[str]:
         """
-        Returns the list of raw file names.
+        Returns a list of file names for processed data (before tokenization).
+
+        Returns:
+            List[str]: A list of file names corresponding to the processed data.
+        """
+        return list(self.processed_dir_main_file_names_dict.values())
 
-        This property should be implemented by subclasses to provide the list of raw file names.
+    @property
+    def processed_file_names_dict(self) -> dict:
+        """
+        Returns a dictionary for the processed and tokenized data files.
 
         Returns:
-            List[str]: The list of raw file names.
+            dict: A dictionary mapping dataset keys to their respective file names.
+                  For example, {"data": "data.pt"}.
         """
         raise NotImplementedError
 
     @property
-    def processed_file_names_dict(self) -> dict:
+    def processed_file_names(self) -> List[str]:
+        """
+        Returns a list of file names for processed data.
+
+        Returns:
+            List[str]: A list of file names corresponding to the processed data.
         """
-        Returns the dictionary of processed file names.
+        return list(self.processed_file_names_dict.values())
 
-        This property should be implemented by subclasses to provide the dictionary of processed file names.
+    @property
+    def raw_file_names(self) -> List[str]:
+        """
+        Returns the list of raw file names.
 
         Returns:
-            dict: The dictionary of processed file names.
+            List[str]: The list of raw file names.
         """
-        raise NotImplementedError
+        return list(self.raw_file_names_dict.values())
 
     @property
     def raw_file_names_dict(self) -> dict:
         """
-        Returns the dictionary of raw file names.
+        Returns the dictionary of raw file names (i.e., files that are directly obtained from an external source).
 
         This property should be implemented by subclasses to provide the dictionary of raw file names.
 
@@ -1133,10 +1149,10 @@ def processed_dir(self) -> str:
     @property
     def processed_dir_main_file_names_dict(self) -> dict:
         """
-        Returns a dictionary mapping processed data file names, processed by `prepare_data` method.
+        Returns a dictionary mapping processed data file names.
 
         Returns:
-            dict: A dictionary mapping dataset types to their respective processed file names.
+            dict: A dictionary mapping dataset keys to their respective file names.
                   For example, {"data": "data.pkl"}.
         """
         return {"data": "data.pkl"}
@@ -1144,21 +1160,10 @@ def processed_dir_main_file_names_dict(self) -> dict:
     @property
     def processed_file_names_dict(self) -> dict:
         """
-        Returns a dictionary mapping processed and transformed data file names to their final formats, which are
-        processed by `setup` method.
+        Returns a dictionary for the processed and tokenized data files.
 
         Returns:
-            dict: A dictionary mapping dataset types to their respective final file names.
+            dict: A dictionary mapping dataset keys to their respective file names.
                   For example, {"data": "data.pt"}.
         """
         return {"data": "data.pt"}
-
-    @property
-    def processed_file_names(self) -> List[str]:
-        """
-        Returns a list of file names for processed data.
-
-        Returns:
-            List[str]: A list of file names corresponding to the processed data.
-        """
-        return list(self.processed_file_names_dict.values())
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 9d80929a..1b49d0e2 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -216,9 +216,7 @@ def _load_chebi(self, version: int) -> str:
         Returns:
             str: The file path of the loaded ChEBI ontology.
         """
-        chebi_name = (
-            f"chebi.obo" if version == self.chebi_version else f"chebi_v{version}.obo"
-        )
+        chebi_name = self.raw_file_names_dict["chebi"]
         chebi_path = os.path.join(self.raw_dir, chebi_name)
         if not os.path.isfile(chebi_path):
             print(
@@ -540,6 +538,10 @@ def processed_dir(self) -> str:
         else:
             return os.path.join(res, f"single_{self.single_class}")
 
+    @property
+    def raw_file_names_dict(self) -> dict:
+        return {"chebi": "chebi.obo"}  # "chebi" is the key `_load_chebi` looks up
+
 
 class JCIExtendedBase(_ChEBIDataExtractor):
 

From 9731f897466071f742ee06ad59444df5cb18b833 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 15:00:43 +0100
Subject: [PATCH 107/112] shorten processed_dir_main_file_names to
 processed_main_file_names

---
 chebai/preprocessing/datasets/base.py                | 12 ++++++------
 chebai/preprocessing/datasets/chebi.py               |  2 +-
 chebai/preprocessing/datasets/protein_pretraining.py |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
index bb0f50d2..73c2b2cd 100644
--- a/chebai/preprocessing/datasets/base.py
+++ b/chebai/preprocessing/datasets/base.py
@@ -394,7 +394,7 @@ def setup_processed(self):
         raise NotImplementedError
 
     @property
-    def processed_dir_main_file_names_dict(self) -> dict:
+    def processed_main_file_names_dict(self) -> dict:
         """
         Returns a dictionary mapping processed data file names.
 
@@ -405,14 +405,14 @@ def processed_dir_main_file_names_dict(self) -> dict:
         raise NotImplementedError
 
     @property
-    def processed_dir_main_file_names(self) -> List[str]:
+    def processed_main_file_names(self) -> List[str]:
         """
         Returns a list of file names for processed data (before tokenization).
 
         Returns:
             List[str]: A list of file names corresponding to the processed data.
         """
-        return list(self.processed_dir_main_file_names_dict.values())
+        return list(self.processed_main_file_names_dict.values())
 
     @property
     def processed_file_names_dict(self) -> dict:
@@ -721,7 +721,7 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None:
         """
         print("Checking for processed data in", self.processed_dir_main)
 
-        processed_name = self.processed_dir_main_file_names_dict["data"]
+        processed_name = self.processed_main_file_names_dict["data"]
         if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)):
             print("Missing processed data file (`data.pkl` file)")
             os.makedirs(self.processed_dir_main, exist_ok=True)
@@ -812,7 +812,7 @@ def setup_processed(self) -> None:
             self._load_data_from_file(
                 os.path.join(
                     self.processed_dir_main,
-                    self.processed_dir_main_file_names_dict["data"],
+                    self.processed_main_file_names_dict["data"],
                 )
             ),
             os.path.join(self.processed_dir, self.processed_file_names_dict["data"]),
@@ -1147,7 +1147,7 @@ def processed_dir(self) -> str:
         )
 
     @property
-    def processed_dir_main_file_names_dict(self) -> dict:
+    def processed_main_file_names_dict(self) -> dict:
         """
         Returns a dictionary mapping processed data file names.
 
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 1b49d0e2..d927a44c 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -185,7 +185,7 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None:
             if not os.path.isfile(
                 os.path.join(
                     self._chebi_version_train_obj.processed_dir_main,
-                    self._chebi_version_train_obj.processed_dir_main_file_names_dict[
+                    self._chebi_version_train_obj.processed_main_file_names_dict[
                         "data"
                     ],
                 )
diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py
index 8550db2b..6b5d1df0 100644
--- a/chebai/preprocessing/datasets/protein_pretraining.py
+++ b/chebai/preprocessing/datasets/protein_pretraining.py
@@ -64,7 +64,7 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None:
             *args: Additional positional arguments.
             **kwargs: Additional keyword arguments.
         """
-        processed_name = self.processed_dir_main_file_names_dict["data"]
+        processed_name = self.processed_main_file_names_dict["data"]
         if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)):
             print("Missing processed data file (`data.pkl` file)")
             os.makedirs(self.processed_dir_main, exist_ok=True)

From 32a8052e1479d8dcdbb57fbe6aca3db3cecae36a Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 15:09:29 +0100
Subject: [PATCH 108/112] run unittests only for tests/unit folder

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a75533f8..4ef35725 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,4 +24,4 @@ jobs:
           python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
           python -m pip install -e .
       - name: Display Python version
-        run: python -m unittest
\ No newline at end of file
+        run: python -m unittest discover -s tests/unit

From 14741d3f18c86e874e454737a372e20d3fc4d407 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 15:15:32 +0100
Subject: [PATCH 109/112] remove T_co typehint (T_co does not exist for newer
 torch versions)

---
 chebai/preprocessing/structures.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/chebai/preprocessing/structures.py b/chebai/preprocessing/structures.py
index 1e384598..1fb3711a 100644
--- a/chebai/preprocessing/structures.py
+++ b/chebai/preprocessing/structures.py
@@ -2,7 +2,6 @@
 
 import networkx as nx
 import torch
-from torch.utils.data.dataset import T_co
 
 
 class XYData(torch.utils.data.Dataset):
@@ -23,7 +22,7 @@ def __init__(
         self.x = x
         self.y = y
 
-    def __getitem__(self, index: int) -> T_co:
+    def __getitem__(self, index: int):
         """Returns the data and target at the given index."""
         return self.x[index], self.y[index]
 

From fdabd961d2eb7c3250ed3d45654e45fc9cc0dff8 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 15:37:26 +0100
Subject: [PATCH 110/112] remove pypy version

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4ef35725..b0792c99 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11"]
 
     steps:
       - uses: actions/checkout@v4

From ccc5aea6f89b64993e9b9a1eec4dc978668612a2 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 17:44:56 +0100
Subject: [PATCH 111/112] move processed_dir_main to XYBaseDataModule

---
 chebai/preprocessing/datasets/base.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
index 73c2b2cd..dfa0f999 100644
--- a/chebai/preprocessing/datasets/base.py
+++ b/chebai/preprocessing/datasets/base.py
@@ -134,10 +134,15 @@ def base_dir(self) -> str:
             return self._base_dir
         return os.path.join("data", self._name)
 
+    @property
+    def processed_dir_main(self) -> str:
+        """Name of the directory where processed (but not tokenized) data is stored."""
+        return os.path.join(self.base_dir, "processed")
+
     @property
     def processed_dir(self) -> str:
-        """Name of the directory where the processed data is stored."""
-        return os.path.join(self.base_dir, "processed", *self.identifier)
+        """Name of the directory where the processed and tokenized data is stored."""
+        return os.path.join(self.processed_dir_main, *self.identifier)
 
     @property
     def raw_dir(self) -> str:
@@ -1133,19 +1138,6 @@ def processed_dir_main(self) -> str:
             "processed",
         )
 
-    @property
-    def processed_dir(self) -> str:
-        """
-        Returns the specific directory path for processed data, including identifiers.
-
-        Returns:
-            str: The path to the processed data directory, including additional identifiers.
-        """
-        return os.path.join(
-            self.processed_dir_main,
-            *self.identifier,
-        )
-
     @property
     def processed_main_file_names_dict(self) -> dict:
         """

From dfea71ee12dcdb687e23560ef98178881e20be91 Mon Sep 17 00:00:00 2001
From: sfluegel <sfluegel@ovgu.de>
Date: Wed, 6 Nov 2024 17:45:30 +0100
Subject: [PATCH 112/112] use processed-main instead of raw file for BCE
 weights

---
 chebai/loss/bce_weighted.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/chebai/loss/bce_weighted.py b/chebai/loss/bce_weighted.py
index b69fff43..c00756e6 100644
--- a/chebai/loss/bce_weighted.py
+++ b/chebai/loss/bce_weighted.py
@@ -43,8 +43,10 @@ def set_pos_weight(self, input: torch.Tensor) -> None:
             self.beta is not None
             and self.data_extractor is not None
             and all(
-                os.path.exists(os.path.join(self.data_extractor.raw_dir, raw_file))
-                for raw_file in self.data_extractor.raw_file_names
+                os.path.exists(
+                    os.path.join(self.data_extractor.processed_dir_main, file_name)
+                )
+                for file_name in self.data_extractor.processed_main_file_names
             )
             and self.pos_weight is None
         ):
@@ -53,13 +55,13 @@ def set_pos_weight(self, input: torch.Tensor) -> None:
                     pd.read_pickle(
                         open(
                             os.path.join(
-                                self.data_extractor.raw_dir,
-                                raw_file_name,
+                                self.data_extractor.processed_dir_main,
+                                file_name,
                             ),
                             "rb",
                         )
                     )
-                    for raw_file_name in self.data_extractor.raw_file_names
+                    for file_name in self.data_extractor.processed_main_file_names
                 ]
             )
             value_counts = []