From d983fcfe411d8c38f148c5542b21ce57daf39d48 Mon Sep 17 00:00:00 2001 From: Martin Glauer Date: Thu, 8 Aug 2024 10:46:17 +0200 Subject: [PATCH 001/112] Add actions for unittests --- .github/workflows/test.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..a687fdda --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,20 @@ +name: Unittests + +on: [pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Display Python version + run: python -m unittest \ No newline at end of file From 491428d6a445265fad45450c8212b8af9b3a3285 Mon Sep 17 00:00:00 2001 From: Martin Glauer Date: Thu, 8 Aug 2024 10:52:59 +0200 Subject: [PATCH 002/112] Add dependencies to test scipt --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a687fdda..28d9b4cb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,5 +16,10 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade pip setuptools wheel + pip install -e . 
- name: Display Python version run: python -m unittest \ No newline at end of file From 04615d91769002ca9e6b092cfe3699a293909d0c Mon Sep 17 00:00:00 2001 From: Martin Glauer Date: Thu, 8 Aug 2024 11:01:08 +0200 Subject: [PATCH 003/112] Install cpu-based version of torch --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 28d9b4cb..f2143ff2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,6 +20,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install --upgrade pip setuptools wheel - pip install -e . + python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + python -m pip install -e . - name: Display Python version run: python -m unittest \ No newline at end of file From 605064425dd220c8f19627c2386663b4665fd015 Mon Sep 17 00:00:00 2001 From: Martin Glauer Date: Thu, 8 Aug 2024 11:17:13 +0200 Subject: [PATCH 004/112] Disable fail-fast --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f2143ff2..a75533f8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,6 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11"] From 6bb1a85ae3503e39b379ccddaa6e0e6358d514da Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 26 Aug 2024 19:45:34 +0200 Subject: [PATCH 005/112] Create data_exploration.ipynb --- data_exploration.ipynb | 637 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 637 insertions(+) create mode 100644 data_exploration.ipynb diff --git a/data_exploration.ipynb b/data_exploration.ipynb new file mode 100644 index 00000000..6f1045a4 --- /dev/null +++ b/data_exploration.ipynb @@ -0,0 +1,637 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 16, + 
"id": "81559360-c8b8-462d-bfa1-6ae22bed1615", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "# Ignore all warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "33275d3c-cdbf-4c1f-aa04-f135511f3643", + "metadata": {}, + "source": [ + "# 1. Instantiation of a Data Class\r\n", + "\r\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "### Inheritance Hierarchy\n", + "\n", + "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + ".\r\n", + "\r\n", + "### Explanation\r\n", + "a ChEBI data classiData` class can be configured with the following main parameters:\r\n", + "\r\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\r\n", + "\r\n", + "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\r\n", + "\r\n", + "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\r\n", + "\r\n", + "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\r\n", + "\r\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\r\n", + "\r\n", + "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", + "\r\n", + "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\r\n", + "\r\n", + "### Additional Input Parameters\r\n", + "\r\n", + "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\r\n", + "\r\n", + "- **batch_size (int)**: The batch size for data loading. Default is `1`.\r\n", + "\r\n", + "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\r\n", + "\r\n", + "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\r\n", + "\r\n", + "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\r\n", + "\r\n", + "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\r\n", + "\r\n", + "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\r\n", + "\r\n", + "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\r\n", + "\r\n", + "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\r\n", + "\r\n", + "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\r\n", + "\r\n", + "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. 
Default is `None`.\r\n", + "\r\n", + "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\r\n", + "\r\n", + "- **kwargs**: Additional keyword arguments.\r\n", + "\r\n", + "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\r\n", + "ining and validation.\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", + "metadata": {}, + "source": [ + "# Available ChEBI Data Classes\n", + "\n", + "## `ChEBIOver100`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver50`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver100DeepSMILES`\n", + "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver100SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver50SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", + "\n", + "## `ChEBIOver50Partial`\n", + "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 18, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, + "outputs": [], + "source": [ + "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": {}, + "outputs": [], + "source": [ + "chebi_class = ChEBIOver50(chebi_version=231)" + ] + }, + { + "cell_type": "markdown", + "id": "8456b545-88c5-401d-baa5-47e8ae710f04", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "1655d489-25fe-46de-9feb-eeca5d36936f", + "metadata": {}, + "source": [ + "# 2. Preparation / Setup Methods\r\n", + "\r\n", + "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\r\n", + "\r\n", + "### Why is Preparation Needed?\r\n", + "\r\n", + "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\r\n", + "- **Data Integrity**: It ensures that the data files are up-to-date and compatible with the specified ChEBI version.\r\n", + "\r\n", + "### Main Methods for Data Preprocessing\r\n", + "\r\n", + "The data preprocessing in a data class involves two main methods:\r\n", + "\r\n", + "1. **`prepare_data` Method**:\r\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\r\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\r\n", + "\r\n", + "2. 
**`setup` Method**:\r\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\r\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\r\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\r\n", + "\r\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes.\r\n", + "alidation processes.\r\n", + "processed(data_df, processed_name)\r\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", + "Cross-validation enabled: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", + "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", + "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" + ] + } + ], + "source": [ + "chebi_class.prepare_data()\n", + "chebi_class.setup()" + ] + }, + { + "cell_type": "markdown", + "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", + "metadata": {}, + 
"source": [ + "# 3. Different Data Files Created and their Structure\n", + "\r\n", + "\r\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\r\n", + "\r\n", + "### Data Files\r\n", + "\r\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file)\r\n", + " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", + "\r\n", + "2. **`data.pkl`**\r\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", + "\r\n", + "3. **`data.pt`**\r\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", + "\r\n", + "4. **`classes.txt`**\r\n", + " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\r\n", + "\r\n", + "5. **`splits.csv`**\r\n", + " - **Description**: Contains saved data splits from previous runs. 
During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\r\n", + "\r\n", + "### File Structure and Preprocessing Stages\r\n", + "\r\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\r\n", + "\r\n", + "1. **Raw Data Stage**:\r\n", + " - **File**: `chebi.obo`\r\n", + " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", + "\r\n", + "2. **Processed Data Stage 1**:\r\n", + " - **File**: `data.pkl`\r\n", + " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", + " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\r\n", + "\r\n", + "3. **Processed Data Stage 2**:\r\n", + " - **File**: `data.pt`\r\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. 
This stage also references data splits when available.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\r\n", + "\r\n", + "### Data Splits\r\n", + "\r\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\r\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\r\n", + "\r\n", + "### Summary of File Paths\r\n", + "\r\n", + "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\r\n", + "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\r\n", + "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\r\n", + "\r\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\r\n", + "that each step is well-documented and reproducible.\r\n", + "sing, from raw input to model-ready formats.\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "74adb549-9e02-472d-a535-78a584853b52", + "metadata": {}, + "source": [ + "# 4. 
Information Stored in the Files\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "metadata": {}, + "source": [ + "\n", + "## data.pkl\n", + "\n", + "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n", + "\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` as following structure: \n", + "- **Column 0**: Contains the ID of each ChEBI data instance.\n", + "- **Column 1**: Contains the name of each ChEBI data instance.\n", + "- **Column 2**: Contains the SMILES representation of the chemical.\n", + "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (129184, 1335)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameSMILES1722246825712580263430983992...143017143212143813146180147334156473166828166904167497167559
033429monoatomic monoanion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
130151aluminide(1-)[Al-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
216042halide anion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
317051fluoride[F-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
428741sodium fluoride[F-].[Na+]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1335 columns

\n", + "
" + ], + "text/plain": [ + " id name SMILES 1722 2468 2571 2580 2634 \\\n", + "0 33429 monoatomic monoanion [*-] False False False False False \n", + "1 30151 aluminide(1-) [Al-] False False False False False \n", + "2 16042 halide anion [*-] False False False False False \n", + "3 17051 fluoride [F-] False False False False False \n", + "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", + "\n", + " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", + "0 False False ... False False False False False False False \n", + "1 False False ... False False False False False False False \n", + "2 False False ... False False False False False False False \n", + "3 False False ... False False False False False False False \n", + "4 False False ... False False False False False False False \n", + "\n", + " 166904 167497 167559 \n", + "0 False False False \n", + "1 False False False \n", + "2 False False False \n", + "3 False False False \n", + "4 False False False \n", + "\n", + "[5 rows x 1335 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "metadata": {}, + "source": [ + "# 6. Example Molecule: Different Encodings\n", + "\n", + "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. 
Let's take an example molecule and explore its different encodings.\n", + "\n", + "### Explanation:\n", + "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", + "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "5b0f7974-f262-429c-b064-4207277e22ad", + "metadata": {}, + "source": [ + "# 7. Additional Useful Features\n", + "\n", + "- **Substructure Search**: `chebai` allows you to perform substructure searches within the ChEBI database.\n", + "- **Property Filters**: You can filter molecules based on specific properties, such as molecular weight or charge.\n", + "- **Visualization**: `chebai` provides tools for visualizing molecular structures directly within the notebook.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "314801c7-9a1c-4247-9809-497f8481ac90", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "This notebook provided an introduction to the `chebai` package, focusing on how data is structured and utilized. With this knowledge, you can start exploring chemical data more effectively using `chebai`." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (env_chebai)", + "language": "python", + "name": "env_chebai" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 830184f6886a42f293c2ff702c0509aff29ca9cb Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 00:04:40 +0200 Subject: [PATCH 006/112] added information stored in files --- data_exploration.ipynb | 289 +++++++++++++++++++++++++++++++++++------ 1 file changed, 251 insertions(+), 38 deletions(-) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index 6f1045a4..c4d60ab2 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -1,18 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 16, - "id": "81559360-c8b8-462d-bfa1-6ae22bed1615", - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "# Ignore all warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, { "cell_type": "markdown", "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", @@ -314,13 +301,51 @@ ] }, { - "cell_type": "code", - "execution_count": 49, - "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "cell_type": "markdown", + "id": "43329709-5134-4ce5-88e7-edd2176bf84d", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd" + "## chebi.obo\n", + "\n", + "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. 
This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", + "\n", + "### Structure of `chebi.obo`\n", + "\n", + "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "\n", + "#### Example of a Term Document\n", + "\n", + "```plaintext\n", + "[Term]\n", + "id: CHEBI:24867\n", + "name: monoatomic ion\n", + "subset: 3_STAR\n", + "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", + "is_a: CHEBI:24870\n", + "is_a: CHEBI:33238\n", + "```0\r\n", + "is_a: CHEBI:3323Relevant 8\r\n", + "```\r\n", + "\r\n", + "### Breakdown of Attributes\r\n", + "\r\n", + "Each term document in the `chebi.obo` file consists of the following key attributes:\r\n", + "\r\n", + "- **`[Term]`**: \r\n", + " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\r\n", + "\r\n", + "- **`id: CHEBI:24867`**: \r\n", + " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\r\n", + " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\r\n", + "\r\n", + "- **`name: monoatomic ion`**: \r\n", + " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\r\n", + " - **Example**: \"monoatomic ion\" is the namcating a related term within the ChEBI ontology.\r\n", + "\r\n", + "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \r\n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. 
The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\r\n", + " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaent stages of preprocessing, from raw input files to processed, model-ready formats.\r\n", + "```" ] }, { @@ -345,6 +370,16 @@ "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" ] }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, { "cell_type": "code", "execution_count": 53, @@ -566,50 +601,228 @@ }, { "cell_type": "markdown", - "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", "metadata": {}, "source": [ - "---" + "## `data.pt` File\n", + "\n", + "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", + "\n", + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. 
These identifiers help track and reference the individual samples in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", + "metadata": {}, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n" + ] + } + ], + "source": [ + "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n", + "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n", + "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 16042, 'group': None}\n", + "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n", + "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n" + ] + } + ], + "source": [ + "for i in range(5):\n", + " print(data_pt[i])" ] }, { "cell_type": "markdown", - "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "id": "861da1c3-0401-49f0-a22f-109814ed95d5", "metadata": {}, "source": [ - "# 6. Example Molecule: Different Encodings\n", + "## `classes.txt` File\n", "\n", - "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. 
Let's take an example molecule and explore its different encodings.\n", + "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", "\n", - "### Explanation:\n", - "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", - "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", - "\n", - "---" + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1722\n", + "2468\n", + "2571\n", + "2580\n", + "2634\n" + ] + } + ], + "source": [ + "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", + " for i in range(5):\n", + " line = file.readline()\n", + " print(line.strip())" ] }, { "cell_type": "markdown", - "id": "5b0f7974-f262-429c-b064-4207277e22ad", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "## `splits.csv`\r\n", + "\r\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
033429train
130151train
217051train
332129train
430340train
\n", + "
" + ], + "text/plain": [ + " id split\n", + "0 33429 train\n", + "1 30151 train\n", + "2 17051 train\n", + "3 32129 train\n", + "4 30340 train" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", + "csv_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", "metadata": {}, "source": [ - "# 7. Additional Useful Features\n", - "\n", - "- **Substructure Search**: `chebai` allows you to perform substructure searches within the ChEBI database.\n", - "- **Property Filters**: You can filter molecules based on specific properties, such as molecular weight or charge.\n", - "- **Visualization**: `chebai` provides tools for visualizing molecular structures directly within the notebook.\n", - "\n", "---" ] }, { "cell_type": "markdown", - "id": "314801c7-9a1c-4247-9809-497f8481ac90", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", "metadata": {}, "source": [ - "# Conclusion\n", + "# 6. Example Molecule: Different Encodings\n", + "\n", + "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n", "\n", - "This notebook provided an introduction to the `chebai` package, focusing on how data is structured and utilized. With this knowledge, you can start exploring chemical data more effectively using `chebai`." 
+ "### Explanation:\n", + "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", + "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", + "\n", + "---" ] } ], From 7005a69c420b95cfe4e0ad4a23414ccc90858199 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 00:29:31 +0200 Subject: [PATCH 007/112] Molecule: Different Encodings --- data_exploration.ipynb | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index c4d60ab2..e36fc1fe 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -353,8 +353,7 @@ "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", "metadata": {}, "source": [ - "\n", - "## data.pkl\n", + "## `data.pkl` File\n", "\n", "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n", "\n", @@ -716,7 +715,7 @@ "id": "b058714f-e434-4367-89b9-74c129ac727f", "metadata": {}, "source": [ - "## `splits.csv`\r\n", + "## `splits.csv` File\r\n", "\r\n", "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n" ] @@ -814,7 +813,7 @@ "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", "metadata": {}, "source": [ - "# 6. Example Molecule: Different Encodings\n", + "# 5. Example Molecule: Different Encodings\n", "\n", "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. 
Let's take an example molecule and explore its different encodings.\n", "\n", @@ -822,7 +821,40 @@ "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", "\n", - "---" + "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\r\n", + "\r\n", + "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\r\n", + " - **Benzene SMILES**: `c1ccccc1`\r\n", + " - **Explanation**: \r\n", + " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\r\n", + "\r\n", + "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\r\n", + " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C][Ring1][=Branch1]`\r\n", + " - **Explanation**: \r\n", + " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\r\n", + " - SELFIES encodes the alternating single and double bonds of benzene's kekulized form; the trailing `[Ring1][=Branch1]` tokens close the six-membered ring.\r\n", + "\r\n", + "### 3. **InChI (IUPAC International Chemical Identifier)**\r\n", + " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\r\n", + " - **Explanation**: \r\n", + " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\r\n", + "\r\n", + "### 4. **InChIKey**\r\n", + " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\r\n", + " - **Explanation**: \r\n", + " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\r\n", + "\r\n", + "### 5. 
**Canonical SMILES**\r\n", + " - **Benzene Canonical SMILES**: `c1ccccc1`\r\n", + " - **Explanation**:\r\n", + " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\r\n", + "\r\n", + "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\r\n", + " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\r\n", + " - **Explanation**: \r\n", + " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\r\n", + "\r\n", + "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." ] } ], From 13aa945938079e265aa28947e9509a5484d03a2d Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 11:24:05 +0200 Subject: [PATCH 008/112] add info related to protein dataset --- data_exploration.ipynb | 418 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 418 insertions(+) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index e36fc1fe..b0c9e78f 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -856,6 +856,424 @@ "\r\n", "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." 
] + }, + { + "cell_type": "markdown", + "id": "93e328cf-09f9-4694-b175-28320590937d", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde", + "metadata": {}, + "source": [ + "# Information for Protein Dataset\r\n", + "\r\n", + "The protein dataset follows a file structure, class inheritance hierarchy, and methods similar to those described for the ChEBI dataset.\r\n", + "\r\n", + "### Configuration Parameters\r\n", + "\r\n", + "Data classes related to proteins can be configured using the following main parameters:\r\n", + "\r\n", + "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\r\n", + "\r\n", + "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\r\n", + "\r\n", + "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\r\n", + "\r\n", + "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", + "\r\n", + "### Available GOUniProt Data Classes\r\n", + "\r\n", + "#### `GOUniProtOver250`\r\n", + "\r\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\r\n", + "\r\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", + "\r\n", + "#### `GOUniProtOver50`\r\n", + "\r\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\r\n", + "\r\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", + "\r\n", + "### Instantiation Example\r\n", + "\r\n", + "```python\r\n", + "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\r\n", + "go_class = GOUniProtOver250()\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "2ffca830-bc0b-421c-8054-0860c95c10f2", + "metadata": {}, + "source": [ + "## GOUniProt Data File Structure\r\n", + "\r\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\r\n", + " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\r\n", + " - **File Paths**:\r\n", + " - `data/GO_UniProt/raw/${filename}.obo`\r\n", + " - `data/GO_UniProt/raw/${filename}.dat`\r\n", + "\r\n", + "2. **`data.pkl`**\r\n", + " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (amino acid sequences), and class columns with boolean values.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\r\n", + "\r\n", + "3. 
**`data.pt`**\r\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\r\n", + "\r\n", + "4. **`classes.txt`**\r\n", + " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\r\n", + "\r\n", + "5. **`splits.csv`**\r\n", + " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\r\n", + "\r\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\r\n", + "}/processed/splits.csv`\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "61bc261e-2328-4968-aca6-14c48bb24348", + "metadata": {}, + "source": [ + "## data.pkl" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "31df4ee7-4c03-4ea2-9798-5e5082a74c2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (27459, 1050)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
swiss_idaccessiongo_idssequence4175122165209226...2000145200014620001472000241200024320003772001020200114120012332001234
814331_ARATHP42643,Q945M2,Q9M0S7[19222]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1014331_MAIZEP49106[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1314332_MAIZEQ01526[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1414333_ARATHP42644,F4KBI7,Q945L2[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1050 columns

\n", + "
" + ], + "text/plain": [ + " swiss_id accession \\\n", + "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", + "9 14331_CAEEL P41932,Q21537 \n", + "10 14331_MAIZE P49106 \n", + "13 14332_MAIZE Q01526 \n", + "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", + "\n", + " go_ids \\\n", + "8 [19222] \n", + "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", + "10 [3677, 5634, 10468, 44877] \n", + "13 [3677, 5634, 10468, 44877] \n", + "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", + "\n", + " sequence 41 75 122 \\\n", + "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", + "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", + "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", + "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", + "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", + "\n", + " 165 209 226 ... 2000145 2000146 2000147 2000241 2000243 \\\n", + "8 False False False ... False False False False False \n", + "9 False False False ... False False False False False \n", + "10 False False False ... False False False False False \n", + "13 False False False ... False False False False False \n", + "14 False False False ... 
False False False False False \n", + "\n", + " 2000377 2001020 2001141 2001233 2001234 \n", + "8 False False False False False \n", + "9 False False False False False \n", + "10 False False False False False \n", + "13 False False False False False \n", + "14 False False False False False \n", + "\n", + "[5 rows x 1050 columns]" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "be0078fd-bcf1-4d4c-b8c6-c84e3aeac99c", + "metadata": {}, + "source": [ + "## data.pt" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "a70f9c35-daca-4728-a9ea-b1212866f421", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n", + "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 
14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n" + ] + } + ], + "source": [ + "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))\n", + "for i in range(1):\n", + " print(data_pt[i])" + ] + }, + { + "cell_type": "markdown", + "id": "380049c1-2963-4223-b698-a7b59b9fe595", + "metadata": {}, + "source": [ + "## Protein Representation Using Amino Acid Sequence Notation\n", + "\n", + "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", + "\n", + "### Example Protein Sequence\n", + "\n", + "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", + "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", + "\n", + "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", + "- **Sequence Length**: 147\n", + "\n", + "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", + "\n", + "### The 20 Amino Acids and Their One-Letter Notations\n", + "\n", + "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", + "\n", + "| One-Letter Notation | Amino Acid Name | Description |\n", + "|---------------------|----------------------|---------------------------------------------------------|\n", + "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", + "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. 
|\n", + "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **F** | Phenylalanine | Aromatic, non-polar. |\n", + "| **G** | Glycine | Smallest amino acid, non-polar. |\n", + "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", + "| **I** | Isoleucine | Non-polar, aliphatic. |\n", + "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", + "| **L** | Leucine | Non-polar, aliphatic. |\n", + "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", + "| **N** | Asparagine | Polar, uncharged. |\n", + "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", + "| **Q** | Glutamine | Polar, uncharged. |\n", + "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", + "| **S** | Serine | Polar, can be phosphorylated. |\n", + "| **T** | Threonine | Polar, can be phosphorylated. |\n", + "| **V** | Valine | Non-polar, aliphatic. |\n", + "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", + "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. |\n", + "\n", + "### Understanding Protein Sequences\n", + "\n", + "In the Lysozyme C sequence shown above, each letter represents one of the amino acids listed in the table. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "\n", + "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", + "\n", + "\n", + "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" + ] + }, + { + "cell_type": "markdown", + "id": "702359d6-5338-4391-b196-2328ba5676a1", + "metadata": {}, + "source": [ + "---" + ] } ], "metadata": { From 0e4814fde3f5b365587912729eba6ef5aba131c6 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 12:33:37 +0200 Subject: [PATCH 009/112] fix - jupyter markdown cells formatting issue - https://github.com/jupyter/notebook/issues/7002 - Fix using notebook formatter provided by pycharm professional --- data_exploration.ipynb | 512 ++++++++++++++++++++--------------------- 1 file changed, 252 insertions(+), 260 deletions(-) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index b0c9e78f..8cd834b1 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -14,11 +14,11 @@ }, { "cell_type": "markdown", - "id": "33275d3c-cdbf-4c1f-aa04-f135511f3643", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", "metadata": {}, "source": [ - "# 1. Instantiation of a Data Class\r\n", - "\r\n", + "# 1. Instantiation of a Data Class\n", + "\n", "To start working with `chebai`, you first need to instantiate a ChEBI data class. 
This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", "### Inheritance Hierarchy\n", "\n", @@ -29,55 +29,54 @@ "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", "\n", "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", - ".\r\n", - "\r\n", - "### Explanation\r\n", - "a ChEBI data classiData` class can be configured with the following main parameters:\r\n", - "\r\n", - "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\r\n", - "\r\n", - "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\r\n", - "\r\n", - "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\r\n", - "\r\n", - "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\r\n", - "\r\n", - "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\r\n", - "\r\n", - "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", - "\r\n", - "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\r\n", - "\r\n", - "### Additional Input Parameters\r\n", - "\r\n", - "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\r\n", - "\r\n", - "- **batch_size (int)**: The batch size for data loading. Default is `1`.\r\n", - "\r\n", - "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\r\n", - "\r\n", - "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\r\n", - "\r\n", - "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\r\n", - "\r\n", - "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\r\n", - "\r\n", - "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\r\n", - "\r\n", - "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\r\n", - "\r\n", - "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\r\n", - "\r\n", - "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\r\n", - "\r\n", - "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. 
Default is `None`.\r\n", - "\r\n", - "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\r\n", - "\r\n", - "- **kwargs**: Additional keyword arguments.\r\n", - "\r\n", - "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\r\n", - "ining and validation.\r\n" + "\n", + "\n", + "### Explanation\n", + "A ChEBI data class can be configured with the following main parameters:\n", + "\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "\n", + "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", + "\n", + "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\n", + "\n", + "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\n", + "\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\n", + "\n", + "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\n", + "\n", + "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\n", + "\n", + "### Additional Input Parameters\n", + "\n", + "The `XYBaseDataModule` class, which ChEBI data classes build on, includes several important parameters for data loading and processing:\n", + "\n", + "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n", + "\n", + "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\n", + "\n", + "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\n", + "\n", + "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\n", + "\n", + "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\n", + "\n", + "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\n", + "\n", + "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\n", + "\n", + "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\n", + "\n", + "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\n", + "\n", + "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\n", + "\n", + "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. 
Default is `None`.\n", + "\n", + "- **kwargs**: Additional keyword arguments.\n", + "\n", + "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\n" ] }, { @@ -151,31 +150,29 @@ "id": "1655d489-25fe-46de-9feb-eeca5d36936f", "metadata": {}, "source": [ - "# 2. Preparation / Setup Methods\r\n", - "\r\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\r\n", - "\r\n", - "### Why is Preparation Needed?\r\n", - "\r\n", - "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\r\n", - "- **Data Integrity**: It ensures that the data files are up-to-date and compatible with the specified ChEBI version.\r\n", - "\r\n", - "### Main Methods for Data Preprocessing\r\n", - "\r\n", - "The data preprocessing in a data class involves two main methods:\r\n", - "\r\n", - "1. **`prepare_data` Method**:\r\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\r\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\r\n", - "\r\n", - "2. **`setup` Method**:\r\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. 
It also handles cross-validation settings if enabled.\r\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\r\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\r\n", - "\r\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes.\r\n", - "alidation processes.\r\n", - "processed(data_df, processed_name)\r\n" + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. 
**`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." ] }, { @@ -221,67 +218,65 @@ "metadata": {}, "source": [ "# 3. Different Data Files Created and their Structure\n", - "\r\n", - "\r\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\r\n", - "\r\n", - "### Data Files\r\n", - "\r\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file)\r\n", - " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", - "\r\n", - "2. **`data.pkl`**\r\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", - "\r\n", - "3. 
**`data.pt`**\r\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", - "\r\n", - "4. **`classes.txt`**\r\n", - " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\r\n", - "\r\n", - "5. **`splits.csv`**\r\n", - " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\r\n", - "\r\n", - "### File Structure and Preprocessing Stages\r\n", - "\r\n", - "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\r\n", - "\r\n", - "1. **Raw Data Stage**:\r\n", - " - **File**: `chebi.obo`\r\n", - " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", - "\r\n", - "2. **Processed Data Stage 1**:\r\n", - " - **File**: `data.pkl`\r\n", - " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", - " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\r\n", - "\r\n", - "3. 
**Processed Data Stage 2**:\r\n", - " - **File**: `data.pt`\r\n", - " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", - " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\r\n", - "\r\n", - "### Data Splits\r\n", - "\r\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\r\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\r\n", - "\r\n", - "### Summary of File Paths\r\n", - "\r\n", - "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\r\n", - "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\r\n", - "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\r\n", - "\r\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\r\n", - "that each step is well-documented and reproducible.\r\n", - "sing, from raw input to model-ready formats.\r\n" + "\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", + "\n", + "### Data Files\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", + " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. 
This file serves as the foundation for data processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **`data.pkl`**\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "### File Structure and Preprocessing Stages\n", + "\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", + "\n", + "1. 
**Raw Data Stage**:\n", + " - **File**: `chebi.obo`\n", + " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **Processed Data Stage 1**:\n", + " - **File**: `data.pkl`\n", + " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n", + "\n", + "3. **Processed Data Stage 2**:\n", + " - **File**: `data.pt`\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", + "\n", + "### Summary of File Paths\n", + "\n", + "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", + "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", + "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", + "\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments." ] }, { @@ -323,29 +318,27 @@ "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", "is_a: CHEBI:24870\n", "is_a: CHEBI:33238\n", - "```0\r\n", - "is_a: CHEBI:3323Relevant 8\r\n", - "```\r\n", - "\r\n", - "### Breakdown of Attributes\r\n", - "\r\n", - "Each term document in the `chebi.obo` file consists of the following key attributes:\r\n", - "\r\n", - "- **`[Term]`**: \r\n", - " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\r\n", - "\r\n", - "- **`id: CHEBI:24867`**: \r\n", - " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\r\n", - " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\r\n", - "\r\n", - "- **`name: monoatomic ion`**: \r\n", - " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\r\n", - " - **Example**: \"monoatomic ion\" is the namcating a related term within the ChEBI ontology.\r\n", - "\r\n", - "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \r\n", - " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\r\n", - " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaent stages of preprocessing, from raw input files to processed, model-ready formats.\r\n", - "```" + "is_a: CHEBI:3323Relevant 8\n", + "```\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each term document in the `chebi.obo` file consists of the following key attributes:\n", + "\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. 
Each term represents a distinct chemical entity.\n", + "\n", + "- **`id: CHEBI:24867`**: \n", + " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n", + " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n", + "\n", + "- **`name: monoatomic ion`**: \n", + " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n", + " - **Example**: \"monoatomic ion\" is the name of the entity; the `synonym` line lists \"monoatomic ions\" as a related term within the ChEBI ontology.\n", + "\n", + "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n", + " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`." ] }, { @@ -715,9 +708,9 @@ "id": "b058714f-e434-4367-89b9-74c129ac727f", "metadata": {}, "source": [ - "## `splits.csv` File\r\n", - "\r\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n" + "## `splits.csv` File\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" ] }, { @@ -821,40 +814,40 @@ "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", "\n", - "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\r\n", - "\r\n", - "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\r\n", - " - **Benzene SMILES**: `c1ccccc1`\r\n", - " - **Explanation**: \r\n", - " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\r\n", - "\r\n", - "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\r\n", - " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\r\n", - " - **Explanation**: \r\n", - " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\r\n", - " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\r\n", - "\r\n", - "### 3. **InChI (IUPAC International Chemical Identifier)**\r\n", - " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\r\n", - " - **Explanation**: \r\n", - " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\r\n", - "\r\n", - "### 4. **InChIKey**\r\n", - " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\r\n", - " - **Explanation**: \r\n", - " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\r\n", - "\r\n", - "### 5. 
**Canonical SMILES**\r\n", - " - **Benzene Canonical SMILES**: `c1ccccc1`\r\n", - " - **Explanation**:\r\n", - " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\r\n", - "\r\n", - "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\r\n", - " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\r\n", - " - **Explanation**: \r\n", - " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\r\n", - "\r\n", - "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics.d by different computational tools." + "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n", + "\n", + "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n", + " - **Benzene SMILES**: `c1ccccc1`\n", + " - **Explanation**: \n", + " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n", + "\n", + "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n", + " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n", + " - **Explanation**: \n", + " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n", + " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n", + "\n", + "### 3. **InChI (IUPAC International Chemical Identifier)**\n", + " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\n", + " - **Explanation**: \n", + " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\n", + "\n", + "### 4. 
**InChIKey**\n", + " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\n", + " - **Explanation**: \n", + " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\n", + "\n", + "### 5. **Canonical SMILES**\n", + " - **Benzene Canonical SMILES**: `c1ccccc1`\n", + " - **Explanation**:\n", + " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\n", + "\n", + "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\n", + " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\n", + " - **Explanation**: \n", + " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\n", + "\n", + "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." ] }, { @@ -870,41 +863,41 @@ "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde", "metadata": {}, "source": [ - "# Information for Protein Dataset\r\n", - "\r\n", - "The protein dataset follows thsimilarme file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\r\n", - "\r\n", - "### Configuration Parameters\r\n", - "\r\n", - "Data classes related to proteins can be configured using the following main parameters:\r\n", - "\r\n", - "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\r\n", - "\r\n", - "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\r\n", - "\r\n", - "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\r\n", - "\r\n", - "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", - "\r\n", - "### Available GOUniProt Data Classes\r\n", - "\r\n", - "#### `GOUniProtOver250`\r\n", - "\r\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\r\n", - "\r\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", - "\r\n", - "#### `GOUniProtOver50`\r\n", - "\r\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\r\n", - "\r\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", - "\r\n", - "### Instantiation Example\r\n", - "\r\n", - "```python\r\n", - "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\r\n", - "go_class = GOUniProtOver250()\r\n" + "# Information for Protein Dataset\n", + "\n", + "The protein dataset follows a similar file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\n", + "\n", + "### Configuration Parameters\n", + "\n", + "Data classes related to proteins can be configured using the following main parameters:\n", + "\n", + "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", + "\n", + "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\n", + "\n", + "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", + "\n", + "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\n", + "\n", + "### Available GOUniProt Data Classes\n", + "\n", + "#### `GOUniProtOver250`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "\n", + "#### `GOUniProtOver50`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "\n", + "### Instantiation Example\n", + "\n", + "```python\n", + "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\n", + "go_class = GOUniProtOver250()\n" ] }, { @@ -912,32 +905,31 @@ "id": "2ffca830-bc0b-421c-8054-0860c95c10f2", "metadata": {}, "source": [ - "## GOUniProt Data File Structure\r\n", - "\r\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\r\n", - " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\r\n", - " - **File Paths**:\r\n", - " - `data/GO_UniProt/raw/${filename}.obo`\r\n", - " - `data/GO_UniProt/raw/${filename}.dat`\r\n", - "\r\n", - "2. **`data.pkl`**\r\n", - " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\r\n", - "\r\n", - "3. 
**`data.pt`**\r\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\r\n", - "\r\n", - "4. **`classes.txt`**\r\n", - " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\r\n", - "\r\n", - "5. **`splits.csv`**\r\n", - " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\r\n", - "\r\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\r\n", - "}/processed/splits.csv`\r\n" + "## GOUniProt Data File Structure\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", + " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", + " - **File Paths**:\n", + " - `data/GO_UniProt/raw/${filename}.obo`\n", + " - `data/GO_UniProt/raw/${filename}.dat`\n", + "\n", + "2. 
**`data.pkl`**\n", + " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n" ] }, { @@ -1259,7 +1251,7 @@ "\n", "### Understanding Protein Sequences\n", "\n", - "In the example sequence `MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQGQL`, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "In the example sequence, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", "\n", "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", "\n", From 8539f3bc3f1376dcf98eecfa06de6258f7a0b77a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 12:34:47 +0200 Subject: [PATCH 010/112] move to tutorials dir --- data_exploration.ipynb => tutorials/data_exploration.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data_exploration.ipynb => tutorials/data_exploration.ipynb (100%) diff --git a/data_exploration.ipynb b/tutorials/data_exploration.ipynb similarity index 100% rename from data_exploration.ipynb rename to tutorials/data_exploration.ipynb From cc5bc08d31ca7bbd1731144f96e44647ace78f82 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 29 Aug 2024 21:07:45 +0200 Subject: [PATCH 011/112] move previous tests to integration dir --- tests/integration/__init__.py | 3 +++ tests/{ => integration}/testChebiData.py | 0 .../{ => integration}/testChebiDynamicDataSplits.py | 0 .../testCustomBalancedAccuracyMetric.py | 0 tests/{ => integration}/testCustomMacroF1Metric.py | 0 tests/{ => integration}/testPubChemData.py | 0 tests/{ => integration}/testTox21MolNetData.py | 0 .../test_data/ChEBIOver100_test/labels000.pt | Bin .../test_data/ChEBIOver100_test/labels001.pt | Bin .../test_data/ChEBIOver100_test/labels002.pt | Bin .../test_data/ChEBIOver100_test/labels003.pt | Bin .../test_data/ChEBIOver100_test/labels004.pt | Bin .../test_data/ChEBIOver100_test/labels005.pt | Bin .../test_data/ChEBIOver100_test/labels006.pt | Bin .../test_data/ChEBIOver100_test/labels007.pt | Bin .../test_data/ChEBIOver100_test/labels008.pt | Bin .../test_data/ChEBIOver100_test/labels009.pt | Bin .../test_data/ChEBIOver100_test/labels010.pt | Bin .../test_data/ChEBIOver100_test/labels011.pt | Bin .../test_data/ChEBIOver100_test/labels012.pt | 
Bin .../test_data/ChEBIOver100_test/labels013.pt | Bin .../test_data/ChEBIOver100_test/labels014.pt | Bin .../test_data/ChEBIOver100_test/labels015.pt | Bin .../test_data/ChEBIOver100_test/labels016.pt | Bin .../test_data/ChEBIOver100_test/labels017.pt | Bin .../test_data/ChEBIOver100_test/labels018.pt | Bin .../test_data/ChEBIOver100_test/labels019.pt | Bin .../test_data/ChEBIOver100_test/preds000.pt | Bin .../test_data/ChEBIOver100_test/preds001.pt | Bin .../test_data/ChEBIOver100_test/preds002.pt | Bin .../test_data/ChEBIOver100_test/preds003.pt | Bin .../test_data/ChEBIOver100_test/preds004.pt | Bin .../test_data/ChEBIOver100_test/preds005.pt | Bin .../test_data/ChEBIOver100_test/preds006.pt | Bin .../test_data/ChEBIOver100_test/preds007.pt | Bin .../test_data/ChEBIOver100_test/preds008.pt | Bin .../test_data/ChEBIOver100_test/preds009.pt | Bin .../test_data/ChEBIOver100_test/preds010.pt | Bin .../test_data/ChEBIOver100_test/preds011.pt | Bin .../test_data/ChEBIOver100_test/preds012.pt | Bin .../test_data/ChEBIOver100_test/preds013.pt | Bin .../test_data/ChEBIOver100_test/preds014.pt | Bin .../test_data/ChEBIOver100_test/preds015.pt | Bin .../test_data/ChEBIOver100_test/preds016.pt | Bin .../test_data/ChEBIOver100_test/preds017.pt | Bin .../test_data/ChEBIOver100_test/preds018.pt | Bin .../test_data/ChEBIOver100_test/preds019.pt | Bin 47 files changed, 3 insertions(+) create mode 100644 tests/integration/__init__.py rename tests/{ => integration}/testChebiData.py (100%) rename tests/{ => integration}/testChebiDynamicDataSplits.py (100%) rename tests/{ => integration}/testCustomBalancedAccuracyMetric.py (100%) rename tests/{ => integration}/testCustomMacroF1Metric.py (100%) rename tests/{ => integration}/testPubChemData.py (100%) rename tests/{ => integration}/testTox21MolNetData.py (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels000.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels001.pt (100%) rename tests/{ => 
integration}/test_data/ChEBIOver100_test/labels002.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels003.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels004.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels005.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels006.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels007.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels008.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels009.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels010.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels011.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels012.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels013.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels014.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels015.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels016.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels017.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels018.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/labels019.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds000.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds001.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds002.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds003.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds004.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds005.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds006.pt (100%) rename tests/{ => 
integration}/test_data/ChEBIOver100_test/preds007.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds008.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds009.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds010.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds011.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds012.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds013.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds014.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds015.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds016.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds017.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds018.pt (100%) rename tests/{ => integration}/test_data/ChEBIOver100_test/preds019.pt (100%) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..caa8759f --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1,3 @@ +""" +This directory contains integration tests that cover the overall behavior of the data preprocessing tool. 
+""" diff --git a/tests/testChebiData.py b/tests/integration/testChebiData.py similarity index 100% rename from tests/testChebiData.py rename to tests/integration/testChebiData.py diff --git a/tests/testChebiDynamicDataSplits.py b/tests/integration/testChebiDynamicDataSplits.py similarity index 100% rename from tests/testChebiDynamicDataSplits.py rename to tests/integration/testChebiDynamicDataSplits.py diff --git a/tests/testCustomBalancedAccuracyMetric.py b/tests/integration/testCustomBalancedAccuracyMetric.py similarity index 100% rename from tests/testCustomBalancedAccuracyMetric.py rename to tests/integration/testCustomBalancedAccuracyMetric.py diff --git a/tests/testCustomMacroF1Metric.py b/tests/integration/testCustomMacroF1Metric.py similarity index 100% rename from tests/testCustomMacroF1Metric.py rename to tests/integration/testCustomMacroF1Metric.py diff --git a/tests/testPubChemData.py b/tests/integration/testPubChemData.py similarity index 100% rename from tests/testPubChemData.py rename to tests/integration/testPubChemData.py diff --git a/tests/testTox21MolNetData.py b/tests/integration/testTox21MolNetData.py similarity index 100% rename from tests/testTox21MolNetData.py rename to tests/integration/testTox21MolNetData.py diff --git a/tests/test_data/ChEBIOver100_test/labels000.pt b/tests/integration/test_data/ChEBIOver100_test/labels000.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels000.pt rename to tests/integration/test_data/ChEBIOver100_test/labels000.pt diff --git a/tests/test_data/ChEBIOver100_test/labels001.pt b/tests/integration/test_data/ChEBIOver100_test/labels001.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels001.pt rename to tests/integration/test_data/ChEBIOver100_test/labels001.pt diff --git a/tests/test_data/ChEBIOver100_test/labels002.pt b/tests/integration/test_data/ChEBIOver100_test/labels002.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels002.pt 
rename to tests/integration/test_data/ChEBIOver100_test/labels002.pt diff --git a/tests/test_data/ChEBIOver100_test/labels003.pt b/tests/integration/test_data/ChEBIOver100_test/labels003.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels003.pt rename to tests/integration/test_data/ChEBIOver100_test/labels003.pt diff --git a/tests/test_data/ChEBIOver100_test/labels004.pt b/tests/integration/test_data/ChEBIOver100_test/labels004.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels004.pt rename to tests/integration/test_data/ChEBIOver100_test/labels004.pt diff --git a/tests/test_data/ChEBIOver100_test/labels005.pt b/tests/integration/test_data/ChEBIOver100_test/labels005.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels005.pt rename to tests/integration/test_data/ChEBIOver100_test/labels005.pt diff --git a/tests/test_data/ChEBIOver100_test/labels006.pt b/tests/integration/test_data/ChEBIOver100_test/labels006.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels006.pt rename to tests/integration/test_data/ChEBIOver100_test/labels006.pt diff --git a/tests/test_data/ChEBIOver100_test/labels007.pt b/tests/integration/test_data/ChEBIOver100_test/labels007.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels007.pt rename to tests/integration/test_data/ChEBIOver100_test/labels007.pt diff --git a/tests/test_data/ChEBIOver100_test/labels008.pt b/tests/integration/test_data/ChEBIOver100_test/labels008.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels008.pt rename to tests/integration/test_data/ChEBIOver100_test/labels008.pt diff --git a/tests/test_data/ChEBIOver100_test/labels009.pt b/tests/integration/test_data/ChEBIOver100_test/labels009.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels009.pt rename to tests/integration/test_data/ChEBIOver100_test/labels009.pt diff --git 
a/tests/test_data/ChEBIOver100_test/labels010.pt b/tests/integration/test_data/ChEBIOver100_test/labels010.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels010.pt rename to tests/integration/test_data/ChEBIOver100_test/labels010.pt diff --git a/tests/test_data/ChEBIOver100_test/labels011.pt b/tests/integration/test_data/ChEBIOver100_test/labels011.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels011.pt rename to tests/integration/test_data/ChEBIOver100_test/labels011.pt diff --git a/tests/test_data/ChEBIOver100_test/labels012.pt b/tests/integration/test_data/ChEBIOver100_test/labels012.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels012.pt rename to tests/integration/test_data/ChEBIOver100_test/labels012.pt diff --git a/tests/test_data/ChEBIOver100_test/labels013.pt b/tests/integration/test_data/ChEBIOver100_test/labels013.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels013.pt rename to tests/integration/test_data/ChEBIOver100_test/labels013.pt diff --git a/tests/test_data/ChEBIOver100_test/labels014.pt b/tests/integration/test_data/ChEBIOver100_test/labels014.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels014.pt rename to tests/integration/test_data/ChEBIOver100_test/labels014.pt diff --git a/tests/test_data/ChEBIOver100_test/labels015.pt b/tests/integration/test_data/ChEBIOver100_test/labels015.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels015.pt rename to tests/integration/test_data/ChEBIOver100_test/labels015.pt diff --git a/tests/test_data/ChEBIOver100_test/labels016.pt b/tests/integration/test_data/ChEBIOver100_test/labels016.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels016.pt rename to tests/integration/test_data/ChEBIOver100_test/labels016.pt diff --git a/tests/test_data/ChEBIOver100_test/labels017.pt 
b/tests/integration/test_data/ChEBIOver100_test/labels017.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels017.pt rename to tests/integration/test_data/ChEBIOver100_test/labels017.pt diff --git a/tests/test_data/ChEBIOver100_test/labels018.pt b/tests/integration/test_data/ChEBIOver100_test/labels018.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels018.pt rename to tests/integration/test_data/ChEBIOver100_test/labels018.pt diff --git a/tests/test_data/ChEBIOver100_test/labels019.pt b/tests/integration/test_data/ChEBIOver100_test/labels019.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/labels019.pt rename to tests/integration/test_data/ChEBIOver100_test/labels019.pt diff --git a/tests/test_data/ChEBIOver100_test/preds000.pt b/tests/integration/test_data/ChEBIOver100_test/preds000.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds000.pt rename to tests/integration/test_data/ChEBIOver100_test/preds000.pt diff --git a/tests/test_data/ChEBIOver100_test/preds001.pt b/tests/integration/test_data/ChEBIOver100_test/preds001.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds001.pt rename to tests/integration/test_data/ChEBIOver100_test/preds001.pt diff --git a/tests/test_data/ChEBIOver100_test/preds002.pt b/tests/integration/test_data/ChEBIOver100_test/preds002.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds002.pt rename to tests/integration/test_data/ChEBIOver100_test/preds002.pt diff --git a/tests/test_data/ChEBIOver100_test/preds003.pt b/tests/integration/test_data/ChEBIOver100_test/preds003.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds003.pt rename to tests/integration/test_data/ChEBIOver100_test/preds003.pt diff --git a/tests/test_data/ChEBIOver100_test/preds004.pt b/tests/integration/test_data/ChEBIOver100_test/preds004.pt similarity index 100% rename from 
tests/test_data/ChEBIOver100_test/preds004.pt rename to tests/integration/test_data/ChEBIOver100_test/preds004.pt diff --git a/tests/test_data/ChEBIOver100_test/preds005.pt b/tests/integration/test_data/ChEBIOver100_test/preds005.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds005.pt rename to tests/integration/test_data/ChEBIOver100_test/preds005.pt diff --git a/tests/test_data/ChEBIOver100_test/preds006.pt b/tests/integration/test_data/ChEBIOver100_test/preds006.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds006.pt rename to tests/integration/test_data/ChEBIOver100_test/preds006.pt diff --git a/tests/test_data/ChEBIOver100_test/preds007.pt b/tests/integration/test_data/ChEBIOver100_test/preds007.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds007.pt rename to tests/integration/test_data/ChEBIOver100_test/preds007.pt diff --git a/tests/test_data/ChEBIOver100_test/preds008.pt b/tests/integration/test_data/ChEBIOver100_test/preds008.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds008.pt rename to tests/integration/test_data/ChEBIOver100_test/preds008.pt diff --git a/tests/test_data/ChEBIOver100_test/preds009.pt b/tests/integration/test_data/ChEBIOver100_test/preds009.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds009.pt rename to tests/integration/test_data/ChEBIOver100_test/preds009.pt diff --git a/tests/test_data/ChEBIOver100_test/preds010.pt b/tests/integration/test_data/ChEBIOver100_test/preds010.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds010.pt rename to tests/integration/test_data/ChEBIOver100_test/preds010.pt diff --git a/tests/test_data/ChEBIOver100_test/preds011.pt b/tests/integration/test_data/ChEBIOver100_test/preds011.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds011.pt rename to tests/integration/test_data/ChEBIOver100_test/preds011.pt diff --git 
a/tests/test_data/ChEBIOver100_test/preds012.pt b/tests/integration/test_data/ChEBIOver100_test/preds012.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds012.pt rename to tests/integration/test_data/ChEBIOver100_test/preds012.pt diff --git a/tests/test_data/ChEBIOver100_test/preds013.pt b/tests/integration/test_data/ChEBIOver100_test/preds013.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds013.pt rename to tests/integration/test_data/ChEBIOver100_test/preds013.pt diff --git a/tests/test_data/ChEBIOver100_test/preds014.pt b/tests/integration/test_data/ChEBIOver100_test/preds014.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds014.pt rename to tests/integration/test_data/ChEBIOver100_test/preds014.pt diff --git a/tests/test_data/ChEBIOver100_test/preds015.pt b/tests/integration/test_data/ChEBIOver100_test/preds015.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds015.pt rename to tests/integration/test_data/ChEBIOver100_test/preds015.pt diff --git a/tests/test_data/ChEBIOver100_test/preds016.pt b/tests/integration/test_data/ChEBIOver100_test/preds016.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds016.pt rename to tests/integration/test_data/ChEBIOver100_test/preds016.pt diff --git a/tests/test_data/ChEBIOver100_test/preds017.pt b/tests/integration/test_data/ChEBIOver100_test/preds017.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds017.pt rename to tests/integration/test_data/ChEBIOver100_test/preds017.pt diff --git a/tests/test_data/ChEBIOver100_test/preds018.pt b/tests/integration/test_data/ChEBIOver100_test/preds018.pt similarity index 100% rename from tests/test_data/ChEBIOver100_test/preds018.pt rename to tests/integration/test_data/ChEBIOver100_test/preds018.pt diff --git a/tests/test_data/ChEBIOver100_test/preds019.pt b/tests/integration/test_data/ChEBIOver100_test/preds019.pt similarity index 100% 
rename from tests/test_data/ChEBIOver100_test/preds019.pt rename to tests/integration/test_data/ChEBIOver100_test/preds019.pt From 5af03512863cb7b68193eb0698c899b762de721b Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 29 Aug 2024 21:13:07 +0200 Subject: [PATCH 012/112] unit dir + test for ChemDataReader --- tests/unit/__init__.py | 4 ++ tests/unit/collators/__init__.py | 0 tests/unit/data_readers/__init__.py | 0 tests/unit/data_readers/testChemDataReader.py | 71 +++++++++++++++++++ tests/unit/dataset_classes/__init__.py | 0 5 files changed, 75 insertions(+) create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/collators/__init__.py create mode 100644 tests/unit/data_readers/__init__.py create mode 100644 tests/unit/data_readers/testChemDataReader.py create mode 100644 tests/unit/dataset_classes/__init__.py diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 00000000..6640a696 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1,4 @@ +""" +This directory contains unit tests, which focus on individual functions and methods, ensuring they work as +expected in isolation. +""" diff --git a/tests/unit/collators/__init__.py b/tests/unit/collators/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/data_readers/__init__.py b/tests/unit/data_readers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/data_readers/testChemDataReader.py b/tests/unit/data_readers/testChemDataReader.py new file mode 100644 index 00000000..bf3dea6e --- /dev/null +++ b/tests/unit/data_readers/testChemDataReader.py @@ -0,0 +1,71 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +from chebai.preprocessing.reader import EMBEDDING_OFFSET, ChemDataReader + + +class TestChemDataReader(unittest.TestCase): + """ + Unit tests for the ChemDataReader class. 
+ """ + + @patch( + "chebai.preprocessing.reader.open", + new_callable=mock_open, + read_data="C\nO\nN\n=\n1\n(", + ) + def setUp(self, mock_file: mock_open) -> None: + """ + Set up the test environment by initializing a ChemDataReader instance with a mocked token file. + + Args: + mock_file: Mock object for file operations. + """ + self.reader = ChemDataReader(token_path="/mock/path") + # After initializing, self.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '('] + self.assertEqual(self.reader.cache, ["C", "O", "N", "=", "1", "("]) + + def test_read_data(self) -> None: + """ + Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string. + """ + raw_data = "CC(=O)NC1" + # Expected output as per the tokens already in the cache, and ")" getting added to it. + expected_output: List[int] = [ + EMBEDDING_OFFSET + 0, # C + EMBEDDING_OFFSET + 0, # C + EMBEDDING_OFFSET + 5, # = + EMBEDDING_OFFSET + 3, # O + EMBEDDING_OFFSET + 1, # N + EMBEDDING_OFFSET + 6, # ( + EMBEDDING_OFFSET + 2, # C + EMBEDDING_OFFSET + 0, # C + EMBEDDING_OFFSET + 4, # 1 + ] + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + def test_read_data_with_new_token(self) -> None: + """ + Test the _read_data method with a SMILES string that includes a new token. + Ensure that the new token is added to the cache and processed correctly. + """ + raw_data = "[H-]" + + # Note: test methods within a TestCase class are not guaranteed to be executed in any specific order. + # Determine the index for the new token based on the current size of the cache. 
+ index_for_last_token = len(self.reader.cache) + expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] + + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + # Verify that '[H-]' was added to the cache + self.assertIn("[H-]", self.reader.cache) + # Ensure it's at the correct index + self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/dataset_classes/__init__.py b/tests/unit/dataset_classes/__init__.py new file mode 100644 index 00000000..e69de29b From a0810a233dd319c7fcb18bb3684eacd3047796ef Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 29 Aug 2024 21:15:49 +0200 Subject: [PATCH 013/112] Test for DataReader --- tests/unit/data_readers/testDataReader.py | 51 +++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tests/unit/data_readers/testDataReader.py diff --git a/tests/unit/data_readers/testDataReader.py b/tests/unit/data_readers/testDataReader.py new file mode 100644 index 00000000..1a511b26 --- /dev/null +++ b/tests/unit/data_readers/testDataReader.py @@ -0,0 +1,51 @@ +import unittest +from typing import Any, Dict, List + +from chebai.preprocessing.reader import DataReader + + +class TestDataReader(unittest.TestCase): + """ + Unit tests for the DataReader class. + """ + + def setUp(self) -> None: + """ + Set up the test environment by initializing a DataReader instance. + """ + self.reader = DataReader() + + def test_to_data(self) -> None: + """ + Test the to_data method to ensure it correctly processes the input row + and formats it according to the expected output. + + This method tests the conversion of raw data into a processed format, + including extracting features, labels, ident, group, and additional + keyword arguments. 
+ """ + features_list: List[int] = [10, 20, 30] + labels_list: List[bool] = [True, False, True] + ident_no: int = 123 + + row: Dict[str, Any] = { + "features": features_list, + "labels": labels_list, + "ident": ident_no, + "group": "group_data", + "additional_kwargs": {"extra_key": "extra_value"}, + } + + expected: Dict[str, Any] = { + "features": features_list, + "labels": labels_list, + "ident": ident_no, + "group": "group_data", + "extra_key": "extra_value", + } + + self.assertEqual(self.reader.to_data(row), expected) + + +if __name__ == "__main__": + unittest.main() From 1b3836d5c103a1455f41245b757a94acc0b3d5f5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 29 Aug 2024 23:09:53 +0200 Subject: [PATCH 014/112] tests for DeepChemReader --- .../{data_readers => readers}/__init__.py | 0 .../testChemDataReader.py | 8 +- .../testDataReader.py | 0 tests/unit/readers/testDeepChemDataReader.py | 80 +++++++++++++++++++ 4 files changed, 85 insertions(+), 3 deletions(-) rename tests/unit/{data_readers => readers}/__init__.py (100%) rename tests/unit/{data_readers => readers}/testChemDataReader.py (90%) rename tests/unit/{data_readers => readers}/testDataReader.py (100%) create mode 100644 tests/unit/readers/testDeepChemDataReader.py diff --git a/tests/unit/data_readers/__init__.py b/tests/unit/readers/__init__.py similarity index 100% rename from tests/unit/data_readers/__init__.py rename to tests/unit/readers/__init__.py diff --git a/tests/unit/data_readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py similarity index 90% rename from tests/unit/data_readers/testChemDataReader.py rename to tests/unit/readers/testChemDataReader.py index bf3dea6e..2bc525e1 100644 --- a/tests/unit/data_readers/testChemDataReader.py +++ b/tests/unit/readers/testChemDataReader.py @@ -8,6 +8,8 @@ class TestChemDataReader(unittest.TestCase): """ Unit tests for the ChemDataReader class. 
+ + Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order. """ @patch( @@ -30,7 +32,7 @@ def test_read_data(self) -> None: """ Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string. """ - raw_data = "CC(=O)NC1" + raw_data = "CC(=O)NC1[Mg-2]" # Expected output as per the tokens already in the cache, and ")" getting added to it. expected_output: List[int] = [ EMBEDDING_OFFSET + 0, # C @@ -38,10 +40,11 @@ def test_read_data(self) -> None: EMBEDDING_OFFSET + 5, # = EMBEDDING_OFFSET + 3, # O EMBEDDING_OFFSET + 1, # N - EMBEDDING_OFFSET + 6, # ( + EMBEDDING_OFFSET + len(self.reader.cache), # ( EMBEDDING_OFFSET + 2, # C EMBEDDING_OFFSET + 0, # C EMBEDDING_OFFSET + 4, # 1 + EMBEDDING_OFFSET + len(self.reader.cache) + 1, # [Mg-2] ] result = self.reader._read_data(raw_data) self.assertEqual(result, expected_output) @@ -53,7 +56,6 @@ def test_read_data_with_new_token(self) -> None: """ raw_data = "[H-]" - # Note: test methods within a TestCase class are not guaranteed to be executed in any specific order. # Determine the index for the new token based on the current size of the cache. 
index_for_last_token = len(self.reader.cache) expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] diff --git a/tests/unit/data_readers/testDataReader.py b/tests/unit/readers/testDataReader.py similarity index 100% rename from tests/unit/data_readers/testDataReader.py rename to tests/unit/readers/testDataReader.py diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py new file mode 100644 index 00000000..c93e2592 --- /dev/null +++ b/tests/unit/readers/testDeepChemDataReader.py @@ -0,0 +1,80 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +from chebai.preprocessing.reader import EMBEDDING_OFFSET, DeepChemDataReader + + +class TestDeepChemDataReader(unittest.TestCase): + """ + Unit tests for the DeepChemDataReader class. + + Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order. + """ + + @patch( + "chebai.preprocessing.reader.open", + new_callable=mock_open, + read_data="C\nO\nc\n)", + ) + def setUp(self, mock_file: mock_open) -> None: + """ + Set up the test environment by initializing a DeepChemDataReader instance with a mocked token file. + + Args: + mock_file: Mock object for file operations. + """ + self.reader = DeepChemDataReader(token_path="/mock/path") + # After initializing, self.reader.cache should now be set to ['C', 'O', 'c', ')'] + self.assertEqual(self.reader.cache, ["C", "O", "c", ")"]) + + def test_read_data(self) -> None: + """ + Test the _read_data method with a SMILES string to ensure it correctly tokenizes the string. + """ + raw_data = "c1ccccc1C(Br)(OC)I[Ni-2]" + + # Expected output as per the tokens already in the cache, and new tokens getting added to it. 
+ expected_output: List[int] = [ + EMBEDDING_OFFSET + 2, # c + EMBEDDING_OFFSET + 2, # c + EMBEDDING_OFFSET + 2, # c + EMBEDDING_OFFSET + 2, # c + EMBEDDING_OFFSET + 2, # c + EMBEDDING_OFFSET + 2, # c + EMBEDDING_OFFSET + len(self.reader.cache), # 6 (new token) + EMBEDDING_OFFSET + 0, # C + EMBEDDING_OFFSET + len(self.reader.cache) + 1, # Br (new token) + EMBEDDING_OFFSET + 3, # ) + EMBEDDING_OFFSET + 1, # O + EMBEDDING_OFFSET + 0, # C + EMBEDDING_OFFSET + 3, # ) + EMBEDDING_OFFSET + 3, # ) + EMBEDDING_OFFSET + len(self.reader.cache) + 2, # I (new token) + EMBEDDING_OFFSET + len(self.reader.cache) + 3, # [Ni-2] (new token) + ] + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + def test_read_data_with_new_token(self) -> None: + """ + Test the _read_data method with a SMILES string that includes a new token. + Ensure that the new token is added to the cache and processed correctly. + """ + raw_data = "[H-]" + + # Determine the index for the new token based on the current size of the cache. 
+ index_for_last_token = len(self.reader.cache) + expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] + + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + # Verify that '[H-]' was added to the cache + self.assertIn("[H-]", self.reader.cache) + # Ensure it's at the correct index + self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token) + + +if __name__ == "__main__": + unittest.main() From aa467c6fde67a9545b23c79132c128d0a837b69e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Fri, 30 Aug 2024 00:06:39 +0200 Subject: [PATCH 015/112] Test for SelfiesReader --- tests/unit/readers/testDeepChemDataReader.py | 3 + tests/unit/readers/testSelfiesReader.py | 106 +++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tests/unit/readers/testSelfiesReader.py diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py index c93e2592..ac1a50b7 100644 --- a/tests/unit/readers/testDeepChemDataReader.py +++ b/tests/unit/readers/testDeepChemDataReader.py @@ -34,6 +34,9 @@ def test_read_data(self) -> None: """ raw_data = "c1ccccc1C(Br)(OC)I[Ni-2]" + # benzene is c1ccccc1 in SMILES but cccccc6 in DeepSMILES + # SMILES C(Br)(OC)I can be converted to the DeepSMILES CBr)OC))I. + # Resultant String: "cccccc6CBr)OC))I[Ni-2]" # Expected output as per the tokens already in the cache, and new tokens getting added to it. expected_output: List[int] = [ EMBEDDING_OFFSET + 2, # c diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py new file mode 100644 index 00000000..41202757 --- /dev/null +++ b/tests/unit/readers/testSelfiesReader.py @@ -0,0 +1,106 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +from chebai.preprocessing.reader import EMBEDDING_OFFSET, SelfiesReader + + +class TestSelfiesReader(unittest.TestCase): + """ + Unit tests for the SelfiesReader class. 
+ + Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order. + """ + + @patch( + "chebai.preprocessing.reader.open", + new_callable=mock_open, + read_data="[C]\n[O]\n[=C]", + ) + def setUp(self, mock_file: mock_open) -> None: + """ + Set up the test environment by initializing a SelfiesReader instance with a mocked token file. + + Args: + mock_file: Mock object for file operations. + """ + self.reader = SelfiesReader(token_path="/mock/path") + # After initializing, self.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[('] + self.assertEqual( + self.reader.cache, + [ + "[C]", + "[O]", + "[=C]", + ], + ) + + def test_read_data(self) -> None: + """ + Test the _read_data method with a SELFIES string to ensure it correctly tokenizes the string. + """ + raw_data = "c1ccccc1C(Br)(OC)I[Ni-2]" + + # benzene is "c1ccccc1" in SMILES is translated to "[C][=C][C][=C][C][=C][Ring1][=Branch1]" in SELFIES + # SELFIES translation of SMILES "c1ccccc1C(Br)(OC)I[Ni-2]": + # "[C][=C][C][=C][C][=C][Ring1][=Branch1][C][Branch1][C][Br][Branch1][Ring1][O][C][I][Ni-2]" + expected_output: List[int] = [ + EMBEDDING_OFFSET + 0, # [C] (already in cache) + EMBEDDING_OFFSET + 2, # [=C] (already in cache) + EMBEDDING_OFFSET + 0, # [C] (already in cache) + EMBEDDING_OFFSET + 2, # [=C] (already in cache) + EMBEDDING_OFFSET + 0, # [C] (already in cache) + EMBEDDING_OFFSET + 2, # [=C] (already in cache) + EMBEDDING_OFFSET + len(self.reader.cache), # [Ring1] (new token) + EMBEDDING_OFFSET + len(self.reader.cache) + 1, # [=Branch1] (new token) + EMBEDDING_OFFSET + 0, # [C] (already in cache) + EMBEDDING_OFFSET + len(self.reader.cache) + 2, # [Branch1] (new token) + EMBEDDING_OFFSET + 0, # [C] (already in cache) + EMBEDDING_OFFSET + len(self.reader.cache) + 3, # [Br] (new token) + EMBEDDING_OFFSET + + len(self.reader.cache) + + 2, # [Branch1] (reused new token) + EMBEDDING_OFFSET + len(self.reader.cache), # [Ring1] (reused new token) + 
EMBEDDING_OFFSET + 1, # [O] (already in cache) + EMBEDDING_OFFSET + 0, # [C] (already in cache) + EMBEDDING_OFFSET + len(self.reader.cache) + 4, # [I] (new token) + EMBEDDING_OFFSET + len(self.reader.cache) + 5, # [Ni-2] (new token) + ] + + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + def test_read_data_with_new_token(self) -> None: + """ + Test the _read_data method with a SELFIES string that includes a new token. + Ensure that the new token is added to the cache and processed correctly. + """ + raw_data = "[H-]" + + # Determine the index for the new token based on the current size of the cache. + index_for_last_token = len(self.reader.cache) + expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] + + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + # Verify that '[H-1]' was added to the cache, "[H-]" translated to "[H-1]" in SELFIES + self.assertIn("[H-1]", self.reader.cache) + # Ensure it's at the correct index + self.assertEqual(self.reader.cache.index("[H-1]"), index_for_last_token) + + def test_read_data_with_invalid_selfies(self) -> None: + """ + Test the _read_data method with an invalid SELFIES string to ensure error handling works. 
+ """ + raw_data = "[C][O][INVALID][N]" + + result = self.reader._read_data(raw_data) + self.assertIsNone(result) + + # Verify that the error count was incremented + self.assertEqual(self.reader.error_count, 1) + + +if __name__ == "__main__": + unittest.main() From b6f5e5162d22359a67fa212c288b13715fd51356 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Fri, 30 Aug 2024 23:14:49 +0200 Subject: [PATCH 016/112] test for ProteinDataReader --- tests/unit/readers/testProteinDataReader.py | 105 ++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 tests/unit/readers/testProteinDataReader.py diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py new file mode 100644 index 00000000..5f828e75 --- /dev/null +++ b/tests/unit/readers/testProteinDataReader.py @@ -0,0 +1,105 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +from chebai.preprocessing.reader import EMBEDDING_OFFSET, ProteinDataReader + + +class TestProteinDataReader(unittest.TestCase): + """ + Unit tests for the ProteinDataReader class. + """ + + @patch( + "chebai.preprocessing.reader.open", + new_callable=mock_open, + read_data="M\nK\nT\nF\nR\nN", + ) + def setUp(self, mock_file: mock_open) -> None: + """ + Set up the test environment by initializing a ProteinDataReader instance with a mocked token file. + + Args: + mock_file: Mock object for file operations. + """ + self.reader = ProteinDataReader(token_path="/mock/path") + # After initializing, self.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] + self.assertEqual(self.reader.cache, ["M", "K", "T", "F", "R", "N"]) + + def test_read_data(self) -> None: + """ + Test the _read_data method with a protein sequence to ensure it correctly tokenizes the sequence. 
+ """ + raw_data = "MKTFFRN" + + # Expected output based on the cached tokens + expected_output: List[int] = [ + EMBEDDING_OFFSET + 0, # M + EMBEDDING_OFFSET + 1, # K + EMBEDDING_OFFSET + 2, # T + EMBEDDING_OFFSET + 3, # F + EMBEDDING_OFFSET + 3, # F (repeated token) + EMBEDDING_OFFSET + 4, # R + EMBEDDING_OFFSET + 5, # N + ] + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + def test_read_data_with_new_token(self) -> None: + """ + Test the _read_data method with a protein sequence that includes a new token. + Ensure that the new token is added to the cache and processed correctly. + """ + raw_data = "MKTFY" + + # 'Y' is not in the initial cache and should be added. + expected_output: List[int] = [ + EMBEDDING_OFFSET + 0, # M + EMBEDDING_OFFSET + 1, # K + EMBEDDING_OFFSET + 2, # T + EMBEDDING_OFFSET + 3, # F + EMBEDDING_OFFSET + len(self.reader.cache), # Y (new token) + ] + + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + # Verify that 'Y' was added to the cache + self.assertIn("Y", self.reader.cache) + # Ensure it's at the correct index + self.assertEqual(self.reader.cache.index("Y"), len(self.reader.cache) - 1) + + def test_read_data_with_invalid_token(self) -> None: + """ + Test the _read_data method with an invalid amino acid token to ensure it raises a KeyError. + """ + raw_data = "MKTFZ" # 'Z' is not a valid amino acid token + + with self.assertRaises(KeyError) as context: + self.reader._read_data(raw_data) + + self.assertIn("Invalid token 'Z' encountered", str(context.exception)) + + def test_read_data_with_empty_sequence(self) -> None: + """ + Test the _read_data method with an empty protein sequence to ensure it returns an empty list. 
+ """ + raw_data = "" + + result = self.reader._read_data(raw_data) + self.assertEqual(result, []) + + def test_read_data_with_repeated_tokens(self) -> None: + """ + Test the _read_data method with repeated amino acid tokens to ensure it handles them correctly. + """ + raw_data = "MMMMM" + + expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5 # All tokens are 'M' + + result = self.reader._read_data(raw_data) + self.assertEqual(result, expected_output) + + +if __name__ == "__main__": + unittest.main() From 73f05c01f81c90107eccb61c638529755b05df15 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 31 Aug 2024 00:03:21 +0200 Subject: [PATCH 017/112] test for DefaultCollator --- tests/unit/collators/testDefaultCollator.py | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tests/unit/collators/testDefaultCollator.py diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py new file mode 100644 index 00000000..6362d0a6 --- /dev/null +++ b/tests/unit/collators/testDefaultCollator.py @@ -0,0 +1,52 @@ +import unittest +from typing import Dict, List + +from chebai.preprocessing.collate import DefaultCollator +from chebai.preprocessing.structures import XYData + + +class TestDefaultCollator(unittest.TestCase): + """ + Unit tests for the DefaultCollator class. + """ + + def setUp(self) -> None: + """ + Set up the test environment by initializing a DefaultCollator instance. + """ + self.collator = DefaultCollator() + + def test_call_with_valid_data(self) -> None: + """ + Test the __call__ method with valid data to ensure features and labels are correctly extracted. 
+ """ + data: List[Dict] = [ + {"features": [1.0, 2.0], "labels": 0}, + {"features": [3.0, 4.0], "labels": 1}, + ] + + result: XYData = self.collator(data) + self.assertIsInstance(result, XYData) + + expected_x = ([1.0, 2.0], [3.0, 4.0]) + expected_y = (0, 1) + + self.assertEqual(result.x, expected_x) + self.assertEqual(result.y, expected_y) + + def test_call_with_empty_data(self) -> None: + """ + Test the __call__ method with an empty list to ensure it handles the edge case correctly. + """ + data: List[Dict] = [] + + with self.assertRaises(ValueError) as context: + self.collator(data) + + self.assertEqual( + str(context.exception), "not enough values to unpack (expected 2, got 0)" + ) + + +if __name__ == "__main__": + unittest.main() From 8007f37f7622168fa3db1837e5b7fafcb8307a5e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 31 Aug 2024 22:05:16 +0200 Subject: [PATCH 018/112] test for RaggedColllator --- tests/unit/collators/testRaggedCollator.py | 150 +++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 tests/unit/collators/testRaggedCollator.py diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py new file mode 100644 index 00000000..97e1c08f --- /dev/null +++ b/tests/unit/collators/testRaggedCollator.py @@ -0,0 +1,150 @@ +import unittest +from typing import Dict, List, Tuple + +import torch + +from chebai.preprocessing.collate import RaggedCollator +from chebai.preprocessing.structures import XYData + + +class TestRaggedCollator(unittest.TestCase): + """ + Unit tests for the RaggedCollator class. + """ + + def setUp(self) -> None: + """ + Set up the test environment by initializing a RaggedCollator instance. + """ + self.collator = RaggedCollator() + + def test_call_with_valid_data(self) -> None: + """ + Test the __call__ method with valid ragged data to ensure features, labels, and masks are correctly handled. 
+ """ + data: List[Dict] = [ + {"features": [1, 2], "labels": [1, 0], "ident": "sample1"}, + {"features": [3, 4, 5], "labels": [0, 1, 1], "ident": "sample2"}, + {"features": [6], "labels": [1], "ident": "sample3"}, + ] + + result: XYData = self.collator(data) + + expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]]) + expected_y = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]]) + expected_mask_for_x = torch.tensor( + [[True, True, False], [True, True, True], [True, False, False]] + ) + expected_lens_for_x = torch.tensor([2, 3, 1]) + + self.assertTrue(torch.equal(result.x, expected_x)) + self.assertTrue(torch.equal(result.y, expected_y)) + self.assertTrue( + torch.equal( + result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x + ) + ) + self.assertTrue( + torch.equal( + result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x + ) + ) + self.assertEqual( + result.additional_fields["idents"], ("sample1", "sample2", "sample3") + ) + + def test_call_with_missing_entire_labels(self) -> None: + """ + Test the __call__ method with data where some samples are missing labels. 
+ """ + data: List[Dict] = [ + {"features": [1, 2], "labels": [1, 0], "ident": "sample1"}, + {"features": [3, 4, 5], "labels": None, "ident": "sample2"}, + {"features": [6], "labels": [1], "ident": "sample3"}, + ] + + result: XYData = self.collator(data) + + expected_x = torch.tensor([[1, 2], [6, 0]]) + expected_y = torch.tensor([[1, 0], [1, 0]]) + expected_mask_for_x = torch.tensor([[True, True], [True, False]]) + expected_lens_for_x = torch.tensor([2, 1]) + + self.assertTrue(torch.equal(result.x, expected_x)) + self.assertTrue(torch.equal(result.y, expected_y)) + self.assertTrue( + torch.equal( + result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x + ) + ) + self.assertTrue( + torch.equal( + result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x + ) + ) + self.assertEqual( + result.additional_fields["loss_kwargs"]["non_null_labels"], [0, 2] + ) + self.assertEqual( + result.additional_fields["idents"], ("sample1", "sample2", "sample3") + ) + + def test_call_with_none_in_labels(self) -> None: + """ + Test the __call__ method with data where one of the elements in the labels is None. 
+ """ + data: List[Dict] = [ + {"features": [1, 2], "labels": [None, 1], "ident": "sample1"}, + {"features": [3, 4, 5], "labels": [1, 0], "ident": "sample2"}, + {"features": [6], "labels": [1], "ident": "sample3"}, + ] + + result: XYData = self.collator(data) + + expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]]) + expected_y = torch.tensor([[0, 1], [1, 0], [1, 0]]) # None is replaced by 0 + expected_mask_for_x = torch.tensor( + [[True, True, False], [True, True, True], [True, False, False]] + ) + expected_lens_for_x = torch.tensor([2, 3, 1]) + + self.assertTrue(torch.equal(result.x, expected_x)) + self.assertTrue(torch.equal(result.y, expected_y)) + self.assertTrue( + torch.equal( + result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x + ) + ) + self.assertTrue( + torch.equal( + result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x + ) + ) + self.assertEqual( + result.additional_fields["idents"], ("sample1", "sample2", "sample3") + ) + + def test_call_with_empty_data(self) -> None: + """ + Test the __call__ method with an empty list to ensure it raises an error. + """ + data: List[Dict] = [] + + with self.assertRaises(Exception): + self.collator(data) + + def test_process_label_rows(self) -> None: + """ + Test the process_label_rows method to ensure it pads label sequences correctly. 
+ """ + labels: Tuple = ([1, 0], [0, 1, 1], [1]) + + result: torch.Tensor = self.collator.process_label_rows(labels) + + expected_output = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]]) + + self.assertTrue(torch.equal(result, expected_output)) + + +if __name__ == "__main__": + unittest.main() From 248eaa7034ac2aa204d578c2c249096ee07dbd83 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 31 Aug 2024 23:55:52 +0200 Subject: [PATCH 019/112] modify tests to use `setUpClass` class method instead of `setUp` instance method --- tests/unit/collators/testDefaultCollator.py | 5 +++-- tests/unit/collators/testRaggedCollator.py | 5 +++-- tests/unit/readers/testChemDataReader.py | 9 +++++---- tests/unit/readers/testDataReader.py | 5 +++-- tests/unit/readers/testDeepChemDataReader.py | 9 +++++---- tests/unit/readers/testProteinDataReader.py | 9 +++++---- tests/unit/readers/testSelfiesReader.py | 16 +++++----------- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py index 6362d0a6..287cadcd 100644 --- a/tests/unit/collators/testDefaultCollator.py +++ b/tests/unit/collators/testDefaultCollator.py @@ -10,11 +10,12 @@ class TestDefaultCollator(unittest.TestCase): Unit tests for the DefaultCollator class. """ - def setUp(self) -> None: + @classmethod + def setUpClass(cls) -> None: """ Set up the test environment by initializing a DefaultCollator instance. """ - self.collator = DefaultCollator() + cls.collator = DefaultCollator() def test_call_with_valid_data(self) -> None: """ diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py index 97e1c08f..a3126314 100644 --- a/tests/unit/collators/testRaggedCollator.py +++ b/tests/unit/collators/testRaggedCollator.py @@ -12,11 +12,12 @@ class TestRaggedCollator(unittest.TestCase): Unit tests for the RaggedCollator class. 
""" - def setUp(self) -> None: + @classmethod + def setUpClass(cls) -> None: """ Set up the test environment by initializing a RaggedCollator instance. """ - self.collator = RaggedCollator() + cls.collator = RaggedCollator() def test_call_with_valid_data(self) -> None: """ diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py index 2bc525e1..3d7b5e6f 100644 --- a/tests/unit/readers/testChemDataReader.py +++ b/tests/unit/readers/testChemDataReader.py @@ -12,21 +12,22 @@ class TestChemDataReader(unittest.TestCase): Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order. """ + @classmethod @patch( "chebai.preprocessing.reader.open", new_callable=mock_open, read_data="C\nO\nN\n=\n1\n(", ) - def setUp(self, mock_file: mock_open) -> None: + def setUpClass(cls, mock_file: mock_open) -> None: """ Set up the test environment by initializing a ChemDataReader instance with a mocked token file. Args: mock_file: Mock object for file operations. """ - self.reader = ChemDataReader(token_path="/mock/path") - # After initializing, self.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '('] - self.assertEqual(self.reader.cache, ["C", "O", "N", "=", "1", "("]) + cls.reader = ChemDataReader(token_path="/mock/path") + # After initializing, cls.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '('] + assert cls.reader.cache == ["C", "O", "N", "=", "1", "("] def test_read_data(self) -> None: """ diff --git a/tests/unit/readers/testDataReader.py b/tests/unit/readers/testDataReader.py index 1a511b26..8a8af053 100644 --- a/tests/unit/readers/testDataReader.py +++ b/tests/unit/readers/testDataReader.py @@ -9,11 +9,12 @@ class TestDataReader(unittest.TestCase): Unit tests for the DataReader class. """ - def setUp(self) -> None: + @classmethod + def setUpClass(cls) -> None: """ Set up the test environment by initializing a DataReader instance. 
""" - self.reader = DataReader() + cls.reader = DataReader() def test_to_data(self) -> None: """ diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py index ac1a50b7..23ac35d5 100644 --- a/tests/unit/readers/testDeepChemDataReader.py +++ b/tests/unit/readers/testDeepChemDataReader.py @@ -12,21 +12,22 @@ class TestDeepChemDataReader(unittest.TestCase): Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order. """ + @classmethod @patch( "chebai.preprocessing.reader.open", new_callable=mock_open, read_data="C\nO\nc\n)", ) - def setUp(self, mock_file: mock_open) -> None: + def setUpClass(cls, mock_file: mock_open) -> None: """ Set up the test environment by initializing a DeepChemDataReader instance with a mocked token file. Args: mock_file: Mock object for file operations. """ - self.reader = DeepChemDataReader(token_path="/mock/path") - # After initializing, self.reader.cache should now be set to ['C', 'O', 'c', ')'] - self.assertEqual(self.reader.cache, ["C", "O", "c", ")"]) + cls.reader = DeepChemDataReader(token_path="/mock/path") + # After initializing, cls.reader.cache should now be set to ['C', 'O', 'c', ')'] + assert cls.reader.cache == ["C", "O", "c", ")"] def test_read_data(self) -> None: """ diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py index 5f828e75..6e5f325c 100644 --- a/tests/unit/readers/testProteinDataReader.py +++ b/tests/unit/readers/testProteinDataReader.py @@ -10,21 +10,22 @@ class TestProteinDataReader(unittest.TestCase): Unit tests for the ProteinDataReader class. """ + @classmethod @patch( "chebai.preprocessing.reader.open", new_callable=mock_open, read_data="M\nK\nT\nF\nR\nN", ) - def setUp(self, mock_file: mock_open) -> None: + def setUpClass(cls, mock_file: mock_open) -> None: """ Set up the test environment by initializing a ProteinDataReader instance with a mocked token file. 
Args: mock_file: Mock object for file operations. """ - self.reader = ProteinDataReader(token_path="/mock/path") - # After initializing, self.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] - self.assertEqual(self.reader.cache, ["M", "K", "T", "F", "R", "N"]) + cls.reader = ProteinDataReader(token_path="/mock/path") + # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] + assert cls.reader.cache == ["M", "K", "T", "F", "R", "N"] def test_read_data(self) -> None: """ diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py index 41202757..019a0f59 100644 --- a/tests/unit/readers/testSelfiesReader.py +++ b/tests/unit/readers/testSelfiesReader.py @@ -12,28 +12,22 @@ class TestSelfiesReader(unittest.TestCase): Note: Test methods within a TestCase class are not guaranteed to be executed in any specific order. """ + @classmethod @patch( "chebai.preprocessing.reader.open", new_callable=mock_open, read_data="[C]\n[O]\n[=C]", ) - def setUp(self, mock_file: mock_open) -> None: + def setUpClass(cls, mock_file: mock_open) -> None: """ Set up the test environment by initializing a SelfiesReader instance with a mocked token file. Args: mock_file: Mock object for file operations. 
""" - self.reader = SelfiesReader(token_path="/mock/path") - # After initializing, self.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[('] - self.assertEqual( - self.reader.cache, - [ - "[C]", - "[O]", - "[=C]", - ], - ) + cls.reader = SelfiesReader(token_path="/mock/path") + # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[('] + assert cls.reader.cache == ["[C]", "[O]", "[=C]"] def test_read_data(self) -> None: """ From 3e57d78420ec8b1076b5e5842c535b03b212da8a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 1 Sep 2024 13:33:18 +0200 Subject: [PATCH 020/112] bool labels instead of numeric, for realistic data --- tests/unit/collators/testDefaultCollator.py | 6 ++-- tests/unit/collators/testRaggedCollator.py | 34 +++++++++++++-------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py index 287cadcd..29b1cc91 100644 --- a/tests/unit/collators/testDefaultCollator.py +++ b/tests/unit/collators/testDefaultCollator.py @@ -22,15 +22,15 @@ def test_call_with_valid_data(self) -> None: Test the __call__ method with valid data to ensure features and labels are correctly extracted. 
""" data: List[Dict] = [ - {"features": [1.0, 2.0], "labels": 0}, - {"features": [3.0, 4.0], "labels": 1}, + {"features": [1.0, 2.0], "labels": [True, False, True]}, + {"features": [3.0, 4.0], "labels": [False, False, True]}, ] result: XYData = self.collator(data) self.assertIsInstance(result, XYData) expected_x = ([1.0, 2.0], [3.0, 4.0]) - expected_y = (0, 1) + expected_y = ([True, False, True], [False, False, True]) self.assertEqual(result.x, expected_x) self.assertEqual(result.y, expected_y) diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py index a3126314..81947b47 100644 --- a/tests/unit/collators/testRaggedCollator.py +++ b/tests/unit/collators/testRaggedCollator.py @@ -24,15 +24,17 @@ def test_call_with_valid_data(self) -> None: Test the __call__ method with valid ragged data to ensure features, labels, and masks are correctly handled. """ data: List[Dict] = [ - {"features": [1, 2], "labels": [1, 0], "ident": "sample1"}, - {"features": [3, 4, 5], "labels": [0, 1, 1], "ident": "sample2"}, - {"features": [6], "labels": [1], "ident": "sample3"}, + {"features": [1, 2], "labels": [True, False], "ident": "sample1"}, + {"features": [3, 4, 5], "labels": [False, True, True], "ident": "sample2"}, + {"features": [6], "labels": [True], "ident": "sample3"}, ] result: XYData = self.collator(data) expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]]) - expected_y = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]]) + expected_y = torch.tensor( + [[True, False, False], [False, True, True], [True, False, False]] + ) expected_mask_for_x = torch.tensor( [[True, True, False], [True, True, True], [True, False, False]] ) @@ -59,15 +61,17 @@ def test_call_with_missing_entire_labels(self) -> None: Test the __call__ method with data where some samples are missing labels. 
""" data: List[Dict] = [ - {"features": [1, 2], "labels": [1, 0], "ident": "sample1"}, + {"features": [1, 2], "labels": [True, False], "ident": "sample1"}, {"features": [3, 4, 5], "labels": None, "ident": "sample2"}, - {"features": [6], "labels": [1], "ident": "sample3"}, + {"features": [6], "labels": [True], "ident": "sample3"}, ] result: XYData = self.collator(data) expected_x = torch.tensor([[1, 2], [6, 0]]) - expected_y = torch.tensor([[1, 0], [1, 0]]) + expected_y = torch.tensor( + [[True, False], [True, False]] + ) # True -> 1, False -> 0 expected_mask_for_x = torch.tensor([[True, True], [True, False]]) expected_lens_for_x = torch.tensor([2, 1]) @@ -95,15 +99,17 @@ def test_call_with_none_in_labels(self) -> None: Test the __call__ method with data where one of the elements in the labels is None. """ data: List[Dict] = [ - {"features": [1, 2], "labels": [None, 1], "ident": "sample1"}, - {"features": [3, 4, 5], "labels": [1, 0], "ident": "sample2"}, - {"features": [6], "labels": [1], "ident": "sample3"}, + {"features": [1, 2], "labels": [None, True], "ident": "sample1"}, + {"features": [3, 4, 5], "labels": [True, False], "ident": "sample2"}, + {"features": [6], "labels": [True], "ident": "sample3"}, ] result: XYData = self.collator(data) expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]]) - expected_y = torch.tensor([[0, 1], [1, 0], [1, 0]]) # None is replaced by 0 + expected_y = torch.tensor( + [[False, True], [True, False], [True, False]] + ) # None -> False expected_mask_for_x = torch.tensor( [[True, True, False], [True, True, True], [True, False, False]] ) @@ -138,11 +144,13 @@ def test_process_label_rows(self) -> None: """ Test the process_label_rows method to ensure it pads label sequences correctly. 
""" - labels: Tuple = ([1, 0], [0, 1, 1], [1]) + labels: Tuple = ([True, False], [False, True, True], [True]) result: torch.Tensor = self.collator.process_label_rows(labels) - expected_output = torch.tensor([[1, 0, 0], [0, 1, 1], [1, 0, 0]]) + expected_output = torch.tensor( + [[True, False, False], [False, True, True], [True, False, False]] + ) self.assertTrue(torch.equal(result, expected_output)) From f9ca653d76b9a8434b1a1a487ee57b796156b40a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 1 Sep 2024 13:33:51 +0200 Subject: [PATCH 021/112] test for XYBaseDataModule --- .../dataset_classes/testXYBaseDataModule.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/unit/dataset_classes/testXYBaseDataModule.py diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py new file mode 100644 index 00000000..d8aabc67 --- /dev/null +++ b/tests/unit/dataset_classes/testXYBaseDataModule.py @@ -0,0 +1,76 @@ +import unittest +from unittest.mock import PropertyMock, patch + +from chebai.preprocessing.datasets.base import XYBaseDataModule +from chebai.preprocessing.reader import ProteinDataReader + + +class TestXYBaseDataModule(unittest.TestCase): + """ + Unit tests for the methods of the XYBaseDataModule class. + """ + + @classmethod + @patch.object(XYBaseDataModule, "_name", new_callable=PropertyMock) + def setUpClass(cls, mock_name_property) -> None: + """ + Set up a base instance of XYBaseDataModule for testing. 
+ """ + + # Mock the _name property of XYBaseDataModule + mock_name_property.return_value = "MockedXYBaseDataModule" + + # Assign a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) + XYBaseDataModule.READER = ProteinDataReader + + # Initialize the module with a label_filter + cls.module = XYBaseDataModule( + label_filter=1, # Provide a label_filter + balance_after_filter=1.0, # Balance ratio + ) + + def test_filter_labels_valid_index(self) -> None: + """ + Test the _filter_labels method with a valid label_filter index. + """ + self.module.label_filter = 1 + row = { + "features": ["feature1", "feature2"], + "labels": [0, 3, 1, 2], # List of labels + } + filtered_row = self.module._filter_labels(row) + expected_labels = [3] # Only the label at index 1 should be kept + + self.assertEqual(filtered_row["labels"], expected_labels) + + row = { + "features": ["feature1", "feature2"], + "labels": [True, False, True, True], + } + self.assertEqual(self.module._filter_labels(row)["labels"], [False]) + + def test_filter_labels_no_filter(self) -> None: + """ + Test the _filter_labels method with no label_filter index. + """ + # Update the module to have no label filter + self.module.label_filter = None + row = {"features": ["feature1", "feature2"], "labels": [False, True]} + # Handle the case where the index is out of bounds + with self.assertRaises(TypeError): + self.module._filter_labels(row) + + def test_filter_labels_invalid_index(self) -> None: + """ + Test the _filter_labels method with an invalid label_filter index. 
+ """ + # Set an invalid label filter index (e.g., greater than the number of labels) + self.module.label_filter = 10 + row = {"features": ["feature1", "feature2"], "labels": [False, True]} + # Handle the case where the index is out of bounds + with self.assertRaises(IndexError): + self.module._filter_labels(row) + + +if __name__ == "__main__": + unittest.main() From d8016aa6459548f8981c43473706a80c9748fca9 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 2 Sep 2024 00:25:58 +0200 Subject: [PATCH 022/112] test for DynamicDataset --- .../dataset_classes/testDynamicDataset.py | 231 ++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 tests/unit/dataset_classes/testDynamicDataset.py diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py new file mode 100644 index 00000000..ae69952a --- /dev/null +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ -0,0 +1,231 @@ +import unittest +from typing import Tuple +from unittest.mock import PropertyMock, patch + +import pandas as pd + +from chebai.preprocessing.datasets.base import _DynamicDataset +from chebai.preprocessing.reader import ProteinDataReader + + +class TestDynamicDataset(unittest.TestCase): + """ + Test case for _DynamicDataset functionality, ensuring correct data splits and integrity + of train, validation, and test datasets. + """ + + @classmethod + @patch.multiple(_DynamicDataset, __abstractmethods__=frozenset()) + @patch.object(_DynamicDataset, "base_dir", new_callable=PropertyMock) + @patch.object(_DynamicDataset, "_name", new_callable=PropertyMock) + def setUpClass( + cls, mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock + ) -> None: + """ + Set up a base instance of _DynamicDataset for testing with mocked properties. 
+ """ + + # Mocking properties + mock_base_dir_property.return_value = "MockedBaseDirProperty" + mock_name_property.return_value = "MockedNameProperty" + + # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) + _DynamicDataset.READER = ProteinDataReader + + # Creating an instance of the dataset + cls.dataset: _DynamicDataset = _DynamicDataset() + + # Dataset with a balanced distribution of labels + X = [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + [19, 20], + [21, 22], + [23, 24], + [25, 26], + [27, 28], + [29, 30], + [31, 32], + ] + y = [ + [False, False], + [False, True], + [True, False], + [True, True], + [False, False], + [False, True], + [True, False], + [True, True], + [False, False], + [False, True], + [True, False], + [True, True], + [False, False], + [False, True], + [True, False], + [True, True], + ] + cls.df = pd.DataFrame( + {"ident": [f"id{i + 1}" for i in range(len(X))], "features": X, "labels": y} + ) + + def test_get_test_split_valid(self) -> None: + """ + Test splitting the dataset into train and test sets and verify balance and non-overlap. + """ + self.dataset.train_split = 0.5 + # Test size will be 0.25 * 16 = 4 + train_df, test_df = self.dataset.get_test_split(self.df, seed=0) + + # Assert the correct number of rows in train and test sets + self.assertEqual(len(train_df), 12, "Train set should contain 12 samples.") + self.assertEqual(len(test_df), 4, "Test set should contain 4 samples.") + + # Check positive and negative label counts in train and test sets + train_pos_count, train_neg_count = self.get_positive_negative_labels_counts( + train_df + ) + test_pos_count, test_neg_count = self.get_positive_negative_labels_counts( + test_df + ) + + # Ensure that the train and test sets have balanced positives and negatives + self.assertEqual( + train_pos_count, train_neg_count, "Train set labels should be balanced." 
+ ) + self.assertEqual( + test_pos_count, test_neg_count, "Test set labels should be balanced." + ) + + # Assert there is no overlap between train and test sets + train_idents = set(train_df["ident"]) + test_idents = set(test_df["ident"]) + self.assertEqual( + len(train_idents.intersection(test_idents)), + 0, + "Train and test sets should not overlap.", + ) + + def test_get_test_split_missing_labels(self) -> None: + """ + Test the behavior when the 'labels' column is missing in the dataset. + """ + df_missing_labels = pd.DataFrame({"ident": ["id1", "id2"]}) + with self.assertRaises( + KeyError, msg="Expected KeyError when 'labels' column is missing." + ): + self.dataset.get_test_split(df_missing_labels) + + def test_get_test_split_seed_consistency(self) -> None: + """ + Test that splitting the dataset with the same seed produces consistent results. + """ + train_df1, test_df1 = self.dataset.get_test_split(self.df, seed=42) + train_df2, test_df2 = self.dataset.get_test_split(self.df, seed=42) + + pd.testing.assert_frame_equal( + train_df1, + train_df2, + obj="Train sets should be identical for the same seed.", + ) + pd.testing.assert_frame_equal( + test_df1, test_df2, obj="Test sets should be identical for the same seed." + ) + + def test_get_train_val_splits_given_test(self) -> None: + """ + Test splitting the dataset into train and validation sets and verify balance and non-overlap. 
+ """ + self.dataset.use_inner_cross_validation = False + self.dataset.train_split = 0.5 + df_train_main, test_df = self.dataset.get_test_split(self.df, seed=0) + train_df, val_df = self.dataset.get_train_val_splits_given_test( + df_train_main, test_df, seed=42 + ) + + # Ensure there is no overlap between train and test sets + train_idents = set(train_df["ident"]) + test_idents = set(test_df["ident"]) + self.assertEqual( + len(train_idents.intersection(test_idents)), + 0, + "Train and test sets should not overlap.", + ) + + # Ensure there is no overlap between validation and test sets + val_idents = set(val_df["ident"]) + self.assertEqual( + len(val_idents.intersection(test_idents)), + 0, + "Validation and test sets should not overlap.", + ) + + # Ensure there is no overlap between train and validation sets + self.assertEqual( + len(train_idents.intersection(val_idents)), + 0, + "Train and validation sets should not overlap.", + ) + + # Check positive and negative label counts in train and validation sets + train_pos_count, train_neg_count = self.get_positive_negative_labels_counts( + train_df + ) + val_pos_count, val_neg_count = self.get_positive_negative_labels_counts(val_df) + + # Ensure that the train and validation sets have balanced positives and negatives + self.assertEqual( + train_pos_count, train_neg_count, "Train set labels should be balanced." + ) + self.assertEqual( + val_pos_count, val_neg_count, "Validation set labels should be balanced." + ) + + def test_get_train_val_splits_given_test_consistency(self) -> None: + """ + Test that splitting the dataset into train and validation sets with the same seed produces consistent results. 
+ """ + test_df = self.df.iloc[12:] # Assume rows 12 onward are for testing + train_df1, val_df1 = self.dataset.get_train_val_splits_given_test( + self.df, test_df, seed=42 + ) + train_df2, val_df2 = self.dataset.get_train_val_splits_given_test( + self.df, test_df, seed=42 + ) + + pd.testing.assert_frame_equal( + train_df1, + train_df2, + obj="Train sets should be identical for the same seed.", + ) + pd.testing.assert_frame_equal( + val_df1, + val_df2, + obj="Validation sets should be identical for the same seed.", + ) + + @staticmethod + def get_positive_negative_labels_counts(df: pd.DataFrame) -> Tuple[int, int]: + """ + Count the number of True and False values within the labels column. + + Args: + df (pd.DataFrame): The DataFrame containing the 'labels' column. + + Returns: + Tuple[int, int]: A tuple containing the counts of True and False values, respectively. + """ + true_count = sum(sum(label) for label in df["labels"]) + false_count = sum(len(label) - sum(label) for label in df["labels"]) + return true_count, false_count + + +if __name__ == "__main__": + unittest.main() From 0c7c5b8fab7612bbcfc8c7feba8d07d7b797a3d9 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 2 Sep 2024 00:53:59 +0200 Subject: [PATCH 023/112] add relevant msg to each assert statement --- tests/unit/collators/testDefaultCollator.py | 20 ++++- tests/unit/collators/testRaggedCollator.py | 73 ++++++++++++++----- .../dataset_classes/testDynamicDataset.py | 4 +- .../dataset_classes/testXYBaseDataModule.py | 25 +++++-- tests/unit/readers/testChemDataReader.py | 33 +++++++-- tests/unit/readers/testDataReader.py | 6 +- tests/unit/readers/testDeepChemDataReader.py | 31 ++++++-- tests/unit/readers/testProteinDataReader.py | 49 +++++++++++-- tests/unit/readers/testSelfiesReader.py | 43 +++++++++-- 9 files changed, 227 insertions(+), 57 deletions(-) diff --git a/tests/unit/collators/testDefaultCollator.py b/tests/unit/collators/testDefaultCollator.py index 29b1cc91..73f09c75 100644 --- 
a/tests/unit/collators/testDefaultCollator.py +++ b/tests/unit/collators/testDefaultCollator.py @@ -27,13 +27,23 @@ def test_call_with_valid_data(self) -> None: ] result: XYData = self.collator(data) - self.assertIsInstance(result, XYData) + self.assertIsInstance( + result, XYData, "The result should be an instance of XYData." + ) expected_x = ([1.0, 2.0], [3.0, 4.0]) expected_y = ([True, False, True], [False, False, True]) - self.assertEqual(result.x, expected_x) - self.assertEqual(result.y, expected_y) + self.assertEqual( + result.x, + expected_x, + "The feature data 'x' does not match the expected output.", + ) + self.assertEqual( + result.y, + expected_y, + "The label data 'y' does not match the expected output.", + ) def test_call_with_empty_data(self) -> None: """ @@ -45,7 +55,9 @@ def test_call_with_empty_data(self) -> None: self.collator(data) self.assertEqual( - str(context.exception), "not enough values to unpack (expected 2, got 0)" + str(context.exception), + "not enough values to unpack (expected 2, got 0)", + "The exception message for empty data is not as expected.", ) diff --git a/tests/unit/collators/testRaggedCollator.py b/tests/unit/collators/testRaggedCollator.py index 81947b47..d31776a6 100644 --- a/tests/unit/collators/testRaggedCollator.py +++ b/tests/unit/collators/testRaggedCollator.py @@ -40,20 +40,30 @@ def test_call_with_valid_data(self) -> None: ) expected_lens_for_x = torch.tensor([2, 3, 1]) - self.assertTrue(torch.equal(result.x, expected_x)) - self.assertTrue(torch.equal(result.y, expected_y)) + self.assertTrue( + torch.equal(result.x, expected_x), + "The feature tensor 'x' does not match the expected output.", + ) + self.assertTrue( + torch.equal(result.y, expected_y), + "The label tensor 'y' does not match the expected output.", + ) self.assertTrue( torch.equal( result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x - ) + ), + "The mask tensor does not match the expected output.", ) self.assertTrue( torch.equal( 
result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x - ) + ), + "The lens tensor does not match the expected output.", ) self.assertEqual( - result.additional_fields["idents"], ("sample1", "sample2", "sample3") + result.additional_fields["idents"], + ("sample1", "sample2", "sample3"), + "The identifiers do not match the expected output.", ) def test_call_with_missing_entire_labels(self) -> None: @@ -75,23 +85,35 @@ def test_call_with_missing_entire_labels(self) -> None: expected_mask_for_x = torch.tensor([[True, True], [True, False]]) expected_lens_for_x = torch.tensor([2, 1]) - self.assertTrue(torch.equal(result.x, expected_x)) - self.assertTrue(torch.equal(result.y, expected_y)) + self.assertTrue( + torch.equal(result.x, expected_x), + "The feature tensor 'x' does not match the expected output when labels are missing.", + ) + self.assertTrue( + torch.equal(result.y, expected_y), + "The label tensor 'y' does not match the expected output when labels are missing.", + ) self.assertTrue( torch.equal( result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x - ) + ), + "The mask tensor does not match the expected output when labels are missing.", ) self.assertTrue( torch.equal( result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x - ) + ), + "The lens tensor does not match the expected output when labels are missing.", ) self.assertEqual( - result.additional_fields["loss_kwargs"]["non_null_labels"], [0, 2] + result.additional_fields["loss_kwargs"]["non_null_labels"], + [0, 2], + "The non-null labels list does not match the expected output.", ) self.assertEqual( - result.additional_fields["idents"], ("sample1", "sample2", "sample3") + result.additional_fields["idents"], + ("sample1", "sample2", "sample3"), + "The identifiers do not match the expected output when labels are missing.", ) def test_call_with_none_in_labels(self) -> None: @@ -115,20 +137,30 @@ def test_call_with_none_in_labels(self) -> None: ) expected_lens_for_x 
= torch.tensor([2, 3, 1]) - self.assertTrue(torch.equal(result.x, expected_x)) - self.assertTrue(torch.equal(result.y, expected_y)) + self.assertTrue( + torch.equal(result.x, expected_x), + "The feature tensor 'x' does not match the expected output when labels contain None.", + ) + self.assertTrue( + torch.equal(result.y, expected_y), + "The label tensor 'y' does not match the expected output when labels contain None.", + ) self.assertTrue( torch.equal( result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x - ) + ), + "The mask tensor does not match the expected output when labels contain None.", ) self.assertTrue( torch.equal( result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x - ) + ), + "The lens tensor does not match the expected output when labels contain None.", ) self.assertEqual( - result.additional_fields["idents"], ("sample1", "sample2", "sample3") + result.additional_fields["idents"], + ("sample1", "sample2", "sample3"), + "The identifiers do not match the expected output when labels contain None.", ) def test_call_with_empty_data(self) -> None: @@ -137,7 +169,9 @@ def test_call_with_empty_data(self) -> None: """ data: List[Dict] = [] - with self.assertRaises(Exception): + with self.assertRaises( + Exception, msg="Expected an Error when no data is provided" + ): self.collator(data) def test_process_label_rows(self) -> None: @@ -152,7 +186,10 @@ def test_process_label_rows(self) -> None: [[True, False, False], [False, True, True], [True, False, False]] ) - self.assertTrue(torch.equal(result, expected_output)) + self.assertTrue( + torch.equal(result, expected_output), + "The processed label rows tensor does not match the expected output.", + ) if __name__ == "__main__": diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py index ae69952a..50b9287a 100644 --- a/tests/unit/dataset_classes/testDynamicDataset.py +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ 
-26,8 +26,8 @@ def setUpClass( """ # Mocking properties - mock_base_dir_property.return_value = "MockedBaseDirProperty" - mock_name_property.return_value = "MockedNameProperty" + mock_base_dir_property.return_value = "MockedBaseDirPropertyDynamicDataset" + mock_name_property.return_value = "MockedNamePropertyDynamicDataset" # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) _DynamicDataset.READER = ProteinDataReader diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py index d8aabc67..4c2d21dc 100644 --- a/tests/unit/dataset_classes/testXYBaseDataModule.py +++ b/tests/unit/dataset_classes/testXYBaseDataModule.py @@ -12,13 +12,13 @@ class TestXYBaseDataModule(unittest.TestCase): @classmethod @patch.object(XYBaseDataModule, "_name", new_callable=PropertyMock) - def setUpClass(cls, mock_name_property) -> None: + def setUpClass(cls, mock_name_property: PropertyMock) -> None: """ Set up a base instance of XYBaseDataModule for testing. 
""" # Mock the _name property of XYBaseDataModule - mock_name_property.return_value = "MockedXYBaseDataModule" + mock_name_property.return_value = "MockedNamePropXYBaseDataModule" # Assign a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) XYBaseDataModule.READER = ProteinDataReader @@ -41,13 +41,21 @@ def test_filter_labels_valid_index(self) -> None: filtered_row = self.module._filter_labels(row) expected_labels = [3] # Only the label at index 1 should be kept - self.assertEqual(filtered_row["labels"], expected_labels) + self.assertEqual( + filtered_row["labels"], + expected_labels, + "The filtered labels do not match the expected labels.", + ) row = { "features": ["feature1", "feature2"], "labels": [True, False, True, True], } - self.assertEqual(self.module._filter_labels(row)["labels"], [False]) + self.assertEqual( + self.module._filter_labels(row)["labels"], + [False], + "The filtered labels for the boolean case do not match the expected labels.", + ) def test_filter_labels_no_filter(self) -> None: """ @@ -57,7 +65,9 @@ def test_filter_labels_no_filter(self) -> None: self.module.label_filter = None row = {"features": ["feature1", "feature2"], "labels": [False, True]} # Handle the case where the index is out of bounds - with self.assertRaises(TypeError): + with self.assertRaises( + TypeError, msg="Expected a TypeError when no label filter is provided." 
+ ): self.module._filter_labels(row) def test_filter_labels_invalid_index(self) -> None: @@ -68,7 +78,10 @@ def test_filter_labels_invalid_index(self) -> None: self.module.label_filter = 10 row = {"features": ["feature1", "feature2"], "labels": [False, True]} # Handle the case where the index is out of bounds - with self.assertRaises(IndexError): + with self.assertRaises( + IndexError, + msg="Expected an IndexError when the label filter index is out of bounds.", + ): self.module._filter_labels(row) diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py index 3d7b5e6f..fde8604f 100644 --- a/tests/unit/readers/testChemDataReader.py +++ b/tests/unit/readers/testChemDataReader.py @@ -27,7 +27,14 @@ def setUpClass(cls, mock_file: mock_open) -> None: """ cls.reader = ChemDataReader(token_path="/mock/path") # After initializing, cls.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '('] - assert cls.reader.cache == ["C", "O", "N", "=", "1", "("] + assert cls.reader.cache == [ + "C", + "O", + "N", + "=", + "1", + "(", + ], "Initial cache does not match expected values." 
def test_read_data(self) -> None: """ @@ -48,7 +55,11 @@ def test_read_data(self) -> None: EMBEDDING_OFFSET + len(self.reader.cache) + 1, # [Mg-2] ] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The output of _read_data does not match the expected tokenized values.", + ) def test_read_data_with_new_token(self) -> None: """ @@ -62,12 +73,24 @@ def test_read_data_with_new_token(self) -> None: expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The output for new token '[H-]' does not match the expected values.", + ) # Verify that '[H-]' was added to the cache - self.assertIn("[H-]", self.reader.cache) + self.assertIn( + "[H-]", + self.reader.cache, + "The new token '[H-]' was not added to the cache.", + ) # Ensure it's at the correct index - self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token) + self.assertEqual( + self.reader.cache.index("[H-]"), + index_for_last_token, + "The new token '[H-]' was not added at the correct index in the cache.", + ) if __name__ == "__main__": diff --git a/tests/unit/readers/testDataReader.py b/tests/unit/readers/testDataReader.py index 8a8af053..745c0ace 100644 --- a/tests/unit/readers/testDataReader.py +++ b/tests/unit/readers/testDataReader.py @@ -45,7 +45,11 @@ def test_to_data(self) -> None: "extra_key": "extra_value", } - self.assertEqual(self.reader.to_data(row), expected) + self.assertEqual( + self.reader.to_data(row), + expected, + "The to_data method did not process the input row as expected.", + ) if __name__ == "__main__": diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py index 23ac35d5..31a63dd1 100644 --- a/tests/unit/readers/testDeepChemDataReader.py +++ b/tests/unit/readers/testDeepChemDataReader.py @@ 
-27,7 +27,12 @@ def setUpClass(cls, mock_file: mock_open) -> None: """ cls.reader = DeepChemDataReader(token_path="/mock/path") # After initializing, cls.reader.cache should now be set to ['C', 'O', 'c', ')'] - assert cls.reader.cache == ["C", "O", "c", ")"] + assert cls.reader.cache == [ + "C", + "O", + "c", + ")", + ], "Cache initialization did not match expected tokens." def test_read_data(self) -> None: """ @@ -58,7 +63,11 @@ def test_read_data(self) -> None: EMBEDDING_OFFSET + len(self.reader.cache) + 3, # [Ni-2] (new token) ] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not produce the expected tokenized output for the SMILES string.", + ) def test_read_data_with_new_token(self) -> None: """ @@ -72,12 +81,24 @@ def test_read_data_with_new_token(self) -> None: expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not produce the expected output for a SMILES string with a new token.", + ) # Verify that '[H-]' was added to the cache - self.assertIn("[H-]", self.reader.cache) + self.assertIn( + "[H-]", + self.reader.cache, + "The new token '[H-]' was not added to the cache as expected.", + ) # Ensure it's at the correct index - self.assertEqual(self.reader.cache.index("[H-]"), index_for_last_token) + self.assertEqual( + self.reader.cache.index("[H-]"), + index_for_last_token, + "The new token '[H-]' was not added to the correct index in the cache.", + ) if __name__ == "__main__": diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py index 6e5f325c..c5bc5e9a 100644 --- a/tests/unit/readers/testProteinDataReader.py +++ b/tests/unit/readers/testProteinDataReader.py @@ -25,7 +25,14 @@ def setUpClass(cls, mock_file: 
mock_open) -> None: """ cls.reader = ProteinDataReader(token_path="/mock/path") # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] - assert cls.reader.cache == ["M", "K", "T", "F", "R", "N"] + assert cls.reader.cache == [ + "M", + "K", + "T", + "F", + "R", + "N", + ], "Cache initialization did not match expected tokens." def test_read_data(self) -> None: """ @@ -44,7 +51,11 @@ def test_read_data(self) -> None: EMBEDDING_OFFSET + 5, # N ] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not produce the expected tokenized output.", + ) def test_read_data_with_new_token(self) -> None: """ @@ -63,12 +74,22 @@ def test_read_data_with_new_token(self) -> None: ] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not correctly handle a new token.", + ) # Verify that 'Y' was added to the cache - self.assertIn("Y", self.reader.cache) + self.assertIn( + "Y", self.reader.cache, "The new token 'Y' was not added to the cache." 
+ ) # Ensure it's at the correct index - self.assertEqual(self.reader.cache.index("Y"), len(self.reader.cache) - 1) + self.assertEqual( + self.reader.cache.index("Y"), + len(self.reader.cache) - 1, + "The new token 'Y' was not added at the correct index in the cache.", + ) def test_read_data_with_invalid_token(self) -> None: """ @@ -79,7 +100,11 @@ def test_read_data_with_invalid_token(self) -> None: with self.assertRaises(KeyError) as context: self.reader._read_data(raw_data) - self.assertIn("Invalid token 'Z' encountered", str(context.exception)) + self.assertIn( + "Invalid token 'Z' encountered", + str(context.exception), + "The KeyError did not contain the expected message for an invalid token.", + ) def test_read_data_with_empty_sequence(self) -> None: """ @@ -88,7 +113,11 @@ def test_read_data_with_empty_sequence(self) -> None: raw_data = "" result = self.reader._read_data(raw_data) - self.assertEqual(result, []) + self.assertEqual( + result, + [], + "The _read_data method did not return an empty list for an empty input sequence.", + ) def test_read_data_with_repeated_tokens(self) -> None: """ @@ -99,7 +128,11 @@ def test_read_data_with_repeated_tokens(self) -> None: expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5 # All tokens are 'M' result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not correctly handle repeated tokens.", + ) if __name__ == "__main__": diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py index 019a0f59..411fc63b 100644 --- a/tests/unit/readers/testSelfiesReader.py +++ b/tests/unit/readers/testSelfiesReader.py @@ -26,8 +26,12 @@ def setUpClass(cls, mock_file: mock_open) -> None: mock_file: Mock object for file operations. 
""" cls.reader = SelfiesReader(token_path="/mock/path") - # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[N]', '[=]', '[1]', '[('] - assert cls.reader.cache == ["[C]", "[O]", "[=C]"] + # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[=C]'] + assert cls.reader.cache == [ + "[C]", + "[O]", + "[=C]", + ], "Cache initialization did not match expected tokens." def test_read_data(self) -> None: """ @@ -62,7 +66,11 @@ def test_read_data(self) -> None: ] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not produce the expected tokenized output.", + ) def test_read_data_with_new_token(self) -> None: """ @@ -76,12 +84,24 @@ def test_read_data_with_new_token(self) -> None: expected_output: List[int] = [EMBEDDING_OFFSET + index_for_last_token] result = self.reader._read_data(raw_data) - self.assertEqual(result, expected_output) + self.assertEqual( + result, + expected_output, + "The _read_data method did not correctly handle a new token.", + ) # Verify that '[H-1]' was added to the cache, "[H-]" translated to "[H-1]" in SELFIES - self.assertIn("[H-1]", self.reader.cache) + self.assertIn( + "[H-1]", + self.reader.cache, + "The new token '[H-1]' was not added to the cache.", + ) # Ensure it's at the correct index - self.assertEqual(self.reader.cache.index("[H-1]"), index_for_last_token) + self.assertEqual( + self.reader.cache.index("[H-1]"), + index_for_last_token, + "The new token '[H-1]' was not added at the correct index in the cache.", + ) def test_read_data_with_invalid_selfies(self) -> None: """ @@ -90,10 +110,17 @@ def test_read_data_with_invalid_selfies(self) -> None: raw_data = "[C][O][INVALID][N]" result = self.reader._read_data(raw_data) - self.assertIsNone(result) + self.assertIsNone( + result, + "The _read_data method did not return None for an invalid SELFIES string.", + ) # Verify that the error 
count was incremented - self.assertEqual(self.reader.error_count, 1) + self.assertEqual( + self.reader.error_count, + 1, + "The error count was not incremented for an invalid SELFIES string.", + ) if __name__ == "__main__": From c0aaeeaef84efa06b0a68879ddf3e0874c749138 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 4 Sep 2024 17:34:03 +0200 Subject: [PATCH 024/112] test data class for chebi ontology --- tests/unit/mock_data/ontology_mock_data.py | 146 +++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 tests/unit/mock_data/ontology_mock_data.py diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py new file mode 100644 index 00000000..27fd511f --- /dev/null +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -0,0 +1,146 @@ +class ChebiMockOntology: + """ + Nodes: + - CHEBI:12345 (Compound A) + - CHEBI:54321 (Compound B) + - CHEBI:67890 (Compound C) + - CHEBI:11111 (Compound D) + - CHEBI:22222 (Compound E) + - CHEBI:99999 (Compound F) + - CHEBI:77533 (Compound F, Obsolete node) + - CHEBI:77564 (Compound H, Obsolete node) + - CHEBI:88888 (Compound I) + + Valid Edges: + - CHEBI:54321 -> CHEBI:12345 + - CHEBI:67890 -> CHEBI:12345 + - CHEBI:67890 -> CHEBI:88888 + - CHEBI:11111 -> CHEBI:54321 + - CHEBI:77564 -> CHEBI:54321 (Ignored due to obsolete status) + - CHEBI:22222 -> CHEBI:67890 + - CHEBI:12345 -> CHEBI:99999 + - CHEBI:77533 -> CHEBI:99999 (Ignored due to obsolete status) + """ + + @staticmethod + def get_nodes(): + return {12345, 54321, 67890, 11111, 22222, 99999, 88888} + + @staticmethod + def get_number_of_nodes(): + return len(ChebiMockOntology.get_nodes()) + + @staticmethod + def get_edges_of_transitive_closure_graph(): + return { + (54321, 12345), + (54321, 99999), + (67890, 12345), + (67890, 99999), + (67890, 88888), + (11111, 54321), + (11111, 12345), + (11111, 99999), + (22222, 67890), + (22222, 12345), + (22222, 99999), + (22222, 88888), + (12345, 99999), + } + + @staticmethod + 
def get_number_of_transitive_edges(): + return len(ChebiMockOntology.get_edges_of_transitive_closure_graph()) + + @staticmethod + def get_edges(): + return { + (54321, 12345), + (67890, 12345), + (67890, 88888), + (11111, 54321), + (22222, 67890), + (12345, 99999), + } + + @staticmethod + def get_number_of_edges(): + return len(ChebiMockOntology.get_edges()) + + @staticmethod + def get_obsolete_nodes_ids(): + return {77533, 77564} + + @staticmethod + def get_raw_data(): + # Create mock terms with a complex hierarchy, names, and SMILES strings + return """ + [Term] + id: CHEBI:12345 + name: Compound A + subset: 2_STAR + property_value: http://purl.obolibrary.org/obo/chebi/formula "C26H35ClN4O6S" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/charge "0" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/monoisotopicmass "566.19658" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/mass "567.099" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/inchikey "ROXPMFGZZQEKHB-IUKKYPGJSA-N" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/inchi "InChI=1S/C26H35ClN4O6S/c1-16(2)28-26(34)30(5)14-23-17(3)13-31(18(4)15-32)25(33)21-7-6-8-22(24(21)37-23)29-38(35,36)20-11-9-19(27)10-12-20/h6-12,16-18,23,29,32H,13-15H2,1-5H3,(H,28,34)/t17-,18-,23+/m0/s1" xsd:string + xref: LINCS:LSM-20139 + is_a: CHEBI:54321 + is_a: CHEBI:67890 + + [Term] + id: CHEBI:54321 + name: Compound B + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1O" xsd:string + is_a: CHEBI:11111 + is_a: CHEBI:77564 + + [Term] + id: CHEBI:67890 + name: Compound C + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1N" xsd:string + is_a: CHEBI:22222 + + [Term] + id: CHEBI:11111 + name: Compound D + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1F" xsd:string + + [Term] + id: CHEBI:22222 + 
name: Compound E + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1Cl" xsd:string + + [Term] + id: CHEBI:99999 + name: Compound F + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1Br" xsd:string + is_a: CHEBI:12345 + + [Term] + id: CHEBI:77533 + name: Compound G + is_a: CHEBI:99999 + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=C1Br" xsd:string + is_obsolete: true + + [Term] + id: CHEBI:77564 + name: Compound H + property_value: http://purl.obolibrary.org/obo/chebi/smiles "CC=C1Br" xsd:string + is_obsolete: true + + [Typedef] + id: has_major_microspecies_at_pH_7_3 + name: has major microspecies at pH 7.3 + is_cyclic: true + is_transitive: false + + [Term] + id: CHEBI:88888 + name: Compound I + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1[Mg+]" xsd:string + is_a: CHEBI:67890 + """ From 764216e91e032693b90b9044eccc2fb411fcfad5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 4 Sep 2024 17:38:13 +0200 Subject: [PATCH 025/112] test for term callback + mock data changes --- .../dataset_classes/testChebiTermCallback.py | 67 +++++++++++++ tests/unit/mock_data/__init__.py | 0 tests/unit/mock_data/ontology_mock_data.py | 98 +++++++++++++++---- 3 files changed, 144 insertions(+), 21 deletions(-) create mode 100644 tests/unit/dataset_classes/testChebiTermCallback.py create mode 100644 tests/unit/mock_data/__init__.py diff --git a/tests/unit/dataset_classes/testChebiTermCallback.py b/tests/unit/dataset_classes/testChebiTermCallback.py new file mode 100644 index 00000000..7b22d1a2 --- /dev/null +++ b/tests/unit/dataset_classes/testChebiTermCallback.py @@ -0,0 +1,67 @@ +import unittest +from typing import Any, Dict + +import fastobo +from fastobo.term import TermFrame + +from chebai.preprocessing.datasets.chebi import term_callback +from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology + + +class TestChebiTermCallback(unittest.TestCase): + """ + Unit tests for the 
`term_callback` function used in processing ChEBI ontology terms. + """ + + @classmethod + def setUpClass(cls) -> None: + """ + Set up the test class by loading ChEBI term data and storing it in a dictionary + where keys are the term IDs and values are TermFrame instances. + """ + cls.callback_input_data: Dict[int, TermFrame] = { + int(term_doc.id.local): term_doc + for term_doc in fastobo.loads(ChebiMockOntology.get_raw_data()) + if term_doc and ":" in str(term_doc.id) + } + + def test_process_valid_terms(self) -> None: + """ + Test that `term_callback` correctly processes valid ChEBI terms. + """ + + expected_result: Dict[str, Any] = { + "id": 12345, + "parents": [54321, 67890], + "has_part": set(), + "name": "Compound A", + "smiles": "C1=CC=CC=C1", + } + + actual_dict: Dict[str, Any] = term_callback( + self.callback_input_data.get(expected_result["id"]) + ) + self.assertEqual( + expected_result, + actual_dict, + msg="term_callback should correctly extract information from valid ChEBI terms.", + ) + + def test_skip_obsolete_terms(self) -> None: + """ + Test that `term_callback` correctly skips obsolete ChEBI terms. 
+ """ + + term_callback_output = [ + term_callback(self.callback_input_data.get(ident)) + for ident in ChebiMockOntology.get_obsolete_nodes_ids() + ] + self.assertEqual( + term_callback_output, + [], + msg="The term_callback function should skip obsolete terms and return an empty list.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/mock_data/__init__.py b/tests/unit/mock_data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 27fd511f..11d5c9ce 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -1,5 +1,12 @@ +from typing import Set, Tuple + + class ChebiMockOntology: """ + A mock ontology representing a simplified ChEBI (Chemical Entities of Biological Interest) structure. + This class is used for testing purposes and includes nodes and edges representing chemical compounds + and their relationships in a graph structure. + Nodes: - CHEBI:12345 (Compound A) - CHEBI:54321 (Compound B) @@ -7,7 +14,7 @@ class ChebiMockOntology: - CHEBI:11111 (Compound D) - CHEBI:22222 (Compound E) - CHEBI:99999 (Compound F) - - CHEBI:77533 (Compound F, Obsolete node) + - CHEBI:77533 (Compound G, Obsolete node) - CHEBI:77564 (Compound H, Obsolete node) - CHEBI:88888 (Compound I) @@ -16,64 +23,113 @@ class ChebiMockOntology: - CHEBI:67890 -> CHEBI:12345 - CHEBI:67890 -> CHEBI:88888 - CHEBI:11111 -> CHEBI:54321 - - CHEBI:77564 -> CHEBI:54321 (Ignored due to obsolete status) - CHEBI:22222 -> CHEBI:67890 - CHEBI:12345 -> CHEBI:99999 - - CHEBI:77533 -> CHEBI:99999 (Ignored due to obsolete status) + + The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. """ @staticmethod - def get_nodes(): + def get_nodes() -> Set[int]: + """ + Get the set of valid node IDs in the mock ontology. 
+ + Returns: + - Set[int]: A set of integers representing the valid ChEBI node IDs. + """ return {12345, 54321, 67890, 11111, 22222, 99999, 88888} @staticmethod - def get_number_of_nodes(): + def get_number_of_nodes() -> int: + """ + Get the number of valid nodes in the mock ontology. + + Returns: + - int: The number of valid nodes. + """ return len(ChebiMockOntology.get_nodes()) @staticmethod - def get_edges_of_transitive_closure_graph(): + def get_edges() -> Set[Tuple[int, int]]: + """ + Get the set of valid edges in the mock ontology. + + Returns: + - Set[Tuple[int, int]]: A set of tuples representing the directed edges + between ChEBI nodes. + """ return { (54321, 12345), - (54321, 99999), (67890, 12345), - (67890, 99999), (67890, 88888), (11111, 54321), - (11111, 12345), - (11111, 99999), (22222, 67890), - (22222, 12345), - (22222, 99999), - (22222, 88888), (12345, 99999), } @staticmethod - def get_number_of_transitive_edges(): - return len(ChebiMockOntology.get_edges_of_transitive_closure_graph()) + def get_number_of_edges() -> int: + """ + Get the number of valid edges in the mock ontology. + + Returns: + - int: The number of valid edges. + """ + return len(ChebiMockOntology.get_edges()) @staticmethod - def get_edges(): + def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: + """ + Get the set of edges derived from the transitive closure of the mock ontology graph. + + Returns: + - Set[Tuple[int, int]]: A set of tuples representing the directed edges + in the transitive closure of the ChEBI graph. 
+ """ return { (54321, 12345), + (54321, 99999), (67890, 12345), + (67890, 99999), (67890, 88888), (11111, 54321), + (11111, 12345), + (11111, 99999), (22222, 67890), + (22222, 12345), + (22222, 99999), + (22222, 88888), (12345, 99999), } @staticmethod - def get_number_of_edges(): - return len(ChebiMockOntology.get_edges()) + def get_number_of_transitive_edges() -> int: + """ + Get the number of edges in the transitive closure of the mock ontology graph. + + Returns: + - int: The number of edges in the transitive closure graph. + """ + return len(ChebiMockOntology.get_edges_of_transitive_closure_graph()) @staticmethod - def get_obsolete_nodes_ids(): + def get_obsolete_nodes_ids() -> Set[int]: + """ + Get the set of obsolete node IDs in the mock ontology. + + Returns: + - Set[int]: A set of integers representing the obsolete ChEBI node IDs. + """ return {77533, 77564} @staticmethod - def get_raw_data(): - # Create mock terms with a complex hierarchy, names, and SMILES strings + def get_raw_data() -> str: + """ + Get the raw data representing the mock ontology in OBO format. + + Returns: + - str: A string containing the raw OBO data for the mock ChEBI terms. 
+ """ return """ [Term] id: CHEBI:12345 From 1dd8428bbfc46ebf5aa445cc542851cfd8df4f5a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 5 Sep 2024 20:10:35 +0200 Subject: [PATCH 026/112] test for chebidataextractor + changes in mock data --- .../dataset_classes/testChebiDataExtractor.py | 214 ++++++++++++++++++ tests/unit/mock_data/ontology_mock_data.py | 80 ++++++- 2 files changed, 291 insertions(+), 3 deletions(-) create mode 100644 tests/unit/dataset_classes/testChebiDataExtractor.py diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py new file mode 100644 index 00000000..cb52e68f --- /dev/null +++ b/tests/unit/dataset_classes/testChebiDataExtractor.py @@ -0,0 +1,214 @@ +import unittest +from unittest.mock import PropertyMock, mock_open, patch + +import networkx as nx +import pandas as pd + +from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor +from chebai.preprocessing.reader import ChemDataReader +from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology + + +class TestChEBIDataExtractor(unittest.TestCase): + + @classmethod + @patch.multiple(_ChEBIDataExtractor, __abstractmethods__=frozenset()) + @patch.object(_ChEBIDataExtractor, "base_dir", new_callable=PropertyMock) + @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock) + def setUpClass( + cls, mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock + ) -> None: + """ + Set up a base instance of _DynamicDataset for testing with mocked properties. 
+ """ + + # Mocking properties + mock_base_dir_property.return_value = "MockedBaseDirPropertyChebiDataExtractor" + mock_name_property.return_value = "MockedNamePropertyChebiDataExtractor" + + # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) + _ChEBIDataExtractor.READER = ChemDataReader + + # Creating an instance of the dataset + cls.extractor: _ChEBIDataExtractor = _ChEBIDataExtractor( + chebi_version=231, chebi_version_train=200 + ) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=ChebiMockOntology.get_raw_data(), + ) + def test_extract_class_hierarchy(self, mock_open): + # Mock the output of fastobo.loads + graph = self.extractor._extract_class_hierarchy("fake_path") + + # Validate the graph structure + self.assertIsInstance( + graph, nx.DiGraph, "The result should be a directed graph." + ) + + # Check nodes + actual_nodes = set(graph.nodes) + self.assertEqual( + set(ChebiMockOntology.get_nodes()), + actual_nodes, + "The graph nodes do not match the expected nodes.", + ) + + # Check edges + actual_edges = set(graph.edges) + self.assertEqual( + ChebiMockOntology.get_edges_of_transitive_closure_graph(), + actual_edges, + "The graph edges do not match the expected edges.", + ) + + # Check number of nodes and edges + self.assertEqual( + ChebiMockOntology.get_number_of_nodes(), + len(actual_nodes), + "The number of nodes should match the actual number of nodes in the graph.", + ) + + self.assertEqual( + ChebiMockOntology.get_number_of_transitive_edges(), + len(actual_edges), + "The number of transitive edges should match the actual number of transitive edges in the graph.", + ) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=ChebiMockOntology.get_raw_data(), + ) + @patch.object( + _ChEBIDataExtractor, + "select_classes", + return_value=ChebiMockOntology.get_nodes(), + ) + def test_graph_to_raw_dataset(self, mock_open, mock_select_classes): + graph = 
self.extractor._extract_class_hierarchy("fake_path") + data_df = self.extractor._graph_to_raw_dataset(graph) + + pd.testing.assert_frame_equal( + data_df, + ChebiMockOntology.get_data_in_dataframe(), + obj="The DataFrame should match the expected structure", + ) + + @patch( + "builtins.open", new_callable=mock_open, read_data=b"Mocktestdata" + ) # Mocking open as a binary file + @patch("pandas.read_pickle") + def test_load_dict(self, mock_open, mock_read_pickle): + + # Mock the DataFrame returned by read_pickle + mock_df = pd.DataFrame( + { + "id": [12345, 67890, 11111, 54321], # Corrected ID + "name": ["A", "B", "C", "D"], + "SMILES": ["C1CCCCC1", "O=C=O", "C1CC=CC1", "C[Mg+]"], + 12345: [True, False, False, True], + 67890: [False, True, True, False], + 11111: [True, False, True, False], + } + ) + mock_read_pickle.return_value = mock_df # Mock the return value of read_pickle + + # Call the actual function (with open correctly mocked) + generator = self.extractor._load_dict("data/tests") + result = list(generator) # Collect all output from the generator + + # Expected output for comparison + expected_result = [ + {"features": "C1CCCCC1", "labels": [True, False, True], "ident": 12345}, + {"features": "O=C=O", "labels": [False, True, False], "ident": 67890}, + {"features": "C1CC=CC1", "labels": [False, True, True], "ident": 11111}, + { + "features": "C[Mg+]", + "labels": [True, False, False], + "ident": 54321, + }, # Corrected ID + ] + + # Assert if the result matches the expected output + self.assertEqual( + result, + expected_result, + "The loaded dictionary should match the expected structure.", + ) + + @patch("builtins.open", new_callable=mock_open) + @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock) + @patch.object(_ChEBIDataExtractor, "processed_dir_main", new_callable=PropertyMock) + @patch.object( + _ChEBIDataExtractor, "_chebi_version_train_obj", new_callable=PropertyMock + ) + def test_setup_pruned_test_set( + self, + 
mock_chebi_version_train_obj, + mock_processed_dir_main, + mock_name_property, + mock_open_file, + ): + # Mock the content for the two open calls (original classes and new classes) + mock_orig_classes = "12345\n67890\n88888\n54321\n77777\n" + mock_new_classes = "12345\n67890\n99999\n77777\n" + + # Use side_effect to simulate the two different file reads + mock_open_file.side_effect = [ + mock_open( + read_data=mock_orig_classes + ).return_value, # First open() for orig_classes + mock_open( + read_data=mock_new_classes + ).return_value, # Second open() for new_classes + ] + + # Mock the attributes used in the method + mock_processed_dir_main.return_value = "/mock/path/to/current" + mock_chebi_version_train_obj.return_value.processed_dir_main = ( + "/mock/path/to/train" + ) + + # Mock DataFrame to simulate the test dataset + mock_df = pd.DataFrame( + { + "labels": [ + [ + True, + False, + True, + False, + True, + ], # First test instance labels (match orig_classes) + [False, True, False, True, False], + ] # Second test instance labels + } + ) + + # Call the method under test + pruned_df = self.extractor._setup_pruned_test_set(mock_df) + + # Expected DataFrame labels after pruning (only "12345", "67890", "77777",and "99999" remain) + expected_labels = [[True, False, False, True], [False, True, False, False]] + + # Check if the pruned DataFrame still has the same number of rows + self.assertEqual( + len(pruned_df), + len(mock_df), + "The pruned DataFrame should have the same number of rows.", + ) + + # Check that the labels are correctly pruned + for i in range(len(pruned_df)): + self.assertEqual( + pruned_df.iloc[i]["labels"], + expected_labels[i], + f"Row {i}'s labels should be pruned correctly.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 11d5c9ce..61b4462a 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ 
b/tests/unit/mock_data/ontology_mock_data.py @@ -1,4 +1,7 @@ -from typing import Set, Tuple +from collections import OrderedDict +from typing import List, Set, Tuple + +import pandas as pd class ChebiMockOntology: @@ -30,14 +33,14 @@ class ChebiMockOntology: """ @staticmethod - def get_nodes() -> Set[int]: + def get_nodes() -> List[int]: """ Get the set of valid node IDs in the mock ontology. Returns: - Set[int]: A set of integers representing the valid ChEBI node IDs. """ - return {12345, 54321, 67890, 11111, 22222, 99999, 88888} + return [11111, 12345, 22222, 54321, 67890, 88888, 99999] @staticmethod def get_number_of_nodes() -> int: @@ -200,3 +203,74 @@ def get_raw_data() -> str: property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1[Mg+]" xsd:string is_a: CHEBI:67890 """ + + @staticmethod + def get_data_in_dataframe(): + data = OrderedDict( + id=[ + 12345, + 54321, + 67890, + 11111, + 22222, + 99999, + 88888, + ], + name=[ + "Compound A", + "Compound B", + "Compound C", + "Compound D", + "Compound E", + "Compound F", + "Compound I", + ], + SMILES=[ + "C1=CC=CC=C1", + "C1=CC=CC=C1O", + "C1=CC=CC=C1N", + "C1=CC=CC=C1F", + "C1=CC=CC=C1Cl", + "C1=CC=CC=C1Br", + "C1=CC=CC=C1[Mg+]", + ], + # Relationships { + # 12345: [11111, 54321, 22222, 67890], + # 67890: [22222], + # 99999: [67890, 11111, 54321, 22222, 12345], + # 54321: [11111], + # 88888: [22222, 67890] + # 11111: [] + # 22222: [] + # } + **{ + # -row- [11111, 12345, 22222, 54321, 67890, 88888, 99999] + 11111: [False, False, False, False, False, False, False], + 12345: [True, True, True, True, True, False, False], + 22222: [False, False, False, False, False, False, False], + 54321: [True, False, False, True, False, False, False], + 67890: [False, False, True, False, True, False, False], + 88888: [False, False, True, False, True, True, False], + 99999: [True, True, True, True, True, False, True], + } + ) + + data_df = pd.DataFrame(data) + + # ------------- Code Approach ------- + # 
ancestors_of_nodes = {} + # for parent, child in ChebiMockOntology.get_edges_of_transitive_closure_graph(): + # if child not in ancestors_of_nodes: + # ancestors_of_nodes[child] = set() + # if parent not in ancestors_of_nodes: + # ancestors_of_nodes[parent] = set() + # ancestors_of_nodes[child].add(parent) + # ancestors_of_nodes[child].add(child) + # + # # For each node in the ontology, create a column to check if it's an ancestor of any other node or itself + # for node in ChebiMockOntology.get_nodes(): + # data_df[node] = data_df['id'].apply( + # lambda x: (x == node) or (node in ancestors_of_nodes[x]) + # ) + + return data_df From f3519b566410ef1d20f9020258bceabe57199f74 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 5 Sep 2024 21:13:59 +0200 Subject: [PATCH 027/112] mock reader for all + test_setup_pruned_test_set changes --- .../dataset_classes/testChebiDataExtractor.py | 78 +++++++++++-------- .../dataset_classes/testDynamicDataset.py | 25 +++--- .../dataset_classes/testXYBaseDataModule.py | 8 +- 3 files changed, 62 insertions(+), 49 deletions(-) diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py index cb52e68f..0559e090 100644 --- a/tests/unit/dataset_classes/testChebiDataExtractor.py +++ b/tests/unit/dataset_classes/testChebiDataExtractor.py @@ -1,11 +1,10 @@ import unittest -from unittest.mock import PropertyMock, mock_open, patch +from unittest.mock import MagicMock, PropertyMock, mock_open, patch import networkx as nx import pandas as pd from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor -from chebai.preprocessing.reader import ChemDataReader from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology @@ -16,30 +15,39 @@ class TestChEBIDataExtractor(unittest.TestCase): @patch.object(_ChEBIDataExtractor, "base_dir", new_callable=PropertyMock) @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock) def setUpClass( - cls, 
mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock + cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock ) -> None: """ - Set up a base instance of _DynamicDataset for testing with mocked properties. + Set up a base instance of _ChEBIDataExtractor for testing with mocked properties. """ - # Mocking properties mock_base_dir_property.return_value = "MockedBaseDirPropertyChebiDataExtractor" mock_name_property.return_value = "MockedNamePropertyChebiDataExtractor" - # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) - _ChEBIDataExtractor.READER = ChemDataReader + # Mock Data Reader + ReaderMock = MagicMock() + ReaderMock.name.return_value = "MockedReader" + _ChEBIDataExtractor.READER = ReaderMock - # Creating an instance of the dataset + # Create an instance of the dataset cls.extractor: _ChEBIDataExtractor = _ChEBIDataExtractor( chebi_version=231, chebi_version_train=200 ) + # Mock instance for _chebi_version_train_obj + mock_train_obj = MagicMock() + mock_train_obj.processed_dir_main = "/mock/path/to/train" + cls.extractor._chebi_version_train_obj = mock_train_obj + @patch( "builtins.open", new_callable=mock_open, read_data=ChebiMockOntology.get_raw_data(), ) - def test_extract_class_hierarchy(self, mock_open): + def test_extract_class_hierarchy(self, mock_open: mock_open) -> None: + """ + Test the extraction of class hierarchy and validate the structure of the resulting graph. 
+ """ # Mock the output of fastobo.loads graph = self.extractor._extract_class_hierarchy("fake_path") @@ -87,22 +95,31 @@ def test_extract_class_hierarchy(self, mock_open): "select_classes", return_value=ChebiMockOntology.get_nodes(), ) - def test_graph_to_raw_dataset(self, mock_open, mock_select_classes): + def test_graph_to_raw_dataset( + self, mock_select_classes: PropertyMock, mock_open: mock_open + ) -> None: + """ + Test conversion of a graph to a raw dataset and compare it with the expected DataFrame. + """ graph = self.extractor._extract_class_hierarchy("fake_path") data_df = self.extractor._graph_to_raw_dataset(graph) pd.testing.assert_frame_equal( data_df, ChebiMockOntology.get_data_in_dataframe(), - obj="The DataFrame should match the expected structure", + obj="The DataFrame should match the expected structure.", ) @patch( "builtins.open", new_callable=mock_open, read_data=b"Mocktestdata" ) # Mocking open as a binary file @patch("pandas.read_pickle") - def test_load_dict(self, mock_open, mock_read_pickle): - + def test_load_dict( + self, mock_read_pickle: PropertyMock, mock_open: mock_open + ) -> None: + """ + Test loading data from a pickled file and verify the generator output. 
+ """ # Mock the DataFrame returned by read_pickle mock_df = pd.DataFrame( { @@ -114,22 +131,21 @@ def test_load_dict(self, mock_open, mock_read_pickle): 11111: [True, False, True, False], } ) - mock_read_pickle.return_value = mock_df # Mock the return value of read_pickle + mock_read_pickle.return_value = mock_df - # Call the actual function (with open correctly mocked) generator = self.extractor._load_dict("data/tests") - result = list(generator) # Collect all output from the generator + result = list(generator) + + # Convert NumPy arrays to lists for comparison + for item in result: + item["labels"] = list(item["labels"]) # Expected output for comparison expected_result = [ {"features": "C1CCCCC1", "labels": [True, False, True], "ident": 12345}, {"features": "O=C=O", "labels": [False, True, False], "ident": 67890}, {"features": "C1CC=CC1", "labels": [False, True, True], "ident": 11111}, - { - "features": "C[Mg+]", - "labels": [True, False, False], - "ident": 54321, - }, # Corrected ID + {"features": "C[Mg+]", "labels": [True, False, False], "ident": 54321}, ] # Assert if the result matches the expected output @@ -140,18 +156,15 @@ def test_load_dict(self, mock_open, mock_read_pickle): ) @patch("builtins.open", new_callable=mock_open) - @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock) @patch.object(_ChEBIDataExtractor, "processed_dir_main", new_callable=PropertyMock) - @patch.object( - _ChEBIDataExtractor, "_chebi_version_train_obj", new_callable=PropertyMock - ) def test_setup_pruned_test_set( self, - mock_chebi_version_train_obj, - mock_processed_dir_main, - mock_name_property, - mock_open_file, - ): + mock_processed_dir_main: PropertyMock, + mock_open_file: mock_open, + ) -> None: + """ + Test the pruning of the test set to match classes in the training set. 
+ """ # Mock the content for the two open calls (original classes and new classes) mock_orig_classes = "12345\n67890\n88888\n54321\n77777\n" mock_new_classes = "12345\n67890\n99999\n77777\n" @@ -168,9 +181,6 @@ def test_setup_pruned_test_set( # Mock the attributes used in the method mock_processed_dir_main.return_value = "/mock/path/to/current" - mock_chebi_version_train_obj.return_value.processed_dir_main = ( - "/mock/path/to/train" - ) # Mock DataFrame to simulate the test dataset mock_df = pd.DataFrame( @@ -191,7 +201,7 @@ def test_setup_pruned_test_set( # Call the method under test pruned_df = self.extractor._setup_pruned_test_set(mock_df) - # Expected DataFrame labels after pruning (only "12345", "67890", "77777",and "99999" remain) + # Expected DataFrame labels after pruning (only "12345", "67890", "77777", and "99999" remain) expected_labels = [[True, False, False, True], [False, True, False, False]] # Check if the pruned DataFrame still has the same number of rows diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py index 50b9287a..1ff6c26d 100644 --- a/tests/unit/dataset_classes/testDynamicDataset.py +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ -1,11 +1,10 @@ import unittest from typing import Tuple -from unittest.mock import PropertyMock, patch +from unittest.mock import MagicMock, PropertyMock, patch import pandas as pd from chebai.preprocessing.datasets.base import _DynamicDataset -from chebai.preprocessing.reader import ProteinDataReader class TestDynamicDataset(unittest.TestCase): @@ -29,8 +28,10 @@ def setUpClass( mock_base_dir_property.return_value = "MockedBaseDirPropertyDynamicDataset" mock_name_property.return_value = "MockedNamePropertyDynamicDataset" - # Assigning a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) - _DynamicDataset.READER = ProteinDataReader + # Mock Data Reader + ReaderMock = MagicMock() + 
ReaderMock.name.return_value = "MockedReader" + _DynamicDataset.READER = ReaderMock # Creating an instance of the dataset cls.dataset: _DynamicDataset = _DynamicDataset() @@ -72,7 +73,7 @@ def setUpClass( [True, False], [True, True], ] - cls.df = pd.DataFrame( + cls.data_df = pd.DataFrame( {"ident": [f"id{i + 1}" for i in range(len(X))], "features": X, "labels": y} ) @@ -82,7 +83,7 @@ def test_get_test_split_valid(self) -> None: """ self.dataset.train_split = 0.5 # Test size will be 0.25 * 16 = 4 - train_df, test_df = self.dataset.get_test_split(self.df, seed=0) + train_df, test_df = self.dataset.get_test_split(self.data_df, seed=0) # Assert the correct number of rows in train and test sets self.assertEqual(len(train_df), 12, "Train set should contain 12 samples.") @@ -127,8 +128,8 @@ def test_get_test_split_seed_consistency(self) -> None: """ Test that splitting the dataset with the same seed produces consistent results. """ - train_df1, test_df1 = self.dataset.get_test_split(self.df, seed=42) - train_df2, test_df2 = self.dataset.get_test_split(self.df, seed=42) + train_df1, test_df1 = self.dataset.get_test_split(self.data_df, seed=42) + train_df2, test_df2 = self.dataset.get_test_split(self.data_df, seed=42) pd.testing.assert_frame_equal( train_df1, @@ -145,7 +146,7 @@ def test_get_train_val_splits_given_test(self) -> None: """ self.dataset.use_inner_cross_validation = False self.dataset.train_split = 0.5 - df_train_main, test_df = self.dataset.get_test_split(self.df, seed=0) + df_train_main, test_df = self.dataset.get_test_split(self.data_df, seed=0) train_df, val_df = self.dataset.get_train_val_splits_given_test( df_train_main, test_df, seed=42 ) @@ -192,12 +193,12 @@ def test_get_train_val_splits_given_test_consistency(self) -> None: """ Test that splitting the dataset into train and validation sets with the same seed produces consistent results. 
""" - test_df = self.df.iloc[12:] # Assume rows 12 onward are for testing + test_df = self.data_df.iloc[12:] # Assume rows 12 onward are for testing train_df1, val_df1 = self.dataset.get_train_val_splits_given_test( - self.df, test_df, seed=42 + self.data_df, test_df, seed=42 ) train_df2, val_df2 = self.dataset.get_train_val_splits_given_test( - self.df, test_df, seed=42 + self.data_df, test_df, seed=42 ) pd.testing.assert_frame_equal( diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py index 4c2d21dc..8e3575ab 100644 --- a/tests/unit/dataset_classes/testXYBaseDataModule.py +++ b/tests/unit/dataset_classes/testXYBaseDataModule.py @@ -1,8 +1,7 @@ import unittest -from unittest.mock import PropertyMock, patch +from unittest.mock import MagicMock, PropertyMock, patch from chebai.preprocessing.datasets.base import XYBaseDataModule -from chebai.preprocessing.reader import ProteinDataReader class TestXYBaseDataModule(unittest.TestCase): @@ -21,7 +20,10 @@ def setUpClass(cls, mock_name_property: PropertyMock) -> None: mock_name_property.return_value = "MockedNamePropXYBaseDataModule" # Assign a static variable READER with ProteinDataReader (to get rid of default Abstract DataReader) - XYBaseDataModule.READER = ProteinDataReader + # Mock Data Reader + ReaderMock = MagicMock() + ReaderMock.name.return_value = "MockedReader" + XYBaseDataModule.READER = ReaderMock # Initialize the module with a label_filter cls.module = XYBaseDataModule( From fc0fd47389ea60a7573b4de7645c1a133816245d Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 5 Sep 2024 21:43:10 +0200 Subject: [PATCH 028/112] fix for misalignment between x an y in RaggedCollator - https://github.com/ChEB-AI/python-chebai/pull/48#issuecomment-2324393829 --- tests/unit/collators/testRaggedCollator.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/unit/collators/testRaggedCollator.py 
b/tests/unit/collators/testRaggedCollator.py index d31776a6..d9ab2b1d 100644 --- a/tests/unit/collators/testRaggedCollator.py +++ b/tests/unit/collators/testRaggedCollator.py @@ -78,12 +78,15 @@ def test_call_with_missing_entire_labels(self) -> None: result: XYData = self.collator(data) - expected_x = torch.tensor([[1, 2], [6, 0]]) + # https://github.com/ChEB-AI/python-chebai/pull/48#issuecomment-2324393829 + expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]]) expected_y = torch.tensor( [[True, False], [True, False]] ) # True -> 1, False -> 0 - expected_mask_for_x = torch.tensor([[True, True], [True, False]]) - expected_lens_for_x = torch.tensor([2, 1]) + expected_mask_for_x = torch.tensor( + [[True, True, False], [True, True, True], [True, False, False]] + ) + expected_lens_for_x = torch.tensor([2, 3, 1]) self.assertTrue( torch.equal(result.x, expected_x), @@ -110,6 +113,11 @@ def test_call_with_missing_entire_labels(self) -> None: [0, 2], "The non-null labels list does not match the expected output.", ) + self.assertEqual( + len(result.additional_fields["loss_kwargs"]["non_null_labels"]), + result.y.shape[1], + "The length of non null labels list must match with target label variable size", + ) self.assertEqual( result.additional_fields["idents"], ("sample1", "sample2", "sample3"), From f7f163142c86480c08d31d9b686baba2eabcc81a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Fri, 6 Sep 2024 12:24:58 +0200 Subject: [PATCH 029/112] test for ChebiOverX --- tests/unit/dataset_classes/testChEBIOverX.py | 123 +++++++++++++++++++ tests/unit/mock_data/ontology_mock_data.py | 34 ++++- 2 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 tests/unit/dataset_classes/testChEBIOverX.py diff --git a/tests/unit/dataset_classes/testChEBIOverX.py b/tests/unit/dataset_classes/testChEBIOverX.py new file mode 100644 index 00000000..78d85dd4 --- /dev/null +++ b/tests/unit/dataset_classes/testChEBIOverX.py @@ -0,0 +1,123 @@ +import unittest +from unittest.mock 
import PropertyMock, mock_open, patch + +from chebai.preprocessing.datasets.chebi import ChEBIOverX +from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology + + +class TestChEBIOverX(unittest.TestCase): + @classmethod + @patch.multiple(ChEBIOverX, __abstractmethods__=frozenset()) + @patch.object(ChEBIOverX, "processed_dir_main", new_callable=PropertyMock) + def setUpClass(cls, mock_processed_dir_main: PropertyMock) -> None: + """ + Set up the ChEBIOverX instance with a mock processed directory path and a test graph. + + Args: + mock_processed_dir_main (PropertyMock): Mocked property for the processed directory path. + """ + mock_processed_dir_main.return_value = "/mock/processed_dir" + cls.chebi_extractor = ChEBIOverX(chebi_version=231) + cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() + + @patch("builtins.open", new_callable=mock_open) + def test_select_classes(self, mock_open_file: mock_open) -> None: + """ + Test the select_classes method to ensure it correctly selects nodes based on the threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. 
+ """ + self.chebi_extractor.THRESHOLD = 3 + selected_classes = self.chebi_extractor.select_classes(self.test_graph) + + # Check if the returned selected classes match the expected list + expected_classes = sorted([11111, 22222, 67890]) + self.assertListEqual( + selected_classes, + expected_classes, + "The selected classes do not match the expected output for the given threshold of 3.", + ) + + # Expected data as string + expected_lines = "\n".join(map(str, expected_classes)) + "\n" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + "The written lines do not match the expected lines for the given threshold of 3.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None: + """ + Test the select_classes method when no nodes meet the successor threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. 
+ """ + self.chebi_extractor.THRESHOLD = 5 + selected_classes = self.chebi_extractor.select_classes(self.test_graph) + + # Expected empty result + self.assertEqual( + selected_classes, + [], + "The selected classes list should be empty when no nodes meet the threshold of 5.", + ) + + # Expected data as string + expected_lines = "" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + "The written lines do not match the expected lines when no nodes meet the threshold of 5.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None: + """ + Test the select_classes method when all nodes meet the successor threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. + """ + self.chebi_extractor.THRESHOLD = 0 + selected_classes = self.chebi_extractor.select_classes(self.test_graph) + + expected_classes = sorted(ChebiMockOntology.get_nodes()) + # Check if the returned selected classes match the expected list + self.assertListEqual( + selected_classes, + expected_classes, + "The selected classes do not match the expected output when all nodes meet the threshold of 0.", + ) + + # Expected data as string + expected_lines = "\n".join(map(str, expected_classes)) + "\n" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + "The written lines do not match the expected lines when all nodes meet the threshold of 0.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 
61b4462a..e6c14a93 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -1,6 +1,7 @@ from collections import OrderedDict -from typing import List, Set, Tuple +from typing import Dict, List, Set, Tuple +import networkx as nx import pandas as pd @@ -30,6 +31,18 @@ class ChebiMockOntology: - CHEBI:12345 -> CHEBI:99999 The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. + + Visual Representation Graph with Valid Nodes and Edges: + + 22222 + / + 11111 67890 + \\ / \ + 54321 / 88888 + \\ / + 12345 + \ + 99999 """ @staticmethod @@ -205,7 +218,7 @@ def get_raw_data() -> str: """ @staticmethod - def get_data_in_dataframe(): + def get_data_in_dataframe() -> pd.DataFrame: data = OrderedDict( id=[ 12345, @@ -274,3 +287,20 @@ def get_data_in_dataframe(): # ) return data_df + + @staticmethod + def get_transitively_closed_graph() -> nx.DiGraph: + """ + Create a directed graph, compute its transitive closure, and return it. + + Returns: + g (nx.DiGraph): A transitively closed directed graph. 
+ """ + g = nx.DiGraph() + + for node in ChebiMockOntology.get_nodes(): + g.add_node(node, **{"smiles": "test_smiles_placeholder"}) + + g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph()) + + return g From bf45bb5360eceadf7f8fb7c651a42d8208de20ec Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Fri, 6 Sep 2024 13:52:12 +0200 Subject: [PATCH 030/112] test for ChebiXOverPartial --- .../dataset_classes/testChebiOverXPartial.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/unit/dataset_classes/testChebiOverXPartial.py diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py new file mode 100644 index 00000000..c2515d75 --- /dev/null +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -0,0 +1,108 @@ +import unittest +from unittest.mock import mock_open, patch + +import networkx as nx + +from chebai.preprocessing.datasets.chebi import ChEBIOverXPartial +from tests.unit.mock_data.ontology_mock_data import ChebiMockOntology + + +class TestChEBIOverX(unittest.TestCase): + + @classmethod + @patch.multiple(ChEBIOverXPartial, __abstractmethods__=frozenset()) + def setUpClass(cls) -> None: + """ + Set up the ChEBIOverXPartial instance with a mock processed directory path and a test graph. + """ + cls.chebi_extractor = ChEBIOverXPartial(top_class_id=11111, chebi_version=231) + cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=ChebiMockOntology.get_raw_data(), + ) + def test_extract_class_hierarchy(self, mock_open: mock_open) -> None: + """ + Test the extraction of class hierarchy and validate the structure of the resulting graph. 
+ """ + # Mock the output of fastobo.loads + self.chebi_extractor.top_class_id = 11111 + graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path") + + # Validate the graph structure + self.assertIsInstance( + graph, nx.DiGraph, "The result should be a directed graph." + ) + + # Check nodes + expected_nodes = {11111, 54321, 12345, 99999} + expected_edges = { + (54321, 12345), + (54321, 99999), + (11111, 54321), + (11111, 12345), + (11111, 99999), + (12345, 99999), + } + self.assertEqual( + set(graph.nodes), + expected_nodes, + f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.", + ) + + # Check edges + self.assertEqual( + expected_edges, + set(graph.edges), + "The graph edges do not match the expected edges.", + ) + + # Check number of nodes and edges + self.assertEqual( + len(graph.nodes), + len(expected_nodes), + "The number of nodes should match the actual number of nodes in the graph.", + ) + + self.assertEqual( + len(expected_edges), + len(graph.edges), + "The number of transitive edges should match the actual number of transitive edges in the graph.", + ) + + self.chebi_extractor.top_class_id = 22222 + graph = self.chebi_extractor.extract_class_hierarchy("fake_path") + + # Check nodes with top class as 22222 + self.assertEqual( + set(graph.nodes), + {67890, 88888, 12345, 99999, 22222}, + f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.", + ) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=ChebiMockOntology.get_raw_data(), + ) + def test_extract_class_hierarchy_with_bottom_cls( + self, mock_open: mock_open + ) -> None: + """ + Test the extraction of class hierarchy and validate the structure of the resulting graph. 
+ """ + self.chebi_extractor.top_class_id = 88888 + graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path") + + # Check nodes with top class as 88888 + self.assertEqual( + set(graph.nodes), + {self.chebi_extractor.top_class_id}, + f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.", + ) + + +if __name__ == "__main__": + unittest.main() From 17bf5843df4ade5dde7264ee926cb7123cb97289 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 9 Sep 2024 11:26:58 +0200 Subject: [PATCH 031/112] Mock data for GOUniProt --- tests/unit/mock_data/ontology_mock_data.py | 459 ++++++++++++++++++++- 1 file changed, 457 insertions(+), 2 deletions(-) diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index e6c14a93..dbce56d2 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from collections import OrderedDict from typing import Dict, List, Set, Tuple @@ -5,7 +6,115 @@ import pandas as pd -class ChebiMockOntology: +class MockOntologyGraphData(ABC): + """ + Abstract base class for mocking ontology graph data. + + This class provides a set of static methods that must be implemented by subclasses + to return various elements of an ontology graph such as nodes, edges, and dataframes. + """ + + @staticmethod + @abstractmethod + def get_nodes() -> List[int]: + """ + Get a list of node IDs in the ontology graph. + + Returns: + List[int]: A list of node IDs. + """ + pass + + @staticmethod + @abstractmethod + def get_number_of_nodes() -> int: + """ + Get the number of nodes in the ontology graph. + + Returns: + int: The total number of nodes. + """ + pass + + @staticmethod + @abstractmethod + def get_edges() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the ontology graph. 
+ + Returns: + Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes. + """ + pass + + @staticmethod + @abstractmethod + def get_number_of_edges() -> int: + """ + Get the number of edges in the ontology graph. + + Returns: + int: The total number of edges. + """ + pass + + @staticmethod + @abstractmethod + def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the transitive closure of the ontology graph. + + Returns: + Set[Tuple[int, int]]: A set of tuples representing the transitive closure edges. + """ + pass + + @staticmethod + @abstractmethod + def get_number_of_transitive_edges() -> int: + """ + Get the number of edges in the transitive closure of the ontology graph. + + Returns: + int: The total number of transitive edges. + """ + pass + + @staticmethod + @abstractmethod + def get_obsolete_nodes_ids() -> Set[int]: + """ + Get the set of obsolete node IDs in the ontology graph. + + Returns: + Set[int]: A set of obsolete node IDs. + """ + pass + + @staticmethod + @abstractmethod + def get_transitively_closed_graph() -> nx.DiGraph: + """ + Get the transitive closure of the ontology graph. + + Returns: + nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. + """ + pass + + @staticmethod + @abstractmethod + def get_data_in_dataframe() -> pd.DataFrame: + """ + Get the ontology data as a Pandas DataFrame. + + Returns: + pd.DataFrame: A DataFrame containing ontology data. + """ + pass + + +class ChebiMockOntology(MockOntologyGraphData): """ A mock ontology representing a simplified ChEBI (Chemical Entities of Biological Interest) structure. 
This class is used for testing purposes and includes nodes and edges representing chemical compounds @@ -265,7 +374,7 @@ def get_data_in_dataframe() -> pd.DataFrame: 67890: [False, False, True, False, True, False, False], 88888: [False, False, True, False, True, True, False], 99999: [True, True, True, True, True, False, True], - } + }, ) data_df = pd.DataFrame(data) @@ -304,3 +413,349 @@ def get_transitively_closed_graph() -> nx.DiGraph: g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph()) return g + + +class GOUniProtMockData(MockOntologyGraphData): + """ + A mock ontology representing a simplified version of the Gene Ontology (GO) structure with nodes and edges + representing GO terms and their relationships in a directed acyclic graph (DAG). + + Nodes: + - GO_1 + - GO_2 + - GO_3 + - GO_4 + - GO_5 + - GO_6 + + Edges (Parent-Child Relationships): + - GO_1 -> GO_2 + - GO_1 -> GO_3 + - GO_2 -> GO_4 + - GO_2 -> GO_5 + - GO_3 -> GO_4 + - GO_4 -> GO_6 + + This mock ontology structure is useful for testing methods related to GO hierarchy, graph extraction, and transitive + closure operations. + + The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. + + Visual Representation Graph with Valid Nodes and Edges: + + GO_1 + / \ + GO_2 GO_3 + / \ / + GO_5 GO_4 + \ + GO_6 + + Valid Swiss Proteins with mapping to valid GO ids + Swiss_Prot_1 -> GO_2, GO_3, GO_5 + Swiss_Prot_2 -> GO_2, GO_5 + """ + + @staticmethod + def get_nodes() -> List[int]: + """ + Get a sorted list of node IDs. + + Returns: + List[int]: A sorted list of node IDs in the ontology graph. + """ + return sorted([1, 2, 3, 4, 5, 6]) + + @staticmethod + def get_number_of_nodes() -> int: + """ + Get the total number of nodes in the ontology graph. + + Returns: + int: The number of nodes. + """ + return len(GOUniProtMockData.get_nodes()) + + @staticmethod + def get_edges() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the ontology graph. 
+ + Returns: + Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes. + """ + return {(1, 2), (1, 3), (2, 4), (2, 5), (3, 4), (4, 6)} + + @staticmethod + def get_number_of_edges() -> int: + """ + Get the total number of edges in the ontology graph. + + Returns: + int: The number of edges. + """ + return len(GOUniProtMockData.get_edges()) + + @staticmethod + def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the transitive closure of the ontology graph. + + Returns: + Set[Tuple[int, int]]: A set of tuples representing edges in the transitive closure graph. + """ + return { + (1, 2), + (1, 3), + (1, 4), + (1, 5), + (1, 6), + (2, 4), + (2, 5), + (2, 6), + (3, 4), + (3, 6), + (4, 6), + } + + @staticmethod + def get_number_of_transitive_edges() -> int: + """ + Get the total number of edges in the transitive closure graph. + + Returns: + int: The number of transitive edges. + """ + return len(GOUniProtMockData.get_edges_of_transitive_closure_graph()) + + @staticmethod + def get_obsolete_nodes_ids() -> Set[int]: + """ + Get the set of obsolete node IDs in the ontology graph. + + Returns: + Set[int]: A set of node IDs representing obsolete nodes. + """ + return {7, 8} + + @staticmethod + def get_GO_raw_data() -> str: + """ + Get raw data in string format for GO ontology. + + This data simulates a basic GO ontology in a format typically used for testing. + + Returns: + str: The raw GO data in string format. + """ + return """ + [Term] + id: GO:0000001 + name: GO_1 + namespace: molecular_function + def: "OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function." [GOC:jl, PMID:12150913] + comment: This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function. 
+ synonym: "ribosomal chaperone activity" EXACT [] + xref: MetaCyc:BETAGALACTOSID-RXN + xref: Reactome:R-HSA-189062 "lactose + H2O => D-glucose + D-galactose" + xref: Reactome:R-HSA-5658001 "Defective LCT does not hydrolyze Lac" + xref: RHEA:10076 + + [Term] + id: GO:0000002 + name: GO_2 + namespace: biological_process + is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds + + [Term] + id: GO:0000003 + name: GO_3 + namespace: cellular_component + is_a: GO:0000001 ! regulation of DNA recombination + + [Term] + id: GO:0000004 + name: GO_4 + namespace: biological_process + is_a: GO:0000003 ! regulation of DNA recombination + is_a: GO:0000002 ! hydrolase activity, hydrolyzing O-glycosyl compounds + + [Term] + id: GO:0000005 + name: GO_5 + namespace: molecular_function + is_a: GO:0000002 ! regulation of DNA recombination + + [Term] + id: GO:0000006 + name: GO_6 + namespace: cellular_component + is_a: GO:0000004 ! glucoside transport + + [Term] + id: GO:0000007 + name: GO_7 + namespace: biological_process + is_a: GO:0000003 ! glucoside transport + is_obsolete: true + + [Term] + id: GO:0000008 + name: GO_8 + namespace: molecular_function + is_a: GO:0000001 ! glucoside transport + is_obsolete: true + + [Typedef] + id: term_tracker_item + name: term tracker item + namespace: external + xref: IAO:0000233 + is_metadata_tag: true + is_class_level: true + """ + + @staticmethod + def protein_sequences() -> Dict[str, str]: + """ + Get the protein sequences for Swiss-Prot proteins. + + Returns: + Dict[str, str]: A dictionary where keys are Swiss-Prot IDs and values are their respective sequences. + """ + return { + "Swiss_Prot_1": "MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK".replace( + " ", "" + ), + "Swiss_Prot_2": "EKGLIVGHFS GIKYKGEKAQ ASEVDVNKMC CWVSKFKDAM RRYQGIQTCK".replace( + " ", "" + ), + } + + @staticmethod + def get_UniProt_raw_data() -> str: + """ + Get raw data in string format for UniProt proteins. 
+ + This mock data contains six Swiss-Prot proteins with different properties: + - Swiss_Prot_1 and Swiss_Prot_2 are valid proteins. + - Swiss_Prot_3 has a sequence length greater than 1002. + - Swiss_Prot_4 contains "X", a non-valid amino acid in its sequence. + - Swiss_Prot_5 has no GO IDs mapped to it. + - Swiss_Prot_6 has GO IDs mapped, but no evidence codes. + + Returns: + str: The raw UniProt data in string format. + """ + protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"] + protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"] + raw_str = ( + f"ID Swiss_Prot_1 Reviewed; {len(protein_sq_1)} AA. \n" + + "AC Q6GZX4;\n" + + "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n" + + "DR GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n" + + "DR GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n" + + "DR GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n" + + f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + + f" {protein_sq_1}\n" + + "//\n" + + f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n" + + "AC DCGZX4;\n" + + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + + "DR GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n" + + "DR GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n" + + "DR GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n" + + f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + + f" {protein_sq_2}\n" + + "//\n" + + "ID Swiss_Prot_3 Reviewed; 1165 AA.\n" + + "AC Q6GZX4;\n" + + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + + "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n" + + "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n" + + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" + + "SQ SEQUENCE 1165 AA; 129118 MW; FE2984658CED53A8 CRC64;\n" + + " MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP 
FWNGKVFDLL\n" + + " GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n" + + " DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n" + + " DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n" + + " KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n" + + " EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n" + + " SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n" + + " KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n" + + " KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n" + + " WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n" + + " NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n" + + " NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n" + + " SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n" + + " GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n" + + " YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n" + + " AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n" + + " AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n" + + " NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n" + + " ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n" + + " DVRVSGVMIY RASEGKPMQY VSLLM\n" + + "//\n" + + "ID Swiss_Prot_4 Reviewed; 60 AA.\n" + + "AC Q6GZX4;\n" + + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + + "DR GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n" + + "DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n" + + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" + + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + + " XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + + "//\n" + + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" + + "AC Q6GZX4;\n" + + "DR EMBL; AY548484; 
AAT09660.1; -; Genomic_DNA.\n" + + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + + "//\n" + + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" + + "AC Q6GZX4;\n" + + "DR GO; GO:0000005; P:regulation of viral transcription;\n" + + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + + "//" + ) + + return raw_str + + @staticmethod + def get_data_in_dataframe() -> pd.DataFrame: + """ + Get a mock DataFrame representing UniProt data. + + The DataFrame contains Swiss-Prot protein data, including identifiers, accessions, GO terms, sequences, + and binary label columns representing whether each protein is associated with certain GO classes. + + Returns: + pd.DataFrame: A DataFrame containing mock UniProt data with columns for 'swiss_id', 'accession', 'go_ids', 'sequence', + and binary labels for GO classes. + """ + expected_data = OrderedDict( + swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], + accession=["Q6GZX4", "DCGZX4"], + go_ids=[[2, 3, 5], [2, 5]], + sequence=list(GOUniProtMockData.protein_sequences().values()), + **{ + # SP_1, SP_2 + 1: [False, False], + 2: [True, True], + 3: [True, False], + 4: [False, False], + 5: [True, True], + 6: [False, False], + }, + ) + return pd.DataFrame(expected_data) + + @staticmethod + def get_transitively_closed_graph() -> nx.DiGraph: + """ + Get the transitive closure of the ontology graph. + + Returns: + nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. 
+ """ + pass From c6c5a59990b6933d785898d6001595a94a5396be Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 9 Sep 2024 11:27:26 +0200 Subject: [PATCH 032/112] test for GOUniProtDataExtractor --- .../testGOUniProDataExtractor.py | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 tests/unit/dataset_classes/testGOUniProDataExtractor.py diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py new file mode 100644 index 00000000..7394405d --- /dev/null +++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py @@ -0,0 +1,217 @@ +import unittest +from unittest.mock import MagicMock, PropertyMock, mock_open, patch + +import fastobo +import networkx as nx +import pandas as pd + +from chebai.preprocessing.datasets.go_uniprot import _GOUniProtDataExtractor +from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData + + +class TestGOUniProtDataExtractor(unittest.TestCase): + """ + Unit tests for the _GOUniProtDataExtractor class. + """ + + @classmethod + @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset()) + @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock) + @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock) + def setUpClass( + cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock + ) -> None: + """ + Class setup for mocking abstract properties of _GOUniProtDataExtractor. + """ + mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor" + mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor" + ReaderMock = MagicMock() + ReaderMock.name.return_value = "MockedReader" + _GOUniProtDataExtractor.READER = ReaderMock + + cls.extractor = _GOUniProtDataExtractor() + + def test_term_callback(self) -> None: + """ + Test the term_callback method for correct parsing and filtering of GO terms. 
+ """ + self.extractor.go_branch = "all" + term_mapping = {} + for term in fastobo.loads(GOUniProtMockData.get_GO_raw_data()): + if isinstance(term, fastobo.typedef.TypedefFrame): + continue + term_mapping[self.extractor._parse_go_id(term.id)] = term + + # Test individual term callback + term_dict = self.extractor.term_callback(term_mapping[4]) + expected_dict = {"go_id": 4, "parents": [3, 2], "name": "GO_4"} + self.assertEqual( + term_dict, + expected_dict, + "The term_callback did not return the expected dictionary.", + ) + + # Test filtering valid terms + valid_terms_docs = set() + for term_id, term_doc in term_mapping.items(): + if self.extractor.term_callback(term_doc): + valid_terms_docs.add(term_id) + + self.assertEqual( + valid_terms_docs, + set(GOUniProtMockData.get_nodes()), + "The valid terms do not match expected nodes.", + ) + + # Test that obsolete terms are filtered out + self.assertFalse( + any( + self.extractor.term_callback(term_mapping[obs_id]) + for obs_id in GOUniProtMockData.get_obsolete_nodes_ids() + ), + "Obsolete terms should not be present.", + ) + + # Test filtering by GO branch (e.g., BP) + self.extractor.go_branch = "BP" + BP_terms = { + term_id + for term_id, term in term_mapping.items() + if self.extractor.term_callback(term) + } + self.assertEqual( + BP_terms, {2, 4}, "The BP terms do not match the expected set." + ) + + @patch( + "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) + ) + def test_extract_class_hierarchy(self, mock_load) -> None: + """ + Test the extraction of the class hierarchy from the ontology. + """ + graph = self.extractor._extract_class_hierarchy("fake_path") + + # Validate the graph structure + self.assertIsInstance( + graph, nx.DiGraph, "The result should be a directed graph." 
+ ) + + # Check nodes + actual_nodes = set(graph.nodes) + self.assertEqual( + set(GOUniProtMockData.get_nodes()), + actual_nodes, + "The graph nodes do not match the expected nodes.", + ) + + # Check edges + actual_edges = set(graph.edges) + self.assertEqual( + GOUniProtMockData.get_edges_of_transitive_closure_graph(), + actual_edges, + "The graph edges do not match the expected edges.", + ) + + # Check number of nodes and edges + self.assertEqual( + GOUniProtMockData.get_number_of_nodes(), + len(actual_nodes), + "The number of nodes should match the actual number of nodes in the graph.", + ) + + self.assertEqual( + GOUniProtMockData.get_number_of_transitive_edges(), + len(actual_edges), + "The number of transitive edges should match the actual number of transitive edges in the graph.", + ) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=GOUniProtMockData.get_UniProt_raw_data(), + ) + def test_get_swiss_to_go_mapping(self, mock_open) -> None: + """ + Test the extraction of SwissProt to GO term mapping. + """ + mapping_df = self.extractor._get_swiss_to_go_mapping() + expected_df = GOUniProtMockData.get_data_in_dataframe().iloc[:, :4] + + pd.testing.assert_frame_equal( + mapping_df, + expected_df, + obj="The SwissProt to GO mapping DataFrame does not match the expected DataFrame.", + ) + + @patch( + "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) + ) + @patch( + "builtins.open", + new_callable=mock_open, + read_data=GOUniProtMockData.get_UniProt_raw_data(), + ) + @patch.object( + _GOUniProtDataExtractor, + "select_classes", + return_value=GOUniProtMockData.get_nodes(), + ) + def test_graph_to_raw_dataset( + self, mock_select_classes, mock_open, mock_load + ) -> None: + """ + Test the conversion of the class hierarchy graph to a raw dataset. 
+ """ + graph = self.extractor._extract_class_hierarchy("fake_path") + actual_df = self.extractor._graph_to_raw_dataset(graph) + expected_df = GOUniProtMockData.get_data_in_dataframe() + + pd.testing.assert_frame_equal( + actual_df, + expected_df, + obj="The raw dataset DataFrame does not match the expected DataFrame.", + ) + + @patch("builtins.open", new_callable=mock_open, read_data=b"Mocktestdata") + @patch("pandas.read_pickle") + def test_load_dict( + self, mock_read_pickle: PropertyMock, mock_open: mock_open + ) -> None: + """ + Test the loading of the dictionary from a DataFrame. + """ + mock_df = GOUniProtMockData.get_data_in_dataframe() + mock_read_pickle.return_value = mock_df + + generator = self.extractor._load_dict("data/tests") + result = list(generator) + + # Convert NumPy arrays to lists for comparison + for item in result: + item["labels"] = list(item["labels"]) + + # Expected output for comparison + expected_result = [ + { + "features": mock_df["sequence"][0], + "labels": mock_df.iloc[0, 4:].to_list(), + "ident": mock_df["swiss_id"][0], + }, + { + "features": mock_df["sequence"][1], + "labels": mock_df.iloc[1, 4:].to_list(), + "ident": mock_df["swiss_id"][1], + }, + ] + + self.assertEqual( + result, + expected_result, + "The loaded dictionary does not match the expected structure.", + ) + + +if __name__ == "__main__": + unittest.main() From 427bc60a1e6d6d33a7fbfd7a7707224f3922a894 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 9 Sep 2024 12:29:32 +0200 Subject: [PATCH 033/112] update test to new method name _extract_class_hierarchy --- tests/unit/dataset_classes/testChebiOverXPartial.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index c2515d75..a8c53408 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -29,7 +29,7 @@ def 
test_extract_class_hierarchy(self, mock_open: mock_open) -> None: """ # Mock the output of fastobo.loads self.chebi_extractor.top_class_id = 11111 - graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path") + graph: nx.DiGraph = self.chebi_extractor._extract_class_hierarchy("fake_path") # Validate the graph structure self.assertIsInstance( @@ -73,7 +73,7 @@ def test_extract_class_hierarchy(self, mock_open: mock_open) -> None: ) self.chebi_extractor.top_class_id = 22222 - graph = self.chebi_extractor.extract_class_hierarchy("fake_path") + graph = self.chebi_extractor._extract_class_hierarchy("fake_path") # Check nodes with top class as 22222 self.assertEqual( @@ -94,7 +94,7 @@ def test_extract_class_hierarchy_with_bottom_cls( Test the extraction of class hierarchy and validate the structure of the resulting graph. """ self.chebi_extractor.top_class_id = 88888 - graph: nx.DiGraph = self.chebi_extractor.extract_class_hierarchy("fake_path") + graph: nx.DiGraph = self.chebi_extractor._extract_class_hierarchy("fake_path") # Check nodes with top class as 88888 self.assertEqual( From c01ecde837227eb4c4e99afb95063aa58d7cb9cb Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 9 Sep 2024 13:12:22 +0200 Subject: [PATCH 034/112] test for GOUniProtOverX --- .../dataset_classes/testGoUniProtOverX.py | 139 ++++++++++++++++++ tests/unit/mock_data/ontology_mock_data.py | 5 +- 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 tests/unit/dataset_classes/testGoUniProtOverX.py diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py new file mode 100644 index 00000000..282091b5 --- /dev/null +++ b/tests/unit/dataset_classes/testGoUniProtOverX.py @@ -0,0 +1,139 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +import networkx as nx +import pandas as pd + +from chebai.preprocessing.datasets.go_uniprot import _GOUniProtOverX +from 
tests.unit.mock_data.ontology_mock_data import GOUniProtMockData + + +class TestGOUniProtOverX(unittest.TestCase): + @classmethod + @patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset()) + def setUpClass(cls) -> None: + """ + Set up the class for tests by initializing the extractor, graph, and input DataFrame. + """ + cls.extractor = _GOUniProtOverX() + cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph() + cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[ + :, :4 + ] + + @patch("builtins.open", new_callable=mock_open) + def test_select_classes(self, mock_open_file: mock_open) -> None: + """ + Test the `select_classes` method to ensure it selects classes based on the threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. + """ + # Set threshold for testing + self.extractor.THRESHOLD = 2 + selected_classes: List[int] = self.extractor.select_classes( + self.test_graph, data_df=self.input_df + ) + + # Expected result: GO terms 1, 2, and 5 should be selected based on the threshold + expected_selected_classes: List[int] = sorted([1, 2, 5]) + + # Check if the selected classes are as expected + self.assertEqual( + selected_classes, + expected_selected_classes, + msg="The selected classes do not match the expected output for threshold 2.", + ) + + # Expected data as string + expected_lines: str = "\n".join(map(str, expected_selected_classes)) + "\n" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines: str = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + msg="The written lines do not match the expected lines for the given threshold of 2.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None: + """ + Test the `select_classes` method when no 
nodes meet the successor threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. + """ + self.extractor.THRESHOLD = 5 + selected_classes: List[int] = self.extractor.select_classes( + self.test_graph, data_df=self.input_df + ) + + # Expected result: No classes should meet the threshold of 5 + expected_selected_classes: List[int] = [] + + # Check if the selected classes are as expected + self.assertEqual( + selected_classes, + expected_selected_classes, + msg="The selected classes list should be empty when no nodes meet the threshold of 5.", + ) + + # Expected data as string + expected_lines: str = "" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines: str = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + msg="The written lines do not match the expected lines when no nodes meet the threshold of 5.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None: + """ + Test the `select_classes` method when all nodes meet the successor threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. 
+ """ + self.extractor.THRESHOLD = 0 + selected_classes: List[int] = self.extractor.select_classes( + self.test_graph, data_df=self.input_df + ) + + # Expected result: All nodes except those not referenced by any protein (4 and 6) should be selected + expected_classes: List[int] = sorted([1, 2, 3, 5]) + + # Check if the returned selected classes match the expected list + self.assertListEqual( + selected_classes, + expected_classes, + msg="The selected classes do not match the expected output when all nodes meet the threshold of 0.", + ) + + # Expected data as string + expected_lines: str = "\n".join(map(str, expected_classes)) + "\n" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines: str = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + msg="The written lines do not match the expected lines when all nodes meet the threshold of 0.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index dbce56d2..d516a7a0 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -758,4 +758,7 @@ def get_transitively_closed_graph() -> nx.DiGraph: Returns: nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. 
""" - pass + g = nx.DiGraph() + g.add_nodes_from(node for node in ChebiMockOntology.get_nodes()) + g.add_edges_from(GOUniProtMockData.get_edges_of_transitive_closure_graph()) + return g From dfd084e6c49ef10d1f4c22388fe2c01217c8cde6 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 10 Sep 2024 15:21:24 +0200 Subject: [PATCH 035/112] test for _load_data_from_file for Tox21MolNet --- .../testGOUniProDataExtractor.py | 2 +- tests/unit/dataset_classes/testTox21MolNet.py | 115 ++++++++++ tests/unit/mock_data/tox_mock_data.py | 201 ++++++++++++++++++ 3 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 tests/unit/dataset_classes/testTox21MolNet.py create mode 100644 tests/unit/mock_data/tox_mock_data.py diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py index 7394405d..1b60aa97 100644 --- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py +++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py @@ -27,7 +27,7 @@ def setUpClass( mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor" mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor" ReaderMock = MagicMock() - ReaderMock.name.return_value = "MockedReader" + ReaderMock.name.return_value = "MockedReaderGOUniProtDataExtractor" _GOUniProtDataExtractor.READER = ReaderMock cls.extractor = _GOUniProtDataExtractor() diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py new file mode 100644 index 00000000..3639f5d1 --- /dev/null +++ b/tests/unit/dataset_classes/testTox21MolNet.py @@ -0,0 +1,115 @@ +import os +import unittest +from typing import Dict, List +from unittest.mock import MagicMock, mock_open, patch + +import torch +from sklearn.model_selection import GroupShuffleSplit + +from chebai.preprocessing.datasets.tox21 import Tox21MolNet +from tests.unit.mock_data.tox_mock_data import Tox21MockData + + +class 
TestTox21MolNet(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + """Initialize a Tox21MolNet instance for testing.""" + ReaderMock = MagicMock() + ReaderMock.name.return_value = "MockedReaderTox21MolNet" + Tox21MolNet.READER = ReaderMock + cls.data_module = Tox21MolNet() + # cls.data_module.raw_dir = "/mock/raw_dir" + # cls.data_module.processed_dir = "/mock/processed_dir" + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=Tox21MockData.get_raw_data(), + ) + def test_load_data_from_file(self, mock_open_file: mock_open) -> None: + """ + Test the `_load_data_from_file` method for correct CSV parsing. + + Args: + mock_open_file (mock_open): Mocked open function to simulate file reading. + """ + expected_data = Tox21MockData.get_processed_data() + actual_data = self.data_module._load_data_from_file("fake/file/path.csv") + + self.assertEqual( + list(actual_data), + expected_data, + "The loaded data does not match the expected output.", + ) + + @patch.object( + Tox21MolNet, + "_load_data_from_file", + return_value=Tox21MockData.get_processed_data(), + ) + @patch("torch.save") + def test_setup_processed_simple_split( + self, mock_load_data: MagicMock, mock_torch_save: MagicMock + ) -> None: + """ + Test the `setup_processed` method for basic data splitting and saving. + + Args: + mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data. + mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes. + """ + self.data_module.setup_processed() + + # # Check that torch.save was called for train, test, and validation splits + # self.assertEqual( + # mock_torch_save.call_count, + # 3, + # "torch.save should have been called exactly three times for train, test, and validation splits." 
+ # ) + + # @patch("os.path.isfile", return_value=False) + # @patch.object(Tox21MolNet, + # "_load_data_from_file", + # return_value= Tox21MockData.get_processed_grouped_data()) + # @patch("torch.save") + # @patch("torch.load") + # @patch("chebai.preprocessing.datasets.tox21.GroupShuffleSplit") + # def test_setup_processed_group_split( + # self, + # mock_group_split: MagicMock, + # mock_torch_load: MagicMock, + # mock_save: MagicMock, + # mock_load_data: MagicMock, + # mock_isfile: MagicMock + # ) -> None: + # """ + # Test the `setup_processed` method for group-based data splitting and saving. + # + # Args: + # mock_save (MagicMock): Mocked `torch.save` function to avoid file writes. + # mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data. + # mock_isfile (MagicMock): Mocked `os.path.isfile` function to simulate file presence. + # mock_group_split (MagicMock): Mocked `GroupShuffleSplit` to control data splitting behavior. + # """ + # mock_group_split.return_value = GroupShuffleSplit(n_splits=1, train_size=0.7) + # self.data_module.setup_processed() + # + # # Load the test split + # test_split_path = os.path.join(self.data_module.processed_dir, "test.pt") + # test_split = torch.load(test_split_path) + # + # # Check if torch.save was called with correct arguments + # mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/test.pt") + # mock_save.assert_any_call([mock_data[0]], "/mock/processed_dir/train.pt") + # mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/validation.pt") + # # Check that torch.save was called for train, test, and validation splits + # self.assertEqual( + # mock_torch_save.call_count, + # 3, + # "torch.save should have been called exactly three times for train, test, and validation splits." 
+ # ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py new file mode 100644 index 00000000..912d172c --- /dev/null +++ b/tests/unit/mock_data/tox_mock_data.py @@ -0,0 +1,201 @@ +class Tox21MockData: + """ + A utility class providing mock data for testing the Tox21MolNet dataset. + + This class includes static methods that return mock data in various formats, simulating + the raw and processed data of the Tox21MolNet dataset. The mock data is used for unit tests + to verify the functionality of methods within the Tox21MolNet class without relying on actual + data files. + """ + + @staticmethod + def get_raw_data() -> str: + """ + Returns a raw CSV string that simulates the raw data of the Tox21MolNet dataset. + """ + return ( + "NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53," + + "mol_id,smiles\n" + + "0,0,1,0,1,1,0,1,0,,1,0,TOX958,Nc1ccc([N+](=O)[O-])cc1N\n" + + ",,,,,,,,,1,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S\n" + + "0,0,0,0,0,0,0,,0,0,0,0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C\n" + + "0,0,0,0,0,0,0,0,0,0,0,0,TOX6619,O=S(=O)(Cl)c1ccccc1\n" + + "0,0,0,,0,0,,,0,,1,,TOX27679,CCCCCc1ccco1\n" + + "0,,1,,,,0,,1,1,1,1,TOX2801,Oc1c(Cl)cc(Cl)c2cccnc12\n" + + "0,0,0,0,,0,,,0,0,,1,TOX2808,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21\n" + + "0,,0,1,,,,1,0,,1,,TOX29085,CCCCCCCCCCCCCCn1cc[n+](C)c1\n" + ) + + @staticmethod + def get_processed_data() -> list: + """ + Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset. + Each dictionary contains 'ident', 'features', and 'labels'. 
+ """ + return [ + { + "ident": "TOX958", + "features": "Nc1ccc([N+](=O)[O-])cc1N", + "labels": [ + False, + False, + True, + False, + True, + True, + False, + True, + False, + None, + True, + False, + ], + }, + { + "ident": "TOX31681", + "features": "Nc1cc(C(F)(F)F)ccc1S", + "labels": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + True, + None, + None, + ], + }, + { + "ident": "TOX5110", + "features": "CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C", + "labels": [ + False, + False, + False, + False, + False, + False, + False, + None, + False, + False, + False, + False, + ], + }, + { + "ident": "TOX6619", + "features": "O=S(=O)(Cl)c1ccccc1", + "labels": [ + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + ], + }, + { + "ident": "TOX27679", + "features": "CCCCCc1ccco1", + "labels": [ + False, + False, + False, + None, + False, + False, + None, + None, + False, + None, + True, + None, + ], + }, + { + "ident": "TOX2801", + "features": "Oc1c(Cl)cc(Cl)c2cccnc12", + "labels": [ + False, + None, + True, + None, + None, + None, + False, + None, + True, + True, + True, + True, + ], + }, + { + "ident": "TOX2808", + "features": "CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21", + "labels": [ + False, + False, + False, + False, + None, + False, + None, + None, + False, + False, + None, + True, + ], + }, + { + "ident": "TOX29085", + "features": "CCCCCCCCCCCCCCn1cc[n+](C)c1", + "labels": [ + False, + None, + False, + True, + None, + None, + None, + True, + False, + None, + True, + None, + ], + }, + ] + + @staticmethod + def get_processed_grouped_data(): + """ + Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset. + Each dictionary contains 'ident', 'features', and 'labels'. 
+ """ + processed_data = Tox21MockData.get_processed_data() + groups = ["A", "A", "B", "B", "C", "C", "C", "C"] + + assert len(processed_data) == len( + groups + ), "The number of processed data entries does not match the number of groups." + + # Combine processed data with their corresponding groups + grouped_data = [ + {**data, "group": group, "original": True} + for data, group in zip(processed_data, groups) + ] + + return grouped_data From 77956d473b88f71cc0fa7b262da9b595849fa92e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 16 Sep 2024 13:06:47 +0200 Subject: [PATCH 036/112] _load_data_from_file test case Tox21Challenge --- .../dataset_classes/testTox21Challenge.py | 43 ++++ tests/unit/dataset_classes/testTox21MolNet.py | 10 +- tests/unit/mock_data/ontology_mock_data.py | 132 +++++------ tests/unit/mock_data/tox_mock_data.py | 214 +++++++++++++++++- 4 files changed, 317 insertions(+), 82 deletions(-) create mode 100644 tests/unit/dataset_classes/testTox21Challenge.py diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py new file mode 100644 index 00000000..4b23c487 --- /dev/null +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -0,0 +1,43 @@ +import os +import unittest +from unittest.mock import MagicMock, mock_open, patch + +from rdkit import Chem + +from chebai.preprocessing.datasets.tox21 import Tox21Challenge +from chebai.preprocessing.reader import ChemDataReader +from tests.unit.mock_data.tox_mock_data import Tox21ChallengeMockData + + +class TestTox21Challenge(unittest.TestCase): + + @classmethod + def setUpClass(cls): + """ + Set up the Tox21Challenge instance and mock data for testing. + """ + Tox21Challenge.READER = ChemDataReader + cls.tox21 = Tox21Challenge() + + @patch("rdkit.Chem.SDMolSupplier") + def test_load_data_from_file(self, mock_sdmol_supplier) -> None: + """ + Test the _load_data_from_file method to ensure it correctly loads data from an SDF file. 
+ """ + # Use ForwardSDMolSupplier to read the mock data from the binary string + mock_file = mock_open(read_data=Tox21ChallengeMockData.get_raw_train_data()) + with patch("builtins.open", mock_file): + with open( + r"G:\github-aditya0by0\chebai_data\tox21_challenge\tox21_10k_data_all.sdf\tox21_10k_data_all.sdf", + "rb", + ) as f: + suppl = Chem.ForwardSDMolSupplier(f) + + mock_sdmol_supplier.return_value = suppl + + actual_data = self.tox21._load_data_from_file("fake/path") + self.assertEqual(Tox21ChallengeMockData.data_in_dict_format(), actual_data) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py index 3639f5d1..0a2d67b1 100644 --- a/tests/unit/dataset_classes/testTox21MolNet.py +++ b/tests/unit/dataset_classes/testTox21MolNet.py @@ -7,7 +7,7 @@ from sklearn.model_selection import GroupShuffleSplit from chebai.preprocessing.datasets.tox21 import Tox21MolNet -from tests.unit.mock_data.tox_mock_data import Tox21MockData +from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData class TestTox21MolNet(unittest.TestCase): @@ -25,7 +25,7 @@ def setUpClass(cls) -> None: @patch( "builtins.open", new_callable=mock_open, - read_data=Tox21MockData.get_raw_data(), + read_data=Tox21MolNetMockData.get_raw_data(), ) def test_load_data_from_file(self, mock_open_file: mock_open) -> None: """ @@ -34,7 +34,7 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None: Args: mock_open_file (mock_open): Mocked open function to simulate file reading. 
""" - expected_data = Tox21MockData.get_processed_data() + expected_data = Tox21MolNetMockData.get_processed_data() actual_data = self.data_module._load_data_from_file("fake/file/path.csv") self.assertEqual( @@ -46,7 +46,7 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None: @patch.object( Tox21MolNet, "_load_data_from_file", - return_value=Tox21MockData.get_processed_data(), + return_value=Tox21MolNetMockData.get_processed_data(), ) @patch("torch.save") def test_setup_processed_simple_split( @@ -71,7 +71,7 @@ def test_setup_processed_simple_split( # @patch("os.path.isfile", return_value=False) # @patch.object(Tox21MolNet, # "_load_data_from_file", - # return_value= Tox21MockData.get_processed_grouped_data()) + # return_value= Tox21MolNetMockData.get_processed_grouped_data()) # @patch("torch.save") # @patch("torch.load") # @patch("chebai.preprocessing.datasets.tox21.GroupShuffleSplit") diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index d516a7a0..478a2bbb 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -651,72 +651,72 @@ def get_UniProt_raw_data() -> str: protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"] raw_str = ( f"ID Swiss_Prot_1 Reviewed; {len(protein_sq_1)} AA. 
\n" - + "AC Q6GZX4;\n" - + "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n" - + "DR GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n" - + "DR GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n" - + "DR GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n" - + f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - + f" {protein_sq_1}\n" - + "//\n" - + f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n" - + "AC DCGZX4;\n" - + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - + "DR GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n" - + "DR GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n" - + "DR GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n" - + f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - + f" {protein_sq_2}\n" - + "//\n" - + "ID Swiss_Prot_3 Reviewed; 1165 AA.\n" - + "AC Q6GZX4;\n" - + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - + "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n" - + "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n" - + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" - + "SQ SEQUENCE 1165 AA; 129118 MW; FE2984658CED53A8 CRC64;\n" - + " MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n" - + " GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n" - + " DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n" - + " DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n" - + " KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n" - + " EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n" - + " SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n" - + " KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n" - + " KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n" - + " WTAYTTGNAE 
ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n" - + " NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n" - + " NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n" - + " SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n" - + " GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n" - + " YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n" - + " AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n" - + " AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n" - + " NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n" - + " ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n" - + " DVRVSGVMIY RASEGKPMQY VSLLM\n" - + "//\n" - + "ID Swiss_Prot_4 Reviewed; 60 AA.\n" - + "AC Q6GZX4;\n" - + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - + "DR GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n" - + "DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n" - + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" - + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - + " XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - + "//\n" - + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" - + "AC Q6GZX4;\n" - + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - + "//\n" - + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" - + "AC Q6GZX4;\n" - + "DR GO; GO:0000005; P:regulation of viral transcription;\n" - + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - + "//" + "AC Q6GZX4;\n" + "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n" + "DR GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; 
IPI:InterPro.\n" + "DR GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n" + f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + f" {protein_sq_1}\n" + "//\n" + f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n" + "AC DCGZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "DR GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n" + "DR GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n" + f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + f" {protein_sq_2}\n" + "//\n" + "ID Swiss_Prot_3 Reviewed; 1165 AA.\n" + "AC Q6GZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n" + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" + "SQ SEQUENCE 1165 AA; 129118 MW; FE2984658CED53A8 CRC64;\n" + " MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n" + " GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n" + " DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n" + " DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n" + " KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n" + " EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n" + " SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n" + " KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n" + " KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n" + " WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n" + " NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n" + " NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n" + " SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL 
PCSGQVVDRA\n" + " GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n" + " YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n" + " AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n" + " AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n" + " NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n" + " ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n" + " DVRVSGVMIY RASEGKPMQY VSLLM\n" + "//\n" + "ID Swiss_Prot_4 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "DR GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n" + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000005; P:regulation of viral transcription;\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//" ) return raw_str diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py index 912d172c..96b31a91 100644 --- a/tests/unit/mock_data/tox_mock_data.py +++ b/tests/unit/mock_data/tox_mock_data.py @@ -1,4 +1,4 @@ -class Tox21MockData: +class Tox21MolNetMockData: """ A utility class providing mock data for testing the Tox21MolNet dataset. 
@@ -15,15 +15,15 @@ def get_raw_data() -> str: """ return ( "NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53," - + "mol_id,smiles\n" - + "0,0,1,0,1,1,0,1,0,,1,0,TOX958,Nc1ccc([N+](=O)[O-])cc1N\n" - + ",,,,,,,,,1,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S\n" - + "0,0,0,0,0,0,0,,0,0,0,0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C\n" - + "0,0,0,0,0,0,0,0,0,0,0,0,TOX6619,O=S(=O)(Cl)c1ccccc1\n" - + "0,0,0,,0,0,,,0,,1,,TOX27679,CCCCCc1ccco1\n" - + "0,,1,,,,0,,1,1,1,1,TOX2801,Oc1c(Cl)cc(Cl)c2cccnc12\n" - + "0,0,0,0,,0,,,0,0,,1,TOX2808,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21\n" - + "0,,0,1,,,,1,0,,1,,TOX29085,CCCCCCCCCCCCCCn1cc[n+](C)c1\n" + "mol_id,smiles\n" + "0,0,1,0,1,1,0,1,0,,1,0,TOX958,Nc1ccc([N+](=O)[O-])cc1N\n" + ",,,,,,,,,1,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S\n" + "0,0,0,0,0,0,0,,0,0,0,0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C\n" + "0,0,0,0,0,0,0,0,0,0,0,0,TOX6619,O=S(=O)(Cl)c1ccccc1\n" + "0,0,0,,0,0,,,0,,1,,TOX27679,CCCCCc1ccco1\n" + "0,,1,,,,0,,1,1,1,1,TOX2801,Oc1c(Cl)cc(Cl)c2cccnc12\n" + "0,0,0,0,,0,,,0,0,,1,TOX2808,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21\n" + "0,,0,1,,,,1,0,,1,,TOX29085,CCCCCCCCCCCCCCn1cc[n+](C)c1\n" ) @staticmethod @@ -185,7 +185,7 @@ def get_processed_grouped_data(): Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset. Each dictionary contains 'ident', 'features', and 'labels'. 
""" - processed_data = Tox21MockData.get_processed_data() + processed_data = Tox21MolNetMockData.get_processed_data() groups = ["A", "A", "B", "B", "C", "C", "C", "C"] assert len(processed_data) == len( @@ -199,3 +199,195 @@ def get_processed_grouped_data(): ] return grouped_data + + +class Tox21ChallengeMockData: + + MOL_BINARY_STR = ( + b"cyclobutane\n" + b" RDKit 2D\n\n" + b" 4 4 0 0 0 0 0 0 0 0999 V2000\n" + b" 1.0607 -0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n" + b" -0.0000 -1.0607 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n" + b" -1.0607 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n" + b" 0.0000 1.0607 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n" + b" 1 2 1 0\n" + b" 2 3 1 0\n" + b" 3 4 1 0\n" + b" 4 1 1 0\n" + b"M END\n\n" + ) + + SMILES_OF_MOL = "C1CCC1" + # Feature encoding of SMILES as per chebai/preprocessing/bin/smiles_token/tokens.txt + FEATURE_OF_SMILES = [19, 42, 19, 19, 19, 42] + + @staticmethod + def get_raw_train_data(): + raw_str = ( + Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" + b"25848\n\n" + b"> \n" + b"0\n\n" + b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" + b"2384\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"0\n\n" + b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" + b"27102\n\n" + b"> \n" + b"0\n\n" + b"> \n" + b"0\n\n" + b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" + b"26792\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" + b"26401\n\n" + b"> \n" + b"1\n\n" + b"> \n" + b"1\n\n" + b"$$$$\n" + Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" + b"25973\n\n" + b"$$$$\n" + ) + return raw_str + + @staticmethod + def data_in_dict_format(): + data_list = [ + { + "labels": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + 0, + None, + None, 
+ ], + "ident": "25848", + }, + { + "labels": [ + 0, + None, + None, + 1, + None, + None, + None, + None, + None, + None, + None, + None, + ], + "ident": "2384", + }, + { + "labels": [ + 0, + None, + 0, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ], + "ident": "27102", + }, + { + "labels": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ], + "ident": "26792", + }, + { + "labels": [ + None, + None, + None, + None, + None, + None, + None, + 1, + None, + 1, + None, + None, + ], + "ident": "26401", + }, + { + "labels": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ], + "ident": "25973", + }, + ] + + for dict_ in data_list: + dict_["features"] = Tox21ChallengeMockData.FEATURE_OF_SMILES + dict_["group"] = None + + return data_list From a3670b0ca2a73ebb417bb4d45dea8e87d61937ac Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 17 Sep 2024 12:23:24 +0200 Subject: [PATCH 037/112] test for Tox21Chal --- .../dataset_classes/testTox21Challenge.py | 95 +++++++++++++- tests/unit/mock_data/tox_mock_data.py | 122 +++++++++++++++++- 2 files changed, 206 insertions(+), 11 deletions(-) diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py index 4b23c487..9986c82f 100644 --- a/tests/unit/dataset_classes/testTox21Challenge.py +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -1,28 +1,37 @@ -import os import unittest -from unittest.mock import MagicMock, mock_open, patch +from unittest.mock import mock_open, patch from rdkit import Chem from chebai.preprocessing.datasets.tox21 import Tox21Challenge from chebai.preprocessing.reader import ChemDataReader -from tests.unit.mock_data.tox_mock_data import Tox21ChallengeMockData +from tests.unit.mock_data.tox_mock_data import ( + Tox21ChallengeMockData, + Tox21MolNetMockData, +) class TestTox21Challenge(unittest.TestCase): + """ + Unit tests for the Tox21Challenge class. 
+ """ @classmethod - def setUpClass(cls): + def setUpClass(cls) -> None: """ Set up the Tox21Challenge instance and mock data for testing. + This is run once for the test class. """ Tox21Challenge.READER = ChemDataReader cls.tox21 = Tox21Challenge() @patch("rdkit.Chem.SDMolSupplier") - def test_load_data_from_file(self, mock_sdmol_supplier) -> None: + def test_load_data_from_file(self, mock_sdmol_supplier: patch) -> None: """ - Test the _load_data_from_file method to ensure it correctly loads data from an SDF file. + Test the `_load_data_from_file` method to ensure it correctly loads data from an SDF file. + + Args: + mock_sdmol_supplier (patch): A mock of the RDKit SDMolSupplier. """ # Use ForwardSDMolSupplier to read the mock data from the binary string mock_file = mock_open(read_data=Tox21ChallengeMockData.get_raw_train_data()) @@ -36,7 +45,79 @@ def test_load_data_from_file(self, mock_sdmol_supplier) -> None: mock_sdmol_supplier.return_value = suppl actual_data = self.tox21._load_data_from_file("fake/path") - self.assertEqual(Tox21ChallengeMockData.data_in_dict_format(), actual_data) + expected_data = Tox21ChallengeMockData.data_in_dict_format() + + self.assertEqual( + actual_data, + expected_data, + "The loaded data from file does not match the expected data.", + ) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=Tox21MolNetMockData.get_raw_data(), + ) + def test_load_dict(self, mock_open_file: mock_open) -> None: + """ + Test the `_load_dict` method to ensure correct CSV parsing. + + Args: + mock_open_file (mock_open): Mocked open function to simulate file reading. 
+ """ + expected_data = Tox21MolNetMockData.get_processed_data() + actual_data = self.tox21._load_dict("fake/file/path.csv") + + self.assertEqual( + list(actual_data), + expected_data, + "The loaded data from CSV does not match the expected processed data.", + ) + + @patch.object(Tox21Challenge, "_load_data_from_file", return_value="test") + @patch("builtins.open", new_callable=mock_open) + @patch("torch.save") + @patch("os.path.join") + def test_setup_processed( + self, + mock_join: patch, + mock_torch_save: patch, + mock_open_file: mock_open, + mock_load_file: patch, + ) -> None: + """ + Test the `setup_processed` method to ensure it processes and saves data correctly. + + Args: + mock_join (patch): Mock of os.path.join to simulate file path joining. + mock_torch_save (patch): Mock of torch.save to simulate saving processed data. + mock_open_file (mock_open): Mocked open function to simulate file reading. + mock_load_file (patch): Mocked data loading method. + """ + # Simulated raw and processed directories + path_str = "fake/test/path" + mock_join.return_value = path_str + + # Mock the file content for test.smiles and score.txt + mock_open_file.side_effect = [ + mock_open( + read_data=Tox21ChallengeMockData.get_raw_smiles_data() + ).return_value, + mock_open( + read_data=Tox21ChallengeMockData.get_raw_score_txt_data() + ).return_value, + ] + + # Call setup_processed to simulate the data processing workflow + self.tox21.setup_processed() + + # Assert that torch.save was called with the correct processed data + expected_test_data = Tox21ChallengeMockData.get_setup_processed_output_data() + mock_torch_save.assert_called_with(expected_test_data, path_str) + + self.assertTrue( + mock_torch_save.called, "The processed data was not saved as expected." 
+ ) if __name__ == "__main__": diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py index 96b31a91..32745c38 100644 --- a/tests/unit/mock_data/tox_mock_data.py +++ b/tests/unit/mock_data/tox_mock_data.py @@ -1,3 +1,6 @@ +from typing import Dict, List + + class Tox21MolNetMockData: """ A utility class providing mock data for testing the Tox21MolNet dataset. @@ -27,7 +30,7 @@ def get_raw_data() -> str: ) @staticmethod - def get_processed_data() -> list: + def get_processed_data() -> List[Dict]: """ Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset. Each dictionary contains 'ident', 'features', and 'labels'. @@ -180,7 +183,7 @@ def get_processed_data() -> list: ] @staticmethod - def get_processed_grouped_data(): + def get_processed_grouped_data() -> List[Dict]: """ Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset. Each dictionary contains 'ident', 'features', and 'labels'. @@ -223,7 +226,7 @@ class Tox21ChallengeMockData: FEATURE_OF_SMILES = [19, 42, 19, 19, 19, 42] @staticmethod - def get_raw_train_data(): + def get_raw_train_data() -> bytes: raw_str = ( Tox21ChallengeMockData.MOL_BINARY_STR + b"> \n" b"25848\n\n" @@ -280,7 +283,7 @@ def get_raw_train_data(): return raw_str @staticmethod - def data_in_dict_format(): + def data_in_dict_format() -> List[Dict]: data_list = [ { "labels": [ @@ -391,3 +394,114 @@ def data_in_dict_format(): dict_["group"] = None return data_list + + @staticmethod + def get_raw_smiles_data() -> str: + """ + Returns mock SMILES data in a tab-delimited format (mocks test.smiles file). + + The data represents molecules and their associated sample IDs. + + Returns: + str: A string containing SMILES representations and corresponding sample IDs. 
+ """ + return ( + "#SMILES\tSample ID\n" + f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00260869-01\n" + f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261776-01\n" + f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261380-01\n" + f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261842-01\n" + f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261662-01\n" + f"{Tox21ChallengeMockData.SMILES_OF_MOL}\tNCGC00261190-01\n" + ) + + @staticmethod + def get_raw_score_txt_data() -> str: + """ + Returns mock score data in a tab-delimited format (mocks test_results.txt file). + + The data represents toxicity test results for different molecular samples, including several toxicity endpoints. + + Returns: + str: A string containing toxicity scores for each molecular sample and corresponding toxicity endpoints. + """ + return ( + "Sample ID\tNR-AhR\tNR-AR\tNR-AR-LBD\tNR-Aromatase\tNR-ER\tNR-ER-LBD\tNR-PPAR-gamma\t" + "SR-ARE\tSR-ATAD5\tSR-HSE\tSR-MMP\tSR-p53\n" + "NCGC00260869-01\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\n" + "NCGC00261776-01\t1\t1\t1\t1\t1\t1\t1\t1\t1\t1\t1\t1\n" + "NCGC00261380-01\tx\tx\tx\tx\tx\tx\tx\tx\tx\tx\tx\tx\n" + "NCGC00261842-01\t0\t0\t0\tx\t0\t0\t0\t0\t0\t0\tx\t1\n" + "NCGC00261662-01\t1\t0\t0\tx\t1\t1\t1\tx\t1\t1\tx\t1\n" + "NCGC00261190-01\tx\t0\t0\tx\t1\t0\t0\t1\t0\t0\t1\t1\n" + ) + + @staticmethod + def get_setup_processed_output_data() -> List[Dict]: + """ + Returns mock processed data used for testing the `setup_processed` method. + + The data contains molecule identifiers and their corresponding toxicity labels for multiple endpoints. + Each dictionary in the list represents a molecule with its associated labels, features, and group information. + + Returns: + List[Dict]: A list of dictionaries where each dictionary contains: + - "features": The SMILES features of the molecule. + - "labels": A list of toxicity endpoint labels (0, 1, or None). + - "ident": The sample identifier. + - "group": None (default value for the group key). 
+ """ + + # "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", + # "SR-HSE", "SR-MMP", "SR-p53", + data_list = [ + { + "labels": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "ident": "NCGC00260869-01", + }, + { + "labels": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "ident": "NCGC00261776-01", + }, + { + "labels": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ], + "ident": "NCGC00261380-01", + }, + { + "labels": [0, 0, 0, None, 0, 0, 0, 0, 0, 0, None, 1], + "ident": "NCGC00261842-01", + }, + { + "labels": [0, 0, 1, None, 1, 1, 1, None, 1, 1, None, 1], + "ident": "NCGC00261662-01", + }, + { + "labels": [0, 0, None, None, 1, 0, 0, 1, 0, 0, 1, 1], + "ident": "NCGC00261190-01", + }, + ] + + complete_list = [] + for dict_ in data_list: + complete_list.append( + { + "features": Tox21ChallengeMockData.FEATURE_OF_SMILES, + **dict_, + "group": None, + } + ) + + return complete_list From ac3ac19deed760fb422a60f8f8b2e84bc45540cb Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 17 Sep 2024 13:12:35 +0200 Subject: [PATCH 038/112] patch `os.makedirs` in tests to avoid creating directories --- tests/unit/dataset_classes/testChEBIOverX.py | 4 +- .../dataset_classes/testChebiDataExtractor.py | 6 +- .../dataset_classes/testChebiOverXPartial.py | 3 +- .../dataset_classes/testDynamicDataset.py | 6 +- .../testGOUniProDataExtractor.py | 6 +- .../dataset_classes/testGoUniProtOverX.py | 3 +- .../dataset_classes/testTox21Challenge.py | 3 +- tests/unit/dataset_classes/testTox21MolNet.py | 55 +------------------ .../dataset_classes/testXYBaseDataModule.py | 3 +- 9 files changed, 29 insertions(+), 60 deletions(-) diff --git a/tests/unit/dataset_classes/testChEBIOverX.py b/tests/unit/dataset_classes/testChEBIOverX.py index 78d85dd4..270b868c 100644 --- a/tests/unit/dataset_classes/testChEBIOverX.py +++ b/tests/unit/dataset_classes/testChEBIOverX.py @@ -9,11 +9,13 @@ class 
TestChEBIOverX(unittest.TestCase): @classmethod @patch.multiple(ChEBIOverX, __abstractmethods__=frozenset()) @patch.object(ChEBIOverX, "processed_dir_main", new_callable=PropertyMock) - def setUpClass(cls, mock_processed_dir_main: PropertyMock) -> None: + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs, mock_processed_dir_main: PropertyMock) -> None: """ Set up the ChEBIOverX instance with a mock processed directory path and a test graph. Args: + mock_makedirs: This patches os.makedirs to do nothing mock_processed_dir_main (PropertyMock): Mocked property for the processed directory path. """ mock_processed_dir_main.return_value = "/mock/processed_dir" diff --git a/tests/unit/dataset_classes/testChebiDataExtractor.py b/tests/unit/dataset_classes/testChebiDataExtractor.py index 0559e090..8da900da 100644 --- a/tests/unit/dataset_classes/testChebiDataExtractor.py +++ b/tests/unit/dataset_classes/testChebiDataExtractor.py @@ -14,8 +14,12 @@ class TestChEBIDataExtractor(unittest.TestCase): @patch.multiple(_ChEBIDataExtractor, __abstractmethods__=frozenset()) @patch.object(_ChEBIDataExtractor, "base_dir", new_callable=PropertyMock) @patch.object(_ChEBIDataExtractor, "_name", new_callable=PropertyMock) + @patch("os.makedirs", return_value=None) def setUpClass( - cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock + cls, + mock_makedirs, + mock_name_property: PropertyMock, + mock_base_dir_property: PropertyMock, ) -> None: """ Set up a base instance of _ChEBIDataExtractor for testing with mocked properties. 
diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index a8c53408..7720d301 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -11,7 +11,8 @@ class TestChEBIOverX(unittest.TestCase): @classmethod @patch.multiple(ChEBIOverXPartial, __abstractmethods__=frozenset()) - def setUpClass(cls) -> None: + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs) -> None: """ Set up the ChEBIOverXPartial instance with a mock processed directory path and a test graph. """ diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py index 1ff6c26d..e42c3e7e 100644 --- a/tests/unit/dataset_classes/testDynamicDataset.py +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ -17,8 +17,12 @@ class TestDynamicDataset(unittest.TestCase): @patch.multiple(_DynamicDataset, __abstractmethods__=frozenset()) @patch.object(_DynamicDataset, "base_dir", new_callable=PropertyMock) @patch.object(_DynamicDataset, "_name", new_callable=PropertyMock) + @patch("os.makedirs", return_value=None) def setUpClass( - cls, mock_base_dir_property: PropertyMock, mock_name_property: PropertyMock + cls, + mock_makedirs, + mock_base_dir_property: PropertyMock, + mock_name_property: PropertyMock, ) -> None: """ Set up a base instance of _DynamicDataset for testing with mocked properties. 
diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py index 1b60aa97..976334f0 100644 --- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py +++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py @@ -18,8 +18,12 @@ class TestGOUniProtDataExtractor(unittest.TestCase): @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset()) @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock) @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock) + @patch("os.makedirs", return_value=None) def setUpClass( - cls, mock_name_property: PropertyMock, mock_base_dir_property: PropertyMock + cls, + mock_makedirs, + mock_name_property: PropertyMock, + mock_base_dir_property: PropertyMock, ) -> None: """ Class setup for mocking abstract properties of _GOUniProtDataExtractor. diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py index 282091b5..d4157770 100644 --- a/tests/unit/dataset_classes/testGoUniProtOverX.py +++ b/tests/unit/dataset_classes/testGoUniProtOverX.py @@ -12,7 +12,8 @@ class TestGOUniProtOverX(unittest.TestCase): @classmethod @patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset()) - def setUpClass(cls) -> None: + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs) -> None: """ Set up the class for tests by initializing the extractor, graph, and input DataFrame. 
""" diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py index 9986c82f..b94c8ca4 100644 --- a/tests/unit/dataset_classes/testTox21Challenge.py +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -17,7 +17,8 @@ class TestTox21Challenge(unittest.TestCase): """ @classmethod - def setUpClass(cls) -> None: + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs) -> None: """ Set up the Tox21Challenge instance and mock data for testing. This is run once for the test class. diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py index 0a2d67b1..c995e701 100644 --- a/tests/unit/dataset_classes/testTox21MolNet.py +++ b/tests/unit/dataset_classes/testTox21MolNet.py @@ -13,14 +13,13 @@ class TestTox21MolNet(unittest.TestCase): @classmethod - def setUpClass(cls) -> None: + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs) -> None: """Initialize a Tox21MolNet instance for testing.""" ReaderMock = MagicMock() ReaderMock.name.return_value = "MockedReaderTox21MolNet" Tox21MolNet.READER = ReaderMock cls.data_module = Tox21MolNet() - # cls.data_module.raw_dir = "/mock/raw_dir" - # cls.data_module.processed_dir = "/mock/processed_dir" @patch( "builtins.open", @@ -59,57 +58,9 @@ def test_setup_processed_simple_split( mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data. mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes. """ + # Facing technical error here self.data_module.setup_processed() - # # Check that torch.save was called for train, test, and validation splits - # self.assertEqual( - # mock_torch_save.call_count, - # 3, - # "torch.save should have been called exactly three times for train, test, and validation splits." 
- # ) - - # @patch("os.path.isfile", return_value=False) - # @patch.object(Tox21MolNet, - # "_load_data_from_file", - # return_value= Tox21MolNetMockData.get_processed_grouped_data()) - # @patch("torch.save") - # @patch("torch.load") - # @patch("chebai.preprocessing.datasets.tox21.GroupShuffleSplit") - # def test_setup_processed_group_split( - # self, - # mock_group_split: MagicMock, - # mock_torch_load: MagicMock, - # mock_save: MagicMock, - # mock_load_data: MagicMock, - # mock_isfile: MagicMock - # ) -> None: - # """ - # Test the `setup_processed` method for group-based data splitting and saving. - # - # Args: - # mock_save (MagicMock): Mocked `torch.save` function to avoid file writes. - # mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data. - # mock_isfile (MagicMock): Mocked `os.path.isfile` function to simulate file presence. - # mock_group_split (MagicMock): Mocked `GroupShuffleSplit` to control data splitting behavior. - # """ - # mock_group_split.return_value = GroupShuffleSplit(n_splits=1, train_size=0.7) - # self.data_module.setup_processed() - # - # # Load the test split - # test_split_path = os.path.join(self.data_module.processed_dir, "test.pt") - # test_split = torch.load(test_split_path) - # - # # Check if torch.save was called with correct arguments - # mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/test.pt") - # mock_save.assert_any_call([mock_data[0]], "/mock/processed_dir/train.pt") - # mock_save.assert_any_call([mock_data[1]], "/mock/processed_dir/validation.pt") - # # Check that torch.save was called for train, test, and validation splits - # self.assertEqual( - # mock_torch_save.call_count, - # 3, - # "torch.save should have been called exactly three times for train, test, and validation splits." 
- # ) - if __name__ == "__main__": unittest.main() diff --git a/tests/unit/dataset_classes/testXYBaseDataModule.py b/tests/unit/dataset_classes/testXYBaseDataModule.py index 8e3575ab..64dfbe40 100644 --- a/tests/unit/dataset_classes/testXYBaseDataModule.py +++ b/tests/unit/dataset_classes/testXYBaseDataModule.py @@ -11,7 +11,8 @@ class TestXYBaseDataModule(unittest.TestCase): @classmethod @patch.object(XYBaseDataModule, "_name", new_callable=PropertyMock) - def setUpClass(cls, mock_name_property: PropertyMock) -> None: + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs, mock_name_property: PropertyMock) -> None: """ Set up a base instance of XYBaseDataModule for testing. """ From 44a1dfda8f92627b3bab97f62ab9101452a2754e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 22 Sep 2024 12:39:42 +0200 Subject: [PATCH 039/112] add test case for invalid token/input to read_data --- tests/unit/readers/testChemDataReader.py | 10 ++++++++++ tests/unit/readers/testDeepChemDataReader.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py index fde8604f..0c1c4d6f 100644 --- a/tests/unit/readers/testChemDataReader.py +++ b/tests/unit/readers/testChemDataReader.py @@ -92,6 +92,16 @@ def test_read_data_with_new_token(self) -> None: "The new token '[H-]' was not added at the correct index in the cache.", ) + def test_read_data_with_invalid_input(self) -> None: + """ + Test the _read_data method with an invalid input. + The invalid token should raise an error or be handled appropriately. 
+ """ + raw_data = "%INVALID%" + + with self.assertRaises(ValueError): + self.reader._read_data(raw_data) + if __name__ == "__main__": unittest.main() diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py index 31a63dd1..dc29c9a6 100644 --- a/tests/unit/readers/testDeepChemDataReader.py +++ b/tests/unit/readers/testDeepChemDataReader.py @@ -100,6 +100,16 @@ def test_read_data_with_new_token(self) -> None: "The new token '[H-]' was not added to the correct index in the cache.", ) + def test_read_data_with_invalid_input(self) -> None: + """ + Test the _read_data method with an invalid input string. + The invalid token should raise an error or be handled appropriately. + """ + raw_data = "CBr))(OCI" + + with self.assertRaises(Exception): + self.reader._read_data(raw_data) + if __name__ == "__main__": unittest.main() From 03bf4cd4a1c3f2de4e93cedfb2d1b7096ac4454c Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 24 Sep 2024 12:58:34 +0200 Subject: [PATCH 040/112] refactor _extract_class_hierarchy --- chebai/preprocessing/datasets/chebi.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 1c0cb2f9..1f305d4b 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -758,27 +758,18 @@ def _extract_class_hierarchy(self, chebi_path: str) -> nx.DiGraph: """ Extracts a subset of ChEBI based on subclasses of the top class ID. + This method calls the superclass method to extract the full class hierarchy, + then extracts the subgraph containing only the descendants of the top class ID, including itself. + Args: chebi_path (str): The file path to the ChEBI ontology file. Returns: - nx.DiGraph: The extracted class hierarchy as a directed graph. + nx.DiGraph: The extracted class hierarchy as a directed graph, limited to the + descendants of the top class ID. 
""" - with open(chebi_path, encoding="utf-8") as chebi: - chebi = "\n".join(l for l in chebi if not l.startswith("xref:")) - elements = [ - term_callback(clause) - for clause in fastobo.loads(chebi) - if clause and ":" in str(clause.id) - ] - g = nx.DiGraph() - for n in elements: - g.add_node(n["id"], **n) - g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]]) - - g = nx.transitive_closure_dag(g) - g = g.subgraph(list(nx.descendants(g, self.top_class_id)) + [self.top_class_id]) - print("Compute transitive closure") + g = super()._extract_class_hierarchy(chebi_path) + g = g.subgraph(list(g.successors(self.top_class_id)) + [self.top_class_id]) return g From 96d2097a9853e8e00a6067bd19ba3d35c49af317 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 24 Sep 2024 13:50:27 +0200 Subject: [PATCH 041/112] add top_class_id to kwargs - add top_class_id to kwargs, which is needed in _ChEBIDataExtractor class to create another chebi class object related to train version self._chebi_version_train_obj = self.__class__(single_class=self.single_class, **_init_kwargs,) --- chebai/preprocessing/datasets/chebi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 1f305d4b..7316a39b 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -736,6 +736,9 @@ def __init__(self, top_class_id: int, **kwargs): top_class_id (int): The ID of the top class from which to extract subclasses. **kwargs: Additional keyword arguments passed to the superclass initializer. 
""" + if "top_class_id" not in kwargs: + kwargs["top_class_id"] = top_class_id + self.top_class_id: int = top_class_id super().__init__(**kwargs) From 6b9024b088e244c13bf74f3be797fcb2154077d9 Mon Sep 17 00:00:00 2001 From: sfluegel Date: Tue, 24 Sep 2024 18:10:06 +0200 Subject: [PATCH 042/112] minor changes to texts --- tutorials/data_exploration.ipynb | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tutorials/data_exploration.ipynb b/tutorials/data_exploration.ipynb index 8cd834b1..fce3a9f7 100644 --- a/tutorials/data_exploration.ipynb +++ b/tutorials/data_exploration.ipynb @@ -19,14 +19,16 @@ "source": [ "# 1. Instantiation of a Data Class\n", "\n", - "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n", "### Inheritance Hierarchy\n", "\n", "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n", "\n", - "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "- **`XYBaseDataModule`**: This is the base class for all data modules in `chebai`, providing foundational properties and methods for handling and processing datasets, including loading a stored dataset and creating a `DataLoader`.\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for some datasets (e.g. the ChEBI and Gene Ontology datasets). 
The defining feature is the dynamically created data split into training, validation and test sets. It inherits from `XYBaseDataModule`.\n", + "\n", "\n", - "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", "\n", "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", "\n", @@ -34,7 +36,7 @@ "### Explanation\n", "A ChEBI data class can be configured with the following main parameters:\n", "\n", - "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI dataset to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", "\n", "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", "\n", @@ -50,7 +52,7 @@ "\n", "### Additional Input Parameters\n", "\n", - "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n", + "The `XYBaseDa ChEBI data class, which `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n", "\n", "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n", "\n", @@ -225,11 +227,11 @@ "### Data Files\n", "\n", "1. 
**`Raw Data Files`**: (e.g., `.obo` file)\n", - " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", + " - **Description**: Contains the raw ChEBI ontology data in OBO format, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", "\n", "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a Pandas dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", "\n", "3. **`data.pt`**\n", @@ -261,7 +263,7 @@ "\n", "3. **Processed Data Stage 2**:\n", " - **File**: `data.pt`\n", - " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", + " - **Description**: This final stage includes the tokenized data in a format compatible with PyTorch, ready for model input. 
This stage also references data splits when available.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", "\n", From b62c93119e34098f02919f9c4bf840def518b4b8 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 24 Sep 2024 21:48:36 +0200 Subject: [PATCH 043/112] update term_callback to handle obsolete terms --- chebai/preprocessing/datasets/chebi.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 1c0cb2f9..616a3408 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -13,7 +13,7 @@ import pickle from abc import ABC from collections import OrderedDict -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import fastobo import networkx as nx @@ -244,11 +244,16 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: with open(data_path, encoding="utf-8") as chebi: chebi = "\n".join(l for l in chebi if not l.startswith("xref:")) - elements = [ - term_callback(clause) - for clause in fastobo.loads(chebi) - if clause and ":" in str(clause.id) - ] + elements = [] + for term_doc in fastobo.loads(chebi): + if ( + term_doc + and isinstance(term_doc.id, fastobo.id.PrefixedIdent) + and term_doc.id.prefix == "CHEBI" + ): + term_dict = term_callback(term_doc) + if term_dict: + elements.append(term_dict) g = nx.DiGraph() for n in elements: @@ -818,7 +823,7 @@ def chebi_to_int(s: str) -> int: return int(s[s.index(":") + 1 :]) -def term_callback(doc) -> dict: +def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]: """ Extracts information from a ChEBI term document. 
This function takes a ChEBI term document as input and extracts relevant information such as the term ID, parents, @@ -858,6 +863,12 @@ def term_callback(doc) -> dict: parents.append(chebi_to_int(str(clause.term))) elif isinstance(clause, fastobo.term.NameClause): name = str(clause.name) + + if isinstance(clause, fastobo.term.IsObsoleteClause): + if clause.obsolete: + # if the term document contains clause as obsolete as true, skips this document. + return False + return { "id": chebi_to_int(str(doc.id)), "parents": parents, From aab0fea1df5801b047e0f1ba9e3d2bce9f928f91 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 25 Sep 2024 13:57:54 +0200 Subject: [PATCH 044/112] test case for `Tox21MolNet.setup_processed` simple split --- tests/unit/dataset_classes/testTox21MolNet.py | 43 +++++++++++++++---- tests/unit/mock_data/tox_mock_data.py | 5 ++- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py index c995e701..042a6ae4 100644 --- a/tests/unit/dataset_classes/testTox21MolNet.py +++ b/tests/unit/dataset_classes/testTox21MolNet.py @@ -42,25 +42,52 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None: "The loaded data does not match the expected output.", ) - @patch.object( - Tox21MolNet, - "_load_data_from_file", - return_value=Tox21MolNetMockData.get_processed_data(), + @patch( + "builtins.open", + new_callable=mock_open, + read_data=Tox21MolNetMockData.get_raw_data(), ) @patch("torch.save") def test_setup_processed_simple_split( - self, mock_load_data: MagicMock, mock_torch_save: MagicMock + self, + mock_torch_save, + mock_open_file: mock_open, ) -> None: """ Test the `setup_processed` method for basic data splitting and saving. Args: - mock_load_data (MagicMock): Mocked `_load_data_from_file` method to provide controlled data. - mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes. 
+ mock_torch_save : Mocked `torch.save` function to avoid actual file writes. + mock_open_file (mock_open): Mocked `open` builtin-method to provide custom data. """ - # Facing technical error here self.data_module.setup_processed() + # Verify if torch.save was called for each split + self.assertEqual(mock_torch_save.call_count, 3) + call_args_list = mock_torch_save.call_args_list + self.assertIn("test", call_args_list[0][0][1]) + self.assertIn("train", call_args_list[1][0][1]) + self.assertIn("validation", call_args_list[2][0][1]) + + # Check for non-overlap between train, test, and validation + test_split = [d["ident"] for d in call_args_list[0][0][0]] + train_split = [d["ident"] for d in call_args_list[1][0][0]] + validation_split = [d["ident"] for d in call_args_list[2][0][0]] + + # Assert no overlap between splits + self.assertTrue( + set(train_split).isdisjoint(test_split), + "There is an overlap between the train and test splits.", + ) + self.assertTrue( + set(train_split).isdisjoint(validation_split), + "There is an overlap between the train and validation splits.", + ) + self.assertTrue( + set(test_split).isdisjoint(validation_split), + "There is an overlap between the test and validation splits.", + ) + if __name__ == "__main__": unittest.main() diff --git a/tests/unit/mock_data/tox_mock_data.py b/tests/unit/mock_data/tox_mock_data.py index 32745c38..b5f85bda 100644 --- a/tests/unit/mock_data/tox_mock_data.py +++ b/tests/unit/mock_data/tox_mock_data.py @@ -35,7 +35,7 @@ def get_processed_data() -> List[Dict]: Returns a list of dictionaries simulating the processed data for the Tox21MolNet dataset. Each dictionary contains 'ident', 'features', and 'labels'. 
""" - return [ + data_list = [ { "ident": "TOX958", "features": "Nc1ccc([N+](=O)[O-])cc1N", @@ -182,6 +182,9 @@ def get_processed_data() -> List[Dict]: }, ] + data_with_group = [{**data, "group": None} for data in data_list] + return data_with_group + @staticmethod def get_processed_grouped_data() -> List[Dict]: """ From fc8182e0cc80187fcdf6ce8d9b0e783030378c5e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Wed, 25 Sep 2024 19:11:35 +0200 Subject: [PATCH 045/112] test case for `Tox21MolNet.setup_processed` group split --- tests/unit/dataset_classes/testTox21MolNet.py | 117 ++++++++++++++---- 1 file changed, 93 insertions(+), 24 deletions(-) diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py index 042a6ae4..5d5f3497 100644 --- a/tests/unit/dataset_classes/testTox21MolNet.py +++ b/tests/unit/dataset_classes/testTox21MolNet.py @@ -1,21 +1,21 @@ -import os import unittest -from typing import Dict, List +from typing import List from unittest.mock import MagicMock, mock_open, patch -import torch -from sklearn.model_selection import GroupShuffleSplit - from chebai.preprocessing.datasets.tox21 import Tox21MolNet from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData class TestTox21MolNet(unittest.TestCase): - @classmethod @patch("os.makedirs", return_value=None) - def setUpClass(cls, mock_makedirs) -> None: - """Initialize a Tox21MolNet instance for testing.""" + def setUpClass(cls, mock_makedirs: MagicMock) -> None: + """ + Initialize a Tox21MolNet instance for testing. + + Args: + mock_makedirs (MagicMock): Mocked `os.makedirs` function. 
+ """ ReaderMock = MagicMock() ReaderMock.name.return_value = "MockedReaderTox21MolNet" Tox21MolNet.READER = ReaderMock @@ -39,7 +39,7 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None: self.assertEqual( list(actual_data), expected_data, - "The loaded data does not match the expected output.", + "The loaded data does not match the expected output from the file.", ) @patch( @@ -50,42 +50,111 @@ def test_load_data_from_file(self, mock_open_file: mock_open) -> None: @patch("torch.save") def test_setup_processed_simple_split( self, - mock_torch_save, + mock_torch_save: MagicMock, mock_open_file: mock_open, ) -> None: """ Test the `setup_processed` method for basic data splitting and saving. Args: - mock_torch_save : Mocked `torch.save` function to avoid actual file writes. - mock_open_file (mock_open): Mocked `open` builtin-method to provide custom data. + mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes. + mock_open_file (mock_open): Mocked `open` function to simulate file reading. + """ + self.data_module.setup_processed() + + # Verify if torch.save was called for each split (train, test, validation) + self.assertEqual( + mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times." + ) + call_args_list = mock_torch_save.call_args_list + self.assertIn("test", call_args_list[0][0][1], "Missing 'test' split.") + self.assertIn("train", call_args_list[1][0][1], "Missing 'train' split.") + self.assertIn( + "validation", call_args_list[2][0][1], "Missing 'validation' split." 
+ ) + + # Check for non-overlap between train, test, and validation splits + test_split: List[str] = [d["ident"] for d in call_args_list[0][0][0]] + train_split: List[str] = [d["ident"] for d in call_args_list[1][0][0]] + validation_split: List[str] = [d["ident"] for d in call_args_list[2][0][0]] + + self.assertTrue( + set(train_split).isdisjoint(test_split), + "Overlap detected between the train and test splits.", + ) + self.assertTrue( + set(train_split).isdisjoint(validation_split), + "Overlap detected between the train and validation splits.", + ) + self.assertTrue( + set(test_split).isdisjoint(validation_split), + "Overlap detected between the test and validation splits.", + ) + + @patch.object( + Tox21MolNet, + "_load_data_from_file", + return_value=Tox21MolNetMockData.get_processed_grouped_data(), + ) + @patch("torch.save") + def test_setup_processed_with_group_split( + self, mock_torch_save: MagicMock, mock_load_file: MagicMock + ) -> None: + """ + Test the `setup_processed` method for group-based splitting and saving. + + Args: + mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes. + mock_load_file (MagicMock): Mocked `_load_data_from_file` to provide custom data. """ + self.data_module.train_split = 0.5 self.data_module.setup_processed() # Verify if torch.save was called for each split - self.assertEqual(mock_torch_save.call_count, 3) + self.assertEqual( + mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times." + ) call_args_list = mock_torch_save.call_args_list - self.assertIn("test", call_args_list[0][0][1]) - self.assertIn("train", call_args_list[1][0][1]) - self.assertIn("validation", call_args_list[2][0][1]) + self.assertIn("test", call_args_list[0][0][1], "Missing 'test' split.") + self.assertIn("train", call_args_list[1][0][1], "Missing 'train' split.") + self.assertIn( + "validation", call_args_list[2][0][1], "Missing 'validation' split." 
+ ) - # Check for non-overlap between train, test, and validation - test_split = [d["ident"] for d in call_args_list[0][0][0]] - train_split = [d["ident"] for d in call_args_list[1][0][0]] - validation_split = [d["ident"] for d in call_args_list[2][0][0]] + # Check for non-overlap between train, test, and validation splits (based on 'ident') + test_split: List[str] = [d["ident"] for d in call_args_list[0][0][0]] + train_split: List[str] = [d["ident"] for d in call_args_list[1][0][0]] + validation_split: List[str] = [d["ident"] for d in call_args_list[2][0][0]] - # Assert no overlap between splits self.assertTrue( set(train_split).isdisjoint(test_split), - "There is an overlap between the train and test splits.", + "Overlap detected between the train and test splits (based on 'ident').", ) self.assertTrue( set(train_split).isdisjoint(validation_split), - "There is an overlap between the train and validation splits.", + "Overlap detected between the train and validation splits (based on 'ident').", ) self.assertTrue( set(test_split).isdisjoint(validation_split), - "There is an overlap between the test and validation splits.", + "Overlap detected between the test and validation splits (based on 'ident').", + ) + + # Check for non-overlap between train, test, and validation splits (based on 'group') + test_split_grp: List[str] = [d["group"] for d in call_args_list[0][0][0]] + train_split_grp: List[str] = [d["group"] for d in call_args_list[1][0][0]] + validation_split_grp: List[str] = [d["group"] for d in call_args_list[2][0][0]] + + self.assertTrue( + set(train_split_grp).isdisjoint(test_split_grp), + "Overlap detected between the train and test splits (based on 'group').", + ) + self.assertTrue( + set(train_split_grp).isdisjoint(validation_split_grp), + "Overlap detected between the train and validation splits (based on 'group').", + ) + self.assertTrue( + set(test_split_grp).isdisjoint(validation_split_grp), + "Overlap detected between the test and validation splits 
(based on 'group').", ) From 1d3ecbe327b63324c52347ccc806a25c51471d40 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 26 Sep 2024 00:17:14 +0200 Subject: [PATCH 046/112] update chebi test as per modified term_callback --- .../dataset_classes/testChebiTermCallback.py | 10 +++++--- tests/unit/mock_data/ontology_mock_data.py | 25 ++++++------------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/tests/unit/dataset_classes/testChebiTermCallback.py b/tests/unit/dataset_classes/testChebiTermCallback.py index 7b22d1a2..8680760e 100644 --- a/tests/unit/dataset_classes/testChebiTermCallback.py +++ b/tests/unit/dataset_classes/testChebiTermCallback.py @@ -51,11 +51,13 @@ def test_skip_obsolete_terms(self) -> None: """ Test that `term_callback` correctly skips obsolete ChEBI terms. """ + term_callback_output = [] + for ident in ChebiMockOntology.get_obsolete_nodes_ids(): + raw_term = self.callback_input_data.get(ident) + term_dict = term_callback(raw_term) + if term_dict: + term_callback_output.append(term_dict) - term_callback_output = [ - term_callback(self.callback_input_data.get(ident)) - for ident in ChebiMockOntology.get_obsolete_nodes_ids() - ] self.assertEqual( term_callback_output, [], diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 478a2bbb..40d9674e 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -356,24 +356,15 @@ def get_data_in_dataframe() -> pd.DataFrame: "C1=CC=CC=C1Br", "C1=CC=CC=C1[Mg+]", ], - # Relationships { - # 12345: [11111, 54321, 22222, 67890], - # 67890: [22222], - # 99999: [67890, 11111, 54321, 22222, 12345], - # 54321: [11111], - # 88888: [22222, 67890] - # 11111: [] - # 22222: [] - # } **{ - # -row- [11111, 12345, 22222, 54321, 67890, 88888, 99999] - 11111: [False, False, False, False, False, False, False], - 12345: [True, True, True, True, True, False, False], - 22222: [False, False, False, False, False, 
False, False], - 54321: [True, False, False, True, False, False, False], - 67890: [False, False, True, False, True, False, False], - 88888: [False, False, True, False, True, True, False], - 99999: [True, True, True, True, True, False, True], + # -row- [12345, 54321, 67890, 11111, 22222, 99999, 88888] + 11111: [True, True, False, True, False, True, False], + 12345: [True, False, False, False, False, True, False], + 22222: [True, False, True, False, True, True, True], + 54321: [True, True, False, False, False, True, False], + 67890: [True, False, True, False, False, True, True], + 88888: [False, False, False, False, False, False, True], + 99999: [False, False, False, False, False, True, False], }, ) From 630add7a78a24277cfddf2b97fe10a450ad9f069 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 26 Sep 2024 00:19:29 +0200 Subject: [PATCH 047/112] update input to add_edges_from - Modified input to add_edges_from to only take the edges which connects the existing nodes, to avoid internal creation of obsolete nodes --- chebai/preprocessing/datasets/chebi.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 616a3408..c5aac3ae 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -258,7 +258,16 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: g = nx.DiGraph() for n in elements: g.add_node(n["id"], **n) - g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]]) + + # Only take the edges which connects the existing nodes, to avoid internal creation of obsolete nodes + g.add_edges_from( + [ + (p, q["id"]) + for q in elements + for p in q["parents"] + if g.has_node(p) and g.has_node(q["id"]) + ] + ) print("Compute transitive closure") return nx.transitive_closure_dag(g) From 35a621cee6cfd3c732d6e851ba2bc320defa760d Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Thu, 26 Sep 2024 
00:30:49 +0200 Subject: [PATCH 048/112] group key not needed for Tox21Chal._load_dict - group key needed in Tox21MolNet but not needed for Tox21Chal._load_dict --- tests/unit/dataset_classes/testTox21Challenge.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py index b94c8ca4..fedde8e5 100644 --- a/tests/unit/dataset_classes/testTox21Challenge.py +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -67,6 +67,9 @@ def test_load_dict(self, mock_open_file: mock_open) -> None: mock_open_file (mock_open): Mocked open function to simulate file reading. """ expected_data = Tox21MolNetMockData.get_processed_data() + for item in expected_data: + item.pop("group", None) + actual_data = self.tox21._load_dict("fake/file/path.csv") self.assertEqual( From 19b194aead6923f6b1f866447f9b5cbfd5cad1ec Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 11:23:06 +0200 Subject: [PATCH 049/112] fix - if only one class surpass given selection threshold - https://github.com/ChEB-AI/python-chebai/pull/54#issuecomment-2371843170 --- chebai/preprocessing/datasets/base.py | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 02877ad3..a2997699 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -14,6 +14,7 @@ ) from lightning.pytorch.core.datamodule import LightningDataModule from lightning_utilities.core.rank_zero import rank_zero_info +from sklearn.model_selection import StratifiedShuffleSplit from torch.utils.data import DataLoader from chebai.preprocessing import reader as dr @@ -929,11 +930,17 @@ def get_test_split( labels_list = df["labels"].tolist() test_size = 1 - self.train_split - (1 - self.train_split) ** 2 - msss = MultilabelStratifiedShuffleSplit( - n_splits=1, test_size=test_size, 
random_state=seed - ) - train_indices, test_indices = next(msss.split(labels_list, labels_list)) + if len(labels_list[0]) > 1: + splitter = MultilabelStratifiedShuffleSplit( + n_splits=1, test_size=test_size, random_state=seed + ) + else: + splitter = StratifiedShuffleSplit( + n_splits=1, test_size=test_size, random_state=seed + ) + + train_indices, test_indices = next(splitter.split(labels_list, labels_list)) df_train = df.iloc[train_indices] df_test = df.iloc[test_indices] @@ -985,12 +992,18 @@ def get_train_val_splits_given_test( # scale val set size by 1/self.train_split to compensate for (hypothetical) test set size (1-self.train_split) test_size = ((1 - self.train_split) ** 2) / self.train_split - msss = MultilabelStratifiedShuffleSplit( - n_splits=1, test_size=test_size, random_state=seed - ) + + if len(labels_list_trainval[0]) > 1: + splitter = MultilabelStratifiedShuffleSplit( + n_splits=1, test_size=test_size, random_state=seed + ) + else: + splitter = StratifiedShuffleSplit( + n_splits=1, test_size=test_size, random_state=seed + ) train_indices, validation_indices = next( - msss.split(labels_list_trainval, labels_list_trainval) + splitter.split(labels_list_trainval, labels_list_trainval) ) df_validation = df_trainval.iloc[validation_indices] From 4fc31dab7716a54f05666f9bd0d5fe51d066e647 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 16:13:40 +0200 Subject: [PATCH 050/112] chebi notebook : suggested changes - https://github.com/ChEB-AI/python-chebai/pull/46#pullrequestreview-2325741708 --- tutorials/data_exploration_chebi.ipynb | 836 +++++++++++++++++++++++++ 1 file changed, 836 insertions(+) create mode 100644 tutorials/data_exploration_chebi.ipynb diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb new file mode 100644 index 00000000..17c3ae33 --- /dev/null +++ b/tutorials/data_exploration_chebi.ipynb @@ -0,0 +1,836 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
"0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "\n", + "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", + "metadata": {}, + "source": [ + "# 1. Instantiation of a Data Class\n", + "\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "### Inheritance Hierarchy\n", + "\n", + "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + "\n", + "\n", + "### Explanation\n", + "A ChEBI data class can be configured with the following main parameters:\n", + "\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "\n", + "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", + "\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\n", + "\n", + "### Additional Input Parameters\n", + "\n", + "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" + ] + }, + { + "cell_type": "markdown", + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", + "metadata": {}, + "source": [ + "# Available ChEBI Data Classes\n", + "\n", + "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py):\n", + "\n", + "## `ChEBIOver100`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver50`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver100DeepSMILES`\n", + "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver100SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver50SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", + "\n", + "- 
**Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", + "\n", + "## `ChEBIOver50Partial`\n", + "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, + "outputs": [], + "source": [ + "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": {}, + "outputs": [], + "source": [ + "chebi_class = ChEBIOver50(chebi_version=231)" + ] + }, + { + "cell_type": "markdown", + "id": "8456b545-88c5-401d-baa5-47e8ae710f04", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "1655d489-25fe-46de-9feb-eeca5d36936f", + "metadata": {}, + "source": [ + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. 
This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", + "Cross-validation enabled: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", + "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", + "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" + ] + } + ], + "source": [ + "chebi_class.prepare_data()\n", + "chebi_class.setup()" + ] + }, + { + "cell_type": "markdown", + "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", + "metadata": {}, + "source": [ + "# 3. Different Data Files Created and their Structure\n", + "\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", + "\n", + "### Data Files\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", + " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. 
This file serves as the foundation for data processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **`data.pkl`**\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "### File Structure and Preprocessing Stages\n", + "\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", + "\n", + "1. 
**Raw Data Stage**:\n", + " - **File**: `chebi.obo`\n", + " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **Processed Data Stage 1**:\n", + " - **File**: `data.pkl`\n", + " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n", + "\n", + "3. **Processed Data Stage 2**:\n", + " - **File**: `data.pt`\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", + "\n", + "### Summary of File Paths\n", + "\n", + "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", + "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", + "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", + "\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments." + ] + }, + { + "cell_type": "markdown", + "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "74adb549-9e02-472d-a535-78a584853b52", + "metadata": {}, + "source": [ + "# 4. Information Stored in the Files\n" + ] + }, + { + "cell_type": "markdown", + "id": "43329709-5134-4ce5-88e7-edd2176bf84d", + "metadata": {}, + "source": [ + "## chebi.obo\n", + "\n", + "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", + "\n", + "### Structure of `chebi.obo`\n", + "\n", + "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "\n", + "#### Example of a Term Document\n", + "\n", + "```plaintext\n", + "[Term]\n", + "id: CHEBI:24867\n", + "name: monoatomic ion\n", + "subset: 3_STAR\n", + "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", + "is_a: CHEBI:24870\n", + "is_a: CHEBI:33238\n", + "is_a: CHEBI:3323Relevant 8\n", + "```\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each term document in the `chebi.obo` file consists of the following key attributes:\n", + "\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. 
Each term represents a distinct chemical entity.\n", + "\n", + "- **`id: CHEBI:24867`**: \n", + " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n", + " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n", + "\n", + "- **`name: monoatomic ion`**: \n", + " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n", + " - **Example**: \"monoatomic ion\" is the name of this entity; its `synonym` line (\"monoatomic ions\") records a related term within the ChEBI ontology.\n", + "\n", + "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n", + " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning that it inherits from both parent terms. This raw ontology is the input to the later stages of preprocessing, from raw input files to processed, model-ready formats." + ] + }, + { + "cell_type": "markdown", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "metadata": {}, + "source": [ + "## `data.pkl` File\n", + "\n", + "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. 
Below is an example of how this data is structured:\n", + "\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` has the following structure: \n", + "- **Column 0**: Contains the ID of each ChEBI data instance.\n", + "- **Column 1**: Contains the name of each ChEBI data instance.\n", + "- **Column 2**: Contains the SMILES representation of the chemical.\n", + "- **Column 3 and onwards**: Contain the class labels, one boolean column per selected ChEBI class.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as encoding.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (129184, 1335)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameSMILES1722246825712580263430983992...143017143212143813146180147334156473166828166904167497167559
033429monoatomic monoanion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
130151aluminide(1-)[Al-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
216042halide anion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
317051fluoride[F-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
428741sodium fluoride[F-].[Na+]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1335 columns

\n", + "
" + ], + "text/plain": [ + " id name SMILES 1722 2468 2571 2580 2634 \\\n", + "0 33429 monoatomic monoanion [*-] False False False False False \n", + "1 30151 aluminide(1-) [Al-] False False False False False \n", + "2 16042 halide anion [*-] False False False False False \n", + "3 17051 fluoride [F-] False False False False False \n", + "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", + "\n", + " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", + "0 False False ... False False False False False False False \n", + "1 False False ... False False False False False False False \n", + "2 False False ... False False False False False False False \n", + "3 False False ... False False False False False False False \n", + "4 False False ... False False False False False False False \n", + "\n", + " 166904 167497 167559 \n", + "0 False False False \n", + "1 False False False \n", + "2 False False False \n", + "3 False False False \n", + "4 False False False \n", + "\n", + "[5 rows x 1335 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", + "metadata": {}, + "source": [ + "## `data.pt` File\n", + "\n", + "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. 
Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", + "\n", + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", + "metadata": {}, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n" + ] + } + ], + "source": [ + "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n", + "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n", + "{'features': [10], 'labels': array([False, False, False, ..., 
False, False, False]), 'ident': 16042, 'group': None}\n", + "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n", + "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n" + ] + } + ], + "source": [ + "for i in range(5):\n", + " print(data_pt[i])" + ] + }, + { + "cell_type": "markdown", + "id": "861da1c3-0401-49f0-a22f-109814ed95d5", + "metadata": {}, + "source": [ + "## `classes.txt` File\n", + "\n", + "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", + "\n", + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1722\n", + "2468\n", + "2571\n", + "2580\n", + "2634\n" + ] + } + ], + "source": [ + "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", + " for i in range(5):\n", + " line = file.readline()\n", + " print(line.strip())" + ] + }, + { + "cell_type": "markdown", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "## `splits.csv` File\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
033429train
130151train
217051train
332129train
430340train
\n", + "
" + ], + "text/plain": [ + " id split\n", + "0 33429 train\n", + "1 30151 train\n", + "2 17051 train\n", + "3 32129 train\n", + "4 30340 train" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", + "csv_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "metadata": {}, + "source": [ + "# 5. Example Molecule: Different Encodings\n", + "\n", + "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n", + "\n", + "### Explanation:\n", + "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", + "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", + "\n", + "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n", + "\n", + "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n", + " - **Benzene SMILES**: `c1ccccc1`\n", + " - **Explanation**: \n", + " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n", + "\n", + "### 2. 
**SELFIES (SELF-referencIng Embedded Strings)**\n", + " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n", + " - **Explanation**: \n", + " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n", + " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n", + "\n", + "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." + ] + }, + { + "cell_type": "markdown", + "id": "93e328cf-09f9-4694-b175-28320590937d", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (env_chebai)", + "language": "python", + "name": "env_chebai" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 587c0264b6a9c79a7d2b6be490c03486acc197f8 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 16:14:31 +0200 Subject: [PATCH 051/112] go_notebook: data exploration --- tutorials/data_exploration_go.ipynb | 551 ++++++++++++++++++++++++++++ 1 file changed, 551 insertions(+) create mode 100644 tutorials/data_exploration_go.ipynb diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb new file mode 100644 index 00000000..391192a1 --- /dev/null +++ b/tutorials/data_exploration_go.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on Gene Ontology (GO) and Swiss UniProt 
Protein data. This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "\n", + "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "\n", + "---" + ], + "id": "da687d32ba48b188" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Information for Protein Dataset\n", + "\n", + "# 1. Instantiation of a Data Class\n", + "\n", + "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data.\n", + "### Inheritance Hierarchy\n", + "\n", + "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, GO_UniProt data classes are designed to manage and preprocess GO and UniProt protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + "\n", + "\n", + "### Configuration Parameters\n", + "\n", + "Data classes related to proteins can be configured using the following main parameters:\n", + "\n", + "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", + "\n", + "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", + "\n", + "### Additional Input Parameters\n", + "\n", + "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n", + "\n", + "### Available GOUniProt Data Classes\n", + "\n", + "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n", + "\n", + "#### `GOUniProtOver250`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "\n", + "#### `GOUniProtOver50`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n" + ], + "id": "64585012b0d7f66f" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Instantiation Example", + "id": "605bbca601037df2" + }, + { + "metadata": {}, + "cell_type": "code", + "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250", + "id": "440f203ceaf7e4b7", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:08:21.236447Z", + "start_time": "2024-09-30T14:08:21.130242Z" + } + }, + "cell_type": "code", + "source": "go_class = GOUniProtOver250()", + "id": "a648346d81d0dc5e", + "outputs": [], + "execution_count": 2 + }, + { + 
"metadata": {}, + "cell_type": "markdown", + "source": [ + "## GOUniProt Data File Structure\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", + " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", + " - **File Paths**:\n", + " - `data/GO_UniProt/raw/${filename}.obo`\n", + " - `data/GO_UniProt/raw/${filename}.dat`\n", + "\n", + "2. **`data.pkl`**\n", + " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as amino acid sequences), and class columns with boolean values.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: This file contains saved data splits from previous runs. 
During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n" + ], + "id": "ee174b61b36c71aa" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a GO_UniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required GO and UniProt data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). 
The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." 
+ ], + "id": "2328e824c4dafb2d" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ], + "id": "9f77351090560bc4" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## data.pkl", + "id": "735844f0b2474ad6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:08:33.990378Z", + "start_time": "2024-09-30T14:08:33.959459Z" + } + }, + "cell_type": "code", + "source": "import pandas as pd", + "id": "b4da7e73e251e1d1", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:10:12.796911Z", + "start_time": "2024-09-30T14:10:06.052276Z" + } + }, + "cell_type": "code", + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ], + "id": "b66fbb9b720d053c", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (27459, 1050)\n" + ] + }, + { + "data": { + "text/plain": [ + " swiss_id accession \\\n", + "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", + "9 14331_CAEEL P41932,Q21537 \n", + "10 14331_MAIZE P49106 \n", + "13 14332_MAIZE Q01526 \n", + "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", + "\n", + " go_ids \\\n", + "8 [19222] \n", + "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", + "10 [3677, 5634, 10468, 44877] \n", + "13 [3677, 5634, 10468, 44877] \n", + "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", + "\n", + " sequence 41 75 122 \\\n", + "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", + "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", + "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", + "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... 
False False False \n", + "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", + "\n", + " 165 209 226 ... 2000145 2000146 2000147 2000241 2000243 \\\n", + "8 False False False ... False False False False False \n", + "9 False False False ... False False False False False \n", + "10 False False False ... False False False False False \n", + "13 False False False ... False False False False False \n", + "14 False False False ... False False False False False \n", + "\n", + " 2000377 2001020 2001141 2001233 2001234 \n", + "8 False False False False False \n", + "9 False False False False False \n", + "10 False False False False False \n", + "13 False False False False False \n", + "14 False False False False False \n", + "\n", + "[5 rows x 1050 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
swiss_idaccessiongo_idssequence4175122165209226...2000145200014620001472000241200024320003772001020200114120012332001234
814331_ARATHP42643,Q945M2,Q9M0S7[19222]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1014331_MAIZEP49106[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1314332_MAIZEQ01526[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1414333_ARATHP42644,F4KBI7,Q945L2[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1050 columns

\n", + "
" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## data.pt", + "id": "2c9f23883c66b48d" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:10:35.034002Z", + "start_time": "2024-09-30T14:10:35.018342Z" + } + }, + "cell_type": "code", + "source": "import torch", + "id": "85b097601fb242d6", + "outputs": [], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:11:36.443693Z", + "start_time": "2024-09-30T14:11:34.199285Z" + } + }, + "cell_type": "code", + "source": [ + "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))\n", + "for i in range(1):\n", + " print(data_pt[i])" + ], + "id": "289a54a71dec20fb", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n", + "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 
20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n" + ] + } + ], + "execution_count": 11 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Protein Representation Using Amino Acid Sequence Notation\n", + "\n", + "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", + "\n", + "### Example Protein Sequence\n", + "\n", + "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", + "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", + "\n", + "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", + "- **Sequence Length**: 147\n", + "\n", + "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", + "\n", + "### The 20 Amino Acids and Their One-Letter Notations\n", + "\n", + "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", + "\n", + "| One-Letter Notation | Amino Acid Name | Description |\n", + "|---------------------|----------------------|---------------------------------------------------------|\n", + "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", + "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. |\n", + "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **F** | Phenylalanine | Aromatic, non-polar. 
|\n", + "| **G** | Glycine | Smallest amino acid, non-polar. |\n", + "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", + "| **I** | Isoleucine | Non-polar, aliphatic. |\n", + "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", + "| **L** | Leucine | Non-polar, aliphatic. |\n", + "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", + "| **N** | Asparagine | Polar, uncharged. |\n", + "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", + "| **Q** | Glutamine | Polar, uncharged. |\n", + "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", + "| **S** | Serine | Polar, can be phosphorylated. |\n", + "| **T** | Threonine | Polar, can be phosphorylated. |\n", + "| **V** | Valine | Non-polar, aliphatic. |\n", + "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", + "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. |\n", + "\n", + "### Understanding Protein Sequences\n", + "\n", + "In the example sequence, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "\n", + "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", + "\n", + "\n", + "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" + ], + "id": "481b8c0271ec9636" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 71e9888d54276413f4d145c031ea56cd60d0f228 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 20:29:32 +0200 Subject: [PATCH 052/112] Delete data_exploration.ipynb --- tutorials/data_exploration.ipynb | 1294 ------------------------------ 1 file changed, 1294 deletions(-) delete mode 100644 tutorials/data_exploration.ipynb diff --git a/tutorials/data_exploration.ipynb b/tutorials/data_exploration.ipynb deleted file mode 100644 index fce3a9f7..00000000 --- a/tutorials/data_exploration.ipynb +++ /dev/null @@ -1,1294 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). 
This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", - "\n", - "---\n" - ] - }, - { - "cell_type": "markdown", - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", - "metadata": {}, - "source": [ - "# 1. Instantiation of a Data Class\n", - "\n", - "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n", - "### Inheritance Hierarchy\n", - "\n", - "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n", - "\n", - "- **`XYBaseDataModule`**: This is the base class for all data modules in `chebai`, providing foundational properties and methods for handling and processing datasets, including loading a stored dataset and creating a `DataLoader`.\n", - "\n", - "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for some datasets (e.g. the ChEBI and Gene Ontology datasets). The defining feature is the dynamically created data split into training, validation and test sets. It inherits from `XYBaseDataModule`.\n", - "\n", - "\n", - "\n", - "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", - "\n", - "\n", - "### Explanation\n", - "A ChEBI data class can be configured with the following main parameters:\n", - "\n", - "- **chebi_version (int)**: Specifies the version of the ChEBI dataset to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", - "\n", - "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. 
If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", - "\n", - "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\n", - "\n", - "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\n", - "\n", - "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\n", - "\n", - "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\n", - "\n", - "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\n", - "\n", - "### Additional Input Parameters\n", - "\n", - "The `XYBaseDa ChEBI data class, which `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n", - "\n", - "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n", - "\n", - "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\n", - "\n", - "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\n", - "\n", - "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\n", - "\n", - "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\n", - "\n", - "- **label_filter (Optional[int])**: The index of the label to filter. 
Default is `None`.\n", - "\n", - "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\n", - "\n", - "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\n", - "\n", - "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\n", - "\n", - "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\n", - "\n", - "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\n", - "\n", - "- **kwargs**: Additional keyword arguments.\n", - "\n", - "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\n" - ] - }, - { - "cell_type": "markdown", - "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", - "metadata": {}, - "source": [ - "# Available ChEBI Data Classes\n", - "\n", - "## `ChEBIOver100`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", - "\n", - "## `ChEBIOver50`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", - "\n", - "## `ChEBIOver100DeepSMILES`\n", - "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", - "\n", - "## `ChEBIOver100SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", - "\n", - "## 
`ChEBIOver50SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", - "\n", - "## `ChEBIOver50Partial`\n", - "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.datasets.chebi import ChEBIOver50" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", - "metadata": {}, - "outputs": [], - "source": [ - "chebi_class = ChEBIOver50(chebi_version=231)" - ] - }, - { - "cell_type": "markdown", - "id": "8456b545-88c5-401d-baa5-47e8ae710f04", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "1655d489-25fe-46de-9feb-eeca5d36936f", - "metadata": {}, - "source": [ - "# 2. Preparation / Setup Methods\n", - "\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", - "\n", - "### Why is Preparation Needed?\n", - "\n", - "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", - "\n", - "### Main Methods for Data Preprocessing\n", - "\n", - "The data preprocessing in a data class involves two main methods:\n", - "\n", - "1. 
**`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", - "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", - "\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", - "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", - "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" - ] - } - ], - "source": [ - "chebi_class.prepare_data()\n", - "chebi_class.setup()" - ] - }, - { - "cell_type": "markdown", - "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", - "metadata": {}, - "source": [ - "# 3. Different Data Files Created and their Structure\n", - "\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", - "\n", - "### Data Files\n", - "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", - " - **Description**: Contains the raw ChEBI ontology data in OBO format, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", - "\n", - "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a Pandas dataframe format. 
It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", - "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "4. **`classes.txt`**\n", - " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", - "\n", - "5. **`splits.csv`**\n", - " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", - "\n", - "### File Structure and Preprocessing Stages\n", - "\n", - "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", - "\n", - "1. **Raw Data Stage**:\n", - " - **File**: `chebi.obo`\n", - " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", - "\n", - "2. **Processed Data Stage 1**:\n", - " - **File**: `data.pkl`\n", - " - **Description**: This stage includes the data after initial processing. 
It contains SMILES strings, class columns, and metadata but lacks data splits.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", - " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n", - "\n", - "3. **Processed Data Stage 2**:\n", - " - **File**: `data.pt`\n", - " - **Description**: This final stage includes the tokenized data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", - "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", - "\n", - "### Summary of File Paths\n", - "\n", - "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", - "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", - "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", - "\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments." - ] - }, - { - "cell_type": "markdown", - "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "74adb549-9e02-472d-a535-78a584853b52", - "metadata": {}, - "source": [ - "# 4. 
Information Stored in the Files\n" - ] - }, - { - "cell_type": "markdown", - "id": "43329709-5134-4ce5-88e7-edd2176bf84d", - "metadata": {}, - "source": [ - "## chebi.obo\n", - "\n", - "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", - "\n", - "### Structure of `chebi.obo`\n", - "\n", - "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", - "\n", - "#### Example of a Term Document\n", - "\n", - "```plaintext\n", - "[Term]\n", - "id: CHEBI:24867\n", - "name: monoatomic ion\n", - "subset: 3_STAR\n", - "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", - "is_a: CHEBI:24870\n", - "is_a: CHEBI:33238\n", - "is_a: CHEBI:3323Relevant 8\n", - "```\n", - "\n", - "### Breakdown of Attributes\n", - "\n", - "Each term document in the `chebi.obo` file consists of the following key attributes:\n", - "\n", - "- **`[Term]`**: \n", - " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\n", - "\n", - "- **`id: CHEBI:24867`**: \n", - " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n", - " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n", - "\n", - "- **`name: monoatomic ion`**: \n", - " - **Description**: The common name of the chemical entity. 
This is the main descriptor used to identify the term.\n", - " - **Example**: \"monoatomic ion\" is the namcating a related term within the ChEBI ontology.\n", - "\n", - "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n", - " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n", - " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaent stages of preprocessing, from raw input files to processed, model-ready formats." - ] - }, - { - "cell_type": "markdown", - "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", - "metadata": {}, - "source": [ - "## `data.pkl` File\n", - "\n", - "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n", - "\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the ID of each ChEBI data instance.\n", - "- **Column 1**: Contains the name of each ChEBI data instance.\n", - "- **Column 2**: Contains the SMILES representation of the chemical.\n", - "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "fd490270-59b8-4c1c-8b09-204defddf592", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (129184, 1335)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnameSMILES1722246825712580263430983992...143017143212143813146180147334156473166828166904167497167559
033429monoatomic monoanion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
130151aluminide(1-)[Al-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
216042halide anion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
317051fluoride[F-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
428741sodium fluoride[F-].[Na+]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", - "

5 rows × 1335 columns

\n", - "
" - ], - "text/plain": [ - " id name SMILES 1722 2468 2571 2580 2634 \\\n", - "0 33429 monoatomic monoanion [*-] False False False False False \n", - "1 30151 aluminide(1-) [Al-] False False False False False \n", - "2 16042 halide anion [*-] False False False False False \n", - "3 17051 fluoride [F-] False False False False False \n", - "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", - "\n", - " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", - "0 False False ... False False False False False False False \n", - "1 False False ... False False False False False False False \n", - "2 False False ... False False False False False False False \n", - "3 False False ... False False False False False False False \n", - "4 False False ... False False False False False False False \n", - "\n", - " 166904 167497 167559 \n", - "0 False False False \n", - "1 False False False \n", - "2 False False False \n", - "3 False False False \n", - "4 False False False \n", - "\n", - "[5 rows x 1335 columns]" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", - "metadata": {}, - "source": [ - "## `data.pt` File\n", - "\n", - "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. 
Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", - "\n", - "### Structure of `data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", - "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", - "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", - "\n", - "- **`ident`**: \n", - " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", - "metadata": {}, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n" - ] - } - ], - "source": [ - "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", - "print(\"Type of loaded data:\", type(data_pt))" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n", - "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n", - "{'features': [10], 'labels': array([False, False, False, ..., 
False, False, False]), 'ident': 16042, 'group': None}\n", - "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n", - "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n" - ] - } - ], - "source": [ - "for i in range(5):\n", - " print(data_pt[i])" - ] - }, - { - "cell_type": "markdown", - "id": "861da1c3-0401-49f0-a22f-109814ed95d5", - "metadata": {}, - "source": [ - "## `classes.txt` File\n", - "\n", - "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", - "\n", - "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1722\n", - "2468\n", - "2571\n", - "2580\n", - "2634\n" - ] - } - ], - "source": [ - "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", - " for i in range(5):\n", - " line = file.readline()\n", - " print(line.strip())" - ] - }, - { - "cell_type": "markdown", - "id": "b058714f-e434-4367-89b9-74c129ac727f", - "metadata": {}, - "source": [ - "## `splits.csv` File\n", - "\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsplit
033429train
130151train
217051train
332129train
430340train
\n", - "
" - ], - "text/plain": [ - " id split\n", - "0 33429 train\n", - "1 30151 train\n", - "2 17051 train\n", - "3 32129 train\n", - "4 30340 train" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", - "csv_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", - "metadata": {}, - "source": [ - "# 5. Example Molecule: Different Encodings\n", - "\n", - "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n", - "\n", - "### Explanation:\n", - "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", - "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", - "\n", - "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n", - "\n", - "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n", - " - **Benzene SMILES**: `c1ccccc1`\n", - " - **Explanation**: \n", - " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n", - "\n", - "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n", - " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n", - " - **Explanation**: \n", - " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n", - " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n", - "\n", - "### 3. 
**InChI (IUPAC International Chemical Identifier)**\n", - " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\n", - " - **Explanation**: \n", - " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\n", - "\n", - "### 4. **InChIKey**\n", - " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\n", - " - **Explanation**: \n", - " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\n", - "\n", - "### 5. **Canonical SMILES**\n", - " - **Benzene Canonical SMILES**: `c1ccccc1`\n", - " - **Explanation**:\n", - " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\n", - "\n", - "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\n", - " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\n", - " - **Explanation**: \n", - " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\n", - "\n", - "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." - ] - }, - { - "cell_type": "markdown", - "id": "93e328cf-09f9-4694-b175-28320590937d", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde", - "metadata": {}, - "source": [ - "# Information for Protein Dataset\n", - "\n", - "The protein dataset follows thsimilarme file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\n", - "\n", - "### Configuration Parameters\n", - "\n", - "Data classes related to proteins can be configured using the following main parameters:\n", - "\n", - "- **`go_branch (str)`**: The Gene Ontology (GO) branch. 
The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", - "\n", - "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\n", - "\n", - "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n", - "\n", - "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\n", - "\n", - "### Available GOUniProt Data Classes\n", - "\n", - "#### `GOUniProtOver250`\n", - "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", - "\n", - "#### `GOUniProtOver50`\n", - "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", - "\n", - "### Instantiation Example\n", - "\n", - "```python\n", - "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\n", - "go_class = GOUniProtOver250()\n" - ] - }, - { - "cell_type": "markdown", - "id": "2ffca830-bc0b-421c-8054-0860c95c10f2", - "metadata": {}, - "source": [ - "## GOUniProt Data File Structure\n", - "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", - " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", - " - **File Paths**:\n", - " - `data/GO_UniProt/raw/${filename}.obo`\n", - " - `data/GO_UniProt/raw/${filename}.dat`\n", - "\n", - "2. 
**`data.pkl`**\n", - " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", - "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "4. **`classes.txt`**\n", - " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", - "\n", - "5. **`splits.csv`**\n", - " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", - "\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. 
Otherwise, it will just be `${dataset_name}`.\n" - ] - }, - { - "cell_type": "markdown", - "id": "61bc261e-2328-4968-aca6-14c48bb24348", - "metadata": {}, - "source": [ - "## data.pkl" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "id": "31df4ee7-4c03-4ea2-9798-5e5082a74c2b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (27459, 1050)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
swiss_idaccessiongo_idssequence4175122165209226...2000145200014620001472000241200024320003772001020200114120012332001234
814331_ARATHP42643,Q945M2,Q9M0S7[19222]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1014331_MAIZEP49106[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1314332_MAIZEQ01526[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1414333_ARATHP42644,F4KBI7,Q945L2[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", - "

5 rows × 1050 columns

\n", - "
" - ], - "text/plain": [ - " swiss_id accession \\\n", - "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", - "9 14331_CAEEL P41932,Q21537 \n", - "10 14331_MAIZE P49106 \n", - "13 14332_MAIZE Q01526 \n", - "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", - "\n", - " go_ids \\\n", - "8 [19222] \n", - "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", - "10 [3677, 5634, 10468, 44877] \n", - "13 [3677, 5634, 10468, 44877] \n", - "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", - "\n", - " sequence 41 75 122 \\\n", - "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", - "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", - "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", - "\n", - " 165 209 226 ... 2000145 2000146 2000147 2000241 2000243 \\\n", - "8 False False False ... False False False False False \n", - "9 False False False ... False False False False False \n", - "10 False False False ... False False False False False \n", - "13 False False False ... False False False False False \n", - "14 False False False ... 
False False False False False \n", - "\n", - " 2000377 2001020 2001141 2001233 2001234 \n", - "8 False False False False False \n", - "9 False False False False False \n", - "10 False False False False False \n", - "13 False False False False False \n", - "14 False False False False False \n", - "\n", - "[5 rows x 1050 columns]" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "be0078fd-bcf1-4d4c-b8c6-c84e3aeac99c", - "metadata": {}, - "source": [ - "## data.pt" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "id": "a70f9c35-daca-4728-a9ea-b1212866f421", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n", - "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 
14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n" - ] - } - ], - "source": [ - "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n", - "print(\"Type of loaded data:\", type(data_pt))\n", - "for i in range(1):\n", - " print(data_pt[i])" - ] - }, - { - "cell_type": "markdown", - "id": "380049c1-2963-4223-b698-a7b59b9fe595", - "metadata": {}, - "source": [ - "## Protein Representation Using Amino Acid Sequence Notation\n", - "\n", - "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", - "\n", - "### Example Protein Sequence\n", - "\n", - "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", - "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", - "\n", - "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", - "- **Sequence Length**: 147\n", - "\n", - "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", - "\n", - "### The 20 Amino Acids and Their One-Letter Notations\n", - "\n", - "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", - "\n", - "| One-Letter Notation | Amino Acid Name | Description |\n", - "|---------------------|----------------------|---------------------------------------------------------|\n", - "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", - "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. 
|\n", - "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **F** | Phenylalanine | Aromatic, non-polar. |\n", - "| **G** | Glycine | Smallest amino acid, non-polar. |\n", - "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", - "| **I** | Isoleucine | Non-polar, aliphatic. |\n", - "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", - "| **L** | Leucine | Non-polar, aliphatic. |\n", - "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", - "| **N** | Asparagine | Polar, uncharged. |\n", - "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", - "| **Q** | Glutamine | Polar, uncharged. |\n", - "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", - "| **S** | Serine | Polar, can be phosphorylated. |\n", - "| **T** | Threonine | Polar, can be phosphorylated. |\n", - "| **V** | Valine | Non-polar, aliphatic. |\n", - "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", - "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. |\n", - "\n", - "### Understanding Protein Sequences\n", - "\n", - "In the example sequence, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", - "\n", - "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", - "\n", - "\n", - "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" - ] - }, - { - "cell_type": "markdown", - "id": "702359d6-5338-4391-b196-2328ba5676a1", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (env_chebai)", - "language": "python", - "name": "env_chebai" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From c6b8d5071b16e99c9b379304ddb22829af9840cf Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 23:35:07 +0200 Subject: [PATCH 053/112] add info on evidence codes + uniprot.data file + changes --- tutorials/data_exploration_go.ipynb | 436 +++++++++++++++++++++++++--- 1 file changed, 402 insertions(+), 34 deletions(-) diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb index 391192a1..2c789ae6 100644 --- a/tutorials/data_exploration_go.ipynb +++ b/tutorials/data_exploration_go.ipynb @@ -18,8 +18,6 @@ "metadata": {}, "cell_type": "markdown", "source": [ - "# Information for Protein Dataset\n", - "\n", "# 1. Instantiation of a Data Class\n", "\n", "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. 
This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n", @@ -71,31 +69,80 @@ "id": "605bbca601037df2" }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:25:03.920610Z", + "start_time": "2024-09-30T21:25:03.622407Z" + } + }, "cell_type": "code", "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250", "id": "440f203ceaf7e4b7", "outputs": [], - "execution_count": null + "execution_count": 12 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-09-30T14:08:21.236447Z", - "start_time": "2024-09-30T14:08:21.130242Z" + "end_time": "2024-09-30T21:25:08.863132Z", + "start_time": "2024-09-30T21:25:08.387739Z" } }, "cell_type": "code", "source": "go_class = GOUniProtOver250()", "id": "a648346d81d0dc5e", "outputs": [], - "execution_count": 2 + "execution_count": 13 }, { "metadata": {}, "cell_type": "markdown", "source": [ - "## GOUniProt Data File Structure\n", + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required GOUniProt data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. 
This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." + ], + "id": "2328e824c4dafb2d" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ], + "id": "9f77351090560bc4", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# 3. GOUniProt Data File Structure\n", "\n", "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", @@ -123,54 +170,225 @@ ], "id": "ee174b61b36c71aa" }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "# 4. Information Stored in the Files", + "id": "3f92b58e460c08fd" + }, { "metadata": {}, "cell_type": "markdown", "source": [ - "# 2. Preparation / Setup Methods\n", + "## go-basic.obo\n", "\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", - "### Automatic Execution: \n", - "These methods are executed automatically within the data class instance. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", "\n", + "### Structure of `go-basic.obo`\n", "\n", - "### Why is Preparation Needed?\n", + "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", "\n", - "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "#### Example of a Term Document\n", "\n", - "### Main Methods for Data Preprocessing\n", + "```plaintext\n", + "[Term]\n", + "id: GO:0000032\n", + "name: cell wall mannoprotein biosynthetic process\n", + "namespace: biological_process\n", + "def: \"The chemical reactions and pathways resulting in the formation of cell wall mannoproteins, any cell wall protein that contains covalently bound mannose residues.\" [GOC:ai]\n", + "synonym: \"cell wall mannoprotein anabolism\" EXACT []\n", + "is_a: GO:0006057 ! mannoprotein biosynthetic process\n", + "is_a: GO:0031506 ! 
cell wall glycoprotein biosynthetic process\n", + "```\n", "\n", - "The data preprocessing in a data class involves two main methods:\n", + "### Breakdown of Attributes\n", "\n", - "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct biological process, molecular function, or cellular component.\n", "\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." 
+ "- **`id: GO:0000032`**: \n", + " - **Description**: A unique identifier for the biological term within the GO ontology.\n", + " - **Example**: `GO:0000032` refers to the term \"cell wall mannoprotein biosynthetic process.\"\n", + "\n", + "- **`name: cell wall mannoprotein biosynthetic process`**: \n", + " - **Description**: The name of the biological process, molecular function, or cellular component being described.\n", + " - **Example**: The name \"cell wall mannoprotein biosynthetic process\" is a descriptive label for the GO term with the identifier `GO:0000032`.\n", + "\n", + "- **`namespace: biological_process`**: \n", + " - **Description**: Specifies which ontology the term belongs to. The main namespaces are `biological_process`, `molecular_function`, and `cellular_component`.\n", + "\n", + "- **`is_a: GO:0006057`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", + " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" ], - "id": "2328e824c4dafb2d" + "id": "cca75d881cb8bade" }, { "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, + "cell_type": "markdown", "source": [ - "go_class.prepare_data()\n", - "go_class.setup()" + "## uniprot_sprot.dat\n", + "\n", + "The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. 
Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", + "\n", + "\n", + "## Structure of `uniprot_sprot.dat`\n", + "\n", + "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", + "\n", + "### Example of a Protein Entry\n", + "\n", + "```plaintext\n", + "ID 002L_FRG3G Reviewed; 320 AA.\n", + "AC Q6GZX3;\n", + "DT 28-JUN-2011, integrated into UniProtKB/Swiss-Prot.\n", + "DT 19-JUL-2004, sequence version 1.\n", + "DT 08-NOV-2023, entry version 46.\n", + "DE RecName: Full=Uncharacterized protein 002L;\n", + "GN ORFNames=FV3-002L;\n", + "OS Frog virus 3 (isolate Goorha) (FV-3).\n", + "OC Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes;\n", + "OX NCBI_TaxID=654924;\n", + "OH NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).\n", + "RN [1]\n", + "RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n", + "RX PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;\n", + "RA Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;\n", + "RT \"Comparative genomic analyses of frog virus 3, type species of the genus\n", + "RT Ranavirus (family Iridoviridae).\";\n", + "RL Virology 323:70-84(2004).\n", + "CC -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane\n", + "CC protein {ECO:0000305}.\n", + "DR EMBL; AY548484; AAT09661.1; -; Genomic_DNA.\n", + "DR RefSeq; YP_031580.1; NC_005946.1.\n", + "DR GeneID; 2947774; -.\n", + "DR KEGG; vg:2947774; -.\n", + "DR Proteomes; UP000008770; Segment.\n", + "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", + "DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.\n", + "PE 4: Predicted;\n", + "KW Host membrane; Membrane; Reference proteome; Transmembrane;\n", + "KW Transmembrane 
helix.\n", + "FT CHAIN 1..320\n", + "FT /note=\"Uncharacterized protein 002L\"\n", + "FT /id=\"PRO_0000410509\"\n", + "SQ SEQUENCE 320 AA; 34642 MW; 9E110808B6E328E0 CRC64;\n", + " MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR\n", + " IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL\n", + " AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC\n", + " KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML\n", + " DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK\n", + " VMFFVAGAVL VAILISTVRW\n", + "//\n", + "```\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. Here's a breakdown of the key attributes:\n", + "\n", + "- **`ID`**: \n", + " - **Description**: Contains the unique identifier for the protein and its status (e.g., `Reviewed` indicates the sequence has been manually curated).\n", + " - **Example**: `002L_FRG3G` is the identifier for the protein from Frog virus 3.\n", + "\n", + "- **`AC`**: \n", + " - **Description**: Accession number, a unique identifier for the protein sequence.\n", + " - **Example**: `Q6GZX3` is the accession number for this entry.\n", + "\n", + "- **`DR`**: \n", + " - **Description**: Cross-references to other databases like EMBL, RefSeq, KEGG, and GeneID.\n", + " - **Example**: This entry is cross-referenced with the EMBL database, RefSeq, GO, etc.\n", + "\n", + "- **`GO`**: \n", + " - **Description**: Gene Ontology annotations that describe the cellular component, biological process, or molecular function associated with the protein.\n", + " - **Example**: The protein is associated with the GO terms `GO:0033644` (host cell membrane) and `GO:0016020` (membrane).\n", + "\n", + "- **`SQ`**: \n", + " - **Description**: The amino acid sequence of the protein.\n", + " - **Example**: The sequence consists of 320 amino 
acids.\n", + "\n", + "The `uniprot_sprot.dat` file is an extensively curated resource, containing comprehensive protein data used for various bioinformatics applications.\n", + "\n", + "__Note__: For more detailed information, refer to [this document](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt\n", + "). \n", + "\n", + "Consider the line below from the example above: \n", + "```plaintext\n", + "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", + "```\n", + "\n", + "The line contains a **Gene Ontology (GO) annotation** describing the protein's subcellular location. Here's a detailed breakdown:\n", + "\n", + "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", + "\n", + "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", + "\n", + "### More on GO Evidence Codes\n", + "\n", + "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation.
Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n", + "\n", + "| **Evidence Code** | **Description** |\n", + "|-------------------|-----------------|\n", + "| **EXP** | Inferred from Experiment |\n", + "| **IDA** | Inferred from Direct Assay |\n", + "| **IPI** | Inferred from Physical Interaction |\n", + "| **IMP** | Inferred from Mutant Phenotype |\n", + "| **IGI** | Inferred from Genetic Interaction |\n", + "| **IEP** | Inferred from Expression Pattern |\n", + "| **TAS** | Traceable Author Statement |\n", + "| **IC** | Inferred by Curator |\n", + "| **IEA** | Inferred from Electronic Annotation (Computational) |\n", + "| **ISS** | Inferred from Sequence or Structural Similarity |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **ISM** | Inferred from Sequence Model |\n", + "| **ISO** | Inferred from Sequence Orthology |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **RCA** | Inferred from Reviewed Computational Analysis |\n", + "| **NAS** | Non-traceable Author Statement |\n", + "| **ND** | No Biological Data Available (placeholder) |\n", + "| **NR** | Not Recorded |\n", + "\n", + "\n", + "### Grouping of Codes:\n", + "\n", + "- **Experimental Evidence Codes**: \n", + " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", + " \n", + "- **Author/Curator Inferred Codes**:\n", + " - **TAS**, **IC**, **NAS**\n", + "\n", + "- **Computational Evidence Codes**:\n", + " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", + "\n", + "- **Others**:\n", + " - **ND** (No Data), **NR** (Not Recorded)\n", + "\n", + "\n", + "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation." 
], - "id": "9f77351090560bc4" + "id": "87c841de7d80beef" }, { "metadata": {}, "cell_type": "markdown", - "source": "## data.pkl", + "source": [ + "## data.pkl\n", + "\n", + "The `data.pkl` file, generated during the preprocessing stage, contains the processed GO data in a dataframe format. Below is an example of how this data is structured:\n", + "\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` has the following structure: \n", + "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", + "- **Column 1**: Contains the accession of each Protein data instance.\n", + "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", + "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", + "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ], "id": "735844f0b2474ad6" }, { @@ -427,7 +645,20 @@ { "metadata": {}, "cell_type": "markdown", - "source": "## data.pt", + "source": [ + "## data.pt\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance.
These identifiers help track and reference the individual samples in the dataset.\n" + ], "id": "2c9f23883c66b48d" }, { @@ -470,6 +701,143 @@ ], "execution_count": 11 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## `classes.txt` File\n", + "\n", + "The `classes.txt` file lists the selected GO classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique GO class ID (the numeric part of a GO term identifier), identifying a specific Gene Ontology term that is used as a label.\n", + "\n", + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ], + "id": "f69012b3540fd1b6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:30:34.344202Z", + "start_time": "2024-09-30T21:30:34.328318Z" + } + }, + "cell_type": "code", + "source": [ + "with open(r\"data/GO_UniProt/GO250_BP/processed/classes.txt\", \"r\") as file:\n", + " for i in range(5):\n", + " line = file.readline()\n", + " print(line.strip())" + ], + "id": "19200f7ff9a6ebba", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "41\n", + "75\n", + "122\n", + "165\n", + "209\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## `splits.csv` File\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs."
+ ], + "id": "6661dc11247e9753" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:30:41.586616Z", + "start_time": "2024-09-30T21:30:39.318598Z" + } + }, + "cell_type": "code", + "source": [ + "csv_df = pd.read_csv(r\"data/GO_UniProt/GO250_BP/processed/splits.csv\")\n", + "csv_df.head()" + ], + "id": "88c3ea8f01ba9fac", + "outputs": [ + { + "data": { + "text/plain": [ + " id split\n", + "0 14331_ARATH train\n", + "1 14331_CAEEL train\n", + "2 14331_MAIZE train\n", + "3 14332_MAIZE train\n", + "4 14333_ARATH train" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
014331_ARATHtrain
114331_CAEELtrain
214331_MAIZEtrain
314332_MAIZEtrain
414333_ARATHtrain
\n", + "
" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 16 + }, { "metadata": {}, "cell_type": "markdown", From 4c55b04890861c063370345d3f7f0cc169ec88c5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 23:51:36 +0200 Subject: [PATCH 054/112] minor formatting changes --- tutorials/data_exploration_chebi.ipynb | 1 - tutorials/data_exploration_go.ipynb | 129 ++++++++++++++++--------- 2 files changed, 86 insertions(+), 44 deletions(-) diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb index 17c3ae33..6ddd3238 100644 --- a/tutorials/data_exploration_chebi.ipynb +++ b/tutorials/data_exploration_chebi.ipynb @@ -291,7 +291,6 @@ "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", "is_a: CHEBI:24870\n", "is_a: CHEBI:33238\n", - "is_a: CHEBI:3323Relevant 8\n", "```\n", "\n", "### Breakdown of Attributes\n", diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb index 2c789ae6..8dc4cb44 100644 --- a/tutorials/data_exploration_go.ipynb +++ b/tutorials/data_exploration_go.ipynb @@ -94,6 +94,12 @@ "outputs": [], "execution_count": 13 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "651ab5c39833bd2c" + }, { "metadata": {}, "cell_type": "markdown", @@ -138,6 +144,12 @@ "outputs": [], "execution_count": null }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "db5b58f2d96823fc" + }, { "metadata": {}, "cell_type": "markdown", @@ -170,6 +182,12 @@ ], "id": "ee174b61b36c71aa" }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "a927ad484c930960" + }, { "metadata": {}, "cell_type": "markdown", @@ -323,49 +341,7 @@ "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", "\n", "- **`IEA`**: This stands for **Inferred from 
Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", - "\n", - "### More on GO Evidence Codes\n", - "\n", - "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n", - "\n", - "| **Evidence Code** | **Description** |\n", - "|-------------------|-----------------|\n", - "| **EXP** | Inferred from Experiment |\n", - "| **IDA** | Inferred from Direct Assay |\n", - "| **IPI** | Inferred from Physical Interaction |\n", - "| **IMP** | Inferred from Mutant Phenotype |\n", - "| **IGI** | Inferred from Genetic Interaction |\n", - "| **IEP** | Inferred from Expression Pattern |\n", - "| **TAS** | Traceable Author Statement |\n", - "| **IC** | Inferred by Curator |\n", - "| **IEA** | Inferred from Electronic Annotation (Computational) |\n", - "| **ISS** | Inferred from Sequence or Structural Similarity |\n", - "| **ISA** | Inferred from Sequence Alignment |\n", - "| **ISM** | Inferred from Sequence Model |\n", - "| **ISO** | Inferred from Sequence Orthology |\n", - "| **ISA** | Inferred from Sequence Alignment |\n", - "| **RCA** | Inferred from Reviewed Computational Analysis |\n", - "| **NAS** | Non-traceable Author Statement |\n", - "| **ND** | No Biological Data Available (placeholder) |\n", - "| **NR** | Not Recorded |\n", - "\n", - "\n", - "### Grouping of Codes:\n", - "\n", - "- **Experimental Evidence Codes**: \n", - " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", - " \n", - "- **Author/Curator Inferred Codes**:\n", - " - **TAS**, **IC**, **NAS**\n", - "\n", - "- **Computational 
Evidence Codes**:\n", - " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", - "\n", - "- **Others**:\n", - " - **ND** (No Data), **NR** (Not Recorded)\n", - "\n", - "\n", - "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation." + "\n" ], "id": "87c841de7d80beef" }, @@ -838,6 +814,12 @@ ], "execution_count": 16 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "e6b1f184a5091b83" + }, { "metadata": {}, "cell_type": "markdown", @@ -893,6 +875,67 @@ "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" ], "id": "481b8c0271ec9636" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "db6d7f2cc446e6f9" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## More on GO Evidence Codes\n", + "\n", + "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. 
Here's a list of both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n", + "\n", + "| **Evidence Code** | **Description** |\n", + "|-------------------|-----------------|\n", + "| **EXP** | Inferred from Experiment |\n", + "| **IDA** | Inferred from Direct Assay |\n", + "| **IPI** | Inferred from Physical Interaction |\n", + "| **IMP** | Inferred from Mutant Phenotype |\n", + "| **IGI** | Inferred from Genetic Interaction |\n", + "| **IEP** | Inferred from Expression Pattern |\n", + "| **TAS** | Traceable Author Statement |\n", + "| **IC** | Inferred by Curator |\n", + "| **IEA** | Inferred from Electronic Annotation (Computational) |\n", + "| **ISS** | Inferred from Sequence or Structural Similarity |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **ISM** | Inferred from Sequence Model |\n", + "| **ISO** | Inferred from Sequence Orthology |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **RCA** | Inferred from Reviewed Computational Analysis |\n", + "| **NAS** | Non-traceable Author Statement |\n", + "| **ND** | No Biological Data Available (placeholder) |\n", + "| **NR** | Not Recorded |\n", + "\n", + "\n", + "### Grouping of Codes:\n", + "\n", + "- **Experimental Evidence Codes**: \n", + " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", + " \n", + "- **Author/Curator Inferred Codes**:\n", + " - **TAS**, **IC**, **NAS**\n", + "\n", + "- **Computational Evidence Codes**:\n", + " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", + "\n", + "- **Others**:\n", + " - **ND** (No Data), **NR** (Not Recorded)\n", + "\n", + "\n", + "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation."
+ ], + "id": "7f42b928364e5cd1" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "1c11d6f520b02434" } ], "metadata": { From 1a32757addfd29185d504dd6d56d4ae869b3e1dc Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 1 Oct 2024 11:03:26 +0200 Subject: [PATCH 055/112] Separate tokens.txt files for each n-gram --- chebai/preprocessing/datasets/go_uniprot.py | 10 ---------- chebai/preprocessing/reader.py | 6 ++++-- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py index 574ecdbd..c59b3d4a 100644 --- a/chebai/preprocessing/datasets/go_uniprot.py +++ b/chebai/preprocessing/datasets/go_uniprot.py @@ -563,16 +563,6 @@ def base_dir(self) -> str: """ return os.path.join("data", f"GO_UniProt") - @property - def identifier(self) -> tuple: - """Identifier for the dataset.""" - # overriding identifier instead of reader.name to keep same tokens.txt file, but different processed_dir folder - if not isinstance(self.reader, dr.ProteinDataReader): - raise ValueError("Need Protein DataReader for identifier") - if self.reader.n_gram is not None: - return (f"{self.reader.name()}_{self.reader.n_gram}_gram",) - return (self.reader.name(),) - @property def raw_file_names_dict(self) -> dict: """ diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py index 46cd558a..e220e1e4 100644 --- a/chebai/preprocessing/reader.py +++ b/chebai/preprocessing/reader.py @@ -372,14 +372,16 @@ class ProteinDataReader(DataReader): "V", ] - @classmethod - def name(cls) -> str: + def name(self) -> str: """ Returns the name of the data reader. This method identifies the specific type of data reader. Returns: str: The name of the data reader, which is "protein_token". 
""" + if self.n_gram is not None: + return f"protein_token_{self.n_gram}_gram" + return "protein_token" def __init__(self, *args, n_gram: Optional[int] = None, **kwargs): From 33a5e64a1a904b00eec1df3f1bce93f499e4fa2c Mon Sep 17 00:00:00 2001 From: sfluegel Date: Tue, 1 Oct 2024 14:43:21 +0200 Subject: [PATCH 056/112] move commands to the top, restructure section 2 --- tutorials/data_exploration_chebi.ipynb | 162 +++++++++++-------------- 1 file changed, 69 insertions(+), 93 deletions(-) diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb index 6ddd3238..6a7e25ed 100644 --- a/tutorials/data_exploration_chebi.ipynb +++ b/tutorials/data_exploration_chebi.ipynb @@ -1,30 +1,58 @@ { "cells": [ { - "cell_type": "markdown", - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", "metadata": {}, + "cell_type": "markdown", "source": [ "# Introduction\n", "\n", - "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the ChEBI dataset. It covers\n", + "- how to instantiate a data class and generate data\n", + "- how the data is processed and stored\n", + "- and how to work with different molecule encodings.\n", "\n", - "One key aspect of the package is its **dataset management system**. 
In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", "\n", "---\n" - ] + ], + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b" }, { - "cell_type": "markdown", - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", "metadata": {}, + "cell_type": "markdown", "source": [ "# 1. Instantiation of a Data Class\n", "\n", - "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data." 
+ ], + "id": "4550d01fc7af5ae4" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": 18, + "source": "from chebai.preprocessing.datasets.chebi import ChEBIOver50", + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22" + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": {}, + "outputs": [], + "source": [ + "chebi_class = ChEBIOver50(chebi_version=231)" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "\n", "### Inheritance Hierarchy\n", "\n", - "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", + "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L598), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L23). Specifically:\n", "\n", "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", "\n", @@ -33,8 +61,8 @@ "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", "\n", "\n", - "### Explanation\n", - "A ChEBI data class can be configured with the following main parameters:\n", + "### Input parameters\n", + "A ChEBI data class can be configured with a range of parameters, including:\n", "\n", "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", "\n", @@ -45,87 +73,64 @@ "### Additional Input Parameters\n", "\n", "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" - ] + ], + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d" }, { - "cell_type": "markdown", - "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", "metadata": {}, + "cell_type": "markdown", "source": [ "# Available ChEBI Data Classes\n", "\n", "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py):\n", "\n", - "## `ChEBIOver100`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", - "\n", - "## 
`ChEBIOver50`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", + "There is a range of available dataset classes for ChEBI. Usually, you want to use `ChEBIOver100` or `ChEBIOver50`. The number indicates the threshold for selecting label classes: ChEBI classes which have at least 100 / 50 SMILES-annotated subclasses will be used as labels.\n", "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "Both inherit from `ChEBIOverX`. If you need a different threshold, you can create your own subclass. By default, `ChEBIOverX` uses the SMILES encoding (see Section 5). The other implemented encodings are SELFIES and DeepSMILES, used by the classes `ChEBIOverXSELFIES` and `ChEBIOverXDeepSMILES`, respectively. \n", + "They also have subclasses for different thresholds (`ChEBIOver50SELFIES`, `ChEBIOver100SELFIES`, `ChEBIOver100DeepSMILES`).\n", "\n", - "## `ChEBIOver100DeepSMILES`\n", - "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", - "\n", - "## `ChEBIOver100SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", - "\n", - "## `ChEBIOver50SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", - "\n", - "## `ChEBIOver50Partial`\n", - "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" - ] + "Finally, `ChEBIOver50Partial` selects extracts a part of ChEBI based on a given top class, with a 
threshold of 50 for selecting labels.\n", + "This class inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" + ], + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a" }, { - "cell_type": "code", - "execution_count": 18, - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "cell_type": "markdown", + "id": "8456b545-88c5-401d-baa5-47e8ae710f04", "metadata": {}, - "outputs": [], "source": [ - "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + "---" ] }, { - "cell_type": "code", - "execution_count": 20, - "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", "metadata": {}, - "outputs": [], + "cell_type": "markdown", "source": [ - "chebi_class = ChEBIOver50(chebi_version=231)" - ] + "# 2. Preparation / Setup Methods\n", + "\n", + "Now we have a ChEBI data class with all the relevant parameters. Next, we need to generate the actual dataset." + ], + "id": "ed973fb59df11849" }, { - "cell_type": "markdown", - "id": "8456b545-88c5-401d-baa5-47e8ae710f04", "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "---" - ] + "chebi_class.prepare_data()\n", + "chebi_class.setup()" + ], + "id": "d0a58e2bd9c0e6d9" }, { "cell_type": "markdown", "id": "1655d489-25fe-46de-9feb-eeca5d36936f", "metadata": {}, "source": [ - "# 2. Preparation / Setup Methods\n", "\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", "### Automatic Execution: \n", - "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", - "\n", + "These methods are executed automatically when using the training command `chebai fit`. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", "\n", "### Why is Preparation Needed?\n", "\n", @@ -137,46 +142,17 @@ "The data preprocessing in a data class involves two main methods:\n", "\n", "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels. This step is independent of input encodings and all chemicals are stored as SMILES strings.\n", " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", "\n", "2. **`setup` Method**:\n", " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), tokenizing the input according to the specified encoding. 
The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the tokenization.\n", " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", "\n", "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." ] }, - { - "cell_type": "code", - "execution_count": 36, - "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", - "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", - "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" - ] - } - ], - "source": [ - "chebi_class.prepare_data()\n", - "chebi_class.setup()" - ] - }, { "cell_type": "markdown", "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", @@ -202,7 +178,7 @@ " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", "\n", "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", "\n", "3. 
**`data.pt`**\n", From 016134f815c810f989566b94759514588cd09e02 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 1 Oct 2024 20:33:02 +0200 Subject: [PATCH 057/112] Obsolete terms being the parent of valid terms --- tests/unit/mock_data/ontology_mock_data.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 40d9674e..0c713334 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -532,12 +532,21 @@ def get_obsolete_nodes_ids() -> Set[int]: @staticmethod def get_GO_raw_data() -> str: """ - Get raw data in string format for GO ontology. + Get raw data in string format for a basic Gene Ontology (GO) structure. - This data simulates a basic GO ontology in a format typically used for testing. + This data simulates a basic GO ontology format typically used for testing purposes. + The data will include valid and obsolete GO terms with various relationships between them. + + Scenarios covered: + - Obsolete terms being the parent of valid terms. + - Valid terms being the parent of obsolete terms. + - Both direct and indirect hierarchical relationships between terms. + + The data is designed to help test the proper handling of obsolete and valid GO terms, + ensuring that the ontology parser can correctly manage both cases. Returns: - str: The raw GO data in string format. + str: The raw GO data in string format, structured as test input. """ return """ [Term] @@ -557,6 +566,7 @@ def get_GO_raw_data() -> str: name: GO_2 namespace: biological_process is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds + is_a: GO:0000008 ! hydrolase activity, hydrolyzing O-glycosyl compounds [Term] id: GO:0000003 @@ -594,7 +604,6 @@ def get_GO_raw_data() -> str: id: GO:0000008 name: GO_8 namespace: molecular_function - is_a: GO:0000001 ! 
glucoside transport is_obsolete: true [Typedef] From 582b528cb950344892d4959acc66b126c950ab6c Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 1 Oct 2024 20:40:17 +0200 Subject: [PATCH 058/112] remove `g.has_node(q["id"])` - https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142 --- chebai/preprocessing/datasets/chebi.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 7d53e831..727f9f64 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -260,13 +260,9 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: g.add_node(n["id"], **n) # Only take the edges which connects the existing nodes, to avoid internal creation of obsolete nodes + # https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142 g.add_edges_from( - [ - (p, q["id"]) - for q in elements - for p in q["parents"] - if g.has_node(p) and g.has_node(q["id"]) - ] + [(p, q["id"]) for q in elements for p in q["parents"] if g.has_node(p)] ) print("Compute transitive closure") From 4b39bbbcee268099ea393ee692c1a8d10b70a630 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 1 Oct 2024 20:54:16 +0200 Subject: [PATCH 059/112] for ngram, truncate sequence to adhere to max no of AA --- chebai/preprocessing/datasets/go_uniprot.py | 22 ++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py index c59b3d4a..fd55d45d 100644 --- a/chebai/preprocessing/datasets/go_uniprot.py +++ b/chebai/preprocessing/datasets/go_uniprot.py @@ -80,6 +80,12 @@ def __init__(self, **kwargs): self.max_sequence_length >= 1 ), "Max sequence length should be greater than or equal to 1." 
+ if self.reader.n_gram is not None: + assert self.max_sequence_length >= self.reader.n_gram, ( + f"max_sequence_length ({self.max_sequence_length}) must be greater than " + f"or equal to n_gram ({self.reader.n_gram})." + ) + @classmethod def _get_go_branch(cls, **kwargs) -> str: """ @@ -536,7 +542,8 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader: This method overrides the dataloader method from the superclass. After fetching the dataset from the superclass, it truncates the 'features' of each data instance to a maximum length specified by - `self.max_sequence_length`. + `self.max_sequence_length`. The truncation is adjusted based on the value of `n_gram` to ensure that + the correct number of amino acids is preserved in the truncated sequences. Args: kind (str): The kind of data to load (e.g., 'train', 'val', 'test'). @@ -547,9 +554,18 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader: """ dataloader = super().dataloader(kind, **kwargs) - # Truncate the 'features' to max_sequence_length for each instance + if self.reader.n_gram is None: + # Truncate the 'features' to max_sequence_length for each instance + truncate_index = self.max_sequence_length + else: + # If n_gram is given, adjust truncation to ensure maximum sequence length refers to the maximum number of + # amino acids in sequence rather than number of n-grams. Eg, Sequence "ABCDEFGHIJ" can form 8 trigrams, + # if max length is 5, then only first 3 trigrams should be considered as they are formed by first 5 letters. 
+ truncate_index = self.max_sequence_length - (self.reader.n_gram - 1) + for instance in dataloader.dataset: - instance["features"] = instance["features"][: self.max_sequence_length] + instance["features"] = instance["features"][:truncate_index] + return dataloader # ------------------------------ Phase: Raw Properties ----------------------------------- From d7e80970bd0db90017141101e8e62a0f6876388a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 1 Oct 2024 20:54:49 +0200 Subject: [PATCH 060/112] 3-gram token.txt --- .../bin/protein_token_3_gram/tokens.txt | 8000 +++++++++++++++++ 1 file changed, 8000 insertions(+) create mode 100644 chebai/preprocessing/bin/protein_token_3_gram/tokens.txt diff --git a/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt b/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt new file mode 100644 index 00000000..69dca126 --- /dev/null +++ b/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt @@ -0,0 +1,8000 @@ +MAT +ATP +TPG +PGA +GAS +ASS +SSA +SAR +ARD +RDE +DEF +EFV +FVY +VYM +YMA +MAK +AKL +KLA +LAE +AEQ +EQA +QAE +AER +ERY +RYE +YEE +EEM +EMV +MVE +VEF +EFM +FME +MEK +EKV +KVA +VAK +AKA +KAV +AVD +VDK +DKD +KDE +DEL +ELT +LTV +TVE +VEE +EER +ERN +RNL +NLL +LLS +LSV +SVA +VAY +AYK +YKN +KNV +NVI +VIG +IGA +GAR +ARR +RRA +RAS +ASW +SWR +WRI +RII +IIS +ISS +SSI +SIE +IEQ +EQK +QKE +KEE +EES +ESR +SRG +RGN +GND +NDD +DDH +DHV +HVS +VSL +SLI +LIR +IRD +RDY +DYR +YRS +RSK +SKI +KIE +IET +ETE +TEL +ELS +LSD +SDI +DIC +ICD +CDG +DGI +GIL +ILK +LKL +KLL +LLD +LDT +DTI +TIL +ILV +LVP +VPA +PAA +AAA +AAS +ASG +SGD +GDS +DSK +SKV +KVF +VFY +FYL +YLK +LKM +KMK +MKG +KGD +GDY +DYH +YHR +HRY +RYL +YLA +AEF +EFK +FKS +KSG +SGQ +GQE +QER +ERK +RKD +KDA +DAA +AAE +AEH +EHT +HTL +TLT +LTA +TAY +YKA +KAA +AAQ +AQD +QDI +DIA +IAN +ANS +NSE +SEL +ELA +LAP +APT +PTH +THP +HPI +PIR +IRL +RLG +LGL +GLA +LAL +ALN +LNF +NFS +FSV +SVF +FYY +YYE +YEI +EIL +ILN +LNS +NSP +SPD +PDR +DRA +RAC +ACN +CNL +NLA +LAK +AKQ +KQA +QAF +AFD +FDE +DEA 
+EAI +AIA +IAE +AEL +ELD +DTL +TLG +LGE +GEE +ESY +SYK +YKD +KDS +DST +STL +TLI +LIM +IMQ +MQL +QLL +LLR +LRD +RDN +DNL +NLT +LTL +TLW +LWT +WTS +TSD +SDM +DMQ +MQD +QDD +DDV +DVA +VAD +ADD +DDI +DIK +IKE +KEA +EAA +AAP +APA +AAK +AKP +KPA +PAD +ADE +DEQ +EQQ +QQS +MSD +SDT +DTV +EEL +ELV +LVQ +VQR +QRA +RAK +RYD +YDD +DDM +DMA +MAA +AAM +AMK +MKK +KKV +KVT +VTE +TEQ +EQG +QGQ +QEL +LSN +SNE +NEE +NVV +VVG +VGA +RRS +RSS +SSW +WRV +RVI +VIS +QKT +KTE +TEG +EGS +GSE +SEK +EKK +KKQ +KQQ +QQL +QLA +AKE +KEY +EYR +YRV +RVK +VKV +KVE +VEQ +EQE +ELN +LND +NDI +ICQ +CQD +QDV +DVL +VLK +LDE +EFL +FLI +LIV +IVK +VKA +KAG +AGA +GAA +AES +ESK +DYY +YYR +YRY +AEV +EVA +VAS +ASE +SED +EDR +RAA +AAV +AVV +VVE +VEK +EKS +KSQ +SQK +QKA +KAY +AYQ +YQE +QEA +EAL +ALD +LDI +IAK +AKD +KDK +DKM +KMQ +MQP +QPT +LNT +NTP +TPE +PEH +EHA +HAC +ACQ +CQL +FDD +DDA +DAI +TLN +LNE +NED +EDS +DSY +SDV +DVG +GAE +AED +EDQ +DQE +QEQ +QEG +EGN +GNQ +NQE +EAG +AGN +MAS +ASA +SAE +LSR +SRE +REE +EEN +ENV +NVY +AKT +KTV +TVD +VDS +DSE +SEE +EEG +EGR +GRG +GNE +DRV +RVT +VTL +LIK +IKD +KDY +YRG +RGK +GKI +LTK +TKI +KIC +LLE +LET +ETH +THL +HLV +VPS +PSS +SST +STA +TAP +APE +PES +FKT +KTG +TGA +AEN +ENT +NTM +TMV +MVA +IAL +ALA +ACS +CSL +SLA +AIS +ISE +TLS +LSE +DIS +EDP +DPA +PAE +AEE +EEI +EIR +IRE +REA +EAP +APK +PKR +KRD +RDS +DSS +SSE +SEG +EGQ +LES +ESH +SHL +LLH +LHD +HDN +PKH +KHD +HDL +DLS +MST +STR +TRE +VDV +DVE +SVE +SKG +KGN +EDH +HVA +VAI +AII +IIK +IES +ESE +LSK +LNV +NVL +VLE +LEA +EAH +AHL +HLI +LIP +IPS +PSA +SAS +ASP +SPA +FKA +RKE +EST +TLV +LVA +YKS +KSA +ASD +IAT +ATA +TAE +DMT +MTD +TDE +AGD +GDE +DEI +EIK +EAS +ASK +SKP +KPD +PDG +DGA +MAE +RED +EDC +DCV +CVF +VFL +FLS +SKL +EQS +QSE +SER +YDE +DEM +MVQ +VQY +QYM +YMK +MKQ +KQV +QVA +VAA +AAL +NTE +IGS +GSR +SRR +IIT +ITS +TSL +SLE +LEQ +KEQ +QAK +AKG +NDK +DKH +KHV +HVE +VEI +EII +IKG +KGY +GYR +YRA +AKI +IED +EDE +AKY +KYC +YCD +CDD +LKV +KVI +VIK +KEN +ENL +LLP +LPN +PNA +NAS +AST +STS +TSE +SES +FYK +YKK +KKM +KME +MEG +EGD 
+RYY +YYA +YAE +EFT +FTV +VDE +DEK +EKR +KRQ +RQE +QEV +ADK +DKS +KSL +LAA +AAY +AYT +YTE +TEA +EAT +ATE +TEI +EIS +ISN +SNA +NAD +ADL +DLA +EIM +IMN +MND +NDA +DAD +DKA +KAC +DDS +DSI +SIA +KLD +DEV +EVP +VPE +ESS +SSY +DTA +TAD +DEE +AAT +ATL +LGR +GRD +RDQ +DQY +QYV +YVY +VQF +QFM +MEQ +EQL +QLV +LVT +VTG +GAT +TPA +GSL +SLR +LRA +AAW +AWR +RIV +IVS +VSS +SRK +RKN +KND +NDE +DEH +EHV +SLV +LVK +VKD +VES +LSS +SSV +SVC +VCS +CSG +SGI +LDS +DSH +SAG +RYM +DER +RKT +KTA +TAA +EDT +DTM +TML +MLA +LAY +IAA +AAD +ADM +MAP +NSS +SSD +SDK +CNM +NMA +AFE +FEE +EEA +MQE +EQM +QMD +MDE +ATT +TTL +SRD +LVS +VSG +SGA +PAG +AGE +GEL +KNE +EEH +VET +SIC +ICS +ILR +LRL +RLL +SAT +TAS +TMI +MIA +IAY +VAV +AVA +EKA +CSM +SMA +MTM +TMD +MDK +KSE +VQK +KAK +MKA +AVT +QGH +GHE +HEL +TER +RNE +NEK +QQM +QMG +MGK +GKE +YRE +REK +EKI +IEA +EAE +ELQ +LQD +ICN +CND +NDV +LEL +ELL +LDK +DKY +KYL +YLI +IPN +NAT +ATQ +TQP +QPE +DYF +YFR +FRY +YLS +SEV +GDN +DNK +NKQ +KQT +QTT +TTV +TVS +VSN +SNS +NSQ +SQQ +QQA +QAY +EAF +FEI +ISK +SKK +KKE +KEM +EMQ +SPE +PEK +TAF +SEN +ENQ +NQG +QGD +DEG +GDA +DAG +GEG +EGE +GEN +LIL +LNA +TQA +SGE +ENK +CSD +ATH +THA +HAE +MTE +ERE +REN +ENN +NNV +VYK +VEA +EAM +ASM +SMD +MDV +VEL +TSI +NKG +KGA +EEK +EKL +KLE +LEM +EMI +MIK +IKT +KTY +TYR +RGQ +GQV +QVE +EKE +KEL +ELR +RDI +DIL +LEK +EKH +KHL +IPC +PCA +CAT +ATS +TSG +GES +YYK +YKM +EFA +FAT +ATG +TGS +GSD +SDR +DRK +ENS +NSL +LIA +IAM +AMN +NDL +DLP +LPP +PPT +ACR +CRL +RLA +AAF +MQA +EEV +EVD +VDP +DPN +NAG +GDG +DGE +GEP +EPK +PKE +EQI +QIQ +IQD +VED +DQD +DVS +MDD +DDR +DRE +EDL +DLV +LVY +VYQ +YQA +ESM +SMK +VAG +AGM +GMD +KGG +GGE +GED +EDK +DKL +KLK +KMI +MIR +REY +YRQ +RQM +QMV +ELK +KLI +LIC +ICC +CCD +CDI +ILD +LDV +VLD +IPA +AAN +ANT +NTG +TGE +TGN +NDR +AMT +ELP +MQG +EEQ +EQN +QNK +NKE +ALQ +DEN +MGD +GDR +REQ +LLQ +LQR +RAR +ARL +SAM +NEP +EPL +PLS +DRN +KTM +TMA +MAD +ADG +DGN +KKL +KVK +AYR +IEK +ELE +ETV +TVC +VCN +VLS +LSL +SLL +DKF +KFL +IKN +KNC +NCN +NDF +DFQ +FQY +QYE +YES +GEK +KKN 
+KNS +NSV +SVV +SEA +YKE +SKE +QMQ +EIQ +IQN +QNA +NAP +PEQ +QAC +ACL +CLL +LLA +SDQ +DQQ +QQD +QDE +VLA +ALL +KEH +EHM +HMQ +MVD +VDR +KAR +MKN +NVT +KTS +TSA +SAD +KKI +IEM +MVR +VRA +RAY +EAV +AVC +VCQ +LDN +DNY +NYL +NCS +CSE +SET +ETQ +TQY +VAT +KRA +RAT +ATV +TVV +AYS +YSE +AHE +HEI +LNY +NYS +YSV +ACH +CHL +HLA +DDD +DDG +DGG +GNN +MER +ERA +ASL +LIQ +IQK +YED +EDM +AFM +FMK +MKS +SAV +AVE +EKG +KGE +LSC +SCE +CEE +VGG +GGQ +GQR +RVL +QKS +KSN +KGP +GPE +PEV +EVK +VKE +LRG +RGV +GVC +VCD +CDT +TVL +VLG +GLL +GAG +DAE +SRV +RVF +TGD +GDD +DDK +DKK +KKR +KRI +IID +IDS +DSA +ARS +RSA +SAY +AMD +MDI +EMP +MPP +PTN +TNP +NPI +VFH +FHY +HYE +EIA +PEE +ISL +KTT +TTF +TFD +AMA +DLH +LHT +WTA +ADS +EGG +GEA +EEP +EPQ +PQS +EKT +ELI +ATC +TCM +CMK +QGA +GGR +GRR +SAW +KTD +TDT +DTS +KLQ +LQL +QLI +LRS +RSI +ICT +CTT +ANA +ATN +NPE +VAC +ACG +CGD +RKQ +QTI +TID +IDN +DNS +SQG +GAY +FDI +LNN +NNP +PEL +LAC +ACT +CTL +TLA +SDS +EEC +ECD +CDA +AEG +EGA +TIE +IEN +STV +DKE +MAQ +AQA +QAM +KSV +SVT +TET +ETG +TGV +GVE +ARK +LAR +ARE +RER +ERV +RVE +LRE +REI +EIC +ICY +CYE +YEV +EVL +IPK +PKA +KAS +ASN +SNP +DAR +ARN +RNT +NTV +VVD +VDD +DSQ +SQT +QTA +YQD +QDA +DAF +KGK +GKM +PDK +DTQ +TQG +AEP +PQE +GGD +DKN +NEL +AAC +ACM +RVV +VVS +AEK +QMA +MAR +EKF +ASQ +SQA +AAG +KKG +KGI +GIV +IVD +VDQ +DQS +QSQ +AEA +SQP +MPA +PAS +ASR +DSV +SVY +VYL +VEN +ENM +NMK +SSG +EAK +NES +ESQ +SQV +VAL +ALI +ICE +CED +EDI +ILS +SVL +SDH +DHL +LIT +SAQ +AQT +QTG +FAI +KRK +EAY +DAV +DLE +ETL +WTD +TDL +TEE +QQQ +QSS +SSQ +QAP +AQP +PTE +EGK +GKA +KAD +ADQ +MTR +VAE +NEN +ENH +NHV +HVK +VKK +KIK +EYK +YKC +KCK +CKV +LTD +TDI +ILE +LEV +GNP +NPR +PRK +SSL +IAV +DVH +VHN +HNM +NME +EKN +KNQ +NQD +QDG +DGD +DDQ +DQN +QNE +EPG +PGM +AFT +FTR +EDY +DYV +YVF +VFM +FMA +AQL +QLN +ENA +NAE +ETM +TMR +MRK +RKI +KIS +ISG +SGM +GME +KER +IGP +GPR +PRR +KEK +KGR +GRQ +RQK +QKP +KPN +NAK +AKR +RIE +QIR +IRV +RVY +VYR +QKI +LQE +EQF +QFV +FVP +VPR +PRS +RST +STN +TNA +ADA +DAK +AKV +AEY +EYS +YSS +KIA +IAG 
+AGS +GSA +SAL +NAY +AYN +YNS +NSA +SAF +ISQ +QLP +ILA +LAS +ACE +CEL +RKA +KAF +FDA +AAI +AIT +ITD +DLD +KLT +LTE +NLN +LNL +NLW +LWV +WVT +VTD +TDS +DDN +DNA +NEA +ALS +VLN +DNF +NFL +NCG +CGE +GET +TQH +QHE +HES +KSY +SYS +DDE +MVS +VSQ +QVV +VVA +EKP +KPQ +PQL +KKA +AGC +GCN +CNS +NSH +SHG +HGQ +GQD +QDS +SYF +YFL +FLG +LGW +GWQ +WQE +QEY +EYE +YEK +KNP +NPF +PFD +FDP +DPV +PVS +NPS +PSG +GII +IIQ +IQM +MGL +NQL +QLS +LSF +SFD +FDL +DLL +LEE +EEW +EWL +WLE +NPH +PHA +HAL +ALG +GLR +LRR +RRE +REG +GGG +GGA +ASV +VFR +FRE +REL +ALF +LFQ +FQD +QDY +YHG +HGL +GLP +LPA +PAF +AFK +FKN +KNA +NAL +ARF +RFM +FMS +MSE +SEQ +EQR +QRG +RGY +GYK +YKV +KVV +VVF +VFD +DPS +PSN +SNI +NIV +IVL +VLT +TAG +SAN +ANE +ALM +LMF +MFC +FCL +CLA +LAD +ADH +DHG +HGD +AFL +IPT +PTP +TPY +PYY +YYP +YPG +PGF +GFD +FDR +DRD +RDL +DLK +LKW +KWR +WRT +RTG +AEI +EIV +IVP +VPV +PVH +VHC +HCA +CAS +ANG +NGF +GFR +FRV +VTR +TRP +RPA +PAL +LDD +DAY +YRR +RAQ +AQK +QKR +KRR +RRL +RLR +LRV +VKG +KGV +GVL +VLI +ITN +NPL +PLG +LGT +GTA +SPR +PRA +RAD +ETI +TIV +VDF +DFV +FVA +GIH +IHL +LIS +ISD +SDE +EIY +IYA +YAG +AGT +AFA +FAE +EPP +PPA +AGF +GFV +FVS +VSA +ALE +EVV +AGR +RDG +GAD +ADV +VSD +RVH +VHV +HVV +VVY +VYS +YSL +SLS +SKD +KDL +DLG +LPG +RVG +GAI +AIY +IYS +YSA +NAA +SAA +ATK +TKM +KMS +MSS +SSF +SFG +FGL +GLV +QTQ +QYL +YLL +LLG +LGD +RDF +DFT +TRS +RSY +SYV +YVA +NKR +RRI +RIK +ERH +RHD +HDQ +DQL +LVD +VDG +DGL +EIG +IGI +GIG +IGC +GCL +CLP +LPS +AGL +GLF +LFC +FCW +CWV +WVD +VDM +DMS +MSH +HLM +LMR +MRS +RSR +SRS +RSF +SFA +FAG +GEM +EME +MEL +ELW +LWK +WKK +VFE +FEV +EVG +VGL +GLN +LNI +NIS +ISP +SPG +PGS +GSS +SSC +SCH +CHC +HCR +CRE +REP +PGW +GWF +WFR +RVC +VCF +CFA +FAN +ANM +NMS +MSA +SAK +KTL +TLD +VAM +AMQ +MQR +QRL +SFV +FVD +TGG +ALR +AVP +PVR +VRS +RSV +SVS +VSC +SCP +CPL +PLA +LAI +AIK +IKW +KWA +WAL +RLT +LTP +TPS +PSI +IAD +ADR +KAE +MAY +YQG +QGI +GID +IDL +LST +STK +TKA +HGE +YFD +FDG +DGW +GWK +WKA +AYD +YDT +DTN +DLR +LRH +RHN +HNR +NRG +RGG +GGV +GVI +VIQ +SLD +LDL +DLI 
+LIE +IEE +EWS +WSK +SKN +KNH +NHP +HPE +PEA +ASI +CTP +PEG +EGV +GVS +SQF +QFK +FKR +RIA +ANF +NFQ +LPE +PEF +EFR +FRK +KAM +AQF +FMG +MGQ +QVR +VRG +GGK +KAT +ATF +DPD +VVM +VMS +MSG +SGG +GAQ +AQE +QET +LAF +AFC +LAN +ANP +NPG +PGE +FLV +VPT +YPA +RDC +DCC +CCW +CWR +WRS +RSG +GIK +IKL +LPI +PIE +IEC +ECH +CHS +HSF +SFN +FND +DFR +FRL +TKE +ALV +YDG +RRQ +RQG +GIS +ISV +SVK +ILI +GTI +TIT +TDR +RDT +LAM +AML +LAT +TFA +TEH +EHR +HRV +VHL +LVC +CDE +GSV +VFA +PEY +EYV +YVS +VSI +EVI +VIE +IER +ERD +RDV +DVP +VPW +PWC +WCN +CNR +NRD +LIH +IHV +KDF +DFG +VGI +IIY +YSY +SYN +YND +AAR +RRM +RMS +QYF +FLA +ARM +RML +MLS +EEF +EFI +FIG +IGR +GRF +RFL +FLQ +QES +SKC +KCR +RLV +VAR +ARH +RHE +HER +ERF +RFT +FTS +SGL +REV +CLR +GNA +LFS +FSW +SWM +WMD +MDL +MLR +LWR +VIV +IVH +VHQ +HQV +QVK +VKL +KLN +NVS +VSP +PGT +GTS +TSF +SFH +FHC +VCH +CHA +HAN +NMD +DET +TME +MEV +GRI +RIH +IHD +HDF +FVR +VRQ +RQH +QHQ +HQQ +QQR +QRR +RRV +ERW +RWA +WAA +ANR +NRQ +RQL +QLR +RLS +SLP +LPH +PHH +HHH +HHL +HLS +LSP +PAH +SSP +SPL +SPQ +QSP +SPM +PMV +KQL +TKV +VTS +TSN +SNG +NGH +GHG +GWE +WEE +EEY +NPY +PYD +NPN +PNG +NGM +GMI +MIQ +QLC +LCF +CFD +ESW +SWL +WLT +TKN +NPD +PDA +SLK +LKR +KRN +RNG +NGQ +GQS +QSI +SIF +IFR +HGM +GMP +MPE +FKK +MEE +IRG +GNR +NRV +VTF +DPK +PKK +KIV +GST +NET +TLM +PGD +FLL +LPT +VPI +PIH +IHC +HCS +CSS +SSS +SSN +GFQ +FQI +QIT +ITE +TES +ESA +LQQ +YQQ +QAQ +QKL +VLV +VTN +TAL +ALT +LTR +TRR +LLV +DFI +FIT +TSK +KNI +NIH +YSG +SGT +GTM +TMF +MFG +FGF +GFE +FEQ +QFI +FIS +SVM +VMD +LKD +LED +DTE +TEV +EVS +VSK +SKR +KRV +YSN +SND +MIV +LSA +KKF +KFT +TSQ +SQY +YLE +NQK +KRL +RLK +LKS +KSR +SRQ +RQR +GLE +AGI +GIT +ITC +TCL +RSN +DMR +MRH +RHL +HLL +TNT +NTF +TFE +FEA +DLW +IVY +VYN +YNV +NVK +HCT +CTE +TEP +ALK +LKT +KTF +TFV +FVE +STD +TDC +DCG +CGR +GRM +RMI +MIS +ISR +SSH +SHE +ERL +LRK +RKK +KKT +SNW +NWV +WVF +RVS +VSW +SWT +RVP +VPD +PDE +VAF +TEK +KQD +QDL +DLN +IAS +DGH +AYE +ENP +PFH +FHP +PID +IDR +DRP +RPD +DGV +LCG +GDL +DLM +RKW +KWV +WVL +LKH 
+KHP +CTS +GVN +VNQ +NQF +QFS +FSD +IAI +AIF +IFQ +FRQ +RQA +QAV +AKF +KFM +KTR +TRN +RNN +NNK +NKV +VKF +KFD +DRI +IVM +GAH +HET +TVA +DGF +GFL +LRW +RWR +VNL +NLV +PVT +VTC +TCH +HSS +GFK +FKI +KIT +ITV +YEN +NAR +RKS +NIP +IPV +PVK +KGL +GTT +LDR +REC +ECL +CLK +LVN +VNF +NFT +FTN +TND +DKG +YAA +TFG +FGQ +SEF +EIE +DCN +IHI +HIV +KDM +DMG +PGL +VVQ +VQI +QIA +IAR +RKM +QHL +AKM +KML +FIR +RES +KLR +RHA +EIT +ITT +TTG +TGL +GLD +LDG +GLG +LGI +IGW +GWL +WLK +LKA +LFL +FLW +LWM +LRN +LLK +TAT +FDS +PGG +GGS +GSF +HCH +CHE +HEP +MDH +DHK +HKT +MET +ETA +LER +ERI +RIR +VFT +SQL +QLE +EEE +EET +ETK +TKP +KPM +PMA +TTM +TMM +MMA +AKK +KKK +KKC +KCW +CWQ +WQS +QSN +SNL +NLR +SFS +DTR +RRF +RFD +GFF +FFS +FSP +SPH +PHS +HSP +SPV +PVP +VPP +PPS +PSP +PLV +LVR +RKV +NAH +AHG +NGI +ETW +TWL +WLA +AKN +GLK +LKK +KKD +KDG +DGQ +IFK +FKE +KAL +PSK +MLT +GTV +TVF +VFG +VSV +KNL +NLE +LEN +VHI +MVV +TST +STY +TYL +YLD +LKI +KIR +IRQ +QKK +KLV +VYD +YDV +DVK +MKR +LKE +YVE +DSR +SKS +KSS +SHD +HDR +IKS +RKR +KRT +RTV +MHG +HGS +GSG +SGH +GHS +HSL +SLT +LTG +GAP +APH +PHQ +HQI +QIP +IPP +PPP +PPR +PRT +RTQ +GQQ +TAN +ANQ +DKI +KID +IDP +DPF +FHN +HNK +KRG +RGT +TSR +LRI +RIN +INN +NNS +SSR +SRY +RYN +NVD +VQL +KDT +NEQ +EQP +QPA +LVI +VQC +QCQ +CQH +QHV +HVF +FDF +DFY +FYD +YDP +PVA +VAQ +QLK +LKC +CKE +KEI +IKR +LID +IDH +DHI +HIT +TKG +AIV +IVE +TIY +IYP +PAV +AVI +IKM +KMV +NIF +VLP +PSE +ENC +NCE +CEF +EFD +DPE +EED +DEP +EPT +PTL +TLE +SWP +WPH +PHL +HLQ +VYE +YEL +ELF +FLR +LRF +FLE +ESP +PDF +FQA +QAS +SIG +IGK +GKK +KKY +KYI +YID +IDQ +DQR +QRF +RFV +FVL +DLF +LFD +DPR +PRE +DFL +FLK +VLH +LHR +HRI +RIY +IYG +YGK +GKF +RAF +AFI +IRK +RKH +KHI +HIN +NNM +NMF +MFL +YET +ETD +DSF +FNG +NGV +GVG +VGE +LEI +ILG +LGS +GSI +SII +IIN +ING +GFA +FAL +ALP +LPL +PLK +LKQ +KQE +QEH +EHK +HKV +KVL +VLL +PLH +LHK +HKP +KPK +PKC +KCL +CLS +SLY +LYH +YHA +HAQ +AYC +YCV +CVV +FIE +EKD +TPQ +PQV +QVF +LKF +KFW +FWP +WPR +RTC +TCS +SSK +KEV +EVM +VMF +GEV +EVE +DII +IIE +IEP +EPE +KII +DPL 
+PLF +LFR +AKC +KCV +CVS +PHF +HFQ +FQV +RAL +ALY +LYF +YFW +FWN +WNN +NNE +NEY +EYI +YIL +TSS +LVM +VMP +MPI +PIM +IMF +MFP +FPA +LYR +YRI +RIS +EHW +HWN +WNQ +NQT +IVA +TFM +MEM +EMN +MNG +NGK +GKL +KLF +LTS +TYK +YKG +GER +EKQ +KQR +QRE +KDR +RDA +AFW +FWK +MEA +LNP +NPP +EVT +VTP +PSL +SLF +LFP +FPE +TDY +DYL +DGP +GPN +PNM +NMT +MTP +TPL +PLP +LPV +AGG +GDK +KSP +SPS +PSV +VVK +KKS +STG +ETT +TTT +TTP +PAK +TKL +KLP +STP +TPT +PTS +TSP +GLS +PPD +DKV +KVD +GFS +FSR +RSL +ARP +RPR +RSH +SHS +QFR +RYQ +YQS +SNQ +NQQ +QQE +PLL +KDV +ELH +LHE +RKL +LAQ +AQC +QCG +CGV +GVM +MFD +FLD +LDC +CVA +LKG +VKR +LVE +VEC +ECV +CVG +VGS +TRG +EPV +PVY +VYP +YPD +PDI +IIR +IRM +SVN +VNI +FRT +RTL +TLP +EPN +PNL +LEP +EPS +PSW +YEF +EFF +FFL +FQP +QPS +KRY +RYV +YVD +DQK +QKF +KFV +VLM +LML +MLL +EYL +KTI +ILH +VYG +AYI +YIR +KQC +QCN +CNH +NHI +HIF +IFL +RFI +FIY +IYE +LEH +EHF +HFN +GVA +HKQ +KQF +QFL +VRV +IPL +LHS +HSV +VKS +FHA +DAT +HVI +VIR +RGL +LKY +KYW +YWP +WPK +PKT +KTC +TCT +CTQ +TQK +DVI +PSQ +FVK +VKI +KIQ +IQE +QEP +LFK +FKQ +ARC +RCV +EDN +DNC +NCH +CHT +HTV +AVF +FGT +GTL +TLY +LYQ +YQV +QVS +LIY +IYN +ASY +YKL +QQK +KAQ +ERQ +WRG +RLQ +LQG +QGT +GTQ +GAK +APV +PRP +RPT +MPY +PYK +KEP +PPK +PKV +KCT +CTA +TAK +KPS +SGK +GKD +EAQ +QPQ +PQP +PQA +AQS +QPP +SNK +KRP +RPS +NST +TPP +PTQ +TQL +IKY +KYS +GGP +GPQ +PQI +QIV +ERR +RQS +SRF +RFN +FNL +NLS +KNR +NRE +LQK +DSP +SPT +TQE +LFI +FIQ +LRQ +RQC +QCC +CCV +CVL +VLF +SDP +SDL +KFK +RAG +NEM +VEY +YIT +ITH +THS +HSR +DVV +VVT +YPE +VTM +MFS +NLF +NPT +PTG +AWP +QPN +PNI +NIA +IRR +RQI +QIN +INH +IFY +FYR +YRF +EHH +HHN +HNG +GIA +HKM +KMF +VYH +YHP +HPQ +KES +PVI +IVG +KTH +SPK +FLN +EFS +FSK +KVM +VME +MEP +LYY +YYW +YWN +YIM +IMS +MSL +SDN +ARV +YRN +RNS +NSK +KSH +SHW +WNK +NKT +TIH +IHG +GLI +YNA +LFM +MNQ +DDC +DCT +TQQ +QQY +QYK +KQK +QKG +RFR +FRM +RMK +MKE +EMW +MWQ +WQK +RLN +NPQ +PQY +QYP +YPM +PMF +MFR +FRA +RAP +APP +PPL +PPV +YSM +SME +ETP +PTA +DIQ +IQL +AVQ +VQM +QML +MLK +KDI +IKK +RRK +LPQ +PQD 
+DVY +VYT +YTI +TIK +IKA +AHK +HKR +RAE +FLT +SQE +MMR +MRG +RGF +RLI +STT +TTS +KKP +HGT +TTH +GSK +KST +TTE +GKQ +KQS +QSG +SGS +SVP +QGK +GKH +KHH +HHS +SKT +KTK +TKT +VSR +TKK +RKG +KGQ +QSK +SKQ +QQP +SQS +QKQ +KQG +QGS +AIM +MNP +TPV +PVL +TVT +VTK +TKD +KDD +DHA +HAH +AHP +HPT +TLL +LGA +GAV +AVS +SPI +PIS +TAV +ENG +NGN +GNS +NSN +SNN +NNN +NMN +MNI +NIN +INT +NTS +SNT +NTQ +TQD +DAN +ANH +NHA +HAS +SID +IDI +DIP +IPR +SFE +FER +RLP +PTK +PDT +DTD +KTP +PQR +QRH +RHS +RFE +FEP +PSR +RYT +YTP +PLT +PNF +NFN +FNE +NEV +RIP +FIA +DQC +CNT +DFN +NDP +PSF +IQG +KRS +IEF +TNR +NRF +FTY +TYT +YTN +TNE +EMY +MYA +YAH +AHV +VVN +VNM +MFK +KIN +INL +FRP +RPI +PIP +PVN +VNP +NPV +PVG +VGD +GDI +DIY +IYD +DED +VNE +LAW +PHM +AVY +FNH +NHQ +KQY +QYI +QDF +FIL +DIR +DCL +TLH +SFI +RSM +SMN +MNN +NNI +LQF +KFN +VRI +RIL +KVR +VRC +RCL +YCI +CIV +IVQ +KDP +LLT +VMG +LRY +RYW +PKI +INS +NEI +DIF +IFE +PLE +LEF +FIK +IKV +VEV +VPL +LFV +FVQ +KCI +CIS +LSY +SYW +EYF +NLC +LCI +CIE +VIL +ILP +PII +IIF +IFP +LYE +NGE +SIS +DPY +PYM +YML +MLV +QAI +AIN +NSG +GSW +SWN +WNR +NRA +RAI +AIH +IHA +HAM +MAF +KIF +ETN +VLY +CNA +LYL +KET +QRK +KVQ +ENW +NWS +YVK +VKN +NND +KDQ +QYT +NSF +FNT +NTA +NNT +NTL +ENE +END +NDC +DCD +CDS +SEI +IKQ +KQI +QIF +IFG +FGK +LPR +RKP +SHN +HND +NDS +DSN +VNS +NSY +SYY +YYI +YIP +PNS +NGA +GAN +NGT +TVI +VIA +IAP +APS +SNR +NRT +RTN +TNQ +NQV +QVN +VNG +GVY +YEA +SFR +FRD +KLS +LSM +SMC +MCC +RQT +QTL +VDY +DYI +YIA +VST +SDA +QEI +RTF +TFP +FPS +NHE +KIL +DVD +EPA +PAW +LQV +LLL +PMT +TDA +RYI +DHS +FMV +MVH +VHR +HRP +RPF +PFI +KAI +FIF +FET +KHN +HKL +IRA +RPK +KCA +AYH +YHQ +SYC +DFK +FKL +ADT +WPV +TNS +QAA +EFQ +FQR +QRC +RCM +CMV +MVP +CLN +SHF +LWN +NDH +HIR +IRN +NLI +ITQ +TQN +QNH +NHK +VIM +IMP +PIV +IVF +VFP +PAM +AME +NTR +RGH +GHW +NQA +VQS +QSL +NVR +VRK +VMA +AET +TDQ +DQI +QIL +ILF +DEC +KFQ +FQE +QED +EAN +KRE +ATW +TWK +WKL +AVL +PRF +RFS +FSS +TGK +GKT +LTC +TCN +CNK +NKA +SRM +RMV +VDA +NGP +GPF +PFQ +QPV +PVV +VVL +LHI +QEK +KWK 
+WKE +SEM +THN +NRN +RNV +VIT +EPI +PIY +VVH +VHM +HMF +MFA +FAV +AVN +VLQ +HKI +MAL +KIM +IME +THW +QQF +EAW +AWV +WVK +KAN +YTV +TVY +YSQ +STM +TMS +MSI +SIP +TDG +GPL +LFE +FED +EDV +DVQ +TVK +AHQ +HQA +QKD +RPL +QDP +DPH +PHT +HTK +AHC +CRA +SQD +DGR +MSV +ATD +TDD +DAL +LYP +YPI +PIA +IDE +DVT +TLR +NSI +SIR +STI +TIA +LGV +VER +ERT +RTR +IQF +LVL +QLG +LGN +GNF +FTP +LVG +GPD +PDH +HVH +HCL +VVR +VRD +RDK +ESL +KHS +HFV +VPM +PML +GDW +DWF +WFT +SRT +RTS +SAC +CGL +YPR +PRV +PAI +KSM +SMF +TLC +LCR +CRD +RDD +DDT +DTP +TPM +VRR +KLG +GEF +FAK +FEK +IEG +EGL +GLH +LHV +HVD +EQD +SVR +VRL +SAI +IAF +AFG +ANK +NKK +PIL +IEL +KSW +RVR +VRY +YMV +IEI +QNV +DMD +MDT +DTT +NMY +MYT +TNL +EVR +RCA +CAA +TQR +QEF +NLP +PED +DKR +RQN +QNI +NII +IIC +LLN +NVA +LAG +AGV +IMG +APL +PLI +LIG +EQT +QTV +VSE +IYM +YMQ +NDQ +DQT +QTP +KVN +EDG +DGK +GKW +FMP +MPL +LGQ +FFD +PLC +LCL +LNW +NWL +TDH +VFS +FSI +IMK +LTQ +KFG +FGG +GQW +QWA +WAS +TNI +VPK +PKM +MQK +TNY +YLQ +QRM +RMT +MTC +CLF +MTQ +EDD +VPN +PNV +VRF +FNA +AKS +RIG +GKN +PST +VKP +KPL +LGK +DSD +SDF +DFD +FDV +DVR +RYF +YFS +FSE +SLG +SVD +DSL +LKN +SIK +RSE +IPF +PFL +FAM +AMY +MYL +LRT +EHS +HSA +EIH +VVP +TLQ +VCY +CYP +VTQ +RAN +NFR +KLC +LCQ +NKL +TEY +KSD +NFV +LAV +EAC +ACV +IAQ +VEH +EHL +QCA +VDL +DLQ +AVG +VGP +PEI +ITR +TRV +RVD +AFQ +DFC +FCA +CAN +ANL +NLD +QVQ +QII +IIL +SIL +LPY +PYV +YVR +PNP +PHV +SVI +MLG +YQT +ECP +CPE +CVN +VND +GIQ +IQQ +LSQ +SKW +IEY +EYM +YMP +AGQ +GQL +FDQ +GLC +LCM +CMG +MGW +WLN +HVY +VYA +YAI +AIR +LNM +QFG +FGA +APW +PWA +WAE +IIP +IPM +PMI +MIL +MSR +SRN +RNK +NKN +KNY +YLH +HRM +EVC +VCG +CGT +GTD +DIT +TTK +PTV +ADP +VAN +ANV +FNV +SPF +VID +IDA +DAQ +AQV +KPT +NTD +TDV +VKH +KHF +HFA +FAA +LPF +GTF +TFT +FTT +YVH +ISH +HEH +PSD +AHF +AVK +RQY +FRN +LCS +SDD +DNV +FSN +MPT +FTE +ITK +FQN +QNL +NLM +LMK +MKD +KDC +DCE +CEA +ASH +SHK +KEF +EFC +FCE +CEN +ADC +DCR +MSQ +SQI +LPC +PCI +CIK +NQH +KDN +DNT +NTI +IEH +GIR +EDA +AKW +SLC +CMA +MAW +AWL +WLV +VDH +NLK +KEW 
+EWA +WAH +AHA +HAT +ATI +TII +AMS +GDP +PNY +MTT +TLF +FCI +CIN +INV +CGQ +TKH +KHM +HML +MLP +VLR +LRM +RMA +MAG +SLQ +KIG +GPI +LQS +KPI +QDQ +VKY +KYF +YFA +FAQ +TTA +YPL +LLM +LMD +HDD +LGP +PER +EVF +VPY +PYI +YIG +IGG +QYA +YAT +ILL +VRE +SLN +QLF +ADW +WFS +KVS +IVR +NIL +MVK +RAV +VGK +NLG +EDW +DWD +WDY +YIS +FQK +IND +NDN +DNQ +VDC +CLI +ISI +KFF +FFN +DES +SHT +HTQ +IGD +DRF +VQP +QPF +LCE +DNE +NEG +GDV +SGF +LNK +NKI +VQN +TVR +NKD +DQV +QVI +VIN +NNF +FLP +NML +EFP +FPD +PDV +IIA +GIE +DVN +VNW +NWR +VRM +MAI +IPI +LGM +GMQ +MQF +QFF +DLC +LSW +WLW +LWD +WDT +YSI +VNN +NNL +EIF +FGS +SDW +DWC +WCR +SRL +ENF +FTI +LTT +GVP +NIR +IRF +SYA +YAV +KYD +YDA +KNT +LQT +AEC +ECQ +CQE +MVM +SQN +QNQ +NQP +AND +FDM +EGP +ETF +PVD +INW +NWK +WKF +FNQ +GNI +NID +VHT +HTE +EAD +ISC +SCV +CVE +FSH +HDG +GEY +GRV +VVI +VIF +QRD +GKY +KYV +GVR +EYN +YST +STF +TFQ +FQS +QSH +FDY +EID +INQ +NQI +IRW +RWL +NFI +DKT +KLW +WKI +DAW +AWN +WNL +NRI +FRG +RGR +GRL +LQI +SIV +PME +YGN +AHT +HTY +TYH +YHV +HVN +NSD +TFL +DDL +RVN +ESF +FNI +VDI +IKP +PAN +ITA +EFH +TQC +CNW +NWF +WFV +KGS +RLC +LCD +CDM +MRD +RDR +ALC +AYA +YAK +DPQ +QSR +SFF +KFS +NGR +GRY +TRD +YLT +KVW +VWD +WDL +MES +PVE +ETY +TYP +YPV +HNY +YLR +RTK +LCA +CAL +IFD +FDK +KFE +FEC +CDW +DWS +WSG +HIL +ILT +GSY +SYH +YHN +HNL +FRS +YAR +ARG +NNQ +KTW +TWE +WEA +EAR +RPQ +EPH +HSQ +FVV +QLQ +QFD +HTA +TAW +AWH +WHP +HPK +PKD +DNI +TNN +NLY +LYI +YIF +IFS +MGR +GRW +RWG +WGR +PDP +PQM +MQT +FMR +MRQ +SIT +IGN +GNM +MLN +TAI +INI +SWC +WCF +CFS +FSQ +QIK +GAL +ADI +EFN +NHD +RDP +SKA +RRG +RGE +INK +WLQ +QKN +VHF +HFL +WKV +KSF +GGY +GYN +YNT +NTK +NGL +PQN +VTA +VKQ +RRT +YHI +LWH +WHL +HLE +NQS +QSY +YNI +TNM +TEC +ECN +CNV +NVF +VFV +KGT +TIR +CDR +DRH +HSK +QFE +PEN +NRS +SGR +YMI +LSI +LHM +HME +VHE +HEY +DCI +CIF +ECC +CWN +WNG +SIM +IMT +MTG +YNN +NFF +FFR +LKP +KPR +KVC +VCT +CTG +GKR +CLD +LDF +FNK +ENI +QDK +DID +IDT +TRK +SFL +RDH +HSY +IST +NHT +HTG +QVH +HRR +WLP +PQQ +QQN +AYF +RPE +EGY +YNL +PAT 
+LRP +RPM +PMD +LMV +TPR +SDY +DYE +TYM +YMS +WNF +NFE +QSF +HPH +HHC +HCN +MRA +RHT +TKF +FFE +HSG +MEN +ENR +NRP +RPV +TYQ +VHD +HDY +CVW +VWN +NGS +RMF +TKR +AIL +VCV +DFS +HPS +MRF +RFC +FCV +AWF +WFF +FFP +FPN +NTT +TTR +VFW +FWD +WDA +AFS +SNF +FTG +TGC +GCH +CHH +HHG +GQN +GLY +YFQ +RFG +FGY +GYI +IPE +PET +TFS +FSG +SGN +FTD +DDF +ELY +QTN +TNF +LDA +LTI +TIQ +IQH +QHI +IVI +VIP +PRC +RCG +CGN +SLM +LMH +HGG +EVN +RTH +HLH +LHA +HAV +YTL +FPG +EPR +PRW +RWP +PRN +RNR +NRR +RRD +DLT +LTY +TYA +YAF +PKN +SRA +FGR +RWS +WSD +FTL +FST +ITI +TIG +IGF +GFY +FYT +YTG +GDH +EPF +LAH +HAF +SPP +KFH +FHL +HLD +WVV +ESV +AVH +IGH +GHL +LGH +ESI +IMY +MYP +YPT +PTI +LTN +VEG +EGI +IQY +YLY +LYG +YGA +KHQ +HQR +DTG +GGF +FSA +RID +IDG +DGS +TVG +VLW +LWF +WFL +MGS +PLR +KPG +TSW +WNS +VRT +TQV +EYG +YGC +GCF +CFE +KGH +LNG +GNK +NKP +KPE +EYD +GFT +EGM +GMG +MGV +VGR +RIT +LMW +MWP +WPE +CET +SYG +KRM +KMM +MMV +MVF +FES +FGM +HFD +SFC +CES +LHF +HFM +MRY +QPG +PGK +GRS +RSP +SLH +HKD +KSI +IVN +NQN +QND +EFE +GEW +EWI +WIL +ADN +DNH +GDC +DCF +CFM +AWS +WSN +RLH +QAR +FSF +SFP +FPK +EHP +HPL +LLF +LFN +FNP +PFE +YCF +CFT +FTK +KEG +CDL +PAQ +PFR +FRI +QGP +ERP +RQQ +QQC +QCS +CSQ +SQR +QRI +RIQ +QGE +NQC +QCR +CRS +RSQ +SQM +QSC +SCC +CCQ +LQN +NVE +EQC +CQC +MPG +GWS +WSC +SCL +CLV +FVG +VGQ +VQE +QTK +MLE +LEG +AQY +CQG +VIH +IHT +IDV +VSH +SHV +HVL +PRQ +IYC +YCS +CST +AGP +HEE +HHE +STW +TWS +AYP +YPY +PYS +YSK +KNG +NGG +GGT +HTC +TCA +PMY +MYI +YIY +YGE +ERS +VMI +KNK +VYV +YVG +VGN +GNV +VAW +AWA +AHI +NVQ +VQG +GQF +QFY +TPH +HQS +SYD +LNC +NCT +EWG +WGL +RLD +SWS +WSL +LLY +LYW +YWL +VSF +PFY +FYN +YNY +NYR +YRP +RPP +PPF +PFN +FNC +SKF +FTF +FSY +AQR +LGY +GYV +YVP +SWE +SEW +WIG +IGT +EQH +QHR +HRE +RET +DTK +TKS +GGL +AFR +QNR +TAC +ACI +CII +DVF +FGV +GVT +VTH +THR +MNV +NVN +VNV +CVQ +VQA +PVF +VFI +IYT +YTS +IEV +QNG +NTW +TWP +WPT +PYP +NGW +GWN +NGD +GDT +LYT +YTC +PTY +TYI +SIN +INE +NNG +SVG +TVN +KAP +YDN +NYI +EFG +SRW +LMY +MYW +YWI +SYQ +YQP +FNR 
+NRH +YKP +PLY +LYS +YSW +VEW +EWV +WVG +RHK +HKE +TLK +KSK +KTQ +YRT +KHK +VTV +RGD +DIV +QGM +GMS +VII +IIH +DAC +TFH +FHT +MVN +VNR +KNN +KRH +SIQ +NYT +WGF +GFC +MVT +VTI +TIS +ISY +GYE +YEP +QVP +YLV +GGC +GCG +CGF +GEH +EHI +LEW +EWE +WEP +PRL +LHL +TGP +GPV +PVQ +VQV +QVT +AIQ +QAH +HEV +GSH +IHK +VQT +TGT +GTR +TRL +SSM +GHP +HPF +PYE +IHR +HRH +RHP +HPY +YPC +PCS +CSK +GRK +RLF +AIP +EHG +HGR +AWM +WMH +MHI +LMG +MGG +QVY +VYF +YFC +FCY +CYD +YDK +SPY +SYE +EDF +FNM +MEF +SPC +PCG +GTH +PYW +WLL +LQW +QWL +PYT +TNK +RHF +HFG +ART +RTI +IHW +HWV +WVQ +RMG +DAS +ELG +VTT +DRG +WVR +DVC +VCA +TIF +IFH +ELM +DEY +QRS +NVG +GTE +TEN +HAG +GVQ +YTD +DLY +AQN +GVD +DGM +GML +CAI +IRP +GIW +IWG +WGN +GNG +GDQ +QTM +GHV +HGF +GFI +AAH +DGT +APG +PGQ +GQA +YFI +FIN +PIN +INM +MFE +FEF +FAR +QRW +KMR +MRI +SGP +GPA +AVR +VRW +RWV +WVM +VMT +TGW +WQR +HFR +FRF +GFP +PAP +RLY +NYF +LFT +TTQ +QAL +YYV +QMK +ARA +MMK +QLH +RMR +GRT +RTP +RLE +AHN +HNI +LQA +CLQ +PLM +LMA +SFK +LDP +PDS +SMG +EMS +MSC +SCA +ARI +FEM +EMT +MTL +LQP +QPL +HKK +DWN +WNT +QAT +QGL +LGG +GSP +HSH +HTT +MAN +YHF +FVT +KED +YAN +ANY +IQA +QAD +ADY +NHG +PSM +SMT +MTA +THF +HFP +FPR +YGV +GRE +CVM +VMM +MML +GMK +FCS +SYL +PEP +LMT +MTF +LYD +DDW +DWM +WMR +CSR +PPE +YLM +MKF +VNK +NKM +KMT +LLW +LWP +WPP +DQA +QLD +IQV +VGV +GVV +IQS +QSA +DIN +INF +QDT +DRL +RTE +PAR +PTM +TMP +PPQ +PPG +GTP +TVP +PGP +NPA +QVD +SGV +QPR +HNV +NVH +VHK +TAM +PLN +LNR +NRL +HTH +THM +HMA +QCK +CKD +HFS +YFT +FTH +HRK +NHS +APF +PFS +QEE +MTS +ALH +HDV +QEN +FNN +GIF +APQ +QQV +MTV +LPK +PKP +PTD +VGT +PCP +CPA +SNM +NMP +DQG +TED +GGH +HPP +PRG +EMH +MHW +HWP +PMK +AIG +LTM +AGY +GYL +KWP +WPL +FVI +KRC +CVY +VYY +YYF +YFK +PQG +GAF +FSL +LSG +SGY +YNR +RVM +VMR +FPF +PFK +HIS +KKH +KHR +HRT +RTW +TWF +WMA +GHF +HFH +FHE +HEK +PLD +SFY +FYG +TDN +YEH +EHD +EPD +PGR +MHP +PAY +YPP +DMP +MPR +RAH +AHS +SFT +GPG +KHG +LPD +LCP +CPR +EPC +DPP +KPP +PPC +PCF +CFR +EPW +PWT +WTP +PGH +HGA +GAC +IMA +RNC +NCD +CDK +RGP 
+GPP +SEP +PKF +AMP +VAP +APR +RQP +KVP +FVN +VNT +ESC +CEV +LYC +CIR +GKV +LVV +VVW +WDE +ETS +VRN +RNY +RIF +KFY +GSM +SMV +EHY +HYH +YHT +THV +PSH +SHQ +PYG +YGY +GYT +IQI +QIE +EIN +TFR +GNC +NCI +RPY +AQI +CQK +HAA +MSN +HEW +EWQ +WQF +FDN +NAW +AWQ +QEM +EML +LNH +QKV +MDA +DCH +EHQ +FRR +NKS +SRP +PYF +YFE +QVC +TYS +DIH +HRQ +GDF +DFP +FPT +PGV +FQL +EKC +KCD +CDY +DYP +YPS +GSQ +QMS +ACD +DYD +VRP +DVW +VWE +WEH +EHE +LDH +LMM +QQT +STE +QRP +RHC +HCD +CDV +TSC +HHQ +HQL +NHL +TPI +PIK +VSM +SMR +MRE +DRS +RRR +PRI +LNQ +QST +INR +ARQ +KFR +KPY +YWE +RVA +RQF +QRV +LVH +ARY +AMG +FEL +KYY +YVQ +KMA +IHE +MGP +RGC +TSV +DSC +SCS +CSN +TQS +QSV +GPT +MPD +PDQ +DQF +QFP +RPG +GMM +MMF +FPV +SEC +ECS +PEC +ECE +ERG +ANN +NNR +NRM +LQC +QIG +ISA +REH +HKA +LQM +GKS +TRM +GCD +GVK +YHS +HSN +WDD +YGD +HAD +IGE +IFN +FNS +QLW +WMV +VDN +FQT +QTE +YWS +WSE +LGF +LHG +HGY +FEH +HFK +FKD +DQM +QFT +FTA +NDT +QTR +VFN +AFP +KFA +AYL +YRW +RWH +WHS +SYI +TPD +FHS +QCL +CLW +WRW +RWW +WWK +WGC +GCP +LTF +TFI +IRH +RHR +EFY +IDM +DMV +VKT +DMY +MYD +DTF +KRW +RWD +WDP +MVL +EMA +QGR +AEW +WIA +TGY +PTF +FEN +GHR +QPI +PFP +FPH +HHI +ILQ +IDF +NDY +DYA +YAC +CSI +TRC +RCY +CYK +ASC +SCT +SCY +CYM +STQ +MIE +NWE +WEF +PDN +DNN +NNA +API +KHA +AFN +LHH +HHF +HFY +YRD +DGY +GYS +LDY +QFA +SVQ +VQQ +CVK +AQW +QWI +SCI +DNP +DMI +YMR +LIN +CLG +GSC +SCN +DFA +CGY +GYA +IVC +CFW +HSD +GQK +III +GGI +RGA +YER +GLQ +GPH +PHG +HGW +GWR +WRM +SWG +LDQ +IVV +YLP +FQQ +QQH +QHY +HYG +YGG +HRS +RSD +KLH +LHN +DIE +IHS +DAP +AEM +EMK +IGY +HFI +QRY +RTA +DWG +YNH +NHC +CDP +QDR +WRN +NNW +NWW +WWQ +WQM +HAP +PLQ +LQY +AVM +MAM +MED +LFA +GNL +LDW +DWE +RRP +RCS +SRI +IQT +RFW +FWG +WGE +WHV +EGT +TAR +WFI +YAD +DWL +LWG +WGY +GYD +HIA +MPQ +EWR +WRY +RYA +YAL +NWQ +WQP +PPY +YDW +WSW +WML +IPD +CNP +PGC +GCV +CVD +QGV +QLY +YIC +ICF +CFP +LPM +MTI +TIP +IPG +MKT +QTF +PGI +RWT +RGW +WQA +PDD +DDY +RFP +GMT +RRY +RWK +WKP +KPW +PWR +HIW +IWY +WYT +EGW +QPD +RIC +ICV +LFF +FFA +FAP +RNA 
+NPW +PWN +AGK +LYM +FQH +QHF +NAV +VEM +MYQ +YQR +QRN +RNF +TMH +MHS +RFH +KHY +HYS +YSF +TRW +RWE +FYS +GPM +PMR +MRT +TGH +NWI +WIV +IRT +TGR +TTD +DSG +SDG +QYY +FWI +WII +FLY +YDL +ACW +CWA +WAP +LFG +IWI +WIP +NYD +YDQ +GYM +CVR +RGM +GMA +AYV +SKM +GIP +IPY +PYR +RAM +KYA +YPH +PHI +HIE +RTM +MDP +MRP +PGN +HSM +SML +GIM +IML +YPW +DRR +MWC +VQD +QRQ +QQI +INA +RNQ +EMR +YLN +PTR +NPC +QYG +DAH +AHR +HRA +QAW +GRA +AHH +HGC +GCS +SRH +GVH +VHG +AWI +ASF +QNP +NPM +PMG +LMP +VYW +YWK +WKG +RRW +KIW +IWR +WRA +EYA +GGN +DRY +YYG +FYA +YAM +AMR +MRL +RLW +WPG +GEI +GTK +FAF +MVG +GKP +MFY +FYM +YMT +TGQ +VVV +GMV +HQG +PHY +GVW +VWI +PNN +RKY +HAI +IIG +DTY +PEM +LCW +WVP +VPG +PGY +YSD +VEP +KPF +PDL +PMN +MNM +NMV +VMQ +MQQ +HPR +KVG +TWG +WGK +VGM +IGL +LYV +GIY +IYV +RHG +HGV +EHN +HNE +QMR +MRV +KYQ +PIT +TEW +EWT +WTV +LME +AWW +WWG +WGP +PWF +WFA +IIV +KRF +FMN +MNE +SMP +HHM +HMY +MYG +GQY +YGQ +GQG +WLI +LIF +QYR +IFA +KWL +ESG +DFH +FHR +HRG +YDR +DPT +IKH +HGP +RTD +LYA +PVM +MGH +GHT +TVQ +RTY +HGI +KHT +HTP +KMC +MCW +GRP +AYG +MKV +TMW +MWA +WAK +HEA +CGG +LVF +RYR +WLD +NAF +VGH +SAP +QAG +QDW +DWT +YTA +AQG +GLT +TTI +SIW +IWL +RQD +NIE +PDY +RMD +INP +DIG +GRC +CTK +DRM +MIG +QNF +NFA +PRY +MHA +FEG +AIW +IWS +WSM +GPS +ATR +RRN +VPQ +TSH +CSP +DNG +SFM +FMI +MIF +DCP +CPP +AQH +QHC +CRK +RCR +AFF +FFC +FCP +PPN +AIE +AID +GNT +FYP +AMV +SYR +QDM +MIC +CYN +YNQ +PTT +GQC +QCY +DHR +GCA +CAC +ACP +CPN +CCS +KCN +YKT +TCP +LCY +MFM +GCI +CID +CPK +YVC +VCC +CCN +DRC +RCN +VCL +KCY +CYV +TQT +QTC +CEK +EKY +VSY +YFH +FHD +YEC +ECT +CHR +GPY +PYN +NVC +LCN +MGE +THT +HTI +HTS +HLN +KFI +ITY +EIP +NAN +LII +DFF +FCN +TSM +TYF +LLC +LCT +CTF +FLH +HHP +LHQ +HQT +FPL +PMS +LFY +YRK +KTN +TNV +YKH +NMR +YGP +LSH +PHD +HDT +HEC +FLC +CFG +AQQ +SGC +GCR +CRF +LWL +EMD +EGF +VGF +TWV +PQK +HDA +THC +HCG +CGW +WSS +GWP +MPM +IYI +HLP +RPC +PCL +NNH +HIY +YTY +TIM +IMI +FVF +MGA +YLG +ACF +CFV +VIC +ICI +EGC +CIH +IHF +HDI +QSD +PKG +VML +LTH +THK +HKG +YMH 
+HSE +LMC +MCV +LFH +FHI +QFC +KYK +PFV +PPI +TVM +IKF +KFP +QGY +GYG +YGM +AMC +MCL +MKI +QIM +TRT +IDK +WLH +DND +FIV +KGF +HPN +AFV +YKR +VFF +FFV +PKS +TKQ +PNH +QIY +NSR +IQP +QPK +IVT +CHV +CLH +QAN +NEH +YIH +DVM +MLC +LCV +IQR +RYK +WLY +IDD +TFY +FID +SPN +VVC +TTN +EMM +TGM +MSK +SHR +HRN +GEQ +FIC +CTV +MFH +YGL +YGS +MHE +HEM +MMS +SMH +MHT +VLC +KYP +TGI +RYG +YGT +GQI +GPK +PKQ +GYF +WLR +CYI +IFV +GYQ +FPM +YVV +APY +IKI +MDS +HAR +PNT +HVT +VNH +HPD +NIK +ESD +FHV +TFW +WPD +PDM +DMK +KYN +TWY +IHQ +SHP +EYP +YPK +PKL +IRS +RSC +CSA +LMS +PHK +KPV +VCI +FGW +WFH +FDT +KYG +INC +NCA +CAV +FCK +CKK +FKV +DYS +TRI +RKF +FLM +MEC +ECR +CRN +PRD +PPM +HLR +GHQ +HQP +DYC +YCT +PCH +MIT +DPI +PIQ +QMP +EVY +RGS +SNV +NVP +PSC +TPF +RKC +CVP +QFQ +MDR +KCP +CPH +PHR +YTK +YDS +EKW +KWH +WHA +KDH +HRL +REF +FGD +FGE +RND +SYT +PEW +EWF +TGF +CNG +NEF +VPC +SMI +RWF +PHE +QNS +GNY +MLQ +PFM +GDM +TMK +MSP +GQP +GLM +VFQ +TRA +PIG +FQG +GMR +TAQ +AQM +NFY +FYQ +GFG +DRT +KMY +MYE +NRY +VPH +HVP +LHP +HPG +VHP +PQH +SHA +HMH +KWF +WFG +LEY +DYK +APM +PMH +NPK +QTS +THQ +MPH +SHC +HCV +SDC +CVT +KQP +QPM +MNA +GWV +LFW +FWL +WLG +QKW +KWW +WWH +WHT +HKN +QTD +QID +NFG +TPN +NSW +SWF +VDT +EFW +WQN +NIT +LLI +GTN +ESN +NRW +RWC +WCS +CSW +YQL +QLM +MLF +MLW +DPG +RHW +HWD +WDQ +NER +HEG +FPY +PYA +QMN +MNL +KLY +FAD +TKC +KCH +QKH +YKI +NDM +MVI +SHI +HIQ +ECK +CKY +KYE +RQV +KLM +MKL +YVT +VKM +DHY +HYA +DME +VFC +CIT +PIF +IFF +FFF +KIP +WFK +KSC +SCK +CKG +CAY +CKS +LQH +QHP +PWV +WVE +MRM +MLH +SHM +NSM +QGN +GYY +YYD +KGW +RYP +YSP +PND +ITP +IFC +NAC +QVL +NKW +KWT +WTL +TCD +LCC +CCT +HLC +YWA +WAI +TDP +IDY +YVN +LTW +CTI +AFY +FYI +YGR +TRH +RNW +WRL +EVH +TPC +CAP +IIM +MGT +ILC +CWL +PFF +FFI +PFC +CHM +HMP +VIY +YAY +YFN +IKC +CKF +KFC +FCR +CRQ +WNI +WRR +RRC +RCP +CPV +YQI +FGN +CVI +IFI +ITW +TWI +CRI +ILM +DTC +VHH +HHY +HYV +LHC +HCK +CKP +ETC +IQC +HNC +IYQ +LPW +PWK +ITL +TMY +CDF +DFW +WLS +TCC +IMH +MHL +TPK +LVW +VWV +FFW +FWR +WRQ +PNK +VCW 
+FII +PIC +ICK +CWF +FHM +FNW +YTM +AFH +FHK +RFK +FKC +PNQ +GAW +AWD +YTT +TWN +DIW +IWV +WVS +AGH +GHA +AMI +AVW +TAH +QIS +STC +TCG +CGA +ILY +ITG +ICW +ICR +SCW +CWI +WIH +IHP +HPA +FFT +FTW +TNC +EKM +MLI +ICM +CMT +YIV +DRW +EVW +VWL +CTC +NAI +LMI +TVW +VWT +WTI +ISM +SQC +QCT +QHD +HDH +IYH +YQK +FAS +CKL +TFC +TEF +IRI +DHP +SIY +ADF +IRC +MPS +NWT +CEG +KNW +WSA +MAV +DML +MPV +WIY +IYL +IHH +VFK +FIP +IMV +SIH +TMQ +MQS +ACK +FLF +VMW +WCP +CPF +NIM +CNE +FVW +YIQ +CQY +KQH +QHS +LMQ +DCS +MEI +PTC +NRK +QVG +CTD +FVH +LHW +HWA +FVM +AMW +WLF +NQY +QYN +TCV +DFM +FML +VTY +MLD +MRR +CNQ +CNY +KIY +IYF +RNP +FFK +GIN +EQV +SEH +CDH +TVH +VHW +IWP +PHN +TCE +HRD +NDG +RTT +VNA +VGY +NYQ +KCS +YFG +MFQ +NWA +VMV +DWP +CPI +PIW +CIA +NYP +PHP +ITF +SNH +IMM +MMI +MII +KVY +SYP +TMG +SMQ +CGP +GPC +PCD +ANI +RLM +SWV +TRY +THI +LGC +NCK +CKQ +VHS +VWQ +FKF +QNW +NWP +WPA +HNA +YDY +YVW +VWP +YLC +PVW +WIS +IVW +VWA +IGV +TTC +TYC +YCL +CLT +YVL +HGH +KCC +CCK +CKR +IMW +IYR +SNY +LRC +NYK +NIY +YRH +HTN +MQV +TFN +FQF +NCC +CCC +APN +CGK +SKY +RCD +GKG +HNS +RDW +DWR +WRK +TTY +YIW +WYR +QFW +FWT +QWN +WNP +VRH +RHQ +EVQ +QNY +YNF +NFP +QNC +SLW +WEL +FYV +YFV +VCM +PLW +QLT +MGN +GNH +MCG +ETR +LWS +WSV +SVW +VWH +WHY +QYW +YWT +VYI +FSM +NYY +IAW +CSH +HMG +DFE +WSI +IWQ +WQY +CIP +IPQ +HST +QWT +GVF +PDW +MAH +HVG +IWH +LWA +WAC +CIL +DTH +THH +YHL +QKY +KYH +YHK +YNW +WTK +VHA +VWY +WYQ +WND +IWA +APD +ENY +TFK +HDK +QFN +RRH +PNC +NCR +HFF +TIC +HDE +VWS +WSQ +YLF +ALW +WGG +TRQ +QYH +CVH +SMW +MWY +RNH +QTY +WQL +GCT +APC +CAE +FWF +WFQ +LCH +CHF +WSR +GGW +TIN +DQH +QHG +PFT +ISF +WDN +NWN +KEC +DKP +FYF +YFP +DSM +WEI +MSM +YII +MMN +PMP +RCC +CCP +CPT +SGW +GWT +NCP +CPG +GQH +IWN +WNC +NCY +CYS +YSR +HTF +HHA +RPW +PWH +WHN +HNQ +VQW +ECG +SMS +ANW +RAW +SFQ +QIH +HHR +REW +YEQ +CRP +FKM +GQM +WTR +ICL +MVW +ISW +IRY +LDM +RNI +QTH +THG +YVI +VPF +DAM +CWD +WDR +DRQ +MPF +PFG +CCI +AIC +CQP +GCW +WVI +QGW +NIG +CSV +AYY +STH +MGC +CFC +CLC +DYT +QVW 
+YIN +GQT +QWE +WES +QCH +CHP +VCR +WAY +EMF +MFI +TWC +WCV +FCF +SRC +RCH +HLT +TFF +IWF +WFY +NHN +PQT +QDN +TNH +FAW +RWQ +LWI +IAC +WNV +RHM +MEY +EYT +NVM +TWA +GWG +CQV +PSY +IYK +DTW +TWR +WRE +CSC +SCD +IWK +WKS +NYN +KNF +PNR +SQH +WFP +SWD +WDI +AWG +WVA +VMC +GWH +WHE +YCR +GMF +MFF +VTW +CDC +GYC +YCN +TMN +PCV +HCP +CCL +TQI +FCC +CLE +PRH +WST +MMD +YGH +SWA +PTW +TWD +INY +NYG +NCL +KWI +WIF +FGH +GKC +KCM +GWA +WAQ +FMY +MYY +YYQ +AKH +HKF +ECA +KHE +ICG +GMH +CTR +DHH +HNW +MIH +HPV +WMP +RVQ +DHC +GHD +AWE +WET +CHI +HIG +NTY +QNN +GAM +HLY +GWM +WMI +TCI +RYS +HNT +NQW +QWS +DPW +YSC +MPK +EWK +YAQ +AYW +MLY +CAR +KYM +YME +EYQ +MTN +NHY +ATM +FFQ +RPH +QMF +YGI +IFT +MMP +TDF +CHN +EWY +HSI +NFM +FMD +MDF +FKG +NKH +YMD +YDI +MCP +QYQ +TIW +VAH +HEF +FPI +YPF +HGN +SCR +NMG +MIN +HTD +MFW +WNH +MCI +WEN +SVH +CRV +KMD +FQM +QMI +HDS +CHG +WAV +PKW +PQW +QWP +TSY +QWR +TCR +SEY +SHY +HYQ +SFW +FWA +WAM +GFN +VMK +DYW +YWY +WYS +HWQ +WQT +VMH +MHF +PKY +VKW +WWS +HMM +MMT +MTY +KWD +TKW +KWS +HPM +MKP +KPH +HNH +QHA +MVY +YYT +EYY +WDF +GYH +WVY +YGF +GFH +HDP +NEW +WYE +YKY +YIE +EWN +GTG +YAS +CSY +GMN +RIW +IMD +TYE +DHF +HIK +PCC +CCE +CEW +GHY +DIM +TDK +KWN +QGF +GFM +ATY +TYG +YGW +MWR +NKF +WTG +PYL +YQN +QNT +RGI +GHI +HID +REM +YRC +QYS +DYG +CRY +GWD +GMC +MCA +CAF +EMC +MCK +DMM +MID +MFV +VGC +FGP +KAW +HLK +DWV +KYR +WTC +TCF +DSW +GTW +WVH +TNW +ICA +CAK +HIM +YCA +MLM +FRW +WGI +QFH +FHH +HHT +SCF +CFL +NHH +IDW +DWA +WAR +ARW +WHF +FWV +NTN +KKW +SAH +CGS +VWF +LWW +WWR +WRP +HRF +SHH +HGK +NYE +YEG +FSC +NAM +MKM +KMG +IGM +TWT +WTM +SWQ +WQD +PMM +FPP +FIH +HHD +RFA +NQM +CIG +IGQ +YSH +IIW +IPW +PWY +WYL +HKY +LWE +WEG +YDF +FGI +KGM +HSC +GTC +WPF +WWL +HQD +WGD +FKP +NYV +YMM +WGH +HQF +GGM +YDM +QGG +QDC +CDN +FDC +EQY +MQM +GYP +WLM +FEW +FAY +HLG +DGC +HLF +AHD +GFW +FWW +YTH +THY +HYK +TQF +FMH +DMF +YFF +VWG +WGV +TEM +RMP +FNF +RHY +HYC +YCE +MQI +FFG +YCK +CKH +YYN +PPH +CPS +GHH +KQN +FTC +GCK +CRG +QHH +QYC +MGM +DPM 
+MGI +EPY +QCM +CMQ +NIC +EYC +CRT +YPN +TRF +RDM +WPY +LYK +MNC +NCV +SCQ +CQA +GHN +HNN +AMF +LYN +NIW +ECF +FAC +ACA +RHV +EWM +WME +FEY +DDP +TMT +MTW +IPH +CNI +NIQ +NVW +VWK +DHE +RWN +FCT +PCW +DMW +YWF +FHG +YMG +DHW +HWK +VCK +SWK +RFF +CAG +MFT +VMY +MYN +PRM +RMC +CPM +THD +CKI +FWQ +WQV +MDY +RME +WGA +AYM +YYL +FFM +KNM +YCP +CPD +WDG +HNF +MGY +LEC +CLY +SWI +NLQ +GSN +PQC +QCV +PNW +IYY +NMI +EGH +HIP +WQG +MWS +HII +YNG +RMH +WKR +RHI +YNK +HQE +RWM +DHT +WEV +KCE +IWT +YRL +KCG +WSP +KGC +CKA +HDM +MPN +DKC +DNR +LCK +YRM +RFQ +FMM +WMS +QIW +HVC +NWD +MDG +FIW +DPC +HED +KWQ +WQQ +PVC +HLW +TYN +GHK +EWD +WDS +PHC +HCI +CIQ +IWD +WDV +AWC +IDC +DCA +CRR +WPM +QYD +HYR +HCW +CWS +WHI +WRD +KMP +VIW +AWK +WKH +HYP +FFY +QCI +CIY +FAH +NWG +FYE +TDW +DWK +CEP +HMV +DKW +YFY +VDW +HKS +MQH +FDH +CCM +MMM +DYN +WKQ +WCL +IHN +NNY +NYH +VNY +HTR +MNR +DMH +MHY +GTY +IFM +QRT +GCE +CEI +YEM +RMM +YTR +YAP +YMN +WGQ +WNM +GHM +WQI +NFD +WEY +FKY +HYD +HVW +AMM +RMY +QWV +MMQ +MSF +HFE +WER +HQM +VQH +YPQ +PQF +GHC +HMN +MNT +FFH +MMH +FIM +MIY +IYW +YWH +FMC +MCS +DWY +WYA +MWK +CMR +HYN +GWI +WIW +KWC +PWQ +RYC +THE +YQF +ERM +EWP +SWY +WYN +WKY +WEC +ECM +CME +RVW +VWC +WCK +RFY +NHM +KHC +GWC +HRW +RWI +WIK +FRC +HIH +RCW +CWP +CGI +FVC +VMN +KDW +AMH +MHQ +NQR +TCY +CYT +YTQ +HHV +AHY +QSM +LMN +FMT +MTH +GRN +NMQ +NGY +AWT +FCG +NMH +MHM +YCQ +NWC +CKT +VCE +HWH +NLH +VWW +PCR +RWY +WYF +YCM +QVM +QHT +HVR +RMN +QPW +FRH +HQK +YKF +MQN +KWE +TYV +HMR +ICH +KYT +TDM +CEY +CVC +PAC +NFK +KCF +YNC +QSW +WEW +WPW +YQH +NFH +MSY +YNP +DQP +HKH +MTK +KAH +VKC +YKW +GWW +WWP +MWG +VYC +YCG +HSW +WNE +CFI +CLM +CHK +RCQ +TCQ +PFA +NNC +QGC +MNY +NYM +KQM +QME +NCF +PDC +WAN +RPN +VCP +WIN +PPW +PWL +CRH +PWD +SYM +FGC +YIK +VNC +YTF +SNC +QHM +MEH +CQT +ITM +EYH +CQF +DYM +SMM +QMH +CYA +MAC +WVN +WAT +FWM +WMT +CCG +CYG +WAF +EPM +MVC +HWG +ELC +RCI +WQH +FWH +QWQ +AGW +NWY +WYC +CRW +CQS +LIW +CAQ +QMW +MWT +CER +ERC +VGW +IAH +NAQ +WIM +MKC +FQC +MWE +TQM 
+YHW +HWS +NYA +WMM +MMW +MWN +WNW +NWM +YEY +PCQ +HFW +FNY +NHR +NSC +TNG +HVM +HQW +EYW +IWE +HCE +PYH +YHD +YKQ +SWH +HAY +QMY +KIH +WFN +CSF +RCE +YCH +GRH +YNE +HQN +QPH +HYL +MHV +WIT +SCG +SPW +FHF +CIW +WAG +CTW +YAW +RHH +NFW +MNK +GEC +AHM +CYY +HEQ +MWV +IMR +FCD +HQC +CYF +MHC +PMC +HQY +WTH +QKC +HRC +HYF +CYL +HKC +WPS +WDC +FMQ +QHK +CFK +NEC +DNM +CQM +QMT +MDN +DCK +WDW +LHY +TKY +FPC +MDM +QWF +MDW +DWW +WWE +GLW +TWM +MSW +WEQ +WKN +PMQ +WAW +WMQ +DCY +CYR +CFH +HMS +IWW +WWI +PFW +WVC +ACY +MNS +CGC +GCM +TYY +YYS +MIM +MKW +HMI +FWE +MKH +MEW +SMY +MYH +HYI +CKN +NMM +RIM +SKH +YEW +CQR +RYH +HTM +WKT +KMN +FKH +TCK +WYI +HNP +NGC +MRN +FHW +EIW +KVH +WFE +YCY +AHW +TYW +YWR +WNA +EMG +CFF +HYT +FHQ +NKY +HHK +PCE +FCM +CMY +DHM +QQW +QWY +WYM +MRW +FPQ +MME +MYR +LWQ +GWY +WYD +HPW +YWD +CAH +EQW +QWK +WSH +NMC +PNE +FYH +QKM +HWE +WHD +RQW +SWW +WWA +MYS +KQW +WWT +CPQ +WIE +ACC +CCH +WEK +GMY +HFT +WTY +MMG +WTN +YYM +NTH +YCC +CCF +DYQ +WEM +WGT +NHF +CMS +WGS +MIW +YQM +IHM +QDH +TWQ +CAD +GNW +NWH +YYH +YYY +YFM +TPW +WED +MCR +YNM +WWD +MYV +YWM +SCM +CMM +NRC +RCT +CTN +YHM +QWC +WCT +TTW +TWW +WWY +WMG +YYC +WID +YVM +WIR +FYC +FWS +FYW +WTW +RCF +QQG +HMD +HEN +CKM +MKY +HCF +SQW +TYD +GIC +FQW +IFW +YQY +CCY +WAD +WSF +MYK +NDW +MIP +QWG +TCW +CWW +YLW +TQW +IHY +MQC +QCD +WTQ +MWW +VWM +WMK +GMW +MQW +NCQ +CQI +MRC +PWP +WTF +HVQ +HMC +DWQ +ILW +PWS +YHH +CPC +YHE +HAK +RNM +CEH +CMF +QHN +QCE +MDQ +DHQ +YTW +WLC +MCF +WFC +CFQ +YCW +CWE +MPW +WYK +MGF +FTM +CWK +HWF +PCT +MHN +HKW +WYV +DCW +CYQ +CAW +HWC +HWR +RSW +PYC +FKW +WFW +FMF +YMY +DCM +YDH +LWY +WKD +WRF +DKQ +QEC +WTE +CEM +GCY +MNH +CEQ +HYY +PYQ +QIC +GPW +PWW +MCD +WHR +NYW +QWM +CQQ +YHC +FCH +CHQ +QCF +NFC +PCN +PWG +CMI +CTM +QCP +WWN +TMC +CYW +EHC +CCR +FTQ +CNF +FDW +DWI +PWM +YWG +KMH +PWE +KWG +WGM +WHM +WPQ +CHY +VWR +WRH +CYC +AWY +DHN +CIC +CPW +ICP +QWD +CQW +CTY +WRC +WYW +MWL +CGH +HPC +PCY +EWH +QNM +PCM +QMM +WMY +WPN +WCE +HQH +CNN +CMW +PCK +QWH +NTC 
+HIC +CMC +MCQ +KHW +KCQ +MHK +CWG +HMT +WFM +IWC +CML +HWT +MHR +DQW +IQW +WVW +WPC +WHG +WYH +IEW +VHY +YQW +WDH +CHD +QPY +WKC +YDC +NHW +WDM +QPC +CKW +KWY +NCM +CQN +MYF +YMW +MMC +KMW +MWI +MHD +ECI +CMD +WCI +CGM +GCQ +MCE +WWF +WTT +HDC +FCQ +DMN +PWI +RMQ +WGW +WYP +MYM +HCC +CDQ +MNW +CMP +RCK +MWD +FPW +QTW +WNY +MCT +MHH +IWM +CFY +HYW +PHW +HWW +CFN +MWF +HCM +MWH +GYW +HAW +DWH +YWV +NMW +QEW +CNC +WDK +NKC +GCC +MPC +MCN +CCA +KWM +MCM +HWL +WSY +CKC +WMF +CWY +HCQ +WCA +HMK +DHD +YHY +DNW +WCD +WPI +WFD +WHW +WHC +HCY +WHQ +IMC +KPC +YMC +CRC +MCY +ECY +MCH +HWI +DCQ +PMW +LWC +CRM +DMC +MNF +HWY +YWW +YWC +WYY +EWC +FWC +FWY +WMN +WWV +EWW +WCM +CAM +WKM +WHH +YMF +WCQ +WIQ +MFN +ANC +ECW +WCG +CIM +WQC +CMH +MYC +CTH +HHW +QWW +WIC +CPY +MDC +NYC +CMN +WHK +MMY +DEW +QHW +WQW +CEC +TWH +HFC +WKW +HWM +MQY +HDW +WYG +CWM +CYH +HYM +QMC +QCW +NCW +YQC +FMW +WMC +WWW +HMW +RMW +CHW +WCW +HTW +CWC +WCY +YWQ +WMW +CWT +CWH +MWM +WWC +WCC +WCH +WWM From b479d5aff52adb580346ea70f3736e4ef876ac1a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 5 Oct 2024 12:54:10 +0200 Subject: [PATCH 061/112] remove absolete path for mocked open func --- tests/unit/dataset_classes/testTox21Challenge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/dataset_classes/testTox21Challenge.py b/tests/unit/dataset_classes/testTox21Challenge.py index fedde8e5..9ad2af21 100644 --- a/tests/unit/dataset_classes/testTox21Challenge.py +++ b/tests/unit/dataset_classes/testTox21Challenge.py @@ -38,7 +38,7 @@ def test_load_data_from_file(self, mock_sdmol_supplier: patch) -> None: mock_file = mock_open(read_data=Tox21ChallengeMockData.get_raw_train_data()) with patch("builtins.open", mock_file): with open( - r"G:\github-aditya0by0\chebai_data\tox21_challenge\tox21_10k_data_all.sdf\tox21_10k_data_all.sdf", + r"fake/path", "rb", ) as f: suppl = Chem.ForwardSDMolSupplier(f) From adedc093435a8fb53d5bdb8c0210f65204c4d45d Mon Sep 17 00:00:00 2001 From: 
aditya0by0 Date: Sat, 5 Oct 2024 16:17:58 +0200 Subject: [PATCH 062/112] test single label split scenario implemented in #54 --- .../dataset_classes/testChebiOverXPartial.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index 7720d301..76584ebf 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -104,6 +104,72 @@ def test_extract_class_hierarchy_with_bottom_cls( f"The graph nodes do not match the expected nodes for top class {self.chebi_extractor.top_class_id} hierarchy.", ) + @patch("pandas.DataFrame.to_csv") + @patch("pandas.read_pickle") + @patch.object(ChEBIOverXPartial, "_get_data_size", return_value=4.0) + @patch("torch.load") + @patch( + "builtins.open", + new_callable=mock_open, + read_data=ChebiMockOntology.get_raw_data(), + ) + def test_single_label_data_split( + self, mock_open, mock_load, mock_get_data_size, mock_read_pickle, mock_to_csv + ) -> None: + """ + Test the single-label data splitting functionality of the ChebiExtractor class. + + This test mocks several key methods (file operations, torch loading, and pandas functions) + to ensure that the class hierarchy is properly extracted, data is processed into a raw dataset, + and the data splitting logic works as intended without actual file I/O. + + It also verifies that there is no overlap between training, validation, and test sets. 
+ """ + self.chebi_extractor.top_class_id = 11111 + self.chebi_extractor.THRESHOLD = 3 + self.chebi_extractor.chebi_version_train = None + + graph: nx.DiGraph = self.chebi_extractor._extract_class_hierarchy("fake_path") + data_df = self.chebi_extractor._graph_to_raw_dataset(graph) + + mock_read_pickle.return_value = data_df + data_pt = self.chebi_extractor._load_data_from_file("fake/path") + + # Verify that the data contains only 1 label + self.assertEqual(len(data_pt[0]["labels"]), 1) + + mock_load.return_value = data_pt + + # Retrieve the data splits (train, validation, and test) + train_split = self.chebi_extractor.dynamic_split_dfs["train"] + validation_split = self.chebi_extractor.dynamic_split_dfs["validation"] + test_split = self.chebi_extractor.dynamic_split_dfs["test"] + + train_idents = set(train_split["ident"]) + val_idents = set(validation_split["ident"]) + test_idents = set(test_split["ident"]) + + # Ensure there is no overlap between train and test sets + self.assertEqual( + len(train_idents.intersection(test_idents)), + 0, + "Train and test sets should not overlap.", + ) + + # Ensure there is no overlap between validation and test sets + self.assertEqual( + len(val_idents.intersection(test_idents)), + 0, + "Validation and test sets should not overlap.", + ) + + # Ensure there is no overlap between train and validation sets + self.assertEqual( + len(train_idents.intersection(val_idents)), + 0, + "Train and validation sets should not overlap.", + ) + if __name__ == "__main__": unittest.main() From 65c2d9bd6cdd1241b2b9d2cb0c69bc892f760274 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 5 Oct 2024 17:04:10 +0200 Subject: [PATCH 063/112] test output format for Tox21MolNet._load_data_from_file --- tests/unit/dataset_classes/testTox21MolNet.py | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/unit/dataset_classes/testTox21MolNet.py b/tests/unit/dataset_classes/testTox21MolNet.py index 5d5f3497..86cbb752 100644 
--- a/tests/unit/dataset_classes/testTox21MolNet.py +++ b/tests/unit/dataset_classes/testTox21MolNet.py @@ -2,7 +2,10 @@ from typing import List from unittest.mock import MagicMock, mock_open, patch +import torch + from chebai.preprocessing.datasets.tox21 import Tox21MolNet +from chebai.preprocessing.reader import ChemDataReader from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData @@ -16,9 +19,7 @@ def setUpClass(cls, mock_makedirs: MagicMock) -> None: Args: mock_makedirs (MagicMock): Mocked `os.makedirs` function. """ - ReaderMock = MagicMock() - ReaderMock.name.return_value = "MockedReaderTox21MolNet" - Tox21MolNet.READER = ReaderMock + Tox21MolNet.READER = ChemDataReader cls.data_module = Tox21MolNet() @patch( @@ -28,20 +29,38 @@ def setUpClass(cls, mock_makedirs: MagicMock) -> None: ) def test_load_data_from_file(self, mock_open_file: mock_open) -> None: """ - Test the `_load_data_from_file` method for correct CSV parsing. + Test the `_load_data_from_file` method for correct output. Args: mock_open_file (mock_open): Mocked open function to simulate file reading. """ - expected_data = Tox21MolNetMockData.get_processed_data() actual_data = self.data_module._load_data_from_file("fake/file/path.csv") - self.assertEqual( - list(actual_data), - expected_data, - "The loaded data does not match the expected output from the file.", + first_instance = next(actual_data) + + # Check for required keys + required_keys = ["features", "labels", "ident"] + for key in required_keys: + self.assertIn( + key, first_instance, f"'{key}' key is missing in the output data." 
+ ) + + self.assertTrue( + all(isinstance(feature, int) for feature in first_instance["features"]), + "Not all elements in 'features' are integers.", ) + # Check that 'features' can be converted to a tensor + features = first_instance["features"] + try: + tensor_features = torch.tensor(features) + self.assertTrue( + tensor_features.ndim > 0, + "'features' should be convertible to a non-empty tensor.", + ) + except Exception as e: + self.fail(f"'features' cannot be converted to a tensor: {str(e)}") + @patch( "builtins.open", new_callable=mock_open, From a63c010f46cce5780d4f4068a01268ecec292e64 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 5 Oct 2024 17:40:10 +0200 Subject: [PATCH 064/112] DynamicDataset: check split stratification --- .../dataset_classes/testDynamicDataset.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/tests/unit/dataset_classes/testDynamicDataset.py b/tests/unit/dataset_classes/testDynamicDataset.py index e42c3e7e..c8846273 100644 --- a/tests/unit/dataset_classes/testDynamicDataset.py +++ b/tests/unit/dataset_classes/testDynamicDataset.py @@ -216,6 +216,142 @@ def test_get_train_val_splits_given_test_consistency(self) -> None: obj="Validation sets should be identical for the same seed.", ) + def test_get_test_split_stratification(self) -> None: + """ + Test that the split into train and test sets maintains the stratification of labels. 
+ """ + self.dataset.train_split = 0.5 + train_df, test_df = self.dataset.get_test_split(self.data_df, seed=0) + + number_of_labels = len(self.data_df["labels"][0]) + + # Check the label distribution in the original dataset + original_pos_count, original_neg_count = ( + self.get_positive_negative_labels_counts(self.data_df) + ) + total_count = len(self.data_df) * number_of_labels + + # Calculate the expected proportions + original_pos_proportion = original_pos_count / total_count + original_neg_proportion = original_neg_count / total_count + + # Check the label distribution in the train set + train_pos_count, train_neg_count = self.get_positive_negative_labels_counts( + train_df + ) + train_total_count = len(train_df) * number_of_labels + + # Calculate the train set proportions + train_pos_proportion = train_pos_count / train_total_count + train_neg_proportion = train_neg_count / train_total_count + + # Assert that the proportions are similar to the original dataset + self.assertAlmostEqual( + train_pos_proportion, + original_pos_proportion, + places=1, + msg="Train set labels should maintain original positive label proportion.", + ) + self.assertAlmostEqual( + train_neg_proportion, + original_neg_proportion, + places=1, + msg="Train set labels should maintain original negative label proportion.", + ) + + # Check the label distribution in the test set + test_pos_count, test_neg_count = self.get_positive_negative_labels_counts( + test_df + ) + test_total_count = len(test_df) * number_of_labels + + # Calculate the test set proportions + test_pos_proportion = test_pos_count / test_total_count + test_neg_proportion = test_neg_count / test_total_count + + # Assert that the proportions are similar to the original dataset + self.assertAlmostEqual( + test_pos_proportion, + original_pos_proportion, + places=1, + msg="Test set labels should maintain original positive label proportion.", + ) + self.assertAlmostEqual( + test_neg_proportion, + original_neg_proportion, + 
places=1, + msg="Test set labels should maintain original negative label proportion.", + ) + + def test_get_train_val_splits_given_test_stratification(self) -> None: + """ + Test that the split into train and validation sets maintains the stratification of labels. + """ + self.dataset.use_inner_cross_validation = False + self.dataset.train_split = 0.5 + df_train_main, test_df = self.dataset.get_test_split(self.data_df, seed=0) + train_df, val_df = self.dataset.get_train_val_splits_given_test( + df_train_main, test_df, seed=42 + ) + + number_of_labels = len(self.data_df["labels"][0]) + + # Check the label distribution in the original dataset + original_pos_count, original_neg_count = ( + self.get_positive_negative_labels_counts(self.data_df) + ) + total_count = len(self.data_df) * number_of_labels + + # Calculate the expected proportions + original_pos_proportion = original_pos_count / total_count + original_neg_proportion = original_neg_count / total_count + + # Check the label distribution in the train set + train_pos_count, train_neg_count = self.get_positive_negative_labels_counts( + train_df + ) + train_total_count = len(train_df) * number_of_labels + + # Calculate the train set proportions + train_pos_proportion = train_pos_count / train_total_count + train_neg_proportion = train_neg_count / train_total_count + + # Assert that the proportions are similar to the original dataset + self.assertAlmostEqual( + train_pos_proportion, + original_pos_proportion, + places=1, + msg="Train set labels should maintain original positive label proportion.", + ) + self.assertAlmostEqual( + train_neg_proportion, + original_neg_proportion, + places=1, + msg="Train set labels should maintain original negative label proportion.", + ) + + # Check the label distribution in the validation set + val_pos_count, val_neg_count = self.get_positive_negative_labels_counts(val_df) + val_total_count = len(val_df) * number_of_labels + + # Calculate the validation set proportions + 
val_pos_proportion = val_pos_count / val_total_count + val_neg_proportion = val_neg_count / val_total_count + + # Assert that the proportions are similar to the original dataset + self.assertAlmostEqual( + val_pos_proportion, + original_pos_proportion, + places=1, + msg="Validation set labels should maintain original positive label proportion.", + ) + self.assertAlmostEqual( + val_neg_proportion, + original_neg_proportion, + places=1, + msg="Validation set labels should maintain original negative label proportion.", + ) + @staticmethod def get_positive_negative_labels_counts(df: pd.DataFrame) -> Tuple[int, int]: """ From 7fc96a939c839a2f080b9ef42f21847a5ea51a1f Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 5 Oct 2024 19:12:42 +0200 Subject: [PATCH 065/112] set weights_only parameter of torch.load to False - #48 --- chebai/models/electra.py | 8 ++++++-- chebai/preprocessing/datasets/base.py | 14 ++++++++++---- chebai/preprocessing/datasets/chebi.py | 7 +++++-- chebai/preprocessing/datasets/go_uniprot.py | 4 +++- chebai/preprocessing/datasets/pubchem.py | 4 ++-- .../migration/chebi_data_migration.py | 2 +- chebai/result/analyse_sem.py | 4 +++- chebai/result/base.py | 2 +- chebai/result/pretraining.py | 2 +- chebai/result/utils.py | 2 ++ tests/testCustomBalancedAccuracyMetric.py | 6 +++++- tests/testCustomMacroF1Metric.py | 6 +++++- tests/testPubChemData.py | 12 +++++++++--- tests/testTox21MolNetData.py | 12 +++++++++--- tutorials/demo_process_results.ipynb | 10 +++++----- tutorials/process_results_old_chebi.ipynb | 2 +- 16 files changed, 68 insertions(+), 29 deletions(-) diff --git a/chebai/models/electra.py b/chebai/models/electra.py index 3b2807c8..7009406d 100644 --- a/chebai/models/electra.py +++ b/chebai/models/electra.py @@ -256,7 +256,9 @@ def __init__( # Load pretrained checkpoint if provided if pretrained_checkpoint: with open(pretrained_checkpoint, "rb") as fin: - model_dict = torch.load(fin, map_location=self.device) + model_dict = torch.load( + 
fin, map_location=self.device, weights_only=False + ) if load_prefix: state_dict = filter_dict(model_dict["state_dict"], load_prefix) else: @@ -414,7 +416,9 @@ def __init__(self, cone_dimensions=20, **kwargs): model_prefix = kwargs.get("load_prefix", None) if pretrained_checkpoint: with open(pretrained_checkpoint, "rb") as fin: - model_dict = torch.load(fin, map_location=self.device) + model_dict = torch.load( + fin, map_location=self.device, weights_only=False + ) if model_prefix: state_dict = { str(k)[len(model_prefix) :]: v diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index a2997699..f163a9e6 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -200,7 +200,9 @@ def load_processed_data( filename = self.processed_file_names_dict[kind] except NotImplementedError: filename = f"{kind}.pt" - return torch.load(os.path.join(self.processed_dir, filename)) + return torch.load( + os.path.join(self.processed_dir, filename), weights_only=False + ) def dataloader(self, kind: str, **kwargs) -> DataLoader: """ @@ -519,7 +521,7 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader: DataLoader: DataLoader object for the specified subset. 
""" subdatasets = [ - torch.load(os.path.join(s.processed_dir, f"{kind}.pt")) + torch.load(os.path.join(s.processed_dir, f"{kind}.pt"), weights_only=False) for s in self.subsets ] dataset = [ @@ -1022,7 +1024,9 @@ def _retrieve_splits_from_csv(self) -> None: splits_df = pd.read_csv(self.splits_file_path) filename = self.processed_file_names_dict["data"] - data = torch.load(os.path.join(self.processed_dir, filename)) + data = torch.load( + os.path.join(self.processed_dir, filename), weights_only=False + ) df_data = pd.DataFrame(data) train_ids = splits_df[splits_df["split"] == "train"]["id"] @@ -1081,7 +1085,9 @@ def load_processed_data( # If filename is provided try: - return torch.load(os.path.join(self.processed_dir, filename)) + return torch.load( + os.path.join(self.processed_dir, filename), weights_only=False + ) except FileNotFoundError: raise FileNotFoundError(f"File {filename} doesn't exist") diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 727f9f64..9d80929a 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -407,7 +407,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ try: filename = self.processed_file_names_dict["data"] - data_chebi_version = torch.load(os.path.join(self.processed_dir, filename)) + data_chebi_version = torch.load( + os.path.join(self.processed_dir, filename), weights_only=False + ) except FileNotFoundError: raise FileNotFoundError( f"File data.pt doesn't exists. 
" @@ -428,7 +430,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: data_chebi_train_version = torch.load( os.path.join( self._chebi_version_train_obj.processed_dir, filename_train - ) + ), + weights_only=False, ) except FileNotFoundError: raise FileNotFoundError( diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py index 574ecdbd..dba9940e 100644 --- a/chebai/preprocessing/datasets/go_uniprot.py +++ b/chebai/preprocessing/datasets/go_uniprot.py @@ -508,7 +508,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ try: filename = self.processed_file_names_dict["data"] - data_go = torch.load(os.path.join(self.processed_dir, filename)) + data_go = torch.load( + os.path.join(self.processed_dir, filename), weights_only=False + ) except FileNotFoundError: raise FileNotFoundError( f"File data.pt doesn't exists. " diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py index 5ba76cc4..c82ea42f 100644 --- a/chebai/preprocessing/datasets/pubchem.py +++ b/chebai/preprocessing/datasets/pubchem.py @@ -891,10 +891,10 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader: DataLoader: DataLoader instance. 
""" labeled_data = torch.load( - os.path.join(self.labeled.processed_dir, f"{kind}.pt") + os.path.join(self.labeled.processed_dir, f"{kind}.pt"), weights_only=False ) unlabeled_data = torch.load( - os.path.join(self.unlabeled.processed_dir, f"{kind}.pt") + os.path.join(self.unlabeled.processed_dir, f"{kind}.pt"), weights_only=False ) if self.data_limit is not None: labeled_data = labeled_data[: self.data_limit] diff --git a/chebai/preprocessing/migration/chebi_data_migration.py b/chebai/preprocessing/migration/chebi_data_migration.py index 5a438b44..a057326a 100644 --- a/chebai/preprocessing/migration/chebi_data_migration.py +++ b/chebai/preprocessing/migration/chebi_data_migration.py @@ -168,7 +168,7 @@ def _combine_pt_splits( df_list: List[pd.DataFrame] = [] for split, file_name in old_splits_file_names.items(): file_path = os.path.join(old_dir, file_name) - file_df = pd.DataFrame(torch.load(file_path)) + file_df = pd.DataFrame(torch.load(file_path, weights_only=False)) df_list.append(file_df) return pd.concat(df_list, ignore_index=True) diff --git a/chebai/result/analyse_sem.py b/chebai/result/analyse_sem.py index 64ac87a1..6adb1066 100644 --- a/chebai/result/analyse_sem.py +++ b/chebai/result/analyse_sem.py @@ -427,7 +427,9 @@ def run_all( os.path.join(buffer_dir_smoothed, "preds000.pt") ): preds = torch.load( - os.path.join(buffer_dir_smoothed, "preds000.pt"), DEVICE + os.path.join(buffer_dir_smoothed, "preds000.pt"), + DEVICE, + weights_only=False, ) labels = None else: diff --git a/chebai/result/base.py b/chebai/result/base.py index 487be6ac..9d583a00 100644 --- a/chebai/result/base.py +++ b/chebai/result/base.py @@ -54,7 +54,7 @@ def _generate_predictions(self, data_path, raw=False, **kwargs): else: data_tuples = [ (x.get("raw_features", x["ident"]), x["ident"], x) - for x in torch.load(data_path) + for x in torch.load(data_path, weights_only=False) ] for raw_features, ident, row in tqdm.tqdm(data_tuples): diff --git a/chebai/result/pretraining.py 
b/chebai/result/pretraining.py index 33c212c8..8d712f21 100644 --- a/chebai/result/pretraining.py +++ b/chebai/result/pretraining.py @@ -34,7 +34,7 @@ def evaluate_model(logs_base_path, model_filename, data_module): collate = data_module.reader.COLLATOR() test_file = "test.pt" data_path = os.path.join(data_module.processed_dir, test_file) - data_list = torch.load(data_path) + data_list = torch.load(data_path, weights_only=False) preds_list = [] labels_list = [] diff --git a/chebai/result/utils.py b/chebai/result/utils.py index 31063747..d015bd80 100644 --- a/chebai/result/utils.py +++ b/chebai/result/utils.py @@ -182,6 +182,7 @@ def load_results_from_buffer( torch.load( os.path.join(buffer_dir, filename), map_location=torch.device(device), + weights_only=False, ) ) i += 1 @@ -194,6 +195,7 @@ def load_results_from_buffer( torch.load( os.path.join(buffer_dir, filename), map_location=torch.device(device), + weights_only=False, ) ) i += 1 diff --git a/tests/testCustomBalancedAccuracyMetric.py b/tests/testCustomBalancedAccuracyMetric.py index 30cbe1d5..033227df 100644 --- a/tests/testCustomBalancedAccuracyMetric.py +++ b/tests/testCustomBalancedAccuracyMetric.py @@ -49,7 +49,9 @@ def test_metric_against_realistic_data(self) -> None: # load single file to get the num of labels for metric class instantiation labels = torch.load( - f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device) + f"{directory_path}/labels{0:03d}.pt", + map_location=torch.device(self.device), + weights_only=False, ) num_labels = labels.shape[1] balanced_acc_custom = BalancedAccuracy(num_labels=num_labels) @@ -58,10 +60,12 @@ def test_metric_against_realistic_data(self) -> None: labels = torch.load( f"{directory_path}/labels{i:03d}.pt", map_location=torch.device(self.device), + weights_only=False, ) preds = torch.load( f"{directory_path}/preds{i:03d}.pt", map_location=torch.device(self.device), + weights_only=False, ) balanced_acc_custom.update(preds, labels) diff --git 
a/tests/testCustomMacroF1Metric.py b/tests/testCustomMacroF1Metric.py index a7bbbaa2..1c67d54b 100644 --- a/tests/testCustomMacroF1Metric.py +++ b/tests/testCustomMacroF1Metric.py @@ -119,7 +119,9 @@ def test_metric_against_realistic_data(self) -> None: # Load single file to get the number of labels for metric class instantiation labels = torch.load( - f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device) + f"{directory_path}/labels{0:03d}.pt", + map_location=torch.device(self.device), + weights_only=False, ) num_labels = labels.shape[1] macro_f1_custom = MacroF1(num_labels=num_labels) @@ -130,10 +132,12 @@ def test_metric_against_realistic_data(self) -> None: labels = torch.load( f"{directory_path}/labels{i:03d}.pt", map_location=torch.device(self.device), + weights_only=False, ) preds = torch.load( f"{directory_path}/preds{i:03d}.pt", map_location=torch.device(self.device), + weights_only=False, ) macro_f1_standard.update(preds, labels) macro_f1_custom.update(preds, labels) diff --git a/tests/testPubChemData.py b/tests/testPubChemData.py index dfc43028..71591f6e 100644 --- a/tests/testPubChemData.py +++ b/tests/testPubChemData.py @@ -37,9 +37,15 @@ def getDataSplitsOverlaps(cls) -> None: processed_path = os.path.join(os.getcwd(), cls.pubChem.processed_dir) print(f"Checking Data from - {processed_path}") - train_set = torch.load(os.path.join(processed_path, "train.pt")) - val_set = torch.load(os.path.join(processed_path, "validation.pt")) - test_set = torch.load(os.path.join(processed_path, "test.pt")) + train_set = torch.load( + os.path.join(processed_path, "train.pt"), weights_only=False + ) + val_set = torch.load( + os.path.join(processed_path, "validation.pt"), weights_only=False + ) + test_set = torch.load( + os.path.join(processed_path, "test.pt"), weights_only=False + ) train_smiles, train_smiles_ids = cls.get_features_ids(train_set) val_smiles, val_smiles_ids = cls.get_features_ids(val_set) diff --git a/tests/testTox21MolNetData.py 
b/tests/testTox21MolNetData.py index 99424e83..36fcb431 100644 --- a/tests/testTox21MolNetData.py +++ b/tests/testTox21MolNetData.py @@ -37,9 +37,15 @@ def getDataSplitsOverlaps(cls) -> None: processed_path = os.path.join(os.getcwd(), cls.tox21.processed_dir) print(f"Checking Data from - {processed_path}") - train_set = torch.load(os.path.join(processed_path, "train.pt")) - val_set = torch.load(os.path.join(processed_path, "validation.pt")) - test_set = torch.load(os.path.join(processed_path, "test.pt")) + train_set = torch.load( + os.path.join(processed_path, "train.pt"), weights_only=False + ) + val_set = torch.load( + os.path.join(processed_path, "validation.pt"), weights_only=False + ) + test_set = torch.load( + os.path.join(processed_path, "test.pt"), weights_only=False + ) train_smiles, train_smiles_ids = cls.get_features_ids(train_set) val_smiles, val_smiles_ids = cls.get_features_ids(val_set) diff --git a/tutorials/demo_process_results.ipynb b/tutorials/demo_process_results.ipynb index ee0c1ec9..bf7810cc 100644 --- a/tutorials/demo_process_results.ipynb +++ b/tutorials/demo_process_results.ipynb @@ -248,9 +248,9 @@ "# check if pretraining datasets overlap\n", "dm = PubChemDeepSMILES()\n", "processed_path = dm.processed_dir\n", - "test_set = torch.load(os.path.join(processed_path, \"test.pt\"))\n", - "val_set = torch.load(os.path.join(processed_path, \"validation.pt\"))\n", - "train_set = torch.load(os.path.join(processed_path, \"train.pt\"))\n", + "test_set = torch.load(os.path.join(processed_path, \"test.pt\"), weights_only=False)\n", + "val_set = torch.load(os.path.join(processed_path, \"validation.pt\"), weights_only=False)\n", + "train_set = torch.load(os.path.join(processed_path, \"train.pt\"), weights_only=False)\n", "print(processed_path)\n", "test_smiles = [entry[\"features\"] for entry in test_set]\n", "val_smiles = [entry[\"features\"] for entry in val_set]\n", @@ -320,7 +320,7 @@ "data_module_v200 = ChEBIOver100()\n", "data_module_v148 = 
ChEBIOver100(chebi_version_train=148)\n", "data_module_v227 = ChEBIOver100(chebi_version_train=227)\n", - "# dataset = torch.load(data_path)\n", + "# dataset = torch.load(data_path, weights_only=False)\n", "# processors = [CustomResultsProcessor()]\n", "# factory = ResultFactory(model, data_module, processors)\n", "# factory.execute(data_path)" @@ -653,7 +653,7 @@ " if test_file is None:\n", " test_file = data_module.processed_file_names_dict[\"test\"]\n", " data_path = os.path.join(data_module.processed_dir, test_file)\n", - " data_list = torch.load(data_path)\n", + " data_list = torch.load(data_path, weights_only=False)\n", " preds_list = []\n", " labels_list = []\n", " # if common_classes_mask is not N\n", diff --git a/tutorials/process_results_old_chebi.ipynb b/tutorials/process_results_old_chebi.ipynb index c8af0860..e72baf4c 100644 --- a/tutorials/process_results_old_chebi.ipynb +++ b/tutorials/process_results_old_chebi.ipynb @@ -167,7 +167,7 @@ " if test_file is None:\n", " test_file = data_module.processed_file_names_dict[\"test\"]\n", " data_path = os.path.join(data_module.processed_dir, test_file)\n", - " data_list = torch.load(data_path)\n", + " data_list = torch.load(data_path, weights_only=False)\n", " preds_list = []\n", " labels_list = []\n", "\n", From 242db56e2331a84a569f154a9215961ca210ad78 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 5 Oct 2024 23:48:13 +0200 Subject: [PATCH 066/112] re-order section 3 and 4 as per suggestion --- tutorials/data_exploration_chebi.ipynb | 365 +++++++++++++++---------- 1 file changed, 216 insertions(+), 149 deletions(-) diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb index 6a7e25ed..87818cba 100644 --- a/tutorials/data_exploration_chebi.ipynb +++ b/tutorials/data_exploration_chebi.ipynb @@ -1,8 +1,9 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, "source": [ "# Introduction\n", "\n", @@ 
-14,40 +15,47 @@ "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", "\n", "---\n" - ], - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "4550d01fc7af5ae4", + "metadata": {}, "source": [ "# 1. Instantiation of a Data Class\n", "\n", "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data." - ], - "id": "4550d01fc7af5ae4" + ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": 1, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, "outputs": [], - "execution_count": 18, - "source": "from chebai.preprocessing.datasets.chebi import ChEBIOver50", - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22" + "source": [ + "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:07:26.371796Z", + "start_time": "2024-10-05T21:07:26.058728Z" + } + }, "outputs": [], "source": [ "chebi_class = ChEBIOver50(chebi_version=231)" ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", + "metadata": {}, "source": [ "\n", "### Inheritance Hierarchy\n", @@ -73,12 +81,12 @@ "### Additional Input Parameters\n", "\n", "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of 
additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" - ], - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", + "metadata": {}, "source": [ "# Available ChEBI Data Classes\n", "\n", @@ -91,8 +99,7 @@ "\n", "Finally, `ChEBIOver50Partial` selects extracts a part of ChEBI based on a given top class, with a threshold of 50 for selecting labels.\n", "This class inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" - ], - "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a" + ] }, { "cell_type": "markdown", @@ -103,25 +110,25 @@ ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "ed973fb59df11849", + "metadata": {}, "source": [ "# 2. Preparation / Setup Methods\n", "\n", "Now we have a ChEBI data class with all the relevant parameters. Next, we need to generate the actual dataset." - ], - "id": "ed973fb59df11849" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "d0a58e2bd9c0e6d9", + "metadata": {}, + "outputs": [], "source": [ "chebi_class.prepare_data()\n", "chebi_class.setup()" - ], - "id": "d0a58e2bd9c0e6d9" + ] }, { "cell_type": "markdown", @@ -163,37 +170,10 @@ }, { "cell_type": "markdown", - "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", + "id": "bb6e9a81554368f7", "metadata": {}, "source": [ - "# 3. Different Data Files Created and their Structure\n", - "\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. 
Let’s explore these files and their structures.\n", - "\n", - "### Data Files\n", - "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", - " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", - "\n", - "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", - "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "4. **`classes.txt`**\n", - " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", - "\n", - "5. **`splits.csv`**\n", - " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", - "\n", - "### File Structure and Preprocessing Stages\n", + "# 3. 
Overview of the 3 preprocessing stages\n", "\n", "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", "\n", @@ -214,34 +194,28 @@ " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", - "\n", "### Summary of File Paths\n", "\n", "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", "\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments." - ] - }, - { - "cell_type": "markdown", - "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", - "metadata": {}, - "source": [ - "---" + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" ] }, { "cell_type": "markdown", - "id": "74adb549-9e02-472d-a535-78a584853b52", + "id": "7e172c0d1e8bb93f", "metadata": {}, "source": [ - "# 4. Information Stored in the Files\n" + "# 4. Data Files and their structure\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n" ] }, { @@ -249,13 +223,10 @@ "id": "43329709-5134-4ce5-88e7-edd2176bf84d", "metadata": {}, "source": [ - "## chebi.obo\n", + "## chebi.obo File\n", "\n", - "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", - "\n", - "### Structure of `chebi.obo`\n", - "\n", - "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "**Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. 
This file serves as the foundation for data processing.\n", + " \n", "\n", "#### Example of a Term Document\n", "\n", @@ -269,6 +240,14 @@ "is_a: CHEBI:33238\n", "```\n", "\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "\n", + "### Structure of `chebi.obo`\n", + "\n", + "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "\n", + "\n", "### Breakdown of Attributes\n", "\n", "Each term document in the `chebi.obo` file consists of the following key attributes:\n", @@ -291,46 +270,46 @@ }, { "cell_type": "markdown", - "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "id": "558295e5a7ded456", "metadata": {}, "source": [ - "## `data.pkl` File\n", - "\n", - "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n", - "\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the ID of each ChEBI data instance.\n", - "- **Column 1**: Contains the name of each ChEBI data instance.\n", - "- **Column 2**: Contains the SMILES representation of the chemical.\n", - "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", + "## data.pkl File\n", "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + "**Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values." 
] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 6, "id": "fd490270-59b8-4c1c-8b09-204defddf592", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:09:01.622317Z", + "start_time": "2024-10-05T21:09:01.606698Z" + } + }, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "import os" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 10, "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:11:51.296162Z", + "start_time": "2024-10-05T21:11:44.559304Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Size of the data (rows x columns): (129184, 1335)\n" + "Size of the data (rows x columns): (185007, 1514)\n" ] }, { @@ -358,23 +337,23 @@ " name\n", " SMILES\n", " 1722\n", + " 2440\n", " 2468\n", " 2571\n", " 2580\n", " 2634\n", " 3098\n", - " 3992\n", " ...\n", - " 143017\n", - " 143212\n", - " 143813\n", - " 146180\n", - " 147334\n", - " 156473\n", - " 166828\n", - " 166904\n", - " 167497\n", - " 167559\n", + " 176910\n", + " 177333\n", + " 183508\n", + " 183509\n", + " 189832\n", + " 189840\n", + " 192499\n", + " 194321\n", + " 197504\n", + " 229684\n", " \n", " \n", " \n", @@ -500,73 +479,91 @@ " \n", " \n", "\n", - "

5 rows × 1335 columns

\n", + "

5 rows × 1514 columns

\n", "" ], "text/plain": [ - " id name SMILES 1722 2468 2571 2580 2634 \\\n", + " id name SMILES 1722 2440 2468 2571 2580 \\\n", "0 33429 monoatomic monoanion [*-] False False False False False \n", "1 30151 aluminide(1-) [Al-] False False False False False \n", "2 16042 halide anion [*-] False False False False False \n", "3 17051 fluoride [F-] False False False False False \n", "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", "\n", - " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", + " 2634 3098 ... 176910 177333 183508 183509 189832 189840 192499 \\\n", "0 False False ... False False False False False False False \n", "1 False False ... False False False False False False False \n", "2 False False ... False False False False False False False \n", "3 False False ... False False False False False False False \n", "4 False False ... False False False False False False False \n", "\n", - " 166904 167497 167559 \n", + " 194321 197504 229684 \n", "0 False False False \n", "1 False False False \n", "2 False False False \n", "3 False False False \n", "4 False False False \n", "\n", - "[5 rows x 1335 columns]" + "[5 rows x 1514 columns]" ] }, - "execution_count": 53, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", + "pkl_df = pd.DataFrame(\n", + " pd.read_pickle(\n", + " os.path.join(\n", + " chebi_class.processed_dir_main,\n", + " chebi_class.processed_dir_main_file_names_dict[\"data\"],\n", + " )\n", + " )\n", + ")\n", "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", "pkl_df.head()" ] }, { "cell_type": "markdown", - "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", "metadata": {}, "source": [ - "## `data.pt` File\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", "\n", - "The `data.pt` file is an 
important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", "\n", - "### Structure of `data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "### Structure of `data.pkl`\n", + "`data.pkl` has the following structure: \n", + "- **Column 0**: Contains the ID of each ChEBI data instance.\n", + "- **Column 1**: Contains the name of each ChEBI data instance.\n", + "- **Column 2**: Contains the SMILES representation of the chemical.\n", + "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ba019d2d4324bd0b", + "metadata": {}, + "source": [ + "## data.pt File\n", "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", "\n", - "- **`ident`**: \n", - " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" + "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library, specifically as a list of dictionaries. Each dictionary in this list includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input." 
] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 11, "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:12:49.338943Z", + "start_time": "2024-10-05T21:12:49.323319Z" + } + }, "outputs": [], "source": [ "import torch" @@ -574,9 +571,14 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 13, "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:14:12.892845Z", + "start_time": "2024-10-05T21:13:59.859953Z" + } + }, "outputs": [ { "name": "stdout", @@ -587,15 +589,25 @@ } ], "source": [ - "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", + "data_pt = torch.load(\n", + " os.path.join(\n", + " chebi_class.processed_dir, chebi_class.processed_file_names_dict[\"data\"]\n", + " ),\n", + " weights_only=False,\n", + ")\n", "print(\"Type of loaded data:\", type(data_pt))" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 15, "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:14:21.185027Z", + "start_time": "2024-10-05T21:14:21.169358Z" + } + }, "outputs": [ { "name": "stdout", @@ -616,36 +628,61 @@ }, { "cell_type": "markdown", - "id": "861da1c3-0401-49f0-a22f-109814ed95d5", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", "metadata": {}, "source": [ - "## `classes.txt` File\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", "\n", - "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. 
Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", "\n", - "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" + ] + }, + { + "cell_type": "markdown", + "id": "186ec6f0eed6ecf7", + "metadata": {}, + "source": [ + "## classes.txt File\n", + "\n", + "**Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset." 
] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 16, "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:15:19.146285Z", + "start_time": "2024-10-05T21:15:18.503284Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1722\n", + "2440\n", "2468\n", "2571\n", - "2580\n", - "2634\n" + "2580\n" ] } ], "source": [ - "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", + "with open(os.path.join(chebi_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n", " for i in range(5):\n", " line = file.readline()\n", " print(line.strip())" @@ -653,19 +690,37 @@ }, { "cell_type": "markdown", - "id": "b058714f-e434-4367-89b9-74c129ac727f", + "id": "861da1c3-0401-49f0-a22f-109814ed95d5", "metadata": {}, "source": [ - "## `splits.csv` File\n", "\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", + "\n", + "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. 
Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", + "\n", + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ] + }, + { + "cell_type": "markdown", + "id": "fb72be449e52b63f", + "metadata": {}, + "source": [ + "## splits.csv File\n", + "\n", + "**Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`." ] }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 17, "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:15:54.575116Z", + "start_time": "2024-10-05T21:15:53.945139Z" + } + }, "outputs": [ { "data": { @@ -731,16 +786,28 @@ "4 30340 train" ] }, - "execution_count": 98, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", + "csv_df = pd.read_csv(os.path.join(chebi_class.processed_dir_main, \"splits.csv\"))\n", "csv_df.head()" ] }, + { + "cell_type": "markdown", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "\n", + "\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n" + ] + }, { "cell_type": "markdown", "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", From 748eebedc354f64c84932d3d722a4766e41edae5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 6 Oct 2024 12:02:21 +0200 Subject: [PATCH 067/112] GO: reformat section 3 and 4 as per suggestion --- tutorials/data_exploration_go.ipynb | 625 ++++++++++++++++------------ 1 file changed, 364 insertions(+), 261 deletions(-) diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb index 8dc4cb44..e60e972b 100644 --- a/tutorials/data_exploration_go.ipynb +++ b/tutorials/data_exploration_go.ipynb @@ -1,26 +1,67 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "da687d32ba48b188", + "metadata": {}, "source": [ "# Introduction\n", "\n", - "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on Gene Ontology (GO) and Swiss UniProt Protein data. This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the GO-UniProt dataset. It covers\n", + "- how to instantiate a data class and generate data\n", + "- how the data is processed and stored\n", + "- and how to work with different molecule encodings.\n", "\n", - "One key aspect of the package is its **dataset management system**. 
In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", "\n", - "---" - ], - "id": "da687d32ba48b188" + "---\n" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "0bd07c91-bb02-48d4-b759-aa35ecb224bd", + "metadata": {}, "source": [ "# 1. Instantiation of a Data Class\n", "\n", - "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n", + "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. 
This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "440f203ceaf7e4b7", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:25:03.920610Z", + "start_time": "2024-09-30T21:25:03.622407Z" + } + }, + "outputs": [], + "source": [ + "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a648346d81d0dc5e", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:25:08.863132Z", + "start_time": "2024-09-30T21:25:08.387739Z" + } + }, + "outputs": [], + "source": [ + "go_class = GOUniProtOver250(go_branch=\"BP\")" + ] + }, + { + "cell_type": "markdown", + "id": "64585012b0d7f66f", + "metadata": {}, + "source": [ "### Inheritance Hierarchy\n", "\n", "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", @@ -37,6 +78,11 @@ "Data classes related to proteins can be configured using the following main parameters:\n", "\n", "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", + " - **`\"BP\"`**: Biological Process branch.\n", + " - **`\"MF\"`**: Molecular Function branch.\n", + " - **`\"CC\"`**: Cellular Component branch.\n", + "\n", + "This allows for more specific datasets focused on a particular aspect of gene function.\n", "\n", "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", "\n", @@ -44,69 +90,52 @@ "\n", "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n", "\n", - "### Available GOUniProt Data Classes\n", - "\n", - "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n", - "\n", - "#### `GOUniProtOver250`\n", "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", + "# Available GOUniProt Data Classes\n", "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n", "\n", - "#### `GOUniProtOver50`\n", + "There is a range of available dataset classes for GOUniProt classes. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. 
The selection process is based on the annotations of the GO terms with their ancestors across the dataset.\n", "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", + "Refer to the `select_classes` method of `_GOUniProtOverX` for more details on the selection process.\n", "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n" - ], - "id": "64585012b0d7f66f" + "If you need a different threshold, you can create your own subclass." + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "### Instantiation Example", - "id": "605bbca601037df2" + "id": "651ab5c39833bd2c", + "metadata": {}, + "source": [ + "---" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:25:03.920610Z", - "start_time": "2024-09-30T21:25:03.622407Z" - } - }, - "cell_type": "code", - "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250", - "id": "440f203ceaf7e4b7", - "outputs": [], - "execution_count": 12 + "cell_type": "markdown", + "id": "a52b4363-7398-44aa-a4cc-8bba14bdd966", + "metadata": {}, + "source": [ + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is to generate the actual dataset." + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:25:08.863132Z", - "start_time": "2024-09-30T21:25:08.387739Z" - } - }, "cell_type": "code", - "source": "go_class = GOUniProtOver250()", - "id": "a648346d81d0dc5e", + "execution_count": null, + "id": "9f77351090560bc4", + "metadata": {}, "outputs": [], - "execution_count": 13 + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "---", - "id": "651ab5c39833bd2c" - }, - { + "id": "2328e824c4dafb2d", "metadata": {}, - "cell_type": "markdown", "source": [ - "# 2. 
Preparation / Setup Methods\n", - "\n", - "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", "### Automatic Execution: \n", "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", "\n", @@ -130,81 +159,86 @@ " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", "\n", "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." - ], - "id": "2328e824c4dafb2d" + ] }, { + "cell_type": "markdown", + "id": "db5b58f2d96823fc", "metadata": {}, - "cell_type": "code", "source": [ - "go_class.prepare_data()\n", - "go_class.setup()" - ], - "id": "9f77351090560bc4", - "outputs": [], - "execution_count": null + "---" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "---", - "id": "db5b58f2d96823fc" - }, - { + "id": "ee174b61b36c71aa", "metadata": {}, - "cell_type": "markdown", "source": [ - "# 3. GOUniProt Data File Structure\n", + "# 3. Overview of the 3 preprocessing stages\n", "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", - " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", + "\n", + "1. 
**Raw Data Stage**:\n", + " - **File**: `go-basic.obo` and `uniprot_sprot.dat`\n", + " - **Description**: This stage contains the raw GO ontology data and raw Swiss-UniProt data, serving as the initial input for further processing.\n", " - **File Paths**:\n", - " - `data/GO_UniProt/raw/${filename}.obo`\n", - " - `data/GO_UniProt/raw/${filename}.dat`\n", + " - `data/GO_UniProt/raw/go-basic.obo`\n", + " - `data/GO_UniProt/raw/uniprot_sprot.dat`\n", "\n", - "2. **`data.pkl`**\n", - " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + "2. **Processed Data Stage 1**:\n", + " - **File**: `data.pkl`\n", + " - **Description**: This stage includes the data after initial processing. It contains sequence strings, class columns, and metadata but lacks data splits.\n", " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + " - **Additional File**: `classes.txt` - A file listing the relevant GO classes.\n", "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", + "3. **Processed Data Stage 2**:\n", + " - **File**: `data.pt`\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", "\n", - "4. **`classes.txt`**\n", - " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. 
It ensures that only the relevant classes are included in the dataset for analysis.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n", "\n", - "5. **`splits.csv`**\n", - " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "### Summary of File Paths\n", "\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n" - ], - "id": "ee174b61b36c71aa" + "- **Raw Data**: `data/GO_UniProt/raw`\n", + "- **Processed Data 1**: `data/GO_UniProt/${dataset_name}/processed`\n", + "- **Processed Data 2**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}`\n", + "\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "---", - "id": "a927ad484c930960" + "id": "a927ad484c930960", + "metadata": {}, + "source": [ + "---" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "# 4. Information Stored in the Files", - "id": "3f92b58e460c08fd" + "id": "3f92b58e460c08fd", + "metadata": {}, + "source": [ + "# 4. Data Files and their structure\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "cca75d881cb8bade", + "metadata": {}, "source": [ - "## go-basic.obo\n", + "## go-basic.obo File\n", "\n", - "The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", - "\n", - "### Structure of `go-basic.obo`\n", - "\n", - "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. 
These attributes include identifiers, names, relationships to other terms, and more.\n", + "**Description**: The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", "\n", "#### Example of a Term Document\n", "\n", @@ -219,6 +253,14 @@ "is_a: GO:0031506 ! cell wall glycoprotein biosynthetic process\n", "```\n", "\n", + "**File Path**: `data/GO_UniProt/raw/go-basic.obo`\n", + "\n", + "### Structure of `go-basic.obo`\n", + "\n", + "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", + "\n", + "\n", + "\n", "### Breakdown of Attributes\n", "\n", "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", @@ -240,22 +282,18 @@ "- **`is_a: GO:0006057`**: \n", " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" - ], - "id": "cca75d881cb8bade" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "87c841de7d80beef", + "metadata": {}, "source": [ - "## uniprot_sprot.dat\n", + "## uniprot_sprot.dat File\n", "\n", - "The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. 
It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", + "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", "\n", "\n", - "## Structure of `uniprot_sprot.dat`\n", - "\n", - "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", - "\n", "### Example of a Protein Entry\n", "\n", "```plaintext\n", @@ -302,6 +340,13 @@ "//\n", "```\n", "\n", + "**File Path**: `data/GO_UniProt/raw/uniprot_sprot.dat`\n", + "\n", + "\n", + "### Structure of `uniprot_sprot.dat`\n", + "\n", + "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", + "\n", "### Breakdown of Attributes\n", "\n", "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. 
Here's a breakdown of the key attributes:\n", @@ -341,107 +386,56 @@ "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", "\n", "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", - "\n" - ], - "id": "87c841de7d80beef" + "\n", + "__Note__: For more details on evidence codes, see section 5.2." + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b7687078-f6b8-4fbf-afa7-dfda89061a5e", + "metadata": {}, "source": [ - "## data.pkl\n", - "\n", - "The `data.pkl` file, generated during the preprocessing stage, contains the processed GO data in a dataframe format. 
Below is an example of how this data is structured:\n", - "\n", + "## data.pkl File\n", "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", - "- **Column 1**: Contains the accession of each Protein data instance.\n", - "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", - "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", - "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ], - "id": "735844f0b2474ad6" + "**Description**: This file is generated by the `prepare_data` method and contains the processed GO data in a dataframe format. It includes protein IDs, data representations (such as sequence strings), and class columns with boolean values." 
+ ] }, { + "cell_type": "code", + "execution_count": 5, + "id": "b4da7e73e251e1d1", "metadata": { "ExecuteTime": { "end_time": "2024-09-30T14:08:33.990378Z", "start_time": "2024-09-30T14:08:33.959459Z" } }, - "cell_type": "code", - "source": "import pandas as pd", - "id": "b4da7e73e251e1d1", "outputs": [], - "execution_count": 3 + "source": [ + "import pandas as pd\n", + "import os" + ] }, { + "cell_type": "code", + "execution_count": 8, + "id": "b66fbb9b720d053c", "metadata": { "ExecuteTime": { "end_time": "2024-09-30T14:10:12.796911Z", "start_time": "2024-09-30T14:10:06.052276Z" } }, - "cell_type": "code", - "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ], - "id": "b66fbb9b720d053c", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Size of the data (rows x columns): (27459, 1050)\n" + "Size of the data (rows x columns): (32933, 1049)\n" ] }, { "data": { - "text/plain": [ - " swiss_id accession \\\n", - "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", - "9 14331_CAEEL P41932,Q21537 \n", - "10 14331_MAIZE P49106 \n", - "13 14332_MAIZE Q01526 \n", - "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", - "\n", - " go_ids \\\n", - "8 [19222] \n", - "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", - "10 [3677, 5634, 10468, 44877] \n", - "13 [3677, 5634, 10468, 44877] \n", - "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", - "\n", - " sequence 41 75 122 \\\n", - "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", - "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", - "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", - "\n", - " 165 209 226 ... 
2000145 2000146 2000147 2000241 2000243 \\\n", - "8 False False False ... False False False False False \n", - "9 False False False ... False False False False False \n", - "10 False False False ... False False False False False \n", - "13 False False False ... False False False False False \n", - "14 False False False ... False False False False False \n", - "\n", - " 2000377 2001020 2001141 2001233 2001234 \n", - "8 False False False False False \n", - "9 False False False False False \n", - "10 False False False False False \n", - "13 False False False False False \n", - "14 False False False False False \n", - "\n", - "[5 rows x 1050 columns]" - ], "text/html": [ "
\n", "