From 44f28d8b4d901bb438648a005a768b3d9c2eddb8 Mon Sep 17 00:00:00 2001 From: Safoine El Khabich <34200873+safoinme@users.noreply.github.com> Date: Tue, 14 Nov 2023 14:20:11 +0100 Subject: [PATCH] add sample_rate to accelerate training --- README.md | 2 +- copier.yml | 8 ++++++-- template/steps/dataset_loader/data_loader.py | 16 ++++++++++++++++ template/steps/training/model_trainer.py | 2 +- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b8a9a56..b27591c 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ The template can be configured using the following parameters: | Deploy to HuggingFace | Whether to deploy to HuggingFace Hub | False | | Deploy to SkyPilot | Whether to deploy to SkyPilot | False | | Dataset | The dataset to use from HuggingFace Datasets | airline_reviews | -| Model | The model to use from HuggingFace Models | roberta-base | +| Model | The model to use from HuggingFace Models | distilbert-base-uncased | | Cloud Provider | The cloud provider to use (AWS or GCP) | aws | | Metric-Based Promotion | Whether to promote models based on metrics | True | | Notifications on Failure | Whether to notify about pipeline failures | True | diff --git a/copier.yml b/copier.yml index e073e48..e87d2a7 100644 --- a/copier.yml +++ b/copier.yml @@ -65,6 +65,10 @@ accelerator: - gpu - cpu default: gpu +sample_rate: + type: bool + help: "Whether to use a sample of the dataset for quick iteration" + default: False deploy_locally: type: bool help: "Whether to deploy locally" @@ -91,8 +95,8 @@ model: choices: - bert-base-uncased - roberta-base - - distilbert-base-cased - default: roberta-base + - distilbert-base-uncased + default: distilbert-base-uncased cloud_of_choice: type: str help: "Whether to use AWS cloud provider or GCP" diff --git a/template/steps/dataset_loader/data_loader.py b/template/steps/dataset_loader/data_loader.py index 9c6cda7..529da76 100644 --- a/template/steps/dataset_loader/data_loader.py +++ 
b/template/steps/dataset_loader/data_loader.py @@ -4,6 +4,9 @@ from datasets import load_dataset, DatasetDict from zenml import step from zenml.logger import get_logger +{%- if sample_rate %} +import numpy as np +{%- endif %} logger = get_logger(__name__) @@ -41,6 +44,19 @@ def data_loader( dataset = dataset.remove_columns(["airline_sentiment_confidence","negativereason_confidence"]) {%- endif %} + {%- if sample_rate %} + # Sample 20% of the data randomly for the demo + def sample_dataset(dataset, sample_rate=0.2): + sampled_dataset = DatasetDict() + for split in dataset.keys(): + split_size = len(dataset[split]) + indices = np.random.choice(split_size, int(split_size * sample_rate), replace=False) + sampled_dataset[split] = dataset[split].select(indices) + return sampled_dataset + + dataset = sample_dataset(dataset) + {%- endif %} + # Log the dataset and sample examples logger.info(dataset) logger.info(f"Sample Example 1 : {dataset['train'][0]['text']} with label {dataset['train'][0]['label']}") diff --git a/template/steps/training/model_trainer.py b/template/steps/training/model_trainer.py index d5b5143..9fcbbea 100644 --- a/template/steps/training/model_trainer.py +++ b/template/steps/training/model_trainer.py @@ -105,7 +105,7 @@ def model_trainer( evaluation_strategy='steps', save_strategy='steps', save_steps=1000, - eval_steps=200, + eval_steps=100, logging_steps=logging_steps, save_total_limit=5, report_to="mlflow",