Add assets and tests #3

Merged: 22 commits, Nov 15, 2023
8 changes: 7 additions & 1 deletion .github/actions/nlp_template_test/action.yml
@@ -69,17 +69,23 @@ runs:
- name: Concatenate requirements
shell: bash
run: |
zenml integration export-requirements -o ./local_checkout/integration-requirements.txt sklearn mlflow s3 kubernetes kubeflow slack evidently
zenml integration export-requirements -o ./local_checkout/integration-requirements.txt mlflow s3 kubernetes kubeflow discord aws huggingface pytorch
cat ./local_checkout/requirements.txt ./local_checkout/test-requirements.txt ./local_checkout/integration-requirements.txt >> ./local_checkout/all-requirements.txt

- name: Install requirements
shell: bash
run: |
pip install -r ./local_checkout/all-requirements.txt
pip install accelerate torchvision

- name: Run pytests
shell: bash
env:
ZENML_STACK_NAME: ${{ inputs.stack-name }}
run: |
pytest ./local_checkout/tests

- name: Clean-up
shell: bash
run: |
rm -rf ./local_checkout
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
@@ -4,7 +4,7 @@ on:
workflow_dispatch:
workflow_call:
push:
branches: ["main", "develop"]
branches: ["main"]
paths-ignore: ["README.md"]
pull_request:
paths-ignore: ["README.md"]
@@ -35,3 +35,5 @@ jobs:
with:
stack-name: ${{ matrix.stack-name }}
python-version: ${{ matrix.python-version }}
ref-zenml: develop
ref-template: ${{ github.ref }}
26 changes: 26 additions & 0 deletions .github/workflows/image-optimizer.yml
@@ -0,0 +1,26 @@
name: Compress Images
on:
pull_request:
# Run Image Actions when JPG, JPEG, PNG or WebP files are added or changed.
# See https://help.github.com/en/actions/automating-your-workflow-with-github-actions/workflow-syntax-for-github-actions#onpushpull_requestpaths for reference.
paths:
- '**.jpg'
- '**.jpeg'
- '**.png'
- '**.webp'
jobs:
build:
# Only run on non-draft PRs within the same repository.
if: github.event.pull_request.head.repo.full_name == github.repository && github.event.pull_request.draft == false
name: calibreapp/image-actions
runs-on: ubuntu-latest
steps:
- name: Checkout Repo
uses: actions/checkout@v3

- name: Compress Images
uses: calibreapp/image-actions@main
with:
# The `GITHUB_TOKEN` is automatically generated by GitHub and scoped only to the repository that is currently running the action. By default, the action can’t update Pull Requests initiated from forked repositories.
# See https://docs.github.com/en/actions/reference/authentication-in-a-workflow and https://help.github.com/en/articles/virtual-environments-for-github-actions#token-permissions
githubToken: ${{ secrets.GITHUB_TOKEN }}
23 changes: 10 additions & 13 deletions README.md
@@ -43,7 +43,7 @@ The template can be configured using the following parameters:
| Deploy to HuggingFace | Whether to deploy to HuggingFace Hub | False |
| Deploy to SkyPilot | Whether to deploy to SkyPilot | False |
| Dataset | The dataset to use from HuggingFace Datasets | airline_reviews |
| Model | The model to use from HuggingFace Models | roberta-base |
| Model | The model to use from HuggingFace Models | distilbert-base-uncased |
| Cloud Provider | The cloud provider to use (AWS or GCP) | aws |
| Metric-Based Promotion | Whether to promote models based on metrics | True |
| Notifications on Failure | Whether to notify about pipeline failures | True |
@@ -66,6 +66,10 @@ For more details, check the `README.md` file in the generated project directory.

This NLP project template includes three main pipelines:

<p align="center">
<img height=500 src="assets/full_template.png">
</p>

### Training Pipeline

The training pipeline is designed to handle the end-to-end process of training an NLP model. It includes steps for data loading, tokenization, model training, and model registration. The pipeline is parameterized to allow for customization of the training process, such as sequence length, batch size, and learning rate.
@@ -113,24 +117,17 @@ The training pipeline is the heart of the NLP project. It is responsible for pre

The training pipeline is configured using the `{{product_name}}_training_pipeline` function, which includes steps for data loading, tokenization, model training, and model registration. The pipeline can be customized with parameters such as `lower_case`, `padding`, `max_seq_length`, and others to tailor the tokenization and training process to your specific NLP use case.
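
As a rough illustration (an editorial sketch, not part of this PR), the following toy pipeline shows how such parameters flow into a ZenML pipeline. Only the `@step`/`@pipeline` decorators and the parameter names come from the template; the step body and pipeline name are placeholders.

```python
from zenml import pipeline, step


@step
def tokenization_step(lower_case: bool, padding: str, max_seq_length: int) -> str:
    # Stand-in for the template's real tokenization step.
    return f"tokenized(lower_case={lower_case}, padding={padding}, max_seq_length={max_seq_length})"


@pipeline
def toy_training_pipeline(
    lower_case: bool = True,
    padding: str = "max_length",
    max_seq_length: int = 128,
):
    tokenization_step(
        lower_case=lower_case, padding=padding, max_seq_length=max_seq_length
    )


if __name__ == "__main__":
    # Override the defaults for a single run, much like run.py does with its
    # run-arguments dictionary.
    toy_training_pipeline(lower_case=False, max_seq_length=256)
```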

### Training Pipeline: Data and Tokenization
### Training Pipeline

[📂 Code folder](template/steps/data_tokenization/)
[📂 Code folder](template/steps/model_training/)
<p align="center">
<img height=500 src="assets/nlp_data_tokenization.png">
<img height=500 src="assets/training_pipeline.png">
</p>

The first stage of the training pipeline involves loading the dataset and preparing it for the model. The `data_loader` step fetches the dataset, which is then passed to the `tokenizer_loader` and `tokenization_step` to convert the raw text data into a format suitable for the NLP model.

Tokenization is a critical step in NLP pipelines, as it converts text into tokens that the model can understand. The tokenizer can be configured to handle case sensitivity, padding strategies, and sequence lengths, ensuring that the input data is consistent and optimized for training.
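
For concreteness (an editorial sketch, not part of this PR), this is roughly what the tokenization boils down to with the HuggingFace `transformers` API the template relies on; the sample sentence and parameter values are illustrative.

```python
from transformers import AutoTokenizer

# Load the tokenizer matching the chosen model (distilbert-base-uncased is the
# template's new default).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert raw text into fixed-length token IDs, mirroring the configurable
# padding strategy and maximum sequence length.
encoded = tokenizer(
    "The flight was delayed, but the crew handled it well.",
    padding="max_length",
    truncation=True,
    max_length=128,
)
print(len(encoded["input_ids"]))  # 128
```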

### Training Pipeline: Model Training

[📂 Code folder](template/steps/model_training/)
<p align="center">
<img height=500 src="assets/nlp_model_training.png">
</p>

Once the data is tokenized, the `model_trainer` step takes over to train the NLP model. This step utilizes the tokenized dataset and the tokenizer itself to fine-tune the model on the specific task, such as sentiment analysis, text classification, or named entity recognition.

The model training step can be configured with parameters like `train_batch_size`, `eval_batch_size`, `num_epochs`, `learning_rate`, and `weight_decay` to control the training process. After training, the model is evaluated, and if it meets the quality criteria, it is registered in the model registry with a unique name.
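
As a point of reference (not part of the diff), these knobs map onto a HuggingFace `TrainingArguments` object along the following lines. The output directory and hyperparameter values are placeholders; the strategy and step settings mirror the ones visible in `model_trainer.py` further down in this PR.

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./trainer_output",     # placeholder path
    per_device_train_batch_size=16,    # train_batch_size
    per_device_eval_batch_size=16,     # eval_batch_size
    num_train_epochs=3,                # num_epochs
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    eval_steps=100,
    save_total_limit=5,
    load_best_model_at_end=True,
    report_to="mlflow",
)
```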
@@ -139,7 +136,7 @@ The model training step can be configured with parameters like `train_batch_size

[📂 Code folder](template/steps/promotion/)
<p align="center">
<img height=500 src="assets/nlp_promotion.png">
<img height=500 src="assets/promote_pipeline.png">
</p>

The promotion pipeline is responsible for promoting the best model to the chosen stage, such as Production or Staging. The pipeline can be configured to promote models based on metric comparison or simply promote the latest model version.
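
In essence, metric-based promotion comes down to a comparison like the sketch below (illustrative only; the actual metric names and model-registry calls live in the promotion steps).

```python
def should_promote(candidate: dict, current: dict, metric: str = "eval_accuracy") -> bool:
    """Promote the candidate only if it beats the currently promoted version."""
    if not current:  # nothing promoted yet: the candidate wins by default
        return True
    return candidate.get(metric, 0.0) > current.get(metric, 0.0)


# Example: the freshly trained model outperforms the production model.
print(should_promote({"eval_accuracy": 0.91}, {"eval_accuracy": 0.88}))  # True
```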
@@ -150,7 +147,7 @@ The `{{product_name}}_promote_pipeline` function orchestrates the promotion proc

[📂 Code folder](template/steps/deployment/)
<p align="center">
<img height=500 src="assets/nlp_deployment.png">
<img height=500 src="assets/deploy_pipeline.png">
</p>

The deployment pipeline handles the deployment of the model to various environments. It can be configured to deploy locally, to HuggingFace Hub, or to SkyPilot, depending on the project's needs.
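
For the local path, the idea is simply to launch the Gradio demo app in a background process and keep its PID around, as the `deploy_locally` step further down in this PR does. A stripped-down sketch with a placeholder command:

```python
import subprocess
from typing import List


def start_gradio_app(command: List[str]) -> int:
    """Start the Gradio app in a separate process and return its PID."""
    process = subprocess.Popen(command)
    return process.pid


# Placeholder invocation; the template supplies the real app path and flags.
pid = start_gradio_app(["python", "gradio/app.py"])
print(f"Gradio app running with PID {pid}")
```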
Binary file added assets/deploy_pipeline.png
Binary file added assets/full_template.png
Binary file added assets/promote_pipeline.png
Binary file added assets/training_pipeline.png
10 changes: 7 additions & 3 deletions copier.yml
@@ -64,7 +64,11 @@ accelerator:
choices:
- gpu
- cpu
default: gpu
default: cpu
sample_rate:
type: bool
help: "Whether to use a sample of the dataset for quick iteration"
default: False
deploy_locally:
type: bool
help: "Whether to deploy locally"
@@ -91,8 +95,8 @@ model:
choices:
- bert-base-uncased
- roberta-base
- distilbert-base-cased
default: roberta-base
- distilbert-base-uncased
default: distilbert-base-uncased
cloud_of_choice:
type: str
help: "Whether to use AWS cloud provider or GCP"
2 changes: 1 addition & 1 deletion template/config.yaml
@@ -26,7 +26,7 @@ settings:
- zenml[server]

extra:
mlflow_model_name: nlp_use_case_model
mlflow_model_name: sentiment_analysis
{%- if target_environment == 'production' %}
target_env: production
{%- else %}
2 changes: 1 addition & 1 deletion template/pipelines/training.py
@@ -87,7 +87,7 @@ def {{product_name}}_training_pipeline(
register_model(
model=model,
tokenizer=tokenizer,
mlflow_model_name="{{product_name}}_model",
mlflow_model_name="sentiment_analysis",
)

notify_on_success(after=["register_model"])
3 changes: 1 addition & 2 deletions template/run.py
@@ -186,7 +186,6 @@ def main(
name=zenml_model_name,
license="{{open_source_license}}",
description="Show case Model Control Plane.",
create_new_model_version=True,
delete_new_version_on_failure=True,
tags=["sentiment_analysis", "huggingface"],
)
@@ -202,7 +201,7 @@
# Execute Promoting Pipeline
if promoting_pipeline:
run_args_promoting = {}
model_config = ModelConfig(name=zenml_model_name)
model_config = ModelConfig(name=zenml_model_name, version=ModelStages.LATEST)
pipeline_args["model_config"] = model_config
pipeline_args[
"run_name"
16 changes: 16 additions & 0 deletions template/steps/dataset_loader/data_loader.py
@@ -4,6 +4,9 @@
from datasets import load_dataset, DatasetDict
from zenml import step
from zenml.logger import get_logger
{%- if sample_rate %}
import numpy as np
{%- endif %}

logger = get_logger(__name__)

@@ -41,6 +44,19 @@ def data_loader(
dataset = dataset.remove_columns(["airline_sentiment_confidence","negativereason_confidence"])
{%- endif %}

{%- if sample_rate %}
# Sample 20% of the data randomly for the demo
def sample_dataset(dataset, sample_rate=0.2):
sampled_dataset = DatasetDict()
for split in dataset.keys():
split_size = len(dataset[split])
indices = np.random.choice(split_size, int(split_size * sample_rate), replace=False)
sampled_dataset[split] = dataset[split].select(indices)
return sampled_dataset

dataset = sample_dataset(dataset)
{%- endif %}

# Log the dataset and sample examples
logger.info(dataset)
logger.info(f"Sample Example 1 : {dataset['train'][0]['text']} with label {dataset['train'][0]['label']}")
1 change: 0 additions & 1 deletion template/steps/deploying/save_model.py
@@ -3,7 +3,6 @@

from zenml import get_step_context, step
from zenml.client import Client
from zenml.enums import ModelStages
from zenml.logger import get_logger

# Initialize logger
@@ -40,7 +40,7 @@ def deploy_locally(
The process ID of the Gradio app.
"""
### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
def start_gradio_app(command: list[str]) -> int:
def start_gradio_app(command: List[str]) -> int:
"""
Start the Gradio app in a separate process.

@@ -20,6 +20,7 @@ def promote_current():
"""

### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
pipeline_extra = get_step_context().pipeline_run.config.extra
logger.info(f"Promoting current model version")
model_config = get_step_context().model_config
model_version = model_config._get_model_version()
2 changes: 1 addition & 1 deletion template/steps/registrer/model_log_register.py
@@ -32,7 +32,7 @@
def register_model(
model: PreTrainedModel,
tokenizer: PreTrainedTokenizerBase,
mlflow_model_name: Optional[str] = "model",
mlflow_model_name: Optional[str] = "sentiment_analysis",
):
"""
Register model to MLFlow.
4 changes: 2 additions & 2 deletions template/steps/training/model_trainer.py
@@ -46,7 +46,7 @@ def model_trainer(
load_best_model_at_end: Optional[bool] = True,
eval_batch_size: Optional[int] = 16,
weight_decay: Optional[float] = 0.01,
mlflow_model_name: Optional[str] = "model",
mlflow_model_name: Optional[str] = "sentiment_analysis",
) -> Tuple[Annotated[PreTrainedModel, "model", ModelArtifactConfig(overwrite=True)], Annotated[PreTrainedTokenizerBase, "tokenizer", ModelArtifactConfig(overwrite=True)]]:
"""
Configure and train a model on the training dataset.
@@ -105,7 +105,7 @@ def model_trainer(
evaluation_strategy='steps',
save_strategy='steps',
save_steps=1000,
eval_steps=200,
eval_steps=100,
logging_steps=logging_steps,
save_total_limit=5,
report_to="mlflow",
6 changes: 3 additions & 3 deletions template/utils/misc.py
@@ -1,12 +1,12 @@
# {% include 'template/license_header' %}

from typing import Dict
from typing import Dict, Tuple, List

import numpy as np
from datasets import load_metric


def compute_metrics(eval_pred: tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
"""Compute the metrics for the model.

Args:
@@ -34,7 +34,7 @@ def compute_metrics(eval_pred: tuple[np.ndarray, np.ndarray]) -> Dict[str, float
}


def find_max_length(dataset: list[str]) -> int:
def find_max_length(dataset: List[str]) -> int:
"""Find the maximum length of the dataset.

The dataset is a list of strings which are the text samples.
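
The switch from built-in generics (`tuple[...]`, `list[...]`) to `typing.Tuple`/`typing.List` keeps these annotations importable on Python 3.8, where subscripting the built-in types raises a `TypeError`. A minimal illustration of the fixed signature (the function body is a plausible stand-in, not the template's exact implementation):

```python
from typing import List


def find_max_length(dataset: List[str]) -> int:
    """Return the length of the longest text sample in the dataset."""
    return len(max(dataset, key=len))


# On Python 3.8, annotating the parameter as `list[str]` instead would fail
# at import time with "TypeError: 'type' object is not subscriptable".
print(find_max_length(["short", "a bit longer sample"]))  # 19
```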