From 5d2a2ee99bafd58a35398997861fb3b46b1573fa Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:24:35 +0200
Subject: [PATCH 01/28] multi GPU with PEFT on LLM

---
 .../configs/default_finetune.yaml             |   1 +
 .../tiny_accelerated_bf16_finetune.yaml       |  56 ++++
 .../configs/tiny_accelerated_finetune.yaml    |  56 ++++
 .../configs/tiny_local_bf16_finetune.yaml     |  55 ++++
 .../configs/tiny_local_finetune.yaml          |  55 ++++
 llm-lora-finetuning/pipelines/__init__.py     |  16 +
 llm-lora-finetuning/pipelines/train.py        |   4 +-
 llm-lora-finetuning/requirements.txt          |   3 +-
 llm-lora-finetuning/scripts/__init__.py       |  16 +
 llm-lora-finetuning/scripts/finetune.py       | 296 ++++++++++++++++++
 llm-lora-finetuning/steps/evaluate_model.py   |  10 +-
 llm-lora-finetuning/steps/finetune.py         | 129 ++++----
 llm-lora-finetuning/steps/prepare_datasets.py |  12 +-
 llm-lora-finetuning/steps/promote.py          |   2 +-
 llm-lora-finetuning/utils/__init__.py         |  16 +
 llm-lora-finetuning/utils/callbacks.py        |  22 +-
 llm-lora-finetuning/utils/loaders.py          |  31 +-
 llm-lora-finetuning/utils/logging.py          |   3 +-
 18 files changed, 697 insertions(+), 86 deletions(-)
 create mode 100644 llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml
 create mode 100644 llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml
 create mode 100644 llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml
 create mode 100644 llm-lora-finetuning/configs/tiny_local_finetune.yaml
 create mode 100644 llm-lora-finetuning/pipelines/__init__.py
 create mode 100644 llm-lora-finetuning/scripts/__init__.py
 create mode 100644 llm-lora-finetuning/scripts/finetune.py
 create mode 100644 llm-lora-finetuning/utils/__init__.py

diff --git a/llm-lora-finetuning/configs/default_finetune.yaml b/llm-lora-finetuning/configs/default_finetune.yaml
index 49174106..83dbc362 100644
--- a/llm-lora-finetuning/configs/default_finetune.yaml
+++ b/llm-lora-finetuning/configs/default_finetune.yaml
@@ -46,6 +46,7 @@ steps:
     parameters:
       max_steps: 300
       eval_steps: 100
+      bf16: False
 
   promote:
     parameters:
diff --git a/llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml b/llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml
new file mode 100644
index 00000000..82a822c6
--- /dev/null
+++ b/llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml
@@ -0,0 +1,56 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model:
+  name: llm-peft-mistralai-Mistral-7B-v0.1
+  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  tags:
+    - llm
+    - peft
+    - mistralai/Mistral-7B-v0.1
+  version: 50_steps_accelerate
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+
+parameters:
+  base_model_id: mistralai/Mistral-7B-v0.1
+  system_prompt: |
+    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    enable_step_logs: False
+    parameters:
+      max_steps: 50
+      eval_steps: 50
+      bf16: True
+      use_accelerate: True
+      
+
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging
diff --git a/llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml b/llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml
new file mode 100644
index 00000000..7669354d
--- /dev/null
+++ b/llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml
@@ -0,0 +1,56 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model:
+  name: llm-peft-mistralai-Mistral-7B-v0.1
+  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  tags:
+    - llm
+    - peft
+    - mistralai/Mistral-7B-v0.1
+  version: 50_steps_accelerate
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+
+parameters:
+  base_model_id: mistralai/Mistral-7B-v0.1
+  system_prompt: |
+    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    enable_step_logs: False
+    parameters:
+      max_steps: 50
+      eval_steps: 50
+      bf16: False
+      use_accelerate: True
+      
+
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging
diff --git a/llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml b/llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml
new file mode 100644
index 00000000..7d25de39
--- /dev/null
+++ b/llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml
@@ -0,0 +1,55 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model:
+  name: llm-peft-mistralai-Mistral-7B-v0.1
+  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  tags:
+    - llm
+    - peft
+    - mistralai/Mistral-7B-v0.1
+  version: 50_steps
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+
+parameters:
+  base_model_id: mistralai/Mistral-7B-v0.1
+  system_prompt: |
+    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    enable_step_logs: False
+    parameters:
+      max_steps: 50
+      eval_steps: 50
+      bf16: True
+      
+
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging
diff --git a/llm-lora-finetuning/configs/tiny_local_finetune.yaml b/llm-lora-finetuning/configs/tiny_local_finetune.yaml
new file mode 100644
index 00000000..dcfc4898
--- /dev/null
+++ b/llm-lora-finetuning/configs/tiny_local_finetune.yaml
@@ -0,0 +1,55 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model:
+  name: llm-peft-mistralai-Mistral-7B-v0.1
+  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  tags:
+    - llm
+    - peft
+    - mistralai/Mistral-7B-v0.1
+  version: 50_steps
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+
+parameters:
+  base_model_id: mistralai/Mistral-7B-v0.1
+  system_prompt: |
+    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    enable_step_logs: False
+    parameters:
+      max_steps: 50
+      eval_steps: 50
+      bf16: False
+      
+
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging
diff --git a/llm-lora-finetuning/pipelines/__init__.py b/llm-lora-finetuning/pipelines/__init__.py
new file mode 100644
index 00000000..757bd841
--- /dev/null
+++ b/llm-lora-finetuning/pipelines/__init__.py
@@ -0,0 +1,16 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index 3e339972..3f9c6508 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -26,9 +26,9 @@
 
 
 @pipeline
-def llm_peft_full_finetune(system_prompt:str, base_model_id:str):
+def llm_peft_full_finetune(system_prompt: str, base_model_id: str):
     """Pipeline for finetuning an LLM with peft.
-    
+
     It will run the following steps:
 
     - configure: set the system prompt and base model id
diff --git a/llm-lora-finetuning/requirements.txt b/llm-lora-finetuning/requirements.txt
index efd793bb..3d884b69 100644
--- a/llm-lora-finetuning/requirements.txt
+++ b/llm-lora-finetuning/requirements.txt
@@ -3,8 +3,9 @@ torch>=2.2.0
 datasets
 transformers
 peft
-bitsandbytes==0.41.0
+bitsandbytes==0.41.3
 scipy
 evaluate
 rouge_score
 nltk
+accelerate==0.29.2
\ No newline at end of file
diff --git a/llm-lora-finetuning/scripts/__init__.py b/llm-lora-finetuning/scripts/__init__.py
new file mode 100644
index 00000000..757bd841
--- /dev/null
+++ b/llm-lora-finetuning/scripts/__init__.py
@@ -0,0 +1,16 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/llm-lora-finetuning/scripts/finetune.py b/llm-lora-finetuning/scripts/finetune.py
new file mode 100644
index 00000000..25e3545a
--- /dev/null
+++ b/llm-lora-finetuning/scripts/finetune.py
@@ -0,0 +1,296 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pathlib import Path
+from typing import List
+
+import click
+import transformers
+from datasets import load_from_disk
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@click.command(
+    help="Technical wrapper to pass into the `accelerate launch` command."
+)
+@click.option(
+    "--base-model-id",
+    type=str,
+    help="The base model id to use.",
+)
+@click.option(
+    "--dataset-dir",
+    type=str,
+    help="The path to the dataset directory.",
+)
+@click.option(
+    "--max-steps",
+    type=int,
+    default=100,
+    help="The maximum number of steps to train for.",
+)
+@click.option(
+    "--logging-steps",
+    type=int,
+    default=50,
+    help="The number of steps to log at.",
+)
+@click.option(
+    "--eval-steps",
+    type=int,
+    default=50,
+    help="The number of steps to evaluate at.",
+)
+@click.option(
+    "--save-steps",
+    type=int,
+    default=50,
+    help="The number of steps to save at.",
+)
+@click.option(
+    "--optimizer",
+    type=str,
+    default="paged_adamw_8bit",
+    help="The optimizer to use.",
+)
+@click.option(
+    "--lr",
+    type=float,
+    default=2.5e-5,
+    help="The learning rate to use.",
+)
+@click.option(
+    "--per-device-train-batch-size",
+    type=int,
+    default=2,
+    help="The batch size to use for training.",
+)
+@click.option(
+    "--gradient-accumulation-steps",
+    type=int,
+    default=4,
+    help="The number of gradient accumulation steps.",
+)
+@click.option(
+    "--warmup-steps",
+    type=int,
+    default=5,
+    help="The number of warmup steps.",
+)
+@click.option(
+    "--bf16",
+    is_flag=True,
+    default=False,
+    help="Use bf16 for training.",
+)
+@click.option(
+    "--use-accelerate",
+    is_flag=True,
+    default=False,
+    help="Use accelerate for training.",
+)
+@click.option(
+    "--label-names",
+    "-l",
+    help="The label names to use.",
+    type=str,
+    required=False,
+    multiple=True,
+)
+@click.option(
+    "--ft-model-dir",
+    type=str,
+    default="",
+    help="The path to the finetuned model directory.",
+)
+def cli_wrapper(
+    base_model_id: str,
+    dataset_dir: str,
+    max_steps: int = 100,
+    logging_steps: int = 50,
+    eval_steps: int = 50,
+    save_steps: int = 50,
+    optimizer: str = "paged_adamw_8bit",
+    lr: float = 2.5e-5,
+    per_device_train_batch_size: int = 2,
+    gradient_accumulation_steps: int = 4,
+    warmup_steps: int = 5,
+    bf16: bool = False,
+    use_accelerate: bool = False,
+    label_names: List[str] = None,
+    ft_model_dir: str = "",
+) -> Path:
+    """Forward the parsed CLI options to `accelerated_finetune`.
+
+    Entry point for `accelerate launch`: the path options arrive as
+    strings and are converted to `Path`; an empty `--ft-model-dir` is
+    forwarded as None.
+    """
+    return accelerated_finetune(
+        base_model_id=base_model_id,
+        dataset_dir=Path(dataset_dir),
+        max_steps=max_steps,
+        logging_steps=logging_steps,
+        eval_steps=eval_steps,
+        save_steps=save_steps,
+        optimizer=optimizer,
+        lr=lr,
+        per_device_train_batch_size=per_device_train_batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        warmup_steps=warmup_steps,
+        bf16=bf16,
+        use_accelerate=use_accelerate,
+        label_names=list(label_names),
+        ft_model_dir=Path(ft_model_dir) if ft_model_dir else None,
+    )
+
+
+def accelerated_finetune(
+    base_model_id: str,
+    dataset_dir: Path,
+    max_steps: int = 100,
+    logging_steps: int = 50,
+    eval_steps: int = 50,
+    save_steps: int = 50,
+    optimizer: str = "paged_adamw_8bit",
+    lr: float = 2.5e-5,
+    per_device_train_batch_size: int = 2,
+    gradient_accumulation_steps: int = 4,
+    warmup_steps: int = 5,
+    bf16: bool = True,
+    use_accelerate: bool = False,
+    label_names: List[str] = None,
+    ft_model_dir: Path = None,
+) -> Path:
+    """Finetune the model using PEFT.
+
+    It can be run with accelerate or without.
+
+    Args:
+        base_model_id: The base model id to use.
+        dataset_dir: The path to the dataset directory.
+        max_steps: The maximum number of steps to train for.
+        logging_steps: The number of steps to log at.
+        eval_steps: The number of steps to evaluate at.
+        save_steps: The number of steps to save at.
+        optimizer: The optimizer to use.
+        lr: The learning rate to use.
+        per_device_train_batch_size: The batch size to use for training.
+        gradient_accumulation_steps: The number of gradient accumulation steps.
+        warmup_steps: The number of warmup steps.
+        bf16: Whether to use bf16.
+        use_accelerate: Whether to use accelerate.
+        label_names: The label names to use.
+        ft_model_dir: The path to the finetuned model directory.
+
+    Returns:
+        The path to the finetuned model directory.
+    """
+    import sys
+
+    # hack to make internal modules visible in the script
+    sys.path.append("..")
+    sys.path.append(".")
+
+    from accelerate import Accelerator
+    from utils.callbacks import ZenMLCallback
+    from utils.loaders import load_base_model
+    from utils.tokenizer import load_tokenizer
+
+    # Bind `accelerator` on both paths; ZenMLCallback receives it below.
+    accelerator = Accelerator() if use_accelerate else None
+    should_print = True
+    if accelerator is not None:
+        should_print = accelerator.is_main_process
+
+    project = "zenml-finetune"
+    base_model_name = "mistral"
+    run_name = base_model_name + "-" + project
+    output_dir = "./" + run_name
+
+    if should_print:
+        logger.info("Loading datasets...")
+    tokenizer = load_tokenizer(base_model_id)
+    tokenized_train_dataset = load_from_disk(dataset_dir / "train")
+    tokenized_val_dataset = load_from_disk(dataset_dir / "val")
+
+    if should_print:
+        logger.info("Loading base model...")
+
+    model = load_base_model(
+        base_model_id,
+        use_accelerate=use_accelerate,
+        should_print=should_print,
+    )
+
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=tokenized_train_dataset,
+        eval_dataset=tokenized_val_dataset,
+        args=transformers.TrainingArguments(
+            output_dir=output_dir,
+            warmup_steps=warmup_steps,
+            per_device_train_batch_size=per_device_train_batch_size,
+            gradient_checkpointing=(not use_accelerate),
+            gradient_accumulation_steps=gradient_accumulation_steps,
+            max_steps=max_steps,
+            learning_rate=lr,
+            logging_steps=logging_steps,
+            bf16=bf16,
+            optim=optimizer,
+            logging_dir="./logs",
+            save_strategy="steps",
+            save_steps=save_steps,
+            evaluation_strategy="steps",
+            eval_steps=eval_steps,
+            do_eval=True,
+            label_names=label_names,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(
+            tokenizer, mlm=False
+        ),
+        callbacks=[ZenMLCallback(accelerator=accelerator)],
+    )
+    if not use_accelerate:
+        model.config.use_cache = (
+            False  # silence the warnings. Please re-enable for inference!
+        )
+
+    if should_print:
+        logger.info("Training model...")
+    trainer.train()
+
+    if should_print:
+        logger.info("Saving model...")
+    if not use_accelerate:
+        model.config.use_cache = True
+    else:
+        model = accelerator.unwrap_model(model)
+
+    if ft_model_dir is None:
+        ft_model_dir = Path("model_dir")
+    if not use_accelerate or accelerator.is_main_process:
+        ft_model_dir.mkdir(parents=True, exist_ok=True)
+        trainer.save_model(ft_model_dir)
+
+    return ft_model_dir
+
+
+if __name__ == "__main__":
+    cli_wrapper()
diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 36bc726b..13c49cbb 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -25,7 +25,7 @@
     load_base_model,
     load_pretrained_model,
 )
-from utils.tokenizer import tokenize_for_eval, load_tokenizer
+from utils.tokenizer import load_tokenizer, tokenize_for_eval
 from zenml import log_model_metadata, save_artifact, step
 from zenml.logger import get_logger
 
@@ -55,7 +55,9 @@ def evaluate_model(
     test_dataset = load_from_disk(datasets_dir / "test_raw")
     test_dataset = test_dataset[:50]
     ground_truths = test_dataset["meaning_representation"]
-    tokenized_train_dataset = tokenize_for_eval(test_dataset, tokenizer, system_prompt)
+    tokenized_train_dataset = tokenize_for_eval(
+        test_dataset, tokenizer, system_prompt
+    )
 
     if ft_model_dir is None:
         logger.info("Generating using base model...")
@@ -80,7 +82,9 @@ def evaluate_model(
     logger.info("Computing ROUGE metrics...")
     prefix = "base_model_" if ft_model_dir is None else "finetuned_model_"
     rouge = evaluate.load("rouge")
-    rouge_metrics = rouge.compute(predictions=predictions, references=ground_truths)
+    rouge_metrics = rouge.compute(
+        predictions=predictions, references=ground_truths
+    )
     metadata = {prefix + k: float(v) for k, v in rouge_metrics.items()}
 
     log_model_metadata(metadata)
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 3b60b7f8..ceecff2f 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -15,20 +15,19 @@
 # limitations under the License.
 #
 
+import subprocess
 from pathlib import Path
 
-import transformers
-from datasets import load_from_disk
+import torch
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
-from utils.callbacks import ZenMLCallback
-from utils.loaders import load_base_model
-from utils.tokenizer import load_tokenizer
 from zenml import logging as zenml_logging
 from zenml import step
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 
+from scripts.finetune import accelerated_finetune
+
 logger = get_logger(__name__)
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
     10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
@@ -48,6 +47,8 @@ def finetune(
     per_device_train_batch_size: int = 2,
     gradient_accumulation_steps: int = 4,
     warmup_steps: int = 5,
+    bf16: bool = True,
+    use_accelerate: bool = False,
 ) -> Annotated[Path, "ft_model_dir"]:
     """Finetune the model using PEFT.
 
@@ -68,62 +69,70 @@ def finetune(
         per_device_train_batch_size: The batch size to use for training.
         gradient_accumulation_steps: The number of gradient accumulation steps.
         warmup_steps: The number of warmup steps.
+        bf16: Whether to use bf16.
+        use_accelerate: Whether to use accelerate.
 
     Returns:
         The path to the finetuned model directory.
     """
-    project = "zenml-finetune"
-    base_model_name = "mistral"
-    run_name = base_model_name + "-" + project
-    output_dir = "./" + run_name
-
-    logger.info("Loading datasets...")
-    tokenizer = load_tokenizer(base_model_id)
-    tokenized_train_dataset = load_from_disk(dataset_dir / "train")
-    tokenized_val_dataset = load_from_disk(dataset_dir / "val")
-
-    logger.info("Loading base model...")
-    model = load_base_model(base_model_id)
-
-    trainer = transformers.Trainer(
-        model=model,
-        train_dataset=tokenized_train_dataset,
-        eval_dataset=tokenized_val_dataset,
-        args=transformers.TrainingArguments(
-            output_dir=output_dir,
-            warmup_steps=warmup_steps,
-            per_device_train_batch_size=per_device_train_batch_size,
-            gradient_checkpointing=True,
-            gradient_accumulation_steps=gradient_accumulation_steps,
-            max_steps=max_steps,
-            learning_rate=lr,
-            logging_steps=logging_steps,
-            bf16=True,
-            optim=optimizer,
-            logging_dir="./logs",
-            save_strategy="steps",
-            save_steps=save_steps,
-            evaluation_strategy="steps",
-            eval_steps=eval_steps,
-            do_eval=True,
-        ),
-        data_collator=transformers.DataCollatorForLanguageModeling(
-            tokenizer, mlm=False
-        ),
-        callbacks=[ZenMLCallback()],
-    )
-
-    model.config.use_cache = (
-        False  # silence the warnings. Please re-enable for inference!
-    )
-
-    logger.info("Training model...")
-    trainer.train()
-
-    logger.info("Saving model...")
-    model.config.use_cache = True
-    ft_model_dir = Path("model_dir")
-    ft_model_dir.mkdir(parents=True, exist_ok=True)
-    trainer.save_model(ft_model_dir)
-
-    return ft_model_dir
+    if not use_accelerate:
+        # Run in-process and return the Path itself (not a 1-tuple), matching
+        # the step's `Annotated[Path, "ft_model_dir"]` output.
+        return accelerated_finetune(
+            base_model_id=base_model_id,
+            dataset_dir=dataset_dir,
+            max_steps=max_steps,
+            logging_steps=logging_steps,
+            eval_steps=eval_steps,
+            save_steps=save_steps,
+            optimizer=optimizer,
+            lr=lr,
+            per_device_train_batch_size=per_device_train_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
+            warmup_steps=warmup_steps,
+            bf16=bf16,
+            use_accelerate=False,
+        )
+    else:
+        logger.info("Starting accelerate training job...")
+        ft_model_dir = "model_dir"
+        command = f"accelerate launch --num_processes {torch.cuda.device_count()} "
+        command += str(Path("scripts/finetune.py").absolute()) + " "
+        command += f'--base-model-id "{base_model_id}" '
+        command += f'--dataset-dir "{dataset_dir}" '
+        command += f"--max-steps {max_steps} "
+        command += f"--logging-steps {logging_steps} "
+        command += f"--eval-steps {eval_steps} "
+        command += f"--save-steps {save_steps} "
+        command += f"--optimizer {optimizer} "
+        command += f"--lr {lr} "
+        command += f"--per-device-train-batch-size {per_device_train_batch_size} "
+        command += f"--gradient-accumulation-steps {gradient_accumulation_steps} "
+        command += f"--warmup-steps {warmup_steps} "
+        if bf16:
+            command += f"--bf16 "
+        if use_accelerate:
+            command += f"--use-accelerate "
+            command += f"-l input_ids "
+            command += f'--ft-model-dir "{ft_model_dir}" '
+
+        print(command)
+
+        # Popen, not run: the handle's stdout can be streamed line by line.
+        result = subprocess.Popen(
+            command,
+            shell=True,
+            stdout=subprocess.PIPE,
+            universal_newlines=True,
+        )
+        for stdout_line in result.stdout:
+            print(stdout_line, end="")
+        result.stdout.close()
+        return_code = result.wait()
+        if return_code != 0:
+            logger.error(
+                f"Accelerate training job failed. With return code {return_code}."
+            )
+            raise subprocess.CalledProcessError(return_code, command)
+        logger.info("Accelerate training job finished.")
+        return Path(ft_model_dir)
diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index 06cb0a9d..dc2c2a2f 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -21,7 +21,7 @@
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
 from utils.tokenizer import generate_and_tokenize_prompt, load_tokenizer
-from zenml import step, log_model_metadata
+from zenml import log_model_metadata, step
 from zenml.materializers import BuiltInMaterializer
 
 
@@ -43,10 +43,12 @@ def prepare_data(
     """
     from datasets import load_dataset
 
-    log_model_metadata({
-        "system_prompt": system_prompt,
-        "base_model_id": base_model_id
-    })
+    log_model_metadata(
+        {
+            "system_prompt": system_prompt,
+            "base_model_id": base_model_id,
+        }
+    )
 
     tokenizer = load_tokenizer(base_model_id, False)
     gen_and_tokenize = partial(
diff --git a/llm-lora-finetuning/steps/promote.py b/llm-lora-finetuning/steps/promote.py
index 53549d43..eca870d6 100644
--- a/llm-lora-finetuning/steps/promote.py
+++ b/llm-lora-finetuning/steps/promote.py
@@ -22,7 +22,7 @@
 logger = get_logger(__name__)
 
 
-@step
+@step(enable_cache=False)
 def promote(
     metric: str = "rouge1",
     target_stage: str = "staging",
diff --git a/llm-lora-finetuning/utils/__init__.py b/llm-lora-finetuning/utils/__init__.py
new file mode 100644
index 00000000..757bd841
--- /dev/null
+++ b/llm-lora-finetuning/utils/__init__.py
@@ -0,0 +1,16 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/llm-lora-finetuning/utils/callbacks.py b/llm-lora-finetuning/utils/callbacks.py
index 988016a2..bd2a1d13 100644
--- a/llm-lora-finetuning/utils/callbacks.py
+++ b/llm-lora-finetuning/utils/callbacks.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 
-from typing import Dict
+from typing import TYPE_CHECKING, Dict
 
 from transformers import (
     TrainerCallback,
@@ -25,9 +25,16 @@
 )
 from zenml import get_step_context
 
+if TYPE_CHECKING:
+    from accelerate import Accelerator
+
 
 class ZenMLCallback(TrainerCallback):
     """Callback that logs metrics to ZenML."""
+
+    def __init__(self, accelerator: "Accelerator"):
+        self.accelerator = accelerator
+
     def on_evaluate(
         self,
         args: TrainingArguments,
@@ -45,12 +52,13 @@ def on_evaluate(
             metrics: The metrics to log.
         """
         try:
-            context = get_step_context()
-            context.model.log_metadata(
-                {
-                    f"step_{state.global_step}_eval_metrics": metrics,
-                }
-            )
+            if self.accelerator is None or self.accelerator.is_main_process:
+                context = get_step_context()
+                context.model.log_metadata(
+                    {
+                        f"step_{state.global_step}_eval_metrics": metrics,
+                    }
+                )
         except RuntimeError:
             # If we can't get the context, silently pass
             return
diff --git a/llm-lora-finetuning/utils/loaders.py b/llm-lora-finetuning/utils/loaders.py
index 9a5b6a37..fc03afbb 100644
--- a/llm-lora-finetuning/utils/loaders.py
+++ b/llm-lora-finetuning/utils/loaders.py
@@ -16,18 +16,22 @@
 #
 
 from pathlib import Path
-from typing import Any
+from typing import Any, Tuple, Union
 
 import torch
+from datasets import Dataset
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM
+
 from utils.logging import print_trainable_parameters
 
 
 def load_base_model(
     base_model_id: str,
     is_training: bool = True,
-) -> Any:
+    use_accelerate: bool = False,
+    should_print: bool = True,
+) -> Union[Any, Tuple[Any, Dataset, Dataset]]:
     """Load the base model.
 
     Args:
@@ -39,15 +43,25 @@ def load_base_model(
     Returns:
         The base model.
     """
+    from accelerate import Accelerator
+    from transformers import BitsAndBytesConfig
+
+    if use_accelerate:
+        accelerator = Accelerator()
+        device_map = {"": accelerator.process_index}
+    else:
+        device_map = {"": torch.cuda.current_device()}
+
     bnb_config = BitsAndBytesConfig(
-        load_in_8bit=True,
+        # load_in_8bit=True,
+        load_in_4bit=True,
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
         bnb_4bit_compute_dtype=torch.bfloat16,
     )
 
     model = AutoModelForCausalLM.from_pretrained(
-        base_model_id, quantization_config=bnb_config, device_map="auto"
+        base_model_id, quantization_config=bnb_config, device_map=device_map
     )
 
     if is_training:
@@ -73,7 +87,10 @@ def load_base_model(
         )
 
         model = get_peft_model(model, config)
-        print_trainable_parameters(model)
+        if should_print:
+            print_trainable_parameters(model)
+        if use_accelerate:
+            model = accelerator.prepare_model(model)
 
     return model
 
@@ -87,6 +104,8 @@ def load_pretrained_model(ft_model_dir: Path) -> AutoModelForCausalLM:
     Returns:
         The finetuned model.
     """
+    from transformers import BitsAndBytesConfig
+
     bnb_config = BitsAndBytesConfig(
         load_in_8bit=True,
         bnb_4bit_use_double_quant=True,
diff --git a/llm-lora-finetuning/utils/logging.py b/llm-lora-finetuning/utils/logging.py
index 6b178b6a..0fd2df78 100644
--- a/llm-lora-finetuning/utils/logging.py
+++ b/llm-lora-finetuning/utils/logging.py
@@ -16,12 +16,13 @@
 #
 
 from typing import Any
+
 from zenml.logger import get_logger
 
 logger = get_logger(__name__)
 
 
-def print_trainable_parameters(model:Any):
+def print_trainable_parameters(model: Any):
     """Prints the number of trainable parameters in the model."""
     trainable_params = 0
     all_param = 0

From 3e9776ae37fbc6886e2de39224a2b046226dbd12 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:28:15 +0200
Subject: [PATCH 02/28] eof: add missing newline at end of requirements.txt

---
 llm-lora-finetuning/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm-lora-finetuning/requirements.txt b/llm-lora-finetuning/requirements.txt
index 3d884b69..2276008b 100644
--- a/llm-lora-finetuning/requirements.txt
+++ b/llm-lora-finetuning/requirements.txt
@@ -8,4 +8,4 @@ scipy
 evaluate
 rouge_score
 nltk
-accelerate==0.29.2
\ No newline at end of file
+accelerate==0.29.2

From d77afdc8766cc9c4e48285c6270853048ba56c3d Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:46:15 +0200
Subject: [PATCH 03/28] fixes for subprocess

---
 llm-lora-finetuning/steps/finetune.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index ceecff2f..7825d7cf 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -126,13 +126,11 @@ def finetune(
         )
         for stdout_line in result.stdout:
             print(stdout_line, end="")
-        result.stdout.close()
-        return_code = result.wait()
-        if return_code == 0:
+        if result.returncode == 0:
             logger.info("Accelerate training job finished.")
             return Path(ft_model_dir)
         else:
             logger.error(
-                f"Accelerate training job failed. With return code {return_code}."
+                f"Accelerate training job failed. With return code {result.returncode}."
             )
-            raise subprocess.CalledProcessError(return_code, command)
+            raise subprocess.CalledProcessError(result.returncode, command)

From 7cc1f01e540140fed55dc58f8e6c9d63beb22e1a Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:54:39 +0200
Subject: [PATCH 04/28] callback patch: set accelerator = None in else branch of accelerated_finetune

---
 llm-lora-finetuning/scripts/finetune.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llm-lora-finetuning/scripts/finetune.py b/llm-lora-finetuning/scripts/finetune.py
index 25e3545a..755c34da 100644
--- a/llm-lora-finetuning/scripts/finetune.py
+++ b/llm-lora-finetuning/scripts/finetune.py
@@ -217,6 +217,7 @@ def accelerated_finetune(
         accelerator = Accelerator()
         should_print = accelerator.is_main_process
     else:
+        accelerator = None
         should_print = True
 
     project = "zenml-finetune"

From 377ec0fae0aee363f6bf764843eff82d97b7ea34 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 17 Apr 2024 17:13:05 +0200
Subject: [PATCH 05/28] tidy up: return accelerated_finetune result directly instead of a 1-tuple

---
 llm-lora-finetuning/steps/finetune.py | 31 +++++++++++++--------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 7825d7cf..0ac9a529 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -76,23 +76,22 @@ def finetune(
         The path to the finetuned model directory.
     """
     if not use_accelerate:
-        return (
-            accelerated_finetune(
-                base_model_id=base_model_id,
-                dataset_dir=dataset_dir,
-                max_steps=max_steps,
-                logging_steps=logging_steps,
-                eval_steps=eval_steps,
-                save_steps=save_steps,
-                optimizer=optimizer,
-                lr=lr,
-                per_device_train_batch_size=per_device_train_batch_size,
-                gradient_accumulation_steps=gradient_accumulation_steps,
-                warmup_steps=warmup_steps,
-                bf16=bf16,
-                use_accelerate=False,
-            ),
+        return accelerated_finetune(
+            base_model_id=base_model_id,
+            dataset_dir=dataset_dir,
+            max_steps=max_steps,
+            logging_steps=logging_steps,
+            eval_steps=eval_steps,
+            save_steps=save_steps,
+            optimizer=optimizer,
+            lr=lr,
+            per_device_train_batch_size=per_device_train_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
+            warmup_steps=warmup_steps,
+            bf16=bf16,
+            use_accelerate=False,
         )
+
     else:
         logger.info("Starting accelerate training job...")
         ft_model_dir = "model_dir"

From 8856e5ced3dd18037556cda057471820fd893aea Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 10:51:27 +0200
Subject: [PATCH 06/28] new iteration

---
 .../configs/default_finetune.yaml             |  3 +-
 ...aml => phi_accelerated_bf16_finetune.yaml} | 15 ++---
 ...une.yaml => phi_accelerated_finetune.yaml} | 17 +++---
 ...tune.yaml => phi_local_bf16_finetune.yaml} | 15 ++---
 ..._finetune.yaml => phi_local_finetune.yaml} | 17 +++---
 .../configs/remote_finetune.yaml              |  2 +-
 llm-lora-finetuning/pipelines/train.py        | 24 +++++++-
 llm-lora-finetuning/scripts/finetune.py       | 60 +++++++++++++++----
 llm-lora-finetuning/steps/evaluate_model.py   | 28 +++++++--
 llm-lora-finetuning/steps/finetune.py         | 23 ++++++-
 llm-lora-finetuning/steps/prepare_datasets.py |  7 ++-
 llm-lora-finetuning/steps/promote.py          |  2 +
 llm-lora-finetuning/utils/cuda.py             |  4 ++
 llm-lora-finetuning/utils/hashing.py          | 15 +++++
 llm-lora-finetuning/utils/loaders.py          | 21 +++++--
 llm-lora-finetuning/utils/tokenizer.py        | 12 ++--
 16 files changed, 205 insertions(+), 60 deletions(-)
 rename llm-lora-finetuning/configs/{tiny_accelerated_bf16_finetune.yaml => phi_accelerated_bf16_finetune.yaml} (87%)
 rename llm-lora-finetuning/configs/{tiny_accelerated_finetune.yaml => phi_accelerated_finetune.yaml} (86%)
 rename llm-lora-finetuning/configs/{tiny_local_bf16_finetune.yaml => phi_local_bf16_finetune.yaml} (87%)
 rename llm-lora-finetuning/configs/{tiny_local_finetune.yaml => phi_local_finetune.yaml} (86%)
 create mode 100644 llm-lora-finetuning/utils/hashing.py

diff --git a/llm-lora-finetuning/configs/default_finetune.yaml b/llm-lora-finetuning/configs/default_finetune.yaml
index 83dbc362..3bfd4640 100644
--- a/llm-lora-finetuning/configs/default_finetune.yaml
+++ b/llm-lora-finetuning/configs/default_finetune.yaml
@@ -35,7 +35,8 @@ parameters:
     Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
     This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
     The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-
+  load_in_8bit: True
+  
 steps:
   prepare_data:
     parameters:
diff --git a/llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_bf16_finetune.yaml
similarity index 87%
rename from llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml
rename to llm-lora-finetuning/configs/phi_accelerated_bf16_finetune.yaml
index 82a822c6..c3be19e4 100644
--- a/llm-lora-finetuning/configs/tiny_accelerated_bf16_finetune.yaml
+++ b/llm-lora-finetuning/configs/phi_accelerated_bf16_finetune.yaml
@@ -16,13 +16,13 @@
 #
 
 model:
-  name: llm-peft-mistralai-Mistral-7B-v0.1
-  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
   tags:
     - llm
     - peft
-    - mistralai/Mistral-7B-v0.1
-  version: 50_steps_accelerate
+    - microsoft/phi-2
+  version: 200_steps_accelerate
 
 settings:
   docker:
@@ -30,11 +30,13 @@ settings:
     requirements: requirements.txt
 
 parameters:
-  base_model_id: mistralai/Mistral-7B-v0.1
+  base_model_id: microsoft/phi-2
   system_prompt: |
     Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
     This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
     The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+  use_fast: False
+  load_in_4bit: True
 
 steps:
   prepare_data:
@@ -42,9 +44,8 @@ steps:
       dataset_name: gem/viggo
 
   finetune:
-    enable_step_logs: False
     parameters:
-      max_steps: 50
+      max_steps: 200
       eval_steps: 50
       bf16: True
       use_accelerate: True
diff --git a/llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_finetune.yaml
similarity index 86%
rename from llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml
rename to llm-lora-finetuning/configs/phi_accelerated_finetune.yaml
index 7669354d..b78d2d19 100644
--- a/llm-lora-finetuning/configs/tiny_accelerated_finetune.yaml
+++ b/llm-lora-finetuning/configs/phi_accelerated_finetune.yaml
@@ -16,13 +16,13 @@
 #
 
 model:
-  name: llm-peft-mistralai-Mistral-7B-v0.1
-  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
   tags:
     - llm
     - peft
-    - mistralai/Mistral-7B-v0.1
-  version: 50_steps_accelerate
+    - microsoft/phi-2
+  version: 25_steps_accelerate
 
 settings:
   docker:
@@ -30,11 +30,13 @@ settings:
     requirements: requirements.txt
 
 parameters:
-  base_model_id: mistralai/Mistral-7B-v0.1
+  base_model_id: microsoft/phi-2
   system_prompt: |
     Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
     This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
     The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+  use_fast: False
+  load_in_4bit: True
 
 steps:
   prepare_data:
@@ -42,10 +44,9 @@ steps:
       dataset_name: gem/viggo
 
   finetune:
-    enable_step_logs: False
     parameters:
-      max_steps: 50
-      eval_steps: 50
+      max_steps: 25
+      eval_steps: 25
       bf16: False
       use_accelerate: True
       
diff --git a/llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml b/llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml
similarity index 87%
rename from llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml
rename to llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml
index 7d25de39..982b17e3 100644
--- a/llm-lora-finetuning/configs/tiny_local_bf16_finetune.yaml
+++ b/llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml
@@ -16,13 +16,13 @@
 #
 
 model:
-  name: llm-peft-mistralai-Mistral-7B-v0.1
-  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
   tags:
     - llm
     - peft
-    - mistralai/Mistral-7B-v0.1
-  version: 50_steps
+    - microsoft/phi-2
+  version: 200_steps
 
 settings:
   docker:
@@ -30,11 +30,13 @@ settings:
     requirements: requirements.txt
 
 parameters:
-  base_model_id: mistralai/Mistral-7B-v0.1
+  base_model_id: microsoft/phi-2
   system_prompt: |
     Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
     This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
     The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+  use_fast: False
+  load_in_4bit: True
 
 steps:
   prepare_data:
@@ -42,9 +44,8 @@ steps:
       dataset_name: gem/viggo
 
   finetune:
-    enable_step_logs: False
     parameters:
-      max_steps: 50
+      max_steps: 200
       eval_steps: 50
       bf16: True
       
diff --git a/llm-lora-finetuning/configs/tiny_local_finetune.yaml b/llm-lora-finetuning/configs/phi_local_finetune.yaml
similarity index 86%
rename from llm-lora-finetuning/configs/tiny_local_finetune.yaml
rename to llm-lora-finetuning/configs/phi_local_finetune.yaml
index dcfc4898..d7f87c0e 100644
--- a/llm-lora-finetuning/configs/tiny_local_finetune.yaml
+++ b/llm-lora-finetuning/configs/phi_local_finetune.yaml
@@ -16,13 +16,13 @@
 #
 
 model:
-  name: llm-peft-mistralai-Mistral-7B-v0.1
-  description: "Fine-tune `mistralai/Mistral-7B-v0.1`."
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
   tags:
     - llm
     - peft
-    - mistralai/Mistral-7B-v0.1
-  version: 50_steps
+    - microsoft/phi-2
+  version: 25_steps
 
 settings:
   docker:
@@ -30,11 +30,13 @@ settings:
     requirements: requirements.txt
 
 parameters:
-  base_model_id: mistralai/Mistral-7B-v0.1
+  base_model_id: microsoft/phi-2
   system_prompt: |
     Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
     This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
     The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+  use_fast: False
+  load_in_4bit: True
 
 steps:
   prepare_data:
@@ -42,10 +44,9 @@ steps:
       dataset_name: gem/viggo
 
   finetune:
-    enable_step_logs: False
     parameters:
-      max_steps: 50
-      eval_steps: 50
+      max_steps: 25
+      eval_steps: 25
       bf16: False
       
 
diff --git a/llm-lora-finetuning/configs/remote_finetune.yaml b/llm-lora-finetuning/configs/remote_finetune.yaml
index 2039ac06..9b23fee9 100644
--- a/llm-lora-finetuning/configs/remote_finetune.yaml
+++ b/llm-lora-finetuning/configs/remote_finetune.yaml
@@ -35,7 +35,7 @@ parameters:
     Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
     This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
     The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-
+  load_in_8bit: True
 
 steps:
   prepare_data:
diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index 3f9c6508..95021ac0 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -19,6 +19,7 @@
 from steps import evaluate_model, finetune, prepare_data, promote
 from zenml import logging as zenml_logging
 from zenml import pipeline
+from utils.hashing import compute_md5
 
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
     10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
@@ -26,7 +27,9 @@
 
 
 @pipeline
-def llm_peft_full_finetune(system_prompt: str, base_model_id: str):
+def llm_peft_full_finetune(
+    system_prompt: str, base_model_id: str, use_fast: bool = True, load_in_8bit: bool = False, load_in_4bit: bool = False
+):
     """Pipeline for finetuning an LLM with peft.
 
     It will run the following steps:
@@ -37,18 +40,32 @@ def llm_peft_full_finetune(system_prompt: str, base_model_id: str):
     - evaluate_model: evaluate the base and finetuned model
     - promote: promote the model to the target stage, if evaluation was successful
     """
+    if not load_in_8bit and not load_in_4bit:
+        raise ValueError("At least one of `load_in_8bit` and `load_in_4bit` must be True.")
+    if load_in_4bit and load_in_8bit:
+        raise ValueError("Only one of `load_in_8bit` and `load_in_4bit` can be True.")
+    
     datasets_dir = prepare_data(
-        base_model_id=base_model_id, system_prompt=system_prompt
+        base_model_id=base_model_id,
+        system_prompt=system_prompt,
+        use_fast=use_fast,
     )
     ft_model_dir = finetune(
         base_model_id,
         datasets_dir,
+        finetune_script_sha=compute_md5("scripts/finetune.py"),
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
     )
     evaluate_model(
         base_model_id,
         system_prompt,
         datasets_dir,
         ft_model_dir,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
         id="evaluate_finetuned",
     )
     evaluate_model(
@@ -56,6 +73,9 @@ def llm_peft_full_finetune(system_prompt: str, base_model_id: str):
         system_prompt,
         datasets_dir,
         None,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
         id="evaluate_base",
     )
     promote(after=["evaluate_finetuned", "evaluate_base"])
diff --git a/llm-lora-finetuning/scripts/finetune.py b/llm-lora-finetuning/scripts/finetune.py
index 755c34da..a0e34744 100644
--- a/llm-lora-finetuning/scripts/finetune.py
+++ b/llm-lora-finetuning/scripts/finetune.py
@@ -19,6 +19,7 @@
 from typing import List
 
 import click
+import torch
 import transformers
 from datasets import load_from_disk
 from zenml.logger import get_logger
@@ -26,9 +27,7 @@
 logger = get_logger(__name__)
 
 
-@click.command(
-    help="Technical wrapper to pass into the `accelerate launch` command."
-)
+@click.command(help="Technical wrapper to pass into the `accelerate launch` command.")
 @click.option(
     "--base-model-id",
     type=str,
@@ -119,6 +118,24 @@
     default="",
     help="The path to the finetuned model directory.",
 )
+@click.option(
+    "--use-fast",
+    is_flag=True,
+    default=False,
+    help="Use the fast tokenizer.",
+)
+@click.option(
+    "--load-in-4bit",
+    is_flag=True,
+    default=False,
+    help="Whether to load the model in 4bit mode",
+)
+@click.option(
+    "--load-in-8bit",
+    is_flag=True,
+    default=False,
+    help="Whether to load the model in 8bit mode",
+)
 def cli_wrapper(
     base_model_id: str,
     dataset_dir: str,
@@ -135,6 +152,9 @@ def cli_wrapper(
     use_accelerate: bool = False,
     label_names: List[str] = None,
     ft_model_dir: str = "",
+    use_fast: bool = False,
+    load_in_4bit: bool = False,
+    load_in_8bit: bool = False,
 ) -> Path:
     dataset_dir = Path(dataset_dir)
     if ft_model_dir:
@@ -158,6 +178,9 @@ def cli_wrapper(
         use_accelerate=use_accelerate,
         label_names=list(label_names),
         ft_model_dir=ft_model_dir,
+        use_fast=use_fast,
+        load_in_4bit=load_in_4bit,
+        load_in_8bit=load_in_8bit,
     )
 
 
@@ -177,6 +200,9 @@ def accelerated_finetune(
     use_accelerate: bool = False,
     label_names: List[str] = None,
     ft_model_dir: Path = None,
+    use_fast: bool = True,
+    load_in_4bit: bool = False,
+    load_in_8bit: bool = False,
 ) -> Path:
     """Finetune the model using PEFT.
 
@@ -198,6 +224,9 @@ def accelerated_finetune(
         use_accelerate: Whether to use accelerate.
         label_names: The label names to use.
         ft_model_dir: The path to the finetuned model directory.
+        use_fast: Whether to use fast tokenizers.
+        load_in_4bit: Whether to load the model in 4bit mode.
+        load_in_8bit: Whether to load the model in 8bit mode.
 
     Returns:
         The path to the finetuned model directory.
@@ -227,7 +256,7 @@ def accelerated_finetune(
 
     if should_print:
         logger.info("Loading datasets...")
-    tokenizer = load_tokenizer(base_model_id)
+    tokenizer = load_tokenizer(base_model_id, use_fast=use_fast)
     tokenized_train_dataset = load_from_disk(dataset_dir / "train")
     tokenized_val_dataset = load_from_disk(dataset_dir / "val")
 
@@ -238,6 +267,8 @@ def accelerated_finetune(
         base_model_id,
         use_accelerate=use_accelerate,
         should_print=should_print,
+        load_in_4bit=load_in_4bit,
+        load_in_8bit=load_in_8bit,
     )
 
     trainer = transformers.Trainer(
@@ -252,14 +283,16 @@ def accelerated_finetune(
             gradient_accumulation_steps=gradient_accumulation_steps,
             max_steps=max_steps,
             learning_rate=lr,
-            logging_steps=logging_steps,
+            logging_steps=(
+                min(logging_steps, max_steps) if max_steps >= 0 else logging_steps
+            ),
             bf16=bf16,
             optim=optimizer,
             logging_dir="./logs",
             save_strategy="steps",
-            save_steps=save_steps,
+            save_steps=min(save_steps, max_steps) if max_steps >= 0 else save_steps,
             evaluation_strategy="steps",
-            eval_steps=eval_steps,
+            eval_steps=min(eval_steps, max_steps) if max_steps >= 0 else eval_steps,
             do_eval=True,
             label_names=label_names,
         ),
@@ -279,16 +312,21 @@ def accelerated_finetune(
 
     if should_print:
         logger.info("Saving model...")
-    if not use_accelerate:
-        model.config.use_cache = True
-    else:
-        model = accelerator.unwrap_model(model)
 
     if ft_model_dir is None:
         ft_model_dir = Path("model_dir")
     if not use_accelerate or accelerator.is_main_process:
         ft_model_dir.mkdir(parents=True, exist_ok=True)
+    if not use_accelerate:
+        model.config.use_cache = True
         trainer.save_model(ft_model_dir)
+    else:
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(
+            ft_model_dir,
+            is_main_process=accelerator.is_main_process,
+            save_function=accelerator.save,
+        )
 
     return ft_model_dir
 
diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 13c49cbb..454c950f 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -28,6 +28,7 @@
 from utils.tokenizer import load_tokenizer, tokenize_for_eval
 from zenml import log_model_metadata, save_artifact, step
 from zenml.logger import get_logger
+from utils.cuda import cleanup_memory
 
 logger = get_logger(__name__)
 
@@ -38,6 +39,9 @@ def evaluate_model(
     system_prompt: str,
     datasets_dir: Path,
     ft_model_dir: Optional[Path],
+    use_fast: bool = True,
+    load_in_4bit: bool = False,
+    load_in_8bit: bool = False,
 ) -> None:
     """Evaluate the model with ROUGE metrics.
 
@@ -47,21 +51,36 @@ def evaluate_model(
         datasets_dir: The path to the datasets directory.
         ft_model_dir: The path to the finetuned model directory. If None, the
             base model will be used.
+        use_fast: Whether to use the fast tokenizer.
+        load_in_4bit: Whether to load the model in 4bit mode.
+        load_in_8bit: Whether to load the model in 8bit mode.
     """
+    cleanup_memory()
     logger.info("Evaluating model...")
 
     logger.info("Loading dataset...")
-    tokenizer = load_tokenizer(base_model_id, is_eval=True)
+    tokenizer = load_tokenizer(
+        base_model_id,
+        is_eval=True,
+        use_fast=use_fast,
+    )
     test_dataset = load_from_disk(datasets_dir / "test_raw")
     test_dataset = test_dataset[:50]
     ground_truths = test_dataset["meaning_representation"]
     tokenized_train_dataset = tokenize_for_eval(
-        test_dataset, tokenizer, system_prompt
+        test_dataset,
+        tokenizer,
+        system_prompt,
     )
 
     if ft_model_dir is None:
         logger.info("Generating using base model...")
-        model = load_base_model(base_model_id, is_training=False)
+        model = load_base_model(
+            base_model_id,
+            is_training=False,
+            load_in_4bit=load_in_4bit,
+            load_in_8bit=load_in_8bit,
+        )
     else:
         logger.info("Generating using finetuned model...")
         model = load_pretrained_model(ft_model_dir)
@@ -83,7 +102,8 @@ def evaluate_model(
     prefix = "base_model_" if ft_model_dir is None else "finetuned_model_"
     rouge = evaluate.load("rouge")
     rouge_metrics = rouge.compute(
-        predictions=predictions, references=ground_truths
+        predictions=predictions,
+        references=ground_truths,
     )
     metadata = {prefix + k: float(v) for k, v in rouge_metrics.items()}
 
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 0ac9a529..2964c0d5 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -27,6 +27,7 @@
 from zenml.materializers import BuiltInMaterializer
 
 from scripts.finetune import accelerated_finetune
+from utils.cuda import cleanup_memory
 
 logger = get_logger(__name__)
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
@@ -38,6 +39,7 @@
 def finetune(
     base_model_id: str,
     dataset_dir: Path,
+    finetune_script_sha: str,
     max_steps: int = 1000,
     logging_steps: int = 50,
     eval_steps: int = 50,
@@ -49,6 +51,9 @@ def finetune(
     warmup_steps: int = 5,
     bf16: bool = True,
     use_accelerate: bool = False,
+    use_fast: bool = True,
+    load_in_4bit: bool = False,
+    load_in_8bit: bool = False,
 ) -> Annotated[Path, "ft_model_dir"]:
     """Finetune the model using PEFT.
 
@@ -60,6 +65,7 @@ def finetune(
     Args:
         base_model_id: The base model id to use.
         dataset_dir: The path to the dataset directory.
+        finetune_script_sha: The sha of the finetune script.
         max_steps: The maximum number of steps to train for.
         logging_steps: The number of steps to log at.
         eval_steps: The number of steps to evaluate at.
@@ -71,10 +77,14 @@ def finetune(
         warmup_steps: The number of warmup steps.
         bf16: Whether to use bf16.
         use_accelerate: Whether to use accelerate.
+        use_fast: Whether to use the fast tokenizer.
+        load_in_4bit: Whether to load the model in 4bit mode.
+        load_in_8bit: Whether to load the model in 8bit mode.
 
     Returns:
         The path to the finetuned model directory.
     """
+    cleanup_memory()
     if not use_accelerate:
         return accelerated_finetune(
             base_model_id=base_model_id,
@@ -90,6 +100,9 @@ def finetune(
             warmup_steps=warmup_steps,
             bf16=bf16,
             use_accelerate=False,
+            use_fast=use_fast,
+            load_in_4bit=load_in_4bit,
+            load_in_8bit=load_in_8bit,
         )
 
     else:
@@ -114,6 +127,12 @@ def finetune(
             command += f"--use-accelerate "
             command += f"-l input_ids "
             command += f'--ft-model-dir "{ft_model_dir}" '
+        if use_fast:
+            command += f"--use-fast "
+        if load_in_4bit:
+            command += f"--load-in-4bit "
+        if load_in_8bit:
+            command += f"--load-in-8bit "
 
         print(command)
 
@@ -123,8 +142,8 @@ def finetune(
             stdout=subprocess.PIPE,
             universal_newlines=True,
         )
-        for stdout_line in result.stdout:
-            print(stdout_line, end="")
+        for stdout_line in result.stdout.split("\n"):
+            print(stdout_line)
         if result.returncode == 0:
             logger.info("Accelerate training job finished.")
             return Path(ft_model_dir)
diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index dc2c2a2f..72bc8310 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -23,6 +23,7 @@
 from utils.tokenizer import generate_and_tokenize_prompt, load_tokenizer
 from zenml import log_model_metadata, step
 from zenml.materializers import BuiltInMaterializer
+from utils.cuda import cleanup_memory
 
 
 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
@@ -30,6 +31,7 @@ def prepare_data(
     base_model_id: str,
     system_prompt: str,
     dataset_name: str = "gem/viggo",
+    use_fast: bool = True
 ) -> Annotated[Path, "datasets_dir"]:
     """Prepare the datasets for finetuning.
 
@@ -37,12 +39,15 @@ def prepare_data(
         base_model_id: The base model id to use.
         system_prompt: The system prompt to use.
         dataset_name: The name of the dataset to use.
+        use_fast: Whether to use the fast tokenizer.
 
     Returns:
         The path to the datasets directory.
     """
     from datasets import load_dataset
 
+    cleanup_memory()
+
     log_model_metadata(
         {
             "system_prompt": system_prompt,
@@ -50,7 +55,7 @@ def prepare_data(
         }
     )
 
-    tokenizer = load_tokenizer(base_model_id, False)
+    tokenizer = load_tokenizer(base_model_id, False, use_fast)
     gen_and_tokenize = partial(
         generate_and_tokenize_prompt,
         tokenizer=tokenizer,
diff --git a/llm-lora-finetuning/steps/promote.py b/llm-lora-finetuning/steps/promote.py
index eca870d6..72a36de4 100644
--- a/llm-lora-finetuning/steps/promote.py
+++ b/llm-lora-finetuning/steps/promote.py
@@ -18,6 +18,7 @@
 from zenml import get_step_context, step
 from zenml.client import Client
 from zenml.logger import get_logger
+from utils.cuda import cleanup_memory
 
 logger = get_logger(__name__)
 
@@ -36,6 +37,7 @@ def promote(
         metric: The metric to use for promotion.
         target_stage: The target stage to promote to.
     """
+    cleanup_memory()
     context_model = get_step_context().model
     base_metrics = context_model.load_artifact("base_model_rouge_metrics")
     ft_metrics = context_model.load_artifact("finetuned_model_rouge_metrics")
diff --git a/llm-lora-finetuning/utils/cuda.py b/llm-lora-finetuning/utils/cuda.py
index 35d78d4a..958b5bac 100644
--- a/llm-lora-finetuning/utils/cuda.py
+++ b/llm-lora-finetuning/utils/cuda.py
@@ -18,9 +18,13 @@
 import gc
 
 import torch
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 def cleanup_memory() -> None:
     """Clean up GPU memory."""
+    logger.info("Cleaning up GPU memory...")
     while gc.collect():
         torch.cuda.empty_cache()
diff --git a/llm-lora-finetuning/utils/hashing.py b/llm-lora-finetuning/utils/hashing.py
new file mode 100644
index 00000000..781f38f4
--- /dev/null
+++ b/llm-lora-finetuning/utils/hashing.py
@@ -0,0 +1,15 @@
+import sys
+import hashlib
+
+BUF_SIZE = 65536 
+
+def compute_md5(file_path:str)->str:
+    md5 = hashlib.md5()
+
+    with open(file_path, 'rb') as f:
+        while True:
+            data = f.read(BUF_SIZE)
+            if not data:
+                break
+            md5.update(data)
+    return md5.hexdigest()
\ No newline at end of file
diff --git a/llm-lora-finetuning/utils/loaders.py b/llm-lora-finetuning/utils/loaders.py
index fc03afbb..837b1824 100644
--- a/llm-lora-finetuning/utils/loaders.py
+++ b/llm-lora-finetuning/utils/loaders.py
@@ -31,6 +31,8 @@ def load_base_model(
     is_training: bool = True,
     use_accelerate: bool = False,
     should_print: bool = True,
+    load_in_8bit: bool = False,
+    load_in_4bit: bool = False,
 ) -> Union[Any, Tuple[Any, Dataset, Dataset]]:
     """Load the base model.
 
@@ -39,6 +41,10 @@ def load_base_model(
         is_training: Whether the model should be prepared for training or not.
             If True, the Lora parameters will be enabled and PEFT will be
             applied.
+        use_accelerate: Whether to use the Accelerate library for training.
+        should_print: Whether to print the trainable parameters.
+        load_in_8bit: Whether to load the model in 8-bit mode.
+        load_in_4bit: Whether to load the model in 4-bit mode.
 
     Returns:
         The base model.
@@ -53,8 +59,8 @@ def load_base_model(
         device_map = {"": torch.cuda.current_device()}
 
     bnb_config = BitsAndBytesConfig(
-        # load_in_8bit=True,
-        load_in_4bit=True,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -95,11 +101,17 @@ def load_base_model(
     return model
 
 
-def load_pretrained_model(ft_model_dir: Path) -> AutoModelForCausalLM:
+def load_pretrained_model(
+    ft_model_dir: Path,
+    load_in_4bit: bool = False,
+    load_in_8bit: bool = False,
+) -> AutoModelForCausalLM:
     """Load the finetuned model saved in the output directory.
 
     Args:
         ft_model_dir: The path to the finetuned model directory.
+        load_in_4bit: Whether to load the model in 4-bit mode.
+        load_in_8bit: Whether to load the model in 8-bit mode.
 
     Returns:
         The finetuned model.
@@ -107,7 +119,8 @@ def load_pretrained_model(ft_model_dir: Path) -> AutoModelForCausalLM:
     from transformers import BitsAndBytesConfig
 
     bnb_config = BitsAndBytesConfig(
-        load_in_8bit=True,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
         bnb_4bit_compute_dtype=torch.bfloat16,
diff --git a/llm-lora-finetuning/utils/tokenizer.py b/llm-lora-finetuning/utils/tokenizer.py
index 516e9c10..f2bf51d8 100644
--- a/llm-lora-finetuning/utils/tokenizer.py
+++ b/llm-lora-finetuning/utils/tokenizer.py
@@ -22,19 +22,24 @@
 def load_tokenizer(
     base_model_id: str,
     is_eval: bool = False,
+    use_fast: bool = True,
 ) -> AutoTokenizer:
     """Loads the tokenizer for the given base model id.
 
     Args:
         base_model_id: The base model id to use.
         is_eval: Whether to load the tokenizer for evaluation.
+        use_fast: Whether to use the fast tokenizer.
 
     Returns:
         The tokenizer.
     """
     if is_eval:
         tokenizer = AutoTokenizer.from_pretrained(
-            base_model_id, add_bos_token=True, device_map="auto"
+            base_model_id,
+            add_bos_token=True,
+            device_map="auto",
+            use_fast=use_fast,
         )
         tokenizer.pad_token_id = 0
     else:
@@ -44,6 +49,7 @@ def load_tokenizer(
             padding_side="left",
             add_eos_token=True,
             device_map="auto",
+            use_fast=use_fast,
         )
         tokenizer.pad_token = tokenizer.eos_token
     return tokenizer
@@ -127,6 +133,4 @@ def tokenize_for_eval(
 """
         for data_point in data_points["target"]
     ]
-    return tokenizer(eval_prompts, padding="longest", return_tensors="pt").to(
-        "cuda"
-    )
+    return tokenizer(eval_prompts, padding="longest", return_tensors="pt").to("cuda")

From 0c85e8bed5cbae1ecf3344f1d789df8e1d3e2994 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 10:52:23 +0200
Subject: [PATCH 07/28] lint

---
 llm-lora-finetuning/pipelines/train.py        | 18 +++++++++----
 llm-lora-finetuning/scripts/finetune.py       | 17 ++++++++----
 llm-lora-finetuning/steps/evaluate_model.py   |  2 +-
 llm-lora-finetuning/steps/finetune.py         | 14 +++++++---
 llm-lora-finetuning/steps/prepare_datasets.py |  4 +--
 llm-lora-finetuning/steps/promote.py          |  2 +-
 llm-lora-finetuning/utils/hashing.py          | 27 +++++++++++++++----
 llm-lora-finetuning/utils/tokenizer.py        |  4 ++-
 8 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index 95021ac0..e8a2c790 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -17,9 +17,9 @@
 
 
 from steps import evaluate_model, finetune, prepare_data, promote
+from utils.hashing import compute_md5
 from zenml import logging as zenml_logging
 from zenml import pipeline
-from utils.hashing import compute_md5
 
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
     10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
@@ -28,7 +28,11 @@
 
 @pipeline
 def llm_peft_full_finetune(
-    system_prompt: str, base_model_id: str, use_fast: bool = True, load_in_8bit: bool = False, load_in_4bit: bool = False
+    system_prompt: str,
+    base_model_id: str,
+    use_fast: bool = True,
+    load_in_8bit: bool = False,
+    load_in_4bit: bool = False,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -41,10 +45,14 @@ def llm_peft_full_finetune(
     - promote: promote the model to the target stage, if evaluation was successful
     """
     if not load_in_8bit and not load_in_4bit:
-        raise ValueError("At least one of `load_in_8bit` and `load_in_4bit` must be True.")
+        raise ValueError(
+            "At least one of `load_in_8bit` and `load_in_4bit` must be True."
+        )
     if load_in_4bit and load_in_8bit:
-        raise ValueError("Only one of `load_in_8bit` and `load_in_4bit` can be True.")
-    
+        raise ValueError(
+            "Only one of `load_in_8bit` and `load_in_4bit` can be True."
+        )
+
     datasets_dir = prepare_data(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
diff --git a/llm-lora-finetuning/scripts/finetune.py b/llm-lora-finetuning/scripts/finetune.py
index a0e34744..cdd5b8c2 100644
--- a/llm-lora-finetuning/scripts/finetune.py
+++ b/llm-lora-finetuning/scripts/finetune.py
@@ -19,7 +19,6 @@
 from typing import List
 
 import click
-import torch
 import transformers
 from datasets import load_from_disk
 from zenml.logger import get_logger
@@ -27,7 +26,9 @@
 logger = get_logger(__name__)
 
 
-@click.command(help="Technical wrapper to pass into the `accelerate launch` command.")
+@click.command(
+    help="Technical wrapper to pass into the `accelerate launch` command."
+)
 @click.option(
     "--base-model-id",
     type=str,
@@ -284,15 +285,21 @@ def accelerated_finetune(
             max_steps=max_steps,
             learning_rate=lr,
             logging_steps=(
-                min(logging_steps, max_steps) if max_steps >= 0 else logging_steps
+                min(logging_steps, max_steps)
+                if max_steps >= 0
+                else logging_steps
             ),
             bf16=bf16,
             optim=optimizer,
             logging_dir="./logs",
             save_strategy="steps",
-            save_steps=min(save_steps, max_steps) if max_steps >= 0 else save_steps,
+            save_steps=min(save_steps, max_steps)
+            if max_steps >= 0
+            else save_steps,
             evaluation_strategy="steps",
-            eval_steps=min(eval_steps, max_steps) if max_steps >= 0 else eval_steps,
+            eval_steps=min(eval_steps, max_steps)
+            if max_steps >= 0
+            else eval_steps,
             do_eval=True,
             label_names=label_names,
         ),
diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 454c950f..4d3240f0 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -21,6 +21,7 @@
 import evaluate
 import torch
 from datasets import load_from_disk
+from utils.cuda import cleanup_memory
 from utils.loaders import (
     load_base_model,
     load_pretrained_model,
@@ -28,7 +29,6 @@
 from utils.tokenizer import load_tokenizer, tokenize_for_eval
 from zenml import log_model_metadata, save_artifact, step
 from zenml.logger import get_logger
-from utils.cuda import cleanup_memory
 
 logger = get_logger(__name__)
 
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 2964c0d5..027be10a 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -21,13 +21,13 @@
 import torch
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
+from utils.cuda import cleanup_memory
 from zenml import logging as zenml_logging
 from zenml import step
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 
 from scripts.finetune import accelerated_finetune
-from utils.cuda import cleanup_memory
 
 logger = get_logger(__name__)
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
@@ -108,7 +108,9 @@ def finetune(
     else:
         logger.info("Starting accelerate training job...")
         ft_model_dir = "model_dir"
-        command = f"accelerate launch --num_processes {torch.cuda.device_count()} "
+        command = (
+            f"accelerate launch --num_processes {torch.cuda.device_count()} "
+        )
         command += str(Path("scripts/finetune.py").absolute()) + " "
         command += f'--base-model-id "{base_model_id}" '
         command += f'--dataset-dir "{dataset_dir}" '
@@ -118,8 +120,12 @@ def finetune(
         command += f"--save-steps {save_steps} "
         command += f"--optimizer {optimizer} "
         command += f"--lr {lr} "
-        command += f"--per-device-train-batch-size {per_device_train_batch_size} "
-        command += f"--gradient-accumulation-steps {gradient_accumulation_steps} "
+        command += (
+            f"--per-device-train-batch-size {per_device_train_batch_size} "
+        )
+        command += (
+            f"--gradient-accumulation-steps {gradient_accumulation_steps} "
+        )
         command += f"--warmup-steps {warmup_steps} "
         if bf16:
             command += f"--bf16 "
diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index 72bc8310..9c0aed23 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -20,10 +20,10 @@
 
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
+from utils.cuda import cleanup_memory
 from utils.tokenizer import generate_and_tokenize_prompt, load_tokenizer
 from zenml import log_model_metadata, step
 from zenml.materializers import BuiltInMaterializer
-from utils.cuda import cleanup_memory
 
 
 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
@@ -31,7 +31,7 @@ def prepare_data(
     base_model_id: str,
     system_prompt: str,
     dataset_name: str = "gem/viggo",
-    use_fast: bool = True
+    use_fast: bool = True,
 ) -> Annotated[Path, "datasets_dir"]:
     """Prepare the datasets for finetuning.
 
diff --git a/llm-lora-finetuning/steps/promote.py b/llm-lora-finetuning/steps/promote.py
index 72a36de4..0772d4d3 100644
--- a/llm-lora-finetuning/steps/promote.py
+++ b/llm-lora-finetuning/steps/promote.py
@@ -15,10 +15,10 @@
 # limitations under the License.
 #
 
+from utils.cuda import cleanup_memory
 from zenml import get_step_context, step
 from zenml.client import Client
 from zenml.logger import get_logger
-from utils.cuda import cleanup_memory
 
 logger = get_logger(__name__)
 
diff --git a/llm-lora-finetuning/utils/hashing.py b/llm-lora-finetuning/utils/hashing.py
index 781f38f4..b1977003 100644
--- a/llm-lora-finetuning/utils/hashing.py
+++ b/llm-lora-finetuning/utils/hashing.py
@@ -1,15 +1,32 @@
-import sys
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 import hashlib
 
-BUF_SIZE = 65536 
+BUF_SIZE = 65536
+
 
-def compute_md5(file_path:str)->str:
+def compute_md5(file_path: str) -> str:
     md5 = hashlib.md5()
 
-    with open(file_path, 'rb') as f:
+    with open(file_path, "rb") as f:
         while True:
             data = f.read(BUF_SIZE)
             if not data:
                 break
             md5.update(data)
-    return md5.hexdigest()
\ No newline at end of file
+    return md5.hexdigest()
diff --git a/llm-lora-finetuning/utils/tokenizer.py b/llm-lora-finetuning/utils/tokenizer.py
index f2bf51d8..22f963ff 100644
--- a/llm-lora-finetuning/utils/tokenizer.py
+++ b/llm-lora-finetuning/utils/tokenizer.py
@@ -133,4 +133,6 @@ def tokenize_for_eval(
 """
         for data_point in data_points["target"]
     ]
-    return tokenizer(eval_prompts, padding="longest", return_tensors="pt").to("cuda")
+    return tokenizer(eval_prompts, padding="longest", return_tensors="pt").to(
+        "cuda"
+    )

From 27e6795367b363e2c02942ed88baf64a5efaf65a Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:25:59 +0200
Subject: [PATCH 08/28] fsspec fix

---
 llm-lora-finetuning/steps/prepare_datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index 9c0aed23..dbe59f79 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -69,8 +69,8 @@ def prepare_data(
     test_dataset = load_dataset(dataset_name, split="test")
 
     datasets_path = Path("datasets")
-    tokenized_train_dataset.save_to_disk(datasets_path / "train")
-    tokenized_val_dataset.save_to_disk(datasets_path / "val")
-    test_dataset.save_to_disk(datasets_path / "test_raw")
+    tokenized_train_dataset.save_to_disk(str((datasets_path / "train")))
+    tokenized_val_dataset.save_to_disk(str((datasets_path / "val")))
+    test_dataset.save_to_disk(str((datasets_path / "test_raw")))
 
     return datasets_path

From f2576301b56ad9f2bd17b65535cb10a0b401152c Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:51:53 +0200
Subject: [PATCH 09/28] pin datasets to lower version

---
 llm-lora-finetuning/requirements.txt          | 2 +-
 llm-lora-finetuning/steps/prepare_datasets.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llm-lora-finetuning/requirements.txt b/llm-lora-finetuning/requirements.txt
index 2276008b..309589ac 100644
--- a/llm-lora-finetuning/requirements.txt
+++ b/llm-lora-finetuning/requirements.txt
@@ -1,6 +1,6 @@
 zenml
 torch>=2.2.0
-datasets
+datasets<=2.16.1
 transformers
 peft
 bitsandbytes==0.41.3
diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index dbe59f79..9c0aed23 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -69,8 +69,8 @@ def prepare_data(
     test_dataset = load_dataset(dataset_name, split="test")
 
     datasets_path = Path("datasets")
-    tokenized_train_dataset.save_to_disk(str((datasets_path / "train")))
-    tokenized_val_dataset.save_to_disk(str((datasets_path / "val")))
-    test_dataset.save_to_disk(str((datasets_path / "test_raw")))
+    tokenized_train_dataset.save_to_disk(datasets_path / "train")
+    tokenized_val_dataset.save_to_disk(datasets_path / "val")
+    test_dataset.save_to_disk(datasets_path / "test_raw")
 
     return datasets_path

From f68a46910ae2ace107071f198b954000c653c1b2 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:54:40 +0200
Subject: [PATCH 10/28] relax datasets pin a bit

---
 llm-lora-finetuning/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm-lora-finetuning/requirements.txt b/llm-lora-finetuning/requirements.txt
index 309589ac..44420ab0 100644
--- a/llm-lora-finetuning/requirements.txt
+++ b/llm-lora-finetuning/requirements.txt
@@ -1,6 +1,6 @@
 zenml
 torch>=2.2.0
-datasets<=2.16.1
+datasets<=2.18
 transformers
 peft
 bitsandbytes==0.41.3

From 65cdc7eb8fe6bbacb9db65c4681c9434178fb73f Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 3 May 2024 16:22:33 +0200
Subject: [PATCH 11/28] polish for step operators

---
 llm-lora-finetuning/.dockerignore             |  1 +
 llm-lora-finetuning/README.md                 | 64 ++++++++++++------
 ...une.yaml => mistral_default_finetune.yaml} |  0
 ...tune.yaml => mistral_remote_finetune.yaml} |  0
 ... phi_accelerated_local_bf16_finetune.yaml} |  0
 ...ml => phi_accelerated_local_finetune.yaml} |  0
 .../phi_accelerated_remote_finetune.yaml      | 67 +++++++++++++++++++
 7 files changed, 110 insertions(+), 22 deletions(-)
 rename llm-lora-finetuning/configs/{default_finetune.yaml => mistral_default_finetune.yaml} (100%)
 rename llm-lora-finetuning/configs/{remote_finetune.yaml => mistral_remote_finetune.yaml} (100%)
 rename llm-lora-finetuning/configs/{phi_accelerated_bf16_finetune.yaml => phi_accelerated_local_bf16_finetune.yaml} (100%)
 rename llm-lora-finetuning/configs/{phi_accelerated_finetune.yaml => phi_accelerated_local_finetune.yaml} (100%)
 create mode 100644 llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml

diff --git a/llm-lora-finetuning/.dockerignore b/llm-lora-finetuning/.dockerignore
index c43c482f..41da4b80 100644
--- a/llm-lora-finetuning/.dockerignore
+++ b/llm-lora-finetuning/.dockerignore
@@ -3,3 +3,4 @@
 !/pipelines/**
 !/steps/**
 !/utils/**
+!/scripts/**
diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
index 75de2133..e41cc056 100644
--- a/llm-lora-finetuning/README.md
+++ b/llm-lora-finetuning/README.md
@@ -34,10 +34,10 @@ pip install -r requirements.txt
 
 ### 👷 Combined feature engineering and finetuning pipeline
 
-The easiest way to get started with just a single command is to run the finetuning pipeline with the `default_finetune.yaml` configuration file, which will do data preparation, model finetuning, evaluation with [Rouge](https://huggingface.co/spaces/evaluate-metric/rouge) and promotion:
+The easiest way to get started with just a single command is to run the finetuning pipeline with the `mistral_default_finetune.yaml` configuration file, which will do data preparation, model finetuning, evaluation with [Rouge](https://huggingface.co/spaces/evaluate-metric/rouge) and promotion:
 
 ```shell
-python run.py --config default_finetune.yaml
+python run.py --config mistral_default_finetune.yaml
 ```
 
 When running the pipeline like this, the trained model will be stored in the ZenML artifact store.
@@ -50,6 +50,19 @@ When running the pipeline like this, the trained model will be stored in the Zen
   <br/>
 </div>
 
+### ⚡ Accelerate your finetuning
+
+Do you want to benefit from multi-GPU training with Distributed Data Parallelism (DDP)? Then you can use other configuration files prepared for this task.
+For example, `phi_accelerated_local_finetune.yaml` can run finetuning of the [Microsoft Phi 2](https://huggingface.co/microsoft/phi-2) powered by [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/en/index) on all GPUs available in the environment. To do so, just call:
+
+```shell
+python run.py --config phi_accelerated_local_finetune.yaml # if your architecture doesn't support BF16
+# OR
+python run.py --config phi_accelerated_local_bf16_finetune.yaml # if your architecture supports BF16
+```
+
+Under the hood, the finetuning step will spin up the accelerated job using the finetuning script CLI wrapper (`scripts/finetune.py`), which will run on all available GPUs.
+
 ## ☁️ Running with a remote stack
 
 To finetune an LLM on remote infrastructure, you can either use a remote orchestrator or a remote step operator. Follow these steps to set up a complete remote stack:
@@ -71,26 +84,33 @@ The project loosely follows [the recommended ZenML project structure](https://do
 
 ```
 .
-├── configs                         # pipeline configuration files
-│   ├── default_finetune.yaml       # default local configuration (or remote orchestrator)
-│   └── remote_finetune.yaml        # default step operator configuration
+├── configs                                       # pipeline configuration files
+│   ├── mistral_default_finetune.yaml             # mistral local configuration (or remote orchestrator)
+│   ├── mistral_remote_finetune.yaml              # mistral step operator configuration
+│   ├── phi_accelerated_local_bf16_finetune.yaml  # phi accelerated local with bf16
+│   ├── phi_accelerated_local_finetune.yaml       # phi accelerated local without bf16
+│   ├── phi_accelerated_remote_finetune.yaml      # phi accelerated step operator without bf16
+│   ├── phi_local_bf16_finetune.yaml              # phi local with bf16
+│   └── phi_local_finetune.yaml                   # phi local without bf16
 ├── materializers
-│   └── directory_materializer.py   # custom materializer to push whole directories to the artifact store and back
-├── pipelines                       # `zenml.pipeline` implementations
-│   └── train.py                    # Finetuning and evaluation pipeline
-├── steps                           # logically grouped `zenml.steps` implementations
-│   ├── evaluate_model.py           # evaluate base and finetuned models using Rouge metrics
-│   ├── finetune.py                 # finetune the base model
-│   ├── prepare_datasets.py         # load and tokenize dataset
-│   └── promote.py                  # promote good models to target environment
-├── utils                           # utility functions
-│   ├── callbacks.py                # custom callbacks
-│   ├── cuda.py                     # helpers for CUDA
-│   ├── loaders.py                  # loaders for models and data
-│   ├── logging.py                  # logging helpers
-│   └── tokenizer.py                # load and tokenize
+│   └── directory_materializer.py                 # custom materializer to push whole directories to the artifact store and back
+├── pipelines                                     # `zenml.pipeline` implementations
+│   └── train.py                                  # Finetuning and evaluation pipeline
+├── scripts                                       # scripts used in the finetuning
+│   └── finetune.py                               # Finetuning function and a CLI wrapper for `accelerate launch ...`
+├── steps                                         # logically grouped `zenml.steps` implementations
+│   ├── evaluate_model.py                         # evaluate base and finetuned models using Rouge metrics
+│   ├── finetune.py                               # finetune the base model
+│   ├── prepare_datasets.py                       # load and tokenize dataset
+│   └── promote.py                                # promote good models to target environment
+├── utils                                         # utility functions
+│   ├── callbacks.py                              # custom callbacks
+│   ├── cuda.py                                   # helpers for CUDA
+│   ├── loaders.py                                # loaders for models and data
+│   ├── logging.py                                # logging helpers
+│   └── tokenizer.py                              # load and tokenize
 ├── .dockerignore
-├── README.md                       # this file
-├── requirements.txt                # extra Python dependencies 
-└── run.py                          # CLI tool to run pipelines on ZenML Stack
+├── README.md                                     # this file
+├── requirements.txt                              # extra Python dependencies 
+└── run.py                                        # CLI tool to run pipelines on ZenML Stack
 ```
diff --git a/llm-lora-finetuning/configs/default_finetune.yaml b/llm-lora-finetuning/configs/mistral_default_finetune.yaml
similarity index 100%
rename from llm-lora-finetuning/configs/default_finetune.yaml
rename to llm-lora-finetuning/configs/mistral_default_finetune.yaml
diff --git a/llm-lora-finetuning/configs/remote_finetune.yaml b/llm-lora-finetuning/configs/mistral_remote_finetune.yaml
similarity index 100%
rename from llm-lora-finetuning/configs/remote_finetune.yaml
rename to llm-lora-finetuning/configs/mistral_remote_finetune.yaml
diff --git a/llm-lora-finetuning/configs/phi_accelerated_bf16_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_local_bf16_finetune.yaml
similarity index 100%
rename from llm-lora-finetuning/configs/phi_accelerated_bf16_finetune.yaml
rename to llm-lora-finetuning/configs/phi_accelerated_local_bf16_finetune.yaml
diff --git a/llm-lora-finetuning/configs/phi_accelerated_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_local_finetune.yaml
similarity index 100%
rename from llm-lora-finetuning/configs/phi_accelerated_finetune.yaml
rename to llm-lora-finetuning/configs/phi_accelerated_local_finetune.yaml
diff --git a/llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml
new file mode 100644
index 00000000..abbbe28f
--- /dev/null
+++ b/llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml
@@ -0,0 +1,67 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model:
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
+  tags:
+    - llm
+    - peft
+    - microsoft/phi-2
+  version: 100_steps_accelerate
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+    environment:
+      PJRT_DEVICE: CUDA
+      USE_TORCH_XLA: "false"
+      MKL_SERVICE_FORCE_INTEL: "1"
+
+parameters:
+  base_model_id: microsoft/phi-2
+  system_prompt: |
+    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+  use_fast: False
+  load_in_4bit: True
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    step_operator: gcp_t4x2
+    parameters:
+      max_steps: 100
+      eval_steps: 50
+      bf16: False
+      use_accelerate: True
+
+  evaluate_finetuned:
+    step_operator: gcp_t4x2
+
+  evaluate_base:
+    step_operator: gcp_t4x2
+      
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging

From 93398cdca6c1b66560aaa105a2bb93ac5fb6c61f Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 7 May 2024 13:44:40 +0200
Subject: [PATCH 12/28] push some functionality to the core

---
 llm-lora-finetuning/.dockerignore             |   2 +-
 .../{scripts => functions}/__init__.py        |   0
 .../{scripts => functions}/finetune.py        | 192 +-----------------
 llm-lora-finetuning/pipelines/train.py        |   2 -
 llm-lora-finetuning/steps/evaluate_model.py   |   4 +-
 llm-lora-finetuning/steps/finetune.py         | 107 +++-------
 llm-lora-finetuning/steps/prepare_datasets.py |   4 +-
 llm-lora-finetuning/steps/promote.py          |   4 +-
 llm-lora-finetuning/utils/cuda.py             |  30 ---
 llm-lora-finetuning/utils/hashing.py          |  32 ---
 10 files changed, 49 insertions(+), 328 deletions(-)
 rename llm-lora-finetuning/{scripts => functions}/__init__.py (100%)
 rename llm-lora-finetuning/{scripts => functions}/finetune.py (57%)
 delete mode 100644 llm-lora-finetuning/utils/cuda.py
 delete mode 100644 llm-lora-finetuning/utils/hashing.py

diff --git a/llm-lora-finetuning/.dockerignore b/llm-lora-finetuning/.dockerignore
index 41da4b80..9b549922 100644
--- a/llm-lora-finetuning/.dockerignore
+++ b/llm-lora-finetuning/.dockerignore
@@ -3,4 +3,4 @@
 !/pipelines/**
 !/steps/**
 !/utils/**
-!/scripts/**
+!/functions/**
diff --git a/llm-lora-finetuning/scripts/__init__.py b/llm-lora-finetuning/functions/__init__.py
similarity index 100%
rename from llm-lora-finetuning/scripts/__init__.py
rename to llm-lora-finetuning/functions/__init__.py
diff --git a/llm-lora-finetuning/scripts/finetune.py b/llm-lora-finetuning/functions/finetune.py
similarity index 57%
rename from llm-lora-finetuning/scripts/finetune.py
rename to llm-lora-finetuning/functions/finetune.py
index cdd5b8c2..3770a13a 100644
--- a/llm-lora-finetuning/scripts/finetune.py
+++ b/llm-lora-finetuning/functions/finetune.py
@@ -16,9 +16,8 @@
 #
 
 from pathlib import Path
-from typing import List
+from typing import Tuple
 
-import click
 import transformers
 from datasets import load_from_disk
 from zenml.logger import get_logger
@@ -26,118 +25,7 @@
 logger = get_logger(__name__)
 
 
-@click.command(
-    help="Technical wrapper to pass into the `accelerate launch` command."
-)
-@click.option(
-    "--base-model-id",
-    type=str,
-    help="The base model id to use.",
-)
-@click.option(
-    "--dataset-dir",
-    type=str,
-    help="The path to the dataset directory.",
-)
-@click.option(
-    "--max-steps",
-    type=int,
-    default=100,
-    help="The maximum number of steps to train for.",
-)
-@click.option(
-    "--logging-steps",
-    type=int,
-    default=50,
-    help="The number of steps to log at.",
-)
-@click.option(
-    "--eval-steps",
-    type=int,
-    default=50,
-    help="The number of steps to log at.",
-)
-@click.option(
-    "--save-steps",
-    type=int,
-    default=50,
-    help="The number of steps to log at.",
-)
-@click.option(
-    "--optimizer",
-    type=str,
-    default="paged_adamw_8bit",
-    help="The optimizer to use.",
-)
-@click.option(
-    "--lr",
-    type=float,
-    default=2.5e-5,
-    help="The learning rate to use.",
-)
-@click.option(
-    "--per-device-train-batch-size",
-    type=int,
-    default=2,
-    help="The batch size to use for training.",
-)
-@click.option(
-    "--gradient-accumulation-steps",
-    type=int,
-    default=4,
-    help="The number of gradient accumulation steps.",
-)
-@click.option(
-    "--warmup-steps",
-    type=int,
-    default=5,
-    help="The number of warmup steps.",
-)
-@click.option(
-    "--bf16",
-    is_flag=True,
-    default=False,
-    help="Use bf16 for training.",
-)
-@click.option(
-    "--use-accelerate",
-    is_flag=True,
-    default=False,
-    help="Use accelerate for training.",
-)
-@click.option(
-    "--label-names",
-    "-l",
-    help="The label names to use.",
-    type=str,
-    required=False,
-    multiple=True,
-)
-@click.option(
-    "--ft-model-dir",
-    type=str,
-    default="",
-    help="The path to the finetuned model directory.",
-)
-@click.option(
-    "--use-fast",
-    is_flag=True,
-    default=False,
-    help="Use the fast tokenizer.",
-)
-@click.option(
-    "--load-in-4bit",
-    is_flag=True,
-    default=False,
-    help="Whether to load the model in 4bit mode",
-)
-@click.option(
-    "--load-in-8bit",
-    is_flag=True,
-    default=False,
-    help="Whether to load the model in 8bit mode",
-)
-def cli_wrapper(
+def finetune_fn(
     base_model_id: str,
     dataset_dir: str,
     max_steps: int = 100,
@@ -149,62 +37,14 @@ def cli_wrapper(
     per_device_train_batch_size: int = 2,
     gradient_accumulation_steps: int = 4,
     warmup_steps: int = 5,
-    bf16: bool = False,
-    use_accelerate: bool = False,
-    label_names: List[str] = None,
-    ft_model_dir: str = "",
-    use_fast: bool = False,
-    load_in_4bit: bool = False,
-    load_in_8bit: bool = False,
-) -> Path:
-    dataset_dir = Path(dataset_dir)
-    if ft_model_dir:
-        ft_model_dir = Path(ft_model_dir)
-    else:
-        ft_model_dir = None
-
-    return accelerated_finetune(
-        base_model_id=base_model_id,
-        dataset_dir=dataset_dir,
-        max_steps=max_steps,
-        logging_steps=logging_steps,
-        eval_steps=eval_steps,
-        save_steps=save_steps,
-        optimizer=optimizer,
-        lr=lr,
-        per_device_train_batch_size=per_device_train_batch_size,
-        gradient_accumulation_steps=gradient_accumulation_steps,
-        warmup_steps=warmup_steps,
-        bf16=bf16,
-        use_accelerate=use_accelerate,
-        label_names=list(label_names),
-        ft_model_dir=ft_model_dir,
-        use_fast=use_fast,
-        load_in_4bit=load_in_4bit,
-        load_in_8bit=load_in_8bit,
-    )
-
-
-def accelerated_finetune(
-    base_model_id: str,
-    dataset_dir: Path,
-    max_steps: int = 100,
-    logging_steps: int = 50,
-    eval_steps: int = 50,
-    save_steps: int = 50,
-    optimizer: str = "paged_adamw_8bit",
-    lr: float = 2.5e-5,
-    per_device_train_batch_size: int = 2,
-    gradient_accumulation_steps: int = 4,
-    warmup_steps: int = 5,
     bf16: bool = True,
     use_accelerate: bool = False,
-    label_names: List[str] = None,
-    ft_model_dir: Path = None,
+    label_names: Tuple[str] = None,
+    ft_model_dir: str = "model_dir",
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
-) -> Path:
+) -> None:
     """Finetune the model using PEFT.
 
     It can be run with accelerate or without.
@@ -228,21 +68,15 @@ def accelerated_finetune(
         use_fast: Whether to use fast tokenizers.
         load_in_4bit: Whether to load the model in 4bit mode.
         load_in_8bit: Whether to load the model in 8bit mode.
-
-    Returns:
-        The path to the finetuned model directory.
     """
-    import sys
-
-    # hack to make internal modules visible in the script
-    sys.path.append("..")
-    sys.path.append(".")
-
     from accelerate import Accelerator
     from utils.callbacks import ZenMLCallback
     from utils.loaders import load_base_model
     from utils.tokenizer import load_tokenizer
 
+    if label_names:
+        label_names = list(label_names)
+
     if use_accelerate:
         accelerator = Accelerator()
         should_print = accelerator.is_main_process
@@ -258,6 +92,7 @@ def accelerated_finetune(
     if should_print:
         logger.info("Loading datasets...")
     tokenizer = load_tokenizer(base_model_id, use_fast=use_fast)
+    dataset_dir = Path(dataset_dir)
     tokenized_train_dataset = load_from_disk(dataset_dir / "train")
     tokenized_val_dataset = load_from_disk(dataset_dir / "val")
 
@@ -320,8 +155,7 @@ def accelerated_finetune(
     if should_print:
         logger.info("Saving model...")
 
-    if ft_model_dir is None:
-        ft_model_dir = Path("model_dir")
+    ft_model_dir = Path(ft_model_dir)
     if not use_accelerate or accelerator.is_main_process:
         ft_model_dir.mkdir(parents=True, exist_ok=True)
     if not use_accelerate:
@@ -334,9 +168,3 @@ def accelerated_finetune(
             is_main_process=accelerator.is_main_process,
             save_function=accelerator.save,
         )
-
-    return ft_model_dir
-
-
-if __name__ == "__main__":
-    cli_wrapper()
diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index e8a2c790..73054202 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -17,7 +17,6 @@
 
 
 from steps import evaluate_model, finetune, prepare_data, promote
-from utils.hashing import compute_md5
 from zenml import logging as zenml_logging
 from zenml import pipeline
 
@@ -61,7 +60,6 @@ def llm_peft_full_finetune(
     ft_model_dir = finetune(
         base_model_id,
         datasets_dir,
-        finetune_script_sha=compute_md5("scripts/finetune.py"),
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 4d3240f0..f5047394 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -21,7 +21,7 @@
 import evaluate
 import torch
 from datasets import load_from_disk
-from utils.cuda import cleanup_memory
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 from utils.loaders import (
     load_base_model,
     load_pretrained_model,
@@ -55,7 +55,7 @@ def evaluate_model(
         load_in_4bit: Whether to load the model in 4bit mode.
         load_in_8bit: Whether to load the model in 8bit mode.
     """
-    cleanup_memory()
+    cleanup_gpu_memory(force=True)
     logger.info("Evaluating model...")
 
     logger.info("Loading dataset...")
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 027be10a..b38712a5 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -15,31 +15,30 @@
 # limitations under the License.
 #
 
-import subprocess
 from pathlib import Path
 
-import torch
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
-from utils.cuda import cleanup_memory
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 from zenml import logging as zenml_logging
 from zenml import step
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
+from zenml.integrations.accelerate.utils.accelerate_runner import run_with_accelerate
 
-from scripts.finetune import accelerated_finetune
+from functions.finetune import finetune_fn
 
 logger = get_logger(__name__)
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
     10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
 )
 
+cache_invalidator = hash(finetune_fn.__code__)
 
 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
 def finetune(
     base_model_id: str,
     dataset_dir: Path,
-    finetune_script_sha: str,
     max_steps: int = 1000,
     logging_steps: int = 50,
     eval_steps: int = 50,
@@ -54,6 +53,7 @@ def finetune(
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
+    cache_invalidator: int = cache_invalidator,
 ) -> Annotated[Path, "ft_model_dir"]:
     """Finetune the model using PEFT.
 
@@ -65,7 +65,6 @@ def finetune(
     Args:
         base_model_id: The base model id to use.
         dataset_dir: The path to the dataset directory.
-        finetune_script_sha: The sha of the finetune script.
         max_steps: The maximum number of steps to train for.
         logging_steps: The number of steps to log at.
         eval_steps: The number of steps to evaluate at.
@@ -84,77 +83,35 @@ def finetune(
     Returns:
         The path to the finetuned model directory.
     """
-    cleanup_memory()
+    cleanup_gpu_memory(force=True)
+
+    ft_model_dir = "model_dir"
+    common_kwargs = dict(
+        base_model_id=base_model_id,
+        dataset_dir=dataset_dir.as_posix(),
+        max_steps=max_steps,
+        logging_steps=logging_steps,
+        eval_steps=eval_steps,
+        save_steps=save_steps,
+        optimizer=optimizer,
+        lr=lr,
+        per_device_train_batch_size=per_device_train_batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        warmup_steps=warmup_steps,
+        bf16=bf16,
+        use_fast=use_fast,
+        load_in_4bit=load_in_4bit,
+        load_in_8bit=load_in_8bit,
+        use_accelerate=use_accelerate,
+        ft_model_dir=ft_model_dir,
+    )
     if not use_accelerate:
-        return accelerated_finetune(
-            base_model_id=base_model_id,
-            dataset_dir=dataset_dir,
-            max_steps=max_steps,
-            logging_steps=logging_steps,
-            eval_steps=eval_steps,
-            save_steps=save_steps,
-            optimizer=optimizer,
-            lr=lr,
-            per_device_train_batch_size=per_device_train_batch_size,
-            gradient_accumulation_steps=gradient_accumulation_steps,
-            warmup_steps=warmup_steps,
-            bf16=bf16,
-            use_accelerate=False,
-            use_fast=use_fast,
-            load_in_4bit=load_in_4bit,
-            load_in_8bit=load_in_8bit,
+        finetune_fn(
+            **common_kwargs,
         )
-
     else:
-        logger.info("Starting accelerate training job...")
-        ft_model_dir = "model_dir"
-        command = (
-            f"accelerate launch --num_processes {torch.cuda.device_count()} "
-        )
-        command += str(Path("scripts/finetune.py").absolute()) + " "
-        command += f'--base-model-id "{base_model_id}" '
-        command += f'--dataset-dir "{dataset_dir}" '
-        command += f"--max-steps {max_steps} "
-        command += f"--logging-steps {logging_steps} "
-        command += f"--eval-steps {eval_steps} "
-        command += f"--save-steps {save_steps} "
-        command += f"--optimizer {optimizer} "
-        command += f"--lr {lr} "
-        command += (
-            f"--per-device-train-batch-size {per_device_train_batch_size} "
+        run_with_accelerate(
+            function=finetune_fn, label_names=["input_ids"], **common_kwargs
         )
-        command += (
-            f"--gradient-accumulation-steps {gradient_accumulation_steps} "
-        )
-        command += f"--warmup-steps {warmup_steps} "
-        if bf16:
-            command += f"--bf16 "
-        if use_accelerate:
-            command += f"--use-accelerate "
-            command += f"-l input_ids "
-            command += f'--ft-model-dir "{ft_model_dir}" '
-        if use_fast:
-            command += f"--use-fast "
-        if load_in_4bit:
-            command += f"--load-in-4bit "
-        if load_in_8bit:
-            command += f"--load-in-8bit "
-
-        print(command)
 
-        result = subprocess.run(
-            command,
-            shell=True,
-            stdout=subprocess.PIPE,
-            universal_newlines=True,
-        )
-        for stdout_line in result.stdout.split("\n"):
-            print(stdout_line)
-        if result.returncode == 0:
-            logger.info("Accelerate training job finished.")
-            return Path(ft_model_dir)
-        else:
-            logger.error(
-                f"Accelerate training job failed. With return code {result.returncode}."
-            )
-            raise subprocess.CalledProcessError(result.returncode, command)
+    return Path(ft_model_dir)
diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index 9c0aed23..92b00379 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -20,7 +20,7 @@
 
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
-from utils.cuda import cleanup_memory
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 from utils.tokenizer import generate_and_tokenize_prompt, load_tokenizer
 from zenml import log_model_metadata, step
 from zenml.materializers import BuiltInMaterializer
@@ -46,7 +46,7 @@ def prepare_data(
     """
     from datasets import load_dataset
 
-    cleanup_memory()
+    cleanup_gpu_memory(force=True)
 
     log_model_metadata(
         {
diff --git a/llm-lora-finetuning/steps/promote.py b/llm-lora-finetuning/steps/promote.py
index 0772d4d3..42380178 100644
--- a/llm-lora-finetuning/steps/promote.py
+++ b/llm-lora-finetuning/steps/promote.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 
-from utils.cuda import cleanup_memory
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 from zenml import get_step_context, step
 from zenml.client import Client
 from zenml.logger import get_logger
@@ -37,7 +37,7 @@ def promote(
         metric: The metric to use for promotion.
         target_stage: The target stage to promote to.
     """
-    cleanup_memory()
+    cleanup_gpu_memory(force=True)
     context_model = get_step_context().model
     base_metrics = context_model.load_artifact("base_model_rouge_metrics")
     ft_metrics = context_model.load_artifact("finetuned_model_rouge_metrics")
diff --git a/llm-lora-finetuning/utils/cuda.py b/llm-lora-finetuning/utils/cuda.py
deleted file mode 100644
index 958b5bac..00000000
--- a/llm-lora-finetuning/utils/cuda.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import gc
-
-import torch
-from zenml.logger import get_logger
-
-logger = get_logger(__name__)
-
-
-def cleanup_memory() -> None:
-    """Clean up GPU memory."""
-    logger.info("Cleaning up GPU memory...")
-    while gc.collect():
-        torch.cuda.empty_cache()
diff --git a/llm-lora-finetuning/utils/hashing.py b/llm-lora-finetuning/utils/hashing.py
deleted file mode 100644
index b1977003..00000000
--- a/llm-lora-finetuning/utils/hashing.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import hashlib
-
-BUF_SIZE = 65536
-
-
-def compute_md5(file_path: str) -> str:
-    md5 = hashlib.md5()
-
-    with open(file_path, "rb") as f:
-        while True:
-            data = f.read(BUF_SIZE)
-            if not data:
-                break
-            md5.update(data)
-    return md5.hexdigest()

From 2bb3ab97b0015409486e88af092856f9fe06f065 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 7 May 2024 13:45:04 +0200
Subject: [PATCH 13/28] format

---
 llm-lora-finetuning/steps/evaluate_model.py   |  2 +-
 llm-lora-finetuning/steps/finetune.py         | 10 ++++++----
 llm-lora-finetuning/steps/prepare_datasets.py |  2 +-
 llm-lora-finetuning/steps/promote.py          |  2 +-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index f5047394..4620fe28 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -21,7 +21,6 @@
 import evaluate
 import torch
 from datasets import load_from_disk
-from zenml.utils.cuda_utils import cleanup_gpu_memory
 from utils.loaders import (
     load_base_model,
     load_pretrained_model,
@@ -29,6 +28,7 @@
 from utils.tokenizer import load_tokenizer, tokenize_for_eval
 from zenml import log_model_metadata, save_artifact, step
 from zenml.logger import get_logger
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 
 logger = get_logger(__name__)
 
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index b38712a5..f2c93672 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -17,16 +17,17 @@
 
 from pathlib import Path
 
+from functions.finetune import finetune_fn
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
-from zenml.utils.cuda_utils import cleanup_gpu_memory
 from zenml import logging as zenml_logging
 from zenml import step
+from zenml.integrations.accelerate.utils.accelerate_runner import (
+    run_with_accelerate,
+)
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
-from zenml.integrations.accelerate.utils.accelerate_runner import run_with_accelerate
-
-from functions.finetune import finetune_fn
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 
 logger = get_logger(__name__)
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
@@ -35,6 +36,7 @@
 
 cache_invalidator = hash(finetune_fn.__code__)
 
+
 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
 def finetune(
     base_model_id: str,
diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index 92b00379..7c055775 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -20,10 +20,10 @@
 
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
-from zenml.utils.cuda_utils import cleanup_gpu_memory
 from utils.tokenizer import generate_and_tokenize_prompt, load_tokenizer
 from zenml import log_model_metadata, step
 from zenml.materializers import BuiltInMaterializer
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 
 
 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
diff --git a/llm-lora-finetuning/steps/promote.py b/llm-lora-finetuning/steps/promote.py
index 42380178..12b305d0 100644
--- a/llm-lora-finetuning/steps/promote.py
+++ b/llm-lora-finetuning/steps/promote.py
@@ -15,10 +15,10 @@
 # limitations under the License.
 #
 
-from zenml.utils.cuda_utils import cleanup_gpu_memory
 from zenml import get_step_context, step
 from zenml.client import Client
 from zenml.logger import get_logger
+from zenml.utils.cuda_utils import cleanup_gpu_memory
 
 logger = get_logger(__name__)
 

From 261e2ceb75aa4af7ce1f419eae863e860bb0785b Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 7 May 2024 13:51:11 +0200
Subject: [PATCH 14/28] update README

---
 llm-lora-finetuning/README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
index e41cc056..9fdf745f 100644
--- a/llm-lora-finetuning/README.md
+++ b/llm-lora-finetuning/README.md
@@ -34,6 +34,11 @@ pip install -r requirements.txt
 
 ### 👷 Combined feature engineering and finetuning pipeline
 
+> [!WARNING]  
+> All steps of this pipeline have a `cleanup_gpu_memory(force=True)` at the beginning. This is used to ensure that the memory is properly cleared after previous steps.
+>
+> This functionality might affect other GPU processes running in the same environment, so if you don't want to clean the GPU memory between the steps, you can delete those utility calls from all steps.
+
 The easiest way to get started with just a single command is to run the finetuning pipeline with the `mistral_default_finetune.yaml` configuration file, which will do data preparation, model finetuning, evaluation with [Rouge](https://huggingface.co/spaces/evaluate-metric/rouge) and promotion:
 
 ```shell
@@ -52,13 +57,13 @@ When running the pipeline like this, the trained model will be stored in the Zen
 
 ### ⚡ Accelerate your finetuning
 
-Do you want to benefit from multiple GPUs training with Distributed Data Parallelism (DDP)? Then you can use other configuration files prepared for this task.
-For example, `phi_accelerated_local_finetune.yaml` can run finetuning of the [Microsoft Phi 2](https://huggingface.co/microsoft/phi-2) powered by [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/en/index) on all GPUs available in the environment. To do so, just call:
+Do you want to benefit from multi-GPU training with Distributed Data Parallelism (DDP)? Then you can use other configuration files prepared for this purpose.
+For example, `phi_accelerated_local_finetune.yaml` can run a finetuning of the [Microsoft Phi 2](https://huggingface.co/microsoft/phi-2) powered by [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/en/index) on all GPUs available in the environment. To do so, just call:
 
 ```shell
-python run.py --config phi_accelerated_local_finetune.yaml # if you architecture doesn't support BF16
+python run.py --config phi_accelerated_local_finetune.yaml # if your architecture doesn't support BF16
 # OR
-python run.py --config phi_accelerated_local_bf16_finetune.yaml # if you architecture support BF16
+python run.py --config phi_accelerated_local_bf16_finetune.yaml # if your architecture supports BF16
 ```
 
 Under the hood, the finetuning step will spin up the accelerated job using the finetuning script CLI wrapper (`scripts/finetune.py`), which will run on all available GPUs.

From c22092b917de82f8a987fa5c2ff82fb95c8530e4 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 7 May 2024 13:55:51 +0200
Subject: [PATCH 15/28] update README

---
 llm-lora-finetuning/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
index 9fdf745f..85635101 100644
--- a/llm-lora-finetuning/README.md
+++ b/llm-lora-finetuning/README.md
@@ -101,8 +101,8 @@ The project loosely follows [the recommended ZenML project structure](https://do
 │   └── directory_materializer.py                 # custom materializer to push whole directories to the artifact store and back
 ├── pipelines                                     # `zenml.pipeline` implementations
 │   └── train.py                                  # Finetuning and evaluation pipeline
-├── scripts                                       # scripts used in the finetuning
-│   └── finetune.py                               # Finetuning function and a CLI wrapper for `accelerate run ...`
+├── functions                                     # functions used in the finetuning
+│   └── finetune.py                               # Finetuning function
 ├── steps                                         # logically grouped `zenml.steps` implementations
 │   ├── evaluate_model.py                         # evaluate base and finetuned models using Rouge metrics
 │   ├── finetune.py                               # finetune the base model

From b51a1111f4d26c970c58562dde30d1bfdee26f62 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 14 May 2024 08:57:59 +0200
Subject: [PATCH 16/28] use `AccelerateScaler`

---
 llm-lora-finetuning/.dockerignore         |   1 -
 llm-lora-finetuning/README.md             |   2 -
 llm-lora-finetuning/functions/__init__.py |  16 --
 llm-lora-finetuning/functions/finetune.py | 170 ----------------------
 llm-lora-finetuning/steps/finetune.py     | 126 ++++++++++++----
 5 files changed, 96 insertions(+), 219 deletions(-)
 delete mode 100644 llm-lora-finetuning/functions/__init__.py
 delete mode 100644 llm-lora-finetuning/functions/finetune.py

diff --git a/llm-lora-finetuning/.dockerignore b/llm-lora-finetuning/.dockerignore
index 9b549922..c43c482f 100644
--- a/llm-lora-finetuning/.dockerignore
+++ b/llm-lora-finetuning/.dockerignore
@@ -3,4 +3,3 @@
 !/pipelines/**
 !/steps/**
 !/utils/**
-!/functions/**
diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
index 85635101..df8d3e40 100644
--- a/llm-lora-finetuning/README.md
+++ b/llm-lora-finetuning/README.md
@@ -101,8 +101,6 @@ The project loosely follows [the recommended ZenML project structure](https://do
 │   └── directory_materializer.py                 # custom materializer to push whole directories to the artifact store and back
 ├── pipelines                                     # `zenml.pipeline` implementations
 │   └── train.py                                  # Finetuning and evaluation pipeline
-├── functions                                     # functions used in the finetuning
-│   └── finetune.py                               # Finetuning function
 ├── steps                                         # logically grouped `zenml.steps` implementations
 │   ├── evaluate_model.py                         # evaluate base and finetuned models using Rouge metrics
 │   ├── finetune.py                               # finetune the base model
diff --git a/llm-lora-finetuning/functions/__init__.py b/llm-lora-finetuning/functions/__init__.py
deleted file mode 100644
index 757bd841..00000000
--- a/llm-lora-finetuning/functions/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
diff --git a/llm-lora-finetuning/functions/finetune.py b/llm-lora-finetuning/functions/finetune.py
deleted file mode 100644
index 3770a13a..00000000
--- a/llm-lora-finetuning/functions/finetune.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from pathlib import Path
-from typing import Tuple
-
-import transformers
-from datasets import load_from_disk
-from zenml.logger import get_logger
-
-logger = get_logger(__name__)
-
-
-def finetune_fn(
-    base_model_id: str,
-    dataset_dir: str,
-    max_steps: int = 100,
-    logging_steps: int = 50,
-    eval_steps: int = 50,
-    save_steps: int = 50,
-    optimizer: str = "paged_adamw_8bit",
-    lr: float = 2.5e-5,
-    per_device_train_batch_size: int = 2,
-    gradient_accumulation_steps: int = 4,
-    warmup_steps: int = 5,
-    bf16: bool = True,
-    use_accelerate: bool = False,
-    label_names: Tuple[str] = None,
-    ft_model_dir: str = "model_dir",
-    use_fast: bool = True,
-    load_in_4bit: bool = False,
-    load_in_8bit: bool = False,
-) -> None:
-    """Finetune the model using PEFT.
-
-    It can be run with accelerate or without.
-
-    Args:
-        base_model_id: The base model id to use.
-        dataset_dir: The path to the dataset directory.
-        max_steps: The maximum number of steps to train for.
-        logging_steps: The number of steps to log at.
-        eval_steps: The number of steps to evaluate at.
-        save_steps: The number of steps to save at.
-        optimizer: The optimizer to use.
-        lr: The learning rate to use.
-        per_device_train_batch_size: The batch size to use for training.
-        gradient_accumulation_steps: The number of gradient accumulation steps.
-        warmup_steps: The number of warmup steps.
-        bf16: Whether to use bf16.
-        use_accelerate: Whether to use accelerate.
-        label_names: The label names to use.
-        ft_model_dir: The path to the finetuned model directory.
-        use_fast: Whether to use fast tokenizers.
-        load_in_4bit: Whether to load the model in 4bit mode.
-        load_in_8bit: Whether to load the model in 8bit mode.
-    """
-    from accelerate import Accelerator
-    from utils.callbacks import ZenMLCallback
-    from utils.loaders import load_base_model
-    from utils.tokenizer import load_tokenizer
-
-    if label_names:
-        label_names = list(label_names)
-
-    if use_accelerate:
-        accelerator = Accelerator()
-        should_print = accelerator.is_main_process
-    else:
-        accelerator = None
-        should_print = True
-
-    project = "zenml-finetune"
-    base_model_name = "mistral"
-    run_name = base_model_name + "-" + project
-    output_dir = "./" + run_name
-
-    if should_print:
-        logger.info("Loading datasets...")
-    tokenizer = load_tokenizer(base_model_id, use_fast=use_fast)
-    dataset_dir = Path(dataset_dir)
-    tokenized_train_dataset = load_from_disk(dataset_dir / "train")
-    tokenized_val_dataset = load_from_disk(dataset_dir / "val")
-
-    if should_print:
-        logger.info("Loading base model...")
-
-    model = load_base_model(
-        base_model_id,
-        use_accelerate=use_accelerate,
-        should_print=should_print,
-        load_in_4bit=load_in_4bit,
-        load_in_8bit=load_in_8bit,
-    )
-
-    trainer = transformers.Trainer(
-        model=model,
-        train_dataset=tokenized_train_dataset,
-        eval_dataset=tokenized_val_dataset,
-        args=transformers.TrainingArguments(
-            output_dir=output_dir,
-            warmup_steps=warmup_steps,
-            per_device_train_batch_size=per_device_train_batch_size,
-            gradient_checkpointing=(not use_accelerate),
-            gradient_accumulation_steps=gradient_accumulation_steps,
-            max_steps=max_steps,
-            learning_rate=lr,
-            logging_steps=(
-                min(logging_steps, max_steps)
-                if max_steps >= 0
-                else logging_steps
-            ),
-            bf16=bf16,
-            optim=optimizer,
-            logging_dir="./logs",
-            save_strategy="steps",
-            save_steps=min(save_steps, max_steps)
-            if max_steps >= 0
-            else save_steps,
-            evaluation_strategy="steps",
-            eval_steps=min(eval_steps, max_steps)
-            if max_steps >= 0
-            else eval_steps,
-            do_eval=True,
-            label_names=label_names,
-        ),
-        data_collator=transformers.DataCollatorForLanguageModeling(
-            tokenizer, mlm=False
-        ),
-        callbacks=[ZenMLCallback(accelerator=accelerator)],
-    )
-    if not use_accelerate:
-        model.config.use_cache = (
-            False  # silence the warnings. Please re-enable for inference!
-        )
-
-    if should_print:
-        logger.info("Training model...")
-    trainer.train()
-
-    if should_print:
-        logger.info("Saving model...")
-
-    ft_model_dir = Path(ft_model_dir)
-    if not use_accelerate or accelerator.is_main_process:
-        ft_model_dir.mkdir(parents=True, exist_ok=True)
-    if not use_accelerate:
-        model.config.use_cache = True
-        trainer.save_model(ft_model_dir)
-    else:
-        unwrapped_model = accelerator.unwrap_model(model)
-        unwrapped_model.save_pretrained(
-            ft_model_dir,
-            is_main_process=accelerator.is_main_process,
-            save_function=accelerator.save,
-        )
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index f2c93672..abbb7c58 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -17,14 +17,17 @@
 
 from pathlib import Path
 
-from functions.finetune import finetune_fn
+import transformers
+from accelerate import Accelerator
+from datasets import load_from_disk
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
+from utils.callbacks import ZenMLCallback
+from utils.loaders import load_base_model
+from utils.tokenizer import load_tokenizer
 from zenml import logging as zenml_logging
 from zenml import step
-from zenml.integrations.accelerate.utils.accelerate_runner import (
-    run_with_accelerate,
-)
+from zenml.integrations.accelerate import AccelerateScaler
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 from zenml.utils.cuda_utils import cleanup_gpu_memory
@@ -34,10 +37,11 @@
     10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
 )
 
-cache_invalidator = hash(finetune_fn.__code__)
-
 
-@step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
+@step(
+    output_materializers=[DirectoryMaterializer, BuiltInMaterializer],
+    scaler=AccelerateScaler(),
+)
 def finetune(
     base_model_id: str,
     dataset_dir: Path,
@@ -55,7 +59,6 @@ def finetune(
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
-    cache_invalidator: int = cache_invalidator,
 ) -> Annotated[Path, "ft_model_dir"]:
     """Finetune the model using PEFT.
 
@@ -87,33 +90,96 @@ def finetune(
     """
     cleanup_gpu_memory(force=True)
 
-    ft_model_dir = "model_dir"
-    common_kwargs = dict(
-        base_model_id=base_model_id,
-        dataset_dir=dataset_dir.as_posix(),
-        max_steps=max_steps,
-        logging_steps=logging_steps,
-        eval_steps=eval_steps,
-        save_steps=save_steps,
-        optimizer=optimizer,
-        lr=lr,
-        per_device_train_batch_size=per_device_train_batch_size,
-        gradient_accumulation_steps=gradient_accumulation_steps,
-        warmup_steps=warmup_steps,
-        bf16=bf16,
-        use_fast=use_fast,
+    ft_model_dir = Path("model_dir")
+    dataset_dir = Path(dataset_dir)
+
+    if use_accelerate:
+        accelerator = Accelerator()
+        should_print = accelerator.is_main_process
+    else:
+        accelerator = None
+        should_print = True
+
+    project = "zenml-finetune"
+    base_model_name = "mistral"
+    run_name = base_model_name + "-" + project
+    output_dir = "./" + run_name
+
+    if should_print:
+        logger.info("Loading datasets...")
+    tokenizer = load_tokenizer(base_model_id, use_fast=use_fast)
+    tokenized_train_dataset = load_from_disk(dataset_dir / "train")
+    tokenized_val_dataset = load_from_disk(dataset_dir / "val")
+
+    if should_print:
+        logger.info("Loading base model...")
+
+    model = load_base_model(
+        base_model_id,
+        use_accelerate=use_accelerate,
+        should_print=should_print,
         load_in_4bit=load_in_4bit,
         load_in_8bit=load_in_8bit,
-        use_accelerate=use_accelerate,
-        ft_model_dir=ft_model_dir,
+    )
+
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=tokenized_train_dataset,
+        eval_dataset=tokenized_val_dataset,
+        args=transformers.TrainingArguments(
+            output_dir=output_dir,
+            warmup_steps=warmup_steps,
+            per_device_train_batch_size=per_device_train_batch_size,
+            gradient_checkpointing=(not use_accelerate),
+            gradient_accumulation_steps=gradient_accumulation_steps,
+            max_steps=max_steps,
+            learning_rate=lr,
+            logging_steps=(
+                min(logging_steps, max_steps)
+                if max_steps >= 0
+                else logging_steps
+            ),
+            bf16=bf16,
+            optim=optimizer,
+            logging_dir="./logs",
+            save_strategy="steps",
+            save_steps=min(save_steps, max_steps)
+            if max_steps >= 0
+            else save_steps,
+            evaluation_strategy="steps",
+            eval_steps=eval_steps,
+            do_eval=True,
+            label_names=["input_ids"],
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(
+            tokenizer, mlm=False
+        ),
+        callbacks=[ZenMLCallback(accelerator=accelerator)],
     )
     if not use_accelerate:
-        finetune_fn(
-            **common_kwargs,
+        model.config.use_cache = (
+            False  # silence the warnings. Please re-enable for inference!
         )
+
+    if should_print:
+        logger.info("Training model...")
+    trainer.train()
+
+    if should_print:
+        logger.info("Saving model...")
+
+    ft_model_dir = Path(ft_model_dir)
+    if not use_accelerate or accelerator.is_main_process:
+        ft_model_dir.mkdir(parents=True, exist_ok=True)
+    if not use_accelerate:
+        model.config.use_cache = True
+        trainer.save_model(ft_model_dir)
     else:
-        run_with_accelerate(
-            function=finetune_fn, label_names=["input_ids"], **common_kwargs
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(
+            ft_model_dir,
+            is_main_process=accelerator.is_main_process,
+            save_function=accelerator.save,
         )
 
-    return Path(ft_model_dir)
+    return ft_model_dir

From f3943b29ac75a96ff37c8f6e92d7a86c85ba1ad6 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 15 May 2024 17:16:46 +0200
Subject: [PATCH 17/28] pass bit config around

---
 llm-lora-finetuning/steps/evaluate_model.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 4620fe28..ab81f764 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -83,7 +83,11 @@ def evaluate_model(
         )
     else:
         logger.info("Generating using finetuned model...")
-        model = load_pretrained_model(ft_model_dir)
+        model = load_pretrained_model(
+            ft_model_dir,
+            load_in_4bit=load_in_4bit,
+            load_in_8bit=load_in_8bit,
+        )
 
     model.eval()
     with torch.no_grad():

From 555997e0f0813ab89e0474399a716bdb007a80e1 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 4 Jun 2024 12:46:11 +0200
Subject: [PATCH 18/28] functional way

---
 llm-lora-finetuning/pipelines/train.py | 32 ++++++++++++++++----------
 llm-lora-finetuning/steps/finetune.py  | 14 +++--------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index 73054202..b5104533 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -17,12 +17,8 @@
 
 
 from steps import evaluate_model, finetune, prepare_data, promote
-from zenml import logging as zenml_logging
 from zenml import pipeline
-
-zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
-    10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
-)
+from zenml.integrations.huggingface.steps import run_with_accelerate
 
 
 @pipeline
@@ -32,6 +28,7 @@ def llm_peft_full_finetune(
     use_fast: bool = True,
     load_in_8bit: bool = False,
     load_in_4bit: bool = False,
+    use_accelerate: bool = False,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -57,13 +54,24 @@ def llm_peft_full_finetune(
         system_prompt=system_prompt,
         use_fast=use_fast,
     )
-    ft_model_dir = finetune(
-        base_model_id,
-        datasets_dir,
-        use_fast=use_fast,
-        load_in_8bit=load_in_8bit,
-        load_in_4bit=load_in_4bit,
-    )
+    if not use_accelerate:
+        ft_model_dir = finetune(
+            base_model_id,
+            datasets_dir,
+            use_fast=use_fast,
+            load_in_8bit=load_in_8bit,
+            load_in_4bit=load_in_4bit,
+        )
+    else:
+        ft_model_dir = run_with_accelerate(
+            finetune,
+            base_model_id=base_model_id,
+            datasets_dir=datasets_dir,
+            use_fast=use_fast,
+            load_in_8bit=load_in_8bit,
+            load_in_4bit=load_in_4bit,
+            id="finetune_accelerated",
+        )
     evaluate_model(
         base_model_id,
         system_prompt,
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index abbb7c58..31e86c44 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -27,7 +27,6 @@
 from utils.tokenizer import load_tokenizer
 from zenml import logging as zenml_logging
 from zenml import step
-from zenml.integrations.accelerate import AccelerateScaler
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 from zenml.utils.cuda_utils import cleanup_gpu_memory
@@ -38,10 +37,7 @@
 )
 
 
-@step(
-    output_materializers=[DirectoryMaterializer, BuiltInMaterializer],
-    scaler=AccelerateScaler(),
-)
+@step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
 def finetune(
     base_model_id: str,
     dataset_dir: Path,
@@ -135,17 +131,13 @@ def finetune(
             max_steps=max_steps,
             learning_rate=lr,
             logging_steps=(
-                min(logging_steps, max_steps)
-                if max_steps >= 0
-                else logging_steps
+                min(logging_steps, max_steps) if max_steps >= 0 else logging_steps
             ),
             bf16=bf16,
             optim=optimizer,
             logging_dir="./logs",
             save_strategy="steps",
-            save_steps=min(save_steps, max_steps)
-            if max_steps >= 0
-            else save_steps,
+            save_steps=min(save_steps, max_steps) if max_steps >= 0 else save_steps,
             evaluation_strategy="steps",
             eval_steps=eval_steps,
             do_eval=True,

From 65b5e114d5f7a856e29b904aed6ad0f505d7ff14 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 4 Jun 2024 12:55:57 +0200
Subject: [PATCH 19/28] remove configs

---
 .../configs/mistral_default_finetune.yaml     | 75 -------------------
 .../configs/mistral_remote_finetune.yaml      | 69 -----------------
 .../configs/orchestrator_finetune.yaml        | 75 -------------------
 .../phi_accelerated_local_bf16_finetune.yaml  | 57 --------------
 .../phi_accelerated_local_finetune.yaml       | 57 --------------
 .../phi_accelerated_remote_finetune.yaml      | 67 -----------------
 .../configs/phi_local_bf16_finetune.yaml      | 56 --------------
 .../configs/phi_local_finetune.yaml           | 56 --------------
 8 files changed, 512 deletions(-)
 delete mode 100644 llm-lora-finetuning/configs/mistral_default_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/mistral_remote_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/orchestrator_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/phi_accelerated_local_bf16_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/phi_accelerated_local_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml
 delete mode 100644 llm-lora-finetuning/configs/phi_local_finetune.yaml

diff --git a/llm-lora-finetuning/configs/mistral_default_finetune.yaml b/llm-lora-finetuning/configs/mistral_default_finetune.yaml
deleted file mode 100644
index 92d8b0d2..00000000
--- a/llm-lora-finetuning/configs/mistral_default_finetune.yaml
+++ /dev/null
@@ -1,75 +0,0 @@
-# Apache Software License 2.0
-# 
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# 
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 300_steps
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-    python_package_installer: uv
-    environment:
-      PJRT_DEVICE: CUDA
-      USE_TORCH_XLA: "false"
-      MKL_SERVICE_FORCE_INTEL: "1"
-
-parameters:
-  base_model_id: microsoft/phi-2
-  use_fast: False
-  load_in_4bit: True
-  system_prompt: |
-<<<<<<<< HEAD:llm-lora-finetuning/configs/mistral_default_finetune.yaml
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  load_in_8bit: True
-  
-========
-      Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-      This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-      The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-      
-
->>>>>>>> main:llm-lora-finetuning/configs/orchestrator_finetune.yaml
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    enable_step_logs: False
-    parameters:
-      max_steps: 300
-<<<<<<<< HEAD:llm-lora-finetuning/configs/mistral_default_finetune.yaml
-      eval_steps: 100
-      bf16: False
-========
-      eval_steps: 30
-      bf16: True
->>>>>>>> main:llm-lora-finetuning/configs/orchestrator_finetune.yaml
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/mistral_remote_finetune.yaml b/llm-lora-finetuning/configs/mistral_remote_finetune.yaml
deleted file mode 100644
index 6fda3072..00000000
--- a/llm-lora-finetuning/configs/mistral_remote_finetune.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-# Apache Software License 2.0
-# 
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# 
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 300_steps
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-    python_package_installer: uv
-    environment:
-      PJRT_DEVICE: CUDA
-      USE_TORCH_XLA: "false"
-      MKL_SERVICE_FORCE_INTEL: "1"
-
-parameters:
-  base_model_id: microsoft/phi-2
-  use_fast: False
-  load_in_4bit: True
-  system_prompt: |
-      Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-      This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-      The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-      
-
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    enable_step_logs: False
-    step_operator: gcp_a100
-    parameters:
-      max_steps: 300
-      eval_steps: 30
-      bf16: True
-
-  evaluate_finetuned:
-    step_operator: gcp_a100
-
-  evaluate_base:
-    step_operator: gcp_a100
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/orchestrator_finetune.yaml b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
deleted file mode 100644
index 92d8b0d2..00000000
--- a/llm-lora-finetuning/configs/orchestrator_finetune.yaml
+++ /dev/null
@@ -1,75 +0,0 @@
-# Apache Software License 2.0
-# 
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# 
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 300_steps
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-    python_package_installer: uv
-    environment:
-      PJRT_DEVICE: CUDA
-      USE_TORCH_XLA: "false"
-      MKL_SERVICE_FORCE_INTEL: "1"
-
-parameters:
-  base_model_id: microsoft/phi-2
-  use_fast: False
-  load_in_4bit: True
-  system_prompt: |
-<<<<<<<< HEAD:llm-lora-finetuning/configs/mistral_default_finetune.yaml
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  load_in_8bit: True
-  
-========
-      Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-      This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-      The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-      
-
->>>>>>>> main:llm-lora-finetuning/configs/orchestrator_finetune.yaml
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    enable_step_logs: False
-    parameters:
-      max_steps: 300
-<<<<<<<< HEAD:llm-lora-finetuning/configs/mistral_default_finetune.yaml
-      eval_steps: 100
-      bf16: False
-========
-      eval_steps: 30
-      bf16: True
->>>>>>>> main:llm-lora-finetuning/configs/orchestrator_finetune.yaml
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/phi_accelerated_local_bf16_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_local_bf16_finetune.yaml
deleted file mode 100644
index c3be19e4..00000000
--- a/llm-lora-finetuning/configs/phi_accelerated_local_bf16_finetune.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 200_steps_accelerate
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-
-parameters:
-  base_model_id: microsoft/phi-2
-  system_prompt: |
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  use_fast: False
-  load_in_4bit: True
-
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    parameters:
-      max_steps: 200
-      eval_steps: 50
-      bf16: True
-      use_accelerate: True
-      
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/phi_accelerated_local_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_local_finetune.yaml
deleted file mode 100644
index b78d2d19..00000000
--- a/llm-lora-finetuning/configs/phi_accelerated_local_finetune.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 25_steps_accelerate
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-
-parameters:
-  base_model_id: microsoft/phi-2
-  system_prompt: |
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  use_fast: False
-  load_in_4bit: True
-
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    parameters:
-      max_steps: 25
-      eval_steps: 25
-      bf16: False
-      use_accelerate: True
-      
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml b/llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml
deleted file mode 100644
index abbbe28f..00000000
--- a/llm-lora-finetuning/configs/phi_accelerated_remote_finetune.yaml
+++ /dev/null
@@ -1,67 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 100_steps_accelerate
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-    environment:
-      PJRT_DEVICE: CUDA
-      USE_TORCH_XLA: "false"
-      MKL_SERVICE_FORCE_INTEL: "1"
-
-parameters:
-  base_model_id: microsoft/phi-2
-  system_prompt: |
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  use_fast: False
-  load_in_4bit: True
-
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    step_operator: gcp_t4x2
-    parameters:
-      max_steps: 100
-      eval_steps: 50
-      bf16: False
-      use_accelerate: True
-
-  evaluate_finetuned:
-    step_operator: gcp_t4x2
-
-  evaluate_base:
-    step_operator: gcp_t4x2
-      
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml b/llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml
deleted file mode 100644
index 982b17e3..00000000
--- a/llm-lora-finetuning/configs/phi_local_bf16_finetune.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 200_steps
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-
-parameters:
-  base_model_id: microsoft/phi-2
-  system_prompt: |
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  use_fast: False
-  load_in_4bit: True
-
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    parameters:
-      max_steps: 200
-      eval_steps: 50
-      bf16: True
-      
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging
diff --git a/llm-lora-finetuning/configs/phi_local_finetune.yaml b/llm-lora-finetuning/configs/phi_local_finetune.yaml
deleted file mode 100644
index d7f87c0e..00000000
--- a/llm-lora-finetuning/configs/phi_local_finetune.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Apache Software License 2.0
-#
-# Copyright (c) ZenML GmbH 2024. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-model:
-  name: llm-peft-microsoft-phi-2
-  description: "Fine-tune `microsoft/phi-2`."
-  tags:
-    - llm
-    - peft
-    - microsoft/phi-2
-  version: 25_steps
-
-settings:
-  docker:
-    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
-    requirements: requirements.txt
-
-parameters:
-  base_model_id: microsoft/phi-2
-  system_prompt: |
-    Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
-    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
-    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
-  use_fast: False
-  load_in_4bit: True
-
-steps:
-  prepare_data:
-    parameters:
-      dataset_name: gem/viggo
-
-  finetune:
-    parameters:
-      max_steps: 25
-      eval_steps: 25
-      bf16: False
-      
-
-  promote:
-    parameters:
-      metric: rouge2
-      target_stage: staging

From d77f50f1b47d83f603df9647d0f9a227103b2030 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 4 Jun 2024 12:57:15 +0200
Subject: [PATCH 20/28] restore configs

---
 .../configs/orchestrator_finetune.yaml        | 62 +++++++++++++++++
 .../configs/remote_finetune.yaml              | 69 +++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 llm-lora-finetuning/configs/orchestrator_finetune.yaml
 create mode 100644 llm-lora-finetuning/configs/remote_finetune.yaml

diff --git a/llm-lora-finetuning/configs/orchestrator_finetune.yaml b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
new file mode 100644
index 00000000..9bd271f6
--- /dev/null
+++ b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
@@ -0,0 +1,62 @@
+# Apache Software License 2.0
+# 
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+model:
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
+  tags:
+    - llm
+    - peft
+    - microsoft/phi-2
+  version: 300_steps
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+    python_package_installer: uv
+    environment:
+      PJRT_DEVICE: CUDA
+      USE_TORCH_XLA: "false"
+      MKL_SERVICE_FORCE_INTEL: "1"
+
+parameters:
+  base_model_id: microsoft/phi-2
+  use_fast: False
+  load_in_4bit: True
+  system_prompt: |
+      Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+      This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+      The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+      
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    enable_step_logs: False
+    parameters:
+      max_steps: 300
+      eval_steps: 30
+      bf16: True
+
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging
diff --git a/llm-lora-finetuning/configs/remote_finetune.yaml b/llm-lora-finetuning/configs/remote_finetune.yaml
new file mode 100644
index 00000000..6fda3072
--- /dev/null
+++ b/llm-lora-finetuning/configs/remote_finetune.yaml
@@ -0,0 +1,69 @@
+# Apache Software License 2.0
+# 
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+model:
+  name: llm-peft-microsoft-phi-2
+  description: "Fine-tune `microsoft/phi-2`."
+  tags:
+    - llm
+    - peft
+    - microsoft/phi-2
+  version: 300_steps
+
+settings:
+  docker:
+    parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+    requirements: requirements.txt
+    python_package_installer: uv
+    environment:
+      PJRT_DEVICE: CUDA
+      USE_TORCH_XLA: "false"
+      MKL_SERVICE_FORCE_INTEL: "1"
+
+parameters:
+  base_model_id: microsoft/phi-2
+  use_fast: False
+  load_in_4bit: True
+  system_prompt: |
+      Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
+      This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
+      The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
+      
+
+steps:
+  prepare_data:
+    parameters:
+      dataset_name: gem/viggo
+
+  finetune:
+    enable_step_logs: False
+    step_operator: gcp_a100
+    parameters:
+      max_steps: 300
+      eval_steps: 30
+      bf16: True
+
+  evaluate_finetuned:
+    step_operator: gcp_a100
+
+  evaluate_base:
+    step_operator: gcp_a100
+
+  promote:
+    parameters:
+      metric: rouge2
+      target_stage: staging

From fd3887d80af861f5098660e52506c33e0bf9a8a2 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Tue, 4 Jun 2024 12:58:11 +0200
Subject: [PATCH 21/28] restore reqs

---
 llm-lora-finetuning/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm-lora-finetuning/requirements.txt b/llm-lora-finetuning/requirements.txt
index 06bed32b..ac6d8625 100644
--- a/llm-lora-finetuning/requirements.txt
+++ b/llm-lora-finetuning/requirements.txt
@@ -1,6 +1,6 @@
 zenml
 torch>=2.2.0
-datasets<=2.18
+datasets
 transformers
 peft
 bitsandbytes>=0.41.3

From 5264011b5da3e2ce7c05ba590cdd3c8e2555b637 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 5 Jun 2024 11:59:47 +0200
Subject: [PATCH 22/28] accelerate as a function from the core

---
 .../configs/orchestrator_finetune.yaml        |  4 +
 .../configs/remote_finetune.yaml              | 16 ++++
 llm-lora-finetuning/pipelines/train.py        | 32 ++------
 .../pipelines/train_accelerated.py            | 82 +++++++++++++++++++
 llm-lora-finetuning/run.py                    | 19 ++++-
 llm-lora-finetuning/steps/finetune.py         |  4 -
 6 files changed, 128 insertions(+), 29 deletions(-)
 create mode 100644 llm-lora-finetuning/pipelines/train_accelerated.py

diff --git a/llm-lora-finetuning/configs/orchestrator_finetune.yaml b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
index 9bd271f6..ef620de9 100644
--- a/llm-lora-finetuning/configs/orchestrator_finetune.yaml
+++ b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
@@ -29,6 +29,10 @@ settings:
     parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
     requirements: requirements.txt
     python_package_installer: uv
+    python_package_installer_args:
+      system: null
+    apt_packages: 
+      - git
     environment:
       PJRT_DEVICE: CUDA
       USE_TORCH_XLA: "false"
diff --git a/llm-lora-finetuning/configs/remote_finetune.yaml b/llm-lora-finetuning/configs/remote_finetune.yaml
index 6fda3072..55f838c1 100644
--- a/llm-lora-finetuning/configs/remote_finetune.yaml
+++ b/llm-lora-finetuning/configs/remote_finetune.yaml
@@ -29,6 +29,10 @@ settings:
     parent_image: pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
     requirements: requirements.txt
     python_package_installer: uv
+    python_package_installer_args:
+      system: null
+    apt_packages: 
+      - git
     environment:
       PJRT_DEVICE: CUDA
       USE_TORCH_XLA: "false"
@@ -52,6 +56,10 @@ steps:
   finetune:
     enable_step_logs: False
     step_operator: gcp_a100
+    retry:
+      max_retries: 3
+      delay: 10
+      backoff: 2
     parameters:
       max_steps: 300
       eval_steps: 30
@@ -59,9 +67,17 @@ steps:
 
   evaluate_finetuned:
     step_operator: gcp_a100
+    retry:
+      max_retries: 3
+      delay: 10
+      backoff: 2
 
   evaluate_base:
     step_operator: gcp_a100
+    retry:
+      max_retries: 3
+      delay: 10
+      backoff: 2
 
   promote:
     parameters:
diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index 0ca124cb..55080178 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -18,7 +18,6 @@
 
 from steps import evaluate_model, finetune, prepare_data, promote
 from zenml import pipeline
-from zenml.integrations.huggingface.steps import run_with_accelerate
 
 
 @pipeline
@@ -28,7 +27,6 @@ def llm_peft_full_finetune(
     use_fast: bool = True,
     load_in_8bit: bool = False,
     load_in_4bit: bool = False,
-    use_accelerate: bool = False,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -44,33 +42,21 @@ def llm_peft_full_finetune(
             "At least one of `load_in_8bit` and `load_in_4bit` must be True."
         )
     if load_in_4bit and load_in_8bit:
-        raise ValueError(
-            "Only one of `load_in_8bit` and `load_in_4bit` can be True."
-        )
+        raise ValueError("Only one of `load_in_8bit` and `load_in_4bit` can be True.")
 
     datasets_dir = prepare_data(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
         use_fast=use_fast,
     )
-    if not use_accelerate:
-        ft_model_dir = finetune(
-            base_model_id,
-            datasets_dir,
-            use_fast=use_fast,
-            load_in_8bit=load_in_8bit,
-            load_in_4bit=load_in_4bit,
-        )
-    else:
-        ft_model_dir = run_with_accelerate(
-            finetune,
-            base_model_id=base_model_id,
-            datasets_dir=datasets_dir,
-            use_fast=use_fast,
-            load_in_8bit=load_in_8bit,
-            load_in_4bit=load_in_4bit,
-            id="finetune_accelerated",
-        )
+    ft_model_dir = finetune(
+        base_model_id,
+        datasets_dir,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
+    )
+
     evaluate_model(
         base_model_id,
         system_prompt,
diff --git a/llm-lora-finetuning/pipelines/train_accelerated.py b/llm-lora-finetuning/pipelines/train_accelerated.py
new file mode 100644
index 00000000..be82ad63
--- /dev/null
+++ b/llm-lora-finetuning/pipelines/train_accelerated.py
@@ -0,0 +1,82 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from steps import evaluate_model, finetune, prepare_data, promote
+from zenml import pipeline
+from zenml.integrations.huggingface.steps import run_with_accelerate
+
+
+@pipeline
+def llm_peft_full_finetune(
+    system_prompt: str,
+    base_model_id: str,
+    use_fast: bool = True,
+    load_in_8bit: bool = False,
+    load_in_4bit: bool = False,
+):
+    """Pipeline for finetuning an LLM with peft.
+
+    It will run the following steps:
+
+    - prepare_data: prepare the datasets and tokenize them
+    - finetune: finetune the model
+    - evaluate_model: evaluate the base and finetuned model
+    - promote: promote the model to the target stage, if evaluation was successful
+    """
+    if not load_in_8bit and not load_in_4bit:
+        raise ValueError(
+            "At least one of `load_in_8bit` and `load_in_4bit` must be True."
+        )
+    if load_in_4bit and load_in_8bit:
+        raise ValueError("Only one of `load_in_8bit` and `load_in_4bit` can be True.")
+
+    datasets_dir = prepare_data(
+        base_model_id=base_model_id,
+        system_prompt=system_prompt,
+        use_fast=use_fast,
+    )
+
+    ft_model_dir = run_with_accelerate(finetune)(
+        base_model_id=base_model_id,
+        dataset_dir=datasets_dir,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
+    )
+
+    evaluate_model(
+        base_model_id,
+        system_prompt,
+        datasets_dir,
+        ft_model_dir,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
+        id="evaluate_finetuned",
+    )
+    evaluate_model(
+        base_model_id,
+        system_prompt,
+        datasets_dir,
+        None,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
+        id="evaluate_base",
+    )
+    promote(after=["evaluate_finetuned", "evaluate_base"])
diff --git a/llm-lora-finetuning/run.py b/llm-lora-finetuning/run.py
index 9d7aa67a..db5f637b 100644
--- a/llm-lora-finetuning/run.py
+++ b/llm-lora-finetuning/run.py
@@ -19,7 +19,6 @@
 from typing import Optional
 
 import click
-from pipelines.train import llm_peft_full_finetune
 
 
 @click.command(
@@ -45,6 +44,12 @@
     default="default_finetune.yaml",
     help="Path to the YAML config file.",
 )
+@click.option(
+    "--accelerate",
+    is_flag=True,
+    default=False,
+    help="Run the pipeline with Accelerate.",
+)
 @click.option(
     "--no-cache",
     is_flag=True,
@@ -53,11 +58,14 @@
 )
 def main(
     config: Optional[str] = None,
+    accelerate: bool = False,
     no_cache: bool = False,
 ):
     """Main entry point for the pipeline execution.
 
     Args:
+        config: Path to the YAML config file.
+        accelerate: If `True` Accelerate will be used.
         no_cache: If `True` cache will be disabled.
     """
     config_folder = os.path.join(
@@ -70,7 +78,14 @@ def main(
 
     pipeline_args["config_path"] = os.path.join(config_folder, config)
 
-    llm_peft_full_finetune.with_options(**pipeline_args)()
+    if accelerate:
+        from pipelines.train_accelerated import llm_peft_full_finetune
+
+        llm_peft_full_finetune.with_options(**pipeline_args)()
+    else:
+        from pipelines.train import llm_peft_full_finetune
+
+        llm_peft_full_finetune.with_options(**pipeline_args)()
 
 
 if __name__ == "__main__":
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 31e86c44..9820a385 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -25,16 +25,12 @@
 from utils.callbacks import ZenMLCallback
 from utils.loaders import load_base_model
 from utils.tokenizer import load_tokenizer
-from zenml import logging as zenml_logging
 from zenml import step
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 from zenml.utils.cuda_utils import cleanup_gpu_memory
 
 logger = get_logger(__name__)
-zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
-    10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
-)
 
 
 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])

From d9172c7954c4aa3a8c852db8744b96e460318460 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Wed, 5 Jun 2024 12:01:37 +0200
Subject: [PATCH 23/28] reduce README

---
 llm-lora-finetuning/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
index 01af43a4..43951f80 100644
--- a/llm-lora-finetuning/README.md
+++ b/llm-lora-finetuning/README.md
@@ -91,7 +91,6 @@ The project loosely follows [the recommended ZenML project structure](https://do
 .
 ├── configs                                       # pipeline configuration files
 │   ├── orchestrator_finetune.yaml                # default local or remote orchestrator configuration
-│   ├── remote_accelerated_finetune.yaml          # default step operator with Accelerate configuration
 │   └── remote_finetune.yaml                      # default step operator configuration
 ├── materializers
 │   └── directory_materializer.py                 # custom materializer to push whole directories to the artifact store and back

From 817a1b2225c4dd722e9e0109c75c392f3fe614c6 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 7 Jun 2024 12:32:18 +0200
Subject: [PATCH 24/28] log metadata separately

---
 llm-lora-finetuning/pipelines/train.py        | 39 +++++++++++++----
 .../pipelines/train_accelerated.py            | 42 ++++++++++++++-----
 llm-lora-finetuning/steps/__init__.py         |  1 +
 llm-lora-finetuning/steps/evaluate_model.py   | 14 ++-----
 llm-lora-finetuning/steps/finetune.py         |  4 +-
 llm-lora-finetuning/steps/log_metadata.py     | 42 +++++++++++++++++++
 6 files changed, 110 insertions(+), 32 deletions(-)
 create mode 100644 llm-lora-finetuning/steps/log_metadata.py

diff --git a/llm-lora-finetuning/pipelines/train.py b/llm-lora-finetuning/pipelines/train.py
index 55080178..13fdaa8c 100644
--- a/llm-lora-finetuning/pipelines/train.py
+++ b/llm-lora-finetuning/pipelines/train.py
@@ -16,7 +16,13 @@
 #
 
 
-from steps import evaluate_model, finetune, prepare_data, promote
+from steps import (
+    evaluate_model,
+    finetune,
+    prepare_data,
+    promote,
+    log_metadata_from_step_artifact,
+)
 from zenml import pipeline
 
 
@@ -49,32 +55,47 @@ def llm_peft_full_finetune(
         system_prompt=system_prompt,
         use_fast=use_fast,
     )
-    ft_model_dir = finetune(
+
+    evaluate_model(
         base_model_id,
+        system_prompt,
         datasets_dir,
+        None,
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
+        id="evaluate_base",
+    )
+    log_metadata_from_step_artifact(
+        "evaluate_base",
+        "base_model_rouge_metrics",
+        after=["evaluate_base"],
+        id="log_metadata_evaluation_base"
     )
 
-    evaluate_model(
+    ft_model_dir = finetune(
         base_model_id,
-        system_prompt,
         datasets_dir,
-        ft_model_dir,
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
-        id="evaluate_finetuned",
     )
+
     evaluate_model(
         base_model_id,
         system_prompt,
         datasets_dir,
-        None,
+        ft_model_dir,
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
-        id="evaluate_base",
+        id="evaluate_finetuned",
     )
-    promote(after=["evaluate_finetuned", "evaluate_base"])
+    log_metadata_from_step_artifact(
+        "evaluate_finetuned",
+        "finetuned_model_rouge_metrics",
+        after=["evaluate_finetuned"],
+        id="log_metadata_evaluation_finetuned"
+    )
+
+    promote(after=["log_metadata_evaluation_finetuned", "log_metadata_evaluation_base"])
diff --git a/llm-lora-finetuning/pipelines/train_accelerated.py b/llm-lora-finetuning/pipelines/train_accelerated.py
index be82ad63..84fc07e4 100644
--- a/llm-lora-finetuning/pipelines/train_accelerated.py
+++ b/llm-lora-finetuning/pipelines/train_accelerated.py
@@ -16,7 +16,13 @@
 #
 
 
-from steps import evaluate_model, finetune, prepare_data, promote
+from steps import (
+    evaluate_model,
+    finetune,
+    prepare_data,
+    promote,
+    log_metadata_from_step_artifact,
+)
 from zenml import pipeline
 from zenml.integrations.huggingface.steps import run_with_accelerate
 
@@ -51,6 +57,23 @@ def llm_peft_full_finetune(
         use_fast=use_fast,
     )
 
+    evaluate_model(
+        base_model_id,
+        system_prompt,
+        datasets_dir,
+        None,
+        use_fast=use_fast,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
+        id="evaluate_base",
+    )
+    log_metadata_from_step_artifact(
+        "evaluate_base",
+        "base_model_rouge_metrics",
+        after=["evaluate_base"],
+        id="log_metadata_evaluation_base"
+    )
+
     ft_model_dir = run_with_accelerate(finetune)(
         base_model_id=base_model_id,
         dataset_dir=datasets_dir,
@@ -69,14 +92,11 @@ def llm_peft_full_finetune(
         load_in_4bit=load_in_4bit,
         id="evaluate_finetuned",
     )
-    evaluate_model(
-        base_model_id,
-        system_prompt,
-        datasets_dir,
-        None,
-        use_fast=use_fast,
-        load_in_8bit=load_in_8bit,
-        load_in_4bit=load_in_4bit,
-        id="evaluate_base",
+    log_metadata_from_step_artifact(
+        "evaluate_finetuned",
+        "finetuned_model_rouge_metrics",
+        after=["evaluate_finetuned"],
+        id="log_metadata_evaluation_finetuned"
     )
-    promote(after=["evaluate_finetuned", "evaluate_base"])
+
+    promote(after=["log_metadata_evaluation_finetuned", "log_metadata_evaluation_base"])
diff --git a/llm-lora-finetuning/steps/__init__.py b/llm-lora-finetuning/steps/__init__.py
index b8bfdaa3..317b6b4c 100644
--- a/llm-lora-finetuning/steps/__init__.py
+++ b/llm-lora-finetuning/steps/__init__.py
@@ -19,3 +19,4 @@
 from .finetune import finetune
 from .prepare_datasets import prepare_data
 from .promote import promote
+from .log_metadata import log_metadata_from_step_artifact
diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 9ff9002d..0722effe 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -26,7 +26,7 @@
     load_pretrained_model,
 )
 from utils.tokenizer import load_tokenizer, tokenize_for_eval
-from zenml import log_model_metadata, save_artifact, step
+from zenml import save_artifact, step
 from zenml.logger import get_logger
 from zenml.utils.cuda_utils import cleanup_gpu_memory
 
@@ -67,9 +67,7 @@ def evaluate_model(
     test_dataset = load_from_disk(datasets_dir / "test_raw")
     test_dataset = test_dataset[:50]
     ground_truths = test_dataset["meaning_representation"]
-    tokenized_train_dataset = tokenize_for_eval(
-        test_dataset, tokenizer, system_prompt
-    )
+    tokenized_train_dataset = tokenize_for_eval(test_dataset, tokenizer, system_prompt)
 
     if ft_model_dir is None:
         logger.info("Generating using base model...")
@@ -103,12 +101,8 @@ def evaluate_model(
     logger.info("Computing ROUGE metrics...")
     prefix = "base_model_" if ft_model_dir is None else "finetuned_model_"
     rouge = evaluate.load("rouge")
-    rouge_metrics = rouge.compute(
-        predictions=predictions, references=ground_truths
-    )
-    metadata = {prefix + k: float(v) for k, v in rouge_metrics.items()}
+    rouge_metrics = rouge.compute(predictions=predictions, references=ground_truths)
 
-    log_model_metadata(metadata)
-    logger.info("Computed metrics: " + str(metadata))
+    logger.info("Computed metrics: " + str(rouge_metrics))
 
     save_artifact(rouge_metrics, prefix + "rouge_metrics")
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 9820a385..da3640dd 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -25,7 +25,7 @@
 from utils.callbacks import ZenMLCallback
 from utils.loaders import load_base_model
 from utils.tokenizer import load_tokenizer
-from zenml import step
+from zenml import step, ArtifactConfig
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 from zenml.utils.cuda_utils import cleanup_gpu_memory
@@ -51,7 +51,7 @@ def finetune(
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
-) -> Annotated[Path, "ft_model_dir"]:
+) -> Annotated[Path, ArtifactConfig(name="ft_model_dir", is_model_artifact=True)]:
     """Finetune the model using PEFT.
 
     Base model will be derived from configure step and finetuned model will
diff --git a/llm-lora-finetuning/steps/log_metadata.py b/llm-lora-finetuning/steps/log_metadata.py
new file mode 100644
index 00000000..1e68609a
--- /dev/null
+++ b/llm-lora-finetuning/steps/log_metadata.py
@@ -0,0 +1,42 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Any, Dict
+
+from zenml import log_model_metadata, step, get_step_context
+
+
+@step(enable_cache=False)
+def log_metadata_from_step_artifact(
+    step_name: str,
+    artifact_name: str,
+) -> None:
+    """Load a step's output artifact from this run and log it as model metadata.
+
+    Args:
+        step_name: Name of the step in the current pipeline run that produced it.
+        artifact_name: Name of that step's output; also used as the metadata key.
+    """
+    # Caching is disabled on this step so the lookup and logging happen on every run.
+    context = get_step_context()
+    metadata_dict: Dict[str, Any] = (
+        context.pipeline_run.steps[step_name].outputs[artifact_name].load()
+    )
+    # Nest the loaded value under the artifact name before handing it to ZenML.
+    metadata = {artifact_name: metadata_dict}
+    # NOTE(review): presumably attaches to the run's configured model version - confirm.
+    log_model_metadata(metadata)

From 6d988eb0c0a1ff728c451d80689b796726111336 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Thu, 13 Jun 2024 17:17:47 +0200
Subject: [PATCH 25/28] resume logging

---
 llm-lora-finetuning/configs/orchestrator_finetune.yaml | 1 -
 llm-lora-finetuning/configs/remote_finetune.yaml       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llm-lora-finetuning/configs/orchestrator_finetune.yaml b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
index ef620de9..0d76d8ea 100644
--- a/llm-lora-finetuning/configs/orchestrator_finetune.yaml
+++ b/llm-lora-finetuning/configs/orchestrator_finetune.yaml
@@ -54,7 +54,6 @@ steps:
       dataset_name: gem/viggo
 
   finetune:
-    enable_step_logs: False
     parameters:
       max_steps: 300
       eval_steps: 30
diff --git a/llm-lora-finetuning/configs/remote_finetune.yaml b/llm-lora-finetuning/configs/remote_finetune.yaml
index 55f838c1..4c3f12da 100644
--- a/llm-lora-finetuning/configs/remote_finetune.yaml
+++ b/llm-lora-finetuning/configs/remote_finetune.yaml
@@ -54,7 +54,6 @@ steps:
       dataset_name: gem/viggo
 
   finetune:
-    enable_step_logs: False
     step_operator: gcp_a100
     retry:
       max_retries: 3

From 80a108477ca1c4e16fa3b22795ca99b3d3377fd0 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Fri, 14 Jun 2024 11:46:51 +0200
Subject: [PATCH 26/28] add `trust_remote_code=True`

---
 llm-lora-finetuning/steps/prepare_datasets.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/llm-lora-finetuning/steps/prepare_datasets.py b/llm-lora-finetuning/steps/prepare_datasets.py
index cc4a86c4..eb2c4353 100644
--- a/llm-lora-finetuning/steps/prepare_datasets.py
+++ b/llm-lora-finetuning/steps/prepare_datasets.py
@@ -62,11 +62,23 @@ def prepare_data(
         system_prompt=system_prompt,
     )
 
-    train_dataset = load_dataset(dataset_name, split="train")
+    train_dataset = load_dataset(
+        dataset_name,
+        split="train",
+        trust_remote_code=True,
+    )
     tokenized_train_dataset = train_dataset.map(gen_and_tokenize)
-    eval_dataset = load_dataset(dataset_name, split="validation")
+    eval_dataset = load_dataset(
+        dataset_name,
+        split="validation",
+        trust_remote_code=True,
+    )
     tokenized_val_dataset = eval_dataset.map(gen_and_tokenize)
-    test_dataset = load_dataset(dataset_name, split="test")
+    test_dataset = load_dataset(
+        dataset_name,
+        split="test",
+        trust_remote_code=True,
+    )
 
     datasets_path = Path("datasets")
     tokenized_train_dataset.save_to_disk(str((datasets_path / "train").absolute()))

From bb9dc65156a8ec7ce1efa1e86ed6e6a5fd6908db Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Thu, 20 Jun 2024 11:18:46 +0200
Subject: [PATCH 27/28] final touches

---
 llm-lora-finetuning/README.md         | 10 ++++------
 llm-lora-finetuning/steps/finetune.py |  3 +--
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
index 43951f80..7b47ebf6 100644
--- a/llm-lora-finetuning/README.md
+++ b/llm-lora-finetuning/README.md
@@ -58,15 +58,13 @@ When running the pipeline like this, the trained model will be stored in the Zen
 ### ⚡ Accelerate your finetuning
 
 Do you want to benefit from multi-GPU-training with Distributed Data Parallelism (DDP)? Then you can use other configuration files prepared for this purpose.
-For example, `phi_accelerated_local_finetune.yaml` can run a finetuning of the [Microsoft Phi 2](https://huggingface.co/microsoft/phi-2) powered by [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/en/index) on all GPUs available in the environment. To do so, just call:
+For example, `orchestrator_finetune.yaml` can finetune [Microsoft Phi 2](https://huggingface.co/microsoft/phi-2) with [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/en/index) on all GPUs available in the environment. To do so, run:
 
 ```shell
-python run.py --config phi_accelerated_local_finetune.yaml # if your architecture doesn't support BF16
-# OR
-python run.py --config phi_accelerated_local_bf16_finetune.yaml # if your architecture support BF16
+python run.py --config orchestrator_finetune.yaml --accelerate
 ```
 
-Under the hood, the finetuning step will spin up the accelerated job using the finetuning script CLI wrapper (`scripts/finetune.py`), which will run on all available GPUs.
+Under the hood, the finetuning step will spin up the accelerated job using the step code, which will run on all available GPUs.
 
 ## ☁️ Running with a remote stack
 
@@ -99,11 +97,11 @@ The project loosely follows [the recommended ZenML project structure](https://do
 ├── steps                                         # logically grouped `zenml.steps` implementations
 │   ├── evaluate_model.py                         # evaluate base and finetuned models using Rouge metrics
 │   ├── finetune.py                               # finetune the base model
+│   ├── log_metadata.py                           # helper step to ensure that model metadata is always logged
 │   ├── prepare_datasets.py                       # load and tokenize dataset
 │   └── promote.py                                # promote good models to target environment
 ├── utils                                         # utility functions
 │   ├── callbacks.py                              # custom callbacks
-│   ├── cuda.py                                   # helpers for CUDA
 │   ├── loaders.py                                # loaders for models and data
 │   ├── logging.py                                # logging helpers
 │   └── tokenizer.py                              # load and tokenize
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index da3640dd..8de58330 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -93,8 +93,7 @@ def finetune(
         should_print = True
 
     project = "zenml-finetune"
-    base_model_name = "mistral"
-    run_name = base_model_name + "-" + project
+    run_name = base_model_id + "-" + project
     output_dir = "./" + run_name
 
     if should_print:

From 329cf171dcc09963dfd2b7cbfaa957bd066fbed7 Mon Sep 17 00:00:00 2001
From: Andrei Vishniakov <31008759+avishniakov@users.noreply.github.com>
Date: Thu, 20 Jun 2024 11:33:11 +0200
Subject: [PATCH 28/28] final touches

---
 llm-lora-finetuning/steps/evaluate_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm-lora-finetuning/steps/evaluate_model.py b/llm-lora-finetuning/steps/evaluate_model.py
index 0722effe..4dfba094 100644
--- a/llm-lora-finetuning/steps/evaluate_model.py
+++ b/llm-lora-finetuning/steps/evaluate_model.py
@@ -64,7 +64,7 @@ def evaluate_model(
         is_eval=True,
         use_fast=use_fast,
     )
-    test_dataset = load_from_disk(datasets_dir / "test_raw")
+    test_dataset = load_from_disk(str((datasets_dir / "test_raw").absolute()))
     test_dataset = test_dataset[:50]
     ground_truths = test_dataset["meaning_representation"]
     tokenized_train_dataset = tokenize_for_eval(test_dataset, tokenizer, system_prompt)