
Commit

fixing the code
bcdurak committed Dec 11, 2024
1 parent efb6412 commit 4472b1a
Showing 3 changed files with 73 additions and 27 deletions.
17 changes: 6 additions & 11 deletions llm-finetuning-simple/README.md
@@ -10,12 +10,11 @@ In the fast-paced world of AI, the ability to efficiently fine-tune Large Langua
2. [Installation](#installation)
3. [Running the Pipeline](#running-the-pipeline)
4. [Configuration](#configuration)
5. [Accelerated Fine-Tuning](#accelerated-fine-tuning)
6. [Running with Remote Stack](#running-with-remote-stack)
7. [Customizing Data Preparation](#customizing-data-preparation)
8. [Project Structure](#project-structure)
9. [Benefits & Future](#benefits--future)
10. [Credits](#credits)
5. [Running with Remote Stack](#running-with-remote-stack)
6. [Customizing Data Preparation](#customizing-data-preparation)
7. [Project Structure](#project-structure)
8. [Benefits & Future](#benefits--future)
9. [Credits](#credits)

## Introduction

@@ -39,10 +38,6 @@ source .venv/bin/activate
# Install requirements
pip install -r requirements.txt

# Install ZenML and Lightning integrations
pip install "zenml>=0.70.0"
zenml integration install lightning s3 aws -y

# Initialize and connect to a deployed ZenML server
zenml init
zenml login <MYZENMLSERVERURL>
@@ -120,7 +115,7 @@ steps:

## Running with Remote Stack

Set up a remote lightning stack with ZenML for fine tuning on remote infrastructure:
Set up a remote lightning stack with ZenML for fine-tuning on remote infrastructure:

1. **Register Orchestrator and Artifact Store:**

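
The registration commands under step 1 are collapsed in this diff view. As a rough sketch only, not the repository's exact commands, registering a Lightning orchestrator and an S3 artifact store with the ZenML CLI generally looks like the following; the component names and bucket are placeholders, and Lightning credentials/settings are omitted:

```bash
# Placeholder names; substitute your own components and bucket.
zenml orchestrator register lightning_orchestrator --flavor=lightning
zenml artifact-store register s3_store --flavor=s3 --path=s3://your-bucket

# Combine the components into a stack and make it the active stack
zenml stack register lightning_stack \
    -o lightning_orchestrator \
    -a s3_store --set
```
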
6 changes: 6 additions & 0 deletions llm-finetuning-simple/requirements.txt
@@ -12,3 +12,9 @@ zenml>=0.70.0
torch>=2.2.0
sentencepiece
huggingface_hub
s3fs>2022.3.0
boto3
aws-profile-manager
sagemaker>=2.117.0
kubernetes
lightning-sdk>=0.1.17
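
Taken together with the README change above, these pins make the previously documented `zenml integration install lightning s3 aws -y` step unnecessary: a plain requirements install now pulls in the S3, AWS, SageMaker, Kubernetes, and Lightning dependencies. A minimal sketch, assuming it is run from inside `llm-finetuning-simple/`:

```bash
# One install step now covers ZenML plus the integration dependencies pinned above
pip install -r requirements.txt
```
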
77 changes: 61 additions & 16 deletions llm-finetuning-simple/run.py
@@ -1,27 +1,59 @@
import argparse

import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from zenml import pipeline, step, log_model_metadata
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
from typing_extensions import Annotated
import argparse
from zenml.integrations.huggingface.materializers.huggingface_datasets_materializer import HFDatasetMaterializer
from zenml import pipeline, step, log_metadata
from zenml.integrations.huggingface.materializers.huggingface_datasets_materializer import (
HFDatasetMaterializer
)


@step(output_materializers=HFDatasetMaterializer)
def prepare_data(base_model_id: str, dataset_name: str, dataset_size: int, max_length: int) -> Annotated[Dataset, "tokenized_dataset"]:
def prepare_data(
base_model_id: str,
dataset_name: str,
dataset_size: int,
max_length: int,
) -> Annotated[Dataset, "tokenized_dataset"]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(dataset_name, split=f"train[:{dataset_size}]")

def tokenize_function(example):
prompt = f"Question: {example['question']}\nAnswer: {example['answers']['text'][0]}"
return tokenizer(prompt, truncation=True, padding="max_length", max_length=max_length)
prompt = f"Question: {example['question']}\n" \
f"Answer: {example['answers']['text'][0]}"
return tokenizer(prompt, truncation=True, padding="max_length",
max_length=max_length)

tokenized_data = dataset.map(tokenize_function, remove_columns=dataset.column_names)
log_model_metadata(metadata={"dataset_size": len(tokenized_data), "max_length": max_length})
tokenized_data = dataset.map(
tokenize_function,
remove_columns=dataset.column_names
)
log_metadata(
metadata={
"dataset_size": len(tokenized_data),
"max_length": max_length
},
infer_model=True,
)
return tokenized_data


@step
def finetune(base_model_id: str, tokenized_dataset: Dataset, num_train_epochs: int, per_device_train_batch_size: int) -> None:
def finetune(
base_model_id: str,
tokenized_dataset: Dataset,
num_train_epochs: int,
per_device_train_batch_size: int
) -> None:
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
base_model_id,
@@ -49,20 +81,33 @@ def finetune(base_model_id: str, tokenized_dataset: Dataset, num_train_epochs: i
model=model,
args=training_args,
train_dataset=tokenized_dataset,
data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer,
mlm=False),
)

train_result = trainer.train()
log_model_metadata(metadata={"metrics": {"train_loss": train_result.metrics.get("train_loss")}})
log_metadata(
metadata={
"metrics": {"train_loss": train_result.metrics.get("train_loss")}
},
infer_model=True,
)
trainer.save_model("finetuned_model")


@pipeline
def llm_finetune_pipeline(base_model_id: str):
tokenized_dataset = prepare_data(base_model_id)
finetune(base_model_id, tokenized_dataset)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, required=True, help='Path to the YAML config file')
parser.add_argument(
'--config',
type=str,
required=True,
help='Path to the YAML config file'
)
args = parser.parse_args()
llm_finetune_pipeline.with_options(config_path=args.config)()
llm_finetune_pipeline.with_options(config_path=args.config)()
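
For reference, the updated `run.py` is driven entirely by a ZenML run-configuration YAML passed via `--config`. A hypothetical minimal config matching the step signatures above, with purely illustrative model, dataset, and hyperparameter values (none of these come from the repository), plus the corresponding invocation:

```bash
# Write an illustrative run configuration; adjust values to your hardware and data.
cat > config.yaml <<'EOF'
parameters:
  base_model_id: distilgpt2        # pipeline parameter wired into both steps
steps:
  prepare_data:
    parameters:
      dataset_name: squad          # any dataset with 'question' and 'answers' fields
      dataset_size: 1000
      max_length: 128
  finetune:
    parameters:
      num_train_epochs: 1
      per_device_train_batch_size: 4
EOF

python run.py --config config.yaml
```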
