Finetuning in 100 lines of code
Learn how to implement an LLM fine-tuning pipeline in just 100 lines of code.
Install the required packages first (the version constraint needs quoting so the shell does not treat >= as a redirection):

pip install datasets transformers torch "accelerate>=0.26.0"

import os
from typing import List, Dict, Tuple

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import torch
def prepare_dataset() -> Dataset:
    """Build a tiny instruction/response dataset about ZenML World."""
    data: List[Dict[str, str]] = [
        {"instruction": "Describe a Zenbot.",
         "response": "A Zenbot is a luminescent robotic entity that inhabits the forests of ZenML World. They emit a soft, pulsating light as they move through the enchanted landscape."},
        {"instruction": "What are Cosmic Butterflies?",
         "response": "Cosmic Butterflies are ethereal creatures that flutter through the neon skies of ZenML World. Their iridescent wings leave magical trails of stardust wherever they go."},
        {"instruction": "Tell me about the Telepathic Treants.",
         "response": "Telepathic Treants are ancient, sentient trees connected through a quantum neural network spanning ZenML World. They share wisdom and knowledge across their vast network."},
    ]
    return Dataset.from_list(data)
def format_instruction(example: Dict[str, str]) -> str:
    """Format the instruction and response into a single training string."""
    return f"### Instruction: {example['instruction']}\n### Response: {example['response']}"

def tokenize_data(example: Dict[str, str], tokenizer: AutoTokenizer) -> Dict[str, List[int]]:
    """Tokenize a formatted example, truncating and padding to a fixed length."""
    formatted_text = format_instruction(example)
    return tokenizer(formatted_text, truncation=True, padding="max_length", max_length=128)
def fine_tune_model(base_model: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.pad_token = tokenizer.eos_token  # the base model has no dedicated pad token
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # Prepare and tokenize the dataset
    dataset = prepare_dataset()
    tokenized_dataset = dataset.map(
        lambda x: tokenize_data(x, tokenizer),
        remove_columns=dataset.column_names,
    )
    # Set up training arguments
    training_args = TrainingArguments(
        output_dir="./zenml-world-model",
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        bf16=True,
        logging_steps=10,
        save_total_limit=2,
    )
    # Create a data collator for causal language modeling (no masked-LM objective)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )
    trainer.train()
    return model, tokenizer
def generate_response(prompt: str, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, max_length: int = 128) -> str:
    """Generate a response using the fine-tuned model."""
    formatted_prompt = f"### Instruction: {prompt}\n### Response:"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,  # temperature only takes effect when sampling is enabled
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
if __name__ == "__main__":
    model, tokenizer = fine_tune_model()

    # Test the model
    test_prompts: List[str] = [
        "What is a Zenbot?",
        "Describe the Cosmic Butterflies.",
        "Tell me about an unknown creature.",
    ]
    for prompt in test_prompts:
        response = generate_response(prompt, model, tokenizer)
        print(f"\nPrompt: {prompt}")
        print(f"Response: {response}")
How It Works
1. Dataset Preparation: prepare_dataset() builds a small instruction/response dataset about the creatures of ZenML World and wraps it in a Hugging Face Dataset.
2. Data Formatting and Tokenization: each example is rendered into a "### Instruction: ...\n### Response: ..." template and tokenized with truncation and padding to 128 tokens.
3. Model Selection and Setup: TinyLlama/TinyLlama-1.1B-Chat-v1.0 is loaded in bfloat16 with device_map="auto", and the EOS token is reused as the padding token.
4. Training Configuration: the Hugging Face Trainer runs for 3 epochs with a per-device batch size of 1, 4 gradient-accumulation steps, a learning rate of 2e-4, and bf16 mixed precision, writing checkpoints to ./zenml-world-model.
5. Generation and Inference: generate_response() wraps a new prompt in the same instruction template and decodes a sampled completion from the fine-tuned model; a sketch for saving and reloading that model follows below.
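The script keeps the fine-tuned model only in memory and in the Trainer checkpoints under ./zenml-world-model. If you want to persist and reuse it explicitly, the following is a minimal sketch using the standard save_pretrained/from_pretrained calls; the directory name zenml-world-model-final is an arbitrary choice, not something the original script creates:

# Minimal sketch: persist the fine-tuned model and reload it for inference later.
# Assumes fine_tune_model() and generate_response() from the script above are in scope;
# the directory name "zenml-world-model-final" is arbitrary.
from transformers import AutoModelForCausalLM, AutoTokenizer

model, tokenizer = fine_tune_model()

# Write the model weights, config, and tokenizer files to disk
model.save_pretrained("zenml-world-model-final")
tokenizer.save_pretrained("zenml-world-model-final")

# Reload them later without re-training
model = AutoModelForCausalLM.from_pretrained("zenml-world-model-final", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("zenml-world-model-final")
print(generate_response("What is a Zenbot?", model, tokenizer))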
Understanding the Limitations
This example trades realism for brevity: the training set has only three hand-written examples, so the model will memorize them rather than generalize (the third test prompt deliberately asks about something it never saw); it performs full-parameter fine-tuning of a 1.1B-parameter model, and bf16=True assumes hardware with bfloat16 support; sequences are padded and truncated to a fixed 128 tokens; and there is no evaluation set or metric to measure quality.
Next Steps