# Load finetuned model and say hello

## ✅ Utils

In [None]:
from unsloth import FastLanguageModel
from vllm import SamplingParams


def load_model(
    # Smaller alternative: model_name="meta-llama/Llama-3.2-1B-Instruct",
    model_name="meta-llama/meta-Llama-3.1-8B-Instruct",
    lora_path="../trainer_output_meta-llama_Llama-3.1-8B-Instruct_gpu1_20250326_134236/checkpoint-101",
    max_seq_length=8192,
):
    """Load the 4-bit base model and tokenizer, optionally enabling a LoRA adapter.

    Args:
        model_name: Model id or local directory handed to
            ``FastLanguageModel.from_pretrained``.
        lora_path: Directory of the saved LoRA checkpoint. When falsy
            (``None`` or ``""``) only the base model is loaded and no PEFT
            wrapper is created.
        max_seq_length: Maximum sequence length passed to the loader.

    Returns:
        A ``(model, tokenizer)`` tuple. When ``lora_path`` is given,
        ``model.fast_generate`` applies the adapter by default.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        fast_inference=True,
        max_lora_rank=64,
        gpu_memory_utilization=0.6,
    )

    # Nothing more to do when no adapter was requested.
    if not lora_path:
        return model, tokenizer

    model = FastLanguageModel.get_peft_model(
        model,
        r=64,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=64,
        use_gradient_checkpointing=True,
        random_state=3407,
    )

    # Per the Unsloth inference examples, load_lora() returns a LoRA request
    # for the inference engine instead of changing the model in place. The
    # result used to be thrown away, so generation ran without the adapter.
    lora_request = model.load_lora(lora_path)
    base_generate = model.fast_generate

    def generate_with_adapter(*args, **kwargs):
        """Call the original fast_generate, defaulting to the loaded adapter."""
        # setdefault keeps any lora_request the caller passes explicitly.
        kwargs.setdefault("lora_request", lora_request)
        return base_generate(*args, **kwargs)

    model.fast_generate = generate_with_adapter

    return model, tokenizer


def get_sampling_params(
    temperature=0.7,
    top_p=0.95,
    max_tokens=4096,
):
    """Build the ``SamplingParams`` object used for text generation.

    Args:
        temperature: Forwarded as ``SamplingParams(temperature=...)``.
        top_p: Forwarded as ``SamplingParams(top_p=...)``.
        max_tokens: Forwarded as ``SamplingParams(max_tokens=...)``.

    Returns:
        A ``SamplingParams`` instance configured with the three values.
    """
    settings = {
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }
    return SamplingParams(**settings)


def generate_response(prompt, model, tokenizer, sampling_params, lora_request=None):
    """Generate a reply to a single user message.

    Args:
        prompt: Text of the user message.
        model: Object exposing ``fast_generate(prompts, sampling_params=...)``.
        tokenizer: Object exposing ``apply_chat_template``; used to wrap the
            prompt as a one-message chat with the generation prompt appended.
        sampling_params: Forwarded unchanged to ``model.fast_generate``.
        lora_request: Optional adapter request (for example the value returned
            by ``model.load_lora(...)``). It is forwarded to ``fast_generate``
            only when it is not ``None``, so backends that do not accept the
            keyword keep working.

    Returns:
        The text of the first completion when the first result has an
        ``outputs`` attribute; otherwise the first result itself.
    """
    chat_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

    generate_kwargs = {"sampling_params": sampling_params}
    if lora_request is not None:
        generate_kwargs["lora_request"] = lora_request

    results = model.fast_generate([chat_prompt], **generate_kwargs)

    first = results[0]
    if hasattr(first, "outputs"):
        return first.outputs[0].text
    # Some backends hand back the generated text directly.
    return first

In [None]:
# Prepare the generation settings, then load the model with load_model's
# default (hardcoded) model name and LoRA checkpoint path.
sampling_params = get_sampling_params()
model, tokenizer = load_model()

In [None]:
# Send one greeting to the loaded model and print the reply under a header.
response = generate_response("Hi! How are you?", model, tokenizer, sampling_params)
print("\nModel response:", response, sep="\n")

# Merge LoRA and save the model in 16-bit, then test loading it for inference
if False:

In [None]:
# Write the model and tokenizer to the "model" directory using the
# "merged_16bit" save method.
# NOTE(review): confirm the checkpoint's adapter weights are actually present
# in `model` before merging; verify against how load_model attaches the LoRA.
model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")

## ✅ Test load merged model 16bit

In [None]:
# Sanity check: reload from the local "model" directory with no LoRA path,
# then repeat the greeting used earlier in the notebook.
sampling_params = get_sampling_params()
model, tokenizer = load_model(model_name="model", lora_path=None)
response = generate_response("Hi! How are you?", model, tokenizer, sampling_params)
print("\nModel response:", response, sep="\n")

# ❌ Save model to Ollama format
- Bug: `llama-quantize` is not available on WSL; fix later.
https://docs.unsloth.ai/basics/running-and-saving-models/saving-to-gguf
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
cmake -B build
cmake --build build --config Release
```

In [None]:
# Each export below is switched on or off by its own `if True:` / `if False:`
# guard; only the first one is enabled.

# 8-bit (Q8_0) export into the "model-gguf" directory. No quantization_method
# is passed here, so the library default is used.
if True:
    model.save_pretrained_gguf("model-gguf", tokenizer)
# Pushing to the Hub needs a token from https://huggingface.co/settings/tokens
# and "hf" replaced with your own username.
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# 16-bit GGUF export.
if False:
    model.save_pretrained_gguf(
        "model",
        tokenizer,
        quantization_method="f16",
    )
if False:
    model.push_to_hub_gguf(
        "hf/model",
        tokenizer,
        quantization_method="f16",
        token="",
    )

# q4_k_m GGUF export.
if False:
    model.save_pretrained_gguf(
        "model",
        tokenizer,
        quantization_method="q4_k_m",
    )
if False:
    model.push_to_hub_gguf(
        "hf/model",
        tokenizer,
        quantization_method="q4_k_m",
        token="",
    )

# Several quantizations in one push; much faster when you want more than one.
if False:
    model.push_to_hub_gguf(
        "hf/model",  # Change hf to your username!
        tokenizer,
        quantization_method=["q4_k_m", "q8_0", "q5_k_m"],
        token="",
    )