envs:
  # MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  MODEL_NAME: meta-llama/Meta-Llama-3-8B
  HF_TOKEN: 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 # Change to your own huggingface token, or use --env to pass.
  HF_HUB_ENABLE_HF_TRANSFER: True
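  # Example of passing the token at launch instead of hardcoding it
  # (file name is a placeholder):
  #   sky serve up <this-file>.yaml --env HF_TOKEN=<your-token>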

# Service configuration
service:
  readiness_probe:
    path: /v1/chat/completions # Path for the readiness probe
    post_data:
      model: $MODEL_NAME # Specify the model name
      messages:
        - role: user
          content: Hello! What is your name? # Specify the initial message
      max_tokens: 1 # Maximum number of tokens
  # A second readiness_probe key in the same mapping would be a duplicate
  # (invalid YAML). To use the simpler health endpoint instead, replace the
  # block above with:
  # readiness_probe: /v1/health
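  # SkyServe sends the POST request defined above to each replica; a
  # successful response marks the replica ready to receive traffic.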

  # Replica Policy
  replica_policy:
    min_replicas: 1 # Minimum number of replicas
    max_replicas: 10 # Maximum number of replicas
    target_qps_per_replica: 2.5 # Target queries per second per replica
    upscale_delay_seconds: 200 # Delay before upscaling replicas
    downscale_delay_seconds: 1200 # Delay before downscaling replicas
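  # Example: at a sustained 25 QPS, the autoscaler targets 25 / 2.5 = 10
  # replicas, which is exactly the max_replicas cap set above.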

resources:
  # accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  accelerators: {A10g, A10, L40, A40} # We can use cheaper accelerators for the 8B model.
  # cpus: 32+
  use_spot: True
  disk_size: 100 # Ensure model checkpoints can fit.
  # disk_tier: best
  ports: 8081 # Expose to internet traffic.
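  # Note: with use_spot, replicas run on spot instances for cost savings;
  # SkyServe relaunches any replica whose spot instance is preempted.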

setup: |
  # Install vllm
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi

  pip install vllm==0.4.0.post1

  # Install Gradio for web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
  pip install hf_transfer
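  # Note: hf_transfer provides the accelerated downloads enabled by the
  # HF_HUB_ENABLE_HF_TRANSFER env set above.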

run: |
  # Serve vLLM

  conda activate vllm
  echo 'Starting vllm api server...'
  # https://github.com/vllm-project/vllm/issues/3098
  export PATH=$PATH:/sbin

  # NOTE: --gpu-memory-utilization 0.95 needed for 4-GPU nodes.
  # Serve on port 8081 so the exposed `ports: 8081` and the Gradio
  # --model-url below line up; tensor parallelism follows the GPU count
  # SkyPilot provisions. Run the server in the background and log to a
  # file so the Gradio server below can start once the API is up.
  python3 -u -m vllm.entrypoints.openai.api_server \
    --port 8081 \
    --model $MODEL_NAME \
    --trust-remote-code --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64 \
    2>&1 | tee api_server.log &

  echo 'Waiting for vllm api server to start...'
  while ! grep -q 'Uvicorn running on' api_server.log; do sleep 1; done

  # Serve Gradio
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
  python3 vllm/examples/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
    --stop-token-ids 128009,128001
  # Optionally append --share to the command above for a public Gradio link.
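
# Example usage (endpoint is a placeholder): after `sky serve up`, get the
# endpoint from `sky serve status` and send an OpenAI-style request:
#   curl http://<endpoint>/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "meta-llama/Meta-Llama-3-8B", "messages": [{"role": "user", "content": "Hello! What is your name?"}]}'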