# swarms/sky_serve.yaml

envs:
  # MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  MODEL_NAME: meta-llama/Meta-Llama-3-8B
  HF_TOKEN: <your-huggingface-token> # Replace with your own HuggingFace token, or pass it with --env.
  HF_HUB_ENABLE_HF_TRANSFER: True
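
# To keep the token out of version control, you can omit HF_TOKEN above and
# pass it at deploy time instead, e.g.:
#   sky serve up sky_serve.yaml --env HF_TOKEN=<your-token>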

# Service configuration.
service:
  # Readiness probe: send a real chat request so a replica is only marked
  # ready once the model can actually serve completions.
  readiness_probe:
    path: /v1/chat/completions # Path for the readiness probe.
    post_data:
      model: $MODEL_NAME # Specify the model name.
      messages:
        - role: user
          content: Hello! What is your name? # Initial probe message.
      max_tokens: 1 # Maximum number of tokens to generate.
  # A lighter-weight alternative (note: a duplicate `readiness_probe` key
  # would override the request-based probe above, so keep only one):
  # readiness_probe: /v1/health
  # Replica policy for autoscaling.
  replica_policy:
    min_replicas: 1 # Minimum number of replicas.
    max_replicas: 10 # Maximum number of replicas.
    target_qps_per_replica: 2.5 # Target queries per second per replica.
    upscale_delay_seconds: 200 # Delay before upscaling replicas.
    downscale_delay_seconds: 1200 # Delay before downscaling replicas.
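
# Sizing note: the autoscaler aims for roughly (incoming QPS) / target_qps_per_replica
# replicas, clamped to [min_replicas, max_replicas]; e.g. a sustained ~25 QPS
# would scale this service toward 25 / 2.5 = 10 replicas.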

resources:
  # accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  accelerators: {A10g, A10, L40, A40} # Cheaper accelerators suffice for the 8B model.
  # cpus: 32+
  use_spot: True
  disk_size: 100 # Ensure model checkpoints can fit.
  # disk_tier: best
  ports: 8081 # Expose to internet traffic.

setup: |
  # Install vLLM in a dedicated conda env, creating it if it doesn't exist.
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi
  pip install vllm==0.4.0.post1
  # Install Gradio and the OpenAI client for the web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
  pip install hf_transfer
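
  # Optional sanity check (not part of the original setup): confirm the pinned
  # vLLM version imports cleanly before the service starts.
  # python -c 'import vllm; print(vllm.__version__)'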

run: |
  # Serve the model with vLLM's OpenAI-compatible API server.
  conda activate vllm
  echo 'Starting vllm api server...'
  # https://github.com/vllm-project/vllm/issues/3098
  export PATH=$PATH:/sbin
  # NOTE: --gpu-memory-utilization 0.95 needed for 4-GPU nodes.
  # Serve on port 8081 so it matches `ports` above and the Gradio --model-url
  # below; run in the background so the Gradio server can start afterwards.
  python3 -u -m vllm.entrypoints.openai.api_server \
    --port 8081 \
    --model $MODEL_NAME \
    --trust-remote-code --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64 \
    2>&1 | tee api_server.log &
  # Wait until the API server is up before launching the web UI.
  while ! grep -q 'Uvicorn running on' api_server.log; do
    echo 'Waiting for vllm api server to start...'
    sleep 5
  done
  # Serve the Gradio chat UI on port 8811, pointing at the local vLLM endpoint.
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
  python3 vllm/examples/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
    --stop-token-ids 128009,128001
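
# Usage sketch (the service name `llama-3` is illustrative, not part of this file):
#   sky serve up -n llama-3 sky_serve.yaml --env HF_TOKEN=<your-token>
#   ENDPOINT=$(sky serve status --endpoint llama-3)
#   curl http://$ENDPOINT/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "meta-llama/Meta-Llama-3-8B", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 32}'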