envs:
  # MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  MODEL_NAME: meta-llama/Meta-Llama-3-8B
  HF_TOKEN: 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 # Change to your own huggingface token, or use --env to pass.
  HF_HUB_ENABLE_HF_TRANSFER: True
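  # Example of passing the token at launch instead of hardcoding it
  # (file name is a placeholder):
  #   sky serve up <this-file>.yaml --env HF_TOKEN=<your-token>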

# Service configuration
service:
  readiness_probe:
    path: /v1/chat/completions # Path for the readiness probe
    post_data:
      model: $MODEL_NAME # Specify the model name
      messages:
        - role: user
          content: Hello! What is your name? # Specify the initial message
      max_tokens: 1 # Maximum number of tokens
  # A second readiness_probe key in the same mapping would be a duplicate
  # (invalid YAML). To use the simpler health endpoint instead, replace the
  # block above with:
  # readiness_probe: /v1/health
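  # SkyServe sends the POST request defined above to each replica; a
  # successful response marks the replica ready to receive traffic.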

  # Replica Policy
  replica_policy:
    min_replicas: 1 # Minimum number of replicas
    max_replicas: 10 # Maximum number of replicas
    target_qps_per_replica: 2.5 # Target queries per second per replica
    upscale_delay_seconds: 200 # Delay before upscaling replicas
    downscale_delay_seconds: 1200 # Delay before downscaling replicas
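  # Example: at a sustained 25 QPS, the autoscaler targets 25 / 2.5 = 10
  # replicas, which is exactly the max_replicas cap set above.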

resources:
  # accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  accelerators: {A10g, A10, L40, A40} # We can use cheaper accelerators for the 8B model.
  # cpus: 32+
  use_spot: True
  disk_size: 100 # Ensure model checkpoints can fit.
  # disk_tier: best
  ports: 8081 # Expose to internet traffic.
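  # Note: with use_spot, replicas run on spot instances for cost savings;
  # SkyServe relaunches any replica whose spot instance is preempted.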

setup: |
  # Install vllm
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi

  pip install vllm==0.4.0.post1

  # Install Gradio for web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
  pip install hf_transfer
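  # Note: hf_transfer provides the accelerated downloads enabled by the
  # HF_HUB_ENABLE_HF_TRANSFER env set above.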

run: |
  # Serve vLLM

  conda activate vllm
  echo 'Starting vllm api server...'
  # https://github.com/vllm-project/vllm/issues/3098
  export PATH=$PATH:/sbin

  # NOTE: --gpu-memory-utilization 0.95 needed for 4-GPU nodes.
  # Serve on port 8081 so the exposed `ports: 8081` and the Gradio
  # --model-url below line up; tensor parallelism follows the GPU count
  # SkyPilot provisions. Run the server in the background and log to a
  # file so the Gradio server below can start once the API is up.
  python3 -u -m vllm.entrypoints.openai.api_server \
    --port 8081 \
    --model $MODEL_NAME \
    --trust-remote-code --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64 \
    2>&1 | tee api_server.log &

  echo 'Waiting for vllm api server to start...'
  while ! grep -q 'Uvicorn running on' api_server.log; do sleep 1; done

  # Serve Gradio
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
  python3 vllm/examples/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
    --stop-token-ids 128009,128001
  # Optionally append --share to the command above for a public Gradio link.
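
# Example usage (endpoint is a placeholder): after `sky serve up`, get the
# endpoint from `sky serve status` and send an OpenAI-style request:
#   curl http://<endpoint>/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "meta-llama/Meta-Llama-3-8B", "messages": [{"role": "user", "content": "Hello! What is your name?"}]}'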