envs:
  # MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  MODEL_NAME: meta-llama/Meta-Llama-3-8B
  HF_TOKEN: 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 # Change to your own huggingface token, or use --env to pass.
  HF_HUB_ENABLE_HF_TRANSFER: True

# Service configuration
service:
  readiness_probe:
    path: /v1/chat/completions # Path for the readiness probe
    post_data:
      model: $MODEL_NAME # Specify the model name
      messages:
        - role: user
          content: Hello! What is your name? # Specify the initial message
      max_tokens: 1 # Maximum number of tokens
  # Alternative, lighter-weight readiness probe (replaces the block above; YAML
  # does not allow two readiness_probe keys):
  # readiness_probe: /v1/health

  # Replica Policy
  replica_policy:
    min_replicas: 1 # Minimum number of replicas
    max_replicas: 10 # Maximum number of replicas
    target_qps_per_replica: 2.5 # Target queries per second per replica
    upscale_delay_seconds: 200 # Delay before upscaling replicas
    downscale_delay_seconds: 1200 # Delay before downscaling replicas

resources:
  # accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  accelerators: {A10g, A10, L40, A40} # We can use cheaper accelerators for the 8B model.
  # cpus: 32+
  use_spot: True
  disk_size: 100 # Ensure model checkpoints can fit.
  # disk_tier: best
  ports: 8081 # Expose to internet traffic.

setup: |
  # Install vLLM in a dedicated conda environment.
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi

  pip install vllm==0.4.0.post1
  # Install Gradio for web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
  pip install hf_transfer

run: |
  # Serve vLLM with an OpenAI-compatible API.
  conda activate vllm
  echo 'Starting vllm api server...'

  # https://github.com/vllm-project/vllm/issues/3098
  export PATH=$PATH:/sbin

  # NOTE: --gpu-memory-utilization 0.95 needed for 4-GPU nodes.
  # Run the API server on the exposed port (8081) in the background so the
  # Gradio server below can start afterwards.
  python3 -u -m vllm.entrypoints.openai.api_server \
    --port 8081 \
    --model $MODEL_NAME \
    --trust-remote-code --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64 \
    2>&1 | tee api_server.log &

  echo 'Waiting for vllm api server to start...'
  while ! grep -q 'Uvicorn running on' api_server.log; do sleep 1; done

  # Serve the Gradio web UI, pointing it at the local vLLM server.
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
  python3 vllm/examples/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
    --stop-token-ids 128009,128001
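
# ---------------------------------------------------------------------------
# Querying the service: a minimal sketch, assuming this file was launched with
# `sky serve up <this-file> -n llama3` (the service name `llama3` and the file
# name are placeholders; substitute your own). The vLLM server exposes an
# OpenAI-compatible /v1/chat/completions endpoint on the service port (8081):
#
#   ENDPOINT=$(sky serve status --endpoint llama3)
#   curl http://$ENDPOINT/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{
#           "model": "meta-llama/Meta-Llama-3-8B",
#           "messages": [{"role": "user", "content": "Hello! What is your name?"}],
#           "max_tokens": 64
#         }'
# ---------------------------------------------------------------------------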