from swarms.utils.vllm_wrapper import VLLMWrapper


def main():
    # Initialize the vLLM wrapper with a model
    # Note: You'll need to have the model downloaded or specify a HuggingFace model ID
    llm = VLLMWrapper(
        model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
        temperature=0.7,
        max_tokens=1000,
    )

    # Example task
    task = "What are the benefits of using vLLM for inference?"

    # Run inference
    response = llm.run(task)
    print("Response:", response)

    # Example with system prompt
    llm_with_system = VLLMWrapper(
        model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
        system_prompt="You are a helpful AI assistant that provides concise answers.",
        temperature=0.7,
    )

    # Run inference with system prompt
    response = llm_with_system.run(task)
    print("\nResponse with system prompt:", response)

    # Example with batched inference
    tasks = [
        "What is vLLM?",
        "How does vLLM improve inference speed?",
        "What are the main features of vLLM?",
    ]
    responses = llm.batched_run(tasks, batch_size=2)
    print("\nBatched responses:")
    for task, response in zip(tasks, responses):
        print(f"\nTask: {task}")
        print(f"Response: {response}")


if __name__ == "__main__":
    main()