from swarms.utils.vllm_wrapper import VLLMWrapper


def main():
    # Initialize the vLLM wrapper with a model
    # Note: You'll need to have the model downloaded or specify a HuggingFace model ID
    llm = VLLMWrapper(
        model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
        temperature=0.7,
        max_tokens=1000,
    )

    # Example task
    task = "What are the benefits of using vLLM for inference?"

    # Run inference
    response = llm.run(task)
    print("Response:", response)

    # Example with system prompt
    llm_with_system = VLLMWrapper(
        model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
        system_prompt="You are a helpful AI assistant that provides concise answers.",
        temperature=0.7,
    )

    # Run inference with system prompt
    response = llm_with_system.run(task)
    print("\nResponse with system prompt:", response)

    # Example with batched inference
    tasks = [
        "What is vLLM?",
        "How does vLLM improve inference speed?",
        "What are the main features of vLLM?",
    ]

    responses = llm.batched_run(tasks, batch_size=2)
    print("\nBatched responses:")
    for task, response in zip(tasks, responses):
        print(f"\nTask: {task}")
        print(f"Response: {response}")
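

# Optional sketch (not part of the original example): comparing sampling temperatures
# with the same wrapper. It reuses only the VLLMWrapper(model_name=..., temperature=...)
# constructor and .run(task) call demonstrated above; nothing else about the API is assumed.
def compare_temperatures(task: str) -> None:
    for temperature in (0.2, 0.7, 1.0):
        llm = VLLMWrapper(
            model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
            temperature=temperature,
        )
        print(f"\nTemperature {temperature}:", llm.run(task))
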
if __name__ == "__main__":
    main()