Implement lazy initialization for VLLMWrapper

pull/925/head^2
Pavan Kumar 1 month ago
parent 2770b8c7bf
commit 7f3a854bb2

File diff suppressed because it is too large

@@ -47,13 +47,15 @@ Here's a complete example of setting up the stock analysis swarm:
from swarms import Agent, ConcurrentWorkflow
from swarms.utils.vllm_wrapper import VLLMWrapper

# Initialize the VLLM wrapper
# Initialize the VLLM wrapper (model loads when used)
vllm = VLLMWrapper(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    system_prompt="You are a helpful assistant.",
)
```
The model is initialized when `run()` or `batched_run()` is first called.

!!! note "Model Selection"
    The example uses Llama-2-7b-chat, but you can use any VLLM-compatible model. Make sure you have the necessary permissions and resources to run your chosen model.

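For reference, a minimal sketch of what the new behavior looks like from the caller's side in the stock-analysis example: constructing `VLLMWrapper` only records its arguments, and the heavyweight vLLM engine is built on the first `run()`. The prompt text below is illustrative, not part of the documented example.

```python
from swarms.utils.vllm_wrapper import VLLMWrapper

# Cheap: no model weights are loaded here, only the arguments are stored.
vllm = VLLMWrapper(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    system_prompt="You are a helpful assistant.",
)

# The first run() call builds the vLLM engine, then generates.
print(vllm.run("Give a two-sentence outlook for a diversified tech portfolio."))
```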
@@ -28,7 +28,7 @@ Here's a simple example of how to use vLLM with Swarms:
```python title="basic_usage.py"
from swarms.utils.vllm_wrapper import VLLMWrapper

# Initialize the vLLM wrapper
# Initialize the vLLM wrapper (model loads on first use)
vllm = VLLMWrapper(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    system_prompt="You are a helpful assistant.",
@@ -41,6 +41,8 @@ response = vllm.run("What is the capital of France?")
print(response)
```
The first call to `run()` lazily loads the model weights.

## VLLMWrapper Class

!!! abstract "Class Overview"

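The documentation change earlier in this commit also names `batched_run()` as a lazy entry point. A short sketch of that path, assuming `batched_run()` accepts a list of task strings as its docstring later in this diff suggests:

```python
from swarms.utils.vllm_wrapper import VLLMWrapper

vllm = VLLMWrapper(model_name="meta-llama/Llama-2-7b-chat-hf")

# Still nothing loaded. The first batched_run() initializes the engine once,
# then fans the tasks out across worker threads.
tasks = [
    "What is the capital of France?",
    "What is the capital of Japan?",
]
print(vllm.batched_run(tasks))
```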
@@ -4,7 +4,7 @@ from dotenv import load_dotenv

load_dotenv()

# Initialize the VLLM wrapper
# Initialize the VLLM wrapper (model loads lazily on first run)
vllm = VLLMWrapper(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    system_prompt="You are a helpful assistant.",

@@ -2,8 +2,8 @@ from swarms.utils.vllm_wrapper import VLLMWrapper

def main():
    # Initialize the vLLM wrapper with a model
    # Note: You'll need to have the model downloaded or specify a HuggingFace model ID
    # Initialize the vLLM wrapper.
    # The actual model weights load lazily on the first call to `run()`.
    llm = VLLMWrapper(
        model_name="meta-llama/Llama-2-7b-chat-hf", # Replace with your model path or HF model ID
        temperature=0.7,
@@ -17,7 +17,7 @@ def main():
    response = llm.run(task)
    print("Response:", response)

    # Example with system prompt
    # Example with system prompt. Model initialization is still lazy.
    llm_with_system = VLLMWrapper(
        model_name="meta-llama/Llama-2-7b-chat-hf", # Replace with your model path or HF model ID
        system_prompt="You are a helpful AI assistant that provides concise answers.",

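A quick way to confirm the lazy behavior in the example above: right after construction the wrapper's internal engine handle is still unset (the diff below shows `__init__` now assigns `self.llm = None`), and it is populated only once `run()` executes. This sketch pokes at an internal attribute, so treat it as a debugging aid rather than documented API:

```python
from swarms.utils.vllm_wrapper import VLLMWrapper

llm = VLLMWrapper(model_name="meta-llama/Llama-2-7b-chat-hf")
assert llm.llm is None          # __init__ only records kwargs now

_ = llm.run("Ping")             # first call runs _ensure_initialized()
assert llm.llm is not None      # engine exists from here on
```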
@@ -61,12 +61,22 @@ class VLLMWrapper:
        self.tool_choice = tool_choice
        self.parallel_tool_calls = parallel_tool_calls
        # Initialize vLLM
        self.llm = LLM(model=model_name, **kwargs)
        self.sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        # store kwargs for later lazy initialization
        self._llm_kwargs = kwargs
        # Initialize attributes for lazy loading
        self.llm = None
        self.sampling_params = None

    def _ensure_initialized(self):
        """Lazily initialize the underlying vLLM objects if needed."""
        if self.llm is None:
            self.llm = LLM(model=self.model_name, **self._llm_kwargs)
        if self.sampling_params is None:
            self.sampling_params = SamplingParams(
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )

    def _prepare_prompt(self, task: str) -> str:
        """
@@ -95,6 +105,7 @@ class VLLMWrapper:
            str: The model's response.
        """
        try:
            self._ensure_initialized()
            prompt = self._prepare_prompt(task)
            outputs = self.llm.generate(prompt, self.sampling_params)
@@ -133,6 +144,7 @@ class VLLMWrapper:
        Returns:
            List[str]: List of model responses.
        """
        self._ensure_initialized()
        # Calculate the worker count based on 95% of available CPU cores
        num_workers = max(1, int((os.cpu_count() or 1) * 0.95))
        with concurrent.futures.ThreadPoolExecutor(
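Outside the diff itself, the change boils down to a guard-on-first-use pattern. A self-contained sketch of the same idea with the vLLM dependency stubbed out (all names here are illustrative, not part of the Swarms codebase):

```python
from typing import Any, Optional


class LazyResource:
    """Store constructor arguments cheaply; build the expensive object on first use."""

    def __init__(self, **kwargs: Any) -> None:
        self._kwargs = kwargs                 # cheap: remember how to build it
        self._resource: Optional[dict] = None

    def _ensure_initialized(self) -> None:
        if self._resource is None:            # only the first caller pays the cost
            self._resource = {"built_with": self._kwargs}

    def run(self, task: str) -> str:
        self._ensure_initialized()
        return f"handled {task!r} using {self._resource['built_with']}"


if __name__ == "__main__":
    r = LazyResource(model_name="demo")
    print(r.run("first call builds the resource"))
    print(r.run("second call reuses it"))
```

Note that the guard is unsynchronized. In this commit that looks acceptable because `batched_run()` calls `_ensure_initialized()` once before handing tasks to the thread pool; if concurrent first calls to `run()` were expected, wrapping the guard in a `threading.Lock` would be a reasonable hardening step.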
