diff --git a/docs/swarms/examples/vllm.md b/docs/swarms/examples/vllm.md
index 11df0aab..b7f0a365 100644
--- a/docs/swarms/examples/vllm.md
+++ b/docs/swarms/examples/vllm.md
@@ -47,13 +47,15 @@ Here's a complete example of setting up the stock analysis swarm:
 from swarms import Agent, ConcurrentWorkflow
 from swarms.utils.vllm_wrapper import VLLMWrapper
 
-# Initialize the VLLM wrapper
+# Initialize the VLLM wrapper (model loads when used)
 vllm = VLLMWrapper(
     model_name="meta-llama/Llama-2-7b-chat-hf",
     system_prompt="You are a helpful assistant.",
 )
 ```
 
+The model is initialized when `run()` or `batched_run()` is first called.
+
 !!! note "Model Selection"
     The example uses Llama-2-7b-chat, but you can use any VLLM-compatible model. Make sure you have the necessary permissions and resources to run your chosen model.
 
diff --git a/docs/swarms/examples/vllm_integration.md b/docs/swarms/examples/vllm_integration.md
index c270e954..b3517fd5 100644
--- a/docs/swarms/examples/vllm_integration.md
+++ b/docs/swarms/examples/vllm_integration.md
@@ -28,7 +28,7 @@ Here's a simple example of how to use vLLM with Swarms:
 ```python title="basic_usage.py"
 from swarms.utils.vllm_wrapper import VLLMWrapper
 
-# Initialize the vLLM wrapper
+# Initialize the vLLM wrapper (model loads on first use)
 vllm = VLLMWrapper(
     model_name="meta-llama/Llama-2-7b-chat-hf",
     system_prompt="You are a helpful assistant.",
@@ -41,6 +41,8 @@ response = vllm.run("What is the capital of France?")
 print(response)
 ```
 
+The first call to `run()` lazily loads the model weights.
+
 ## VLLMWrapper Class
 
 !!! abstract "Class Overview"
diff --git a/examples/demos/swarms_of_vllm.py b/examples/demos/swarms_of_vllm.py
index 89191ab0..9e8591d1 100644
--- a/examples/demos/swarms_of_vllm.py
+++ b/examples/demos/swarms_of_vllm.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
-# Initialize the VLLM wrapper
+# Initialize the VLLM wrapper (model loads lazily on first run)
 vllm = VLLMWrapper(
     model_name="meta-llama/Llama-2-7b-chat-hf",
     system_prompt="You are a helpful assistant.",
diff --git a/examples/models/vllm_example.py b/examples/models/vllm_example.py
index 0545354f..98725c56 100644
--- a/examples/models/vllm_example.py
+++ b/examples/models/vllm_example.py
@@ -2,9 +2,9 @@ from swarms.utils.vllm_wrapper import VLLMWrapper
 
 
 def main():
-    # Initialize the vLLM wrapper with a model
-    # Note: You'll need to have the model downloaded or specify a HuggingFace model ID
-    llm = VLLMWrapper(
+    # Initialize the vLLM wrapper.
+    # The actual model weights load lazily on the first call to `run()`.
+    llm = VLLMWrapper(
         model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
         temperature=0.7,
         max_tokens=1000,
@@ -17,8 +17,8 @@ def main():
     response = llm.run(task)
     print("Response:", response)
 
-    # Example with system prompt
-    llm_with_system = VLLMWrapper(
+    # Example with system prompt. Model initialization is still lazy.
+    llm_with_system = VLLMWrapper(
         model_name="meta-llama/Llama-2-7b-chat-hf",  # Replace with your model path or HF model ID
         system_prompt="You are a helpful AI assistant that provides concise answers.",
         temperature=0.7,
diff --git a/swarms/utils/vllm_wrapper.py b/swarms/utils/vllm_wrapper.py
index 8a114ad1..ae8fa3dc 100644
--- a/swarms/utils/vllm_wrapper.py
+++ b/swarms/utils/vllm_wrapper.py
@@ -61,12 +61,22 @@ class VLLMWrapper:
         self.tool_choice = tool_choice
         self.parallel_tool_calls = parallel_tool_calls
 
-        # Initialize vLLM
-        self.llm = LLM(model=model_name, **kwargs)
-        self.sampling_params = SamplingParams(
-            temperature=temperature,
-            max_tokens=max_tokens,
-        )
+        # store kwargs for later lazy initialization
+        self._llm_kwargs = kwargs
+
+        # Initialize attributes for lazy loading
+        self.llm = None
+        self.sampling_params = None
+
+    def _ensure_initialized(self):
+        """Lazily initialize the underlying vLLM objects if needed."""
+        if self.llm is None:
+            self.llm = LLM(model=self.model_name, **self._llm_kwargs)
+        if self.sampling_params is None:
+            self.sampling_params = SamplingParams(
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+            )
 
     def _prepare_prompt(self, task: str) -> str:
         """
@@ -82,9 +92,9 @@ class VLLMWrapper:
             return f"{self.system_prompt}\n\nUser: {task}\nAssistant:"
         return f"User: {task}\nAssistant:"
 
-    def run(self, task: str, *args, **kwargs) -> str:
-        """
-        Run the model for the given task.
+    def run(self, task: str, *args, **kwargs) -> str:
+        """
+        Run the model for the given task.
 
         Args:
             task (str): The task to run the model for.
@@ -94,10 +104,11 @@
         Returns:
             str: The model's response.
         """
-        try:
-            prompt = self._prepare_prompt(task)
-
-            outputs = self.llm.generate(prompt, self.sampling_params)
+        try:
+            self._ensure_initialized()
+            prompt = self._prepare_prompt(task)
+
+            outputs = self.llm.generate(prompt, self.sampling_params)
             response = outputs[0].outputs[0].text.strip()
             return response
 
@@ -120,9 +131,9 @@
         """
         return self.run(task, *args, **kwargs)
 
-    def batched_run(
-        self, tasks: List[str], batch_size: int = 10
-    ) -> List[str]:
+    def batched_run(
+        self, tasks: List[str], batch_size: int = 10
+    ) -> List[str]:
         """
         Run the model for multiple tasks in batches.
 
         Args:
 
         Returns:
             List[str]: List of model responses.
         """
+        self._ensure_initialized()
         # Calculate the worker count based on 95% of available CPU cores
         num_workers = max(1, int((os.cpu_count() or 1) * 0.95))
         with concurrent.futures.ThreadPoolExecutor(
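A minimal sketch of the lazy-initialization behavior these hunks introduce, using only the constructor arguments and methods visible in the diff (`model_name`, `system_prompt`, `run()`, `batched_run()`, and the `llm` attribute that now starts as `None`); the check on the internal attribute is illustrative only, not part of the documented API:

```python
from swarms.utils.vllm_wrapper import VLLMWrapper

# Constructing the wrapper is now cheap: no vLLM engine or SamplingParams
# are created here (both attributes start as None).
vllm = VLLMWrapper(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    system_prompt="You are a helpful assistant.",
)
assert vllm.llm is None  # engine not built yet (internal attribute, for illustration)

# The first call to run() (or batched_run()) invokes _ensure_initialized(),
# which builds the LLM engine and SamplingParams before generating.
print(vllm.run("What is the capital of France?"))
print(vllm.batched_run(["Summarize vLLM in one sentence.", "Name one LLM."], batch_size=2))
```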