[FEAT][check_device]

pull/334/head
Kye 1 year ago
parent d66621a9e1
commit 385d2df93a

@@ -1,12 +1,19 @@
import torch
from swarms.models.base_llm import AbstractLLM
import subprocess

try:
    from vllm import LLM, SamplingParams
except ImportError as error:
    print(f"[ERROR] [vLLM] {error}")
    # subprocess.run(["pip", "install", "vllm"])
    # raise error
    raise error

if torch.cuda.is_available() or torch.cuda.device_count() > 0:
    # Download vllm with pip
    try:
        subprocess.run(["pip", "install", "vllm"])
        from vllm import LLM, SamplingParams
    except Exception as error:
        print(f"[ERROR] [vLLM] {error}")
        raise error
else:
    from swarms.models.huggingface import HuggingfaceLLM as LLM

    SamplingParams = None


class vLLM(AbstractLLM):
@@ -83,8 +90,9 @@ class vLLM(AbstractLLM):
            _type_: _description_
        """
        try:
            outputs = self.llm.generate(task, self.sampling_params)
            return outputs
            return self.llm.generate(
                task, self.sampling_params, *args, **kwargs
            )
        except Exception as error:
            print(f"[ERROR] [vLLM] [run] {error}")
            raise error
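
For reference, a minimal usage sketch of the updated run path; the model_name argument and the model string below are illustrative assumptions, since the constructor is not part of this hunk:

    from swarms.models.vllm import vLLM

    # Hypothetical example: model_name and "facebook/opt-13b" are assumed,
    # not taken from this diff; run() forwards to self.llm.generate as above.
    model = vLLM(model_name="facebook/opt-13b")
    output = model.run("What are the three laws of robotics?")
    print(output)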

@@ -0,0 +1,70 @@
import torch
import logging
import warnings
from typing import Union, List, Any
from torch.cuda import memory_allocated, memory_reserved


def check_device(
    log_level: Any = logging.INFO,
    memory_threshold: float = 0.8,
    capability_threshold: float = 3.5,
    return_type: str = "list",
) -> Union[torch.device, List[torch.device]]:
    """
    Checks for the availability of CUDA and returns the appropriate device(s).

    If CUDA is not available, returns a CPU device. If CUDA is available,
    returns a list of all available GPU devices, or a single device when
    return_type="single". Emits a UserWarning when a device's memory usage
    exceeds memory_threshold (as a fraction of total memory) or its compute
    capability is below capability_threshold.
    """
    logging.basicConfig(level=log_level)

    # Check for CUDA availability
    try:
        if not torch.cuda.is_available():
            logging.info("CUDA is not available. Using CPU...")
            return torch.device("cpu")
    except Exception as e:
        logging.error(f"Error checking for CUDA availability: {e}")
        return torch.device("cpu")

    logging.info("CUDA is available.")

    # Check for multiple GPUs
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        logging.info(f"Multiple GPUs available: {num_gpus}")
        devices = [torch.device(f"cuda:{i}") for i in range(num_gpus)]
    else:
        logging.info("Only one GPU is available.")
        devices = [torch.device("cuda")]

    # Check additional properties for each device
    for device in devices:
        try:
            torch.cuda.set_device(device)
            capability = torch.cuda.get_device_capability(device)
            total_memory = torch.cuda.get_device_properties(
                device
            ).total_memory
            allocated_memory = memory_allocated(device)
            reserved_memory = memory_reserved(device)
            device_name = torch.cuda.get_device_name(device)

            logging.info(
                f"Device: {device}, Name: {device_name}, Compute"
                f" Capability: {capability}, Total Memory:"
                f" {total_memory}, Allocated Memory:"
                f" {allocated_memory}, Reserved Memory:"
                f" {reserved_memory}"
            )

            # Warn when memory usage or compute capability is out of bounds
            if allocated_memory / total_memory > memory_threshold:
                warnings.warn(
                    f"Memory usage for device {device} exceeds threshold"
                    f" {memory_threshold}",
                    UserWarning,
                )
            if capability[0] + capability[1] / 10 < capability_threshold:
                warnings.warn(
                    f"Compute capability for device {device} is below"
                    f" threshold {capability_threshold}",
                    UserWarning,
                )
        except Exception as e:
            logging.error(
                f"Error retrieving properties for device {device}: {e}"
            )

    # Honor the requested return type
    if return_type == "single":
        return devices[0]
    return devices


devices = check_device()
logging.info(f"Using device(s): {devices}")

@@ -0,0 +1,111 @@
import torch
from unittest.mock import MagicMock
import pytest

from swarms.utils.device_checker_cuda import check_device


def test_cuda_not_available(mocker):
    mocker.patch("torch.cuda.is_available", return_value=False)
    device = check_device()
    assert str(device) == "cpu"


def test_single_gpu_available(mocker):
    mocker.patch("torch.cuda.is_available", return_value=True)
    mocker.patch("torch.cuda.device_count", return_value=1)
    devices = check_device()
    assert len(devices) == 1
    assert str(devices[0]) == "cuda"


def test_multiple_gpus_available(mocker):
    mocker.patch("torch.cuda.is_available", return_value=True)
    mocker.patch("torch.cuda.device_count", return_value=2)
    devices = check_device()
    assert len(devices) == 2
    assert str(devices[0]) == "cuda:0"
    assert str(devices[1]) == "cuda:1"


def test_device_properties(mocker):
    mocker.patch("torch.cuda.is_available", return_value=True)
    mocker.patch("torch.cuda.device_count", return_value=1)
    mocker.patch("torch.cuda.set_device")
    mocker.patch(
        "torch.cuda.get_device_capability", return_value=(7, 5)
    )
    mocker.patch(
        "torch.cuda.get_device_properties",
        return_value=MagicMock(total_memory=1000),
    )
    mocker.patch(
        "swarms.utils.device_checker_cuda.memory_allocated",
        return_value=200,
    )
    mocker.patch(
        "swarms.utils.device_checker_cuda.memory_reserved",
        return_value=300,
    )
    mocker.patch(
        "torch.cuda.get_device_name", return_value="Tesla K80"
    )
    devices = check_device()
    assert len(devices) == 1
    assert str(devices[0]) == "cuda"


def test_memory_threshold(mocker):
    mocker.patch("torch.cuda.is_available", return_value=True)
    mocker.patch("torch.cuda.device_count", return_value=1)
    mocker.patch("torch.cuda.set_device")
    mocker.patch(
        "torch.cuda.get_device_capability", return_value=(7, 5)
    )
    mocker.patch(
        "torch.cuda.get_device_properties",
        return_value=MagicMock(total_memory=1000),
    )
    mocker.patch(
        "swarms.utils.device_checker_cuda.memory_allocated",
        return_value=900,  # 90% of total memory
    )
    mocker.patch(
        "swarms.utils.device_checker_cuda.memory_reserved",
        return_value=300,
    )
    mocker.patch(
        "torch.cuda.get_device_name", return_value="Tesla K80"
    )
    with pytest.warns(
        UserWarning,
        match=r"Memory usage for device cuda exceeds threshold",
    ):
        devices = check_device(
            memory_threshold=0.8
        )  # Set memory threshold to 80%
    assert len(devices) == 1
    assert str(devices[0]) == "cuda"


def test_compute_capability_threshold(mocker):
    mocker.patch("torch.cuda.is_available", return_value=True)
    mocker.patch("torch.cuda.device_count", return_value=1)
    mocker.patch("torch.cuda.set_device")
    mocker.patch(
        "torch.cuda.get_device_capability", return_value=(3, 0)
    )  # Compute capability 3.0
    mocker.patch(
        "torch.cuda.get_device_properties",
        return_value=MagicMock(total_memory=1000),
    )
    mocker.patch(
        "swarms.utils.device_checker_cuda.memory_allocated",
        return_value=200,
    )
    mocker.patch(
        "swarms.utils.device_checker_cuda.memory_reserved",
        return_value=300,
    )
    mocker.patch(
        "torch.cuda.get_device_name", return_value="Tesla K80"
    )
    with pytest.warns(
        UserWarning,
        match=(
            r"Compute capability for device cuda is below threshold"
        ),
    ):
        devices = check_device(
            capability_threshold=3.5
        )  # Set compute capability threshold to 3.5
    assert len(devices) == 1
    assert str(devices[0]) == "cuda"


def test_return_single_device(mocker):
    mocker.patch("torch.cuda.is_available", return_value=True)
    mocker.patch("torch.cuda.device_count", return_value=2)
    device = check_device(return_type="single")
    assert isinstance(device, torch.device)
    assert str(device) == "cuda:0"
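
These tests rely on the mocker fixture from the pytest-mock plugin, so pytest-mock must be installed alongside pytest; assuming the file lands under tests/utils/ as mirrored from the import path, a local run would look like:

    pytest tests/utils/test_device_checker_cuda.py -v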