parent
d66621a9e1
commit
385d2df93a
@ -0,0 +1,70 @@
|
||||
import torch
|
||||
import logging
|
||||
from typing import Union, List, Any
|
||||
from torch.cuda import memory_allocated, memory_reserved
|
||||
|
||||
|
||||
def check_device(
    log_level: Any = logging.INFO,
    memory_threshold: float = 0.8,
    capability_threshold: float = 3.5,
    return_type: str = "list",
) -> Union[torch.device, List[torch.device]]:
    """Detect the usable compute device(s) and log their properties.

    If CUDA is unavailable (or checking for it raises), a single CPU
    ``torch.device`` is returned regardless of ``return_type``. Otherwise
    one ``torch.device`` per visible GPU is built, each device's name,
    compute capability and memory figures are logged, and warnings are
    issued for devices that cross the given thresholds.

    Args:
        log_level: Level passed to ``logging.basicConfig``.
        memory_threshold: Fraction of total device memory (0-1). A
            ``UserWarning`` is issued when allocated / total exceeds it.
        capability_threshold: Minimum compute capability expressed as
            ``major + minor / 10`` (e.g. ``3.5``). A ``UserWarning`` is
            issued for devices below it.
        return_type: ``"single"`` returns only the first CUDA device;
            any other value (default ``"list"``) returns the full list.

    Returns:
        ``torch.device("cpu")`` when CUDA is unavailable; otherwise a list
        of CUDA devices, or the first CUDA device when
        ``return_type == "single"``.
    """
    # Local import so the block does not depend on a file-level import.
    import warnings

    logging.basicConfig(level=log_level)

    # Keep the try body minimal: only the availability probe can raise here.
    try:
        cuda_available = torch.cuda.is_available()
    except Exception as e:
        # Lazy %-args: the original passed `e` without a placeholder, which
        # makes the logging module report a formatting error instead.
        logging.error("Error checking for CUDA availability: %s", e)
        return torch.device("cpu")

    if not cuda_available:
        logging.info("CUDA is not available. Using CPU...")
        return torch.device("cpu")

    logging.info("CUDA is available.")

    # Build the device list; a single GPU is addressed as plain "cuda".
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        logging.info(f"Multiple GPUs available: {num_gpus}")
        devices = [torch.device(f"cuda:{i}") for i in range(num_gpus)]
    else:
        logging.info("Only one GPU is available.")
        devices = [torch.device("cuda")]

    for device in devices:
        # torch.cuda.set_device is intentionally not called: every query
        # below takes the device explicitly, and probing should not change
        # the process-wide current device as a side effect.
        try:
            capability = torch.cuda.get_device_capability(device)
            total_memory = torch.cuda.get_device_properties(
                device
            ).total_memory
            # Looked up on torch.cuda at call time (not via from-imported
            # names) so that patches of torch.cuda.* take effect.
            allocated_memory = torch.cuda.memory_allocated(device)
            reserved_memory = torch.cuda.memory_reserved(device)
            device_name = torch.cuda.get_device_name(device)
        except Exception as e:
            # Best effort: a device whose properties cannot be read is
            # still returned, it just gets no property log or warnings.
            logging.error(
                "Error retrieving properties for device %s: %s", device, e
            )
            continue

        logging.info(
            f"Device: {device}, Name: {device_name}, Compute"
            f" Capability: {capability}, Total Memory:"
            f" {total_memory}, Allocated Memory:"
            f" {allocated_memory}, Reserved Memory:"
            f" {reserved_memory}"
        )

        # Warn (do not fail) when the device is already heavily used.
        if (
            total_memory > 0
            and allocated_memory / total_memory > memory_threshold
        ):
            warnings.warn(
                f"Memory usage for device {device} exceeds threshold"
                f" ({allocated_memory / total_memory:.0%} >"
                f" {memory_threshold:.0%})",
                UserWarning,
                stacklevel=2,
            )

        # capability is a (major, minor) pair; fold it into major.minor.
        capability_value = capability[0] + capability[1] / 10
        if capability_value < capability_threshold:
            warnings.warn(
                f"Compute capability for device {device} is below threshold"
                f" ({capability_value} < {capability_threshold})",
                UserWarning,
                stacklevel=2,
            )

    if return_type == "single":
        return devices[0]
    return devices
|
||||
|
||||
|
||||
# Probe the hardware once at import time; `devices` remains available as a
# module-level name for anything importing this module.
devices = check_device()
logging.info(f"Using device(s): {devices}")
|
@ -0,0 +1,111 @@
|
||||
import torch
|
||||
from unittest.mock import MagicMock
|
||||
import pytest
|
||||
from swarms.utils.device_checker_cuda import check_device
|
||||
|
||||
|
||||
def test_cuda_not_available(mocker):
    """check_device returns the CPU device when CUDA is reported absent."""
    mocker.patch("torch.cuda.is_available", return_value=False)
    assert str(check_device()) == "cpu"
|
||||
|
||||
|
||||
def test_single_gpu_available(mocker):
    """One visible GPU yields a one-element list holding plain "cuda"."""
    stubs = (
        ("torch.cuda.is_available", True),
        ("torch.cuda.device_count", 1),
    )
    for target, value in stubs:
        mocker.patch(target, return_value=value)

    found = check_device()

    assert len(found) == 1
    assert str(found[0]) == "cuda"
|
||||
|
||||
|
||||
def test_multiple_gpus_available(mocker):
    """Two visible GPUs yield indexed devices "cuda:0" and "cuda:1"."""
    stubs = (
        ("torch.cuda.is_available", True),
        ("torch.cuda.device_count", 2),
    )
    for target, value in stubs:
        mocker.patch(target, return_value=value)

    found = check_device()

    assert len(found) == 2
    first, second = found
    assert str(first) == "cuda:0"
    assert str(second) == "cuda:1"
|
||||
|
||||
|
||||
def test_device_properties(mocker):
    """Device listing still works when every property query is stubbed."""
    # Insertion order of the dict preserves the original patch order.
    stubs = {
        "torch.cuda.is_available": True,
        "torch.cuda.device_count": 1,
        "torch.cuda.get_device_capability": (7, 5),
        "torch.cuda.get_device_properties": MagicMock(total_memory=1000),
        "torch.cuda.memory_allocated": 200,
        "torch.cuda.memory_reserved": 300,
        "torch.cuda.get_device_name": "Tesla K80",
    }
    for target, value in stubs.items():
        mocker.patch(target, return_value=value)

    found = check_device()

    assert len(found) == 1
    assert str(found[0]) == "cuda"
|
||||
|
||||
|
||||
def test_memory_threshold(mocker):
    """A device at 90% allocated memory triggers the 80% threshold warning."""
    # Insertion order of the dict preserves the original patch order.
    stubs = {
        "torch.cuda.is_available": True,
        "torch.cuda.device_count": 1,
        "torch.cuda.get_device_capability": (7, 5),
        "torch.cuda.get_device_properties": MagicMock(total_memory=1000),
        "torch.cuda.memory_allocated": 900,  # 90% of total memory
        "torch.cuda.memory_reserved": 300,
        "torch.cuda.get_device_name": "Tesla K80",
    }
    for target, value in stubs.items():
        mocker.patch(target, return_value=value)

    expected_message = r"Memory usage for device cuda exceeds threshold"
    with pytest.warns(UserWarning, match=expected_message):
        # Set memory threshold to 80%
        found = check_device(memory_threshold=0.8)

    assert len(found) == 1
    assert str(found[0]) == "cuda"
|
||||
|
||||
|
||||
def test_compute_capability_threshold(mocker):
    """A compute-capability 3.0 device triggers the 3.5 threshold warning."""
    # Insertion order of the dict preserves the original patch order.
    stubs = {
        "torch.cuda.is_available": True,
        "torch.cuda.device_count": 1,
        "torch.cuda.get_device_capability": (3, 0),  # Compute capability 3.0
        "torch.cuda.get_device_properties": MagicMock(total_memory=1000),
        "torch.cuda.memory_allocated": 200,
        "torch.cuda.memory_reserved": 300,
        "torch.cuda.get_device_name": "Tesla K80",
    }
    for target, value in stubs.items():
        mocker.patch(target, return_value=value)

    expected_message = (
        r"Compute capability for device cuda is below threshold"
    )
    with pytest.warns(UserWarning, match=expected_message):
        # Set compute capability threshold to 3.5
        found = check_device(capability_threshold=3.5)

    assert len(found) == 1
    assert str(found[0]) == "cuda"
|
||||
|
||||
|
||||
def test_return_single_device(mocker):
    """return_type="single" yields one torch.device, the first GPU."""
    stubs = (
        ("torch.cuda.is_available", True),
        ("torch.cuda.device_count", 2),
    )
    for target, value in stubs:
        mocker.patch(target, return_value=value)

    chosen = check_device(return_type="single")

    assert isinstance(chosen, torch.device)
    assert str(chosen) == "cuda:0"
|
Loading…
Reference in new issue