"""
|
|
GPU Model Manager
|
|
================
|
|
|
|
A production-grade utility for managing multiple PyTorch or Hugging Face models
|
|
across available GPUs. This module automatically calculates model memory requirements,
|
|
allocates models to appropriate GPUs, and provides a unified interface for running
|
|
inference tasks across all loaded models.
|
|
|
|
Features:
|
|
- Dynamic model memory calculation
|
|
- Optimal GPU memory allocation
|
|
- Multi-processing support for parallel model execution
|
|
- Customizable task execution for specific models
|
|
- Comprehensive logging and error handling
|
|
"""
|
|
|
|
import os
import queue
import sys
import time
import json
import uuid
import multiprocessing
from typing import Dict, List, Union, Optional, Any, Iterator
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from contextlib import contextmanager

import torch
import numpy as np
from loguru import logger

# Try to import transformers, but don't fail if not available
try:
    import transformers
    from transformers import AutoModel, AutoTokenizer

    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logger.warning(
        "Transformers package not found. HuggingFace models will not be supported."
    )


class ModelType(Enum):
    """Enum defining supported model types."""

    PYTORCH = "pytorch"
    HUGGINGFACE = "huggingface"
    UNKNOWN = "unknown"


class GPUAllocationStrategy(Enum):
    """Enum defining GPU allocation strategies."""

    FILL_GPU = "fill_gpu"  # Fill each GPU before moving to the next
    DISTRIBUTE = "distribute"  # Distribute models evenly across GPUs
    MEMORY_OPTIMIZED = "memory_optimized"  # Optimize for memory usage


@dataclass
class ModelMetadata:
    """Data class for storing model metadata."""

    name: str
    model_type: ModelType
    memory_required: float  # in GB
    model: Any
    device: Optional[torch.device] = None
    process: Optional[multiprocessing.Process] = None
    loaded: bool = False


@dataclass
class GPUMetadata:
    """Data class for storing GPU metadata."""

    id: int
    device: torch.device
    total_memory: float  # in GB
    available_memory: float  # in GB
    models: Optional[List[str]] = None

    def __post_init__(self):
        if self.models is None:
            self.models = []


class ModelMemoryCalculator:
    """Utility class for calculating model memory requirements."""

    @staticmethod
    def get_pytorch_model_size(model: torch.nn.Module) -> float:
        """
        Calculate the memory size of a PyTorch model in GB.

        Args:
            model: PyTorch model object

        Returns:
            Memory size in GB
        """
        try:
            # Get the total number of parameters
            model_parameters = sum(p.numel() for p in model.parameters())

            # Determine bytes per parameter based on dtype (default to float32)
            if any(p.dtype == torch.float16 for p in model.parameters()):
                bytes_per_param = 2  # float16
            elif any(p.dtype == torch.bfloat16 for p in model.parameters()):
                bytes_per_param = 2  # bfloat16
            elif any(p.dtype == torch.float64 for p in model.parameters()):
                bytes_per_param = 8  # float64
            else:
                bytes_per_param = 4  # float32

            # Raw model size in bytes
            model_size_bytes = model_parameters * bytes_per_param

            # Add 20% for optimizer states, gradients, and other overhead
            model_size_bytes_with_overhead = model_size_bytes * 1.2

            # Convert to GB
            model_size_gb = model_size_bytes_with_overhead / (1024**3)

            # Add a safety margin of 10%
            return model_size_gb * 1.1

        except Exception as e:
            logger.error(f"Error calculating PyTorch model size: {str(e)}")
            # Fallback estimation
            return 2.0  # Default estimation if calculation fails

    @staticmethod
    def get_huggingface_model_size(model_or_path: Union[str, Any]) -> float:
        """
        Calculate the memory size of a Hugging Face model in GB.
        Works with either a model path or a loaded model.

        Args:
            model_or_path: Hugging Face model object or path to model

        Returns:
            Memory size in GB
        """
        if not TRANSFORMERS_AVAILABLE:
            logger.error(
                "Transformers package not available. Cannot calculate Hugging Face model size."
            )
            return 5.0  # Default fallback

        try:
            # If it's a path, try to estimate without loading the model
            if isinstance(model_or_path, str):
                path = Path(model_or_path)
                if path.exists():
                    # Check for a parameter count in the model config
                    config_path = path / "config.json"
                    if config_path.exists():
                        with open(config_path, "r") as f:
                            config = json.load(f)
                        if "n_params" in config:
                            n_params = config["n_params"]
                            # Estimate assuming float32 weights plus 50% overhead
                            return (n_params * 4 * 1.5) / (1024**3)

                    # Alternatively, estimate from the checkpoint files on disk
                    # (.bin and .safetensors shards)
                    weight_files = list(path.glob("*.bin")) + list(
                        path.glob("*.safetensors")
                    )
                    if weight_files:
                        total_size = sum(f.stat().st_size for f in weight_files)
                        return (total_size * 1.5) / (1024**3)  # 50% overhead

                # If we can't estimate from files, load the model and calculate
                logger.info(
                    f"Loading model from {model_or_path} to calculate memory requirements..."
                )
                model = AutoModel.from_pretrained(model_or_path)
                return ModelMemoryCalculator.get_pytorch_model_size(model)
            else:
                # If the model is already loaded, calculate directly
                return ModelMemoryCalculator.get_pytorch_model_size(model_or_path)

        except Exception as e:
            logger.error(f"Error calculating Hugging Face model size: {str(e)}")
            return 5.0  # Default estimation if calculation fails


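# A minimal sketch (not part of the module's API) of what the estimate above
# works out to for a small float32 model: parameter count x 4 bytes, plus the
# 20% overhead and 10% safety margin applied by get_pytorch_model_size.
#
# tiny = torch.nn.Linear(1024, 1024)  # 1,049,600 parameters (weights + bias)
# est_gb = ModelMemoryCalculator.get_pytorch_model_size(tiny)
# # Roughly 1_049_600 * 4 bytes * 1.2 * 1.1 / 1024**3 ~= 0.0052 GB
# print(f"Estimated footprint: {est_gb:.4f} GB")

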
class GPUManager:
    """Manages available GPUs and their memory."""

    def __init__(self):
        """Initialize the GPU manager."""
        self.gpus: List[GPUMetadata] = []
        self._initialize_gpus()

    def _initialize_gpus(self) -> None:
        """Initialize available GPUs and collect their metadata."""
        if not torch.cuda.is_available():
            logger.warning("No CUDA-capable devices detected.")
            return

        num_gpus = torch.cuda.device_count()
        logger.info(f"Found {num_gpus} CUDA-capable devices.")

        for gpu_id in range(num_gpus):
            device = torch.device(f"cuda:{gpu_id}")

            # Get total memory
            total_memory = torch.cuda.get_device_properties(gpu_id).total_memory
            total_memory_gb = total_memory / (1024**3)

            # Get available (free) memory
            torch.cuda.set_device(device)
            torch.cuda.empty_cache()
            available_memory = torch.cuda.mem_get_info(device)[0]
            available_memory_gb = available_memory / (1024**3)

            # Create GPU metadata
            gpu_metadata = GPUMetadata(
                id=gpu_id,
                device=device,
                total_memory=total_memory_gb,
                available_memory=available_memory_gb,
            )

            self.gpus.append(gpu_metadata)
            logger.info(
                f"GPU {gpu_id}: {total_memory_gb:.2f} GB total, "
                f"{available_memory_gb:.2f} GB available"
            )

    def update_gpu_memory_info(self) -> None:
        """Update the available memory information for all GPUs."""
        if not self.gpus:
            logger.warning("No GPUs available to update memory information.")
            return

        for gpu in self.gpus:
            torch.cuda.set_device(gpu.device)
            torch.cuda.empty_cache()
            available_memory = torch.cuda.mem_get_info(gpu.device)[0]
            gpu.available_memory = available_memory / (1024**3)
            logger.debug(
                f"Updated GPU {gpu.id}: {gpu.available_memory:.2f} GB available"
            )


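# A minimal sketch (assuming at least one CUDA device is visible) of using
# GPUManager on its own to inspect the detected GPUs:
#
# gpu_manager = GPUManager()
# for gpu in gpu_manager.gpus:
#     print(f"GPU {gpu.id}: {gpu.available_memory:.2f} / {gpu.total_memory:.2f} GB free")

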
class ModelGrid:
    """
    Main class for managing multiple models across available GPUs.

    This class handles:
    - Loading and unloading models
    - Allocating models to appropriate GPUs based on memory requirements
    - Running inference tasks on specific models
    - Managing model lifecycle through multiple processes
    """

    def __init__(
        self,
        allocation_strategy: GPUAllocationStrategy = GPUAllocationStrategy.MEMORY_OPTIMIZED,
        memory_buffer: float = 0.5,  # GB buffer to leave on each GPU
        max_cpu_models: int = 0,  # Maximum models to keep on CPU if no GPU space
        use_multiprocessing: bool = True,
        log_level: str = "INFO",
    ):
        """
        Initialize the model manager.

        Args:
            allocation_strategy: Strategy for allocating models to GPUs
            memory_buffer: Memory buffer to leave on each GPU (in GB)
            max_cpu_models: Maximum number of models to keep on CPU if no GPU space
            use_multiprocessing: Whether to use multiprocessing for model execution
            log_level: Logging level
        """
        # Configure logging
        logger.remove()
        logger.add(sys.stderr, level=log_level)
        logger.add(
            "gpu_model_manager.log",
            rotation="100 MB",
            retention="1 week",
            level=log_level,
        )

        self.models: Dict[str, ModelMetadata] = {}
        self.gpu_manager = GPUManager()
        self.allocation_strategy = allocation_strategy
        self.memory_buffer = memory_buffer
        self.max_cpu_models = max_cpu_models
        self.use_multiprocessing = use_multiprocessing

        # Initialize locks and queues for multiprocessing
        self.manager = multiprocessing.Manager() if use_multiprocessing else None
        self.task_queues: Dict[str, Any] = (
            self.manager.dict() if use_multiprocessing else {}
        )
        self.result_queues: Dict[str, Any] = (
            self.manager.dict() if use_multiprocessing else {}
        )
        self.model_locks: Dict[str, Any] = {}

        logger.info(
            f"ModelGrid initialized with {len(self.gpu_manager.gpus)} GPUs"
        )
        logger.info(f"Using allocation strategy: {allocation_strategy.value}")

    def add_model(
        self,
        model_name: str,
        model: Any,
        model_type: Optional[ModelType] = None,
        memory_override: Optional[float] = None,
    ) -> bool:
        """
        Add a model to the manager.

        Args:
            model_name: Unique name for the model
            model: The model object or path
            model_type: Type of the model (auto-detected if not provided)
            memory_override: Override the automatic memory calculation (in GB)

        Returns:
            Success status
        """
        if model_name in self.models:
            logger.warning(
                f"Model '{model_name}' already exists. Use update_model to replace it."
            )
            return False

        # Auto-detect model type if not provided
        if model_type is None:
            if isinstance(model, str):
                if os.path.exists(model) and TRANSFORMERS_AVAILABLE:
                    model_type = ModelType.HUGGINGFACE
                else:
                    model_type = ModelType.UNKNOWN
            elif TRANSFORMERS_AVAILABLE and isinstance(
                model, transformers.PreTrainedModel
            ):
                # Check before the generic nn.Module case, since PreTrainedModel
                # is itself a torch.nn.Module subclass.
                model_type = ModelType.HUGGINGFACE
            elif isinstance(model, torch.nn.Module):
                model_type = ModelType.PYTORCH
            else:
                model_type = ModelType.UNKNOWN

        # Calculate memory requirements
        if memory_override is not None:
            memory_required = memory_override
        elif model_type == ModelType.PYTORCH:
            memory_required = ModelMemoryCalculator.get_pytorch_model_size(model)
        elif model_type == ModelType.HUGGINGFACE:
            memory_required = ModelMemoryCalculator.get_huggingface_model_size(model)
        else:
            logger.warning(
                f"Unknown model type for '{model_name}'. Using default memory estimation."
            )
            memory_required = 2.0  # Default estimation

        # Create model metadata
        model_metadata = ModelMetadata(
            name=model_name,
            model_type=model_type,
            memory_required=memory_required,
            model=model,
            loaded=False,
        )

        self.models[model_name] = model_metadata
        logger.info(
            f"Added model '{model_name}' ({model_type.value}) with "
            f"{memory_required:.2f} GB memory requirement"
        )

        # Initialize multiprocessing resources for this model
        if self.use_multiprocessing:
            self.task_queues[model_name] = self.manager.Queue()
            self.result_queues[model_name] = self.manager.Queue()
            self.model_locks[model_name] = self.manager.Lock()

        return True

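    # A minimal sketch of registering models; `manager` stands for a ModelGrid
    # instance, and the path and 6.0 GB override below are illustrative values,
    # not measured requirements:
    #
    #   classifier = torch.nn.Linear(768, 2)
    #   manager.add_model("toy_classifier", classifier, ModelType.PYTORCH)
    #   manager.add_model("opaque_model", "/path/to/model_dir", memory_override=6.0)
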
    def remove_model(self, model_name: str) -> bool:
        """
        Remove a model from the manager.

        Args:
            model_name: Name of the model to remove

        Returns:
            Success status
        """
        if model_name not in self.models:
            logger.warning(f"Model '{model_name}' does not exist.")
            return False

        model_metadata = self.models[model_name]

        # Terminate the model process if running
        if model_metadata.process is not None and model_metadata.process.is_alive():
            logger.info(f"Terminating process for model '{model_name}'")
            model_metadata.process.terminate()
            model_metadata.process.join(timeout=5)
            if model_metadata.process.is_alive():
                logger.warning(
                    f"Process for model '{model_name}' did not terminate gracefully. Killing..."
                )
                model_metadata.process.kill()

        # Remove from GPU if loaded
        if model_metadata.loaded and model_metadata.device is not None:
            gpu_id = model_metadata.device.index
            for gpu in self.gpu_manager.gpus:
                if gpu.id == gpu_id and model_name in gpu.models:
                    gpu.models.remove(model_name)
                    logger.info(f"Removed model '{model_name}' from GPU {gpu_id}")

            # Update GPU memory info
            self.gpu_manager.update_gpu_memory_info()

        # Clean up multiprocessing resources
        if self.use_multiprocessing:
            if model_name in self.task_queues:
                del self.task_queues[model_name]
            if model_name in self.result_queues:
                del self.result_queues[model_name]
            if model_name in self.model_locks:
                del self.model_locks[model_name]

        # Remove model metadata
        del self.models[model_name]

        logger.info(f"Removed model '{model_name}'")
        return True

    def _find_best_gpu_for_model(
        self, model_metadata: ModelMetadata
    ) -> Optional[GPUMetadata]:
        """
        Find the best GPU for a given model based on the allocation strategy.

        Args:
            model_metadata: Metadata for the model

        Returns:
            Best GPU metadata, or None if no suitable GPU is found
        """
        model_memory = model_metadata.memory_required + self.memory_buffer

        # Update GPU memory info before allocation
        self.gpu_manager.update_gpu_memory_info()

        # Find available GPUs that can fit the model
        available_gpus = [
            gpu
            for gpu in self.gpu_manager.gpus
            if gpu.available_memory >= model_memory
        ]

        if not available_gpus:
            logger.warning(
                f"No GPU with sufficient memory for model '{model_metadata.name}' "
                f"(requires {model_memory:.2f} GB)"
            )
            return None

        # Apply allocation strategy
        if self.allocation_strategy == GPUAllocationStrategy.FILL_GPU:
            # Prefer GPUs that already host models (fill a GPU before spilling
            # to the next one), breaking ties toward the GPU with less free memory
            return sorted(
                available_gpus,
                key=lambda g: (-len(g.models), g.available_memory),
            )[0]

        elif self.allocation_strategy == GPUAllocationStrategy.DISTRIBUTE:
            # Prefer the GPU hosting the fewest models
            return sorted(available_gpus, key=lambda g: len(g.models))[0]

        elif self.allocation_strategy == GPUAllocationStrategy.MEMORY_OPTIMIZED:
            # Best fit: the GPU with the least available memory that still fits
            return sorted(available_gpus, key=lambda g: g.available_memory)[0]

        # Default fallback
        return available_gpus[0]

    def allocate_all_models(self) -> Dict[str, Optional[int]]:
        """
        Allocate all models to GPUs based on the allocation strategy.

        Returns:
            Dict mapping model names to allocated GPU IDs (or None if kept on CPU)
        """
        # Sort models by memory requirement (descending) so the largest models
        # are placed first
        sorted_models = sorted(
            self.models.values(),
            key=lambda m: m.memory_required,
            reverse=True,
        )

        allocations = {}

        for model_metadata in sorted_models:
            best_gpu = self._find_best_gpu_for_model(model_metadata)

            if best_gpu is not None:
                # Allocate model to GPU
                gpu_id = best_gpu.id
                model_metadata.device = best_gpu.device
                best_gpu.models.append(model_metadata.name)
                best_gpu.available_memory -= (
                    model_metadata.memory_required + self.memory_buffer
                )

                allocations[model_metadata.name] = gpu_id
                logger.info(
                    f"Allocated model '{model_metadata.name}' to GPU {gpu_id} "
                    f"({best_gpu.available_memory:.2f} GB remaining)"
                )
            else:
                # No suitable GPU found; keep the model on CPU if allowed
                cpu_model_count = len(
                    [m for m in allocations.values() if m is None]
                )
                if cpu_model_count < self.max_cpu_models:
                    model_metadata.device = None
                    allocations[model_metadata.name] = None
                    logger.info(
                        f"Keeping model '{model_metadata.name}' on CPU (no suitable GPU)"
                    )
                else:
                    logger.warning(
                        f"Cannot allocate model '{model_metadata.name}'. "
                        f"No GPU space available and max_cpu_models limit reached."
                    )

        return allocations

    def load_model(self, model_name: str) -> bool:
        """
        Load a specific model onto its allocated device.

        Args:
            model_name: Name of the model to load

        Returns:
            Success status
        """
        if model_name not in self.models:
            logger.warning(f"Model '{model_name}' does not exist.")
            return False

        model_metadata = self.models[model_name]

        # Skip if already loaded
        if model_metadata.loaded:
            logger.info(f"Model '{model_name}' is already loaded.")
            return True

        # Allocate to a GPU if not already allocated
        if model_metadata.device is None:
            best_gpu = self._find_best_gpu_for_model(model_metadata)
            if best_gpu is not None:
                model_metadata.device = best_gpu.device
                best_gpu.models.append(model_name)
                best_gpu.available_memory -= (
                    model_metadata.memory_required + self.memory_buffer
                )
                logger.info(
                    f"Allocated model '{model_name}' to GPU {best_gpu.id} "
                    f"({best_gpu.available_memory:.2f} GB remaining)"
                )

        try:
            device_str = (
                "cpu" if model_metadata.device is None else str(model_metadata.device)
            )
            logger.info(f"Loading model '{model_name}' to {device_str}")

            # Load based on model type
            if model_metadata.model_type == ModelType.PYTORCH:
                if isinstance(model_metadata.model, torch.nn.Module):
                    model_metadata.model.to(model_metadata.device or "cpu")
                else:
                    logger.error(
                        f"Model '{model_name}' is not a valid PyTorch module."
                    )
                    return False

            elif model_metadata.model_type == ModelType.HUGGINGFACE:
                if TRANSFORMERS_AVAILABLE:
                    if isinstance(model_metadata.model, str):
                        # Load from path
                        logger.info(
                            f"Loading HuggingFace model from {model_metadata.model}"
                        )
                        loaded_model = AutoModel.from_pretrained(model_metadata.model)
                        loaded_model.to(model_metadata.device or "cpu")
                        model_metadata.model = loaded_model
                    elif isinstance(
                        model_metadata.model, transformers.PreTrainedModel
                    ):
                        # Move the existing model to the target device
                        model_metadata.model.to(model_metadata.device or "cpu")
                    else:
                        logger.error(
                            f"Model '{model_name}' is not a valid HuggingFace model."
                        )
                        return False
                else:
                    logger.error(
                        "Transformers package not available. Cannot load HuggingFace model."
                    )
                    return False
            else:
                logger.error(f"Unknown model type for '{model_name}'.")
                return False

            model_metadata.loaded = True

            # Start the model process if using multiprocessing
            if self.use_multiprocessing:
                self._start_model_process(model_name)

            logger.info(f"Successfully loaded model '{model_name}'")
            return True

        except Exception as e:
            logger.error(f"Error loading model '{model_name}': {str(e)}")
            # Roll back the GPU allocation if loading failed
            if model_metadata.device is not None:
                gpu_id = model_metadata.device.index
                for gpu in self.gpu_manager.gpus:
                    if gpu.id == gpu_id and model_name in gpu.models:
                        gpu.models.remove(model_name)
                        gpu.available_memory += (
                            model_metadata.memory_required + self.memory_buffer
                        )
                model_metadata.device = None

            self.gpu_manager.update_gpu_memory_info()
            return False

    def unload_model(self, model_name: str) -> bool:
        """
        Unload a specific model from its device.

        Args:
            model_name: Name of the model to unload

        Returns:
            Success status
        """
        if model_name not in self.models:
            logger.warning(f"Model '{model_name}' does not exist.")
            return False

        model_metadata = self.models[model_name]

        # Skip if not loaded
        if not model_metadata.loaded:
            logger.info(f"Model '{model_name}' is not loaded.")
            return True

        try:
            # Stop the model process if using multiprocessing
            if self.use_multiprocessing and model_metadata.process is not None:
                logger.info(f"Stopping process for model '{model_name}'")
                model_metadata.process.terminate()
                model_metadata.process.join(timeout=5)
                if model_metadata.process.is_alive():
                    logger.warning(
                        f"Process for model '{model_name}' did not terminate gracefully. Killing..."
                    )
                    model_metadata.process.kill()
                model_metadata.process = None

            # Move the model off the GPU and release its allocation
            if (
                model_metadata.device is not None
                and model_metadata.device.type == "cuda"
            ):
                logger.info(
                    f"Unloading model '{model_name}' from {model_metadata.device}"
                )

                # Update GPU allocation
                gpu_id = model_metadata.device.index
                for gpu in self.gpu_manager.gpus:
                    if gpu.id == gpu_id and model_name in gpu.models:
                        gpu.models.remove(model_name)
                        gpu.available_memory += (
                            model_metadata.memory_required + self.memory_buffer
                        )

                # Move the model to CPU if it's a PyTorch module
                if isinstance(model_metadata.model, torch.nn.Module):
                    model_metadata.model.to("cpu")

                # Clear the CUDA cache
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            model_metadata.device = None
            model_metadata.loaded = False

            # Update GPU memory info
            self.gpu_manager.update_gpu_memory_info()

            logger.info(f"Successfully unloaded model '{model_name}'")
            return True

        except Exception as e:
            logger.error(f"Error unloading model '{model_name}': {str(e)}")
            return False

    def load_all_models(self) -> Dict[str, bool]:
        """
        Load all models onto their allocated devices.

        Returns:
            Dict mapping model names to load success status
        """
        # First allocate all models
        self.allocate_all_models()

        # Then load each model
        results = {}
        for model_name in self.models:
            results[model_name] = self.load_model(model_name)

        return results

    def unload_all_models(self) -> Dict[str, bool]:
        """
        Unload all models from their devices.

        Returns:
            Dict mapping model names to unload success status
        """
        results = {}
        for model_name in self.models:
            results[model_name] = self.unload_model(model_name)

        return results

    def _start_model_process(self, model_name: str) -> bool:
        """
        Start a dedicated process for a model.

        Args:
            model_name: Name of the model

        Returns:
            Success status
        """
        if not self.use_multiprocessing:
            logger.warning(
                "Multiprocessing is disabled. Cannot start model process."
            )
            return False

        if model_name not in self.models:
            logger.warning(f"Model '{model_name}' does not exist.")
            return False

        model_metadata = self.models[model_name]

        if model_metadata.process is not None and model_metadata.process.is_alive():
            logger.info(f"Process for model '{model_name}' is already running.")
            return True

        try:
            # Create a new process for the model
            process = multiprocessing.Process(
                target=self._model_process_worker,
                args=(
                    model_name,
                    model_metadata.model_type,
                    self.task_queues[model_name],
                    self.result_queues[model_name],
                    (
                        model_metadata.device.index
                        if model_metadata.device is not None
                        else None
                    ),
                ),
                daemon=True,
            )

            process.start()
            model_metadata.process = process

            logger.info(
                f"Started process for model '{model_name}' (PID: {process.pid})"
            )
            return True

        except Exception as e:
            logger.error(
                f"Error starting process for model '{model_name}': {str(e)}"
            )
            return False

    def _model_process_worker(
        self,
        model_name: str,
        model_type: ModelType,
        task_queue: multiprocessing.Queue,
        result_queue: multiprocessing.Queue,
        gpu_id: Optional[int],
    ) -> None:
        """
        Worker function for model processes.

        Args:
            model_name: Name of the model
            model_type: Type of the model
            task_queue: Queue for receiving tasks
            result_queue: Queue for sending results
            gpu_id: GPU device ID, or None for CPU
        """
        try:
            # Configure the device for this worker
            if gpu_id is not None:
                device = torch.device(f"cuda:{gpu_id}")
                torch.cuda.set_device(device)
            else:
                device = torch.device("cpu")

            logger.info(f"Model process for '{model_name}' started on {device}")

            # Process tasks from the queue
            while True:
                try:
                    # Get a task from the queue with a timeout
                    task_id, task_type, task_data = task_queue.get(timeout=1.0)

                    logger.debug(
                        f"Model '{model_name}' processing task {task_id}: {task_type}"
                    )

                    # Process the task based on task_type
                    try:
                        if task_type == "run_model":
                            # Run the model on the task data.
                            # This would be implemented based on the specific model type.
                            result = {
                                "status": "success",
                                "result": "Model output placeholder",
                            }
                        else:
                            result = {
                                "status": "error",
                                "error": f"Unknown task type: {task_type}",
                            }
                    except Exception as e:
                        logger.error(
                            f"Error processing task {task_id} for model '{model_name}': {str(e)}"
                        )
                        result = {"status": "error", "error": str(e)}

                    # Send the result back
                    result_queue.put((task_id, result))
                    logger.debug(f"Model '{model_name}' completed task {task_id}")

                except queue.Empty:
                    # No tasks in the queue; keep polling
                    continue

        except KeyboardInterrupt:
            logger.info(f"Model process for '{model_name}' interrupted")
        except Exception as e:
            logger.error(f"Error in model process for '{model_name}': {str(e)}")
        finally:
            logger.info(f"Model process for '{model_name}' exiting")

    @contextmanager
    def _model_lock(self, model_name: str) -> Iterator[None]:
        """
        Context manager for acquiring a model's lock.

        Args:
            model_name: Name of the model
        """
        if not self.use_multiprocessing or model_name not in self.model_locks:
            # No-op if not using multiprocessing
            yield
            return

        lock = self.model_locks[model_name]
        try:
            lock.acquire()
            yield
        finally:
            lock.release()

    def run(
        self,
        task: Union[str, List[str]],
        model_names: Optional[List[str]] = None,
        input_data: Any = None,
        timeout: float = 30.0,
    ) -> Dict[str, Any]:
        """
        Run a task on specific models or on all loaded models.

        Args:
            task: Task name or list of task names to run
            model_names: List of model names to run the task on (None for all loaded models)
            input_data: Input data for the task
            timeout: Timeout in seconds

        Returns:
            Dict mapping model names to results
        """
        # Normalize task to a list
        if isinstance(task, str):
            tasks = [task]
        else:
            tasks = task

        # Determine which models to run on
        if model_names is None:
            target_models = [
                name for name, meta in self.models.items() if meta.loaded
            ]
        else:
            target_models = [
                name
                for name in model_names
                if name in self.models and self.models[name].loaded
            ]

        if not target_models:
            logger.warning("No loaded models available for running tasks.")
            return {}

        logger.info(
            f"Running tasks {tasks} on models: {', '.join(target_models)}"
        )

        results = {}

        # Run tasks on each model
        for model_name in target_models:
            model_metadata = self.models[model_name]

            try:
                if self.use_multiprocessing and model_metadata.process is not None:
                    # Run in the model's dedicated process
                    results[model_name] = self._run_in_process(
                        model_name, tasks, input_data, timeout
                    )
                else:
                    # Run in the current process
                    results[model_name] = self._run_in_current_process(
                        model_name, tasks, input_data
                    )

            except Exception as e:
                logger.error(
                    f"Error running tasks on model '{model_name}': {str(e)}"
                )
                results[model_name] = {"status": "error", "error": str(e)}

        return results

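    # A minimal sketch of invoking run(); `manager` stands for a ModelGrid
    # instance with "small_model" already added and loaded, and "run_model" is
    # the only task type the placeholder worker currently recognizes:
    #
    #   results = manager.run(
    #       "run_model",
    #       model_names=["small_model"],
    #       input_data=torch.randn(1, 10),
    #       timeout=10.0,
    #   )
    #   for name, result in results.items():
    #       print(name, result.get("status"))
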
    def _run_in_process(
        self,
        model_name: str,
        tasks: List[str],
        input_data: Any,
        timeout: float,
    ) -> Dict[str, Any]:
        """
        Run tasks on a model in its dedicated process.

        Note: only the first task in `tasks` is dispatched to the worker; the
        queue protocol carries a single task type per message.

        Args:
            model_name: Name of the model
            tasks: List of tasks to run
            input_data: Input data for the tasks
            timeout: Timeout in seconds

        Returns:
            Task results
        """
        task_id = str(uuid.uuid4())
        task_queue = self.task_queues[model_name]
        result_queue = self.result_queues[model_name]

        # Send the task to the model process
        task_queue.put((task_id, tasks[0], input_data))

        # Wait for the result
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                # Check whether a result is available
                result_task_id, result = result_queue.get(block=False)

                if result_task_id == task_id:
                    return result
                else:
                    # Put back results belonging to other tasks
                    result_queue.put((result_task_id, result))

            except queue.Empty:
                # No results yet; wait a bit
                time.sleep(0.1)

        # Timeout
        logger.warning(f"Timeout waiting for tasks on model '{model_name}'")
        return {"status": "error", "error": "Timeout"}

    def _run_in_current_process(
        self, model_name: str, tasks: List[str], input_data: Any
    ) -> Dict[str, Any]:
        """
        Run tasks on a model in the current process.

        Args:
            model_name: Name of the model
            tasks: List of tasks to run
            input_data: Input data for the tasks

        Returns:
            Task results
        """
        model_metadata = self.models[model_name]

        with self._model_lock(model_name):
            try:
                # This would need to be implemented based on the specific model
                # types and tasks supported. Here's a simple placeholder:
                if model_metadata.model_type == ModelType.PYTORCH:
                    # Run PyTorch model
                    return {
                        "status": "success",
                        "result": "PyTorch model output placeholder",
                    }

                elif model_metadata.model_type == ModelType.HUGGINGFACE:
                    # Run Hugging Face model
                    return {
                        "status": "success",
                        "result": "Hugging Face model output placeholder",
                    }

                else:
                    return {
                        "status": "error",
                        "error": f"Unsupported model type: {model_metadata.model_type}",
                    }

            except Exception as e:
                logger.error(
                    f"Error running tasks on model '{model_name}': {str(e)}"
                )
                return {"status": "error", "error": str(e)}

    def get_gpu_status(self) -> List[Dict[str, Any]]:
        """
        Get status information for all GPUs.

        Returns:
            List of GPU status dictionaries
        """
        # Update GPU memory info
        self.gpu_manager.update_gpu_memory_info()

        gpu_status = []
        for gpu in self.gpu_manager.gpus:
            status = {
                "id": gpu.id,
                "total_memory": gpu.total_memory,
                "available_memory": gpu.available_memory,
                "used_memory": gpu.total_memory - gpu.available_memory,
                "utilization": (gpu.total_memory - gpu.available_memory)
                / gpu.total_memory,
                "models": gpu.models,
            }
            gpu_status.append(status)

        return gpu_status

    def get_model_status(self) -> Dict[str, Dict[str, Any]]:
        """
        Get status information for all models.

        Returns:
            Dict mapping model names to status dictionaries
        """
        model_status = {}
        for name, metadata in self.models.items():
            status = {
                "name": name,
                "type": metadata.model_type.value,
                "memory_required": metadata.memory_required,
                "loaded": metadata.loaded,
                "device": (
                    str(metadata.device) if metadata.device is not None else "cpu"
                ),
                "process_running": metadata.process is not None
                and metadata.process.is_alive(),
            }
            model_status[name] = status

        return model_status


class ModelWithCustomRunMethod:
    """
    Base class for models with custom run methods.

    Extend this class to implement custom run methods for specific model types.
    """

    def __init__(self, model: Any, device: Optional[torch.device] = None):
        """
        Initialize the model wrapper.

        Args:
            model: The model object
            device: Device to run the model on
        """
        self.model = model
        self.device = device

    def run(self, task: str, input_data: Any) -> Any:
        """
        Run a task on the model.

        Args:
            task: Task name
            input_data: Input data for the task

        Returns:
            Task result
        """
        raise NotImplementedError("Subclasses must implement this method")


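# A minimal sketch of extending ModelWithCustomRunMethod for a hypothetical
# model type; the "embed" task name and the `encoder` attribute are
# illustrative and not part of this module:
#
# class SentenceEncoderWrapper(ModelWithCustomRunMethod):
#     def run(self, task: str, input_data: Any) -> Any:
#         if task == "embed":
#             with torch.no_grad():
#                 return self.model.encoder(input_data.to(self.device))
#         raise ValueError(f"Unsupported task: {task}")

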
class PyTorchModelWrapper(ModelWithCustomRunMethod):
    """Wrapper for PyTorch models with custom run methods."""

    def run(self, task: str, input_data: Any) -> Any:
        """
        Run a task on a PyTorch model.

        Args:
            task: Task name
            input_data: Input data for the task

        Returns:
            Task result
        """
        # Example implementation for common PyTorch tasks
        if task == "forward":
            # Ensure the model is in eval mode
            self.model.eval()

            # Convert input to a tensor on the target device if needed
            if not isinstance(input_data, torch.Tensor):
                if isinstance(input_data, np.ndarray):
                    input_tensor = torch.from_numpy(input_data).to(self.device)
                else:
                    input_tensor = torch.tensor(input_data).to(self.device)
            else:
                input_tensor = input_data.to(self.device)

            # Run the forward pass
            with torch.no_grad():
                output = self.model(input_tensor)

            # Convert the output to numpy if needed
            if isinstance(output, torch.Tensor):
                return output.cpu().numpy()
            else:
                return output

        elif task == "predict":
            # Similar to forward, but with different post-processing
            self.model.eval()

            # Convert input to a tensor on the target device if needed
            if not isinstance(input_data, torch.Tensor):
                if isinstance(input_data, np.ndarray):
                    input_tensor = torch.from_numpy(input_data).to(self.device)
                else:
                    input_tensor = torch.tensor(input_data).to(self.device)
            else:
                input_tensor = input_data.to(self.device)

            # Run the prediction
            with torch.no_grad():
                output = self.model(input_tensor)

            # Apply softmax if the output looks like classification logits
            if len(output.shape) > 1 and output.shape[1] > 1:
                probs = torch.nn.functional.softmax(output, dim=1)
                predicted_class = torch.argmax(probs, dim=1)
                return {
                    "probabilities": probs.cpu().numpy(),
                    "predicted_class": predicted_class.cpu().numpy(),
                }
            else:
                return output.cpu().numpy()
        else:
            raise ValueError(f"Unsupported task: {task}")


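# A minimal usage sketch for PyTorchModelWrapper; the network, device, and
# batch shape below are illustrative:
#
# net = torch.nn.Sequential(
#     torch.nn.Linear(10, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2)
# )
# wrapper = PyTorchModelWrapper(net, device=torch.device("cpu"))
# prediction = wrapper.run("predict", torch.randn(3, 10))
# print(prediction["predicted_class"])

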
class HuggingFaceModelWrapper(ModelWithCustomRunMethod):
    """Wrapper for Hugging Face models with custom run methods."""

    def run(self, task: str, input_data: Any) -> Any:
        """
        Run a task on a Hugging Face model.

        Args:
            task: Task name
            input_data: Input data for the task

        Returns:
            Task result
        """
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError("Transformers package not available.")

        # Example implementation for common Hugging Face tasks
        if task == "generate":
            # Generate text
            return self.model.generate(**input_data)

        elif task == "encode":
            # Encode text
            return self.model.encode(input_data)

        elif task == "predict":
            # Make predictions
            return self.model(**input_data)

        else:
            raise ValueError(f"Unsupported task: {task}")


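# A minimal usage sketch for HuggingFaceModelWrapper, assuming transformers is
# installed. Note that the "generate" task assumes a generation-capable model
# (e.g. a causal LM); a plain encoder such as bert-base-uncased is only
# suitable for "predict":
#
# if TRANSFORMERS_AVAILABLE:
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     model = AutoModel.from_pretrained("bert-base-uncased")
#     wrapper = HuggingFaceModelWrapper(model, device=torch.device("cpu"))
#     inputs = tokenizer("GPU model manager test", return_tensors="pt")
#     outputs = wrapper.run("predict", inputs)

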
# Example usage
# if __name__ == "__main__":
#     # Initialize the model manager
#     manager = ModelGrid(
#         allocation_strategy=GPUAllocationStrategy.MEMORY_OPTIMIZED,
#         memory_buffer=0.5,
#         max_cpu_models=1,
#         use_multiprocessing=True,
#         log_level="INFO",
#     )
#
#     # Add models
#     model1 = torch.nn.Sequential(
#         torch.nn.Linear(10, 10),
#         torch.nn.ReLU(),
#         torch.nn.Linear(10, 2),
#     )
#     manager.add_model("small_model", model1, ModelType.PYTORCH)
#
#     # Add more models if available
#     if TRANSFORMERS_AVAILABLE:
#         manager.add_model(
#             "bert_model", "bert-base-uncased", ModelType.HUGGINGFACE
#         )
#
#     # Allocate and load models
#     manager.load_all_models()
#
#     # Print GPU status
#     print("GPU Status:")
#     for gpu in manager.get_gpu_status():
#         print(
#             f"GPU {gpu['id']}: {gpu['available_memory']:.2f} GB / {gpu['total_memory']:.2f} GB"
#         )
#         print(f"  Models: {', '.join(gpu['models'])}")
#
#     # Run a task on all models
#     # NOTE: with multiprocessing enabled, the placeholder worker only
#     # recognizes the "run_model" task type.
#     results = manager.run("forward", input_data=torch.randn(1, 10))
#
#     # Unload all models
#     manager.unload_all_models()