""" Production-grade AI Vision Pipeline for depth estimation, segmentation, object detection, and 3D point cloud generation. This module provides a comprehensive pipeline that combines MiDaS for depth estimation, SAM (Segment Anything Model) for semantic segmentation, YOLOv8 for object detection, and Open3D for 3D point cloud generation. """ import sys from pathlib import Path from typing import Dict, List, Optional, Tuple, Union, Any import warnings warnings.filterwarnings("ignore") import cv2 import numpy as np import torch import torchvision.transforms as transforms from PIL import Image import open3d as o3d from loguru import logger # Third-party model imports try: import timm from segment_anything import ( SamAutomaticMaskGenerator, sam_model_registry, ) from ultralytics import YOLO except ImportError as e: logger.error(f"Missing required dependencies: {e}") sys.exit(1) class AIVisionPipeline: """ A comprehensive AI vision pipeline that performs depth estimation, semantic segmentation, object detection, and 3D point cloud generation from input images. This class integrates multiple state-of-the-art models: - MiDaS for monocular depth estimation - SAM (Segment Anything Model) for semantic segmentation - YOLOv8 for object detection - Open3D for 3D point cloud generation Attributes: model_dir (Path): Directory where models are stored device (torch.device): Computing device (CPU/CUDA) midas_model: Loaded MiDaS depth estimation model midas_transform: MiDaS preprocessing transforms sam_generator: SAM automatic mask generator yolo_model: YOLOv8 object detection model Example: >>> pipeline = AIVisionPipeline() >>> results = pipeline.process_image("path/to/image.jpg") >>> point_cloud = results["point_cloud"] """ def __init__( self, model_dir: str = "./models", device: Optional[str] = None, midas_model_type: str = "MiDaS", sam_model_type: str = "vit_b", yolo_model_path: str = "yolov8n.pt", log_level: str = "INFO", ) -> None: """ Initialize the AI Vision Pipeline. Args: model_dir: Directory to store downloaded models device: Computing device ('cpu', 'cuda', or None for auto-detection) midas_model_type: MiDaS model variant ('MiDaS', 'MiDaS_small', 'DPT_Large', etc.) sam_model_type: SAM model type ('vit_b', 'vit_l', 'vit_h') yolo_model_path: Path to YOLOv8 model weights log_level: Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR') Raises: RuntimeError: If required models cannot be loaded FileNotFoundError: If model files are not found """ # Setup logging logger.remove() logger.add( sys.stdout, level=log_level, format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", ) # Initialize attributes self.model_dir = Path(model_dir) self.model_dir.mkdir(parents=True, exist_ok=True) # Device setup if device is None: self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) else: self.device = torch.device(device) logger.info(f"Using device: {self.device}") # Model configuration self.midas_model_type = midas_model_type self.sam_model_type = sam_model_type self.yolo_model_path = yolo_model_path # Initialize model placeholders self.midas_model: Optional[torch.nn.Module] = None self.midas_transform: Optional[transforms.Compose] = None self.sam_generator: Optional[SamAutomaticMaskGenerator] = None self.yolo_model: Optional[YOLO] = None # Load all models self._setup_models() logger.success("AI Vision Pipeline initialized successfully") def _setup_models(self) -> None: """ Load and initialize all AI models with proper error handling. 
    def _setup_models(self) -> None:
        """
        Load and initialize all AI models with proper error handling.

        Raises:
            RuntimeError: If any model fails to load
        """
        try:
            self._load_midas_model()
            self._load_sam_model()
            self._load_yolo_model()
        except Exception as e:
            logger.error(f"Failed to set up models: {e}")
            raise RuntimeError(f"Model initialization failed: {e}") from e

    def _load_midas_model(self) -> None:
        """Load the MiDaS depth estimation model."""
        try:
            logger.info(f"Loading MiDaS model: {self.midas_model_type}")

            # Load MiDaS model from torch hub
            self.midas_model = torch.hub.load(
                "intel-isl/MiDaS", self.midas_model_type, pretrained=True
            )
            self.midas_model.to(self.device)
            self.midas_model.eval()

            # Load the transforms matching the chosen model variant
            midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
            if self.midas_model_type in ["DPT_Large", "DPT_Hybrid"]:
                self.midas_transform = midas_transforms.dpt_transform
            elif self.midas_model_type == "MiDaS_small":
                self.midas_transform = midas_transforms.small_transform
            else:
                self.midas_transform = midas_transforms.default_transform

            logger.success("MiDaS model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load MiDaS model: {e}")
            raise

    def _load_sam_model(self) -> None:
        """Load SAM (Segment Anything Model) for semantic segmentation."""
        try:
            logger.info(f"Loading SAM model: {self.sam_model_type}")

            # SAM checkpoint URLs keyed by model type
            sam_checkpoint_urls = {
                "vit_b": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth",
                "vit_l": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
                "vit_h": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
            }

            checkpoint_path = self.model_dir / f"sam_{self.sam_model_type}.pth"

            # Download the checkpoint if it does not exist yet
            if not checkpoint_path.exists():
                logger.info(f"Downloading SAM checkpoint to {checkpoint_path}")
                import urllib.request

                urllib.request.urlretrieve(
                    sam_checkpoint_urls[self.sam_model_type], checkpoint_path
                )

            # Load the SAM model
            sam = sam_model_registry[self.sam_model_type](
                checkpoint=str(checkpoint_path)
            )
            sam.to(self.device)

            # Create the automatic mask generator
            self.sam_generator = SamAutomaticMaskGenerator(
                model=sam,
                points_per_side=32,
                pred_iou_thresh=0.86,
                stability_score_thresh=0.92,
                crop_n_layers=1,
                crop_n_points_downscale_factor=2,
                min_mask_region_area=100,
            )

            logger.success("SAM model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load SAM model: {e}")
            raise

    def _load_yolo_model(self) -> None:
        """Load the YOLOv8 object detection model."""
        try:
            logger.info(f"Loading YOLOv8 model: {self.yolo_model_path}")

            self.yolo_model = YOLO(self.yolo_model_path)

            # Move to the appropriate device
            if self.device.type == "cuda":
                self.yolo_model.to(self.device)

            logger.success("YOLOv8 model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load YOLOv8 model: {e}")
            raise
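    # --- Illustrative helper (not part of the original pipeline) ---
    # A minimal warm-up sketch, assuming the models above loaded successfully:
    # one dummy forward pass per model pays first-call costs (CUDA kernel
    # compilation, weight paging) before real requests arrive. `warmup` is a
    # hypothetical name introduced here for illustration; the comparatively
    # slow SAM pass is deliberately skipped.
    def warmup(self, size: Tuple[int, int] = (480, 640)) -> None:
        """Run the depth and detection models once on a synthetic image."""
        dummy = np.random.randint(0, 256, (*size, 3), dtype=np.uint8)
        self.estimate_depth(dummy)
        self.detect_objects(dummy)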
    def _load_and_preprocess_image(
        self, image_path: Union[str, Path]
    ) -> Tuple[np.ndarray, Image.Image]:
        """
        Load and preprocess an input image.

        Args:
            image_path: Path to the input image (JPG or PNG)

        Returns:
            Tuple of (rgb_image, pil_image)

        Raises:
            FileNotFoundError: If the image file doesn't exist
            ValueError: If the image format is not supported
        """
        image_path = Path(image_path)

        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        if image_path.suffix.lower() not in [".jpg", ".jpeg", ".png"]:
            raise ValueError(f"Unsupported image format: {image_path.suffix}")

        try:
            # Load with OpenCV (BGR format)
            cv_image = cv2.imread(str(image_path))
            if cv_image is None:
                raise ValueError(f"Could not load image: {image_path}")

            # Convert BGR to RGB for PIL and the downstream models
            rgb_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_image)

            logger.debug(f"Loaded image: {image_path} ({rgb_image.shape})")
            return rgb_image, pil_image
        except Exception as e:
            logger.error(f"Failed to load image {image_path}: {e}")
            raise

    def estimate_depth(self, image: np.ndarray) -> np.ndarray:
        """
        Generate a depth map using the MiDaS model.

        Args:
            image: Input image as a numpy array (H, W, 3) in RGB format

        Returns:
            Depth map as a numpy array (H, W), normalized to [0, 1]

        Raises:
            RuntimeError: If depth estimation fails
        """
        try:
            logger.debug("Estimating depth with MiDaS")

            # Preprocess the image for MiDaS
            input_tensor = self.midas_transform(image).to(self.device)

            # Perform inference
            with torch.no_grad():
                depth_map = self.midas_model(input_tensor)
                depth_map = torch.nn.functional.interpolate(
                    depth_map.unsqueeze(1),
                    size=image.shape[:2],
                    mode="bicubic",
                    align_corners=False,
                ).squeeze()

            # Convert to numpy
            depth_numpy = depth_map.cpu().numpy()

            # Normalize depth values to [0, 1]; the epsilon guards against
            # division by zero on a perfectly flat depth map
            depth_numpy = (depth_numpy - depth_numpy.min()) / (
                depth_numpy.max() - depth_numpy.min() + 1e-8
            )

            logger.debug(
                f"Depth estimation completed. Shape: {depth_numpy.shape}"
            )
            return depth_numpy
        except Exception as e:
            logger.error(f"Depth estimation failed: {e}")
            raise RuntimeError(f"Depth estimation error: {e}") from e

    def segment_image(self, image: np.ndarray) -> List[Dict[str, Any]]:
        """
        Perform semantic segmentation using SAM.

        Args:
            image: Input image as a numpy array (H, W, 3) in RGB format

        Returns:
            List of segmentation masks with metadata

        Raises:
            RuntimeError: If segmentation fails
        """
        try:
            logger.debug("Performing segmentation with SAM")

            # Generate masks
            masks = self.sam_generator.generate(image)

            logger.debug(f"Generated {len(masks)} segmentation masks")
            return masks
        except Exception as e:
            logger.error(f"Segmentation failed: {e}")
            raise RuntimeError(f"Segmentation error: {e}") from e
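    # --- Illustrative helper (not part of the original pipeline) ---
    # A minimal sketch of collapsing SAM's list of mask dicts (each carrying
    # a boolean "segmentation" array and an "area" pixel count) into one
    # integer label map for quick visualization; `masks_to_label_map` is a
    # hypothetical name introduced here for illustration only.
    @staticmethod
    def masks_to_label_map(
        masks: List[Dict[str, Any]], shape: Tuple[int, int]
    ) -> np.ndarray:
        """Collapse SAM masks into an (H, W) int label map (0 = background)."""
        label_map = np.zeros(shape, dtype=np.int32)
        # Paint larger masks first so smaller masks remain visible on top
        ordered = sorted(masks, key=lambda m: m["area"], reverse=True)
        for idx, mask in enumerate(ordered, start=1):
            label_map[mask["segmentation"]] = idx
        return label_map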
    def detect_objects(self, image: np.ndarray) -> List[Dict[str, Any]]:
        """
        Perform object detection using YOLOv8.

        Args:
            image: Input image as a numpy array (H, W, 3) in RGB format

        Returns:
            List of detected objects with bounding boxes and confidence scores

        Raises:
            RuntimeError: If object detection fails
        """
        try:
            logger.debug("Performing object detection with YOLOv8")

            # Run inference
            results = self.yolo_model(image, verbose=False)

            # Extract detections
            detections = []
            for result in results:
                boxes = result.boxes
                if boxes is None:
                    continue
                for i in range(len(boxes)):
                    class_id = int(boxes.cls[i].cpu().numpy())
                    detections.append(
                        {
                            "bbox": boxes.xyxy[i].cpu().numpy(),  # [x1, y1, x2, y2]
                            "confidence": float(boxes.conf[i].cpu().numpy()),
                            "class_id": class_id,
                            "class_name": result.names[class_id],
                        }
                    )

            logger.debug(f"Detected {len(detections)} objects")
            return detections
        except Exception as e:
            logger.error(f"Object detection failed: {e}")
            raise RuntimeError(f"Object detection error: {e}") from e

    def generate_point_cloud(
        self,
        image: np.ndarray,
        depth_map: np.ndarray,
        masks: Optional[List[Dict[str, Any]]] = None,
    ) -> o3d.geometry.PointCloud:
        """
        Generate a 3D point cloud from image and depth data.

        Args:
            image: RGB image array (H, W, 3)
            depth_map: Depth map array (H, W)
            masks: Optional segmentation masks for point cloud filtering

        Returns:
            Open3D PointCloud object

        Raises:
            ValueError: If input dimensions don't match
            RuntimeError: If point cloud generation fails
        """
        try:
            logger.debug("Generating 3D point cloud")

            if image.shape[:2] != depth_map.shape:
                raise ValueError("Image and depth map dimensions must match")

            height, width = depth_map.shape

            # Approximate pinhole intrinsics for an uncalibrated camera:
            # focal length ~ image width, principal point at the image center
            fx = fy = width
            cx, cy = width / 2, height / 2

            # Create pixel coordinate grids
            u, v = np.meshgrid(np.arange(width), np.arange(height))

            # MiDaS predicts inverse depth, so invert it to get distances;
            # the small epsilon avoids division by zero
            z = 1.0 / (depth_map + 1e-6)

            # Back-project pixels to 3D camera coordinates
            x = (u - cx) * z / fx
            y = (v - cy) * z / fy

            # Assemble points and per-point colors
            points = np.stack([x.flatten(), y.flatten(), z.flatten()], axis=1)
            colors = image.reshape(-1, 3) / 255.0  # Normalize colors to [0, 1]

            # Filter out invalid points
            valid_mask = np.isfinite(points).all(axis=1) & (z.flatten() > 0)
            points = points[valid_mask]
            colors = colors[valid_mask]

            # Create the Open3D point cloud
            point_cloud = o3d.geometry.PointCloud()
            point_cloud.points = o3d.utility.Vector3dVector(points)
            point_cloud.colors = o3d.utility.Vector3dVector(colors)

            # Optional: restrict the cloud to the largest segmentation mask
            if masks:
                largest_mask = max(masks, key=lambda m: m["area"])
                mask_1d = largest_mask["segmentation"].flatten()[valid_mask]
                point_cloud.points = o3d.utility.Vector3dVector(points[mask_1d])
                point_cloud.colors = o3d.utility.Vector3dVector(colors[mask_1d])

            # Remove statistical outliers
            point_cloud, _ = point_cloud.remove_statistical_outlier(
                nb_neighbors=20, std_ratio=2.0
            )

            logger.debug(
                f"Generated point cloud with {len(point_cloud.points)} points"
            )
            return point_cloud
        except Exception as e:
            logger.error(f"Point cloud generation failed: {e}")
            raise RuntimeError(f"Point cloud generation error: {e}") from e
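    # --- Illustrative helper (not part of the original pipeline) ---
    # A minimal sketch for overlaying the detection dicts produced by
    # `detect_objects` onto the RGB image with OpenCV; `draw_detections` is
    # a hypothetical name introduced here for illustration only.
    @staticmethod
    def draw_detections(
        image: np.ndarray, detections: List[Dict[str, Any]]
    ) -> np.ndarray:
        """Return a copy of `image` with boxes and class labels drawn."""
        annotated = image.copy()
        for det in detections:
            x1, y1, x2, y2 = det["bbox"].astype(int)
            label = f"{det['class_name']} {det['confidence']:.2f}"
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(
                annotated,
                label,
                (x1, max(y1 - 5, 0)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                1,
            )
        return annotated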
    def process_image(self, image_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Process a single image through the complete AI vision pipeline.

        Args:
            image_path: Path to the input image (JPG or PNG)

        Returns:
            Dictionary containing all processing results:
            - 'image': Original RGB image
            - 'depth_map': Depth estimation result
            - 'segmentation_masks': SAM segmentation results
            - 'detections': YOLO object detection results
            - 'point_cloud': Open3D point cloud object
            - 'metadata': Summary statistics for the run

        Raises:
            FileNotFoundError: If the image file doesn't exist
            RuntimeError: If any processing step fails
        """
        try:
            logger.info(f"Processing image: {image_path}")

            # Load and preprocess the image
            rgb_image, pil_image = self._load_and_preprocess_image(image_path)

            # Depth estimation
            depth_map = self.estimate_depth(rgb_image)

            # Semantic segmentation
            segmentation_masks = self.segment_image(rgb_image)

            # Object detection
            detections = self.detect_objects(rgb_image)

            # 3D point cloud generation
            point_cloud = self.generate_point_cloud(
                rgb_image, depth_map, segmentation_masks
            )

            # Compile results
            results = {
                "image": rgb_image,
                "depth_map": depth_map,
                "segmentation_masks": segmentation_masks,
                "detections": detections,
                "point_cloud": point_cloud,
                "metadata": {
                    "image_shape": rgb_image.shape,
                    "num_segments": len(segmentation_masks),
                    "num_detections": len(detections),
                    "num_points": len(point_cloud.points),
                },
            }

            logger.success("Image processing completed successfully")
            logger.info(f"Results: {results['metadata']}")
            return results
        except Exception as e:
            logger.error(f"Image processing failed: {e}")
            raise

    def save_point_cloud(
        self,
        point_cloud: o3d.geometry.PointCloud,
        output_path: Union[str, Path],
    ) -> None:
        """
        Save a point cloud to file.

        Args:
            point_cloud: Open3D PointCloud object
            output_path: Output file path (.ply, .pcd, .xyz)

        Raises:
            RuntimeError: If saving fails
        """
        try:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            success = o3d.io.write_point_cloud(str(output_path), point_cloud)
            if not success:
                raise RuntimeError("Failed to write point cloud file")

            logger.success(f"Point cloud saved to: {output_path}")
        except Exception as e:
            logger.error(f"Failed to save point cloud: {e}")
            raise RuntimeError(f"Point cloud save error: {e}") from e

    def visualize_point_cloud(
        self, point_cloud: o3d.geometry.PointCloud
    ) -> None:
        """
        Visualize a point cloud with the Open3D viewer.

        Args:
            point_cloud: Open3D PointCloud object to visualize
        """
        try:
            logger.info("Opening point cloud visualization")
            o3d.visualization.draw_geometries([point_cloud])
        except Exception as e:
            logger.warning(f"Visualization failed: {e}")


# Example usage and testing
if __name__ == "__main__":
    try:
        # Initialize the pipeline
        pipeline = AIVisionPipeline(model_dir="./models", log_level="INFO")

        # Process an image (replace with an actual image path)
        image_path = "map_two.png"
        if Path(image_path).exists():
            results = pipeline.process_image(image_path)

            # Save the point cloud
            pipeline.save_point_cloud(
                results["point_cloud"], "output_point_cloud.ply"
            )

            # Optional: visualize the point cloud
            pipeline.visualize_point_cloud(results["point_cloud"])

            print(
                f"Processing completed! Generated "
                f"{results['metadata']['num_points']} 3D points"
            )
        else:
            logger.warning(f"Example image not found: {image_path}")
    except Exception as e:
        logger.error(f"Example execution failed: {e}")
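# --- Illustrative usage sketch (not part of the original module) ---
# A minimal batch-processing loop built on the public API above; the
# `images/` and `outputs/` directories are hypothetical placeholders.
#
#   pipeline = AIVisionPipeline()
#   out_dir = Path("outputs")
#   for img in sorted(Path("images").glob("*.jpg")):
#       results = pipeline.process_image(img)
#       pipeline.save_point_cloud(
#           results["point_cloud"], out_dir / f"{img.stem}.ply"
#       )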