Merge a3087da815 into 11365f24fe
commit
aec3e56aed
@ -0,0 +1,376 @@
|
||||
"""
|
||||
OpenTelemetry integration for Swarms framework.
|
||||
|
||||
Provides distributed tracing, metrics, and logging capabilities across
|
||||
agents and multi-agent structures using OpenTelemetry standards.
|
||||
|
||||
Configuration via environment variables:
|
||||
OTEL_SERVICE_NAME: Service name (default: "swarms")
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: OTLP endpoint URL
|
||||
OTEL_EXPORTER_OTLP_HEADERS: Headers for OTLP exporter (JSON format)
|
||||
OTEL_TRACES_EXPORTER: Traces exporter (default: "otlp")
|
||||
OTEL_METRICS_EXPORTER: Metrics exporter (default: "otlp")
|
||||
OTEL_LOGS_EXPORTER: Logs exporter (default: "otlp")
|
||||
OTEL_ENABLED: Enable/disable OpenTelemetry (default: "true")
|
||||
OTEL_SDK_DISABLED: Disable OpenTelemetry SDK (default: "false")
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
from functools import wraps
|
||||
|
||||
from loguru import logger
|
||||
|
||||
_otel_available = False
|
||||
_tracer = None
|
||||
_meter = None
|
||||
_logger = None
|
||||
|
||||
try:
|
||||
from opentelemetry import trace, metrics, _logs
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.sdk._logs import LoggerProvider
|
||||
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
|
||||
OTLPSpanExporter,
|
||||
)
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
|
||||
OTLPMetricExporter,
|
||||
)
|
||||
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import (
|
||||
OTLPLogExporter,
|
||||
)
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
from opentelemetry.trace.propagation.tracecontext import (
|
||||
TraceContextTextMapPropagator,
|
||||
)
|
||||
|
||||
_otel_available = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def _is_otel_enabled() -> bool:
|
||||
"""Check if OpenTelemetry is enabled via environment variables."""
|
||||
if os.getenv("OTEL_SDK_DISABLED", "false").lower() == "true":
|
||||
return False
|
||||
return os.getenv("OTEL_ENABLED", "true").lower() == "true"
|
||||
|
||||
|
||||
def _parse_headers(headers_str: str) -> Dict[str, str]:
|
||||
"""Parse headers from JSON string or key=value format."""
|
||||
import json
|
||||
|
||||
try:
|
||||
return json.loads(headers_str)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
headers = {}
|
||||
for pair in headers_str.split(","):
|
||||
if "=" in pair:
|
||||
key, value = pair.split("=", 1)
|
||||
headers[key.strip()] = value.strip()
|
||||
return headers
|
||||
|
||||
|
||||
def _initialize_otel():
|
||||
"""Initialize OpenTelemetry SDK with configuration from environment variables."""
|
||||
global _tracer, _meter, _logger
|
||||
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
return False
|
||||
|
||||
try:
|
||||
service_name = os.getenv("OTEL_SERVICE_NAME", "swarms")
|
||||
resource = Resource.create({"service.name": service_name})
|
||||
|
||||
trace_provider = TracerProvider(resource=resource)
|
||||
otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
headers = {}
|
||||
|
||||
if otlp_endpoint:
|
||||
headers = _parse_headers(
|
||||
os.getenv("OTEL_EXPORTER_OTLP_HEADERS", "{}")
|
||||
)
|
||||
span_exporter = OTLPSpanExporter(
|
||||
endpoint=otlp_endpoint,
|
||||
headers=headers,
|
||||
)
|
||||
trace_provider.add_span_processor(BatchSpanProcessor(span_exporter))
|
||||
|
||||
trace.set_tracer_provider(trace_provider)
|
||||
_tracer = trace.get_tracer(__name__)
|
||||
|
||||
if otlp_endpoint:
|
||||
meter_provider = MeterProvider(
|
||||
resource=resource,
|
||||
metric_readers=[
|
||||
PeriodicExportingMetricReader(
|
||||
OTLPMetricExporter(
|
||||
endpoint=otlp_endpoint,
|
||||
headers=headers,
|
||||
)
|
||||
)
|
||||
],
|
||||
)
|
||||
metrics.set_meter_provider(meter_provider)
|
||||
_meter = metrics.get_meter(__name__)
|
||||
|
||||
logger_provider = LoggerProvider(resource=resource)
|
||||
log_exporter = OTLPLogExporter(
|
||||
endpoint=otlp_endpoint,
|
||||
headers=headers,
|
||||
)
|
||||
logger_provider.add_log_record_processor(
|
||||
BatchLogRecordProcessor(log_exporter)
|
||||
)
|
||||
_logs.set_logger_provider(logger_provider)
|
||||
_logger = _logs.get_logger(__name__)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to initialize OpenTelemetry: {e}")
|
||||
return False
|
||||
|
||||
|
||||
if _otel_available and _is_otel_enabled():
|
||||
_initialize_otel()
|
||||
|
||||
|
||||
def get_tracer(name: Optional[str] = None):
|
||||
"""Get OpenTelemetry tracer instance."""
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
return None
|
||||
return trace.get_tracer(name or __name__)
|
||||
|
||||
|
||||
def get_meter(name: Optional[str] = None):
|
||||
"""Get OpenTelemetry meter instance."""
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
return None
|
||||
return metrics.get_meter(name or __name__)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def trace_span(
|
||||
name: str,
|
||||
attributes: Optional[Dict[str, Any]] = None,
|
||||
kind: Optional[Any] = None,
|
||||
):
|
||||
"""
|
||||
Context manager for creating a trace span.
|
||||
|
||||
Args:
|
||||
name: Span name
|
||||
attributes: Dictionary of span attributes
|
||||
kind: Span kind (INTERNAL, SERVER, CLIENT, etc.)
|
||||
"""
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
yield None
|
||||
return
|
||||
|
||||
tracer = get_tracer()
|
||||
if not tracer:
|
||||
yield None
|
||||
return
|
||||
|
||||
span_kind = kind if kind is not None else trace.SpanKind.INTERNAL
|
||||
span = tracer.start_span(name=name, kind=span_kind)
|
||||
|
||||
if attributes:
|
||||
for key, value in attributes.items():
|
||||
try:
|
||||
span.set_attribute(key, str(value))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
with trace.use_span(span):
|
||||
yield span
|
||||
except Exception as e:
|
||||
try:
|
||||
span.record_exception(e)
|
||||
span.set_status(trace.Status(trace.StatusCode.ERROR, str(e)))
|
||||
except Exception:
|
||||
pass
|
||||
raise
|
||||
finally:
|
||||
try:
|
||||
span.end()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def trace_function(
|
||||
span_name: Optional[str] = None,
|
||||
attributes: Optional[Dict[str, Any]] = None,
|
||||
capture_args: bool = True,
|
||||
):
|
||||
"""
|
||||
Decorator to trace function execution.
|
||||
|
||||
Args:
|
||||
span_name: Custom span name (defaults to function name)
|
||||
attributes: Additional attributes to add to span
|
||||
capture_args: Whether to capture function arguments as attributes
|
||||
"""
|
||||
|
||||
def decorator(func: Callable):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
return func(*args, **kwargs)
|
||||
|
||||
name = span_name or f"{func.__module__}.{func.__name__}"
|
||||
span_attrs = (attributes or {}).copy()
|
||||
|
||||
if capture_args:
|
||||
import inspect
|
||||
|
||||
try:
|
||||
sig = inspect.signature(func)
|
||||
bound = sig.bind(*args, **kwargs)
|
||||
bound.apply_defaults()
|
||||
|
||||
for param_name, param_value in bound.arguments.items():
|
||||
if param_name != "self":
|
||||
try:
|
||||
span_attrs[
|
||||
f"function.{param_name}"
|
||||
] = str(param_value)[:200]
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
with trace_span(name, span_attrs):
|
||||
start_time = time.time()
|
||||
try:
|
||||
result = func(*args, **kwargs)
|
||||
execution_time = time.time() - start_time
|
||||
|
||||
record_metric(
|
||||
"function.execution.time",
|
||||
execution_time,
|
||||
{"function": func.__name__},
|
||||
)
|
||||
return result
|
||||
except Exception as e:
|
||||
record_metric(
|
||||
"function.execution.errors",
|
||||
1,
|
||||
{
|
||||
"function": func.__name__,
|
||||
"error_type": type(e).__name__,
|
||||
},
|
||||
)
|
||||
raise
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
def record_metric(
|
||||
name: str,
|
||||
value: float,
|
||||
attributes: Optional[Dict[str, str]] = None,
|
||||
metric_type: str = "histogram",
|
||||
):
|
||||
"""
|
||||
Record a metric value.
|
||||
|
||||
Args:
|
||||
name: Metric name
|
||||
value: Metric value
|
||||
attributes: Metric attributes/labels
|
||||
metric_type: Type of metric ("counter", "gauge", "histogram")
|
||||
"""
|
||||
if not _otel_available or not _is_otel_enabled() or not _meter:
|
||||
return
|
||||
|
||||
try:
|
||||
attrs = attributes or {}
|
||||
|
||||
if metric_type == "counter":
|
||||
counter = _meter.create_counter(name)
|
||||
counter.add(value, attributes=attrs)
|
||||
elif metric_type == "gauge":
|
||||
gauge = _meter.create_up_down_counter(name)
|
||||
gauge.add(value, attributes=attrs)
|
||||
elif metric_type == "histogram":
|
||||
histogram = _meter.create_histogram(name)
|
||||
histogram.record(value, attributes=attrs)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def get_current_trace_context() -> Optional[Dict[str, str]]:
|
||||
"""Get current trace context for propagation."""
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
return None
|
||||
|
||||
try:
|
||||
propagator = TraceContextTextMapPropagator()
|
||||
context_dict = {}
|
||||
propagator.inject(context_dict)
|
||||
return context_dict
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def set_trace_context(context: Dict[str, str]):
|
||||
"""Set trace context from external source (for distributed tracing)."""
|
||||
if not _otel_available or not _is_otel_enabled():
|
||||
return
|
||||
|
||||
try:
|
||||
propagator = TraceContextTextMapPropagator()
|
||||
propagator.extract(context)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def log_event(
|
||||
message: str,
|
||||
level: str = "INFO",
|
||||
attributes: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Log an event with OpenTelemetry logging.
|
||||
|
||||
Args:
|
||||
message: Log message
|
||||
level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||
attributes: Additional attributes
|
||||
"""
|
||||
if not _otel_available or not _is_otel_enabled() or not _logger:
|
||||
logger.log(level, message)
|
||||
return
|
||||
|
||||
try:
|
||||
from opentelemetry._logs import SeverityNumber
|
||||
|
||||
severity_map = {
|
||||
"DEBUG": SeverityNumber.DEBUG,
|
||||
"INFO": SeverityNumber.INFO,
|
||||
"WARNING": SeverityNumber.WARNING,
|
||||
"ERROR": SeverityNumber.ERROR,
|
||||
"CRITICAL": SeverityNumber.CRITICAL,
|
||||
}
|
||||
|
||||
_logger.emit(
|
||||
_logs.LogRecord(
|
||||
body=message,
|
||||
severity_number=severity_map.get(level, SeverityNumber.INFO),
|
||||
attributes=attributes or {},
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
logger.log(level, message)
|
||||
|
||||
Loading…
Reference in new issue