[FEAT][llm_metrics_decorator]

pull/334/head
Kye 1 year ago
parent 4557e1c3fb
commit 950907581c

@@ -1,9 +1,11 @@
import os
import asyncio
import logging
import os
import time
from abc import ABC, abstractmethod
from typing import Optional, List
import asyncio
from typing import List, Optional
from swarms.utils.llm_metrcs_decorator import metrics_decorator
def count_tokens(text: str) -> int:
@@ -118,6 +120,7 @@ class AbstractLLM(ABC):
}
    @abstractmethod
    @metrics_decorator
    def run(self, task: Optional[str] = None, *args, **kwargs) -> str:
        """Generate text using the language model."""
        pass
@@ -381,3 +384,48 @@ class AbstractLLM(ABC):
TOKENS: {_num_tokens}
Tokens/SEC: {_time_for_generation}
"""
    def time_to_first_token(self, prompt: str) -> float:
        """Measure the time until the first token is produced for a prompt.

        Args:
            prompt (str): The prompt to generate from.

        Returns:
            float: Seconds elapsed before the first token is available.
        """
        start_time = time.time()
        tokens = self.track_resource_utilization(
            prompt
        )  # assuming this call triggers token generation
        first_token_time = time.time()
        return first_token_time - start_time

    def generation_latency(self, prompt: str) -> float:
        """Measure the end-to-end generation latency for a prompt.

        Args:
            prompt (str): The prompt to generate from.

        Returns:
            float: Total generation time in seconds.
        """
        start_time = time.time()
        tokens = self.run(prompt)
        end_time = time.time()
        return end_time - start_time

    def throughput(self, prompts: List[str]) -> float:
        """Measure throughput over a batch of prompts.

        Args:
            prompts (List[str]): The prompts to run sequentially.

        Returns:
            float: Prompts processed per second.
        """
        start_time = time.time()
        for prompt in prompts:
            tokens = self.run(prompt)
        end_time = time.time()
        return len(prompts) / (end_time - start_time)
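
A minimal usage sketch of the new metric helpers. The EchoLLM subclass below is hypothetical and not part of this commit; it assumes run is the only abstract method on AbstractLLM and that the base constructor needs no required arguments:

from typing import Optional

class EchoLLM(AbstractLLM):
    """Hypothetical subclass used only to illustrate the metric helpers."""

    def run(self, task: Optional[str] = None, *args, **kwargs) -> str:
        # Pretend generation: echo the task back as the "completion"
        return task or ""

llm = EchoLLM()  # assumes AbstractLLM() needs no required constructor arguments
print(llm.generation_latency("Hello, world!"))  # seconds for a single run() call
print(llm.throughput(["Hello", "How are you?"]))  # prompts processed per second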

@@ -3,4 +3,9 @@ from swarms.structs.sequential_workflow import SequentialWorkflow
from swarms.structs.autoscaler import AutoScaler
from swarms.structs.conversation import Conversation
__all__ = ["Agent", "SequentialWorkflow", "AutoScaler", "Conversation"]
__all__ = [
    "Agent",
    "SequentialWorkflow",
    "AutoScaler",
    "Conversation",
]

@@ -9,22 +9,23 @@ from swarms.structs.base import BaseStructure
class Conversation(BaseStructure):
    """
    Conversation class

    Attributes:
        time_enabled (bool): whether to enable time
        conversation_history (list): list of messages in the conversation

    Examples:
        >>> conv = Conversation()
        >>> conv.add("user", "Hello, world!")
        >>> conv.add("assistant", "Hello, user!")
        >>> conv.display_conversation()
        user: Hello, world!
    """

    def __init__(self, time_enabled: bool = False, *args, **kwargs):
        super().__init__()
        self.time_enabled = time_enabled
@@ -186,6 +187,7 @@ class Conversation(BaseStructure):
        # Load the conversation history from a JSON file
        with open(filename, "r") as f:
            self.conversation_history = json.load(f)

    def search_keyword_in_conversation(self, keyword: str):
        """Search for a keyword in the conversation history

@@ -5,6 +5,7 @@ from swarms.utils.parse_code import (
)
from swarms.utils.pdf_to_text import pdf_to_text
from swarms.utils.math_eval import math_eval
from swarms.utils.llm_metrcs_decorator import metrics_decorator
# from swarms.utils.phoenix_handler import phoenix_trace_decorator
@@ -15,4 +16,5 @@ __all__ = [
"pdf_to_text",
# "phoenix_trace_decorator",
"math_eval",
"metrics_decorator",
]

@@ -0,0 +1,39 @@
import time
from functools import wraps
from typing import Callable
def metrics_decorator(func: Callable):
    """Metrics decorator for LLM generation methods.

    Args:
        func (Callable): The function to decorate.

    Example:
        >>> @metrics_decorator
        ... def my_function():
        ...     return "Hello, world!"
        >>> my_function()
    """

    @wraps(func)
    def wrapper(*args, **kwargs):  # works for bound methods and plain functions
        # Time to First Token (approximated here as the duration of the full
        # call, since the wrapped function returns all tokens at once)
        start_time = time.time()
        result = func(*args, **kwargs)
        first_token_time = time.time()
        # Generation Latency
        end_time = time.time()
        # Throughput (assuming the function returns a list of tokens)
        throughput = len(result) / (end_time - start_time)
        return f"""
        Time to First Token: {first_token_time - start_time}
        Generation Latency: {end_time - start_time}
        Throughput: {throughput}
        """

    return wrapper
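
A quick sketch of the decorator's observable behavior, using a hypothetical fake_run function as a stand-in for an LLM call; note that the wrapper replaces the wrapped function's return value with the formatted metrics report:

@metrics_decorator
def fake_run(task: str):
    # Hypothetical stand-in: one "token" per word of the task
    return task.split()

report = fake_run("an example prompt")
print(report)  # three lines: Time to First Token, Generation Latency, Throughput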

@@ -0,0 +1,63 @@
import time
from swarms.utils.llm_metrcs_decorator import metrics_decorator
def test_metrics_decorator():
    @metrics_decorator
    def test_func():
        time.sleep(0.1)  # simulate some work
        return list(range(100))  # return a list of 100 tokens

    result = test_func()
    lines = result.strip().split("\n")
    # Check that the decorator returns 3 lines of output
    assert len(lines) == 3
    # Check that the Time to First Token is less than or equal to the Generation Latency
    time_to_first_token = float(lines[0].split(": ")[1])
    generation_latency = float(lines[1].split(": ")[1])
    assert time_to_first_token <= generation_latency
    # Check that the Throughput is approximately equal to the number of tokens
    # divided by the Generation Latency
    throughput = float(lines[2].split(": ")[1])
    assert (
        abs(throughput - 100 / generation_latency) < 0.01
    )  # allow for a small amount of error


def test_metrics_decorator_1_token():
    @metrics_decorator
    def test_func():
        time.sleep(0.1)  # simulate some work
        return [0]  # return a list of 1 token

    result = test_func()
    lines = result.strip().split("\n")
    assert len(lines) == 3
    time_to_first_token = float(lines[0].split(": ")[1])
    generation_latency = float(lines[1].split(": ")[1])
    assert time_to_first_token <= generation_latency
    throughput = float(lines[2].split(": ")[1])
    assert abs(throughput - 1 / generation_latency) < 0.01
# Repeat the test with different numbers of tokens and different amounts of work
def _make_token_count_test(i):
    # Factory binds `i` at creation time, avoiding the late-binding pitfall of
    # defining closures directly inside the loop below
    def test_func():
        @metrics_decorator
        def work():
            time.sleep(0.01 * i)  # simulate some work
            return list(range(i))  # return a list of i tokens

        result = work()
        lines = result.strip().split("\n")
        assert len(lines) == 3
        time_to_first_token = float(lines[0].split(": ")[1])
        generation_latency = float(lines[1].split(": ")[1])
        assert time_to_first_token <= generation_latency
        throughput = float(lines[2].split(": ")[1])
        assert abs(throughput - i / generation_latency) < 0.01

    return test_func


for i in range(2, 17):
    globals()[f"test_metrics_decorator_{i}_tokens"] = _make_token_count_test(i)