[FEAT][llm_metrics_decorator]

pull/334/head
Kye 1 year ago
parent 4557e1c3fb
commit 950907581c

@@ -1,9 +1,11 @@
import asyncio
import logging
import os
import time
from abc import ABC, abstractmethod
from typing import List, Optional

from swarms.utils.llm_metrcs_decorator import metrics_decorator


def count_tokens(text: str) -> int:
@@ -118,6 +120,7 @@ class AbstractLLM(ABC):
        }

    @abstractmethod
    @metrics_decorator
    def run(self, task: Optional[str] = None, *args, **kwargs) -> str:
        """generate text using language model"""
        pass
@@ -381,3 +384,48 @@ class AbstractLLM(ABC):
        TOKENS: {_num_tokens}
        Tokens/SEC: {_time_for_generation}
        """

    def time_to_first_token(self, prompt: str) -> float:
        """Measure the time until the first token is produced.

        Args:
            prompt (str): The prompt to send to the model.

        Returns:
            float: Seconds elapsed before the first token appears.
        """
        start_time = time.time()
        # Assumes track_resource_utilization drives token generation;
        # without streaming, this actually times the full blocking call.
        self.track_resource_utilization(prompt)
        first_token_time = time.time()
        return first_token_time - start_time

    def generation_latency(self, prompt: str) -> float:
        """Measure the end-to-end latency of one generation.

        Args:
            prompt (str): The prompt to send to the model.

        Returns:
            float: Seconds taken to complete the full generation.
        """
        start_time = time.time()
        self.run(prompt)
        end_time = time.time()
        return end_time - start_time

    def throughput(self, prompts: List[str]) -> float:
        """Measure throughput over a batch of prompts.

        Args:
            prompts (List[str]): Prompts to run sequentially.

        Returns:
            float: Prompts completed per second.
        """
        start_time = time.time()
        for prompt in prompts:
            self.run(prompt)
        end_time = time.time()
        return len(prompts) / (end_time - start_time)
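For orientation, here is a minimal sketch of how these helpers could be exercised. `EchoLLM`, its simulated latency, and the assumption that `run` is the only abstract method on `AbstractLLM` are illustrative, not part of this commit. Note also that a subclass overriding `run` replaces the decorated abstract method, so it would need to re-apply `@metrics_decorator` itself to get the metrics report.

```python
import time
from typing import Optional

# AbstractLLM as defined in the module above; assumed to need no
# required constructor arguments.


class EchoLLM(AbstractLLM):
    """Toy model: 'generates' by echoing the prompt after a short sleep."""

    def run(self, task: Optional[str] = None, *args, **kwargs) -> str:
        time.sleep(0.05)  # simulate model latency
        return task or ""


llm = EchoLLM()
print(llm.generation_latency("hello world"))  # ~0.05 seconds
print(llm.throughput(["a", "b", "c", "d"]))   # ~20 prompts/sec
```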

@@ -3,4 +3,9 @@ from swarms.structs.sequential_workflow import SequentialWorkflow
from swarms.structs.autoscaler import AutoScaler
from swarms.structs.conversation import Conversation

__all__ = [
    "Agent",
    "SequentialWorkflow",
    "AutoScaler",
    "Conversation",
]

@@ -9,22 +9,23 @@ from swarms.structs.base import BaseStructure
class Conversation(BaseStructure):
    """
    Conversation class

    Attributes:
        time_enabled (bool): whether to enable time
        conversation_history (list): list of messages in the conversation

    Examples:
        >>> conv = Conversation()
        >>> conv.add("user", "Hello, world!")
        >>> conv.add("assistant", "Hello, user!")
        >>> conv.display_conversation()
        user: Hello, world!
    """

    def __init__(self, time_enabled: bool = False, *args, **kwargs):
        super().__init__()
        self.time_enabled = time_enabled
@@ -186,6 +187,7 @@ class Conversation(BaseStructure):
        # Load the conversation history from a JSON file
        with open(filename, "r") as f:
            self.conversation_history = json.load(f)

    def search_keyword_in_conversation(self, keyword: str):
        """Search for a keyword in the conversation history

@@ -5,6 +5,7 @@ from swarms.utils.parse_code import (
)
from swarms.utils.pdf_to_text import pdf_to_text
from swarms.utils.math_eval import math_eval
from swarms.utils.llm_metrcs_decorator import metrics_decorator

# from swarms.utils.phoenix_handler import phoenix_trace_decorator

@@ -15,4 +16,5 @@ __all__ = [
    "pdf_to_text",
    # "phoenix_trace_decorator",
    "math_eval",
    "metrics_decorator",
]

@@ -0,0 +1,39 @@
import time
from functools import wraps
from typing import Callable


def metrics_decorator(func: Callable):
    """Metrics decorator for LLM calls.

    Args:
        func (Callable): The function to decorate.

    Example:
        >>> @metrics_decorator
        ... def my_function():
        ...     return "Hello, world!"
        >>> my_function()
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # *args carries `self` when decorating a method and nothing for
        # a plain function, so the wrapper works for both.
        # Time to First Token: captured only after the call returns, so
        # without streaming it equals the full generation latency.
        start_time = time.time()
        result = func(*args, **kwargs)
        first_token_time = time.time()

        # Generation Latency
        end_time = time.time()

        # Throughput (assumes the function returns a sequence of tokens)
        throughput = len(result) / (end_time - start_time)

        return f"""
Time to First Token: {first_token_time - start_time}
Generation Latency: {end_time - start_time}
Throughput: {throughput}
"""

    return wrapper
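One limitation worth flagging: because the wrapper only sees the completed result, the reported Time to First Token is indistinguishable from the total latency. Below is a hedged sketch of how a genuine first-token timestamp could be captured if the wrapped callable yielded tokens incrementally; `streaming_metrics_decorator` is hypothetical and not part of this commit.

```python
import time
from functools import wraps
from typing import Callable, Iterable


def streaming_metrics_decorator(func: Callable[..., Iterable[str]]):
    """Variant for callables that *yield* tokens one at a time."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        first_token_at = None
        tokens = []
        for token in func(*args, **kwargs):
            if first_token_at is None:
                first_token_at = time.time()  # real first-token timestamp
            tokens.append(token)
        end = time.time()
        return {
            "time_to_first_token": (first_token_at or end) - start,
            "generation_latency": end - start,
            "throughput": len(tokens) / (end - start),
        }

    return wrapper
```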

@@ -0,0 +1,63 @@
import time

from swarms.utils.llm_metrcs_decorator import metrics_decorator


def test_metrics_decorator():
    @metrics_decorator
    def test_func():
        time.sleep(0.1)  # simulate some work
        return list(range(100))  # return a list of 100 tokens

    result = test_func()
    lines = result.strip().split("\n")

    # Check that the decorator returns 3 lines of output
    assert len(lines) == 3

    # Time to First Token should not exceed the Generation Latency
    time_to_first_token = float(lines[0].split(": ")[1])
    generation_latency = float(lines[1].split(": ")[1])
    assert time_to_first_token <= generation_latency

    # Throughput should be approximately tokens / Generation Latency
    throughput = float(lines[2].split(": ")[1])
    assert (
        abs(throughput - 100 / generation_latency) < 0.01
    )  # allow for a small amount of error
def test_metrics_decorator_1_token():
    @metrics_decorator
    def test_func():
        time.sleep(0.1)  # simulate some work
        return [0]  # return a list of 1 token

    result = test_func()
    lines = result.strip().split("\n")
    assert len(lines) == 3

    time_to_first_token = float(lines[0].split(": ")[1])
    generation_latency = float(lines[1].split(": ")[1])
    assert time_to_first_token <= generation_latency

    throughput = float(lines[2].split(": ")[1])
    assert abs(throughput - 1 / generation_latency) < 0.01


# Repeat the test with different numbers of tokens and different amounts of work
for i in range(2, 17):

    def _make_test(n=i):  # bind i now; a bare closure would see the final i
        def test_func():
            @metrics_decorator
            def generate():
                time.sleep(0.01 * n)  # simulate some work
                return list(range(n))  # return a list of n tokens

            result = generate()
            lines = result.strip().split("\n")
            assert len(lines) == 3

            time_to_first_token = float(lines[0].split(": ")[1])
            generation_latency = float(lines[1].split(": ")[1])
            assert time_to_first_token <= generation_latency

            throughput = float(lines[2].split(": ")[1])
            assert abs(throughput - n / generation_latency) < 0.01

        return test_func

    globals()[f"test_metrics_decorator_{i}_tokens"] = _make_test()