parent
4557e1c3fb
commit
950907581c
@ -0,0 +1,39 @@
|
|||||||
|
import time
|
||||||
|
from functools import wraps
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
|
||||||
|
def metrics_decorator(func: Callable):
    """Wrap ``func`` so that calling it returns a timing report string.

    The wrapped callable is invoked once with whatever arguments the caller
    supplies (so it works for plain functions and for methods alike). Its
    return value is NOT passed through: the wrapper returns a multi-line
    string holding three metrics instead.

    Args:
        func (Callable): The function to decorate. Its return value must
            support ``len()``; the length is used as the token count.

    Returns:
        Callable: A wrapper that returns a report of the form::

            Time to First Token: <seconds>
            Generation Latency: <seconds>
            Throughput: <tokens per second>

    Example:
        >>> @metrics_decorator
        ... def my_function():
        ...     return "Hello, world!"
        >>> report = my_function()
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution, so the measured
        # intervals can never be negative because of wall-clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)

        # Time to First Token: func is called as a single blocking step, so
        # the first moment any output is observable is when it returns.
        first_token_time = time.perf_counter()

        # Generation Latency
        end_time = time.perf_counter()
        generation_latency = end_time - start_time

        # Throughput (assuming the function returns a sized collection of
        # tokens). Guard against a zero-length interval on coarse clocks.
        if generation_latency > 0:
            throughput = len(result) / generation_latency
        else:
            throughput = float("inf")

        return f"""
        Time to First Token: {first_token_time - start_time}
        Generation Latency: {generation_latency}
        Throughput: {throughput}
        """

    return wrapper
|
@ -0,0 +1,63 @@
|
|||||||
|
import time
|
||||||
|
from swarms.utils.llm_metrcs_decorator import metrics_decorator
|
||||||
|
|
||||||
|
|
||||||
|
def test_metrics_decorator():
    """Check the metrics report for a function that returns 100 tokens."""
    expected_tokens = 100

    @metrics_decorator
    def produce_tokens():
        # Stand-in for real generation work.
        time.sleep(0.1)
        return list(range(expected_tokens))

    def metric_value(row):
        # Each report row reads "<label>: <number>".
        return float(row.split(": ")[1])

    report_rows = produce_tokens().strip().split("\n")

    # The report must contain exactly three metric rows.
    assert len(report_rows) == 3

    # The first token cannot arrive later than the end of generation.
    ttft = metric_value(report_rows[0])
    latency = metric_value(report_rows[1])
    assert ttft <= latency

    # Throughput should match token count divided by the reported latency,
    # allowing for a small amount of error.
    reported_throughput = metric_value(report_rows[2])
    deviation = abs(reported_throughput - expected_tokens / latency)
    assert deviation < 0.01
|
||||||
|
|
||||||
|
|
||||||
|
def test_metrics_decorator_1_token():
    """Check the metrics report for a function that returns a single token."""

    @metrics_decorator
    def single_token():
        # Stand-in for real generation work.
        time.sleep(0.1)
        return [0]

    report = single_token()
    rows = report.strip().split("\n")
    assert len(rows) == 3

    # Rows are "<label>: <number>"; take the numeric part of each in turn.
    _, ttft_text = rows[0].split(": ")[:2]
    ttft = float(ttft_text)
    _, latency_text = rows[1].split(": ")[:2]
    latency = float(latency_text)
    assert ttft <= latency

    _, throughput_text = rows[2].split(": ")[:2]
    reported = float(throughput_text)
    expected = 1 / latency
    assert abs(reported - expected) < 0.01
|
||||||
|
|
||||||
|
|
||||||
|
# Repeat the test with different numbers of tokens and different amounts of
# work. A factory is used so each generated test keeps its OWN token count:
# a closure over the loop variable would see only its final value by the time
# pytest runs the tests.
def _make_token_count_test(token_count):
    """Build a test checking the metrics report for ``token_count`` tokens."""

    def generated_test():
        @metrics_decorator
        def produce_tokens():
            time.sleep(0.01 * token_count)  # simulate some work
            return list(range(token_count))  # return a list of token_count tokens

        result = produce_tokens()
        lines = result.strip().split("\n")
        assert len(lines) == 3

        time_to_first_token = float(lines[0].split(": ")[1])
        generation_latency = float(lines[1].split(": ")[1])
        assert time_to_first_token <= generation_latency

        throughput = float(lines[2].split(": ")[1])
        assert abs(throughput - token_count / generation_latency) < 0.01

    # Give the function the same name it is registered under so failures
    # are reported against the right token count.
    generated_test.__name__ = f"test_metrics_decorator_{token_count}_tokens"
    generated_test.__qualname__ = generated_test.__name__
    return generated_test


for i in range(2, 17):
    globals()[f"test_metrics_decorator_{i}_tokens"] = _make_token_count_test(i)
|
Loading…
Reference in new issue