HuggingfaceLLM, jina embeds

pull/91/head
Kye 1 year ago
parent cb936eaef7
commit 8dfb1d33d0

@ -39,4 +39,4 @@ jobs:
run: sphinx-build -b linkcheck docs build/docs run: sphinx-build -b linkcheck docs build/docs
- name: Run performance tests - name: Run performance tests
run: pytest tests/performance run: find ./tests -name '*.py' -exec pytest {} \;

@ -97,8 +97,8 @@ To run the documentation, install the project requirements with `poetry install
You can learn more about mkdocs on the [mkdocs website](https://www.mkdocs.org/). You can learn more about mkdocs on the [mkdocs website](https://www.mkdocs.org/).
## 🧪 tests ## 🧪 tests
- Run all the tests in the tests folder
[`pytests`](https://docs.pytest.org/en/7.1.x/) is used to run our tests. `find ./tests -name '*.py' -exec pytest {} \;`
## 📄 license ## 📄 license

@ -11,7 +11,11 @@ llm = OpenAIChat(
) )
# Initialize the flow # Initialize the flow
flow = Flow(llm=llm, max_loops=5, dashboard=True,) flow = Flow(
llm=llm,
max_loops=5,
dashboard=True,
)
flow = Flow( flow = Flow(
llm=llm, llm=llm,

@ -19,6 +19,7 @@ flow = Flow(
agent = SimpleAgent( agent = SimpleAgent(
name="Optimus Prime", name="Optimus Prime",
flow=flow, flow=flow,
# Memory
) )
out = agent.run("Generate a 10,000 word blog on health and wellness.") out = agent.run("Generate a 10,000 word blog on health and wellness.")

@ -1,6 +0,0 @@
def stream(response):
"""
Yield the response token by token (word by word) from llm
"""
for token in response.split():
yield token

@ -1,2 +0,0 @@
# from swarms.embeddings.pegasus import PegasusEmbedding
from swarms.embeddings.simple_ada import get_ada_embeddings

@ -1,10 +0,0 @@
# This file contains the function that embeds the input into a vector
from chromadb import EmbeddingFunction
def openai_embed(self, input, api_key, model_name):
openai = EmbeddingFunction.OpenAIEmbeddingFunction(
api_key=api_key, model_name=model_name
)
embedding = openai(input)
return embedding

@ -17,7 +17,7 @@ from typing import (
import numpy as np import numpy as np
from swarms.structs.document import Document from swarms.structs.document import Document
from swarms.embeddings.base import Embeddings from swarms.models.embeddings_base import Embeddings
from langchain.schema.vectorstore import VectorStore from langchain.schema.vectorstore import VectorStore
from langchain.utils import xor_args from langchain.utils import xor_args
from langchain.vectorstores.utils import maximal_marginal_relevance from langchain.vectorstores.utils import maximal_marginal_relevance

@ -3,6 +3,7 @@ import logging
import torch import torch
from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from termcolor import colored
class HuggingfaceLLM: class HuggingfaceLLM:
@ -20,13 +21,13 @@ class HuggingfaceLLM:
# Usage # Usage
``` ```
from finetuning_suite import Inference from swarms.models import HuggingfaceLLM
model_id = "gpt2-small" model_id = "gpt2-small"
inference = Inference(model_id=model_id) inference = HuggingfaceLLM(model_id=model_id)
prompt_text = "Once upon a time" task = "Once upon a time"
generated_text = inference(prompt_text) generated_text = inference(task)
print(generated_text) print(generated_text)
``` ```
""" """
@ -42,6 +43,8 @@ class HuggingfaceLLM:
# logger=None, # logger=None,
distributed=False, distributed=False,
decoding=False, decoding=False,
*args,
**kwargs,
): ):
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
self.device = ( self.device = (
@ -53,7 +56,6 @@ class HuggingfaceLLM:
self.distributed = distributed self.distributed = distributed
self.decoding = decoding self.decoding = decoding
self.model, self.tokenizer = None, None self.model, self.tokenizer = None, None
# self.log = Logging()
if self.distributed: if self.distributed:
assert ( assert (
@ -104,12 +106,12 @@ class HuggingfaceLLM:
self.logger.error(f"Failed to load the model or the tokenizer: {error}") self.logger.error(f"Failed to load the model or the tokenizer: {error}")
raise raise
def run(self, prompt_text: str): def run(self, task: str):
""" """
Generate a response based on the prompt text. Generate a response based on the prompt text.
Args: Args:
- prompt_text (str): Text to prompt the model. - task (str): Text to prompt the model.
- max_length (int): Maximum length of the response. - max_length (int): Maximum length of the response.
Returns: Returns:
@ -119,10 +121,10 @@ class HuggingfaceLLM:
max_length = self.max_length max_length = self.max_length
self.print_dashboard(task)
try: try:
inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to( inputs = self.tokenizer.encode(task, return_tensors="pt").to(self.device)
self.device
)
# self.log.start() # self.log.start()
@ -181,12 +183,12 @@ class HuggingfaceLLM:
# Wrapping synchronous calls with async # Wrapping synchronous calls with async
return self.run(task, *args, **kwargs) return self.run(task, *args, **kwargs)
def __call__(self, prompt_text: str): def __call__(self, task: str):
""" """
Generate a response based on the prompt text. Generate a response based on the prompt text.
Args: Args:
- prompt_text (str): Text to prompt the model. - task (str): Text to prompt the model.
- max_length (int): Maximum length of the response. - max_length (int): Maximum length of the response.
Returns: Returns:
@ -194,12 +196,12 @@ class HuggingfaceLLM:
""" """
self.load_model() self.load_model()
max_length = self.max_ max_length = self.max_length
self.print_dashboard(task)
try: try:
inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to( inputs = self.tokenizer.encode(task, return_tensors="pt").to(self.device)
self.device
)
# self.log.start() # self.log.start()
@ -258,3 +260,37 @@ class HuggingfaceLLM:
return {"allocated": allocated, "reserved": reserved} return {"allocated": allocated, "reserved": reserved}
else: else:
return {"error": "GPU not available"} return {"error": "GPU not available"}
def print_dashboard(self, task: str):
"""Print dashboard"""
dashboard = print(
colored(
f"""
HuggingfaceLLM Dashboard
--------------------------------------------
Model Name: {self.model_id}
Tokenizer: {self.tokenizer}
Model MaxLength: {self.max_length}
Model Device: {self.device}
Model Quantization: {self.quantize}
Model Quantization Config: {self.quantization_config}
Model Verbose: {self.verbose}
Model Distributed: {self.distributed}
Model Decoding: {self.decoding}
----------------------------------------
Metadata:
Task Memory Consumption: {self.memory_consumption()}
GPU Available: {self.gpu_available()}
----------------------------------------
Task Environment:
Task: {task}
""",
"red",
)
)
print(dashboard)

@ -0,0 +1,214 @@
import logging
import torch
from numpy.linalg import norm
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
def cos_sim(a, b):
return a @ b.T / (norm(a) * norm(b))
class JinaEmbeddings:
"""
A class for running inference on a given model.
Attributes:
model_id (str): The ID of the model.
device (str): The device to run the model on (either 'cuda' or 'cpu').
max_length (int): The maximum length of the output sequence.
quantize (bool, optional): Whether to use quantization. Defaults to False.
quantization_config (dict, optional): The configuration for quantization.
verbose (bool, optional): Whether to print verbose logs. Defaults to False.
logger (logging.Logger, optional): The logger to use. Defaults to a basic logger.
# Usage
```
from swarms.models import JinaEmbeddings
model = JinaEmbeddings()
embeddings = model("Encode this text")
print(embeddings)
```
"""
def __init__(
self,
model_id: str,
device: str = None,
max_length: int = 500,
quantize: bool = False,
quantization_config: dict = None,
verbose=False,
# logger=None,
distributed=False,
decoding=False,
cos_sim: bool = False,
*args,
**kwargs,
):
self.logger = logging.getLogger(__name__)
self.device = (
device if device else ("cuda" if torch.cuda.is_available() else "cpu")
)
self.model_id = model_id
self.max_length = max_length
self.verbose = verbose
self.distributed = distributed
self.decoding = decoding
self.model, self.tokenizer = None, None
# self.log = Logging()
self.cos_sim = cos_sim
if self.distributed:
assert (
torch.cuda.device_count() > 1
), "You need more than 1 gpu for distributed processing"
bnb_config = None
if quantize:
if not quantization_config:
quantization_config = {
"load_in_4bit": True,
"bnb_4bit_use_double_quant": True,
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_config)
try:
self.model = AutoModelForCausalLM.from_pretrained(
self.model_id, quantization_config=bnb_config, trust_remote_code=True
)
self.model # .to(self.device)
except Exception as e:
self.logger.error(f"Failed to load the model or the tokenizer: {e}")
raise
def load_model(self):
"""Load the model"""
if not self.model or not self.tokenizer:
try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
bnb_config = (
BitsAndBytesConfig(**self.quantization_config)
if self.quantization_config
else None
)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_id,
quantization_config=bnb_config,
trust_remote_code=True,
).to(self.device)
if self.distributed:
self.model = DDP(self.model)
except Exception as error:
self.logger.error(f"Failed to load the model or the tokenizer: {error}")
raise
def run(self, task: str):
"""
Generate a response based on the prompt text.
Args:
- task (str): Text to prompt the model.
- max_length (int): Maximum length of the response.
Returns:
- Generated text (str).
"""
self.load_model()
max_length = self.max_length
try:
embeddings = self.model.encode([task], max_length=max_length)
if self.cos_sim:
print(cos_sim(embeddings[0], embeddings[1]))
else:
return embeddings[0]
except Exception as e:
self.logger.error(f"Failed to generate the text: {e}")
raise
async def run_async(self, task: str, *args, **kwargs) -> str:
"""
Run the model asynchronously
Args:
task (str): Task to run.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Examples:
>>> mpt_instance = MPT('mosaicml/mpt-7b-storywriter', "EleutherAI/gpt-neox-20b", max_tokens=150)
>>> mpt_instance("generate", "Once upon a time in a land far, far away...")
'Once upon a time in a land far, far away...'
>>> mpt_instance.batch_generate(["In the deep jungles,", "At the heart of the city,"], temperature=0.7)
['In the deep jungles,',
'At the heart of the city,']
>>> mpt_instance.freeze_model()
>>> mpt_instance.unfreeze_model()
"""
# Wrapping synchronous calls with async
return self.run(task, *args, **kwargs)
def __call__(self, task: str):
"""
Generate a response based on the prompt text.
Args:
- task (str): Text to prompt the model.
- max_length (int): Maximum length of the response.
Returns:
- Generated text (str).
"""
self.load_model()
max_length = self.max_length
try:
embeddings = self.model.encode([task], max_length=max_length)
if self.cos_sim:
print(cos_sim(embeddings[0], embeddings[1]))
else:
return embeddings[0]
except Exception as e:
self.logger.error(f"Failed to generate the text: {e}")
raise
async def __call_async__(self, task: str, *args, **kwargs) -> str:
"""Call the model asynchronously""" ""
return await self.run_async(task, *args, **kwargs)
def save_model(self, path: str):
"""Save the model to a given path"""
self.model.save_pretrained(path)
self.tokenizer.save_pretrained(path)
def gpu_available(self) -> bool:
"""Check if GPU is available"""
return torch.cuda.is_available()
def memory_consumption(self) -> dict:
"""Get the memory consumption of the GPU"""
if self.gpu_available():
torch.cuda.synchronize()
allocated = torch.cuda.memory_allocated()
reserved = torch.cuda.memory_reserved()
return {"allocated": allocated, "reserved": reserved}
else:
return {"error": "GPU not available"}

@ -25,7 +25,7 @@ from tenacity import (
stop_after_attempt, stop_after_attempt,
wait_exponential, wait_exponential,
) )
from swarms.embeddings.base import Embeddings from swarms.models.embeddings_base import Embeddings
def get_from_dict_or_env( def get_from_dict_or_env(

@ -1,10 +1,9 @@
import openai import openai
from dotenv import load_dotenv from dotenv import load_dotenv
from os import getenv
load_dotenv() load_dotenv()
from os import getenv
def get_ada_embeddings(text: str, model: str = "text-embedding-ada-002"): def get_ada_embeddings(text: str, model: str = "text-embedding-ada-002"):
""" """

@ -0,0 +1,265 @@
import logging
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
class YarnMistral128:
"""
A class for running inference on a given model.
Attributes:
model_id (str): The ID of the model.
device (str): The device to run the model on (either 'cuda' or 'cpu').
max_length (int): The maximum length of the output sequence.
quantize (bool, optional): Whether to use quantization. Defaults to False.
quantization_config (dict, optional): The configuration for quantization.
verbose (bool, optional): Whether to print verbose logs. Defaults to False.
logger (logging.Logger, optional): The logger to use. Defaults to a basic logger.
# Usage
```
from finetuning_suite import Inference
model_id = "gpt2-small"
inference = Inference(model_id=model_id)
prompt_text = "Once upon a time"
generated_text = inference(prompt_text)
print(generated_text)
```
"""
def __init__(
self,
model_id: str = "NousResearch/Yarn-Mistral-7b-128k",
device: str = None,
max_length: int = 500,
quantize: bool = False,
quantization_config: dict = None,
verbose=False,
# logger=None,
distributed=False,
decoding=False,
):
self.logger = logging.getLogger(__name__)
self.device = (
device if device else ("cuda" if torch.cuda.is_available() else "cpu")
)
self.model_id = model_id
self.max_length = max_length
self.verbose = verbose
self.distributed = distributed
self.decoding = decoding
self.model, self.tokenizer = None, None
# self.log = Logging()
if self.distributed:
assert (
torch.cuda.device_count() > 1
), "You need more than 1 gpu for distributed processing"
bnb_config = None
if quantize:
if not quantization_config:
quantization_config = {
"load_in_4bit": True,
"bnb_4bit_use_double_quant": True,
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_config)
try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_id,
quantization_config=bnb_config,
use_flash_attention_2=True,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True,
)
self.model # .to(self.device)
except Exception as e:
self.logger.error(f"Failed to load the model or the tokenizer: {e}")
raise
def load_model(self):
"""Load the model"""
if not self.model or not self.tokenizer:
try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
bnb_config = (
BitsAndBytesConfig(**self.quantization_config)
if self.quantization_config
else None
)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_id, quantization_config=bnb_config
).to(self.device)
if self.distributed:
self.model = DDP(self.model)
except Exception as error:
self.logger.error(f"Failed to load the model or the tokenizer: {error}")
raise
def run(self, prompt_text: str):
"""
Generate a response based on the prompt text.
Args:
- prompt_text (str): Text to prompt the model.
- max_length (int): Maximum length of the response.
Returns:
- Generated text (str).
"""
self.load_model()
max_length = self.max_length
try:
inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to(
self.device
)
# self.log.start()
if self.decoding:
with torch.no_grad():
for _ in range(max_length):
output_sequence = []
outputs = self.model.generate(
inputs, max_length=len(inputs) + 1, do_sample=True
)
output_tokens = outputs[0][-1]
output_sequence.append(output_tokens.item())
# print token in real-time
print(
self.tokenizer.decode(
[output_tokens], skip_special_tokens=True
),
end="",
flush=True,
)
inputs = outputs
else:
with torch.no_grad():
outputs = self.model.generate(
inputs, max_length=max_length, do_sample=True
)
del inputs
return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
except Exception as e:
self.logger.error(f"Failed to generate the text: {e}")
raise
async def run_async(self, task: str, *args, **kwargs) -> str:
"""
Run the model asynchronously
Args:
task (str): Task to run.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Examples:
>>> mpt_instance = MPT('mosaicml/mpt-7b-storywriter', "EleutherAI/gpt-neox-20b", max_tokens=150)
>>> mpt_instance("generate", "Once upon a time in a land far, far away...")
'Once upon a time in a land far, far away...'
>>> mpt_instance.batch_generate(["In the deep jungles,", "At the heart of the city,"], temperature=0.7)
['In the deep jungles,',
'At the heart of the city,']
>>> mpt_instance.freeze_model()
>>> mpt_instance.unfreeze_model()
"""
# Wrapping synchronous calls with async
return self.run(task, *args, **kwargs)
def __call__(self, prompt_text: str):
"""
Generate a response based on the prompt text.
Args:
- prompt_text (str): Text to prompt the model.
- max_length (int): Maximum length of the response.
Returns:
- Generated text (str).
"""
self.load_model()
max_length = self.max_
try:
inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to(
self.device
)
# self.log.start()
if self.decoding:
with torch.no_grad():
for _ in range(max_length):
output_sequence = []
outputs = self.model.generate(
inputs, max_length=len(inputs) + 1, do_sample=True
)
output_tokens = outputs[0][-1]
output_sequence.append(output_tokens.item())
# print token in real-time
print(
self.tokenizer.decode(
[output_tokens], skip_special_tokens=True
),
end="",
flush=True,
)
inputs = outputs
else:
with torch.no_grad():
outputs = self.model.generate(
inputs, max_length=max_length, do_sample=True
)
del inputs
return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
except Exception as e:
self.logger.error(f"Failed to generate the text: {e}")
raise
async def __call_async__(self, task: str, *args, **kwargs) -> str:
"""Call the model asynchronously""" ""
return await self.run_async(task, *args, **kwargs)
def save_model(self, path: str):
"""Save the model to a given path"""
self.model.save_pretrained(path)
self.tokenizer.save_pretrained(path)
def gpu_available(self) -> bool:
"""Check if GPU is available"""
return torch.cuda.is_available()
def memory_consumption(self) -> dict:
"""Get the memory consumption of the GPU"""
if self.gpu_available():
torch.cuda.synchronize()
allocated = torch.cuda.memory_allocated()
reserved = torch.cuda.memory_reserved()
return {"allocated": allocated, "reserved": reserved}
else:
return {"error": "GPU not available"}

@ -1,9 +1,9 @@
""" """
TODO: TODO:
- Add a retry mechanism - Add tools
- Add prompt injection letting the agent know it's in a flow, Flow prompt - Add open interpreter style conversation
- Dynamic temperature handling - Add configurable save and restore so the user can restore from previus flows
- Add memory vector database retrieval
""" """
import json import json
@ -252,7 +252,8 @@ class Flow:
History: {response} History: {response}
""", **kwargs """,
**kwargs,
) )
# print(f"Next query: {response}") # print(f"Next query: {response}")
# break # break

@ -0,0 +1,20 @@
"""
Sequential Workflow
from swarms.models import OpenAIChat, Mistral
from swarms.structs import SequentialWorkflow
llm = OpenAIChat(openai_api_key="")
mistral = Mistral()
# Max loops will run over the sequential pipeline twice
workflow = SequentialWorkflow(max_loops=2)
workflow.add("What's the weather in miami", llm)
workflow.add("Create a report on these metrics", mistral)
workflow.run()
"""

@ -1,6 +1,6 @@
import pytest import pytest
from unittest.mock import patch from unittest.mock import patch
from swarms.embeddings.pegasus import PegasusEmbedding from swarms.models.pegasus import PegasusEmbedding
def test_init(): def test_init():

Loading…
Cancel
Save