diff --git a/.github/workflows/cos_integration.yml b/.github/workflows/cos_integration.yml index 7cdb41e9..0f3fc605 100644 --- a/.github/workflows/cos_integration.yml +++ b/.github/workflows/cos_integration.yml @@ -39,4 +39,4 @@ jobs: run: sphinx-build -b linkcheck docs build/docs - name: Run performance tests - run: pytest tests/performance \ No newline at end of file + run: find ./tests -name '*.py' -exec pytest {} \; \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e00478f1..bd9090de 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -97,8 +97,8 @@ To run the documentation, install the project requirements with `poetry install You can learn more about mkdocs on the [mkdocs website](https://www.mkdocs.org/). ## ๐Ÿงช tests - -[`pytests`](https://docs.pytest.org/en/7.1.x/) is used to run our tests. +- Run all the tests in the tests folder +`find ./tests -name '*.py' -exec pytest {} \;` ## ๐Ÿ“„ license diff --git a/flow.py b/flow.py index fd7a02b2..d2c21ba8 100644 --- a/flow.py +++ b/flow.py @@ -11,7 +11,11 @@ llm = OpenAIChat( ) # Initialize the flow -flow = Flow(llm=llm, max_loops=5, dashboard=True,) +flow = Flow( + llm=llm, + max_loops=5, + dashboard=True, +) flow = Flow( llm=llm, @@ -28,4 +32,4 @@ flow = Flow( out = flow.run("Generate a 10,000 word blog on health and wellness.") -print(out) \ No newline at end of file +print(out) diff --git a/simple_agent.py b/simple_agent.py index 9ec9aaf6..515b83bc 100644 --- a/simple_agent.py +++ b/simple_agent.py @@ -19,6 +19,7 @@ flow = Flow( agent = SimpleAgent( name="Optimus Prime", flow=flow, + # Memory ) out = agent.run("Generate a 10,000 word blog on health and wellness.") diff --git a/swarms/agents/stream_response.py b/swarms/agents/stream_response.py deleted file mode 100644 index ecd29ff0..00000000 --- a/swarms/agents/stream_response.py +++ /dev/null @@ -1,6 +0,0 @@ -def stream(response): - """ - Yield the response token by token (word by word) from llm - """ - for token in response.split(): - yield token diff --git a/swarms/embeddings/__init__.py b/swarms/embeddings/__init__.py deleted file mode 100644 index 2c6c13b7..00000000 --- a/swarms/embeddings/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# from swarms.embeddings.pegasus import PegasusEmbedding -from swarms.embeddings.simple_ada import get_ada_embeddings diff --git a/swarms/embeddings/embed.py b/swarms/embeddings/embed.py deleted file mode 100644 index ce50e0cf..00000000 --- a/swarms/embeddings/embed.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file contains the function that embeds the input into a vector -from chromadb import EmbeddingFunction - - -def openai_embed(self, input, api_key, model_name): - openai = EmbeddingFunction.OpenAIEmbeddingFunction( - api_key=api_key, model_name=model_name - ) - embedding = openai(input) - return embedding diff --git a/swarms/memory/chroma.py b/swarms/memory/chroma.py index dc0399ef..422d0a67 100644 --- a/swarms/memory/chroma.py +++ b/swarms/memory/chroma.py @@ -17,7 +17,7 @@ from typing import ( import numpy as np from swarms.structs.document import Document -from swarms.embeddings.base import Embeddings +from swarms.models.embeddings_base import Embeddings from langchain.schema.vectorstore import VectorStore from langchain.utils import xor_args from langchain.vectorstores.utils import maximal_marginal_relevance diff --git a/swarms/embeddings/base.py b/swarms/models/embeddings_base.py similarity index 100% rename from swarms/embeddings/base.py rename to swarms/models/embeddings_base.py diff --git a/swarms/models/huggingface.py b/swarms/models/huggingface.py index 5b12bc76..f07edad3 100644 --- a/swarms/models/huggingface.py +++ b/swarms/models/huggingface.py @@ -3,6 +3,7 @@ import logging import torch from torch.nn.parallel import DistributedDataParallel as DDP from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from termcolor import colored class HuggingfaceLLM: @@ -20,13 +21,13 @@ class HuggingfaceLLM: # Usage ``` - from finetuning_suite import Inference + from swarms.models import HuggingfaceLLM model_id = "gpt2-small" - inference = Inference(model_id=model_id) + inference = HuggingfaceLLM(model_id=model_id) - prompt_text = "Once upon a time" - generated_text = inference(prompt_text) + task = "Once upon a time" + generated_text = inference(task) print(generated_text) ``` """ @@ -42,6 +43,8 @@ class HuggingfaceLLM: # logger=None, distributed=False, decoding=False, + *args, + **kwargs, ): self.logger = logging.getLogger(__name__) self.device = ( @@ -53,7 +56,6 @@ class HuggingfaceLLM: self.distributed = distributed self.decoding = decoding self.model, self.tokenizer = None, None - # self.log = Logging() if self.distributed: assert ( @@ -104,12 +106,12 @@ class HuggingfaceLLM: self.logger.error(f"Failed to load the model or the tokenizer: {error}") raise - def run(self, prompt_text: str): + def run(self, task: str): """ Generate a response based on the prompt text. Args: - - prompt_text (str): Text to prompt the model. + - task (str): Text to prompt the model. - max_length (int): Maximum length of the response. Returns: @@ -119,10 +121,10 @@ class HuggingfaceLLM: max_length = self.max_length + self.print_dashboard(task) + try: - inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to( - self.device - ) + inputs = self.tokenizer.encode(task, return_tensors="pt").to(self.device) # self.log.start() @@ -181,12 +183,12 @@ class HuggingfaceLLM: # Wrapping synchronous calls with async return self.run(task, *args, **kwargs) - def __call__(self, prompt_text: str): + def __call__(self, task: str): """ Generate a response based on the prompt text. Args: - - prompt_text (str): Text to prompt the model. + - task (str): Text to prompt the model. - max_length (int): Maximum length of the response. Returns: @@ -194,12 +196,12 @@ class HuggingfaceLLM: """ self.load_model() - max_length = self.max_ + max_length = self.max_length + + self.print_dashboard(task) try: - inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to( - self.device - ) + inputs = self.tokenizer.encode(task, return_tensors="pt").to(self.device) # self.log.start() @@ -258,3 +260,37 @@ class HuggingfaceLLM: return {"allocated": allocated, "reserved": reserved} else: return {"error": "GPU not available"} + + def print_dashboard(self, task: str): + """Print dashboard""" + + dashboard = print( + colored( + f""" + HuggingfaceLLM Dashboard + -------------------------------------------- + Model Name: {self.model_id} + Tokenizer: {self.tokenizer} + Model MaxLength: {self.max_length} + Model Device: {self.device} + Model Quantization: {self.quantize} + Model Quantization Config: {self.quantization_config} + Model Verbose: {self.verbose} + Model Distributed: {self.distributed} + Model Decoding: {self.decoding} + + ---------------------------------------- + Metadata: + Task Memory Consumption: {self.memory_consumption()} + GPU Available: {self.gpu_available()} + ---------------------------------------- + + Task Environment: + Task: {task} + + """, + "red", + ) + ) + + print(dashboard) diff --git a/swarms/models/jina_embeds.py b/swarms/models/jina_embeds.py new file mode 100644 index 00000000..a72b8a9e --- /dev/null +++ b/swarms/models/jina_embeds.py @@ -0,0 +1,214 @@ +import logging + +import torch +from numpy.linalg import norm +from torch.nn.parallel import DistributedDataParallel as DDP +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + + +def cos_sim(a, b): + return a @ b.T / (norm(a) * norm(b)) + + +class JinaEmbeddings: + """ + A class for running inference on a given model. + + Attributes: + model_id (str): The ID of the model. + device (str): The device to run the model on (either 'cuda' or 'cpu'). + max_length (int): The maximum length of the output sequence. + quantize (bool, optional): Whether to use quantization. Defaults to False. + quantization_config (dict, optional): The configuration for quantization. + verbose (bool, optional): Whether to print verbose logs. Defaults to False. + logger (logging.Logger, optional): The logger to use. Defaults to a basic logger. + + # Usage + ``` + from swarms.models import JinaEmbeddings + + model = JinaEmbeddings() + + embeddings = model("Encode this text") + + print(embeddings) + + + ``` + """ + + def __init__( + self, + model_id: str, + device: str = None, + max_length: int = 500, + quantize: bool = False, + quantization_config: dict = None, + verbose=False, + # logger=None, + distributed=False, + decoding=False, + cos_sim: bool = False, + *args, + **kwargs, + ): + self.logger = logging.getLogger(__name__) + self.device = ( + device if device else ("cuda" if torch.cuda.is_available() else "cpu") + ) + self.model_id = model_id + self.max_length = max_length + self.verbose = verbose + self.distributed = distributed + self.decoding = decoding + self.model, self.tokenizer = None, None + # self.log = Logging() + self.cos_sim = cos_sim + + if self.distributed: + assert ( + torch.cuda.device_count() > 1 + ), "You need more than 1 gpu for distributed processing" + + bnb_config = None + if quantize: + if not quantization_config: + quantization_config = { + "load_in_4bit": True, + "bnb_4bit_use_double_quant": True, + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_compute_dtype": torch.bfloat16, + } + bnb_config = BitsAndBytesConfig(**quantization_config) + + try: + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, quantization_config=bnb_config, trust_remote_code=True + ) + + self.model # .to(self.device) + except Exception as e: + self.logger.error(f"Failed to load the model or the tokenizer: {e}") + raise + + def load_model(self): + """Load the model""" + if not self.model or not self.tokenizer: + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + + bnb_config = ( + BitsAndBytesConfig(**self.quantization_config) + if self.quantization_config + else None + ) + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, + quantization_config=bnb_config, + trust_remote_code=True, + ).to(self.device) + + if self.distributed: + self.model = DDP(self.model) + except Exception as error: + self.logger.error(f"Failed to load the model or the tokenizer: {error}") + raise + + def run(self, task: str): + """ + Generate a response based on the prompt text. + + Args: + - task (str): Text to prompt the model. + - max_length (int): Maximum length of the response. + + Returns: + - Generated text (str). + """ + self.load_model() + + max_length = self.max_length + + try: + embeddings = self.model.encode([task], max_length=max_length) + + if self.cos_sim: + print(cos_sim(embeddings[0], embeddings[1])) + else: + return embeddings[0] + except Exception as e: + self.logger.error(f"Failed to generate the text: {e}") + raise + + async def run_async(self, task: str, *args, **kwargs) -> str: + """ + Run the model asynchronously + + Args: + task (str): Task to run. + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Examples: + >>> mpt_instance = MPT('mosaicml/mpt-7b-storywriter', "EleutherAI/gpt-neox-20b", max_tokens=150) + >>> mpt_instance("generate", "Once upon a time in a land far, far away...") + 'Once upon a time in a land far, far away...' + >>> mpt_instance.batch_generate(["In the deep jungles,", "At the heart of the city,"], temperature=0.7) + ['In the deep jungles,', + 'At the heart of the city,'] + >>> mpt_instance.freeze_model() + >>> mpt_instance.unfreeze_model() + + """ + # Wrapping synchronous calls with async + return self.run(task, *args, **kwargs) + + def __call__(self, task: str): + """ + Generate a response based on the prompt text. + + Args: + - task (str): Text to prompt the model. + - max_length (int): Maximum length of the response. + + Returns: + - Generated text (str). + """ + self.load_model() + + max_length = self.max_length + + try: + embeddings = self.model.encode([task], max_length=max_length) + + if self.cos_sim: + print(cos_sim(embeddings[0], embeddings[1])) + else: + return embeddings[0] + except Exception as e: + self.logger.error(f"Failed to generate the text: {e}") + raise + + async def __call_async__(self, task: str, *args, **kwargs) -> str: + """Call the model asynchronously""" "" + return await self.run_async(task, *args, **kwargs) + + def save_model(self, path: str): + """Save the model to a given path""" + self.model.save_pretrained(path) + self.tokenizer.save_pretrained(path) + + def gpu_available(self) -> bool: + """Check if GPU is available""" + return torch.cuda.is_available() + + def memory_consumption(self) -> dict: + """Get the memory consumption of the GPU""" + if self.gpu_available(): + torch.cuda.synchronize() + allocated = torch.cuda.memory_allocated() + reserved = torch.cuda.memory_reserved() + return {"allocated": allocated, "reserved": reserved} + else: + return {"error": "GPU not available"} diff --git a/swarms/embeddings/openai.py b/swarms/models/openai_embeddings.py similarity index 99% rename from swarms/embeddings/openai.py rename to swarms/models/openai_embeddings.py index 230dade9..0aa3473d 100644 --- a/swarms/embeddings/openai.py +++ b/swarms/models/openai_embeddings.py @@ -25,7 +25,7 @@ from tenacity import ( stop_after_attempt, wait_exponential, ) -from swarms.embeddings.base import Embeddings +from swarms.models.embeddings_base import Embeddings def get_from_dict_or_env( diff --git a/swarms/embeddings/pegasus.py b/swarms/models/pegasus.py similarity index 100% rename from swarms/embeddings/pegasus.py rename to swarms/models/pegasus.py diff --git a/swarms/embeddings/simple_ada.py b/swarms/models/simple_ada.py similarity index 99% rename from swarms/embeddings/simple_ada.py rename to swarms/models/simple_ada.py index ba0b4cf7..7eb923b4 100644 --- a/swarms/embeddings/simple_ada.py +++ b/swarms/models/simple_ada.py @@ -1,10 +1,9 @@ import openai from dotenv import load_dotenv +from os import getenv load_dotenv() -from os import getenv - def get_ada_embeddings(text: str, model: str = "text-embedding-ada-002"): """ diff --git a/swarms/models/yarn_mistral.py b/swarms/models/yarn_mistral.py new file mode 100644 index 00000000..ebe107a2 --- /dev/null +++ b/swarms/models/yarn_mistral.py @@ -0,0 +1,265 @@ +import logging + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + + +class YarnMistral128: + """ + A class for running inference on a given model. + + Attributes: + model_id (str): The ID of the model. + device (str): The device to run the model on (either 'cuda' or 'cpu'). + max_length (int): The maximum length of the output sequence. + quantize (bool, optional): Whether to use quantization. Defaults to False. + quantization_config (dict, optional): The configuration for quantization. + verbose (bool, optional): Whether to print verbose logs. Defaults to False. + logger (logging.Logger, optional): The logger to use. Defaults to a basic logger. + + # Usage + ``` + from finetuning_suite import Inference + + model_id = "gpt2-small" + inference = Inference(model_id=model_id) + + prompt_text = "Once upon a time" + generated_text = inference(prompt_text) + print(generated_text) + ``` + """ + + def __init__( + self, + model_id: str = "NousResearch/Yarn-Mistral-7b-128k", + device: str = None, + max_length: int = 500, + quantize: bool = False, + quantization_config: dict = None, + verbose=False, + # logger=None, + distributed=False, + decoding=False, + ): + self.logger = logging.getLogger(__name__) + self.device = ( + device if device else ("cuda" if torch.cuda.is_available() else "cpu") + ) + self.model_id = model_id + self.max_length = max_length + self.verbose = verbose + self.distributed = distributed + self.decoding = decoding + self.model, self.tokenizer = None, None + # self.log = Logging() + + if self.distributed: + assert ( + torch.cuda.device_count() > 1 + ), "You need more than 1 gpu for distributed processing" + + bnb_config = None + if quantize: + if not quantization_config: + quantization_config = { + "load_in_4bit": True, + "bnb_4bit_use_double_quant": True, + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_compute_dtype": torch.bfloat16, + } + bnb_config = BitsAndBytesConfig(**quantization_config) + + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, + quantization_config=bnb_config, + use_flash_attention_2=True, + torch_dtype=torch.bfloat16, + device_map="auto", + trust_remote_code=True, + ) + + self.model # .to(self.device) + except Exception as e: + self.logger.error(f"Failed to load the model or the tokenizer: {e}") + raise + + def load_model(self): + """Load the model""" + if not self.model or not self.tokenizer: + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + + bnb_config = ( + BitsAndBytesConfig(**self.quantization_config) + if self.quantization_config + else None + ) + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, quantization_config=bnb_config + ).to(self.device) + + if self.distributed: + self.model = DDP(self.model) + except Exception as error: + self.logger.error(f"Failed to load the model or the tokenizer: {error}") + raise + + def run(self, prompt_text: str): + """ + Generate a response based on the prompt text. + + Args: + - prompt_text (str): Text to prompt the model. + - max_length (int): Maximum length of the response. + + Returns: + - Generated text (str). + """ + self.load_model() + + max_length = self.max_length + + try: + inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to( + self.device + ) + + # self.log.start() + + if self.decoding: + with torch.no_grad(): + for _ in range(max_length): + output_sequence = [] + + outputs = self.model.generate( + inputs, max_length=len(inputs) + 1, do_sample=True + ) + output_tokens = outputs[0][-1] + output_sequence.append(output_tokens.item()) + + # print token in real-time + print( + self.tokenizer.decode( + [output_tokens], skip_special_tokens=True + ), + end="", + flush=True, + ) + inputs = outputs + else: + with torch.no_grad(): + outputs = self.model.generate( + inputs, max_length=max_length, do_sample=True + ) + + del inputs + return self.tokenizer.decode(outputs[0], skip_special_tokens=True) + except Exception as e: + self.logger.error(f"Failed to generate the text: {e}") + raise + + async def run_async(self, task: str, *args, **kwargs) -> str: + """ + Run the model asynchronously + + Args: + task (str): Task to run. + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Examples: + >>> mpt_instance = MPT('mosaicml/mpt-7b-storywriter', "EleutherAI/gpt-neox-20b", max_tokens=150) + >>> mpt_instance("generate", "Once upon a time in a land far, far away...") + 'Once upon a time in a land far, far away...' + >>> mpt_instance.batch_generate(["In the deep jungles,", "At the heart of the city,"], temperature=0.7) + ['In the deep jungles,', + 'At the heart of the city,'] + >>> mpt_instance.freeze_model() + >>> mpt_instance.unfreeze_model() + + """ + # Wrapping synchronous calls with async + return self.run(task, *args, **kwargs) + + def __call__(self, prompt_text: str): + """ + Generate a response based on the prompt text. + + Args: + - prompt_text (str): Text to prompt the model. + - max_length (int): Maximum length of the response. + + Returns: + - Generated text (str). + """ + self.load_model() + + max_length = self.max_ + + try: + inputs = self.tokenizer.encode(prompt_text, return_tensors="pt").to( + self.device + ) + + # self.log.start() + + if self.decoding: + with torch.no_grad(): + for _ in range(max_length): + output_sequence = [] + + outputs = self.model.generate( + inputs, max_length=len(inputs) + 1, do_sample=True + ) + output_tokens = outputs[0][-1] + output_sequence.append(output_tokens.item()) + + # print token in real-time + print( + self.tokenizer.decode( + [output_tokens], skip_special_tokens=True + ), + end="", + flush=True, + ) + inputs = outputs + else: + with torch.no_grad(): + outputs = self.model.generate( + inputs, max_length=max_length, do_sample=True + ) + + del inputs + + return self.tokenizer.decode(outputs[0], skip_special_tokens=True) + except Exception as e: + self.logger.error(f"Failed to generate the text: {e}") + raise + + async def __call_async__(self, task: str, *args, **kwargs) -> str: + """Call the model asynchronously""" "" + return await self.run_async(task, *args, **kwargs) + + def save_model(self, path: str): + """Save the model to a given path""" + self.model.save_pretrained(path) + self.tokenizer.save_pretrained(path) + + def gpu_available(self) -> bool: + """Check if GPU is available""" + return torch.cuda.is_available() + + def memory_consumption(self) -> dict: + """Get the memory consumption of the GPU""" + if self.gpu_available(): + torch.cuda.synchronize() + allocated = torch.cuda.memory_allocated() + reserved = torch.cuda.memory_reserved() + return {"allocated": allocated, "reserved": reserved} + else: + return {"error": "GPU not available"} diff --git a/swarms/structs/flow.py b/swarms/structs/flow.py index 8d7a09ed..8601b8dd 100644 --- a/swarms/structs/flow.py +++ b/swarms/structs/flow.py @@ -1,9 +1,9 @@ """ TODO: -- Add a retry mechanism -- Add prompt injection letting the agent know it's in a flow, Flow prompt -- Dynamic temperature handling - +- Add tools +- Add open interpreter style conversation +- Add configurable save and restore so the user can restore from previus flows +- Add memory vector database retrieval """ import json @@ -252,7 +252,8 @@ class Flow: History: {response} - """, **kwargs + """, + **kwargs, ) # print(f"Next query: {response}") # break diff --git a/swarms/structs/sequential_workflow.py b/swarms/structs/sequential_workflow.py new file mode 100644 index 00000000..2df95c07 --- /dev/null +++ b/swarms/structs/sequential_workflow.py @@ -0,0 +1,20 @@ +""" +Sequential Workflow + +from swarms.models import OpenAIChat, Mistral +from swarms.structs import SequentialWorkflow + + +llm = OpenAIChat(openai_api_key="") +mistral = Mistral() + +# Max loops will run over the sequential pipeline twice +workflow = SequentialWorkflow(max_loops=2) + +workflow.add("What's the weather in miami", llm) + +workflow.add("Create a report on these metrics", mistral) + +workflow.run() + +""" diff --git a/tests/embeddings/pegasus.py b/tests/embeddings/pegasus.py index d1e901dc..e9632eae 100644 --- a/tests/embeddings/pegasus.py +++ b/tests/embeddings/pegasus.py @@ -1,6 +1,6 @@ import pytest from unittest.mock import patch -from swarms.embeddings.pegasus import PegasusEmbedding +from swarms.models.pegasus import PegasusEmbedding def test_init():