diff --git a/pyproject.toml b/pyproject.toml index 1f443c1d..3ce089f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "swarms" -version = "4.0.2" +version = "4.0.3" description = "Swarms - Pytorch" license = "MIT" authors = ["Kye Gomez "] @@ -21,6 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.10" ] + [tool.poetry.dependencies] python = ">=3.9.1,<3.12" torch = "2.1.1" diff --git a/swarms/__init__.py b/swarms/__init__.py index dd28a374..aaa6cc83 100644 --- a/swarms/__init__.py +++ b/swarms/__init__.py @@ -1,6 +1,9 @@ """ init file for swarms package. """ # from swarms.telemetry.main import Telemetry # noqa: E402, F403 from swarms.telemetry.bootup import bootup # noqa: E402, F403 +from swarms.telemetry.user_utils import ( + get_user_device_data, +) # noqa: E402, F403 bootup() @@ -14,3 +17,5 @@ from swarms.tokenizers import * # noqa: E402, F403, C0413 from swarms.loaders import * # noqa: E402, F403, C0413 from swarms.artifacts import * # noqa: E402, F403, C0413 from swarms.chunkers import * # noqa: E402, F403, C0413 + +get_user_device_data() diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py index 70d4f947..00d9d1f2 100644 --- a/swarms/models/__init__.py +++ b/swarms/models/__init__.py @@ -44,7 +44,8 @@ from swarms.models.timm import TimmModel # noqa: E402 from swarms.models.ultralytics_model import ( UltralyticsModel, ) # noqa: E402 -from swarms.models.vip_llava import VipLlavaMultiModal # noqa: E402 + +# from swarms.models.vip_llava import VipLlavaMultiModal # noqa: E402 from swarms.models.llava import LavaMultiModal # noqa: E402 from swarms.models.qwen import QwenVLMultiModal # noqa: E402 from swarms.models.clipq import CLIPQ # noqa: E402 @@ -72,11 +73,16 @@ from swarms.models.types import ( MultimodalData, ) # noqa: E402 - # 3############ Embedding models from swarms.models.base_embedding_model import BaseEmbeddingModel +##### Utils +from swarms.models.sampling_params import ( + SamplingType, + SamplingParams, +) # noqa: E402 + __all__ = [ "AbstractLLM", "Anthropic", @@ -113,7 +119,7 @@ __all__ = [ "TogetherLLM", "TimmModel", "UltralyticsModel", - "VipLlavaMultiModal", + # "VipLlavaMultiModal", "LavaMultiModal", "QwenVLMultiModal", "CLIPQ", @@ -122,4 +128,6 @@ __all__ = [ "BaseEmbeddingModel", "RoboflowMultiModal", "SegmentAnythingMarkGenerator", + "SamplingType", + "SamplingParams", ] diff --git a/swarms/models/inference_engine.py b/swarms/models/inference_engine.py new file mode 100644 index 00000000..e69de29b diff --git a/swarms/models/model_registry.py b/swarms/models/model_registry.py new file mode 100644 index 00000000..6da04282 --- /dev/null +++ b/swarms/models/model_registry.py @@ -0,0 +1,82 @@ +import pkgutil +import inspect + + +class ModelRegistry: + """ + A registry for storing and querying models. + + Attributes: + models (dict): A dictionary of model names and corresponding model classes. + + Methods: + __init__(): Initializes the ModelRegistry object and retrieves all available models. + _get_all_models(): Retrieves all available models from the models package. + query(text): Queries the models based on the given text and returns a dictionary of matching models. + """ + + def __init__(self): + self.models = self._get_all_models() + + def _get_all_models(self): + """ + Retrieves all available models from the models package. + + Returns: + dict: A dictionary of model names and corresponding model classes. 
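Note: a rough sketch of how the `ModelRegistry` introduced in this file might be exercised once `_get_all_models` scans the real `swarms.models` package; the query string and the commented-out dispatch call are illustrative placeholders, not part of this diff.

```python
from swarms.models.model_registry import ModelRegistry

# Building the registry imports every module under swarms.models, so the
# first construction can be slow.
registry = ModelRegistry()

# query() does a substring match over the registered class names.
matches = registry.query("Embedding")
print(sorted(matches))

# run_model() dispatches by name, but note that the registry stores classes
# and calls .run(...) on the class object itself, so the named entry must be
# usable without explicit instantiation. Name and arguments are placeholders.
# registry.run_model("SomeVisionModel", task="Describe this", img="photo.jpg")
```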
+        """
+        import importlib
+
+        import swarms.models as models_package
+
+        models = {}
+        # Walk every module in the swarms.models package and collect the
+        # classes each one defines.
+        for importer, modname, ispkg in pkgutil.iter_modules(
+            models_package.__path__
+        ):
+            module = importlib.import_module(
+                f"swarms.models.{modname}"
+            )
+            for name, obj in inspect.getmembers(module):
+                if inspect.isclass(obj):
+                    models[name] = obj
+        return models
+
+    def query(self, text):
+        """
+        Queries the models based on the given text and returns a dictionary of matching models.
+
+        Args:
+            text (str): The text to search for in the model names.
+
+        Returns:
+            dict: A dictionary of matching model names and corresponding model classes.
+        """
+        return {
+            name: model
+            for name, model in self.models.items()
+            if text in name
+        }
+
+    def run_model(
+        self, model_name: str, task: str, img: str, *args, **kwargs
+    ):
+        """
+        Runs the specified model for the given task and image.
+
+        Args:
+            model_name (str): The name of the model to run.
+            task (str): The task to perform using the model.
+            img (str): The image to process.
+            *args: Additional positional arguments to pass to the model's run method.
+            **kwargs: Additional keyword arguments to pass to the model's run method.
+
+        Returns:
+            The result of running the model.
+
+        Raises:
+            ValueError: If the specified model is not found in the model registry.
+        """
+        if model_name not in self.models:
+            raise ValueError(f"Model {model_name} not found")
+
+        # Get the model
+        model = self.models[model_name]
+
+        # Run the model
+        return model.run(task, img, *args, **kwargs)
diff --git a/swarms/models/qwen.py b/swarms/models/qwen.py
index 1533b117..b5a4ed1a 100644
--- a/swarms/models/qwen.py
+++ b/swarms/models/qwen.py
@@ -13,17 +13,28 @@ class QwenVLMultiModal(BaseMultiModalModel):
     QwenVLMultiModal is a class that represents a multi-modal model for Qwen chatbot.
     It inherits from the BaseMultiModalModel class.
 
-    Examples:
-    >>> model = QwenVLMultiModal()
-    >>> model.run("Hello, how are you?", "https://example.com/image.jpg")
+    Args:
+        model_name (str): The name of the model to be used.
+        device (str): The device to run the model on.
+        args (tuple): Additional positional arguments.
+        kwargs (dict): Additional keyword arguments.
+        quantize (bool): A flag to indicate whether to quantize the model.
+        return_bounding_boxes (bool): A flag to indicate whether to return bounding boxes for the image.
+
+
+    Examples:
+    >>> qwen = QwenVLMultiModal()
+    >>> response = qwen.run("Hello", "https://example.com/image.jpg")
+    >>> print(response)
     """
 
-    model_name: str = "Qwen/Qwen-VL-Chat"
+    model_name: str = "Qwen/Qwen-VL"
     device: str = "cuda"
     args: tuple = field(default_factory=tuple)
     kwargs: dict = field(default_factory=dict)
     quantize: bool = False
+    return_bounding_boxes: bool = False
 
     def __post_init__(self):
         """
@@ -60,19 +71,44 @@ class QwenVLMultiModal(BaseMultiModalModel):
         and the image associated with the response (if any).
""" try: - query = self.tokenizer.from_list_format( - [ - {"image": img, "text": text}, - ] - ) - - inputs = self.tokenizer(query, return_tensors="pt") - inputs = inputs.to(self.model.device) - pred = self.model.generate(**inputs) - response = self.tokenizer.decode( - pred.cpu()[0], skip_special_tokens=False - ) - return response + if self.return_bounding_boxes: + query = self.tokenizer.from_list_format( + [ + {"image": img, "text": text}, + ] + ) + + inputs = self.tokenizer(query, return_tensors="pt") + inputs = inputs.to(self.model.device) + pred = self.model.generate(**inputs) + response = self.tokenizer.decode( + pred.cpu()[0], skip_special_tokens=False + ) + + image_bb = self.tokenizer.draw_bbox_on_latest_picture( + response + ) + + if image_bb: + image_bb.save("output.jpg") + else: + print("No bounding boxes found in the image.") + + return response, image_bb + else: + query = self.tokenizer.from_list_format( + [ + {"image": img, "text": text}, + ] + ) + + inputs = self.tokenizer(query, return_tensors="pt") + inputs = inputs.to(self.model.device) + pred = self.model.generate(**inputs) + response = self.tokenizer.decode( + pred.cpu()[0], skip_special_tokens=False + ) + return response except Exception as error: print(f"[ERROR]: [QwenVLMultiModal]: {error}") diff --git a/swarms/models/sampling_params.py b/swarms/models/sampling_params.py new file mode 100644 index 00000000..c2fdd121 --- /dev/null +++ b/swarms/models/sampling_params.py @@ -0,0 +1,299 @@ +"""Sampling parameters for text generation.""" +from enum import IntEnum +from functools import cached_property +from typing import Callable, List, Optional, Union + +import torch + +_SAMPLING_EPS = 1e-5 + + +class SamplingType(IntEnum): + GREEDY = 0 + RANDOM = 1 + BEAM = 2 + + +LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor] +"""LogitsProcessor is a function that takes a list of previously generated +tokens and a tensor of the logits for the next token, and returns a modified +tensor of logits to sample from.""" + + +class SamplingParams: + """Sampling parameters for text generation. + + Overall, we follow the sampling parameters from the OpenAI text completion + API (https://platform.openai.com/docs/api-reference/completions/create). + In addition, we support beam search, which is not supported by OpenAI. + + Args: + n: Number of output sequences to return for the given prompt. + best_of: Number of output sequences that are generated from the prompt. + From these `best_of` sequences, the top `n` sequences are returned. + `best_of` must be greater than or equal to `n`. This is treated as + the beam width when `use_beam_search` is True. By default, `best_of` + is set to `n`. + presence_penalty: Float that penalizes new tokens based on whether they + appear in the generated text so far. Values > 0 encourage the model + to use new tokens, while values < 0 encourage the model to repeat + tokens. + frequency_penalty: Float that penalizes new tokens based on their + frequency in the generated text so far. Values > 0 encourage the + model to use new tokens, while values < 0 encourage the model to + repeat tokens. + repetition_penalty: Float that penalizes new tokens based on whether + they appear in the prompt and the generated text so far. Values > 1 + encourage the model to use new tokens, while values < 1 encourage + the model to repeat tokens. + temperature: Float that controls the randomness of the sampling. Lower + values make the model more deterministic, while higher values make + the model more random. 
Zero means greedy sampling. + top_p: Float that controls the cumulative probability of the top tokens + to consider. Must be in (0, 1]. Set to 1 to consider all tokens. + top_k: Integer that controls the number of top tokens to consider. Set + to -1 to consider all tokens. + min_p: Float that represents the minimum probability for a token to be + considered, relative to the probability of the most likely token. + Must be in [0, 1]. Set to 0 to disable this. + use_beam_search: Whether to use beam search instead of sampling. + length_penalty: Float that penalizes sequences based on their length. + Used in beam search. + early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). + stop: List of strings that stop the generation when they are generated. + The returned output will not contain the stop strings. + stop_token_ids: List of tokens that stop the generation when they are + generated. The returned output will contain the stop tokens unless + the stop tokens are special tokens. + include_stop_str_in_output: Whether to include the stop strings in output + text. Defaults to False. + ignore_eos: Whether to ignore the EOS token and continue generating + tokens after the EOS token is generated. + max_tokens: Maximum number of tokens to generate per output sequence. + logprobs: Number of log probabilities to return per output token. + Note that the implementation follows the OpenAI API: The return + result includes the log probabilities on the `logprobs` most likely + tokens, as well the chosen tokens. The API will always return the + log probability of the sampled token, so there may be up to + `logprobs+1` elements in the response. + prompt_logprobs: Number of log probabilities to return per prompt token. + skip_special_tokens: Whether to skip special tokens in the output. + spaces_between_special_tokens: Whether to add spaces between special + tokens in the output. Defaults to True. + logits_processors: List of functions that modify logits based on + previously generated tokens. 
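Note: a small sketch of how these parameters compose, using only behavior visible in this file; the concrete values are illustrative, and the asserts restate what the validators below enforce.

```python
from swarms.models.sampling_params import SamplingParams, SamplingType

# Ordinary nucleus sampling.
sampling = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=256)
assert sampling.sampling_type == SamplingType.RANDOM

# A near-zero temperature collapses to greedy decoding; top_p, top_k and
# min_p are reset to their neutral values by __init__.
greedy = SamplingParams(temperature=0.0, max_tokens=64)
assert greedy.sampling_type == SamplingType.GREEDY
assert greedy.top_k == -1 and greedy.top_p == 1.0

# Beam search requires best_of > 1, temperature 0, top_p 1 and top_k -1.
beam = SamplingParams(
    use_beam_search=True, best_of=4, temperature=0.0, max_tokens=64
)
assert beam.sampling_type == SamplingType.BEAM

print(sampling)  # __repr__ lists every field, which is handy for logging
```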
+ """ + + def __init__( + self, + n: int = 1, + best_of: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repetition_penalty: float = 1.0, + temperature: float = 1.0, + top_p: float = 1.0, + top_k: int = -1, + min_p: float = 0.0, + use_beam_search: bool = False, + length_penalty: float = 1.0, + early_stopping: Union[bool, str] = False, + stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, + include_stop_str_in_output: bool = False, + ignore_eos: bool = False, + max_tokens: Optional[int] = 16, + logprobs: Optional[int] = None, + prompt_logprobs: Optional[int] = None, + skip_special_tokens: bool = True, + spaces_between_special_tokens: bool = True, + logits_processors: Optional[List[LogitsProcessor]] = None, + ) -> None: + self.n = n + self.best_of = best_of if best_of is not None else n + self.presence_penalty = presence_penalty + self.frequency_penalty = frequency_penalty + self.repetition_penalty = repetition_penalty + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k + self.min_p = min_p + self.use_beam_search = use_beam_search + self.length_penalty = length_penalty + self.early_stopping = early_stopping + if stop is None: + self.stop = [] + elif isinstance(stop, str): + self.stop = [stop] + else: + self.stop = list(stop) + if stop_token_ids is None: + self.stop_token_ids = [] + else: + self.stop_token_ids = list(stop_token_ids) + self.ignore_eos = ignore_eos + self.max_tokens = max_tokens + self.logprobs = logprobs + self.prompt_logprobs = prompt_logprobs + self.skip_special_tokens = skip_special_tokens + self.spaces_between_special_tokens = ( + spaces_between_special_tokens + ) + self.logits_processors = logits_processors + self.include_stop_str_in_output = include_stop_str_in_output + self._verify_args() + if self.use_beam_search: + self._verify_beam_search() + else: + self._verify_non_beam_search() + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. + self.top_p = 1.0 + self.top_k = -1 + self.min_p = 0.0 + self._verify_greedy_sampling() + + def _verify_args(self) -> None: + if self.n < 1: + raise ValueError(f"n must be at least 1, got {self.n}.") + if self.best_of < self.n: + raise ValueError( + "best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}." + ) + if not -2.0 <= self.presence_penalty <= 2.0: + raise ValueError( + "presence_penalty must be in [-2, 2], got " + f"{self.presence_penalty}." + ) + if not -2.0 <= self.frequency_penalty <= 2.0: + raise ValueError( + "frequency_penalty must be in [-2, 2], got " + f"{self.frequency_penalty}." + ) + if not 0.0 < self.repetition_penalty <= 2.0: + raise ValueError( + "repetition_penalty must be in (0, 2], got " + f"{self.repetition_penalty}." + ) + if self.temperature < 0.0: + raise ValueError( + "temperature must be non-negative, got" + f" {self.temperature}." + ) + if not 0.0 < self.top_p <= 1.0: + raise ValueError( + f"top_p must be in (0, 1], got {self.top_p}." + ) + if self.top_k < -1 or self.top_k == 0: + raise ValueError( + "top_k must be -1 (disable), or at least 1, " + f"got {self.top_k}." + ) + if not 0.0 <= self.min_p <= 1.0: + raise ValueError( + f"min_p must be in [0, 1], got {self.min_p}." + ) + if self.max_tokens is not None and self.max_tokens < 1: + raise ValueError( + "max_tokens must be at least 1, got" + f" {self.max_tokens}." 
+ ) + if self.logprobs is not None and self.logprobs < 0: + raise ValueError( + f"logprobs must be non-negative, got {self.logprobs}." + ) + if ( + self.prompt_logprobs is not None + and self.prompt_logprobs < 0 + ): + raise ValueError( + "prompt_logprobs must be non-negative, got " + f"{self.prompt_logprobs}." + ) + + def _verify_beam_search(self) -> None: + if self.best_of == 1: + raise ValueError( + "best_of must be greater than 1 when using beam " + f"search. Got {self.best_of}." + ) + if self.temperature > _SAMPLING_EPS: + raise ValueError( + "temperature must be 0 when using beam search." + ) + if self.top_p < 1.0 - _SAMPLING_EPS: + raise ValueError( + "top_p must be 1 when using beam search." + ) + if self.top_k != -1: + raise ValueError( + "top_k must be -1 when using beam search." + ) + if self.early_stopping not in [True, False, "never"]: + raise ValueError( + "early_stopping must be True, False, or 'never', " + f"got {self.early_stopping}." + ) + + def _verify_non_beam_search(self) -> None: + if self.early_stopping is not False: + raise ValueError( + "early_stopping is not effective and must be " + "False when not using beam search." + ) + if ( + self.length_penalty < 1.0 - _SAMPLING_EPS + or self.length_penalty > 1.0 + _SAMPLING_EPS + ): + raise ValueError( + "length_penalty is not effective and must be the " + "default value of 1.0 when not using beam search." + ) + + def _verify_greedy_sampling(self) -> None: + if self.best_of > 1: + raise ValueError( + "best_of must be 1 when using greedy sampling." + f"Got {self.best_of}." + ) + + @cached_property + def sampling_type(self) -> SamplingType: + if self.use_beam_search: + return SamplingType.BEAM + if self.temperature < _SAMPLING_EPS: + return SamplingType.GREEDY + return SamplingType.RANDOM + + def __repr__(self) -> str: + return ( + f"SamplingParams(n={self.n}, " + f"best_of={self.best_of}, " + f"presence_penalty={self.presence_penalty}, " + f"frequency_penalty={self.frequency_penalty}, " + f"repetition_penalty={self.repetition_penalty}, " + f"temperature={self.temperature}, " + f"top_p={self.top_p}, " + f"top_k={self.top_k}, " + f"min_p={self.min_p}, " + f"use_beam_search={self.use_beam_search}, " + f"length_penalty={self.length_penalty}, " + f"early_stopping={self.early_stopping}, " + f"stop={self.stop}, " + f"stop_token_ids={self.stop_token_ids}, " + f"include_stop_str_in_output={self.include_stop_str_in_output}, " + f"ignore_eos={self.ignore_eos}, " + f"max_tokens={self.max_tokens}, " + f"logprobs={self.logprobs}, " + f"prompt_logprobs={self.prompt_logprobs}, " + f"skip_special_tokens={self.skip_special_tokens}, " + "spaces_between_special_tokens=" + f"{self.spaces_between_special_tokens})" + ) diff --git a/swarms/structs/__init__.py b/swarms/structs/__init__.py index 11e9e523..15b9ef1e 100644 --- a/swarms/structs/__init__.py +++ b/swarms/structs/__init__.py @@ -17,8 +17,12 @@ from swarms.structs.recursive_workflow import RecursiveWorkflow from swarms.structs.schemas import ( Artifact, ArtifactUpload, + Step, StepInput, + StepOutput, + StepRequestBody, TaskInput, + TaskRequestBody, ) from swarms.structs.sequential_workflow import SequentialWorkflow from swarms.structs.step import Step @@ -105,4 +109,7 @@ __all__ = [ "sigmoid_swarm", "staircase_swarm", "star_swarm", + "StepOutput", + "StepRequestBody", + "TaskRequestBody", ] diff --git a/swarms/structs/task_tree.py b/swarms/structs/task_tree.py new file mode 100644 index 00000000..ec89d150 --- /dev/null +++ b/swarms/structs/task_tree.py @@ -0,0 +1,80 @@ +# Copyright (c) 
OpenMMLab. All rights reserved. +import torch + + +def continuous_tensor( + inputs: torch.Tensor, seq_length: torch.LongTensor +): + """Convert batched tensor to continuous tensor. + + Args: + inputs (Tensor): batched tensor. + seq_length (Tensor): length of each sequence. + + Return: + Tensor: continuoused tensor. + """ + assert inputs.dim() > 1 + if inputs.size(1) == 1: + return inputs.reshape(1, -1) + + inputs = [inp[:slen] for inp, slen in zip(inputs, seq_length)] + + inputs = torch.cat(inputs).unsqueeze(0) + return inputs + + +def batch_tensor(inputs: torch.Tensor, seq_length: torch.LongTensor): + """Convert continuoused tensor to batched tensor. + + Args: + inputs (Tensor): continuoused tensor. + seq_length (Tensor): length of each sequence. + + Return: + Tensor: batched tensor. + """ + from torch.nn.utils.rnn import pad_sequence + + end_loc = seq_length.cumsum(0) + start_loc = end_loc - seq_length + + inputs = [ + inputs[0, sloc:eloc] for sloc, eloc in zip(start_loc, end_loc) + ] + inputs = pad_sequence(inputs, batch_first=True) + return inputs + + +def page_cache( + paged_cache: torch.Tensor, + batched_cache: torch.Tensor, + cache_length: torch.Tensor, + block_offsets: torch.Tensor, + permute_head: bool = True, +): + """Convert batched cache to paged cache. + + Args: + paged_cache (Tensor): Output paged cache. + batched_cache (Tensor): Input batched cache. + cache_length (Tensor): length of the cache. + block_offsets (Tensor): Offset of each blocks. + """ + assert block_offsets.dim() == 2 + block_size = paged_cache.size(1) + batch_size = batched_cache.size(0) + if permute_head: + batched_cache = batched_cache.permute(0, 2, 1, 3) + + for b_idx in range(batch_size): + cache_len = cache_length[b_idx] + b_cache = batched_cache[b_idx] + block_off = block_offsets[b_idx] + block_off_idx = 0 + for s_start in range(0, cache_len, block_size): + s_end = min(s_start + block_size, cache_len) + s_len = s_end - s_start + b_off = block_off[block_off_idx] + paged_cache[b_off, :s_len] = b_cache[s_start:s_end] + block_off_idx += 1 diff --git a/swarms/utils/dist_utils.py b/swarms/utils/dist_utils.py new file mode 100644 index 00000000..cb766369 --- /dev/null +++ b/swarms/utils/dist_utils.py @@ -0,0 +1,315 @@ +from typing import Callable, Union + +import torch +from torch import Tensor, nn +from torch.distributed._tensor import ( + DeviceMesh, + DTensor, + Replicate, + Shard, + distribute_tensor, +) +from zeta.nn import QuantizedLN + + +try: + from peft.tuners.lora import Linear as LoRALinear +except ImportError: + + class LoRALinear: + pass + + +def try_to_local(tensor: Union[Tensor, DTensor]): + """Try to convert DTensor to Tensor. + + Args: + tensor (Tensor|DTensor): Tensor to convert. + """ + if isinstance(tensor, DTensor): + tensor = tensor.to_local() + return tensor + + +def module_to_local(module: nn.Module): + """convert all DTensor parameters to Tensor parameters in module. + + Args: + module (Module): Module to convert. + """ + for name, mod in module.named_children(): + module_to_local(mod) + + for name, param in module.named_parameters(recurse=False): + module.register_parameter( + name, nn.Parameter(try_to_local(param)) + ) + + for name, buf in module.named_buffers(recurse=False): + module.register_buffer(name, try_to_local(buf)) + + +def rowwise_parallelize_linear( + module: nn.Module, device_mesh: DeviceMesh, to_local: bool = False +) -> None: + """ + This function parallelizes the input :class:`nn.Linear` module in + :class:`RowwiseParallel` style. 
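Note: `continuous_tensor` and `batch_tensor` (added here and again, verbatim, in `swarms/utils/inference_convert_utils.py`) pack a right-padded batch into one flat sequence and back. A minimal round trip with made-up token ids:

```python
import torch

from swarms.structs.task_tree import batch_tensor, continuous_tensor

# Two right-padded sequences with true lengths 3 and 2.
padded = torch.tensor(
    [
        [1, 2, 3, 0],
        [4, 5, 0, 0],
    ]
)
seq_length = torch.tensor([3, 2])

# Pack into a single (1, sum(seq_length)) tensor.
packed = continuous_tensor(padded, seq_length)
assert packed.tolist() == [[1, 2, 3, 4, 5]]

# Unpack back into a right-padded (batch, max_len) tensor.
restored = batch_tensor(packed, seq_length)
assert restored.tolist() == [[1, 2, 3], [4, 5, 0]]
```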
+ + Args: + module (:class:`nn.Module`): + The :class:`nn.Linear` module to be parallelized. + device_mesh (:class:`DeviceMesh`): + Object which describes the mesh topology of devices. + + Returns: + None + """ + for name, param in module.named_parameters(): + dist_spec = ( + [Shard(1)] + if name == "weight" + else [Replicate()] # type: ignore[list-item] + ) + + dist_tensor = distribute_tensor(param, device_mesh, dist_spec) + if to_local: + dist_tensor = try_to_local(dist_tensor) + if name == "bias": + # rowwise linear would add bias more than ones. + dist_tensor /= device_mesh.size() + dist_param = torch.nn.Parameter(dist_tensor) + module.register_parameter(name, dist_param) + + # Weight, bias and scale are registered as buffer in QLinear + for name, buffer in module.named_buffers(): + dist_spec = ( + [Shard(1)] + if name == "weight" + else [Replicate()] # type: ignore[list-item] + ) + + dist_tensor = distribute_tensor( + buffer, device_mesh, dist_spec + ) + if to_local: + dist_tensor = try_to_local(dist_tensor) + if name == "bias": + # rowwise linear would add bias more than ones. + dist_tensor /= device_mesh.size() + module.register_buffer(name, dist_tensor) + + dist_tensor = distribute_tensor( + buffer, device_mesh, dist_spec + ) + if to_local: + dist_tensor = try_to_local(dist_tensor) + module.register_buffer(name, dist_tensor) + + +def rowwise_parallelize_loralinear( + module: LoRALinear, + device_mesh: DeviceMesh, + to_local: bool = False, +) -> None: + """rowwize parallelize lora linear. + + Read S-LoRA for more detail. + """ + rowwise_parallelize_linear( + module.base_layer, device_mesh=device_mesh, to_local=to_local + ) + for mod in module.lora_A.values(): + rowwise_parallelize_linear( + mod, device_mesh=device_mesh, to_local=to_local + ) + for mod in module.lora_B.values(): + colwise_parallelize_linear( + mod, device_mesh=device_mesh, to_local=to_local + ) + module._tp_mode = "rowwise" + + +def rowwise_parallelize_linear_fn( + module: nn.Module, device_mesh: DeviceMesh, to_local: bool = False +) -> None: + """ + This function parallelizes the input :Linear module in + :class:`RowwiseParallel` style. + + Args: + module (:class:`nn.Module`): + The :class:`nn.Linear` module to be parallelized. + device_mesh (:class:`DeviceMesh`): + Object which describes the mesh topology of devices. + + Returns: + None + """ + if isinstance(module, (torch.nn.Linear, QuantizedLN)): + return rowwise_parallelize_linear( + module, device_mesh=device_mesh, to_local=to_local + ) + elif isinstance(module, LoRALinear): + return rowwise_parallelize_loralinear( + module, device_mesh=device_mesh, to_local=to_local + ) + else: + raise TypeError(f"Unsupported module: {type(module)}") + + +def colwise_parallelize_linear( + module: nn.Module, device_mesh: DeviceMesh, to_local: bool = False +) -> None: + """ + This function parallelizes the input :class:`nn.Linear` module in + :class:`ColwiseParallel` style. + + Args: + module (:class:`nn.Module`): + The :class:`nn.Linear` module to be parallelized. + device_mesh (:class:`DeviceMesh`): + Object which describes the mesh topology of devices. 
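Note: the bias division in the row-wise branch above (`dist_tensor /= device_mesh.size()`) can be sanity-checked with single-process arithmetic. In row-wise tensor parallelism each rank multiplies a slice of the input features by its weight shard and the partial results are summed, so a full bias added on every rank would be counted `world_size` times. The shapes and `world_size = 2` below are illustrative.

```python
import torch

torch.manual_seed(0)
world_size = 2

x = torch.randn(5, 8)        # (batch, in_features)
weight = torch.randn(4, 8)   # (out_features, in_features)
bias = torch.randn(4)

full = x @ weight.t() + bias

# Shard(1): split the weight (and the matching input slice) along the
# input-feature dimension, one shard per "rank".
x_shards = x.chunk(world_size, dim=1)
w_shards = weight.chunk(world_size, dim=1)

# Each rank adds bias / world_size so the summed output sees the bias once.
partials = [
    xs @ ws.t() + bias / world_size
    for xs, ws in zip(x_shards, w_shards)
]
assert torch.allclose(sum(partials), full, atol=1e-5)
```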
+ + Returns: + None + """ + + for name, param in module.named_parameters(): + dist_tensor = distribute_tensor( + param, device_mesh, [Shard(0)] + ) + if to_local: + dist_tensor = try_to_local(dist_tensor) + dist_param = torch.nn.Parameter(dist_tensor) + module.register_parameter(name, dist_param) + # Weight, bias and scale are registered as buffer in QLinear + for name, buffer in module.named_buffers(): + dist_tensor = distribute_tensor( + buffer, device_mesh, [Shard(0)] + ) + if to_local: + dist_tensor = try_to_local(dist_tensor) + module.register_buffer(name, dist_tensor) + + +def colwise_parallelize_loralinear( + module: nn.Module, device_mesh: DeviceMesh, to_local: bool = False +) -> None: + """colwise parallelize lora linear.""" + colwise_parallelize_linear( + module.base_layer, device_mesh=device_mesh, to_local=to_local + ) + for mod in module.lora_A.values(): + colwise_parallelize_linear( + mod, device_mesh=device_mesh, to_local=to_local + ) + for mod in module.lora_B.values(): + colwise_parallelize_linear( + mod, device_mesh=device_mesh, to_local=to_local + ) + module._tp_mode = "colwise" + + +def colwise_parallelize_linear_fn( + module: nn.Module, device_mesh: DeviceMesh, to_local: bool = False +) -> None: + """ + This function parallelizes the input :Linear module in + :class:`ColwiseParallel` style. + + Args: + module (:class:`nn.Module`): + The :class:`nn.Linear` module to be parallelized. + device_mesh (:class:`DeviceMesh`): + Object which describes the mesh topology of devices. + + Returns: + None + """ + if isinstance(module, (torch.nn.Linear, QuantizedLN)): + return colwise_parallelize_linear( + module, device_mesh=device_mesh, to_local=to_local + ) + elif isinstance(module, LoRALinear): + return colwise_parallelize_loralinear( + module, device_mesh=device_mesh, to_local=to_local + ) + else: + raise TypeError(f"Unsupported module: {type(module)}") + + +def _partition_module( + mod_name: str, + prefix: str, + module: nn.Module, + device_mesh: DeviceMesh, + func: Callable, +): + """partition module. + + Parameters in module won't be force Replicated. + + Args: + mod_name (str): module name. + prefix (str): Parameter prefix. + module (Module): Module to be partitioned. + device_mesh (DeviceMesh): The device mesh. + func (Callable): partition callback + """ + for name, mod in module.named_children(): + child_name = f"{prefix}{name}" + _partition_module( + child_name, + child_name + ".", + module=mod, + device_mesh=device_mesh, + func=func, + ) + + func(mod_name, module, device_mesh) + + +def partition_module( + module: nn.Module, + device_mesh: DeviceMesh, + func: Callable, + to_local: bool = False, +): + """partition module. + + Parameters in module won't be force Replicated. + + Args: + module (Module): Module to be partitioned. + device_mesh (DeviceMesh): The device mesh. + func (Callable): partition callback. + to_local (bool): Convert all DTensor parameters to Tensor parameters. + """ + _partition_module( + "", "", module=module, device_mesh=device_mesh, func=func + ) + + if to_local: + module_to_local(module) + + +def replicate_module(model: nn.Module, device_mesh: DeviceMesh): + """Replicate all parameters in module. + + Args: + model (Module): Module to perform replicate. + device_mesh (DeviceMesh): The distribution device mesh. 
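Note: a sketch of how `partition_module` might be driven with the dispatch helpers above. Since `_partition_module` invokes the callback as `func(name, module, device_mesh)`, a small adapter picks a parallel style per layer. This assumes an already-initialized process group (for example via `torchrun --nproc_per_node=2`) with one GPU per rank; the model and layer names are illustrative.

```python
import torch
import torch.distributed as dist
from torch import nn
from torch.distributed._tensor import DeviceMesh

from swarms.utils.dist_utils import (
    colwise_parallelize_linear_fn,
    partition_module,
    rowwise_parallelize_linear_fn,
)

# One mesh dimension spanning all ranks of the default process group.
mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()))

mlp = nn.Sequential(
    nn.Linear(512, 1024),  # shard output features (column-wise)
    nn.GELU(),
    nn.Linear(1024, 512),  # shard input features (row-wise)
)


def shard_linear(name: str, module: nn.Module, device_mesh: DeviceMesh):
    """Adapter matching the func(name, module, mesh) callback signature."""
    if not isinstance(module, nn.Linear):
        return
    if name == "0":
        colwise_parallelize_linear_fn(module, device_mesh, to_local=True)
    else:
        rowwise_parallelize_linear_fn(module, device_mesh, to_local=True)


partition_module(mlp, mesh, func=shard_linear, to_local=True)
```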
+ """ + for name, param in model.named_parameters(recurse=False): + param = distribute_tensor( + param, device_mesh=device_mesh, placements=[Replicate()] + ).to_local() + param = nn.Parameter(param) + model.register_parameter(name, param) + + for name, buf in model.named_buffers(recurse=False): + buf = distribute_tensor( + buf, device_mesh=device_mesh, placements=[Replicate()] + ).to_local() + model.register_buffer(name, buf) diff --git a/swarms/utils/inference_convert_utils.py b/swarms/utils/inference_convert_utils.py new file mode 100644 index 00000000..596d222b --- /dev/null +++ b/swarms/utils/inference_convert_utils.py @@ -0,0 +1,79 @@ +import torch + + +def continuous_tensor( + inputs: torch.Tensor, seq_length: torch.LongTensor +): + """Convert batched tensor to continuous tensor. + + Args: + inputs (Tensor): batched tensor. + seq_length (Tensor): length of each sequence. + + Return: + Tensor: continuoused tensor. + """ + assert inputs.dim() > 1 + if inputs.size(1) == 1: + return inputs.reshape(1, -1) + + inputs = [inp[:slen] for inp, slen in zip(inputs, seq_length)] + + inputs = torch.cat(inputs).unsqueeze(0) + return inputs + + +def batch_tensor(inputs: torch.Tensor, seq_length: torch.LongTensor): + """Convert continuoused tensor to batched tensor. + + Args: + inputs (Tensor): continuoused tensor. + seq_length (Tensor): length of each sequence. + + Return: + Tensor: batched tensor. + """ + from torch.nn.utils.rnn import pad_sequence + + end_loc = seq_length.cumsum(0) + start_loc = end_loc - seq_length + + inputs = [ + inputs[0, sloc:eloc] for sloc, eloc in zip(start_loc, end_loc) + ] + inputs = pad_sequence(inputs, batch_first=True) + return inputs + + +def page_cache( + paged_cache: torch.Tensor, + batched_cache: torch.Tensor, + cache_length: torch.Tensor, + block_offsets: torch.Tensor, + permute_head: bool = True, +): + """Convert batched cache to paged cache. + + Args: + paged_cache (Tensor): Output paged cache. + batched_cache (Tensor): Input batched cache. + cache_length (Tensor): length of the cache. + block_offsets (Tensor): Offset of each blocks. 
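Note: a self-contained check of the batched-to-paged copy performed by `page_cache` (the helper appears both here and in `swarms/structs/task_tree.py`). Shapes, lengths and block assignments are illustrative.

```python
import torch

from swarms.utils.inference_convert_utils import page_cache

num_blocks, block_size, num_heads, head_dim = 8, 4, 2, 3

# Destination paged cache: (num_blocks, block_size, heads, head_dim).
paged = torch.zeros(num_blocks, block_size, num_heads, head_dim)

# Source batched cache: (batch, heads, seq_len, head_dim).
batched = torch.randn(2, num_heads, 6, head_dim)

cache_length = torch.tensor([6, 3])             # valid tokens per sequence
block_offsets = torch.tensor([[0, 1], [2, 0]])  # blocks owned by each sequence

page_cache(paged, batched, cache_length, block_offsets)

# Sequence 0 fills block 0 and half of block 1; sequence 1 fills part of block 2.
seq0 = batched[0].permute(1, 0, 2)  # (seq, heads, head_dim)
seq1 = batched[1].permute(1, 0, 2)
assert torch.equal(paged[0], seq0[:4])
assert torch.equal(paged[1, :2], seq0[4:6])
assert torch.equal(paged[2, :3], seq1[:3])
```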
+ """ + assert block_offsets.dim() == 2 + block_size = paged_cache.size(1) + batch_size = batched_cache.size(0) + if permute_head: + batched_cache = batched_cache.permute(0, 2, 1, 3) + + for b_idx in range(batch_size): + cache_len = cache_length[b_idx] + b_cache = batched_cache[b_idx] + block_off = block_offsets[b_idx] + block_off_idx = 0 + for s_start in range(0, cache_len, block_size): + s_end = min(s_start + block_size, cache_len) + s_len = s_end - s_start + b_off = block_off[block_off_idx] + paged_cache[b_off, :s_len] = b_cache[s_start:s_end] + block_off_idx += 1 diff --git a/tests/models/test_cogagent.py b/tests/models/test_cogagent.py deleted file mode 100644 index ac7fec56..00000000 --- a/tests/models/test_cogagent.py +++ /dev/null @@ -1,71 +0,0 @@ -import pytest -from swarms.models.cog_agent import CogAgent -from unittest.mock import MagicMock -from PIL import Image - - -@pytest.fixture -def cogagent_params(): - return { - "model_name": "ZhipuAI/cogagent-chat", - "tokenizer_name": "I-ModelScope/vicuna-7b-v1.5", - "dtype": "torch.bfloat16", - "low_cpu_mem_usage": True, - "load_in_4bit": True, - "trust_remote_code": True, - "device": "cuda", - } - - -@pytest.fixture -def cogagent(cogagent_params): - return CogAgent(**cogagent_params) - - -def test_init(mocker, cogagent_params, cogagent): - mock_model = mocker.patch( - "swarms.models.cog_agent.AutoModelForCausalLM.from_pretrained" - ) - mock_tokenizer = mocker.patch( - "swarms.models.cog_agent.AutoTokenizer.from_pretrained" - ) - - for param, value in cogagent_params.items(): - assert getattr(cogagent, param) == value - - mock_tokenizer.assert_called_once_with( - cogagent_params["tokenizer_name"] - ) - mock_model.assert_called_once_with( - cogagent_params["model_name"], - torch_dtype=cogagent_params["dtype"], - low_cpu_mem_usage=cogagent_params["low_cpu_mem_usage"], - load_in_4bit=cogagent_params["load_in_4bit"], - trust_remote_code=cogagent_params["trust_remote_code"], - ) - - -def test_run(mocker, cogagent): - task = "How are you?" - img = "images/1.jpg" - mock_image = mocker.patch( - "PIL.Image.open", return_value=MagicMock(spec=Image.Image) - ) - cogagent.model.build_conversation_input_ids = MagicMock( - return_value={ - "input_ids": MagicMock(), - "token_type_ids": MagicMock(), - "attention_mask": MagicMock(), - "images": [MagicMock()], - } - ) - cogagent.model.__call__ = MagicMock(return_value="Mocked output") - cogagent.decode = MagicMock(return_value="Mocked response") - - output = cogagent.run(task, img) - - assert output is not None - mock_image.assert_called_once_with(img) - cogagent.model.build_conversation_input_ids.assert_called_once() - cogagent.model.__call__.assert_called_once() - cogagent.decode.assert_called_once() diff --git a/tests/models/test_elevenlab.py b/tests/models/test_elevenlab.py index b28ecb31..0ba975ca 100644 --- a/tests/models/test_elevenlab.py +++ b/tests/models/test_elevenlab.py @@ -9,7 +9,6 @@ from dotenv import load_dotenv load_dotenv() - # Define some test data SAMPLE_TEXT = "Hello, this is a test." 
API_KEY = os.environ.get("ELEVEN_API_KEY") diff --git a/tests/models/test_gpt4_vision_api.py b/tests/models/test_gpt4_vision_api.py index c7758a36..26f60960 100644 --- a/tests/models/test_gpt4_vision_api.py +++ b/tests/models/test_gpt4_vision_api.py @@ -10,7 +10,6 @@ from swarms.models.gpt4_vision_api import GPT4VisionAPI load_dotenv() - custom_api_key = os.environ.get("OPENAI_API_KEY") img = "images/swarms.jpeg" diff --git a/tests/models/test_modeelscope_pipeline.py b/tests/models/test_modeelscope_pipeline.py deleted file mode 100644 index 16da0155..00000000 --- a/tests/models/test_modeelscope_pipeline.py +++ /dev/null @@ -1,39 +0,0 @@ -import pytest -from swarms.models.modelscope_pipeline import ModelScopePipeline -from unittest.mock import MagicMock - - -@pytest.fixture -def pipeline_params(): - return { - "type_task": "text-generation", - "model_name": "gpt2", - } - - -@pytest.fixture -def pipeline_model(pipeline_params): - return ModelScopePipeline(**pipeline_params) - - -def test_init(mocker, pipeline_params, pipeline_model): - mock_pipeline = mocker.patch( - "swarms.models.modelscope_pipeline.pipeline" - ) - - for param, value in pipeline_params.items(): - assert getattr(pipeline_model, param) == value - - mock_pipeline.assert_called_once_with( - pipeline_params["type_task"], - model=pipeline_params["model_name"], - ) - - -def test_run(mocker, pipeline_model): - task = "Generate a 10,000 word blog on health and wellness." - pipeline_model.model = MagicMock(return_value="Mocked output") - - output = pipeline_model.run(task) - - assert output is not None diff --git a/tests/models/test_modelscope_llm.py b/tests/models/test_modelscope_llm.py deleted file mode 100644 index 7e9310b2..00000000 --- a/tests/models/test_modelscope_llm.py +++ /dev/null @@ -1,58 +0,0 @@ -import pytest -from swarms.models.modelscope_llm import ModelScopeAutoModel -from unittest.mock import MagicMock - - -@pytest.fixture -def model_params(): - return { - "model_name": "gpt2", - "tokenizer_name": None, - "device": "cuda", - "device_map": "auto", - "max_new_tokens": 500, - "skip_special_tokens": True, - } - - -@pytest.fixture -def modelscope(model_params): - return ModelScopeAutoModel(**model_params) - - -def test_init(mocker, model_params, modelscope): - mock_model = mocker.patch( - "swarms.models.modelscope_llm.AutoModelForCausalLM.from_pretrained" - ) - mock_tokenizer = mocker.patch( - "swarms.models.modelscope_llm.AutoTokenizer.from_pretrained" - ) - - for param, value in model_params.items(): - assert getattr(modelscope, param) == value - - mock_tokenizer.assert_called_once_with( - model_params["tokenizer_name"] - ) - mock_model.assert_called_once_with( - model_params["model_name"], - device_map=model_params["device_map"], - ) - - -def test_run(mocker, modelscope): - task = "Generate a 10,000 word blog on health and wellness." 
- mocker.patch( - "swarms.models.modelscope_llm.AutoTokenizer.decode", - return_value="Mocked output", - ) - modelscope.model.generate = MagicMock( - return_value=["Mocked token"] - ) - modelscope.tokenizer = MagicMock( - return_value={"input_ids": "Mocked input_ids"} - ) - - output = modelscope.run(task) - - assert output is not None diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py deleted file mode 100644 index 6eec8f27..00000000 --- a/tests/models/test_vllm.py +++ /dev/null @@ -1,141 +0,0 @@ -import pytest -from swarms.models.vllm import vLLM - - -# Fixture for initializing vLLM -@pytest.fixture -def vllm_instance(): - return vLLM() - - -# Test the default initialization of vLLM -def test_vllm_default_init(vllm_instance): - assert isinstance(vllm_instance, vLLM) - assert vllm_instance.model_name == "facebook/opt-13b" - assert vllm_instance.tensor_parallel_size == 4 - assert not vllm_instance.trust_remote_code - assert vllm_instance.revision is None - assert vllm_instance.temperature == 0.5 - assert vllm_instance.top_p == 0.95 - - -# Test custom initialization of vLLM -def test_vllm_custom_init(): - vllm_instance = vLLM( - model_name="custom_model", - tensor_parallel_size=8, - trust_remote_code=True, - revision="123", - temperature=0.7, - top_p=0.9, - ) - assert isinstance(vllm_instance, vLLM) - assert vllm_instance.model_name == "custom_model" - assert vllm_instance.tensor_parallel_size == 8 - assert vllm_instance.trust_remote_code - assert vllm_instance.revision == "123" - assert vllm_instance.temperature == 0.7 - assert vllm_instance.top_p == 0.9 - - -# Test the run method of vLLM -def test_vllm_run(vllm_instance): - task = "Hello, vLLM!" - result = vllm_instance.run(task) - assert isinstance(result, str) - assert len(result) > 0 - - -# Test run method with different temperature and top_p values -@pytest.mark.parametrize( - "temperature, top_p", [(0.2, 0.8), (0.8, 0.2)] -) -def test_vllm_run_with_params(vllm_instance, temperature, top_p): - task = "Temperature and Top-P Test" - result = vllm_instance.run( - task, temperature=temperature, top_p=top_p - ) - assert isinstance(result, str) - assert len(result) > 0 - - -# Test run method with a specific model revision -def test_vllm_run_with_revision(vllm_instance): - task = "Specific Model Revision Test" - result = vllm_instance.run(task, revision="abc123") - assert isinstance(result, str) - assert len(result) > 0 - - -# Test run method with a specific model name -def test_vllm_run_with_custom_model(vllm_instance): - task = "Custom Model Test" - custom_model_name = "my_custom_model" - result = vllm_instance.run(task, model_name=custom_model_name) - assert isinstance(result, str) - assert len(result) > 0 - assert vllm_instance.model_name == custom_model_name - - -# Test run method with invalid task input -def test_vllm_run_invalid_task(vllm_instance): - invalid_task = None - with pytest.raises(ValueError): - vllm_instance.run(invalid_task) - - -# Test run method with a very high temperature value -def test_vllm_run_high_temperature(vllm_instance): - task = "High Temperature Test" - high_temperature = 10.0 - result = vllm_instance.run(task, temperature=high_temperature) - assert isinstance(result, str) - assert len(result) > 0 - - -# Test run method with a very low top_p value -def test_vllm_run_low_top_p(vllm_instance): - task = "Low Top-P Test" - low_top_p = 0.01 - result = vllm_instance.run(task, top_p=low_top_p) - assert isinstance(result, str) - assert len(result) > 0 - - -# Test run method with an empty task -def 
test_vllm_run_empty_task(vllm_instance): - empty_task = "" - result = vllm_instance.run(empty_task) - assert isinstance(result, str) - assert len(result) == 0 - - -# Test initialization with invalid parameters -def test_vllm_invalid_init(): - with pytest.raises(ValueError): - vLLM( - model_name=None, - tensor_parallel_size=-1, - trust_remote_code="invalid", - revision=123, - temperature=-0.1, - top_p=1.1, - ) - - -# Test running vLLM with a large number of parallel heads -def test_vllm_large_parallel_heads(): - vllm_instance = vLLM(tensor_parallel_size=16) - task = "Large Parallel Heads Test" - result = vllm_instance.run(task) - assert isinstance(result, str) - assert len(result) > 0 - - -# Test running vLLM with trust_remote_code set to True -def test_vllm_trust_remote_code(): - vllm_instance = vLLM(trust_remote_code=True) - task = "Trust Remote Code Test" - result = vllm_instance.run(task) - assert isinstance(result, str) - assert len(result) > 0 diff --git a/tests/structs/test_multi_agent_collab.py b/tests/structs/test_multi_agent_collab.py index 05b914b4..475b32b3 100644 --- a/tests/structs/test_multi_agent_collab.py +++ b/tests/structs/test_multi_agent_collab.py @@ -135,7 +135,6 @@ def test_save(collaboration, tmp_path): # Add more tests here... - # Add more parameterized tests for different scenarios... diff --git a/tests/structs/test_swarmnetwork.py b/tests/structs/test_swarmnetwork.py index b4e3593b..9dc6d903 100644 --- a/tests/structs/test_swarmnetwork.py +++ b/tests/structs/test_swarmnetwork.py @@ -2,7 +2,6 @@ from unittest.mock import Mock, patch import pytest - from swarms.structs.agent import Agent from swarms.structs.swarm_net import SwarmNetwork