diff --git a/pyproject.toml b/pyproject.toml
index 8112def8..07bd584e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,46 +29,26 @@ torch = ">=2.1.1,<3.0"
 transformers = "4.39.0"
 asyncio = ">=3.4.3,<4.0"
 einops = "0.7.0"
-google-generativeai = "0.3.1"
-langchain = "0.1.13"
-langchain-core = "0.1.34"
+langchain-core = "0.1.33"
 langchain-community = "0.0.29"
 langchain-experimental = "0.0.55"
-faiss-cpu = "1.7.4"
 backoff = "2.2.1"
-datasets = "*"
-optimum = "1.15.0"
-supervision = "0.19.0"
-opencv-python = "4.9.0.80"
-diffusers = "*"
-anthropic = "0.21.3"
 toml = "*"
 pypdf = "4.1.0"
-accelerate = "*"
-sentencepiece = "0.1.98"
 httpx = "0.24.1"
-tiktoken = "0.5.2"
 ratelimit = "2.2.1"
 loguru = "0.7.2"
-huggingface-hub = "*"
 pydantic = "2.6.4"
 tenacity = "8.2.3"
 Pillow = "10.2.0"
-chromadb = "0.4.24"
 termcolor = "2.2.0"
-torchvision = "0.16.1"
 rich = "13.5.2"
-bitsandbytes = "*"
-sentence-transformers = "*"
-peft = "*"
 psutil = "*"
-timm = "*"
 sentry-sdk = "*"

 [tool.poetry.dev-dependencies]
 black = "23.3.0"
-
 [tool.poetry.group.lint.dependencies]
 ruff = ">=0.0.249,<0.3.5"
 types-toml = "^0.10.8.1"
diff --git a/requirements.txt b/requirements.txt
index 68ad9d8a..072e5c9d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,40 +1,23 @@
-torch==2.1.1
-transformers
-pandas
-langchain==0.1.13
+mkdocs
+mkdocs-material
+mkdocs-glightbox
+torch>=2.1.1,<3.0
+transformers==4.39.0
+asyncio>=3.4.3,<4.0
+einops==0.7.0
 langchain-core==0.1.33
 langchain-community==0.0.29
-langsmith==0.1.17
-langchain-openai==0.0.5
-httpx==0.24.1
-Pillow==9.4.0
-datasets==2.14.5
-pydantic==2.6.4
-huggingface-hub
-requests_mock
-pypdf==4.0.1
-accelerate==0.22.0
-loguru==0.7.2
-optimum
-diffusers
-toml
-tiktoken==0.5.2
-colored
-addict
+langchain-experimental==0.0.55
 backoff==2.2.1
+toml
+pypdf==4.1.0
+httpx==0.24.1
 ratelimit==2.2.1
+loguru==0.7.2
+pydantic==2.6.4
+tenacity==8.2.3
+Pillow==10.2.0
 termcolor==2.2.0
-opencv-python==4.9.0.80
-timm
-torchvision==0.16.1
 rich==13.5.2
-mkdocs
-mkdocs-material
-anthropic==0.2.5
-mkdocs-glightbox
-pre-commit==3.6.2
 psutil
-black
-tenacity
-supervision
 sentry-sdk
\ No newline at end of file
diff --git a/swarms/__init__.py b/swarms/__init__.py
index b9eb1426..db28200e 100644
--- a/swarms/__init__.py
+++ b/swarms/__init__.py
@@ -17,6 +17,6 @@ from swarms.models import *  # noqa: E402, F403
 from swarms.prompts import *  # noqa: E402, F403
 from swarms.structs import *  # noqa: E402, F403
 from swarms.telemetry import *  # noqa: E402, F403
-from swarms.tokenizers import *  # noqa: E402, F403
 from swarms.tools import *  # noqa: E402, F403
 from swarms.utils import *  # noqa: E402, F403
+from swarms.schedulers import *  # noqa: E402, F403
diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py
index 92b0e929..8400073f 100644
--- a/swarms/models/__init__.py
+++ b/swarms/models/__init__.py
@@ -44,7 +44,6 @@ from swarms.models.qwen import QwenVLMultiModal  # noqa: E402
 # from swarms.models.roboflow_model import RoboflowMultiModal
 from swarms.models.sam_supervision import SegmentAnythingMarkGenerator
 from swarms.models.sampling_params import SamplingParams, SamplingType
-from swarms.models.timm import TimmModel  # noqa: E402
 from swarms.models.together import TogetherLLM  # noqa: E402
 from swarms.models.types import (  # noqa: E402
     AudioModality,
diff --git a/swarms/schedulers/__init__.py b/swarms/schedulers/__init__.py
new file mode 100644
index 00000000..803b2278
--- /dev/null
+++ b/swarms/schedulers/__init__.py
@@ -0,0 +1,6 @@
+from swarms.schedulers.agent_process import (
+    AgentProcess,
+    AgentProcessQueue,
+)
+
+__all__ = ["AgentProcess", "AgentProcessQueue"]
diff --git a/swarms/schedulers/agent_process.py b/swarms/schedulers/agent_process.py
new file mode 100644
index 00000000..cd9ca6e3
--- /dev/null
+++ b/swarms/schedulers/agent_process.py
@@ -0,0 +1,115 @@
+from datetime import datetime
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from swarms.structs.omni_agent_types import agents
+from swarms.utils.loguru_logger import logger
+
+
+class AgentProcess(BaseModel):
+    agent_id: int
+    agent_name: str
+    prompt: str
+    response: Optional[str] = None
+    time: str = Field(
+        default_factory=lambda: datetime.now().strftime(
+            "%Y-%m-%d %H:%M:%S"
+        )
+    )
+    priority: int = 0
+    status: str = "Waiting"
+    pid: Optional[int] = None
+
+    def set_pid(self, pid: int):
+        self.pid = pid
+
+    def get_pid(self):
+        return self.pid
+
+    def set_time(self, time: str):
+        self.time = time
+
+    def get_time(self):
+        return self.time
+
+    def set_status(self, status: str):
+        self.status = status
+
+    def get_status(self):
+        return self.status
+
+
+class AgentProcessQueue:
+    """
+    A class representing a queue of agent processes.
+
+    Attributes:
+        MAX_PID (int): The maximum process ID.
+        pid_pool (list): A list representing the availability of process IDs.
+        agent_process_queue (list): A list representing the queue of agent processes.
+
+    Methods:
+        add(agents): Creates and enqueues an AgentProcess for each agent.
+        print(): Prints the details of all agent processes in the queue.
+
+    Private Methods:
+        _get_available_pid(): Returns an available process ID from the pool.
+    """
+
+    def __init__(self, max_pid: int = 1024):
+        self.MAX_PID = max_pid
+        self.pid_pool = [False for _ in range(self.MAX_PID)]
+        self.agent_process_queue = (
+            []
+        )  # Currently use a list to simulate a queue
+
+    def add(self, agents: agents):
+        """
+        Creates an AgentProcess for each agent and adds it to the queue.
+
+        Args:
+            agents: The agents to enqueue as processes.
+
+        Returns:
+            None
+        """
+        for agent in agents:
+            agent_process = AgentProcess(
+                agent_id=agent.id,
+                agent_name=agent.agent_name,
+                prompt=agent.short_memory.return_history_as_string(),
+            )
+            pid = self._get_available_pid()
+            if pid is None:
+                logger.warning("No available PID")
+                return
+            agent_process.set_pid(pid)
+            self.pid_pool[pid] = True  # mark the PID as in use
+            agent_process.set_status("Waiting")
+            self.agent_process_queue.append(agent_process)
+
+    def print(self):
+        """
+        Prints the details of all agent processes in the queue.
+
+        Returns:
+            None
+        """
+        for agent_process in self.agent_process_queue:
+            logger.info(
+                f"| Agent-process ID: {agent_process.get_pid()} |"
+                f" Status: {agent_process.get_status()} |"
+            )
+
+    def _get_available_pid(self):
+        """
+        Returns an available process ID from the pool.
+
+        Returns:
+            int or None: The available process ID, or None if no ID is available.
+ """ + for i, used in enumerate(self.pid_pool): + if not used: + return i + return None diff --git a/swarms/tokenizers/__init__.py b/swarms/tokenizers/__init__.py deleted file mode 100644 index 895c14bc..00000000 --- a/swarms/tokenizers/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# from swarms.tokenizers.anthropic_tokenizer import ( -# AnthropicTokenizer, -# import_optional_dependency, -# ) -from swarms.tokenizers.base_tokenizer import BaseTokenizer -from swarms.tokenizers.openai_tokenizers import OpenAITokenizer -from swarms.tokenizers.r_tokenizers import ( - HuggingFaceTokenizer, - SentencePieceTokenizer, - Tokenizer, -) - - -__all__ = [ - "SentencePieceTokenizer", - "HuggingFaceTokenizer", - "Tokenizer", - "BaseTokenizer", - "OpenAITokenizer", - # "import_optional_dependency", - # "AnthropicTokenizer", -] diff --git a/swarms/tokenizers/anthropic_tokenizer.py b/swarms/tokenizers/anthropic_tokenizer.py deleted file mode 100644 index 77cd07c3..00000000 --- a/swarms/tokenizers/anthropic_tokenizer.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from importlib import import_module -from types import ModuleType - -from anthropic import Anthropic - -from swarms.tokenizers.base_tokenizer import BaseTokenizer - -INSTALL_MAPPING = { - "huggingface_hub": "huggingface-hub", - "pinecone": "pinecone-client", - "opensearchpy": "opensearch-py", -} - - -def import_optional_dependency(name: str) -> ModuleType | None: - """Import an optional dependency. - - If a dependency is missing, an ImportError with a nice message will be raised. - - Args: - name: The module name. - Returns: - The imported module, when found. - None is returned when the package is not found and `errors` is False. - """ - - package_name = INSTALL_MAPPING.get(name) - install_name = package_name if package_name is not None else name - - msg = ( - f"Missing optional dependency: '{install_name}'. " - f"Use poetry or pip to install '{install_name}'." - ) - try: - module = import_module(name) - except ImportError: - raise ImportError(msg) - - return module - - -@dataclass -class AnthropicTokenizer(BaseTokenizer): - """ - Tokenizer class for Anthropic models.] - """ - - max_tokens: int = 500 - client: Anthropic = None - model: str = "claude-2.1" - - def __post_init__(self): - self.DEFAULT_MODEL: str = "claude-2.1" - self.MODEL_PREFIXES_TO_MAX_TOKENS: dict[str, int] = { - "claude-2.1": 200000, - "claude": 100000, - } - self.model = self.model # or self.DEFAULT_MODEL - self.max_tokens = self.max_tokens or self.default_max_tokens() - self.client = ( - self.client - or import_optional_dependency("anthropic").Anthropic() - ) - - def default_max_tokens(self) -> int: - """ - Returns the default maximum number of tokens based on the model prefix. - """ - tokens = next( - v - for k, v in self.MODEL_PREFIXES_TO_MAX_TOKENS.items() - if self.model.startswith(k) - ) - return tokens - - def count_tokens(self, text: str | list) -> int: - """ - Counts the number of tokens in the given text. - - Args: - text: The input text. - - Returns: - The number of tokens in the text. - - Raises: - ValueError: If the input text is not a string. 
- """ - if isinstance(text, str): - return self.client.count_tokens(text) - else: - raise ValueError("Text must be a string.") diff --git a/swarms/tokenizers/base_tokenizer.py b/swarms/tokenizers/base_tokenizer.py deleted file mode 100644 index fd1bc339..00000000 --- a/swarms/tokenizers/base_tokenizer.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field - - -@dataclass -class BaseTokenizer(ABC): - """ - Base class for tokenizers. - - Attributes: - stop_sequences (List[str]): List of stop sequences. - max_tokens (int): Maximum number of tokens. - stop_token (str): Stop token. - """ - - max_tokens: int - stop_token: str = "<|Response|>" - - def __post_init__(self): - self.stop_sequences: list[str] = field( - default_factory=lambda: ["<|Response|>"], - init=False, - ) - - def count_tokens_left(self, text: str | list[dict]) -> int: - """ - Counts the number of tokens left based on the given text. - - Args: - text (Union[str, List[dict]]): The text to count tokens from. - - Returns: - int: The number of tokens left. - """ - diff = self.max_tokens - self.count_tokens(text) - - if diff > 0: - return diff - else: - return 0 - - @abstractmethod - def count_tokens(self, text: str | list[dict]) -> int: - """ - Counts the number of tokens in the given text. - - Args: - text (Union[str, List[dict]]): The text to count tokens from. - - Returns: - int: The number of tokens. - """ - ... diff --git a/swarms/tokenizers/openai_tokenizers.py b/swarms/tokenizers/openai_tokenizers.py deleted file mode 100644 index 9b02943b..00000000 --- a/swarms/tokenizers/openai_tokenizers.py +++ /dev/null @@ -1,181 +0,0 @@ -from __future__ import annotations - -import logging -from dataclasses import dataclass, field - -import tiktoken -from tiktoken import Encoding - -from swarms.tokenizers.base_tokenizer import BaseTokenizer - - -@dataclass -class OpenAITokenizer(BaseTokenizer): - """ - A class representing an OpenAI tokenizer. - - Attributes: - - DEFAULT_OPENAI_GPT_3_COMPLETION_MODEL (str): The default OpenAI GPT-3 completion model. - - DEFAULT_OPENAI_GPT_3_CHAT_MODEL (str): The default OpenAI GPT-3 chat model. - - DEFAULT_OPENAI_GPT_4_MODEL (str): The default OpenAI GPT-4 model. - - DEFAULT_ENCODING (str): The default encoding. - - DEFAULT_MAX_TOKENS (int): The default maximum number of tokens. - - TOKEN_OFFSET (int): The token offset. - - MODEL_PREFIXES_TO_MAX_TOKENS (dict): A dictionary mapping model prefixes to maximum tokens. - - EMBEDDING_MODELS (list): A list of embedding models. - - model (str): The model name. - - Methods: - - __post_init__(): Initializes the OpenAITokenizer object. - - encoding(): Returns the encoding for the model. - - default_max_tokens(): Returns the default maximum number of tokens. - - count_tokens(text, model): Counts the number of tokens in the given text. - - len(text, model): Returns the length of the text in tokens. - """ - - model: str = "gpt-2" - - def __post_init__(self): - """ - Initializes the OpenAITokenizer object. - Sets the default maximum number of tokens. 
- """ - self.max_tokens: int = field( - default_factory=self.default_max_tokens - ) - - self.DEFAULT_OPENAI_GPT_3_COMPLETION_MODEL = ( - "text-davinci-003" - ) - self.DEFAULT_OPENAI_GPT_3_CHAT_MODEL = "gpt-3.5-turbo" - self.DEFAULT_OPENAI_GPT_4_MODEL = "gpt-4" - self.DEFAULT_ENCODING = "cl100k_base" - self.EFAULT_MAX_TOKENS = 2049 - self.TOKEN_OFFSET = 8 - - self.MODEL_PREFIXES_TO_MAX_TOKENS = { - "gpt-4-1106": 128000, - "gpt-4-32k": 32768, - "gpt-4": 8192, - "gpt-3.5-turbo-16k": 16384, - "gpt-3.5-turbo": 4096, - "gpt-35-turbo-16k": 16384, - "gpt-35-turbo": 4096, - "text-davinci-003": 4097, - "text-davinci-002": 4097, - "code-davinci-002": 8001, - "text-embedding-ada-002": 8191, - "text-embedding-ada-001": 2046, - } - - self.EMBEDDING_MODELS = [ - "text-embedding-ada-002", - "text-embedding-ada-001", - ] - - @property - def encoding(self) -> Encoding: - """ - Returns the encoding for the model. - If the model is not found, returns the default encoding. - """ - try: - return tiktoken.encoding_for_model(self.model) - except KeyError: - return tiktoken.get_encoding(self.DEFAULT_ENCODING) - - def default_max_tokens(self) -> int: - """ - Returns the default maximum number of tokens based on the model. - """ - tokens = next( - v - for k, v in self.MODEL_PREFIXES_TO_MAX_TOKENS.items() - if self.model.startswith(k) - ) - offset = ( - 0 - if self.model in self.EMBEDDING_MODELS - else self.TOKEN_OFFSET - ) - - return ( - tokens if tokens else self.DEFAULT_MAX_TOKENS - ) - offset - - def count_tokens( - self, text: str | list[dict], model: str | None = None - ) -> int: - """ - Counts the number of tokens in the given text. - If the text is a list of messages, counts the tokens for each message. - If a model is provided, uses that model for encoding. - """ - if isinstance(text, list): - model = model if model else self.model - - try: - encoding = tiktoken.encoding_for_model(model) - except KeyError: - logging.warning( - "model not found. Using cl100k_base encoding." - ) - encoding = tiktoken.get_encoding("cl100k_base") - - if model in { - "gpt-3.5-turbo-0613", - "gpt-3.5-turbo-16k-0613", - "gpt-4-0314", - "gpt-4-32k-0314", - "gpt-4-0613", - "gpt-4-32k-0613", - }: - tokens_per_message = 3 - tokens_per_name = 1 - elif model == "gpt-3.5-turbo-0301": - tokens_per_message = 4 - tokens_per_name = -1 - elif "gpt-3.5-turbo" in model or "gpt-35-turbo" in model: - logging.info( - "gpt-3.5-turbo may update over time. Returning" - " num tokens assuming gpt-3.5-turbo-0613." - ) - return self.count_tokens( - text, model="gpt-3.5-turbo-0613" - ) - elif "gpt-4" in model: - logging.info( - "gpt-4 may update over time. Returning num tokens" - " assuming gpt-4-0613." - ) - return self.count_tokens(text, model="gpt-4-0613") - else: - raise NotImplementedError( - "token_count() is not implemented for model" - f" {model}. See" - " https://github.com/openai/openai-python/blob/main/chatml.md" - " for information on how messages are converted" - " to tokens." - ) - - num_tokens = 0 - - for message in text: - num_tokens += tokens_per_message - for key, value in message.items(): - num_tokens += len(encoding.encode(value)) - if key == "name": - num_tokens += tokens_per_name - - num_tokens += 3 - - return num_tokens - else: - return len(self.encoding.encode(text)) - - def len(self, text: str | list[dict], model: str | None): - """ - Returns the length of the text in tokens. - If a model is provided, uses that model for encoding. 
- """ - return self.count_tokens(text, model) diff --git a/swarms/tokenizers/r_tokenizers.py b/swarms/tokenizers/r_tokenizers.py deleted file mode 100644 index f807b6ff..00000000 --- a/swarms/tokenizers/r_tokenizers.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os -import os.path as osp -from collections import deque -from typing import List, Optional, Sequence, Union - -import torch - -from swarms.utils.get_logger import get_logger - - -class SentencePieceTokenizer: - """Tokenizer of sentencepiece. - - Args: - model_file (str): the path of the tokenizer model - """ - - def __init__(self, model_file: str): - from sentencepiece import SentencePieceProcessor - - self.model = SentencePieceProcessor(model_file=model_file) - self._prefix_space_tokens = None - # for stop words - self._maybe_decode_bytes: bool = None - # TODO maybe lack a constant.py - self._indexes_tokens_deque = deque(maxlen=10) - self.max_indexes_num = 5 - self.logger = get_logger("lmdeploy") - - @property - def vocab_size(self): - """vocabulary size.""" - return self.model.vocab_size() - - @property - def bos_token_id(self): - """begine of the sentence token id.""" - return self.model.bos_id() - - @property - def eos_token_id(self): - """end of the sentence token id.""" - return self.model.eos_id() - - @property - def prefix_space_tokens(self): - """tokens without prefix space.""" - if self._prefix_space_tokens is None: - vocab = self.model.IdToPiece(list(range(self.vocab_size))) - self._prefix_space_tokens = { - i - for i, tok in enumerate(vocab) - if tok.startswith("▁") - } - return self._prefix_space_tokens - - def _maybe_add_prefix_space(self, tokens, decoded): - """maybe add prefix space for incremental decoding.""" - if ( - tokens - and not decoded.startswith(" ") - and tokens[0] in self.prefix_space_tokens - ): - return " " + decoded - else: - return decoded - - def indexes_containing_token(self, token: str): - """Return all the possible indexes, whose decoding output may contain - the input token.""" - # traversing vocab is time consuming, can not be accelerated with - # multi threads (computation) or multi process (can't pickle tokenizer) - # so, we maintain latest 10 stop words and return directly if matched - for _token, _indexes in self._indexes_tokens_deque: - if token == _token: - return _indexes - if token == " ": # ' ' is special - token = "▁" - vocab = self.model.IdToPiece(list(range(self.vocab_size))) - indexes = [i for i, voc in enumerate(vocab) if token in voc] - if len(indexes) > self.max_indexes_num: - indexes = self.encode(token, add_bos=False)[-1:] - self.logger.warning( - f"There are too many(>{self.max_indexes_num})" - f" possible indexes may decoding {token}, we will use" - f" {indexes} only" - ) - self._indexes_tokens_deque.append((token, indexes)) - return indexes - - def encode(self, s: str, add_bos: bool = True, **kwargs): - """Tokenize a prompt. - - Args: - s (str): a prompt - Returns: - list[int]: token ids - """ - return self.model.Encode(s, add_bos=add_bos, **kwargs) - - def decode(self, t: Sequence[int], offset: Optional[int] = None): - """De-tokenize. - - Args: - t (List[int]): a list of token ids - offset (int): for incrementally decoding. Default to None, which - means not applied. 
- Returns: - str: text of decoding tokens - """ - if isinstance(t, torch.Tensor): - t = t.tolist() - t = t[offset:] - out_string = self.model.Decode(t) - if offset: - out_string = self._maybe_add_prefix_space(t, out_string) - return out_string - - def __call__(self, s: Union[str, Sequence[str]]): - """Tokenize prompts. - - Args: - s (str): prompts - Returns: - list[int]: token ids - """ - import addict - - add_bos = False - add_eos = False - - input_ids = self.model.Encode( - s, add_bos=add_bos, add_eos=add_eos - ) - return addict.Addict(input_ids=input_ids) - - -class HuggingFaceTokenizer: - """Tokenizer of sentencepiece. - - Args: - model_dir (str): the directory of the tokenizer model - """ - - def __init__(self, model_dir: str): - from transformers import AutoTokenizer - - model_file = osp.join(model_dir, "tokenizer.model") - backend_tokenizer_file = osp.join(model_dir, "tokenizer.json") - model_file_exists = osp.exists(model_file) - self.logger = get_logger("lmdeploy") - if ( - not osp.exists(backend_tokenizer_file) - and model_file_exists - ): - self.logger.warning( - "Can not find tokenizer.json. " - "It may take long time to initialize the tokenizer." - ) - self.model = AutoTokenizer.from_pretrained( - model_dir, trust_remote_code=True - ) - self._prefix_space_tokens = None - # save tokenizer.json to reuse - if ( - not osp.exists(backend_tokenizer_file) - and model_file_exists - ): - if hasattr(self.model, "backend_tokenizer"): - if os.access(model_dir, os.W_OK): - self.model.backend_tokenizer.save( - backend_tokenizer_file - ) - - if self.model.eos_token_id is None: - generation_config_file = osp.join( - model_dir, "generation_config.json" - ) - if osp.exists(generation_config_file): - with open(generation_config_file) as f: - cfg = json.load(f) - self.model.eos_token_id = cfg["eos_token_id"] - elif hasattr(self.model, "eod_id"): # Qwen remote - self.model.eos_token_id = self.model.eod_id - - # for stop words - self._maybe_decode_bytes: bool = None - # TODO maybe lack a constant.py - self._indexes_tokens_deque = deque(maxlen=10) - self.max_indexes_num = 5 - self.token2id = {} - - @property - def vocab_size(self): - """vocabulary size.""" - return self.model.vocab_size - - @property - def bos_token_id(self): - """begine of the sentence token id.""" - return self.model.bos_token_id - - @property - def eos_token_id(self): - """end of the sentence token id.""" - return self.model.eos_token_id - - @property - def prefix_space_tokens(self): - """tokens without prefix space.""" - if self._prefix_space_tokens is None: - vocab = self.model.convert_ids_to_tokens( - list(range(self.vocab_size)) - ) - self._prefix_space_tokens = { - i - for i, tok in enumerate(vocab) - if tok.startswith( - "▁" if isinstance(tok, str) else b" " - ) - } - return self._prefix_space_tokens - - def _maybe_add_prefix_space( - self, tokens: List[int], decoded: str - ): - """maybe add prefix space for incremental decoding.""" - if ( - tokens - and not decoded.startswith(" ") - and tokens[0] in self.prefix_space_tokens - ): - return " " + decoded - else: - return decoded - - @property - def maybe_decode_bytes(self): - """Check if self.model.convert_ids_to_tokens return not a str value.""" - if self._maybe_decode_bytes is None: - self._maybe_decode_bytes = False - vocab = self.model.convert_ids_to_tokens( - list(range(self.vocab_size)) - ) - for tok in vocab: - if not isinstance(tok, str): - self._maybe_decode_bytes = True - break - return self._maybe_decode_bytes - - def indexes_containing_token(self, token: str): - 
"""Return all the possible indexes, whose decoding output may contain - the input token.""" - # traversing vocab is time consuming, can not be accelerated with - # multi threads (computation) or multi process (can't pickle tokenizer) - # so, we maintain latest 10 stop words and return directly if matched - for _token, _indexes in self._indexes_tokens_deque: - if token == _token: - return _indexes - - if self.token2id == {}: - # decode is slower than convert_ids_to_tokens - if self.maybe_decode_bytes: - self.token2id = { - self.model.decode(i): i - for i in range(self.vocab_size) - } - else: - self.token2id = { - self.model.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - if token == " ": # ' ' is special - token = "▁" - indexes = [ - i - for _token, i in self.token2id.items() - if token in _token - ] - if len(indexes) > self.max_indexes_num: - indexes = self.encode(token, add_bos=False)[-1:] - self.logger.warning( - f"There are too many(>{self.max_indexes_num})" - f" possible indexes may decoding {token}, we will use" - f" {indexes} only" - ) - self._indexes_tokens_deque.append((token, indexes)) - return indexes - - def encode(self, s: str, add_bos: bool = True, **kwargs): - """Tokenize a prompt. - - Args: - s (str): a prompt - Returns: - list[int]: token ids - """ - encoded = self.model.encode(s, **kwargs) - if not add_bos: - # in the middle of a session - if encoded and encoded[0] == self.bos_token_id: - encoded = encoded[1:] - return encoded - - def decode(self, t: Sequence[int], offset: Optional[int] = None): - """De-tokenize. - - Args: - t (List[int]): a list of token ids - offset (int): for incrementally decoding. Default to None, which - means not applied. - Returns: - str: text of decoding tokens - """ - skip_special_tokens = True - t = t[offset:] - out_string = self.model.decode( - t, skip_special_tokens=skip_special_tokens - ) - if offset: - out_string = self._maybe_add_prefix_space(t, out_string) - return out_string - - def __call__(self, s: Union[str, Sequence[str]]): - """Tokenize prompts. - - Args: - s (str): prompts - Returns: - list[int]: token ids - """ - add_special_tokens = False - return self.model(s, add_special_tokens=add_special_tokens) - - -class Tokenizer: - """Tokenize prompts or de-tokenize tokens into texts. - - Args: - model_file (str): the path of the tokenizer model - """ - - def __init__(self, model_file: str): - if model_file.endswith(".model"): - model_folder = osp.split(model_file)[0] - else: - model_folder = model_file - model_file = osp.join(model_folder, "tokenizer.model") - tokenizer_config_file = osp.join( - model_folder, "tokenizer_config.json" - ) - - model_file_exists = osp.exists(model_file) - config_exists = osp.exists(tokenizer_config_file) - use_hf_model = config_exists or not model_file_exists - self.logger = get_logger("lmdeploy") - if not use_hf_model: - self.model = SentencePieceTokenizer(model_file) - else: - self.model = HuggingFaceTokenizer(model_folder) - - @property - def vocab_size(self): - """vocabulary size.""" - return self.model.vocab_size - - @property - def bos_token_id(self): - """begine of the sentence token id.""" - return self.model.bos_token_id - - @property - def eos_token_id(self): - """end of the sentence token id.""" - return self.model.eos_token_id - - def encode(self, s: str, add_bos: bool = True, **kwargs): - """Tokenize a prompt. 
- - Args: - s (str): a prompt - Returns: - list[int]: token ids - """ - return self.model.encode(s, add_bos, **kwargs) - - def decode(self, t: Sequence[int], offset: Optional[int] = None): - """De-tokenize. - - Args: - t (List[int]): a list of token ids - offset (int): for incrementally decoding. Default to None, which - means not applied. - Returns: - str: text of decoding tokens - """ - return self.model.decode(t, offset) - - def __call__(self, s: Union[str, Sequence[str]]): - """Tokenize prompts. - - Args: - s (str): prompts - Returns: - list[int]: token ids - """ - return self.model(s) - - def indexes_containing_token(self, token): - """Return all the possible indexes, whose decoding output may contain - the input token.""" - encoded = self.encode(token, add_bos=False) - if len(encoded) > 1: - self.logger.warning( - f"The token {token}, its length of indexes" - f" {encoded} is over than 1. Currently, it can not be" - " used as stop words" - ) - return [] - return self.model.indexes_containing_token(token)
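
For context, here is a minimal usage sketch of the new `AgentProcessQueue` added above. It assumes only what the diff shows the scheduler reading from an agent (`id`, `agent_name`, and `short_memory.return_history_as_string()`), so a hypothetical stub agent stands in for a fully configured swarms `Agent`:

```python
from dataclasses import dataclass, field

from swarms.schedulers import AgentProcessQueue


class _StubMemory:
    """Hypothetical stand-in for an agent's short-term memory."""

    def __init__(self, history: str):
        self._history = history

    def return_history_as_string(self) -> str:
        return self._history


@dataclass
class StubAgent:
    """Hypothetical agent exposing only what AgentProcessQueue reads."""

    id: int
    agent_name: str
    short_memory: _StubMemory = field(
        default_factory=lambda: _StubMemory("system: plan the next step")
    )


queue = AgentProcessQueue(max_pid=8)
queue.add([StubAgent(1, "researcher"), StubAgent(2, "writer")])
# Logs one "| Agent-process ID: ... | Status: ... |" line per queued process.
queue.print()
```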
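Separately, removing `swarms.tokenizers` (and dropping `tiktoken` from the pinned dependencies) leaves callers of the old `OpenAITokenizer.count_tokens` without a helper. A rough replacement sketch, assuming `tiktoken` is installed on its own, mirroring the deleted class's fallback to `cl100k_base` for unknown models:

```python
import tiktoken


def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Approximate stand-in for the removed OpenAITokenizer.count_tokens."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Same fallback the deleted tokenizer used for unrecognized models.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


print(count_tokens("Hello, swarm!"))
```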