Skip to content

vllm.transformers_utils.tokenizer_base

TokenizerBase

Bases: ABC

Source code in vllm/transformers_utils/tokenizer_base.py
class TokenizerBase(ABC):
    """Abstract interface that vLLM tokenizer implementations must satisfy.

    Every member below is abstract and raises ``NotImplementedError`` here;
    concrete subclasses provide the actual behavior. The surface mirrors the
    parts of the Hugging Face tokenizer API that vLLM relies on.
    """

    @property
    @abstractmethod
    def all_special_tokens_extended(self) -> list[str]:
        """All special tokens, including extended/added ones."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_special_tokens(self) -> list[str]:
        """All special tokens as strings."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_special_ids(self) -> list[int]:
        """Token ids of all special tokens."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def bos_token_id(self) -> int:
        """Id of the beginning-of-sequence token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def eos_token_id(self) -> int:
        """Id of the end-of-sequence token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def sep_token(self) -> str:
        """The separator token string."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def pad_token(self) -> str:
        """The padding token string."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def is_fast(self) -> bool:
        """Whether this is a "fast" (Rust-backed) tokenizer implementation."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def max_token_id(self) -> int:
        """Largest valid token id."""
        raise NotImplementedError()

    def __len__(self) -> int:
        """Length of the tokenizer is defined as its vocabulary size."""
        return self.vocab_size

    @abstractmethod
    def __call__(
        self,
        text: Union[str, list[str], list[int]],
        text_pair: Optional[str] = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        """Tokenize ``text`` (optionally with ``text_pair``).

        Return type is intentionally unannotated; implementations follow the
        HF-style encoding-output convention — confirm against the subclass.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_vocab(self) -> dict[str, int]:
        """Mapping from token string to token id for the full vocabulary."""
        raise NotImplementedError()

    @abstractmethod
    def get_added_vocab(self) -> dict[str, int]:
        """Mapping from token string to id for tokens added on top of the base vocab."""
        raise NotImplementedError()

    @abstractmethod
    def encode_one(
        self,
        text: str,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ) -> list[int]:
        """Encode a single string to token ids, optionally truncated to ``max_length``."""
        raise NotImplementedError()

    @abstractmethod
    def encode(self,
               text: str,
               truncation: Optional[bool] = None,
               max_length: Optional[int] = None,
               add_special_tokens: Optional[bool] = None) -> list[int]:
        """Encode ``text`` to token ids; ``None`` options defer to implementation defaults."""
        raise NotImplementedError()

    @abstractmethod
    def apply_chat_template(self,
                            messages: list["ChatCompletionMessageParam"],
                            tools: Optional[list[dict[str, Any]]] = None,
                            **kwargs) -> list[int]:
        """Render chat ``messages`` (and optional ``tools``) into token ids."""
        raise NotImplementedError()

    @abstractmethod
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Convert a sequence of token strings into a single decoded string."""
        raise NotImplementedError()

    @abstractmethod
    def decode(self,
               ids: Union[list[int], int],
               skip_special_tokens: bool = True) -> str:
        """Decode token id(s) back to text, skipping special tokens by default."""
        raise NotImplementedError()

    @abstractmethod
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = True,
    ) -> list[str]:
        """Map token ids to their token strings, skipping special tokens by default."""
        raise NotImplementedError()

all_special_ids abstractmethod property

all_special_ids: list[int]

all_special_tokens abstractmethod property

all_special_tokens: list[str]

all_special_tokens_extended abstractmethod property

all_special_tokens_extended: list[str]

bos_token_id abstractmethod property

bos_token_id: int

eos_token_id abstractmethod property

eos_token_id: int

is_fast abstractmethod property

is_fast: bool

max_token_id abstractmethod property

max_token_id: int

pad_token abstractmethod property

pad_token: str

sep_token abstractmethod property

sep_token: str

vocab_size abstractmethod property

vocab_size: int

__call__ abstractmethod

__call__(
    text: Union[str, list[str], list[int]],
    text_pair: Optional[str] = None,
    add_special_tokens: bool = False,
    truncation: bool = False,
    max_length: Optional[int] = None,
)
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def __call__(
    self,
    text: Union[str, list[str], list[int]],
    text_pair: Optional[str] = None,
    add_special_tokens: bool = False,
    truncation: bool = False,
    max_length: Optional[int] = None,
):
    """Tokenize ``text`` (optionally paired with ``text_pair``); abstract stub."""
    raise NotImplementedError()

__len__

__len__() -> int
Source code in vllm/transformers_utils/tokenizer_base.py
def __len__(self) -> int:
    """Length of the tokenizer is defined as its vocabulary size."""
    return self.vocab_size

apply_chat_template abstractmethod

apply_chat_template(
    messages: list[ChatCompletionMessageParam],
    tools: Optional[list[dict[str, Any]]] = None,
    **kwargs,
) -> list[int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def apply_chat_template(self,
                        messages: list["ChatCompletionMessageParam"],
                        tools: Optional[list[dict[str, Any]]] = None,
                        **kwargs) -> list[int]:
    """Render chat ``messages`` (and optional ``tools``) into token ids; abstract stub."""
    raise NotImplementedError()

convert_ids_to_tokens abstractmethod

convert_ids_to_tokens(
    ids: list[int], skip_special_tokens: bool = True
) -> list[str]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def convert_ids_to_tokens(
    self,
    ids: list[int],
    skip_special_tokens: bool = True,
) -> list[str]:
    """Map token ids to token strings, skipping special tokens by default; abstract stub."""
    raise NotImplementedError()

convert_tokens_to_string abstractmethod

convert_tokens_to_string(tokens: list[str]) -> str
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def convert_tokens_to_string(self, tokens: list[str]) -> str:
    """Convert a sequence of token strings into a single decoded string; abstract stub."""
    raise NotImplementedError()

decode abstractmethod

decode(
    ids: Union[list[int], int],
    skip_special_tokens: bool = True,
) -> str
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def decode(self,
           ids: Union[list[int], int],
           skip_special_tokens: bool = True) -> str:
    """Decode token id(s) back to text, skipping special tokens by default; abstract stub."""
    raise NotImplementedError()

encode abstractmethod

encode(
    text: str,
    truncation: Optional[bool] = None,
    max_length: Optional[int] = None,
    add_special_tokens: Optional[bool] = None,
) -> list[int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def encode(self,
           text: str,
           truncation: Optional[bool] = None,
           max_length: Optional[int] = None,
           add_special_tokens: Optional[bool] = None) -> list[int]:
    """Encode ``text`` to token ids; ``None`` options defer to implementation defaults; abstract stub."""
    raise NotImplementedError()

encode_one abstractmethod

encode_one(
    text: str,
    truncation: bool = False,
    max_length: Optional[int] = None,
) -> list[int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def encode_one(
    self,
    text: str,
    truncation: bool = False,
    max_length: Optional[int] = None,
) -> list[int]:
    """Encode a single string to token ids, optionally truncated; abstract stub."""
    raise NotImplementedError()

get_added_vocab abstractmethod

get_added_vocab() -> dict[str, int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def get_added_vocab(self) -> dict[str, int]:
    """Mapping from token string to id for tokens added on top of the base vocab; abstract stub."""
    raise NotImplementedError()

get_vocab abstractmethod

get_vocab() -> dict[str, int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def get_vocab(self) -> dict[str, int]:
    """Mapping from token string to token id for the full vocabulary; abstract stub."""
    raise NotImplementedError()

TokenizerRegistry

Source code in vllm/transformers_utils/tokenizer_base.py
class TokenizerRegistry:
    """Name-keyed registry of tokenizer classes, imported lazily on demand."""

    # Tokenizer name -> (tokenizer module, tokenizer class)
    REGISTRY: dict[str, tuple[str, str]] = {}

    @staticmethod
    def register(name: str, module: str, class_name: str) -> None:
        """Record that ``name`` resolves to ``class_name`` inside ``module``."""
        TokenizerRegistry.REGISTRY[name] = (module, class_name)

    @staticmethod
    def get_tokenizer(
        tokenizer_name: str,
        *args,
        **kwargs,
    ) -> TokenizerBase:
        """Instantiate the tokenizer registered under ``tokenizer_name``.

        Imports the registered module on first use and forwards ``*args`` /
        ``**kwargs`` to the class's ``from_pretrained`` constructor.

        Raises:
            ValueError: if ``tokenizer_name`` was never registered.
        """
        entry = TokenizerRegistry.REGISTRY.get(tokenizer_name)
        if entry is None:
            raise ValueError(f"Tokenizer {tokenizer_name} not found.")

        module_name, class_name = entry
        # Deferred import keeps unregistered/unused tokenizer deps optional.
        tokenizer_module = importlib.import_module(module_name)
        tokenizer_class = getattr(tokenizer_module, class_name)
        return tokenizer_class.from_pretrained(*args, **kwargs)

REGISTRY class-attribute instance-attribute

REGISTRY: dict[str, tuple[str, str]] = {}

get_tokenizer staticmethod

get_tokenizer(
    tokenizer_name: str, *args, **kwargs
) -> TokenizerBase
Source code in vllm/transformers_utils/tokenizer_base.py
@staticmethod
def get_tokenizer(
    tokenizer_name: str,
    *args,
    **kwargs,
) -> TokenizerBase:
    """Instantiate the registered tokenizer via its ``from_pretrained``; raises ValueError if unknown."""
    tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
    if tokenizer_cls is None:
        raise ValueError(f"Tokenizer {tokenizer_name} not found.")

    # tokenizer_cls is a (module, class_name) pair; import lazily on demand.
    tokenizer_module = importlib.import_module(tokenizer_cls[0])
    class_ = getattr(tokenizer_module, tokenizer_cls[1])
    return class_.from_pretrained(*args, **kwargs)

register staticmethod

register(name: str, module: str, class_name: str) -> None
Source code in vllm/transformers_utils/tokenizer_base.py
@staticmethod
def register(name: str, module: str, class_name: str) -> None:
    """Record that ``name`` resolves to ``class_name`` inside ``module`` (imported lazily later)."""
    TokenizerRegistry.REGISTRY[name] = (module, class_name)