Skip to content

vllm.transformers_utils.tokenizer_base

TokenizerBase

Bases: ABC

Source code in vllm/transformers_utils/tokenizer_base.py
class TokenizerBase(ABC):
    """Abstract interface that vLLM tokenizer implementations must satisfy.

    Every member below is abstract and raises ``NotImplementedError`` here;
    concrete subclasses provide the actual behavior. The surface mirrors the
    parts of the Hugging Face tokenizer API that vLLM relies on.
    """

    @property
    @abstractmethod
    def all_special_tokens_extended(self) -> list[str]:
        """All special tokens, including extended/added ones."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_special_tokens(self) -> list[str]:
        """All special tokens as strings."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_special_ids(self) -> list[int]:
        """Token ids of all special tokens."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def bos_token_id(self) -> int:
        """Id of the beginning-of-sequence token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def eos_token_id(self) -> int:
        """Id of the end-of-sequence token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def sep_token(self) -> str:
        """The separator token string."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def pad_token(self) -> str:
        """The padding token string."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def is_fast(self) -> bool:
        """Whether this is a "fast" (Rust-backed) tokenizer implementation."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def max_token_id(self) -> int:
        """Largest valid token id."""
        raise NotImplementedError()

    def __len__(self) -> int:
        """Length of the tokenizer is defined as its vocabulary size."""
        return self.vocab_size

    @abstractmethod
    def __call__(
        self,
        text: Union[str, list[str], list[int]],
        text_pair: Optional[str] = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        """Tokenize ``text`` (optionally with ``text_pair``).

        Return type is intentionally unannotated; implementations follow the
        HF-style encoding-output convention — confirm against the subclass.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_vocab(self) -> dict[str, int]:
        """Mapping from token string to token id for the full vocabulary."""
        raise NotImplementedError()

    @abstractmethod
    def get_added_vocab(self) -> dict[str, int]:
        """Mapping from token string to id for tokens added on top of the base vocab."""
        raise NotImplementedError()

    @abstractmethod
    def encode_one(
        self,
        text: str,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ) -> list[int]:
        """Encode a single string to token ids, optionally truncated to ``max_length``."""
        raise NotImplementedError()

    @abstractmethod
    def encode(self,
               text: str,
               truncation: Optional[bool] = None,
               max_length: Optional[int] = None,
               add_special_tokens: Optional[bool] = None) -> list[int]:
        """Encode ``text`` to token ids; ``None`` options defer to implementation defaults."""
        raise NotImplementedError()

    @abstractmethod
    def apply_chat_template(self,
                            messages: list["ChatCompletionMessageParam"],
                            tools: Optional[list[dict[str, Any]]] = None,
                            **kwargs) -> list[int]:
        """Render chat ``messages`` (and optional ``tools``) into token ids."""
        raise NotImplementedError()

    @abstractmethod
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Convert a sequence of token strings into a single decoded string."""
        raise NotImplementedError()

    @abstractmethod
    def decode(self,
               ids: Union[list[int], int],
               skip_special_tokens: bool = True) -> str:
        """Decode token id(s) back to text, skipping special tokens by default."""
        raise NotImplementedError()

    @abstractmethod
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = True,
    ) -> list[str]:
        """Map token ids to their token strings, skipping special tokens by default."""
        raise NotImplementedError()

all_special_ids abstractmethod property

all_special_ids: list[int]

all_special_tokens abstractmethod property

all_special_tokens: list[str]

all_special_tokens_extended abstractmethod property

all_special_tokens_extended: list[str]

bos_token_id abstractmethod property

bos_token_id: int

eos_token_id abstractmethod property

eos_token_id: int

is_fast abstractmethod property

is_fast: bool

max_token_id abstractmethod property

max_token_id: int

pad_token abstractmethod property

pad_token: str

sep_token abstractmethod property

sep_token: str

vocab_size abstractmethod property

vocab_size: int

__call__ abstractmethod

__call__(
    text: Union[str, list[str], list[int]],
    text_pair: Optional[str] = None,
    add_special_tokens: bool = False,
    truncation: bool = False,
    max_length: Optional[int] = None,
)
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def __call__(
    self,
    text: Union[str, list[str], list[int]],
    text_pair: Optional[str] = None,
    add_special_tokens: bool = False,
    truncation: bool = False,
    max_length: Optional[int] = None,
):
    """Tokenize ``text`` (optionally paired with ``text_pair``); abstract stub."""
    raise NotImplementedError()

__len__

__len__() -> int
Source code in vllm/transformers_utils/tokenizer_base.py
def __len__(self) -> int:
    """Length of the tokenizer is defined as its vocabulary size."""
    return self.vocab_size

apply_chat_template abstractmethod

apply_chat_template(
    messages: list[ChatCompletionMessageParam],
    tools: Optional[list[dict[str, Any]]] = None,
    **kwargs,
) -> list[int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def apply_chat_template(self,
                        messages: list["ChatCompletionMessageParam"],
                        tools: Optional[list[dict[str, Any]]] = None,
                        **kwargs) -> list[int]:
    """Render chat ``messages`` (and optional ``tools``) into token ids; abstract stub."""
    raise NotImplementedError()

convert_ids_to_tokens abstractmethod

convert_ids_to_tokens(
    ids: list[int], skip_special_tokens: bool = True
) -> list[str]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def convert_ids_to_tokens(
    self,
    ids: list[int],
    skip_special_tokens: bool = True,
) -> list[str]:
    """Map token ids to token strings, skipping special tokens by default; abstract stub."""
    raise NotImplementedError()

convert_tokens_to_string abstractmethod

convert_tokens_to_string(tokens: list[str]) -> str
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def convert_tokens_to_string(self, tokens: list[str]) -> str:
    """Convert a sequence of token strings into a single decoded string; abstract stub."""
    raise NotImplementedError()

decode abstractmethod

decode(
    ids: Union[list[int], int],
    skip_special_tokens: bool = True,
) -> str
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def decode(self,
           ids: Union[list[int], int],
           skip_special_tokens: bool = True) -> str:
    """Decode token id(s) back to text, skipping special tokens by default; abstract stub."""
    raise NotImplementedError()

encode abstractmethod

encode(
    text: str,
    truncation: Optional[bool] = None,
    max_length: Optional[int] = None,
    add_special_tokens: Optional[bool] = None,
) -> list[int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def encode(self,
           text: str,
           truncation: Optional[bool] = None,
           max_length: Optional[int] = None,
           add_special_tokens: Optional[bool] = None) -> list[int]:
    """Encode ``text`` to token ids; ``None`` options defer to implementation defaults; abstract stub."""
    raise NotImplementedError()

encode_one abstractmethod

encode_one(
    text: str,
    truncation: bool = False,
    max_length: Optional[int] = None,
) -> list[int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def encode_one(
    self,
    text: str,
    truncation: bool = False,
    max_length: Optional[int] = None,
) -> list[int]:
    """Encode a single string to token ids, optionally truncated; abstract stub."""
    raise NotImplementedError()

get_added_vocab abstractmethod

get_added_vocab() -> dict[str, int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def get_added_vocab(self) -> dict[str, int]:
    """Mapping from token string to id for tokens added on top of the base vocab; abstract stub."""
    raise NotImplementedError()

get_vocab abstractmethod

get_vocab() -> dict[str, int]
Source code in vllm/transformers_utils/tokenizer_base.py
@abstractmethod
def get_vocab(self) -> dict[str, int]:
    """Mapping from token string to token id for the full vocabulary; abstract stub."""
    raise NotImplementedError()

TokenizerRegistry

Source code in vllm/transformers_utils/tokenizer_base.py
class TokenizerRegistry:
    """Name-keyed registry of tokenizer classes, imported lazily on demand."""

    # Tokenizer name -> (tokenizer module, tokenizer class)
    REGISTRY: dict[str, tuple[str, str]] = {}

    @staticmethod
    def register(name: str, module: str, class_name: str) -> None:
        """Record that ``name`` resolves to ``class_name`` inside ``module``."""
        TokenizerRegistry.REGISTRY[name] = (module, class_name)

    @staticmethod
    def get_tokenizer(
        tokenizer_name: str,
        *args,
        **kwargs,
    ) -> TokenizerBase:
        """Instantiate the tokenizer registered under ``tokenizer_name``.

        Imports the registered module on first use and forwards ``*args`` /
        ``**kwargs`` to the class's ``from_pretrained`` constructor.

        Raises:
            ValueError: if ``tokenizer_name`` was never registered.
        """
        entry = TokenizerRegistry.REGISTRY.get(tokenizer_name)
        if entry is None:
            raise ValueError(f"Tokenizer {tokenizer_name} not found.")

        module_name, class_name = entry
        # Deferred import keeps unregistered/unused tokenizer deps optional.
        tokenizer_module = importlib.import_module(module_name)
        tokenizer_class = getattr(tokenizer_module, class_name)
        return tokenizer_class.from_pretrained(*args, **kwargs)

REGISTRY class-attribute instance-attribute

REGISTRY: dict[str, tuple[str, str]] = {}

get_tokenizer staticmethod

get_tokenizer(
    tokenizer_name: str, *args, **kwargs
) -> TokenizerBase
Source code in vllm/transformers_utils/tokenizer_base.py
@staticmethod
def get_tokenizer(
    tokenizer_name: str,
    *args,
    **kwargs,
) -> TokenizerBase:
    """Instantiate the registered tokenizer via its ``from_pretrained``; raises ValueError if unknown."""
    tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
    if tokenizer_cls is None:
        raise ValueError(f"Tokenizer {tokenizer_name} not found.")

    # tokenizer_cls is a (module, class_name) pair; import lazily on demand.
    tokenizer_module = importlib.import_module(tokenizer_cls[0])
    class_ = getattr(tokenizer_module, tokenizer_cls[1])
    return class_.from_pretrained(*args, **kwargs)

register staticmethod

register(name: str, module: str, class_name: str) -> None
Source code in vllm/transformers_utils/tokenizer_base.py
@staticmethod
def register(name: str, module: str, class_name: str) -> None:
    """Record that ``name`` resolves to ``class_name`` inside ``module`` (imported lazily later)."""
    TokenizerRegistry.REGISTRY[name] = (module, class_name)