vllm.entrypoints.openai.serving_tokenization

logger module-attribute

logger = init_logger(__name__)

OpenAIServingTokenization

Bases: OpenAIServing

Source code in vllm/entrypoints/openai/serving_tokenization.py
class OpenAIServingTokenization(OpenAIServing):

    def __init__(
        self,
        engine_client: EngineClient,
        model_config: ModelConfig,
        models: OpenAIServingModels,
        *,
        request_logger: Optional[RequestLogger],
        chat_template: Optional[str],
        chat_template_content_format: ChatTemplateContentFormatOption,
    ) -> None:
        super().__init__(engine_client=engine_client,
                         model_config=model_config,
                         models=models,
                         request_logger=request_logger)

        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format

    async def create_tokenize(
        self,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> Union[TokenizeResponse, ErrorResponse]:
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokn-{self._base_request_id(raw_request)}"

        try:
            (
                lora_request,
                prompt_adapter_request,
            ) = self._maybe_get_adapters(request)

            tokenizer = await self.engine_client.get_tokenizer(lora_request)

            if isinstance(request, TokenizeChatRequest):
                tool_dicts = (None if request.tools is None else
                              [tool.model_dump() for tool in request.tools])
                (
                    _,
                    request_prompts,
                    engine_prompts,
                ) = await self._preprocess_chat(
                    request,
                    tokenizer,
                    request.messages,
                    tool_dicts=tool_dicts,
                    chat_template=request.chat_template or self.chat_template,
                    chat_template_content_format=self.chat_template_content_format,
                    add_generation_prompt=request.add_generation_prompt,
                    continue_final_message=request.continue_final_message,
                    chat_template_kwargs=request.chat_template_kwargs,
                    add_special_tokens=request.add_special_tokens,
                )
            else:
                (request_prompts,
                 engine_prompts) = await self._preprocess_completion(
                     request,
                     tokenizer,
                     request.prompt,
                     add_special_tokens=request.add_special_tokens,
                 )
        except (ValueError, TypeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(f"{e} {e.__cause__}")

        input_ids: list[int] = []
        for i, engine_prompt in enumerate(engine_prompts):
            self._log_inputs(request_id,
                             request_prompts[i],
                             params=None,
                             lora_request=lora_request,
                             prompt_adapter_request=prompt_adapter_request)

            # Silently ignore the prompt adapter since it does not affect
            # tokenization (unlike the Embeddings API, where an error is raised)
            if isinstance(engine_prompt,
                          dict) and "prompt_token_ids" in engine_prompt:
                input_ids.extend(engine_prompt["prompt_token_ids"])

        token_strs = None
        if request.return_token_strs:
            token_strs = tokenizer.convert_ids_to_tokens(input_ids)

        return TokenizeResponse(tokens=input_ids,
                                token_strs=token_strs,
                                count=len(input_ids),
                                max_model_len=self.max_model_len)

    async def create_detokenize(
        self,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> Union[DetokenizeResponse, ErrorResponse]:
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokn-{self._base_request_id(raw_request)}"

        (
            lora_request,
            prompt_adapter_request,
        ) = self._maybe_get_adapters(request)

        tokenizer = await self.engine_client.get_tokenizer(lora_request)

        self._log_inputs(request_id,
                         request.tokens,
                         params=None,
                         lora_request=lora_request,
                         prompt_adapter_request=prompt_adapter_request)

        # Silently ignore the prompt adapter since it does not affect
        # tokenization (unlike the Embeddings API, where an error is raised)

        prompt_input = await self._tokenize_prompt_input_async(
            request,
            tokenizer,
            request.tokens,
        )
        input_text = prompt_input["prompt"]

        return DetokenizeResponse(prompt=input_text)
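
These handlers back the server's POST /tokenize and POST /detokenize routes. As a minimal sketch, the tokenize endpoint can be exercised over HTTP for a plain (completion-style) prompt; the host, port, and model name below are placeholders for your own deployment:

# Sketch: tokenize a plain prompt via the /tokenize route served by
# create_tokenize. Assumes `vllm serve <model>` is listening on
# localhost:8000; the model name is a placeholder.
import requests

resp = requests.post(
    "http://localhost:8000/tokenize",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
        "prompt": "Hello, world!",
        "add_special_tokens": True,
    },
)
resp.raise_for_status()
body = resp.json()
print(body["tokens"])         # token IDs (list[int])
print(body["count"])          # == len(body["tokens"])
print(body["max_model_len"])  # the model's context length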

chat_template instance-attribute

chat_template = chat_template

chat_template_content_format instance-attribute

chat_template_content_format: Final = (
    chat_template_content_format
)

__init__

__init__(
    engine_client: EngineClient,
    model_config: ModelConfig,
    models: OpenAIServingModels,
    *,
    request_logger: Optional[RequestLogger],
    chat_template: Optional[str],
    chat_template_content_format: ChatTemplateContentFormatOption,
) -> None
Source code in vllm/entrypoints/openai/serving_tokenization.py
def __init__(
    self,
    engine_client: EngineClient,
    model_config: ModelConfig,
    models: OpenAIServingModels,
    *,
    request_logger: Optional[RequestLogger],
    chat_template: Optional[str],
    chat_template_content_format: ChatTemplateContentFormatOption,
) -> None:
    super().__init__(engine_client=engine_client,
                     model_config=model_config,
                     models=models,
                     request_logger=request_logger)

    self.chat_template = chat_template
    self.chat_template_content_format: Final = chat_template_content_format

create_detokenize async

create_detokenize(
    request: DetokenizeRequest, raw_request: Request
) -> Union[DetokenizeResponse, ErrorResponse]
Source code in vllm/entrypoints/openai/serving_tokenization.py
async def create_detokenize(
    self,
    request: DetokenizeRequest,
    raw_request: Request,
) -> Union[DetokenizeResponse, ErrorResponse]:
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    request_id = f"tokn-{self._base_request_id(raw_request)}"

    (
        lora_request,
        prompt_adapter_request,
    ) = self._maybe_get_adapters(request)

    tokenizer = await self.engine_client.get_tokenizer(lora_request)

    self._log_inputs(request_id,
                     request.tokens,
                     params=None,
                     lora_request=lora_request,
                     prompt_adapter_request=prompt_adapter_request)

    # Silently ignore the prompt adapter since it does not affect
    # tokenization (unlike the Embeddings API, where an error is raised)

    prompt_input = await self._tokenize_prompt_input_async(
        request,
        tokenizer,
        request.tokens,
    )
    input_text = prompt_input["prompt"]

    return DetokenizeResponse(prompt=input_text)
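
As a minimal sketch, this handler maps onto the server's POST /detokenize route; the token IDs below are arbitrary example values, and the host and model name are placeholders:

# Sketch: convert token IDs back to text via the /detokenize route
# served by create_detokenize. Assumes a local vLLM server.
import requests

resp = requests.post(
    "http://localhost:8000/detokenize",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
        "tokens": [9906, 11, 1917, 0],  # example token IDs
    },
)
resp.raise_for_status()
print(resp.json()["prompt"])  # the detokenized prompt text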

create_tokenize async

create_tokenize(
    request: TokenizeRequest, raw_request: Request
) -> Union[TokenizeResponse, ErrorResponse]
Source code in vllm/entrypoints/openai/serving_tokenization.py
async def create_tokenize(
    self,
    request: TokenizeRequest,
    raw_request: Request,
) -> Union[TokenizeResponse, ErrorResponse]:
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    request_id = f"tokn-{self._base_request_id(raw_request)}"

    try:
        (
            lora_request,
            prompt_adapter_request,
        ) = self._maybe_get_adapters(request)

        tokenizer = await self.engine_client.get_tokenizer(lora_request)

        if isinstance(request, TokenizeChatRequest):
            tool_dicts = (None if request.tools is None else
                          [tool.model_dump() for tool in request.tools])
            (
                _,
                request_prompts,
                engine_prompts,
            ) = await self._preprocess_chat(
                request,
                tokenizer,
                request.messages,
                tool_dicts=tool_dicts,
                chat_template=request.chat_template or self.chat_template,
                chat_template_content_format=self.chat_template_content_format,
                add_generation_prompt=request.add_generation_prompt,
                continue_final_message=request.continue_final_message,
                chat_template_kwargs=request.chat_template_kwargs,
                add_special_tokens=request.add_special_tokens,
            )
        else:
            (request_prompts,
             engine_prompts) = await self._preprocess_completion(
                 request,
                 tokenizer,
                 request.prompt,
                 add_special_tokens=request.add_special_tokens,
             )
    except (ValueError, TypeError, jinja2.TemplateError) as e:
        logger.exception("Error in preprocessing prompt inputs")
        return self.create_error_response(f"{e} {e.__cause__}")

    input_ids: list[int] = []
    for i, engine_prompt in enumerate(engine_prompts):
        self._log_inputs(request_id,
                         request_prompts[i],
                         params=None,
                         lora_request=lora_request,
                         prompt_adapter_request=prompt_adapter_request)

        # Silently ignore the prompt adapter since it does not affect
        # tokenization (unlike the Embeddings API, where an error is raised)
        if isinstance(engine_prompt,
                      dict) and "prompt_token_ids" in engine_prompt:
            input_ids.extend(engine_prompt["prompt_token_ids"])

    token_strs = None
    if request.return_token_strs:
        token_strs = tokenizer.convert_ids_to_tokens(input_ids)

    return TokenizeResponse(tokens=input_ids,
                            token_strs=token_strs,
                            count=len(input_ids),
                            max_model_len=self.max_model_len)
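
For the chat-style variant, the handler applies the chat template to the messages before tokenizing, and return_token_strs additionally returns each token's string form. A minimal sketch, again with placeholder host and model name:

# Sketch: tokenize a chat conversation via the /tokenize route.
# The chat template is applied server-side before tokenization.
import requests

resp = requests.post(
    "http://localhost:8000/tokenize",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "add_generation_prompt": True,
        "return_token_strs": True,
    },
)
resp.raise_for_status()
body = resp.json()
for tok_id, tok_str in zip(body["tokens"], body["token_strs"]):
    print(tok_id, tok_str)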