vllm.v1.structured_output.backend_guidance

logger module-attribute

logger = init_logger(__name__)

GuidanceBackend dataclass

Bases: StructuredOutputBackend

Source code in vllm/v1/structured_output/backend_guidance.py
@dataclass
class GuidanceBackend(StructuredOutputBackend):

    def __post_init__(self):
        self.disable_any_whitespace = \
            self.vllm_config.decoding_config.disable_any_whitespace
        self.disable_additional_properties = \
            self.vllm_config.decoding_config.disable_additional_properties

        self.ll_tokenizer = llguidance_hf.from_tokenizer(
            self.tokenizer, self.vocab_size)

    def compile_grammar(self, request_type: StructuredOutputOptions,
                        grammar_spec: str) -> StructuredOutputGrammar:
        self.serialized_grammar = serialize_guidance_grammar(
            request_type, grammar_spec, self.disable_any_whitespace,
            self.disable_additional_properties)

        ll_matcher = llguidance.LLMatcher(
            self.ll_tokenizer,
            self.serialized_grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )

        r = GuidanceGrammar(
            ll_matcher=ll_matcher,
            ll_tokenizer=self.ll_tokenizer,
            vocab_size=self.vocab_size,
        )

        r.check_error()
        return r

    def allocate_token_bitmask(self, max_num_seqs: int):
        return llguidance_torch.allocate_token_bitmask(
            max_num_seqs, self.ll_tokenizer.vocab_size)

    def destroy(self):
        pass
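
Taken together, the backend is constructed once per tokenizer and then hands out per-request grammars. A minimal sketch of that flow (hypothetical: `vllm_config`, `hf_tokenizer`, and `model_vocab_size` are assumed to already exist, and mask application/sampling are elided):

backend = GuidanceBackend(
    vllm_config=vllm_config,         # assumed: a populated VllmConfig
    tokenizer=hf_tokenizer,          # assumed: the model's tokenizer
    vocab_size=model_vocab_size,     # assumed: the model's vocab size
)
grammar = backend.compile_grammar(
    StructuredOutputOptions.JSON,
    '{"type": "object", "properties": {"name": {"type": "string"}}}',
)
bitmask = backend.allocate_token_bitmask(max_num_seqs=8)
grammar.fill_bitmask(bitmask, idx=0)  # constrain the next sampling step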

__init__

__init__(
    vllm_config: VllmConfig,
    tokenizer: AnyTokenizer,
    vocab_size: int,
) -> None

__post_init__

__post_init__()
Source code in vllm/v1/structured_output/backend_guidance.py
def __post_init__(self):
    self.disable_any_whitespace = \
        self.vllm_config.decoding_config.disable_any_whitespace
    self.disable_additional_properties = \
        self.vllm_config.decoding_config.disable_additional_properties

    self.ll_tokenizer = llguidance_hf.from_tokenizer(
        self.tokenizer, self.vocab_size)

allocate_token_bitmask

allocate_token_bitmask(max_num_seqs: int)
Source code in vllm/v1/structured_output/backend_guidance.py
def allocate_token_bitmask(self, max_num_seqs: int):
    return llguidance_torch.allocate_token_bitmask(
        max_num_seqs, self.ll_tokenizer.vocab_size)
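
The returned tensor uses the packed bitmask layout of llguidance's torch helpers; the shape below is an assumption based on the usual one-int32-word-per-32-tokens packing:

bitmask = backend.allocate_token_bitmask(max_num_seqs=4)
# Assumed layout: shape (4, (vocab_size + 31) // 32), dtype torch.int32,
# where bit j of word i in a row marks token id i * 32 + j as allowed.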

compile_grammar

compile_grammar(
    request_type: StructuredOutputOptions, grammar_spec: str
) -> StructuredOutputGrammar
Source code in vllm/v1/structured_output/backend_guidance.py
def compile_grammar(self, request_type: StructuredOutputOptions,
                    grammar_spec: str) -> StructuredOutputGrammar:
    self.serialized_grammar = serialize_guidance_grammar(
        request_type, grammar_spec, self.disable_any_whitespace,
        self.disable_additional_properties)

    ll_matcher = llguidance.LLMatcher(
        self.ll_tokenizer,
        self.serialized_grammar,
        log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
    )

    r = GuidanceGrammar(
        ll_matcher=ll_matcher,
        ll_tokenizer=self.ll_tokenizer,
        vocab_size=self.vocab_size,
    )

    r.check_error()
    return r
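
The same entry point serves all request types; for example, a regex constraint compiles like this (a sketch; `backend` is assumed to be an initialized `GuidanceBackend`):

grammar = backend.compile_grammar(
    StructuredOutputOptions.REGEX,
    r"\d{4}-\d{2}-\d{2}",   # e.g. restrict output to an ISO-8601 date
)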

destroy

destroy()
Source code in vllm/v1/structured_output/backend_guidance.py
def destroy(self):
    pass

GuidanceGrammar dataclass

Bases: StructuredOutputGrammar

Source code in vllm/v1/structured_output/backend_guidance.py
@dataclass
class GuidanceGrammar(StructuredOutputGrammar):
    ll_matcher: llguidance.LLMatcher
    ll_tokenizer: llguidance.LLTokenizer
    vocab_size: int
    printed_error: bool = False
    terminated: bool = False

    def check_error(self):
        if not self.printed_error:
            err = self.ll_matcher.get_error()
            if err:
                self.printed_error = True
                logger.warning("LLMatcher error: %s", err)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the parser.

        Returns True if the parser was advanced successfully.
        Returns False if the parser failed to advance.
        """

        if self.ll_tokenizer.eos_token in tokens:
            self.terminated = True

        if self.ll_matcher.is_stopped():
            return True

        # TODO - Add jump decoding support in the future:
        # self.ll_matcher.compute_ff_bytes() - this should always work
        # self.ll_matcher.compute_ff_tokens() - this only works for
        #   "canonical" tokenizers
        # For conversion between the two, see
        # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md

        r = self.ll_matcher.consume_tokens(tokens)

        self.check_error()

        return r

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """Checks if the list of tokens are accepted by the parser in sequence.
        Will not advance the parser.

        Returns the prefix list of tokens that are accepted by the parser.
        """
        if len(tokens) == 0:
            return []
        if self.ll_matcher.is_stopped():
            return []

        num_tokens = self.ll_matcher.validate_tokens(tokens)

        self.check_error()

        return tokens[:num_tokens]

    def rollback(self, num_tokens: int) -> None:
        self.ll_matcher.rollback(num_tokens)
        self.check_error()

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        # This automatically produces an EOS-only mask if the matcher is
        # stopped or otherwise in an error state.
        llguidance_torch.fill_next_token_bitmask(self.ll_matcher, bitmask, idx)
        self.check_error()

    def is_terminated(self) -> bool:
        return self.terminated

    def reset(self):
        # TODO: this method may no longer be needed.
        self.ll_matcher.reset()
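
One constrained decode step ties these methods together. A sketch, assuming a caller-supplied `sample` function that applies the mask row to the logits and picks a token id:

import torch

def decode_step(grammar: GuidanceGrammar, bitmask: torch.Tensor,
                logits: torch.Tensor, sample) -> bool:
    """Run one constrained step; returns False once the grammar is done."""
    if grammar.is_terminated():
        return False
    grammar.fill_bitmask(bitmask, idx=0)
    # `sample` is assumed to apply bitmask[0] to `logits` and pick a token.
    next_token = sample(logits, bitmask[0])
    return grammar.accept_tokens("req-0", [next_token])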

ll_matcher instance-attribute

ll_matcher: LLMatcher

ll_tokenizer instance-attribute

ll_tokenizer: LLTokenizer

printed_error class-attribute instance-attribute

printed_error: bool = False

terminated class-attribute instance-attribute

terminated: bool = False

vocab_size instance-attribute

vocab_size: int

__init__

__init__(
    ll_matcher: LLMatcher,
    ll_tokenizer: LLTokenizer,
    vocab_size: int,
    printed_error: bool = False,
    terminated: bool = False,
) -> None

accept_tokens

accept_tokens(request_id: str, tokens: list[int]) -> bool

Accepts a list of tokens and advances the parser.

Returns True if the parser was advanced successfully. Returns False if the parser failed to advance.

Source code in vllm/v1/structured_output/backend_guidance.py
def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
    """Accepts a list of tokens and advances the parser.

    Returns True if the parser was advanced successfully.
    Returns False if the parser failed to advance.
    """

    if self.ll_tokenizer.eos_token in tokens:
        self.terminated = True

    if self.ll_matcher.is_stopped():
        return True

    # TODO - Add jump decoding support in the future:
    # self.ll_matcher.compute_ff_bytes() - this should always work
    # self.ll_matcher.compute_ff_tokens() - this only works for
    #   "canonical" tokenizers
    # For conversion between the two, see
    # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md

    r = self.ll_matcher.consume_tokens(tokens)

    self.check_error()

    return r
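
In practice the return value and the termination flag are checked separately, since a stopped matcher still reports success (a sketch; `grammar` and `sampled_tokens` are assumed):

ok = grammar.accept_tokens("req-0", sampled_tokens)
if not ok:
    # The matcher could not advance: the tokens violate the grammar.
    ...
elif grammar.is_terminated():
    # An EOS token was consumed; this request's structured output is done.
    ...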

check_error

check_error()
Source code in vllm/v1/structured_output/backend_guidance.py
def check_error(self):
    if not self.printed_error:
        err = self.ll_matcher.get_error()
        if err:
            self.printed_error = True
            logger.warning("LLMatcher error: %s", err)

fill_bitmask

fill_bitmask(bitmask: Tensor, idx: int) -> None
Source code in vllm/v1/structured_output/backend_guidance.py
def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
    # This automatically produces an EOS-only mask if the matcher is
    # stopped or otherwise in an error state.
    llguidance_torch.fill_next_token_bitmask(self.ll_matcher, bitmask, idx)
    self.check_error()
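
Applying the filled row to the logits happens elsewhere in vLLM; a self-contained way to do it, assuming the packed int32 layout described above, is:

import torch

def apply_bitmask_row(logits: torch.Tensor, bitmask: torch.Tensor,
                      idx: int) -> None:
    """Mask disallowed tokens in-place for one sequence's logits vector."""
    vocab_size = logits.shape[-1]
    row = bitmask[idx].to(logits.device)               # (num_words,) int32
    shifts = torch.arange(32, device=row.device, dtype=torch.int32)
    bits = (row.unsqueeze(1) >> shifts) & 1            # (num_words, 32)
    allowed = bits.reshape(-1)[:vocab_size].bool()
    logits[~allowed] = float("-inf")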

is_terminated

is_terminated() -> bool
Source code in vllm/v1/structured_output/backend_guidance.py
def is_terminated(self) -> bool:
    return self.terminated

reset

reset()
Source code in vllm/v1/structured_output/backend_guidance.py
def reset(self):
    # TODO: this method may no longer be needed.
    self.ll_matcher.reset()

rollback

rollback(num_tokens: int) -> None
Source code in vllm/v1/structured_output/backend_guidance.py
def rollback(self, num_tokens: int) -> None:
    self.ll_matcher.rollback(num_tokens)
    self.check_error()
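
rollback pairs naturally with speculative decoding: draft tokens are consumed tentatively, then the rejected suffix is undone (a sketch; `draft_tokens` and `num_accepted` are assumed):

grammar.accept_tokens("req-0", draft_tokens)      # tentatively advance
num_rejected = len(draft_tokens) - num_accepted   # verifier's verdict
if num_rejected > 0:
    grammar.rollback(num_rejected)                # restore matcher state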

validate_tokens

validate_tokens(tokens: list[int]) -> list[int]

Checks if the list of tokens is accepted by the parser in sequence. Will not advance the parser.

Returns the prefix list of tokens that are accepted by the parser.

Source code in vllm/v1/structured_output/backend_guidance.py
def validate_tokens(self, tokens: list[int]) -> list[int]:
    """Checks if the list of tokens are accepted by the parser in sequence.
    Will not advance the parser.

    Returns the prefix list of tokens that are accepted by the parser.
    """
    if len(tokens) == 0:
        return []
    if self.ll_matcher.is_stopped():
        return []

    num_tokens = self.ll_matcher.validate_tokens(tokens)

    self.check_error()

    return tokens[:num_tokens]
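
This is the read-only counterpart to accept_tokens, useful for trimming a speculative draft before committing it (a sketch; `draft_tokens` is assumed):

accepted = grammar.validate_tokens(draft_tokens)
if len(accepted) < len(draft_tokens):
    # Only the validated prefix can be kept; the rest would break the grammar.
    draft_tokens = accepted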

_walk_json_for_additional_properties

_walk_json_for_additional_properties(data: object)
Source code in vllm/v1/structured_output/backend_guidance.py
def _walk_json_for_additional_properties(data: object):
    if isinstance(data, dict):
        for value in data.values():
            _walk_json_for_additional_properties(value)
        if 'additionalProperties' not in data and \
            ('properties' in data or 'patternProperties' in data):
            data['additionalProperties'] = False
    elif isinstance(data, list):
        for item in data:
            _walk_json_for_additional_properties(item)

process_for_additional_properties

process_for_additional_properties(
    guide_json: Union[str, dict[str, Any]],
) -> dict[str, Any]
Source code in vllm/v1/structured_output/backend_guidance.py
def process_for_additional_properties(
        guide_json: Union[str, dict[str, Any]]) -> dict[str, Any]:
    if isinstance(guide_json, str):
        guide_json_obj = json.loads(guide_json)
    else:
        # copy for modifications
        guide_json_obj = copy.deepcopy(guide_json)
    _walk_json_for_additional_properties(guide_json_obj)
    return guide_json_obj
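
The effect is easiest to see on a small schema: every (nested) object that declares `properties` or `patternProperties` but leaves `additionalProperties` unset gets it pinned to `False`, while the input itself is left untouched:

schema = {"type": "object", "properties": {"a": {"type": "integer"}}}
closed = process_for_additional_properties(schema)
assert closed["additionalProperties"] is False
assert "additionalProperties" not in schema   # the input was deep-copied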

serialize_guidance_grammar

serialize_guidance_grammar(
    request_type: StructuredOutputOptions,
    grammar_spec: Union[str, dict[str, Any]],
    disable_any_whitespace: bool = False,
    disable_additional_properties: bool = False,
) -> str
Source code in vllm/v1/structured_output/backend_guidance.py
def serialize_guidance_grammar(
    request_type: StructuredOutputOptions,
    grammar_spec: Union[str, dict[str, Any]],
    disable_any_whitespace: bool = False,
    disable_additional_properties: bool = False,
) -> str:

    def _process_schema(grammar_spec: Union[str, dict[str, Any]]) -> str:
        if disable_additional_properties:
            grammar_spec = process_for_additional_properties(grammar_spec)
        return llguidance.LLMatcher.grammar_from_json_schema(
            grammar_spec,
            defaults={
                "whitespace_flexible": not disable_any_whitespace,
            })

    if request_type == StructuredOutputOptions.JSON:
        return _process_schema(grammar_spec)
    elif request_type == StructuredOutputOptions.JSON_OBJECT:
        return llguidance.LLMatcher.grammar_from_json_schema(
            '{"type": "object"}',
            defaults={
                "whitespace_flexible": not disable_any_whitespace,
            })
    else:
        if request_type == StructuredOutputOptions.REGEX:
            tp = "regex"
        elif request_type == StructuredOutputOptions.GRAMMAR:
            tp = "grammar"
        elif request_type == StructuredOutputOptions.CHOICE:
            tp = "choice"
        elif request_type == StructuredOutputOptions.STRUCTURAL_TAG:
            if isinstance(grammar_spec, str):
                s_tag = json.loads(grammar_spec)
            else:
                s_tag = grammar_spec
            triggers: list[str] = s_tag["triggers"]
            tags: list[llguidance.StructTag] = []
            for s in s_tag["structures"]:
                begin: str = s["begin"]
                trig = next((t for t in triggers if begin.startswith(t)), None)
                if trig is None:
                    raise ValueError(
                        f"Trigger {begin} not found in triggers {triggers}")
                tags.append(
                    llguidance.StructTag(
                        trigger=trig,
                        begin=s["begin"],
                        grammar=_process_schema(s["schema"]),
                        end=s["end"],
                    ))
            if not tags:
                raise ValueError(
                    "No structural tags found in the grammar spec.")
            return llguidance.StructTag.to_grammar(tags)
        else:
            logger.error("Validation should have already occurred. "
                         "Please file an issue.")
            raise ValueError("grammar is not of valid supported types. "
                             f"({request_type!s})")
        return llguidance.grammar_from(tp, grammar_spec)
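
As a sketch of the JSON path (schema dicts are accepted directly, as `_process_schema` shows; `ll_tokenizer` is an assumed `LLTokenizer`):

grammar_str = serialize_guidance_grammar(
    StructuredOutputOptions.JSON,
    {"type": "array", "items": {"type": "integer"}},
    disable_any_whitespace=True,         # compact output, no free whitespace
    disable_additional_properties=True,  # close all object schemas
)
matcher = llguidance.LLMatcher(ll_tokenizer, grammar_str)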

validate_guidance_grammar

validate_guidance_grammar(
    sampling_params: SamplingParams,
    tokenizer: Optional[LLTokenizer] = None,
) -> None
Source code in vllm/v1/structured_output/backend_guidance.py
def validate_guidance_grammar(
        sampling_params: SamplingParams,
        tokenizer: Optional[llguidance.LLTokenizer] = None) -> None:
    tp, grm = get_structured_output_key(sampling_params)
    guidance_grm = serialize_guidance_grammar(tp, grm)
    err = llguidance.LLMatcher.validate_grammar(guidance_grm, tokenizer)
    if err:
        raise ValueError(f"Grammar error: {err}")