vllm.model_executor.guided_decoding.outlines_decoding

JSON_GRAMMAR module-attribute

JSON_GRAMMAR = """
?start: object | array

?value: object
      | array
      | UNESCAPED_STRING
      | SIGNED_NUMBER      -> number
      | "true"             -> true
      | "false"            -> false
      | "null"             -> null

array  : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair   : UNESCAPED_STRING ":" value

%import common.UNESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""
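The grammar is written in Lark's EBNF dialect and is handed to CFGLogitsProcessor whenever json_object is requested (see _get_guide_and_mode below). A minimal sketch of exercising it directly, assuming the lark package is installed; lark's bundled common grammar does not ship an UNESCAPED_STRING terminal (outlines supplies its own), so the sketch substitutes ESCAPED_STRING purely for demonstration:

from lark import Lark

from vllm.model_executor.guided_decoding.outlines_decoding import JSON_GRAMMAR

# lark's common grammar has no UNESCAPED_STRING terminal; swap in
# ESCAPED_STRING so the grammar is self-contained for this demo.
demo_grammar = JSON_GRAMMAR.replace("UNESCAPED_STRING", "ESCAPED_STRING")
parser = Lark(demo_grammar, start="start")

tree = parser.parse('{"name": "vllm", "tags": ["json", 1, true, null]}')
print(tree.pretty())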

_MAX_THREADPOOL_WORKERS module-attribute

_MAX_THREADPOOL_WORKERS = 16

global_thread_pool module-attribute

global_thread_pool = None

GuidedDecodingMode

Bases: Enum

Source code in vllm/model_executor/guided_decoding/outlines_decoding.py
class GuidedDecodingMode(Enum):
    JSON = "json"
    REGEX = "regex"
    CHOICE = "choice"
    GRAMMAR = "grammar"

CHOICE class-attribute instance-attribute

CHOICE = 'choice'

GRAMMAR class-attribute instance-attribute

GRAMMAR = 'grammar'

JSON class-attribute instance-attribute

JSON = 'json'

REGEX class-attribute instance-attribute

REGEX = 'regex'

_get_guide_and_mode

_get_guide_and_mode(
    guided_params: GuidedDecodingParams,
) -> Union[
    tuple[str, GuidedDecodingMode], tuple[None, None]
]
Source code in vllm/model_executor/guided_decoding/outlines_decoding.py
def _get_guide_and_mode(
    guided_params: GuidedDecodingParams
) -> Union[tuple[str, GuidedDecodingMode], tuple[None, None]]:
    if guided_params.json:
        if isinstance(guided_params.json, dict):
            # turn dict into hashable string
            json = json_dumps(guided_params.json)
        else:
            json = guided_params.json
        return json, GuidedDecodingMode.JSON
    elif guided_params.regex:
        return guided_params.regex, GuidedDecodingMode.REGEX
    elif guided_params.choice:
        # choice just uses regex
        choices = [
            regex_escape(str(choice)) for choice in guided_params.choice
        ]
        choices_regex = "(" + "|".join(choices) + ")"
        return choices_regex, GuidedDecodingMode.CHOICE
    elif guided_params.grammar:
        return guided_params.grammar, GuidedDecodingMode.GRAMMAR
    elif guided_params.json_object:
        return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
    else:
        return None, None
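A quick sketch of the mapping, assuming GuidedDecodingParams is importable from vllm.sampling_params as in current vLLM releases:

from vllm.sampling_params import GuidedDecodingParams
from vllm.model_executor.guided_decoding.outlines_decoding import (
    GuidedDecodingMode, _get_guide_and_mode)

# choice is lowered to an escaped alternation regex
guide, mode = _get_guide_and_mode(GuidedDecodingParams(choice=["yes", "no"]))
assert (guide, mode) == ("(yes|no)", GuidedDecodingMode.CHOICE)

# a dict schema is serialized to a string so it can act as a hashable cache key
guide, mode = _get_guide_and_mode(GuidedDecodingParams(json={"type": "object"}))
assert mode is GuidedDecodingMode.JSON

# with no guided options set, the helper signals "nothing to do"
assert _get_guide_and_mode(GuidedDecodingParams()) == (None, None)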

_get_logits_processor

_get_logits_processor(
    guide: str,
    tokenizer: PreTrainedTokenizerBase,
    mode: GuidedDecodingMode,
    whitespace_pattern: Union[str, None],
    reasoner: Optional[ReasoningParser],
) -> Union[
    JSONLogitsProcessor,
    RegexLogitsProcessor,
    CFGLogitsProcessor,
]
Source code in vllm/model_executor/guided_decoding/outlines_decoding.py
def _get_logits_processor(
    guide: str,
    tokenizer: PreTrainedTokenizerBase,
    mode: GuidedDecodingMode,
    whitespace_pattern: Union[str, None],
    reasoner: Optional[ReasoningParser],
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
    if mode == GuidedDecodingMode.JSON:
        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern,
                                   reasoner)
    elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
        return RegexLogitsProcessor(guide, tokenizer, reasoner)
    elif mode == GuidedDecodingMode.GRAMMAR:
        return CFGLogitsProcessor(guide, tokenizer, reasoner)
    else:
        raise ValueError(f"Unknown guided decoding mode {mode}")
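For illustration, a hedged sketch that builds a choice-style processor against a Hugging Face tokenizer ("gpt2" is only a stand-in; any PreTrainedTokenizerBase works). Compiling the underlying FSM can take noticeable time on the first call for a given (guide, tokenizer) pair:

from transformers import AutoTokenizer

from vllm.model_executor.guided_decoding.outlines_decoding import (
    GuidedDecodingMode, _get_logits_processor)

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer

# CHOICE shares the regex path, so this returns a RegexLogitsProcessor.
processor = _get_logits_processor(
    guide="(yes|no)",
    tokenizer=tokenizer,
    mode=GuidedDecodingMode.CHOICE,
    whitespace_pattern=None,
    reasoner=None,
)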

get_local_outlines_guided_decoding_logits_processor

get_local_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams,
    tokenizer: PreTrainedTokenizerBase,
    reasoner: Optional[ReasoningParser],
) -> Union[
    JSONLogitsProcessor,
    RegexLogitsProcessor,
    CFGLogitsProcessor,
    None,
]

Given an OpenAI-compatible request, check for guided decoding parameters and get the necessary logits processor for the given guide. We cache logit processors by (guide, tokenizer), and on cache hit we make a shallow copy to reuse the same underlying FSM.

Source code in vllm/model_executor/guided_decoding/outlines_decoding.py
def get_local_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams,
    tokenizer: PreTrainedTokenizerBase,
    reasoner: Optional[ReasoningParser],
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
           None]:
    """
    Given an OpenAI-compatible request, check for guided decoding parameters
    and get the necessary logits processor for the given guide.
    We cache logit processors by (guide, tokenizer), and on cache hit
    we make a shallow copy to reuse the same underlying FSM.
    """
    guide, mode = _get_guide_and_mode(guided_params)
    if not guide or not mode:
        return None

    return _get_logits_processor(guide, tokenizer, mode,
                                 guided_params.whitespace_pattern, reasoner)
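A usage sketch wiring the processor into sampling, assuming the v0 engine's SamplingParams, which accepts a logits_processors list (an assumption about the engine version in use):

from transformers import AutoTokenizer
from vllm import SamplingParams
from vllm.sampling_params import GuidedDecodingParams
from vllm.model_executor.guided_decoding.outlines_decoding import (
    get_local_outlines_guided_decoding_logits_processor)

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer

processor = get_local_outlines_guided_decoding_logits_processor(
    GuidedDecodingParams(regex=r"\d{3}-\d{4}"),
    tokenizer,
    reasoner=None,
)

# Attach to SamplingParams; the helper returns None when no guided
# decoding parameters were set, so guard for that case.
sampling_params = SamplingParams(
    max_tokens=16,
    logits_processors=[processor] if processor is not None else None,
)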

get_outlines_guided_decoding_logits_processor async

get_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams,
    tokenizer: PreTrainedTokenizerBase,
    reasoner: Optional[ReasoningParser],
) -> Union[
    JSONLogitsProcessor,
    RegexLogitsProcessor,
    CFGLogitsProcessor,
    None,
]

Given an OpenAI-compatible request, check for guided decoding parameters and get the necessary logits processor for the given guide. We cache logit processors by (guide, tokenizer), and on cache hit we make a shallow copy to reuse the same underlying FSM.

Source code in vllm/model_executor/guided_decoding/outlines_decoding.py
async def get_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams,
    tokenizer: PreTrainedTokenizerBase,
    reasoner: Optional[ReasoningParser],
) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
           None]:
    """
    Given an OpenAI-compatible request, check for guided decoding parameters
    and get the necessary logits processor for the given guide.
    We cache logit processors by (guide, tokenizer), and on cache hit
    we make a shallow copy to reuse the same underlying FSM.
    """
    global global_thread_pool
    guide, mode = _get_guide_and_mode(guided_params)
    if not guide or not mode:
        return None

    if global_thread_pool is None:
        max_workers = os.cpu_count() or 2
        if max_workers > _MAX_THREADPOOL_WORKERS:
            max_workers = _MAX_THREADPOOL_WORKERS
        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers)
    loop = asyncio.get_running_loop()

    return await loop.run_in_executor(global_thread_pool,
                                      _get_logits_processor, guide, tokenizer,
                                      mode, guided_params.whitespace_pattern,
                                      reasoner)
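An async usage sketch (tokenizer and import paths as in the previous sketches); the coroutine offloads FSM compilation to the shared thread pool so the event loop stays responsive while the guide is built:

import asyncio

from transformers import AutoTokenizer
from vllm.sampling_params import GuidedDecodingParams
from vllm.model_executor.guided_decoding.outlines_decoding import (
    get_outlines_guided_decoding_logits_processor)

async def build_processor():
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
    # json_object=True resolves to (JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR),
    # and the processor is constructed in the module's thread pool.
    return await get_outlines_guided_decoding_logits_processor(
        GuidedDecodingParams(json_object=True), tokenizer, reasoner=None)

processor = asyncio.run(build_processor())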