vllm.v1.metrics.stats

FinishedRequestStats dataclass

Stats associated with a finished request.

Source code in vllm/v1/metrics/stats.py
@dataclass
class FinishedRequestStats:
    """Stats associated with a finished request."""

    finish_reason: "FinishReason"
    e2e_latency: float = 0.0
    num_prompt_tokens: int = 0
    num_generation_tokens: int = 0
    max_tokens_param: Optional[int] = None
    queued_time: float = 0.0
    prefill_time: float = 0.0
    inference_time: float = 0.0
    decode_time: float = 0.0

decode_time class-attribute instance-attribute

decode_time: float = 0.0

e2e_latency class-attribute instance-attribute

e2e_latency: float = 0.0

finish_reason instance-attribute

finish_reason: FinishReason

inference_time class-attribute instance-attribute

inference_time: float = 0.0

max_tokens_param class-attribute instance-attribute

max_tokens_param: Optional[int] = None

num_generation_tokens class-attribute instance-attribute

num_generation_tokens: int = 0

num_prompt_tokens class-attribute instance-attribute

num_prompt_tokens: int = 0

prefill_time class-attribute instance-attribute

prefill_time: float = 0.0

queued_time class-attribute instance-attribute

queued_time: float = 0.0

__init__

__init__(
    finish_reason: FinishReason,
    e2e_latency: float = 0.0,
    num_prompt_tokens: int = 0,
    num_generation_tokens: int = 0,
    max_tokens_param: Optional[int] = None,
    queued_time: float = 0.0,
    prefill_time: float = 0.0,
    inference_time: float = 0.0,
    decode_time: float = 0.0,
) -> None
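
The timing fields describe the lifecycle of a single request. For orientation, a minimal sketch of how they relate, assuming vllm is installed and the module path shown above is importable; the numeric values and the None finish reason are placeholders for illustration:

from vllm.v1.metrics.stats import FinishedRequestStats

# Hypothetical values for one finished request (all durations in seconds).
stats = FinishedRequestStats(
    finish_reason=None,     # placeholder; the engine passes a FinishReason
    e2e_latency=2.5,        # arrival -> finish (frontend wall clock)
    num_prompt_tokens=128,
    num_generation_tokens=64,
    max_tokens_param=256,
    queued_time=0.2,        # first QUEUED -> first SCHEDULED
    prefill_time=0.3,       # first SCHEDULED -> first NEW_TOKEN
    inference_time=1.8,     # first SCHEDULED -> last NEW_TOKEN
    decode_time=1.5,        # first NEW_TOKEN -> last NEW_TOKEN
)

# By construction, inference_time = prefill_time + decode_time.
assert abs(stats.inference_time - (stats.prefill_time + stats.decode_time)) < 1e-9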

IterationStats

Stats associated with a single set of EngineCoreOutputs.

Source code in vllm/v1/metrics/stats.py
class IterationStats:
    """Stats associated with a single set of EngineCoreOutputs."""

    def __init__(self):
        self.iteration_timestamp = time.time()
        self.num_generation_tokens = 0
        self.num_prompt_tokens = 0
        self.num_preempted_reqs = 0
        self.finished_requests: list[FinishedRequestStats] = []
        self.max_num_generation_tokens_iter: list[int] = []
        self.n_params_iter: list[int] = []
        self.time_to_first_tokens_iter: list[float] = []
        self.time_per_output_tokens_iter: list[float] = []
        self.waiting_lora_adapters: dict[str, int] = {}
        self.running_lora_adapters: dict[str, int] = {}

    def _time_since(self, start: float) -> float:
        """Calculate an interval relative to this iteration's timestamp."""
        return self.iteration_timestamp - start

    def update_from_output(self, output: "EngineCoreOutput",
                           engine_core_timestamp: float, is_prefilling: bool,
                           prompt_len: int, req_stats: RequestStateStats,
                           lora_stats: Optional[LoRAStats]):
        num_new_generation_tokens = len(output.new_token_ids)

        self.num_generation_tokens += num_new_generation_tokens
        if is_prefilling:
            self.num_prompt_tokens += prompt_len

            first_token_latency = self._time_since(req_stats.arrival_time)
            self.time_to_first_tokens_iter.append(first_token_latency)

        req_stats.num_generation_tokens += num_new_generation_tokens

        # Process request-level engine core events
        if output.events is not None:
            self.update_from_events(output.request_id, output.events,
                                    is_prefilling, req_stats, lora_stats)

        # Process the batch-level "new tokens" engine core event
        if is_prefilling:
            req_stats.first_token_ts = engine_core_timestamp
        else:
            tpot = engine_core_timestamp - req_stats.last_token_ts
            self.time_per_output_tokens_iter.append(tpot)

        req_stats.last_token_ts = engine_core_timestamp

    def update_from_events(self, req_id: str, events: list["EngineCoreEvent"],
                           is_prefilling: bool, req_stats: RequestStateStats,
                           lora_stats: Optional[LoRAStats]):
        # Avoid circular dependency
        from vllm.v1.engine import EngineCoreEventType
        for event in events:
            if event.type == EngineCoreEventType.QUEUED:
                req_stats.queued_ts = event.timestamp
                if lora_stats is not None:
                    lora_stats.waiting_requests.add(req_id)
            elif event.type == EngineCoreEventType.SCHEDULED:
                if req_stats.scheduled_ts == 0.0:  # ignore preemptions
                    req_stats.scheduled_ts = event.timestamp
                LoRARequestStates.scheduled_request(lora_stats, req_id)
            elif event.type == EngineCoreEventType.PREEMPTED:
                self.num_preempted_reqs += 1
                LoRARequestStates.preempted_request(lora_stats, req_id)

    def update_from_finished_request(self, finish_reason: "FinishReason",
                                     num_prompt_tokens: int,
                                     max_tokens_param: Optional[int],
                                     req_stats: RequestStateStats):
        e2e_latency = self._time_since(req_stats.arrival_time)

        # Queued interval is from first QUEUED event to first SCHEDULED
        queued_time = req_stats.scheduled_ts - req_stats.queued_ts

        # Prefill interval is from first SCHEDULED to first NEW_TOKEN
        # Any preemptions during prefill are included in the interval
        prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts

        # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
        # Any preemptions during decode are included
        decode_time = req_stats.last_token_ts - req_stats.first_token_ts

        # Inference interval is from first SCHEDULED to last NEW_TOKEN
        # Any preemptions during prefill or decode are included
        inference_time = req_stats.last_token_ts - req_stats.scheduled_ts

        finished_req = \
            FinishedRequestStats(finish_reason=finish_reason,
                                 e2e_latency=e2e_latency,
                                 num_prompt_tokens=num_prompt_tokens,
                                 num_generation_tokens=req_stats.num_generation_tokens,
                                 max_tokens_param=max_tokens_param,
                                 queued_time=queued_time,
                                 prefill_time=prefill_time,
                                 inference_time=inference_time,
                                 decode_time=decode_time)
        self.finished_requests.append(finished_req)

finished_requests instance-attribute

finished_requests: list[FinishedRequestStats] = []

iteration_timestamp instance-attribute

iteration_timestamp = time.time()

max_num_generation_tokens_iter instance-attribute

max_num_generation_tokens_iter: list[int] = []

n_params_iter instance-attribute

n_params_iter: list[int] = []

num_generation_tokens instance-attribute

num_generation_tokens = 0

num_preempted_reqs instance-attribute

num_preempted_reqs = 0

num_prompt_tokens instance-attribute

num_prompt_tokens = 0

running_lora_adapters instance-attribute

running_lora_adapters: dict[str, int] = {}

time_per_output_tokens_iter instance-attribute

time_per_output_tokens_iter: list[float] = []

time_to_first_tokens_iter instance-attribute

time_to_first_tokens_iter: list[float] = []

waiting_lora_adapters instance-attribute

waiting_lora_adapters: dict[str, int] = {}

__init__

__init__()
Source code in vllm/v1/metrics/stats.py
def __init__(self):
    self.iteration_timestamp = time.time()
    self.num_generation_tokens = 0
    self.num_prompt_tokens = 0
    self.num_preempted_reqs = 0
    self.finished_requests: list[FinishedRequestStats] = []
    self.max_num_generation_tokens_iter: list[int] = []
    self.n_params_iter: list[int] = []
    self.time_to_first_tokens_iter: list[float] = []
    self.time_per_output_tokens_iter: list[float] = []
    self.waiting_lora_adapters: dict[str, int] = {}
    self.running_lora_adapters: dict[str, int] = {}

_time_since

_time_since(start: float) -> float

Calculate an interval relative to this iteration's timestamp.

Source code in vllm/v1/metrics/stats.py
def _time_since(self, start: float) -> float:
    """Calculate an interval relative to this iteration's timestamp."""
    return self.iteration_timestamp - start

update_from_events

update_from_events(
    req_id: str,
    events: list[EngineCoreEvent],
    is_prefilling: bool,
    req_stats: RequestStateStats,
    lora_stats: Optional[LoRAStats],
)
Source code in vllm/v1/metrics/stats.py
def update_from_events(self, req_id: str, events: list["EngineCoreEvent"],
                       is_prefilling: bool, req_stats: RequestStateStats,
                       lora_stats: Optional[LoRAStats]):
    # Avoid circular dependency
    from vllm.v1.engine import EngineCoreEventType
    for event in events:
        if event.type == EngineCoreEventType.QUEUED:
            req_stats.queued_ts = event.timestamp
            if lora_stats is not None:
                lora_stats.waiting_requests.add(req_id)
        elif event.type == EngineCoreEventType.SCHEDULED:
            if req_stats.scheduled_ts == 0.0:  # ignore preemptions
                req_stats.scheduled_ts = event.timestamp
            LoRARequestStates.scheduled_request(lora_stats, req_id)
        elif event.type == EngineCoreEventType.PREEMPTED:
            self.num_preempted_reqs += 1
            LoRARequestStates.preempted_request(lora_stats, req_id)

update_from_finished_request

update_from_finished_request(
    finish_reason: FinishReason,
    num_prompt_tokens: int,
    max_tokens_param: Optional[int],
    req_stats: RequestStateStats,
)
Source code in vllm/v1/metrics/stats.py
def update_from_finished_request(self, finish_reason: "FinishReason",
                                 num_prompt_tokens: int,
                                 max_tokens_param: Optional[int],
                                 req_stats: RequestStateStats):
    e2e_latency = self._time_since(req_stats.arrival_time)

    # Queued interval is from first QUEUED event to first SCHEDULED
    queued_time = req_stats.scheduled_ts - req_stats.queued_ts

    # Prefill interval is from first SCHEDULED to first NEW_TOKEN
    # Any preemptions during prefill are included in the interval
    prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts

    # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
    # Any preemptions during decode are included
    decode_time = req_stats.last_token_ts - req_stats.first_token_ts

    # Inference interval is from first SCHEDULED to last NEW_TOKEN
    # Any preemptions during prefill or decode are included
    inference_time = req_stats.last_token_ts - req_stats.scheduled_ts

    finished_req = \
        FinishedRequestStats(finish_reason=finish_reason,
                             e2e_latency=e2e_latency,
                             num_prompt_tokens=num_prompt_tokens,
                             num_generation_tokens=req_stats.num_generation_tokens,
                             max_tokens_param=max_tokens_param,
                             queued_time=queued_time,
                             prefill_time=prefill_time,
                             inference_time=inference_time,
                             decode_time=decode_time)
    self.finished_requests.append(finished_req)
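
To make the interval arithmetic concrete, a hedged sketch with invented engine-core timestamps; it assumes vllm is importable, and the None finish reason is a placeholder:

import time

from vllm.v1.metrics.stats import IterationStats, RequestStateStats

req_stats = RequestStateStats(
    arrival_time=time.time(),  # frontend wall-clock arrival
    queued_ts=100.0,           # invented monotonic engine-core timestamps
    scheduled_ts=100.2,
    first_token_ts=100.5,
    last_token_ts=102.0,
)

iteration = IterationStats()
iteration.update_from_finished_request(
    finish_reason=None,        # placeholder for a FinishReason value
    num_prompt_tokens=128,
    max_tokens_param=None,
    req_stats=req_stats,
)

finished = iteration.finished_requests[0]
# queued_time    = 100.2 - 100.0 = 0.2
# prefill_time   = 100.5 - 100.2 = 0.3
# decode_time    = 102.0 - 100.5 = 1.5
# inference_time = 102.0 - 100.2 = 1.8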

update_from_output

update_from_output(
    output: EngineCoreOutput,
    engine_core_timestamp: float,
    is_prefilling: bool,
    prompt_len: int,
    req_stats: RequestStateStats,
    lora_stats: Optional[LoRAStats],
)
Source code in vllm/v1/metrics/stats.py
def update_from_output(self, output: "EngineCoreOutput",
                       engine_core_timestamp: float, is_prefilling: bool,
                       prompt_len: int, req_stats: RequestStateStats,
                       lora_stats: Optional[LoRAStats]):
    num_new_generation_tokens = len(output.new_token_ids)

    self.num_generation_tokens += num_new_generation_tokens
    if is_prefilling:
        self.num_prompt_tokens += prompt_len

        first_token_latency = self._time_since(req_stats.arrival_time)
        self.time_to_first_tokens_iter.append(first_token_latency)

    req_stats.num_generation_tokens += num_new_generation_tokens

    # Process request-level engine core events
    if output.events is not None:
        self.update_from_events(output.request_id, output.events,
                                is_prefilling, req_stats, lora_stats)

    # Process the batch-level "new tokens" engine core event
    if is_prefilling:
        req_stats.first_token_ts = engine_core_timestamp
    else:
        tpot = engine_core_timestamp - req_stats.last_token_ts
        self.time_per_output_tokens_iter.append(tpot)

    req_stats.last_token_ts = engine_core_timestamp
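
On a prefill step this method records a time-to-first-token sample; on later steps it records a time-per-output-token sample. A hedged sketch of how a downstream consumer might summarize those per-iteration lists; the aggregation below is illustrative and not part of vllm:

def summarize(iteration_stats) -> dict[str, float]:
    """Illustrative aggregation of one IterationStats instance."""
    ttfts = iteration_stats.time_to_first_tokens_iter
    tpots = iteration_stats.time_per_output_tokens_iter
    return {
        "mean_ttft_s": sum(ttfts) / len(ttfts) if ttfts else 0.0,
        "mean_tpot_s": sum(tpots) / len(tpots) if tpots else 0.0,
        "prompt_tokens": float(iteration_stats.num_prompt_tokens),
        "generation_tokens": float(iteration_stats.num_generation_tokens),
    }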

LoRARequestStates

Per-LoRA request state stats.

Source code in vllm/v1/metrics/stats.py
class LoRARequestStates:
    """Per-LoRA request state stats."""

    def __init__(self):
        self.lora_name_to_stats: dict[str, LoRAStats] = {}

    def get_stats(self, req_state: 'RequestState') -> Optional[LoRAStats]:
        if req_state.lora_name is None:
            return None
        if req_state.lora_name not in self.lora_name_to_stats:
            self.lora_name_to_stats[req_state.lora_name] = LoRAStats()
        return self.lora_name_to_stats[req_state.lora_name]

    def add_request(self, req_state: 'RequestState'):
        if (lora_stats := self.get_stats(req_state)) is not None:
            lora_stats.waiting_requests.add(req_state.request_id)

    def finish_request(self, req_state: 'RequestState'):
        if req_state.lora_name is None:
            return
        lora_stats = self.lora_name_to_stats[req_state.lora_name]
        lora_stats.running_requests.remove(req_state.request_id)

    def abort_request(self, req_state: 'RequestState'):
        if req_state.lora_name is None:
            return
        lora_stats = self.lora_name_to_stats[req_state.lora_name]
        lora_stats.waiting_requests.discard(req_state.request_id)
        lora_stats.running_requests.discard(req_state.request_id)

    # Break the pattern for these lifecycle methods so we can
    # call this from IterationStats.update_from_events()
    @staticmethod
    def scheduled_request(lora_stats: Optional[LoRAStats], request_id: str):
        if lora_stats is None:
            return
        lora_stats.waiting_requests.remove(request_id)
        lora_stats.running_requests.add(request_id)

    @staticmethod
    def preempted_request(lora_stats: Optional[LoRAStats], request_id: str):
        if lora_stats is None:
            return
        lora_stats.running_requests.remove(request_id)
        lora_stats.waiting_requests.add(request_id)

    def update_iteration_stats(self,
                               iteration_stats: Optional[IterationStats]):
        if iteration_stats is None:
            return
        for lora_name, stats in self.lora_name_to_stats.items():
            if stats.waiting_requests:
                iteration_stats.waiting_lora_adapters[lora_name] = \
                    len(stats.waiting_requests)
            if stats.running_requests:
                iteration_stats.running_lora_adapters[lora_name] = \
                    len(stats.running_requests)

lora_name_to_stats instance-attribute

lora_name_to_stats: dict[str, LoRAStats] = {}

__init__

__init__()
Source code in vllm/v1/metrics/stats.py
def __init__(self):
    self.lora_name_to_stats: dict[str, LoRAStats] = {}

abort_request

abort_request(req_state: RequestState)
Source code in vllm/v1/metrics/stats.py
def abort_request(self, req_state: 'RequestState'):
    if req_state.lora_name is None:
        return
    lora_stats = self.lora_name_to_stats[req_state.lora_name]
    lora_stats.waiting_requests.discard(req_state.request_id)
    lora_stats.running_requests.discard(req_state.request_id)

add_request

add_request(req_state: RequestState)
Source code in vllm/v1/metrics/stats.py
def add_request(self, req_state: 'RequestState'):
    if (lora_stats := self.get_stats(req_state)) is not None:
        lora_stats.waiting_requests.add(req_state.request_id)

finish_request

finish_request(req_state: RequestState)
Source code in vllm/v1/metrics/stats.py
def finish_request(self, req_state: 'RequestState'):
    if req_state.lora_name is None:
        return
    lora_stats = self.lora_name_to_stats[req_state.lora_name]
    lora_stats.running_requests.remove(req_state.request_id)

get_stats

get_stats(req_state: RequestState) -> Optional[LoRAStats]
Source code in vllm/v1/metrics/stats.py
def get_stats(self, req_state: 'RequestState') -> Optional[LoRAStats]:
    if req_state.lora_name is None:
        return None
    if req_state.lora_name not in self.lora_name_to_stats:
        self.lora_name_to_stats[req_state.lora_name] = LoRAStats()
    return self.lora_name_to_stats[req_state.lora_name]

preempted_request staticmethod

preempted_request(
    lora_stats: Optional[LoRAStats], request_id: str
)
Source code in vllm/v1/metrics/stats.py
@staticmethod
def preempted_request(lora_stats: Optional[LoRAStats], request_id: str):
    if lora_stats is None:
        return
    lora_stats.running_requests.remove(request_id)
    lora_stats.waiting_requests.add(request_id)

scheduled_request staticmethod

scheduled_request(
    lora_stats: Optional[LoRAStats], request_id: str
)
Source code in vllm/v1/metrics/stats.py
@staticmethod
def scheduled_request(lora_stats: Optional[LoRAStats], request_id: str):
    if lora_stats is None:
        return
    lora_stats.waiting_requests.remove(request_id)
    lora_stats.running_requests.add(request_id)

update_iteration_stats

update_iteration_stats(
    iteration_stats: Optional[IterationStats],
)
Source code in vllm/v1/metrics/stats.py
def update_iteration_stats(self,
                           iteration_stats: Optional[IterationStats]):
    if iteration_stats is None:
        return
    for lora_name, stats in self.lora_name_to_stats.items():
        if stats.waiting_requests:
            iteration_stats.waiting_lora_adapters[lora_name] = \
                len(stats.waiting_requests)
        if stats.running_requests:
            iteration_stats.running_lora_adapters[lora_name] = \
                len(stats.running_requests)
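
A hedged end-to-end sketch of the adapter bookkeeping, assuming vllm is importable; the adapter name and request id are invented, and the LoRAStats entry is seeded directly rather than via add_request (which normally receives a RequestState from the engine frontend):

from vllm.v1.metrics.stats import IterationStats, LoRARequestStates, LoRAStats

lora_states = LoRARequestStates()

# Seed stats for a hypothetical adapter and walk one request through
# the waiting -> running transition.
stats = lora_states.lora_name_to_stats.setdefault("my-adapter", LoRAStats())
stats.waiting_requests.add("req-0")                   # request queued
LoRARequestStates.scheduled_request(stats, "req-0")   # waiting -> running

iteration = IterationStats()
lora_states.update_iteration_stats(iteration)
print(iteration.running_lora_adapters)                # {'my-adapter': 1}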

LoRAStats dataclass

Source code in vllm/v1/metrics/stats.py
@dataclass
class LoRAStats:
    waiting_requests: set[str] = field(default_factory=set)
    running_requests: set[str] = field(default_factory=set)

running_requests class-attribute instance-attribute

running_requests: set[str] = field(default_factory=set)

waiting_requests class-attribute instance-attribute

waiting_requests: set[str] = field(default_factory=set)

__init__

__init__(
    waiting_requests: set[str] = set(),
    running_requests: set[str] = set(),
) -> None

PrefixCacheStats dataclass

Stores prefix cache hit statistics.

Source code in vllm/v1/metrics/stats.py
@dataclass
class PrefixCacheStats:
    """Stores prefix cache hit statistics."""
    # Whether reset_prefix_cache was invoked.
    reset: bool = False
    # The number of requests in this update.
    requests: int = 0
    # The number of queries in these requests. Note that "queries" here
    # means the number of tokens that were queried from the cache.
    queries: int = 0
    # The number of hits in these requests.
    hits: int = 0

hits class-attribute instance-attribute

hits: int = 0

queries class-attribute instance-attribute

queries: int = 0

requests class-attribute instance-attribute

requests: int = 0

reset class-attribute instance-attribute

reset: bool = False

__init__

__init__(
    reset: bool = False,
    requests: int = 0,
    queries: int = 0,
    hits: int = 0,
) -> None
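
Since queries counts the tokens looked up in the prefix cache and hits counts the tokens that were found, a metrics consumer can derive a hit rate per update. A small sketch with invented numbers, assuming vllm is importable:

from vllm.v1.metrics.stats import PrefixCacheStats

# Hypothetical update: 4 requests queried 4096 prompt tokens against the
# prefix cache, 1024 of which were already cached.
stats = PrefixCacheStats(requests=4, queries=4096, hits=1024)

hit_rate = stats.hits / stats.queries if stats.queries else 0.0
print(f"prefix cache hit rate: {hit_rate:.2%}")   # 25.00%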

RequestStateStats dataclass

Stats that need to be tracked across delta updates.

Source code in vllm/v1/metrics/stats.py
@dataclass
class RequestStateStats:
    """Stats that need to be tracked across delta updates."""

    num_generation_tokens: int = 0

    # This is an engine frontend timestamp (wall-clock)
    arrival_time: float = 0.0

    # These are engine core timestamps (monotonic)
    queued_ts: float = 0.0
    scheduled_ts: float = 0.0
    first_token_ts: float = 0.0
    last_token_ts: float = 0.0

arrival_time class-attribute instance-attribute

arrival_time: float = 0.0

first_token_ts class-attribute instance-attribute

first_token_ts: float = 0.0

last_token_ts class-attribute instance-attribute

last_token_ts: float = 0.0

num_generation_tokens class-attribute instance-attribute

num_generation_tokens: int = 0

queued_ts class-attribute instance-attribute

queued_ts: float = 0.0

scheduled_ts class-attribute instance-attribute

scheduled_ts: float = 0.0

__init__

__init__(
    num_generation_tokens: int = 0,
    arrival_time: float = 0.0,
    queued_ts: float = 0.0,
    scheduled_ts: float = 0.0,
    first_token_ts: float = 0.0,
    last_token_ts: float = 0.0,
) -> None
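
arrival_time comes from the frontend wall clock, while the *_ts fields are filled from engine-core events on a monotonic clock, so the two families of timestamps should not be subtracted from each other. A minimal sketch with invented offsets, assuming vllm is importable:

import time

from vllm.v1.metrics.stats import RequestStateStats

req_stats = RequestStateStats(arrival_time=time.time())

# Engine-core timestamps come from a monotonic clock; the offsets below
# are invented for illustration.
req_stats.queued_ts = time.monotonic()
req_stats.scheduled_ts = req_stats.queued_ts + 0.05
req_stats.first_token_ts = req_stats.scheduled_ts + 0.30
req_stats.last_token_ts = req_stats.first_token_ts + 1.20

# Intervals are only meaningful within the monotonic family of timestamps.
queued_time = req_stats.scheduled_ts - req_stats.queued_ts         # 0.05 s
decode_time = req_stats.last_token_ts - req_stats.first_token_ts   # 1.20 s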

SchedulerStats dataclass

Stats associated with the scheduler.

Source code in vllm/v1/metrics/stats.py
@dataclass
class SchedulerStats:
    """Stats associated with the scheduler."""

    num_running_reqs: int = 0
    num_waiting_reqs: int = 0

    kv_cache_usage: float = 0.0

    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)

    spec_decoding_stats: Optional[SpecDecodingStats] = None

    num_corrupted_reqs: int = 0

kv_cache_usage class-attribute instance-attribute

kv_cache_usage: float = 0.0

num_corrupted_reqs class-attribute instance-attribute

num_corrupted_reqs: int = 0

num_running_reqs class-attribute instance-attribute

num_running_reqs: int = 0

num_waiting_reqs class-attribute instance-attribute

num_waiting_reqs: int = 0

prefix_cache_stats class-attribute instance-attribute

prefix_cache_stats: PrefixCacheStats = field(
    default_factory=PrefixCacheStats
)

spec_decoding_stats class-attribute instance-attribute

spec_decoding_stats: Optional[SpecDecodingStats] = None

__init__

__init__(
    num_running_reqs: int = 0,
    num_waiting_reqs: int = 0,
    kv_cache_usage: float = 0.0,
    prefix_cache_stats: PrefixCacheStats = PrefixCacheStats(),
    spec_decoding_stats: Optional[SpecDecodingStats] = None,
    num_corrupted_reqs: int = 0,
) -> None
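
A hedged sketch of assembling one scheduler snapshot, assuming vllm is importable; the counts are invented, and kv_cache_usage is treated here as a 0..1 fraction of the KV cache in use, which is an assumption about its units:

from vllm.v1.metrics.stats import PrefixCacheStats, SchedulerStats

snapshot = SchedulerStats(
    num_running_reqs=8,
    num_waiting_reqs=2,
    kv_cache_usage=0.65,   # assumed to be a 0..1 fraction
    prefix_cache_stats=PrefixCacheStats(requests=8, queries=8192, hits=2048),
)
print(snapshot.num_running_reqs, snapshot.kv_cache_usage)   # 8 0.65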