These types are defined in this file to avoid importing vllm.engine.metrics
and therefore importing prometheus_client.
This is required due to usage of Prometheus multiprocess mode to enable
metrics after splitting out the uvicorn process from the engine process.
Prometheus multiprocess mode requires setting PROMETHEUS_MULTIPROC_DIR
before prometheus_client is imported. Typically, this is done by setting
the env variable before launch, but since we are a library, we need to
do this in Python code and lazily import prometheus_client.
StatLoggerBase
Bases: ABC
Base class for StatLogger.
Source code in vllm/engine/metrics_types.py
| class StatLoggerBase(ABC):
      """Base class for StatLogger."""

      def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
          # Tracked stats over current local logging interval.
          self.num_prompt_tokens: List[int] = []
          self.num_generation_tokens: List[int] = []
          self.last_local_log = time.time()
          self.local_interval = local_interval
          self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None

      @abstractmethod
      def log(self, stats: Stats) -> None:
          raise NotImplementedError

      @abstractmethod
      def info(self, type: str, obj: SupportsMetricsInfo) -> None:
          raise NotImplementedError

      def maybe_update_spec_decode_metrics(self, stats: Stats):
          """Save spec decode metrics (since they are unlikely
          to be emitted at same time as log interval)."""
          if stats.spec_decode_metrics is not None:
              self.spec_decode_metrics = stats.spec_decode_metrics
|
last_local_log
instance-attribute
last_local_log = time.time()
local_interval
instance-attribute
local_interval = local_interval
num_generation_tokens
instance-attribute
num_generation_tokens: List[int] = []
num_prompt_tokens
instance-attribute
num_prompt_tokens: List[int] = []
spec_decode_metrics
instance-attribute
spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
__init__
Source code in vllm/engine/metrics_types.py
| def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
      # Tracked stats over current local logging interval.
      self.num_prompt_tokens: List[int] = []
      self.num_generation_tokens: List[int] = []
      self.last_local_log = time.time()
      self.local_interval = local_interval
      self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
|
info
abstractmethod
Source code in vllm/engine/metrics_types.py
| @abstractmethod
  def info(self, type: str, obj: SupportsMetricsInfo) -> None:
      raise NotImplementedError
|
log
abstractmethod
log(stats: Stats) -> None
Source code in vllm/engine/metrics_types.py
| @abstractmethod
  def log(self, stats: Stats) -> None:
      raise NotImplementedError
|
maybe_update_spec_decode_metrics
maybe_update_spec_decode_metrics(stats: Stats)
Save spec decode metrics (since they are unlikely
to be emitted at same time as log interval).
Source code in vllm/engine/metrics_types.py
| def maybe_update_spec_decode_metrics(self, stats: Stats):
      """Save spec decode metrics (since they are unlikely
      to be emitted at same time as log interval)."""
      if stats.spec_decode_metrics is not None:
          self.spec_decode_metrics = stats.spec_decode_metrics
|
Stats
dataclass
Created by LLMEngine for use by StatLogger.
Source code in vllm/engine/metrics_types.py
| @dataclass
  class Stats:
      """Created by LLMEngine for use by StatLogger."""

      now: float

      # System stats (should have _sys suffix)
      # Scheduler State
      num_running_sys: int
      num_waiting_sys: int
      num_swapped_sys: int
      # KV Cache Usage in %
      gpu_cache_usage_sys: float
      cpu_cache_usage_sys: float
      # Prefix caching block hit rate
      cpu_prefix_cache_hit_rate: float
      gpu_prefix_cache_hit_rate: float

      # Iteration stats (should have _iter suffix)
      num_prompt_tokens_iter: int
      num_generation_tokens_iter: int
      num_tokens_iter: int
      time_to_first_tokens_iter: List[float]
      time_per_output_tokens_iter: List[float]
      num_preemption_iter: int

      # Request stats (should have _requests suffix)
      # Latency
      time_e2e_requests: List[float]
      time_queue_requests: List[float]
      time_inference_requests: List[float]
      time_prefill_requests: List[float]
      time_decode_requests: List[float]
      # Metadata
      num_prompt_tokens_requests: List[int]
      num_generation_tokens_requests: List[int]
      n_requests: List[int]
      max_num_generation_tokens_requests: List[int]
      max_tokens_requests: List[int]
      finished_reason_requests: List[str]
      waiting_lora_adapters: List[str]
      running_lora_adapters: List[str]
      max_lora: str

      spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
|
cpu_cache_usage_sys
instance-attribute
cpu_cache_usage_sys: float
cpu_prefix_cache_hit_rate
instance-attribute
cpu_prefix_cache_hit_rate: float
finished_reason_requests
instance-attribute
finished_reason_requests: List[str]
gpu_cache_usage_sys
instance-attribute
gpu_cache_usage_sys: float
gpu_prefix_cache_hit_rate
instance-attribute
gpu_prefix_cache_hit_rate: float
max_lora
instance-attribute
max_lora: str
max_num_generation_tokens_requests
instance-attribute
max_num_generation_tokens_requests: List[int]
max_tokens_requests
instance-attribute
max_tokens_requests: List[int]
n_requests
instance-attribute
n_requests: List[int]
num_generation_tokens_iter
instance-attribute
num_generation_tokens_iter: int
num_generation_tokens_requests
instance-attribute
num_generation_tokens_requests: List[int]
num_preemption_iter
instance-attribute
num_preemption_iter: int
num_prompt_tokens_iter
instance-attribute
num_prompt_tokens_iter: int
num_prompt_tokens_requests
instance-attribute
num_prompt_tokens_requests: List[int]
num_running_sys
instance-attribute
num_running_sys: int
num_swapped_sys
instance-attribute
num_swapped_sys: int
num_tokens_iter
instance-attribute
num_tokens_iter: int
num_waiting_sys
instance-attribute
num_waiting_sys: int
running_lora_adapters
instance-attribute
running_lora_adapters: List[str]
spec_decode_metrics
class-attribute
instance-attribute
spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
time_decode_requests
instance-attribute
time_decode_requests: List[float]
time_e2e_requests
instance-attribute
time_e2e_requests: List[float]
time_inference_requests
instance-attribute
time_inference_requests: List[float]
time_per_output_tokens_iter
instance-attribute
time_per_output_tokens_iter: List[float]
time_prefill_requests
instance-attribute
time_prefill_requests: List[float]
time_queue_requests
instance-attribute
time_queue_requests: List[float]
time_to_first_tokens_iter
instance-attribute
time_to_first_tokens_iter: List[float]
waiting_lora_adapters
instance-attribute
waiting_lora_adapters: List[str]
__init__
__init__(
now: float,
num_running_sys: int,
num_waiting_sys: int,
num_swapped_sys: int,
gpu_cache_usage_sys: float,
cpu_cache_usage_sys: float,
cpu_prefix_cache_hit_rate: float,
gpu_prefix_cache_hit_rate: float,
num_prompt_tokens_iter: int,
num_generation_tokens_iter: int,
num_tokens_iter: int,
time_to_first_tokens_iter: List[float],
time_per_output_tokens_iter: List[float],
num_preemption_iter: int,
time_e2e_requests: List[float],
time_queue_requests: List[float],
time_inference_requests: List[float],
time_prefill_requests: List[float],
time_decode_requests: List[float],
num_prompt_tokens_requests: List[int],
num_generation_tokens_requests: List[int],
n_requests: List[int],
max_num_generation_tokens_requests: List[int],
max_tokens_requests: List[int],
finished_reason_requests: List[str],
waiting_lora_adapters: List[str],
running_lora_adapters: List[str],
max_lora: str,
spec_decode_metrics: Optional[
SpecDecodeWorkerMetrics
] = None,
) -> None