Production Metrics
vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the /metrics endpoint on the vLLM OpenAI-compatible API server.
The following metrics are exposed:
class Metrics:
    """Holds the Prometheus collectors exported under the ``vllm:`` prefix.

    Constructing an instance first unregisters any previously registered
    vLLM collectors and then creates one gauge, counter or histogram per
    metric, each stored as an attribute on the instance.
    """

    # Name of the extra label added to ``vllm:request_success_total``.
    # NOTE(review): the emitted label is "finished_reason" although the
    # attribute is called "finish_reason"; left untouched here because
    # renaming it would change the exported label.
    labelname_finish_reason = "finished_reason"

    # Collector classes are looked up through these class attributes rather
    # than referenced directly -- presumably so that a subclass can swap in
    # alternative implementations; TODO confirm against subclasses.
    _gauge_cls = prometheus_client.Gauge
    _counter_cls = prometheus_client.Counter
    _histogram_cls = prometheus_client.Histogram

    def __init__(self, labelnames: List[str], max_model_len: int):
        """Create every vLLM metric collector.

        Args:
            labelnames: Label names attached to every metric created here.
            max_model_len: Forwarded to ``build_1_2_5_buckets`` to derive the
                bucket boundaries of the per-request token-count histograms.
        """
        # Unregister any existing vLLM collectors
        self._unregister_vllm_metrics()

        # Config Information
        self._create_info_cache_config()

        # System stats
        #   Scheduler State
        self.gauge_scheduler_running = self._gauge_cls(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
        self.gauge_scheduler_waiting = self._gauge_cls(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
        self.gauge_scheduler_swapped = self._gauge_cls(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
        #   KV Cache Usage (reported as a fraction: 1 means 100 percent)
        self.gauge_gpu_cache_usage = self._gauge_cls(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
        self.gauge_cpu_cache_usage = self._gauge_cls(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)

        # Iteration stats
        self.counter_num_preemption = self._counter_cls(
            name="vllm:num_preemptions_total",
            documentation="Cumulative number of preemption from the engine.",
            labelnames=labelnames)
        self.counter_prompt_tokens = self._counter_cls(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames)
        self.counter_generation_tokens = self._counter_cls(
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames)
        self.histogram_time_to_first_token = self._histogram_cls(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
            ])
        self.histogram_time_per_output_token = self._histogram_cls(
            name="vllm:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                1.0, 2.5
            ])

        # Request stats
        #   Latency
        self.histogram_e2e_time_request = self._histogram_cls(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of end to end request latency in seconds.",
            labelnames=labelnames,
            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
        #   Metadata
        self.histogram_num_prompt_tokens_request = self._histogram_cls(
            name="vllm:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_num_generation_tokens_request = self._histogram_cls(
            name="vllm:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_best_of_request = self._histogram_cls(
            name="vllm:request_params_best_of",
            documentation="Histogram of the best_of request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.histogram_n_request = self._histogram_cls(
            name="vllm:request_params_n",
            documentation="Histogram of the n request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        # This counter carries one extra label (the finish reason) on top of
        # the common label names.
        self.counter_request_success = self._counter_cls(
            name="vllm:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + [Metrics.labelname_finish_reason])

        # Speculative decoding stats
        self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
            name="vllm:spec_decode_draft_acceptance_rate",
            documentation="Speculative token acceptance rate.",
            labelnames=labelnames)
        self.gauge_spec_decode_efficiency = self._gauge_cls(
            name="vllm:spec_decode_efficiency",
            documentation="Speculative decoding system efficiency.",
            labelnames=labelnames)
        self.counter_spec_decode_num_accepted_tokens = self._counter_cls(
            name="vllm:spec_decode_num_accepted_tokens_total",
            documentation="Number of accepted tokens.",
            labelnames=labelnames)
        self.counter_spec_decode_num_draft_tokens = self._counter_cls(
            name="vllm:spec_decode_num_draft_tokens_total",
            documentation="Number of draft tokens.",
            labelnames=labelnames)
        self.counter_spec_decode_num_emitted_tokens = self._counter_cls(
            name="vllm:spec_decode_num_emitted_tokens_total",
            documentation="Number of emitted tokens.",
            labelnames=labelnames)

        # Deprecated in favor of vllm:prompt_tokens_total
        self.gauge_avg_prompt_throughput = self._gauge_cls(
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
        )
        # Deprecated in favor of vllm:generation_tokens_total
        self.gauge_avg_generation_throughput = self._gauge_cls(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
        )

    def _create_info_cache_config(self) -> None:
        """Create the ``vllm:cache_config`` Info collector.

        Unlike the other collectors this one is always a
        ``prometheus_client.Info`` and takes no label names.
        """
        # Config Information
        self.info_cache_config = prometheus_client.Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

    def _unregister_vllm_metrics(self) -> None:
        """Remove every collector whose ``_name`` contains "vllm".

        Operates on the global ``prometheus_client.REGISTRY``. Called at the
        start of ``__init__`` -- presumably so that building ``Metrics`` more
        than once in a process does not collide with collectors registered by
        an earlier instance.
        """
        # Iterate over a snapshot: unregistering changes the registry's
        # collector mapping while we loop.
        for collector in list(prometheus_client.REGISTRY._collector_to_names):
            # Not every collector has a ``_name`` attribute, so those are
            # skipped by the hasattr check.
            if hasattr(collector, "_name") and "vllm" in collector._name:
                prometheus_client.REGISTRY.unregister(collector)