Production Metrics

vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI-compatible API server.

The following metrics are exposed:

  class Metrics:
      """Prometheus collectors published by vLLM.

      Instantiating this class first unregisters any collector already
      present in the default ``prometheus_client`` registry whose name
      contains ``"vllm"``, then creates the full set of gauges, counters
      and histograms below.

      The collector constructors are held as class attributes and always
      reached through ``self`` (``self._gauge_cls`` etc.).
      """

      # Extra label added to ``vllm:request_success_total`` on top of the
      # caller-supplied label names.
      labelname_finish_reason = "finished_reason"

      _gauge_cls = prometheus_client.Gauge
      _counter_cls = prometheus_client.Counter
      _histogram_cls = prometheus_client.Histogram

      def __init__(self, labelnames: List[str], max_model_len: int):
          """Create every vLLM metric collector.

          Args:
              labelnames: Label names attached to every metric created here.
              max_model_len: Passed to ``build_1_2_5_buckets`` to derive the
                  buckets of the per-request token-count histograms.
          """
          # Unregister any existing vLLM collectors
          self._unregister_vllm_metrics()

          # Config Information
          self._create_info_cache_config()

          # System stats
          #   Scheduler State
          self.gauge_scheduler_running = self._gauge_cls(
              name="vllm:num_requests_running",
              documentation="Number of requests currently running on GPU.",
              labelnames=labelnames)
          self.gauge_scheduler_waiting = self._gauge_cls(
              name="vllm:num_requests_waiting",
              documentation="Number of requests waiting to be processed.",
              labelnames=labelnames)
          self.gauge_scheduler_swapped = self._gauge_cls(
              name="vllm:num_requests_swapped",
              documentation="Number of requests swapped to CPU.",
              labelnames=labelnames)
          #   KV Cache Usage in %
          self.gauge_gpu_cache_usage = self._gauge_cls(
              name="vllm:gpu_cache_usage_perc",
              documentation="GPU KV-cache usage. 1 means 100 percent usage.",
              labelnames=labelnames)
          self.gauge_cpu_cache_usage = self._gauge_cls(
              name="vllm:cpu_cache_usage_perc",
              documentation="CPU KV-cache usage. 1 means 100 percent usage.",
              labelnames=labelnames)

          # Iteration stats
          self.counter_num_preemption = self._counter_cls(
              name="vllm:num_preemptions_total",
              documentation="Cumulative number of preemption from the engine.",
              labelnames=labelnames)
          self.counter_prompt_tokens = self._counter_cls(
              name="vllm:prompt_tokens_total",
              documentation="Number of prefill tokens processed.",
              labelnames=labelnames)
          self.counter_generation_tokens = self._counter_cls(
              name="vllm:generation_tokens_total",
              documentation="Number of generation tokens processed.",
              labelnames=labelnames)
          self.histogram_time_to_first_token = self._histogram_cls(
              name="vllm:time_to_first_token_seconds",
              documentation="Histogram of time to first token in seconds.",
              labelnames=labelnames,
              buckets=[
                  0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                  0.75, 1.0, 2.5, 5.0, 7.5, 10.0
              ])
          self.histogram_time_per_output_token = self._histogram_cls(
              name="vllm:time_per_output_token_seconds",
              documentation="Histogram of time per output token in seconds.",
              labelnames=labelnames,
              buckets=[
                  0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                  1.0, 2.5
              ])

          # Request stats
          #   Latency
          self.histogram_e2e_time_request = self._histogram_cls(
              name="vllm:e2e_request_latency_seconds",
              documentation="Histogram of end to end request latency in seconds.",
              labelnames=labelnames,
              buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
          #   Metadata
          self.histogram_num_prompt_tokens_request = self._histogram_cls(
              name="vllm:request_prompt_tokens",
              documentation="Number of prefill tokens processed.",
              labelnames=labelnames,
              buckets=build_1_2_5_buckets(max_model_len),
          )
          self.histogram_num_generation_tokens_request = self._histogram_cls(
              name="vllm:request_generation_tokens",
              documentation="Number of generation tokens processed.",
              labelnames=labelnames,
              buckets=build_1_2_5_buckets(max_model_len),
          )
          self.histogram_best_of_request = self._histogram_cls(
              name="vllm:request_params_best_of",
              documentation="Histogram of the best_of request parameter.",
              labelnames=labelnames,
              buckets=[1, 2, 5, 10, 20],
          )
          self.histogram_n_request = self._histogram_cls(
              name="vllm:request_params_n",
              documentation="Histogram of the n request parameter.",
              labelnames=labelnames,
              buckets=[1, 2, 5, 10, 20],
          )
          # This counter carries one label more than the other metrics: the
          # finish reason.
          self.counter_request_success = self._counter_cls(
              name="vllm:request_success_total",
              documentation="Count of successfully processed requests.",
              labelnames=labelnames + [Metrics.labelname_finish_reason])

          # Speculative decoding stats
          self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
              name="vllm:spec_decode_draft_acceptance_rate",
              documentation="Speculative token acceptance rate.",
              labelnames=labelnames)
          self.gauge_spec_decode_efficiency = self._gauge_cls(
              name="vllm:spec_decode_efficiency",
              documentation="Speculative decoding system efficiency.",
              labelnames=labelnames)
          self.counter_spec_decode_num_accepted_tokens = self._counter_cls(
              name="vllm:spec_decode_num_accepted_tokens_total",
              documentation="Number of accepted tokens.",
              labelnames=labelnames)
          self.counter_spec_decode_num_draft_tokens = self._counter_cls(
              name="vllm:spec_decode_num_draft_tokens_total",
              documentation="Number of draft tokens.",
              labelnames=labelnames)
          self.counter_spec_decode_num_emitted_tokens = self._counter_cls(
              name="vllm:spec_decode_num_emitted_tokens_total",
              documentation="Number of emitted tokens.",
              labelnames=labelnames)

          # Deprecated in favor of vllm:prompt_tokens_total
          self.gauge_avg_prompt_throughput = self._gauge_cls(
              name="vllm:avg_prompt_throughput_toks_per_s",
              documentation="Average prefill throughput in tokens/s.",
              labelnames=labelnames,
          )
          # Deprecated in favor of vllm:generation_tokens_total
          self.gauge_avg_generation_throughput = self._gauge_cls(
              name="vllm:avg_generation_throughput_toks_per_s",
              documentation="Average generation throughput in tokens/s.",
              labelnames=labelnames,
          )

      def _create_info_cache_config(self) -> None:
          """Create the ``vllm:cache_config`` Info collector."""
          # Config Information
          self.info_cache_config = prometheus_client.Info(
              name='vllm:cache_config',
              documentation='information of cache_config')

      def _unregister_vllm_metrics(self) -> None:
          """Unregister every collector whose ``_name`` contains ``"vllm"``.

          NOTE(review): this reads the registry's private
          ``_collector_to_names`` mapping, so it depends on prometheus_client
          internals.
          """
          # Snapshot the collectors first: unregister() is called while we
          # walk them, so iterate a copy rather than the live mapping.
          for collector in list(prometheus_client.REGISTRY._collector_to_names):
              if hasattr(collector, "_name") and "vllm" in collector._name:
                  prometheus_client.REGISTRY.unregister(collector)