Production Metrics

vLLM exposes a number of metrics that can be used to monitor the health of the system. They are served in Prometheus text format via the /metrics endpoint of the vLLM OpenAI-compatible API server.
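
For example, the endpoint can be scraped and inspected directly. The sketch below is illustrative, not part of vLLM itself: it assumes a server is already running and reachable at http://localhost:8000, and that the `requests` and `prometheus_client` packages are installed.

```python
# Minimal sketch: fetch and parse vLLM's Prometheus metrics.
# The URL is an assumption (default local server); adjust as needed.
import requests
from prometheus_client.parser import text_string_to_metric_families

resp = requests.get("http://localhost:8000/metrics", timeout=5)
resp.raise_for_status()

for family in text_string_to_metric_families(resp.text):
    # Keep only the vLLM-specific metric families described below.
    if family.name.startswith("vllm"):
        for sample in family.samples:
            print(sample.name, sample.labels, sample.value)
```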

The following metrics are exposed:

```python
from typing import List

from prometheus_client import REGISTRY, Counter, Gauge, Histogram, Info


class Metrics:
    labelname_finish_reason = "finished_reason"

    def __init__(self, labelnames: List[str], max_model_len: int):
        # Unregister any existing vLLM collectors
        for collector in list(REGISTRY._collector_to_names):
            if hasattr(collector, "_name") and "vllm" in collector._name:
                REGISTRY.unregister(collector)

        # Config Information
        self.info_cache_config = Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

        # System stats
        #   Scheduler State
        self.gauge_scheduler_running = Gauge(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
        self.gauge_scheduler_waiting = Gauge(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
        self.gauge_scheduler_swapped = Gauge(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
        #   KV Cache Usage in %
        self.gauge_gpu_cache_usage = Gauge(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
        self.gauge_cpu_cache_usage = Gauge(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)

        # Iteration stats
        self.counter_num_preemption = Counter(
            name="vllm:num_preemptions_total",
            documentation="Cumulative number of preemption from the engine.",
            labelnames=labelnames)
        self.counter_prompt_tokens = Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames)
        self.counter_generation_tokens = Counter(
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames)
        self.histogram_time_to_first_token = Histogram(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
            ])
        self.histogram_time_per_output_token = Histogram(
            name="vllm:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                1.0, 2.5
            ])

        # Request stats
        #   Latency
        self.histogram_e2e_time_request = Histogram(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of end to end request latency in seconds.",
            labelnames=labelnames,
            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
        #   Metadata
        self.histogram_num_prompt_tokens_request = Histogram(
            name="vllm:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_num_generation_tokens_request = Histogram(
            name="vllm:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames,
            buckets=build_1_2_5_buckets(max_model_len),
        )
        self.histogram_best_of_request = Histogram(
            name="vllm:request_params_best_of",
            documentation="Histogram of the best_of request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.histogram_n_request = Histogram(
            name="vllm:request_params_n",
            documentation="Histogram of the n request parameter.",
            labelnames=labelnames,
            buckets=[1, 2, 5, 10, 20],
        )
        self.counter_request_success = Counter(
            name="vllm:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + [Metrics.labelname_finish_reason])

        # Deprecated in favor of vllm:prompt_tokens_total
        self.gauge_avg_prompt_throughput = Gauge(
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
        )
        # Deprecated in favor of vllm:generation_tokens_total
        self.gauge_avg_generation_throughput = Gauge(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
        )
```
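
The prompt- and generation-length histograms take their buckets from build_1_2_5_buckets(max_model_len), a helper defined alongside this class. The following is only a sketch of what such a helper can look like (a 1-2-5 progression capped at the model's maximum length), not necessarily the exact vLLM implementation:

```python
from typing import List


def build_1_2_5_buckets(max_value: int) -> List[int]:
    """Illustrative sketch: return [1, 2, 5, 10, 20, 50, ...] up to max_value."""
    mantissas = [1, 2, 5]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1
```

With max_model_len = 4096, for example, this sketch yields the buckets 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000.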