Production Metrics

vLLM exposes a number of Prometheus metrics that can be used to monitor the health of the system. They are served in the Prometheus text exposition format via the /metrics endpoint of the vLLM OpenAI-compatible API server.
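For example, with the server running locally on its default port 8000 (e.g. started with python -m vllm.entrypoints.openai.api_server --model <model>), the endpoint can be inspected directly. The snippet below is a minimal sketch, assuming that host and port; it fetches the page and prints only the vLLM-specific series:

import requests

# Assumed address of a locally running vLLM OpenAI-compatible server;
# adjust the host and port to match your deployment.
METRICS_URL = "http://localhost:8000/metrics"

response = requests.get(METRICS_URL, timeout=5)
response.raise_for_status()

# The response body is plain Prometheus text exposition format; keep only
# the vLLM-specific series for brevity.
for line in response.text.splitlines():
    if line.startswith("vllm:"):
        print(line)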

The following metrics are exposed, as defined in vLLM's Metrics class:

from typing import List

from prometheus_client import REGISTRY, Counter, Gauge, Histogram, Info


class Metrics:

    def __init__(self, labelnames: List[str]):
        # Unregister any existing vLLM collectors so the metrics are not
        # registered twice if the engine is re-created in the same process.
        for collector in list(REGISTRY._collector_to_names):
            if hasattr(collector, "_name") and "vllm" in collector._name:
                REGISTRY.unregister(collector)

        # Config Information
        self.info_cache_config = Info(
            name='vllm:cache_config',
            documentation='information of cache_config')

        # System stats
        self.gauge_scheduler_running = Gauge(
            name="vllm:num_requests_running",
            documentation="Number of requests currently running on GPU.",
            labelnames=labelnames)
        self.gauge_scheduler_swapped = Gauge(
            name="vllm:num_requests_swapped",
            documentation="Number of requests swapped to CPU.",
            labelnames=labelnames)
        self.gauge_scheduler_waiting = Gauge(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames)
        self.gauge_gpu_cache_usage = Gauge(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)
        self.gauge_cpu_cache_usage = Gauge(
            name="vllm:cpu_cache_usage_perc",
            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames)

        # Raw stats from last model iteration
        self.counter_prompt_tokens = Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames)
        self.counter_generation_tokens = Counter(
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames)
        self.histogram_time_to_first_token = Histogram(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
            ])
        self.histogram_time_per_output_token = Histogram(
            name="vllm:time_per_output_token_seconds",
            documentation="Histogram of time per output token in seconds.",
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                1.0, 2.5
            ])
        self.histogram_e2e_request_latency = Histogram(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of end to end request latency in seconds.",
            labelnames=labelnames,
            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])

        # Legacy metrics
        self.gauge_avg_prompt_throughput = Gauge(
            name="vllm:avg_prompt_throughput_toks_per_s",
            documentation="Average prefill throughput in tokens/s.",
            labelnames=labelnames,
        )
        self.gauge_avg_generation_throughput = Gauge(
            name="vllm:avg_generation_throughput_toks_per_s",
            documentation="Average generation throughput in tokens/s.",
            labelnames=labelnames,
        )
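In a production setup these series would normally be scraped by Prometheus and aggregated in Grafana. As a rough sketch of how the histogram metrics can be consumed by hand, the example below (the URL is an assumption about your deployment, not part of vLLM) parses the endpoint with the prometheus_client parser and derives the mean time to first token from the histogram's _sum and _count samples:

import requests
from prometheus_client.parser import text_string_to_metric_families

# Assumed address of a locally running vLLM OpenAI-compatible server.
text = requests.get("http://localhost:8000/metrics", timeout=5).text

ttft_sum = 0.0
ttft_count = 0.0
for family in text_string_to_metric_families(text):
    if family.name != "vllm:time_to_first_token_seconds":
        continue
    for sample in family.samples:
        # A histogram family exposes per-bucket samples plus _sum and _count;
        # the ratio of the latter two gives the mean since server start.
        if sample.name.endswith("_sum"):
            ttft_sum += sample.value
        elif sample.name.endswith("_count"):
            ttft_count += sample.value

if ttft_count:
    print(f"Mean time to first token since startup: {ttft_sum / ttft_count:.3f}s")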