Environment Variables

vLLM uses the following environment variables to configure the system:

```python
environment_variables: Dict[str, Callable[[], Any]] = {

    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),

    # Number of threads to use for nvcc
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS":
    lambda: os.getenv("NVCC_THREADS", None),

    # If set, vllm will build with Neuron support
    "VLLM_BUILD_WITH_NEURON":
    lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),

    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),

    # If set, vllm will install Punica kernels
    "VLLM_INSTALL_PUNICA_KERNELS":
    lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),

    # CMake build type
    # If not set, defaults to "Debug" or "RelWithDebInfo"
    # Available options: "Debug", "Release", "RelWithDebInfo"
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),

    # If set, vllm will print verbose logs during installation
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),

    # Root directory for VLLM configuration files
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),

    # ================== Runtime Env Vars ==================

    # used in distributed environment to determine the master address
    'VLLM_HOST_IP':
    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),

    # used in distributed environment to manually set the communication port
    # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
    # VLLM_PORT will be used as the first port, and the rest will be generated
    # by incrementing the VLLM_PORT value.
    # '0' is used to make mypy happy
    'VLLM_PORT':
    lambda: int(os.getenv('VLLM_PORT', '0'))
    if 'VLLM_PORT' in os.environ else None,

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # note that the value is true or false, not numbers
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # Instance id represents an instance of the VLLM. All processes in the same
    # instance should have the same instance id.
    "VLLM_INSTANCE_ID":
    lambda: os.environ.get("VLLM_INSTANCE_ID", None),

    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for VLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # S3 access information, used for tensorizer to load model from S3
    "S3_ACCESS_KEY_ID":
    lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # this is used for configuring the default logging level
    "VLLM_LOGGING_LEVEL":
    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # CPU key-value cache space
    # default is 4GB
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

    # If the env var is set, it uses the Ray's compiled DAG API
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),

    # Timeout for fetching images when serving multimodal models
    # Default is 5 seconds
    "VLLM_IMAGE_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
}
```
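
Each entry maps a variable name to a zero-argument callable, so the value is read from the environment lazily, at access time, rather than once at import time. A minimal, self-contained sketch of that pattern, using two of the entries above:

```python
import os
from typing import Any, Callable, Dict

# Minimal sketch of the deferred-lookup pattern used above: nothing is read
# from the environment until the callable is invoked.
environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_LOGGING_LEVEL":
    lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
}

# Setting the variable before the lookup takes effect immediately.
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
print(environment_variables["VLLM_LOGGING_LEVEL"]())            # -> "DEBUG"
print(environment_variables["VLLM_ENGINE_ITERATION_TIMEOUT_S"]())  # -> 60
```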
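
The parsing rules are not uniform across variables. `VLLM_USE_MODELSCOPE` is compared against the literal string `"true"`, so numeric values are ignored, whereas flags such as `VLLM_INSTALL_PUNICA_KERNELS` and `VERBOSE` go through `bool(int(...))` and expect `"0"` or `"1"`. The difference, shown directly with the expressions used above:

```python
import os

# VLLM_USE_MODELSCOPE expects the words "true"/"false", not numbers.
os.environ["VLLM_USE_MODELSCOPE"] = "1"
print(os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true")  # False
os.environ["VLLM_USE_MODELSCOPE"] = "true"
print(os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true")  # True

# VLLM_INSTALL_PUNICA_KERNELS expects an integer-like value such as "0" or "1".
os.environ["VLLM_INSTALL_PUNICA_KERNELS"] = "1"
print(bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))))  # True
```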
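
Several entries fall back through a chain of sources with `or`: `VLLM_CONFIG_ROOT` falls back to `XDG_CONFIG_HOME` and then `~/.config`, `VLLM_HOST_IP` falls back to `HOST_IP`, and `VLLM_DO_NOT_TRACK` falls back to `DO_NOT_TRACK`. Because the chain relies on Python's `or`, an empty string behaves the same as an unset variable:

```python
import os

# Fallback chain as written in the VLLM_HOST_IP entry: an empty VLLM_HOST_IP
# falls through to HOST_IP because "" is falsy under `or`.
os.environ["VLLM_HOST_IP"] = ""
os.environ["HOST_IP"] = "10.0.0.7"
print(os.getenv("VLLM_HOST_IP", "") or os.getenv("HOST_IP", ""))  # -> 10.0.0.7
```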
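
As the `VLLM_PORT` comment notes, when the variable is set and more than one port is needed, the configured value is used for the first port and the remaining ports are derived by incrementing it. The helper below is purely hypothetical (it is not part of vLLM) and only illustrates that incrementing behaviour:

```python
import os
from typing import List, Optional


def allocate_ports(n: int) -> List[Optional[int]]:
    """Hypothetical illustration of the VLLM_PORT comment: the first port is
    VLLM_PORT and the rest are obtained by incrementing it. Returns Nones when
    VLLM_PORT is unset, standing in for whatever fallback vLLM actually uses."""
    base = int(os.environ["VLLM_PORT"]) if "VLLM_PORT" in os.environ else None
    if base is None:
        return [None] * n
    return [base + i for i in range(n)]


os.environ["VLLM_PORT"] = "5570"
print(allocate_ports(3))  # -> [5570, 5571, 5572]
```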