Environment Variables

vLLM uses the following environment variables to configure the system:

Warning

Please note that VLLM_PORT and VLLM_HOST_IP set the port and IP address for vLLM’s internal usage. They are not the port and IP address of the API server. If you start the API server with --host $VLLM_HOST_IP and --port $VLLM_PORT, it will not work.

All environment variables used by vLLM are prefixed with VLLM_. Kubernetes users should take special care: do not name the service vllm. Kubernetes sets environment variables for each service, using the upper-cased service name as the prefix, so a service named vllm produces variables (for example, VLLM_PORT) that can conflict with vLLM’s own environment variables.

  # Registry of every environment variable vLLM reads.
  #
  # Each key is the variable name and each value is a zero-argument callable
  # that reads the process environment *when it is called* and returns the
  # parsed value, so changes made to `os.environ` after import are picked up
  # on the next lookup.
  environment_variables: Dict[str, Callable[[], Any]] = {
      # ================== Installation Time Env Vars ==================

      # Target device of vLLM, supporting [cuda (by default),
      # rocm, neuron, cpu, openvino]
      "VLLM_TARGET_DEVICE":
      lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

      # Maximum number of compilation jobs to run in parallel.
      # By default this is the number of CPUs
      "MAX_JOBS":
      lambda: os.getenv("MAX_JOBS", None),

      # Number of threads to use for nvcc
      # By default this is 1.
      # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
      "NVCC_THREADS":
      lambda: os.getenv("NVCC_THREADS", None),

      # If set (to any non-empty value), vllm will use precompiled
      # binaries (*.so)
      "VLLM_USE_PRECOMPILED":
      lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),

      # If set, vllm will install Punica kernels
      "VLLM_INSTALL_PUNICA_KERNELS":
      lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),

      # CMake build type
      # If not set, defaults to "Debug" or "RelWithDebInfo"
      # Available options: "Debug", "Release", "RelWithDebInfo"
      "CMAKE_BUILD_TYPE":
      lambda: os.getenv("CMAKE_BUILD_TYPE"),

      # If set, vllm will print verbose logs during installation
      "VERBOSE":
      lambda: bool(int(os.getenv("VERBOSE", "0"))),

      # Root directory for VLLM configuration files
      # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
      # Note that this not only affects how vllm finds its configuration files
      # during runtime, but also affects how vllm installs its configuration
      # files during **installation**.
      "VLLM_CONFIG_ROOT":
      lambda: os.path.expanduser(
          os.getenv(
              "VLLM_CONFIG_ROOT",
              os.path.join(get_default_config_root(), "vllm"),
          )),

      # ================== Runtime Env Vars ==================

      # Root directory for VLLM cache files
      # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
      "VLLM_CACHE_ROOT":
      lambda: os.path.expanduser(
          os.getenv(
              "VLLM_CACHE_ROOT",
              os.path.join(get_default_cache_root(), "vllm"),
          )),

      # used in distributed environment to determine the master address
      # Falls back to `HOST_IP` when `VLLM_HOST_IP` is unset or empty.
      "VLLM_HOST_IP":
      lambda: os.getenv("VLLM_HOST_IP", "") or os.getenv("HOST_IP", ""),

      # used in distributed environment to manually set the communication port
      # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
      # VLLM_PORT will be used as the first port, and the rest will be generated
      # by incrementing the VLLM_PORT value.
      # Returns None when VLLM_PORT is unset.
      # '0' is used to make mypy happy
      "VLLM_PORT":
      lambda: int(os.getenv("VLLM_PORT", "0"))
      if "VLLM_PORT" in os.environ else None,

      # If true, will load models from ModelScope instead of Hugging Face Hub.
      # note that the value is true or false, not numbers
      "VLLM_USE_MODELSCOPE":
      lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

      # Instance id represents an instance of the VLLM. All processes in the same
      # instance should have the same instance id.
      "VLLM_INSTANCE_ID":
      lambda: os.environ.get("VLLM_INSTANCE_ID", None),

      # Interval in seconds to log a warning message when the ring buffer is full
      "VLLM_RINGBUFFER_WARNING_INTERVAL":
      lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),

      # path to cudatoolkit home directory, under which should be bin, include,
      # and lib directories.
      "CUDA_HOME":
      lambda: os.environ.get("CUDA_HOME", None),

      # Path to the NCCL library file. It is needed because nccl>=2.19 brought
      # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
      "VLLM_NCCL_SO_PATH":
      lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

      # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
      # library file in the locations specified by `LD_LIBRARY_PATH`
      "LD_LIBRARY_PATH":
      lambda: os.environ.get("LD_LIBRARY_PATH", None),

      # flag to control if vllm should use triton flash attention
      # Enabled by default; only "true" or "1" (case-insensitive) enable it.
      "VLLM_USE_TRITON_FLASH_ATTN":
      lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
               ("true", "1")),

      # local rank of the process in the distributed setting, used to determine
      # the GPU device id
      "LOCAL_RANK":
      lambda: int(os.environ.get("LOCAL_RANK", "0")),

      # used to control the visible devices in the distributed setting
      "CUDA_VISIBLE_DEVICES":
      lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

      # timeout for each iteration in the engine
      "VLLM_ENGINE_ITERATION_TIMEOUT_S":
      lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

      # API key for VLLM API server
      "VLLM_API_KEY":
      lambda: os.environ.get("VLLM_API_KEY", None),

      # S3 access information, used for tensorizer to load model from S3
      "S3_ACCESS_KEY_ID":
      lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
      "S3_SECRET_ACCESS_KEY":
      lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
      "S3_ENDPOINT_URL":
      lambda: os.environ.get("S3_ENDPOINT_URL", None),

      # Usage stats collection
      "VLLM_USAGE_STATS_SERVER":
      lambda: os.environ.get("VLLM_USAGE_STATS_SERVER",
                             "https://stats.vllm.ai"),
      "VLLM_NO_USAGE_STATS":
      lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
      # `VLLM_DO_NOT_TRACK` takes precedence; `DO_NOT_TRACK` is the fallback.
      "VLLM_DO_NOT_TRACK":
      lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
          "DO_NOT_TRACK", None) or "0") == "1",
      "VLLM_USAGE_SOURCE":
      lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

      # Logging configuration
      # If set to 0, vllm will not configure logging
      # If set to 1, vllm will configure logging using the default configuration
      # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
      "VLLM_CONFIGURE_LOGGING":
      lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
      "VLLM_LOGGING_CONFIG_PATH":
      lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

      # this is used for configuring the default logging level
      "VLLM_LOGGING_LEVEL":
      lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"),

      # Trace function calls
      # If set to 1, vllm will trace function calls
      # Useful for debugging
      "VLLM_TRACE_FUNCTION":
      lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

      # Backend for attention computation
      # Available options:
      # - "TORCH_SDPA": use torch.nn.MultiheadAttention
      # - "FLASH_ATTN": use FlashAttention
      # - "XFORMERS": use XFormers
      # - "ROCM_FLASH": use ROCmFlashAttention
      # - "FLASHINFER": use flashinfer
      "VLLM_ATTENTION_BACKEND":
      lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

      # CPU key-value cache space
      # This reader returns 0 when the variable is unset.
      # NOTE(review): the documented 4GB default is presumably applied by the
      # consumer when it sees 0 -- confirm against the CPU backend.
      "VLLM_CPU_KVCACHE_SPACE":
      lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

      # OpenVINO key-value cache space
      # This reader returns 0 when the variable is unset.
      # NOTE(review): the documented 4GB default is presumably applied by the
      # consumer when it sees 0 -- confirm against the OpenVINO backend.
      "VLLM_OPENVINO_KVCACHE_SPACE":
      lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),

      # OpenVINO KV cache precision
      # default is bf16 if natively supported by platform, otherwise f16
      # To enable KV cache compression, please, explicitly specify u8
      "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
      lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),

      # Enables weights compression during model export via HF Optimum
      # default is False
      # An unset/empty value, "0" and "false" disable it; any other value
      # enables it. (A plain bool() on the raw string would treat "0" as True.)
      "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
      lambda: (os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower()
               not in ("", "0", "false")),

      # If enabled, all workers will execute as separate
      # processes from the engine, and we use the same mechanism to trigger
      # execution on all workers.
      # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
      # An unset/empty value, "0" and "false" leave it disabled.
      "VLLM_USE_RAY_SPMD_WORKER":
      lambda: (os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0").lower()
               not in ("", "0", "false")),

      # If enabled, it uses the Ray's compiled DAG API
      # which optimizes the control plane overhead.
      # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
      # An unset/empty value, "0" and "false" leave it disabled.
      "VLLM_USE_RAY_COMPILED_DAG":
      lambda: (os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0").lower()
               not in ("", "0", "false")),

      # Use dedicated multiprocess context for workers.
      # Both spawn and fork work
      "VLLM_WORKER_MULTIPROC_METHOD":
      lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),

      # Path to the cache for storing downloaded assets
      "VLLM_ASSETS_CACHE":
      lambda: os.path.expanduser(
          os.getenv(
              "VLLM_ASSETS_CACHE",
              os.path.join(get_default_cache_root(), "vllm", "assets"),
          )),

      # Timeout for fetching images when serving multimodal models
      # Default is 5 seconds
      "VLLM_IMAGE_FETCH_TIMEOUT":
      lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

      # Path to the XLA persistent cache directory.
      # Only used for XLA devices such as TPUs.
      # Reads its own variable; it must not reuse VLLM_ASSETS_CACHE, or the
      # assets override would silently redirect the XLA cache as well.
      "VLLM_XLA_CACHE_PATH":
      lambda: os.path.expanduser(
          os.getenv(
              "VLLM_XLA_CACHE_PATH",
              os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
          )),

      # Chunk size used by the fused MoE implementation (default 65536).
      "VLLM_FUSED_MOE_CHUNK_SIZE":
      lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),

      # If set, vllm will skip the deprecation warnings.
      "VLLM_NO_DEPRECATION_WARNING":
      lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
  }