Tensorize vLLM Model

Source vllm-project/vllm.

  1. 1import argparse
  2. 2import dataclasses
  3. 3import json
  4. 4import os
  5. 5import uuid
  6. 6
  7. 7from vllm import LLM
  8. 8from vllm.engine.arg_utils import EngineArgs
  9. 9from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
  10. 10 TensorizerConfig,
  11. 11 tensorize_vllm_model)
  12. 12from vllm.utils import FlexibleArgumentParser
  13. 13
  14. 14# yapf conflicts with isort for this docstring
  15. 15# yapf: disable
  16. 16"""
  17. 17tensorize_vllm_model.py is a script that can be used to serialize and
  18. 18deserialize vLLM models. These models can be loaded using tensorizer
  19. 19to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
  20. 20or locally. Tensor encryption and decryption is also supported, although
  21. 21libsodium must be installed to use it. Install vllm with tensorizer support
  22. 22using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
  23. 23https://github.com/coreweave/tensorizer
  24. 24
  25. 25To serialize a model, install vLLM from source, then run something
  26. 26like this from the root level of this repository:
  27. 27
  28. 28python -m examples.tensorize_vllm_model \
  29. 29 --model facebook/opt-125m \
  30. 30 serialize \
  31. 31 --serialized-directory s3://my-bucket \
  32. 32 --suffix v1
  33. 33
  34. 34Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
  35. 35and saves it to your S3 bucket. A local directory can also be used. This
  36. 36assumes your S3 credentials are specified as environment variables
  37. 37in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
  38. 38`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
  39. 39`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
  40. 40as CLI args to this script.
  41. 41
  42. 42You can also encrypt the model weights with a randomly-generated key by
  43. 43providing a `--keyfile` argument.
  44. 44
  45. 45To deserialize a model, you can run something like this from the root
  46. 46level of this repository:
  47. 47
  48. 48python -m examples.tensorize_vllm_model \
  49. 49 --model EleutherAI/gpt-j-6B \
  50. 50 --dtype float16 \
  51. 51 deserialize \
  52. 52 --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors
  53. 53
  54. 54Which downloads the model tensors from your S3 bucket and deserializes them.
  55. 55
  56. 56You can also provide a `--keyfile` argument to decrypt the model weights if
  57. 57they were serialized with encryption.
  58. 58
  59. 59To support distributed tensor-parallel models, each model shard will be
  60. 60serialized to a separate file. The tensorizer_uri is then specified as a string
  61. 61template with a format specifier such as '%03d' that will be rendered with the
  62. 62shard's rank. Sharded models serialized with this script will be named as
  63. 63model-rank-%03d.tensors
  64. 64
  65. 65For more information on the available arguments for serializing, run
  66. 66`python -m examples.tensorize_vllm_model serialize --help`.
  67. 67
  68. 68Or for deserializing:
  69. 69
  70. 70`python -m examples.tensorize_vllm_model deserialize --help`.
  71. 71
  72. 72Once a model is serialized, tensorizer can be invoked with the `LLM` class
  73. 73directly to load models:
  74. 74
  75. 75 llm = LLM(model="facebook/opt-125m",
  76. 76 load_format="tensorizer",
  77. 77 model_loader_extra_config=TensorizerConfig(
  78. 78 tensorizer_uri = path_to_tensors,
  79. 79 num_readers=3,
  80. 80 )
  81. 81 )
  82. 82
  83. 83A serialized model can be used during model loading for the vLLM OpenAI
  84. 84inference server. `model_loader_extra_config` is exposed as the CLI arg
  85. 85`--model-loader-extra-config`, and accepts a JSON string literal of the
  86. 86TensorizerConfig arguments desired.
  87. 87
  88. 88In order to see all of the available arguments usable to configure
  89. 89loading with tensorizer that are given to `TensorizerConfig`, run:
  90. 90
  91. 91`python -m examples.tensorize_vllm_model deserialize --help`
  92. 92
  93. 93under the `tensorizer options` section. These can also be used for
  94. 94deserialization in this example script, although `--tensorizer-uri` and
  95. 95`--path-to-tensors` are functionally the same in this case.
  96. 96"""
  97. 97
  98. 98
  99. 99def parse_args():
  100. 100 parser = FlexibleArgumentParser(
  101. 101 description="An example script that can be used to serialize and "
  102. 102 "deserialize vLLM models. These models "
  103. 103 "can be loaded using tensorizer directly to the GPU "
  104. 104 "extremely quickly. Tensor encryption and decryption is "
  105. 105 "also supported, although libsodium must be installed to "
  106. 106 "use it.")
  107. 107 parser = EngineArgs.add_cli_args(parser)
  108. 108 subparsers = parser.add_subparsers(dest='command')
  109. 109
  110. 110 serialize_parser = subparsers.add_parser(
  111. 111 'serialize', help="Serialize a model to `--serialized-directory`")
  112. 112
  113. 113 serialize_parser.add_argument(
  114. 114 "--suffix",
  115. 115 type=str,
  116. 116 required=False,
  117. 117 help=(
  118. 118 "The suffix to append to the serialized model directory, which is "
  119. 119 "used to construct the location of the serialized model tensors, "
  120. 120 "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
  121. 121 "`--suffix` is `v1`, the serialized model tensors will be "
  122. 122 "saved to "
  123. 123 "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
  124. 124 "If none is provided, a random UUID will be used."))
  125. 125 serialize_parser.add_argument(
  126. 126 "--serialized-directory",
  127. 127 type=str,
  128. 128 required=True,
  129. 129 help="The directory to serialize the model to. "
  130. 130 "This can be a local directory or S3 URI. The path to where the "
  131. 131 "tensors are saved is a combination of the supplied `dir` and model "
  132. 132 "reference ID. For instance, if `dir` is the serialized directory, "
  133. 133 "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
  134. 134 "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
  135. 135 "where `suffix` is given by `--suffix` or a random UUID if not "
  136. 136 "provided.")
  137. 137
  138. 138 serialize_parser.add_argument(
  139. 139 "--keyfile",
  140. 140 type=str,
  141. 141 required=False,
  142. 142 help=("Encrypt the model weights with a randomly-generated binary key,"
  143. 143 " and save the key at this path"))
  144. 144
  145. 145 deserialize_parser = subparsers.add_parser(
  146. 146 'deserialize',
  147. 147 help=("Deserialize a model from `--path-to-tensors`"
  148. 148 " to verify it can be loaded and used."))
  149. 149
  150. 150 deserialize_parser.add_argument(
  151. 151 "--path-to-tensors",
  152. 152 type=str,
  153. 153 required=True,
  154. 154 help="The local path or S3 URI to the model tensors to deserialize. ")
  155. 155
  156. 156 deserialize_parser.add_argument(
  157. 157 "--keyfile",
  158. 158 type=str,
  159. 159 required=False,
  160. 160 help=("Path to a binary key to use to decrypt the model weights,"
  161. 161 " if the model was serialized with encryption"))
  162. 162
  163. 163 TensorizerArgs.add_cli_args(deserialize_parser)
  164. 164
  165. 165 return parser.parse_args()
  166. 166
  167. 167
  168. 168
  169. 169def deserialize():
  170. 170 llm = LLM(model=args.model,
  171. 171 load_format="tensorizer",
  172. 172 tensor_parallel_size=args.tensor_parallel_size,
  173. 173 model_loader_extra_config=tensorizer_config
  174. 174 )
  175. 175 return llm
  176. 176
  177. 177
  178. 178if __name__ == '__main__':
  179. 179 args = parse_args()
  180. 180
  181. 181 s3_access_key_id = (getattr(args, 's3_access_key_id', None)
  182. 182 or os.environ.get("S3_ACCESS_KEY_ID", None))
  183. 183 s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
  184. 184 or os.environ.get("S3_SECRET_ACCESS_KEY", None))
  185. 185 s3_endpoint = (getattr(args, 's3_endpoint', None)
  186. 186 or os.environ.get("S3_ENDPOINT_URL", None))
  187. 187
  188. 188 credentials = {
  189. 189 "s3_access_key_id": s3_access_key_id,
  190. 190 "s3_secret_access_key": s3_secret_access_key,
  191. 191 "s3_endpoint": s3_endpoint
  192. 192 }
  193. 193
  194. 194 model_ref = args.model
  195. 195
  196. 196 model_name = model_ref.split("/")[1]
  197. 197
  198. 198 keyfile = args.keyfile if args.keyfile else None
  199. 199
  200. 200 if args.model_loader_extra_config:
  201. 201 config = json.loads(args.model_loader_extra_config)
  202. 202 tensorizer_args = \
  203. 203 TensorizerConfig(**config)._construct_tensorizer_args()
  204. 204 tensorizer_args.tensorizer_uri = args.path_to_tensors
  205. 205 else:
  206. 206 tensorizer_args = None
  207. 207
  208. 208 if args.command == "serialize":
  209. 209 eng_args_dict = {f.name: getattr(args, f.name) for f in
  210. 210 dataclasses.fields(EngineArgs)}
  211. 211
  212. 212 engine_args = EngineArgs.from_cli_args(
  213. 213 argparse.Namespace(**eng_args_dict)
  214. 214 )
  215. 215
  216. 216 input_dir = args.serialized_directory.rstrip('/')
  217. 217 suffix = args.suffix if args.suffix else uuid.uuid4().hex
  218. 218 base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
  219. 219 if engine_args.tensor_parallel_size > 1:
  220. 220 model_path = f"{base_path}/model-rank-%03d.tensors"
  221. 221 else:
  222. 222 model_path = f"{base_path}/model.tensors"
  223. 223
  224. 224 tensorizer_config = TensorizerConfig(
  225. 225 tensorizer_uri=model_path,
  226. 226 encryption_keyfile=keyfile,
  227. 227 **credentials)
  228. 228
  229. 229 tensorize_vllm_model(engine_args, tensorizer_config)
  230. 230
  231. 231 elif args.command == "deserialize":
  232. 232 if not tensorizer_args:
  233. 233 tensorizer_config = TensorizerConfig(
  234. 234 tensorizer_uri=args.path_to_tensors,
  235. 235 encryption_keyfile = keyfile,
  236. 236 **credentials
  237. 237 )
  238. 238 deserialize()
  239. 239 else:
  240. 240 raise ValueError("Either serialize or deserialize must be specified.")