Tensorize vLLM Model
Source vllm-project/vllm.
import argparse
import dataclasses
import json
import os
import uuid

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                         TensorizerConfig,
                                                         tensorize_vllm_model)
from vllm.utils import FlexibleArgumentParser

# yapf conflicts with isort for this docstring
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
or locally. Tensor encryption and decryption is also supported, although
libsodium must be installed to use it. Install vllm with tensorizer support
using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
https://github.com/coreweave/tensorizer

To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:

python -m examples.tensorize_vllm_model \
   --model facebook/opt-125m \
   serialize \
   --serialized-directory s3://my-bucket \
   --suffix v1

Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
and saves it to your S3 bucket. A local directory can also be used. This
assumes your S3 credentials are specified as environment variables
in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
as CLI args to this script.

You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.

To deserialize a model, you can run something like this from the root
level of this repository:

python -m examples.tensorize_vllm_model \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors

Which downloads the model tensors from your S3 bucket and deserializes them.

You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.

To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors

For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.

Or for deserializing:

`python -m examples.tensorize_vllm_model deserialize --help`.

Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                tensorizer_uri = path_to_tensors,
                num_readers=3,
                )
              )

A serialized model can be used during model loading for the vLLM OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.

In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:

`python -m examples.tensorize_vllm_model deserialize --help`

under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""


def parse_args():
    """Build the command-line parser and parse the process arguments.

    The top-level parser carries the vLLM engine arguments added by
    `EngineArgs.add_cli_args`. Two subcommands are registered under the
    `command` destination:

    * `serialize`: `--suffix`, `--serialized-directory` (required) and
      `--keyfile`.
    * `deserialize`: `--path-to-tensors` (required), `--keyfile`, plus the
      options added by `TensorizerArgs.add_cli_args`.

    Returns:
        The parsed argument namespace. `command` is None when no subcommand
        was given, and subcommand-specific attributes are then absent.
    """
    parser = FlexibleArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))
    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize. ")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


def deserialize():
    """Load the model with the `tensorizer` load format and return it.

    Reads the module-level `args` (for `model` and `tensor_parallel_size`)
    and `tensorizer_config`, both of which are assigned in the
    `if __name__ == '__main__':` section before this function is called.

    Returns:
        The constructed `LLM` instance.
    """
    llm = LLM(model=args.model,
              load_format="tensorizer",
              tensor_parallel_size=args.tensor_parallel_size,
              model_loader_extra_config=tensorizer_config
    )
    return llm


if __name__ == '__main__':
    args = parse_args()

    # Explicit CLI values win over the environment. `getattr` with a default
    # is used because these options are not present on the namespace for
    # every subcommand.
    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                        or os.environ.get("S3_ACCESS_KEY_ID", None))
    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
    s3_endpoint = (getattr(args, 's3_endpoint', None)
                   or os.environ.get("S3_ENDPOINT_URL", None))

    credentials = {
        "s3_access_key_id": s3_access_key_id,
        "s3_secret_access_key": s3_secret_access_key,
        "s3_endpoint": s3_endpoint
    }

    model_ref = args.model

    # `--keyfile` is only defined on the subparsers, so it is missing from
    # the namespace when no subcommand was given. Defaulting to None lets
    # that case reach the ValueError below instead of an AttributeError.
    keyfile = getattr(args, 'keyfile', None) or None

    if args.command == "serialize":
        eng_args_dict = {f.name: getattr(args, f.name) for f in
                         dataclasses.fields(EngineArgs)}

        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict)
        )

        input_dir = args.serialized_directory.rstrip('/')
        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
        if engine_args.tensor_parallel_size > 1:
            # One file per shard; '%03d' is rendered with the shard's rank.
            model_path = f"{base_path}/model-rank-%03d.tensors"
        else:
            model_path = f"{base_path}/model.tensors"

        tensorizer_config = TensorizerConfig(
            tensorizer_uri=model_path,
            encryption_keyfile=keyfile,
            **credentials)

        tensorize_vllm_model(engine_args, tensorizer_config)

    elif args.command == "deserialize":
        # `--path-to-tensors` only exists for this subcommand, so it must not
        # be read before the command has been checked.
        if args.model_loader_extra_config:
            extra_config = args.model_loader_extra_config
            # NOTE(review): this is a JSON string in the vLLM version this
            # example targets; a mapping is accepted too in case the engine
            # parser has already decoded it.
            if isinstance(extra_config, str):
                config = json.loads(extra_config)
            else:
                config = dict(extra_config)

            # `--path-to-tensors` always determines where tensors are read
            # from, matching the non-JSON branch below.
            config["tensorizer_uri"] = args.path_to_tensors

            # Values given explicitly in the JSON take precedence; CLI/env
            # values only fill in what the JSON left out.
            if keyfile:
                config.setdefault("encryption_keyfile", keyfile)
            for name, value in credentials.items():
                if value is not None:
                    config.setdefault(name, value)

            tensorizer_config = TensorizerConfig(**config)
        else:
            tensorizer_config = TensorizerConfig(
                tensorizer_uri=args.path_to_tensors,
                encryption_keyfile=keyfile,
                **credentials
            )
        deserialize()
    else:
        raise ValueError("Either serialize or deserialize must be specified.")