AQLM Example
Source vllm-project/vllm.
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser


def main():
    """Load an AQLM-quantized model with vLLM and print one completion.

    Command-line options:
        --model / -m: model path (as for HF). When given, it takes
            precedence and --choice is not consulted.
        --choice / -c: index into the built-in list of known-good models;
            only used when --model is omitted. Defaults to 0.
        --tensor-parallel-size / -t: forwarded to ``LLM`` as
            ``tensor_parallel_size``. Defaults to 1.

    An out-of-range --choice is reported as a normal usage error instead of
    surfacing as an IndexError traceback (or silently wrapping around for
    negative values).
    """
    parser = FlexibleArgumentParser(description='AQLM examples')

    parser.add_argument('--model',
                        '-m',
                        type=str,
                        default=None,
                        help='model path, as for HF')
    parser.add_argument('--choice',
                        '-c',
                        type=int,
                        default=0,
                        help='known good models by index, [0-4]')
    parser.add_argument('--tensor-parallel-size',
                        '-t',
                        type=int,
                        default=1,
                        help='tensor parallel size')

    args = parser.parse_args()

    # The position in this list is what --choice selects; keep the order
    # stable so existing invocations continue to pick the same model.
    models = [
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
    ]

    if args.model is not None:
        model_name = args.model
    else:
        # Validate only when the index is actually used; an explicit
        # --model makes --choice irrelevant.
        if not 0 <= args.choice < len(models):
            parser.error(f"--choice must be in [0-{len(models) - 1}], "
                         f"got {args.choice}")
        model_name = models[args.choice]

    llm = LLM(model_name, tensor_parallel_size=args.tensor_parallel_size)

    sampling_params = SamplingParams(max_tokens=100, temperature=0)
    outputs = llm.generate("Hello my name is",
                           sampling_params=sampling_params)
    # One prompt was submitted, so print the first candidate of the first
    # (only) request output.
    print(outputs[0].outputs[0].text)


if __name__ == '__main__':
    main()