Offline Inference MLPSpeculator
Source: vllm-project/vllm.
import gc
import time
from typing import List

from vllm import LLM, SamplingParams


def time_generation(llm: LLM, prompts: List[str],
                    sampling_params: SamplingParams) -> None:
    """Time one generation pass and print the seconds spent per token.

    The prompts are generated twice as an untimed warm-up, then a third
    time under the clock. The elapsed time is divided by the total number
    of token ids in the first completion of every request, and the result
    is printed, followed by the text of each first completion.

    Args:
        llm: Engine used to run ``generate``.
        prompts: Prompts passed unchanged to every ``generate`` call.
        sampling_params: Sampling configuration passed to every call.
    """
    # Warmup first: two untimed passes before the measured one.
    llm.generate(prompts, sampling_params)
    llm.generate(prompts, sampling_params)

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start

    num_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
    if num_tokens:
        print(elapsed / num_tokens)
    else:
        # Nothing was generated (e.g. an empty prompt list); avoid dividing
        # by zero and report the situation instead.
        print("No tokens were generated; per-token latency is undefined.")

    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")


def main() -> None:
    """Run the same prompt without and then with a speculative model."""
    template = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request.\n\n### Instruction:\n{}"
        "\n\n### Response:\n")

    # Sample prompts.
    prompts = [
        "Write about the president of the United States.",
    ]
    prompts = [template.format(prompt) for prompt in prompts]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)

    # Create an LLM without spec decoding
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")

    print("Without speculation")
    time_generation(llm, prompts, sampling_params)

    # Drop the first engine before the second one is constructed.
    del llm
    gc.collect()

    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
        speculative_model="ibm-fms/llama-13b-accelerator",
        # These are currently required for MLPSpeculator decoding
        use_v2_block_manager=True,
    )

    print("With speculation")
    time_generation(llm, prompts, sampling_params)


if __name__ == "__main__":
    main()