Offline Inference Neuron
Source: vllm-project/vllm.
# Offline batch text generation with vLLM on a Neuron device.
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # The max_model_len and block_size arguments are required to be same as
    # max sequence length when targeting neuron device.
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
    max_model_len=128,
    block_size=128,
    # The device can be automatically detected when AWS Neuron SDK is installed.
    # The device argument can be either unspecified for automated detection,
    # or explicitly assigned.
    device="neuron",
    tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")