Offline Inference Neuron

Source: vllm-project/vllm.

```python
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # The max_model_len and block_size arguments are required to be the same as
    # the max sequence length when targeting the Neuron device.
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
    max_model_len=128,
    block_size=128,
    # The device can be automatically detected when the AWS Neuron SDK is installed.
    # The device argument can be either unspecified for automated detection,
    # or explicitly assigned.
    device="neuron",
    tensor_parallel_size=2)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
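
As the comments in the listing note, the Neuron backend currently requires max_model_len and block_size to equal the maximum sequence length. The sketch below illustrates keeping the two values in lockstep when allowing longer sequences; the 256-token limit and the max_tokens setting are illustrative assumptions (not tested defaults), and assume the compiled model and available Neuron cores can accommodate them.

```python
from vllm import LLM, SamplingParams

# Sketch: raise the maximum sequence length to 256 tokens.
# On Neuron, block_size must stay equal to max_model_len
# (a current limitation of continuous batching in transformers-neuronx).
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    max_model_len=256,   # assumed limit for illustration
    block_size=256,      # must match max_model_len on Neuron
    device="neuron",
    tensor_parallel_size=2)

# max_tokens caps the number of tokens generated per request.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```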