Offline Inference TPU

Source vllm-project/vllm.

  1. 1from vllm import LLM, SamplingParams
  2. 2
  3. 3prompts = [
  4. 4 "A robot may not injure a human being",
  5. 5 "It is only with the heart that one can see rightly;",
  6. 6 "The greatest glory in living lies not in never falling,",
  7. 7]
  8. 8answers = [
  9. 9 " or, through inaction, allow a human being to come to harm.",
  10. 10 " what is essential is invisible to the eye.",
  11. 11 " but in rising every time we fall.",
  12. 12]
  13. 13N = 1
  14. 14# Currently, top-p sampling is disabled. `top_p` should be 1.0.
  15. 15sampling_params = SamplingParams(temperature=0.7,
  16. 16 top_p=1.0,
  17. 17 n=N,
  18. 18 max_tokens=16)
  19. 19
  20. 20# Set `enforce_eager=True` to avoid ahead-of-time compilation.
  21. 21# In real workloads, `enforace_eager` should be `False`.
  22. 22llm = LLM(model="google/gemma-2b", enforce_eager=True)
  23. 23outputs = llm.generate(prompts, sampling_params)
  24. 24for output, answer in zip(outputs, answers):
  25. 25 prompt = output.prompt
  26. 26 generated_text = output.outputs[0].text
  27. 27 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
  28. 28 assert generated_text.startswith(answer)