Offline Inference With Prefix

Source: vllm-project/vllm.

from vllm import LLM, SamplingParams

prefix = (
    "You are an expert school principal, skilled in effectively managing "
    "faculty and staff. Draft 10-15 questions for a potential first grade "
    "Head Teacher for my K-12, all-girls', independent school that emphasizes "
    "community, joyful discovery, and life-long learning. The candidate is "
    "coming in for a first-round panel interview for a 8th grade Math "
    "teaching role. They have 5 years of previous teaching experience "
    "as an assistant teacher at a co-ed, public school with experience "
    "in middle school math teaching. Based on these information, fulfill "
    "the following paragraph: ")

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)

# Create an LLM.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

generating_prompts = [prefix + prompt for prompt in prompts]

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(generating_prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

print("-" * 80)

# The llm.generate call will batch all prompts and send the batch at once
# if resources allow. The prefix will only be cached after the first batch
# is processed, so we need to call generate once to calculate the prefix
# and cache it.
outputs = llm.generate(generating_prompts[0], sampling_params)

# Subsequent batches can leverage the cached prefix.
outputs = llm.generate(generating_prompts, sampling_params)

# Print the outputs. You should see the same outputs as before.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
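
The example relies on two properties worth making explicit: greedy decoding (temperature=0.0) is deterministic, so the cached run should reproduce the earlier text exactly, and once the prefix is cached a later batch should run faster than the very first one. The sketch below checks both. It is a minimal illustration, not part of the original example: it assumes llm, generating_prompts, sampling_params, and outputs from the listing above are still in scope, and the timed_generate helper is an invented name, not a vLLM API.

import time

def timed_generate(llm, prompts, params):
    # Illustrative helper (not part of vLLM): run generate() and report
    # the wall-clock time it took.
    start = time.time()
    outs = llm.generate(prompts, params)
    print(f"Generated {len(outs)} outputs in {time.time() - start:.2f} s")
    return outs

# With the prefix already cached, this batch should complete faster than the
# very first llm.generate call, since the shared prefix tokens are not recomputed.
rerun_outputs = timed_generate(llm, generating_prompts, sampling_params)

# Greedy decoding is deterministic, so the re-run should reproduce the
# previous generations exactly.
rerun_texts = [o.outputs[0].text for o in rerun_outputs]
previous_texts = [o.outputs[0].text for o in outputs]
assert rerun_texts == previous_texts, "cached re-run diverged from earlier outputs"

Note that automatic prefix caching works at the granularity of KV-cache blocks, so a shared prefix is only reused once it spans at least one full block; the long prefix in this example comfortably exceeds that threshold.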