Gradio Webserver

Source: vllm-project/vllm.

This example serves a Gradio web UI that streams text completions from a running vLLM API server: each prompt submitted in the input box is POSTed to the server's /generate endpoint, and the NUL-delimited JSON chunks of the streamed response are decoded and yielded into the output box as they arrive.

```python
import argparse
import json

import gradio as gr
import requests


def http_bot(prompt):
    headers = {"User-Agent": "vLLM Client"}
    pload = {
        "prompt": prompt,
        "stream": True,
        "max_tokens": 128,
    }
    response = requests.post(args.model_url,
                             headers=headers,
                             json=pload,
                             stream=True)

    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
                                     delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0]
            yield output


def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# vLLM text completion demo\n")
        inputbox = gr.Textbox(label="Input",
                              placeholder="Enter text and press ENTER")
        outputbox = gr.Textbox(label="Output",
                               placeholder="Generated result from the model")
        inputbox.submit(http_bot, [inputbox], [outputbox])
    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    parser.add_argument("--model-url",
                        type=str,
                        default="http://localhost:8000/generate")
    args = parser.parse_args()

    demo = build_demo()
    demo.queue().launch(server_name=args.host,
                        server_port=args.port,
                        share=True)
```
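To try it against a real model, first start the vLLM API server, which serves /generate on port 8000 by default (for example `python -m vllm.entrypoints.api_server --model facebook/opt-125m`, where the model name is just an example), then run this script and open the printed Gradio URL.

If you want to exercise the UI without a GPU, the sketch below can stand in for the /generate endpoint. It is a hypothetical mock, not part of vLLM: it only mimics the wire format the script above relies on, namely a stream of JSON objects separated by NUL bytes (`b"\0"`), each carrying the cumulative text so far in `data["text"][0]`. The handler name and canned reply are invented for illustration.

```python
# Hypothetical stand-in for the vLLM /generate endpoint, for testing the
# Gradio UI without a GPU. Not part of vLLM; it only mimics the wire format
# the script above expects: JSON objects separated by NUL bytes, each with
# the cumulative output so far in data["text"][0].
import json
import time
from http.server import BaseHTTPRequestHandler, HTTPServer


class FakeGenerateHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        request = json.loads(self.rfile.read(length))
        prompt = request.get("prompt", "")
        self.send_response(200)
        self.end_headers()
        # Emit progressively longer completions; each chunk repeats the
        # full text so far, mirroring the cumulative streaming format.
        words = (prompt + " lorem ipsum dolor sit amet").split()
        for i in range(1, len(words) + 1):
            chunk = {"text": [" ".join(words[:i])]}
            self.wfile.write(json.dumps(chunk).encode("utf-8") + b"\0")
            self.wfile.flush()
            time.sleep(0.2)


if __name__ == "__main__":
    # Same host/port as the gradio script's default --model-url.
    HTTPServer(("localhost", 8000), FakeGenerateHandler).serve_forever()
```

With the mock running, launch the Gradio script in another terminal; typing a prompt in the input box should show the canned completion appearing word by word, which confirms the streaming path end to end.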