Gradio OpenAI Chatbot Webserver
Source: vllm-project/vllm.
"""Gradio chat web UI backed by an OpenAI-compatible API server.

The script parses its command-line options, creates an OpenAI client that
points at ``--model-url``, and serves a Gradio ``ChatInterface`` whose replies
are streamed from the chat completions endpoint.
"""
import argparse
from typing import List

import gradio as gr
from openai import OpenAI

# Argument parser setup
parser = argparse.ArgumentParser(
    description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
                    type=str,
                    default='http://localhost:8000/v1',
                    help='Model URL')
parser.add_argument('-m',
                    '--model',
                    type=str,
                    required=True,
                    help='Model name for the chatbot')
parser.add_argument('--temp',
                    type=float,
                    default=0.8,
                    help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
                    type=str,
                    default='',
                    help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)

# Parse the arguments
args = parser.parse_args()


def _parse_stop_token_ids(raw: str) -> List[int]:
    """Convert a comma-separated string of token IDs into a list of ints.

    Blank entries (including an entirely empty string) are ignored.

    Raises:
        ValueError: If a non-blank entry is not a valid integer.
    """
    return [int(piece.strip()) for piece in raw.split(',') if piece.strip()]


# Parse the stop token IDs once at startup so that a malformed value is
# reported as a command-line error instead of failing on the first chat
# message.
try:
    stop_token_ids = _parse_stop_token_ids(args.stop_token_ids)
except ValueError:
    parser.error("--stop-token-ids must be a comma-separated list of "
                 f"integers, got {args.stop_token_ids!r}")

# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = args.model_url

# Create an OpenAI client to interact with the API server
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)


def predict(message, history):
    """Stream the assistant's reply to ``message`` given the chat so far.

    Args:
        message: The newest user message.
        history: Earlier turns of the conversation, iterated as
            ``(user_text, assistant_text)`` pairs.

    Yields:
        The reply text accumulated so far, once per received stream chunk, so
        the caller can display the answer as it grows.
    """
    # Convert chat history to OpenAI format
    history_openai_format = [{
        "role": "system",
        "content": "You are a great ai assistant."
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})

    # Create a chat completion request and send it to the API server
    stream = client.chat.completions.create(
        model=args.model,  # Model name to use
        messages=history_openai_format,  # Chat history
        temperature=args.temp,  # Temperature for text generation
        stream=True,  # Stream response
        extra_body={
            'repetition_penalty': 1,
            'stop_token_ids': stop_token_ids,
        })

    # Read and return generated text from response stream
    partial_message = ""
    for chunk in stream:
        # A chunk without choices carries no text; skip it rather than
        # raising IndexError on choices[0].
        if not chunk.choices:
            continue
        # delta.content may be None, so fall back to an empty string.
        partial_message += (chunk.choices[0].delta.content or "")
        yield partial_message


# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(server_name=args.host,
                                         server_port=args.port,
                                         share=True)