qwen2.5-vl使用vllm部署gradio页面调用
想在服务器上用vllm部署qwen2.5-vl, 然后在本地通过gradio页面调试。官方代码给了启动命令, 并列出了request body的结构, 不过要与gradio联用, 还需要重新组织代码。
官方服务代码如下:
vllm serve Qwen/Qwen2.5-VL-7B-Instruct --port 8000 --host 0.0.0.0 --dtype bfloat16 --limit-mm-per-prompt image=5,video=5
import base64
from openai import OpenAI

# Point the OpenAI client at vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Read the local image and wrap its base64 text in a data URL.
image_path = "/path/to/local/image.png"
with open(image_path, "rb") as f:
    encoded_image = base64.b64encode(f.read())
encoded_image_text = encoded_image.decode("utf-8")
base64_qwen = f"data:image;base64,{encoded_image_text}"

# The user turn carries two parts: the image (as a data URL) and the question.
image_part = {"type": "image_url", "image_url": {"url": base64_qwen}}
text_part = {"type": "text", "text": "What is the text in the illustrate?"}

chat_response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [image_part, text_part]},
    ],
)
print("Chat response:", chat_response)
vllm有两种命令行起服务的方式,vllm serve …和python -m vllm.entrypoints.api_server …, vllm serve是一种简单的方式,可以满足大多数场景,如果想要对参数有更精细调控,可以使用python -m vllm.entrypoints.api_server …。
vllm官方其实不推荐使用api_server作为正式的服务,在生产环境中更建议使用vllm.entrypoints.openai.api_server, 毕竟它提供了api_key这样的鉴权机制,不过作为本地调试,vllm.entrypoints.api_server足够用了。
一 vllm serve调用
命令行起服务
vllm serve /home/Qwen2.5-VL-3B-Instruct-AWQ \
--port 22 --host 0.0.0.0 --dtype float16 --enforce-eager --max-model-len 4096 --limit-mm-per-prompt image=5,video=5
# -*- coding: utf-8 -*-
# @Time : 2025/3/10 上午11:31
# @Author : yblir
# @File : gradio_demo.py
# explain :
# =======================================================
import datetime
import os
import gradio as gr
import requests
import json
from transformers import AutoProcessor
from PIL import Image
import io
import base64
# TODO: edit manually -- point this at the local copy of the model directory.
# Alternative path previously used here:
# processor = AutoProcessor.from_pretrained('/home/Qwen25-VL-3B-Instruct-AWQ')
# NOTE(review): call_api in this script does not call `processor`, so this
# load looks unnecessary here -- confirm before removing.
processor = AutoProcessor.from_pretrained(r'E:\PyCharm\insteresting\Qwen2.5-VL-3B-Instruct-AWQ')
# File that call_api passes to save_file for the prompt/output log.
save_file_path = './gradio_output/output.txt'
# Chat-completions endpoint that call_api posts to.
# NOTE(review): the serve commands in this article use --port 22; 10091 is
# presumably the host port mapped onto it -- confirm for your setup.
post_url = "http://127.0.0.1:10091/v1/chat/completions"
def save_file(text, output, file_path='output.txt'):
    """Append one timestamped prompt/output record to a log file.

    Args:
        text: Prompt text that was sent to the model.
        output: Text the model returned for that prompt.
        file_path: Destination file, opened in append mode. Missing parent
            directories are created first.
    """
    folder_path = os.path.dirname(file_path)
    # A bare file name such as the default 'output.txt' has an empty dirname,
    # and os.makedirs('') raises FileNotFoundError, so only create a
    # directory when there actually is one.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(f"[{timestamp}] prompt: {text}, output: {output}\n")
def _transform_messages(original_messages):
    """Normalize chat messages into lists of typed content items.

    Every content item is rewritten as ``{'type': kind, kind: value}`` for
    the first of 'image', 'text', 'video' found among its keys. Any other
    keys are discarded, and items with none of those keys are skipped.
    A message whose content is a plain string is kept as one text item.

    Args:
        original_messages: Iterable of dicts with 'role' and 'content' keys.

    Returns:
        A new list of ``{'role': ..., 'content': [...]}`` dicts; the input
        messages are not modified.
    """
    transformed_messages = []
    for message in original_messages:
        content = message['content']
        if isinstance(content, str):
            # Iterating a string yields single characters, none of which can
            # match a key below, so string content (e.g. a system prompt)
            # used to vanish. Wrap it as a text item instead.
            content = [{'text': content}]

        new_content = []
        for item in content:
            # Key precedence is image, then text, then video.
            for kind in ('image', 'text', 'video'):
                if kind in item:
                    new_content.append({'type': kind, kind: item[kind]})
                    break

        transformed_messages.append({'role': message['role'], 'content': new_content})
    return transformed_messages
def call_api(image_path, text):
    """Send one image plus a text prompt to the chat-completions endpoint.

    Args:
        image_path: Path of the uploaded image on local disk (the Gradio
            image input is declared with type="filepath").
        text: Prompt text entered by the user.

    Returns:
        A ``(generated_text, status_message)`` tuple, one element per Gradio
        output component.

    Raises:
        gr.Error: If no image was provided.
        requests.HTTPError: If the server answers with an error status.
    """
    if not image_path:
        # Presumably Gradio passes None when the image input is left empty;
        # surface a readable message instead of failing while opening None.
        raise gr.Error('请先上传图片')

    # Send the file bytes unchanged. Decoding and re-saving through PIL would
    # recompress lossy formats and gives the server nothing extra.
    with open(image_path, 'rb') as f:
        img_base64 = base64.b64encode(f.read()).decode('utf-8')
    base64_qwen = f"data:image;base64,{img_base64}"

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": base64_qwen}},
                {"type": "text", "text": text},
            ],
        },
    ]
    # Only the required fields are sent. Sampling options such as
    # temperature, top_p, repetition_penalty, max_tokens or stop_token_ids
    # can be added to this dict when tuning is needed.
    payload = {
        'model': '/home/Qwen2.5-VL-3B-Instruct-AWQ',
        'messages': messages,
    }

    response = requests.post(
        url=post_url,
        headers={"Content-Type": "application/json"},
        json=payload,
        # (connect, read) seconds -- never hang forever on a dead server.
        timeout=(10, 600),
    )
    # Fail on HTTP errors here rather than with a KeyError on 'choices' below.
    response.raise_for_status()

    result = response.json()
    print(result)
    content = result['choices'][0]['message']['content']
    save_file(text, content, save_file_path)
    return content, f'文件保存: {save_file_path}'
# 创建 Gradio 界面
# Two inputs (image file path + prompt text) feed call_api; its two return
# values fill the text box and the Markdown status line.
demo = gr.Interface(
    title="Chat with Qwen2.5-VL-Instruct",
    description="分别上传图片和文本",
    fn=call_api,
    inputs=[
        gr.Image(label="上传图片", type="filepath"),
        gr.Textbox(label="文本输入"),
    ],
    outputs=[
        gr.Textbox(label='文本生成'),
        gr.Markdown(show_label=False),
    ],
)
if __name__ == '__main__':
    # Server start commands kept here for reference:
    #
    #   vllm serve /home/Qwen2.5-VL-3B-Instruct-AWQ \
    #       --port 22 --host 0.0.0.0 --dtype float16 --enforce-eager \
    #       --max-model-len 4096 --limit-mm-per-prompt image=5,video=5
    #
    #   CUDA_VISIBLE_DEVICES=6 python3 -m vllm.entrypoints.api_server \
    #       --model /home/Qwen2.5-VL-3B-Instruct-AWQ \
    #       --port 22 --host 0.0.0.0 --dtype float16 --enforce-eager \
    #       --max-model-len 20480 --limit-mm-per-prompt image=5,video=5
    #
    # Launch the Gradio app on localhost only; pass server_name='0.0.0.0'
    # instead to listen on all interfaces.
    demo.launch(server_name='127.0.0.1', server_port=7865)
关于页面的展示,这两种方式都一样,都在下面合并介绍。
二 vllm.entrypoints.api_server调用
这种调用方式要改动vllm部分源码:
- /usr/local/lib/python3.10/dist-packages/vllm/multimodal/parse.py
# 增加对图片数据的解包处理
def parse_mm_data(self,
                  mm_data: MultiModalDataDict) -> MultiModalDataItems:
    """Parse raw multi-modal inputs into ``MultiModalDataItems``.

    Local patch: when ``mm_data['image']`` is a ``str`` it is treated as
    base64-encoded image bytes and replaced, in place, by the decoded PIL
    image before the per-modality subparsers run.

    Args:
        mm_data: Mapping from modality name to its raw data.

    Returns:
        The parsed items, one entry per modality present in ``mm_data``.

    Raises:
        ValueError: If ``mm_data`` contains a modality with no subparser.
    """
    # Imported locally so the patch does not depend on what the patched
    # module already imports.
    import base64
    import io

    import PIL.Image

    subparsers = self._get_subparsers()

    # Use .get() rather than indexing: a request without any image
    # (e.g. video only) must not fail with KeyError.
    image = mm_data.get('image')
    if isinstance(image, str):
        img_bytes = base64.b64decode(image)
        mm_data['image'] = PIL.Image.open(io.BytesIO(img_bytes))

    mm_items = MultiModalDataItems()
    for k, v in mm_data.items():
        if k not in subparsers:
            raise ValueError(f"Unsupported modality: {k}")
        mm_items[k] = subparsers[k](v)
    return mm_items
- /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/api_server.py
将多余的端口删掉, --port仅保留default选项
之后就可以使用命令行起服务, 这里容器使用桥接模式, 只映射出了22端口
python3 -m vllm.entrypoints.api_server --model /home/Qwen2.5-VL-3B-Instruct-AWQ \
--port 22 --host 0.0.0.0 --dtype float16 --enforce-eager --max-model-len 4096 --limit-mm-per-prompt image=5,video=5
当出现url时,就算服务起成功.
- 本地执行gradio页面代码
# -*- coding: utf-8 -*-
# @Time : 2025/3/10 上午11:31
# @Author : yblir
# @File : gradio_demo.py
# explain :
# =======================================================
import datetime
import os
import gradio as gr
import requests
import json
from transformers import AutoProcessor
from PIL import Image
import io
import base64
# TODO: edit manually -- point this at the local copy of the model directory.
# Alternative path previously used here:
# processor = AutoProcessor.from_pretrained('/home/Qwen25-VL-3B-Instruct-AWQ')
# call_api uses this processor to render the chat template into prompt text.
processor = AutoProcessor.from_pretrained(r'E:\PyCharm\insteresting\Qwen2.5-VL-3B-Instruct-AWQ')
# File that call_api passes to save_file for the prompt/output log.
save_file_path = './gradio_output/output.txt'
# NOTE(review): the api_server command in this article uses --port 22; 10091
# is presumably the host port mapped onto it -- confirm for your setup.
post_url = "http://127.0.0.1:10091/generate" # the api_server entrypoint needs this route
def save_file(text, output, file_path='output.txt'):
    """Append one timestamped prompt/output record to a log file.

    Args:
        text: Prompt text that was sent to the model.
        output: Text the model returned for that prompt.
        file_path: Destination file, opened in append mode. Missing parent
            directories are created first.
    """
    folder_path = os.path.dirname(file_path)
    # A bare file name such as the default 'output.txt' has an empty dirname,
    # and os.makedirs('') raises FileNotFoundError, so only create a
    # directory when there actually is one.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(f"[{timestamp}] prompt: {text}, output: {output}\n")
def _transform_messages(original_messages):
    """Reduce each message's content to minimal typed items.

    Every content item becomes ``{'type': kind, kind: value}`` for the first
    of 'image', 'text', 'video' found in it; other keys are discarded and
    items matching none of the three are skipped.

    Note: ``content`` is iterated as given. A plain-string content is walked
    character by character, nothing matches, and the resulting content list
    is empty.

    Args:
        original_messages: Iterable of dicts with 'role' and 'content' keys.

    Returns:
        A new list of ``{'role': ..., 'content': [...]}`` dicts.
    """
    def _pick(part):
        # Key precedence is image, then text, then video.
        for kind in ('image', 'text', 'video'):
            if kind in part:
                return {'type': kind, kind: part[kind]}
        return None

    result = []
    for msg in original_messages:
        picked = [_pick(part) for part in msg['content']]
        kept = [entry for entry in picked if entry is not None]
        result.append({'role': msg['role'], 'content': kept})
    return result
def call_api(image, text):
    """Stream the model's answer for one image and prompt from /generate.

    Generator used as the Gradio callback. It renders the chat template
    locally, posts the prompt together with the base64-encoded image, and
    yields the answer as it grows. After the stream ends the final answer is
    appended to the log file.

    Args:
        image: Path of the uploaded image on local disk (the Gradio image
            input is declared with type="filepath").
        text: Prompt text entered by the user.

    Yields:
        ``(answer_so_far, status_message)`` tuples, one element per Gradio
        output component.

    Raises:
        gr.Error: If no image was provided.
        requests.HTTPError: If the server answers with an error status.
    """
    if not image:
        # Presumably Gradio passes None when the image input is left empty;
        # surface a readable message instead of failing while opening None.
        raise gr.Error('请先上传图片')

    messages = [
        # The system prompt is given as a typed text item on purpose:
        # _transform_messages iterates `content`, and a plain string would
        # come out as an empty list, silently dropping the system prompt.
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                    "min_pixels": 224 * 224,
                    "max_pixels": 1280 * 28 * 28,
                },
                {"type": "text", "text": text},
            ],
        },
    ]
    messages = _transform_messages(messages)
    text2 = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Send the file bytes unchanged. Decoding and re-saving through PIL would
    # recompress lossy formats and gives the server nothing extra.
    with open(image, 'rb') as f:
        img_base64 = base64.b64encode(f.read()).decode('utf-8')

    body = {
        'prompt': {
            'prompt': text2,
            'multi_modal_data': {'image': img_base64},
        },
        'stream': True,
        "temperature": 0.1,
        "top_p": 0.001,
        "repetition_penalty": 1.05,
        "max_tokens": 512,
        "stop_token_ids": [],
    }

    # Each streamed chunk carries prompt + generated text. The answer starts
    # after the prompt's last 'assistant' marker, so count how often the word
    # occurs in the prompt and split exactly that many times. This stays
    # correct when the system prompt, the user text or the answer itself
    # contains 'assistant'. Assumes the echoed prompt contains the word as
    # often as text2 does.
    marker = 'assistant'
    prompt_markers = text2.count(marker)

    full_output = ''
    # stream=True is required for incremental delivery; without it requests
    # downloads the whole response before iter_lines() yields anything.
    with requests.post(
        url=post_url,
        headers={"Content-Type": "application/json"},
        json=body,
        stream=True,
        # (connect, read) seconds -- never hang forever on a dead server.
        timeout=(10, 600),
    ) as response:
        response.raise_for_status()
        for chunk in response.iter_lines():
            if not chunk.strip():
                continue
            data = json.loads(chunk)
            output = data['text'][0]
            parts = output.split(marker, prompt_markers)
            if len(parts) <= prompt_markers:
                # The prompt has not been echoed completely yet.
                continue
            full_output = parts[-1].strip()
            yield full_output, f'文件保存: {save_file_path}'

    save_file(text, full_output, save_file_path)
# 创建 Gradio 界面
# Two inputs (image file path + prompt text) feed call_api; the values it
# produces fill the text box and the Markdown status line.
demo = gr.Interface(
    title="Chat with Qwen2.5-VL-Instruct",
    description="分别上传图片和文本",
    fn=call_api,
    inputs=[
        gr.Image(label="上传图片", type="filepath"),
        gr.Textbox(label="文本输入"),
    ],
    outputs=[
        gr.Textbox(label='文本生成'),
        gr.Markdown(show_label=False),
    ],
)
if __name__ == '__main__':
    # Server start commands kept here for reference:
    #
    #   vllm serve /home/Qwen25-VL-3B-Instruct-AWQ \
    #       --port 8811 --host 0.0.0.0 --dtype float16 \
    #       --limit-mm-per-prompt image=5,video=5
    #
    #   CUDA_VISIBLE_DEVICES=6 python3 -m vllm.entrypoints.api_server \
    #       --model /home/Qwen2.5-VL-3B-Instruct-AWQ \
    #       --port 22 --host 0.0.0.0 --dtype float16 --enforce-eager \
    #       --max-model-len 20480 --limit-mm-per-prompt image=5,video=5
    #
    # Launch the Gradio app on localhost only; pass server_name='0.0.0.0'
    # instead to listen on all interfaces.
    demo.launch(server_name='127.0.0.1', server_port=7865)
运行代码,在浏览器访问
结果输出:
结果保存:
[2025-03-13 00:12:44] prompt: 简单描述下这张图片, output: 这张图片展示了一位蓝发的动漫角色,背景是夕阳下的城市景观。角色穿着一件蓝色的外套,头发被扎成一个高高的发髻。整体色调温暖,给人一种宁静的感觉。
后记:
vllm serve方式直接传入text,没有使用模板, 和有模板时的输出做过对比, 发现没多大区别.
第二种方式文本必须使用模板,否则输出结果会变差.