Using UI-TARS and Omni-Parser
UI-TARS deployment and training
- Overview
- Quick start
- Environment setup
- UI-TARS web inference and training
- UI-TARS API deployment
- Using Omni-Parser
Overview
The image bundles UI-TARS, LLaMA-Factory, and Omni-Parser. At the time of writing it is still under review and expected to go live tomorrow; once published, you can find it by searching the community images on auto-dl.
Quick start
Use the auto-dl image:
https://www.codewithgpu.com/i/hiyouga/LLaMA-Factory/ui-tars_omni-parser_llama-factory
Environment setup
Copy the models from the system disk to the data disk (the commands below copy rather than move, so the originals remain until you delete them):
cp -r /root/model/UI-TARS-7B-DPO /root/autodl-tmp/
cp -r /root/omni /root/autodl-tmp/
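After verifying the copies, you can free up the system disk by deleting the originals, for example:
rm -rf /root/model/UI-TARS-7B-DPO
rm -rf /root/omni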
UI-TARS web inference and training
bash /root/LLaMA-Factory/chuli/one.sh
In the advanced settings, change the chat template to qwen2_vl; otherwise image upload will not work.
For detailed usage, see the official LLaMA-Factory repository:
https://github.com/hiyouga/LLaMA-Factory
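one.sh presumably wraps LLaMA-Factory's Gradio web UI; if you ever need to launch it manually, the equivalent upstream command is:
llamafactory-cli webui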
UI-TARS API deployment
Activate the conda environment:
conda activate llama
-tp sets the tensor-parallel size, i.e. how many GPUs vLLM uses; set it to 1 here:
python -m vllm.entrypoints.openai.api_server --served-model-name ui-tars \
--model /root/autodl-tmp/UI-TARS-7B-DPO --limit-mm-per-prompt image=5 --dtype=half -tp 1
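Once the server is running, you can sanity-check it by listing the served models (vLLM's OpenAI-compatible server exposes /v1/models):
curl http://localhost:8000/v1/models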
Map the port with auto-dl's custom service feature so the API can be reached from your local machine (host and port are instance-specific; substitute your own):
ssh -CNg -L 8000:127.0.0.1:8000 root@region-9.autodl.pro -p 46525
Example call from the local machine:
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "ui-tars",
    "messages": [
      {"role": "user", "content": "我想问你,5的阶乘是多少?<think>\n"}
    ]
  }'
{"id":"chat-7c8149f008a24adfa451a989ba6256d5","object":"chat.completion","created":1741314705,"model":"ui-tars","choices":[{"index":0,"message":{"role":"assistant",
"content":"5的阶乘是120。阶乘运算的数学符号是“!”。在计算机编程语言中,它通常用“ fact”来表示。阶乘的定义为:n! = n * (n - 1) * (n - 2) * ... * 2 * 1,其中n是一个正整数。","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":22,"total_tokens":97,"completion_tokens":75},"prompt_logprobs":null}%
test.py
from model import OpenAIModel, print_with_color

configs = {
    "DEEPSEEK_API_BASE": "http://localhost:8000/v1/chat/completions",
    "DEEPSEEK_API_MODEL": "ui-tars",
    "MAX_TOKENS": 1024,
    "TEMPERATURE": 0,
    "OPENAI_API_KEY": ''
}

def ask(question: str):
    print_with_color("####################deepseek####################", "magenta")
    print_with_color(f"question: {question}", 'yellow')
    mllm = OpenAIModel(base_url=configs["DEEPSEEK_API_BASE"],
                       api_key=configs["OPENAI_API_KEY"],
                       model=configs["DEEPSEEK_API_MODEL"],
                       temperature=configs["TEMPERATURE"],
                       max_tokens=configs["MAX_TOKENS"],
                       disable_proxies=True)
    prompt = question
    images = ['image1.jpg']  # local image sent along with the prompt
    status, rsp = mllm.get_model_response(prompt, images=images)
    if not status:
        print_with_color(f"request failed: {rsp}", 'red')
        return
    print_with_color(f"*********************** rsp:\n{rsp}", "yellow")

ask("Describe the content of the image")
model.py
from abc import abstractmethod
from typing import List, Tuple
import base64
import requests
from colorama import Fore, Style

def encode_image(image_path):
    # Read an image file and return it as a base64-encoded string.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def print_with_color(text: str, color=""):
    if color == "red":
        print(Fore.RED + text)
    elif color == "green":
        print(Fore.GREEN + text)
    elif color == "yellow":
        print(Fore.YELLOW + text)
    elif color == "blue":
        print(Fore.BLUE + text)
    elif color == "magenta":
        print(Fore.MAGENTA + text)
    elif color == "cyan":
        print(Fore.CYAN + text)
    elif color == "white":
        print(Fore.WHITE + text)
    elif color == "black":
        print(Fore.BLACK + text)
    else:
        print(text)
    print(Style.RESET_ALL)

class BaseModel:
    def __init__(self):
        pass

    @abstractmethod
    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
        pass

class OpenAIModel(BaseModel):
    def __init__(self, base_url: str, api_key: str, model: str, temperature: float,
                 max_tokens: int, disable_proxies=False):
        super().__init__()
        self.base_url = base_url
        self.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.disable_proxies = disable_proxies

    def get_model_response(self, prompt: str, images: List[str] = None, tools: List[dict] = None,
                           history: List[dict] = None, role: str = "user") -> Tuple[bool, str]:
        # Build a multimodal message: the text prompt followed by any images,
        # inlined as base64 data URLs.
        content = [
            {
                "type": "text",
                "text": prompt
            }
        ]
        for img in images or []:
            base64_img = encode_image(img)
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_img}"
                }
            })
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": role,
                    "content": content
                }
            ],
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }
        if tools:
            payload["tools"] = tools
        if history:
            # Append the new message to the prior conversation and send the whole thread.
            history.append(payload['messages'][-1])
            payload['messages'] = history
        if self.disable_proxies:
            response = requests.post(self.base_url, headers=headers, json=payload, proxies={}).json()
        else:
            response = requests.post(self.base_url, headers=headers, json=payload).json()
        if "error" not in str(response):
            if 'usage' not in response:
                print_with_color(f"no usage field in response: {response}", 'red')
            else:
                usage = response["usage"]
                prompt_tokens = usage["prompt_tokens"]
                total_tokens = usage["total_tokens"]
                completion_tokens = usage["completion_tokens"]
                print_with_color(f"total_tokens: {total_tokens}, prompt_tokens: {prompt_tokens}, "
                                 f"completion_tokens: {completion_tokens}")
                if self.model == "gpt-4o":
                    print_with_color(f"Request gpt-4o cost is "
                                     f"${'{0:.2f}'.format(prompt_tokens / 1000 * 0.005 + completion_tokens / 1000 * 0.015)}",
                                     "yellow")
                else:
                    print_with_color(f"Request cost is "
                                     f"${'{0:.2f}'.format(prompt_tokens / 1000 * 0.01 + completion_tokens / 1000 * 0.03)}",
                                     "yellow")
        else:
            print_with_color(f"request failed, response: {response}", "red")
            return False, response
        if tools:
            return True, response["choices"][0]["message"]["tool_calls"]
        else:
            return True, response["choices"][0]["message"]["content"]
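For multi-turn conversations, get_model_response accepts a history list of prior messages; the new message is appended to it before the request is sent. A minimal sketch, assuming an OpenAIModel instance mllm constructed as in test.py:
history = [
    # Prior turns, in the same OpenAI chat format that model.py builds internally.
    {"role": "user", "content": [{"type": "text", "text": "What is 5 factorial?"}]},
    {"role": "assistant", "content": "5! = 120."},
]
status, rsp = mllm.get_model_response("And what is 6 factorial?", history=history)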
Using Omni-Parser
Method 1: deploy on the server
Enter the omni directory (/root/autodl-tmp/omni if you copied it there during setup).
Activate the conda environment:
conda activate llama
Start the service (a GPU is strongly recommended):
python server.py
On your local machine, call the service through the parser method in client.py.
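A minimal sketch of the local call; the actual signature of parser is defined in client.py, so the argument shown here (a screenshot path) is an assumption:
# Hypothetical usage; check client.py for the real parser signature.
from client import parser

result = parser("screenshot.png")  # assumed: takes an image path, returns parsed UI elements
print(result)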
Method 2: run locally
If your local machine has a reasonably good GPU, you can call the parser method in omni_parser.py directly.
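Again a hypothetical sketch, assuming parser takes an image path; check omni_parser.py for the real signature:
# Hypothetical direct call on a machine with a capable GPU.
from omni_parser import parser

elements = parser("screenshot.png")
print(elements)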