系统环境
cat /etc/os-release
NAME="EulerOS"
VERSION="2.0 (SP10)"
ID="euleros"
VERSION_ID="2.0"
PRETTY_NAME="EulerOS 2.0 (SP10)"
ANSI_COLOR="0;31"
uname -m
aarch64
npu-smi info
# 8卡 ...
下载模型
- 安装git git-lfs
yum install git
wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.0/git-lfs-linux-arm64-v3.7.0.tar.gz
tar -xzvf git-lfs-linux-arm64-v3.7.0.tar.gz
cd git-lfs-3.7.0/
./install.sh
- 下载模型
GIT_LFS_SKIP_SMUDGE=1 git clone https://gitcode.com/hf_mirrors/Qwen/Qwen3-0.6B.git
cd Qwen3-0.6B
git lfs install
nohup git lfs pull > /dev/null 2>&1 &
下载镜像
- 修改源
vim /etc/docker/daemon.json
{
  ...
  "registry-mirrors": [
    "https://docker.xuanyuan.me",
    "https://docker.1ms.run",
    "https://mirror.ccs.tencentyun.com",
    "https://docker-0.unsee.tech",
    "https://docker.m.daocloud.io"
  ],
  ...
  # 把 Docker 数据放到大容量数据盘(此行仅作说明,JSON 不支持注释,请勿写入文件)
  "max-concurrent-downloads": 1,
  "data-root": "/data2/develop/docker/default-work"
}
- 重新加载 systemd 配置
systemctl daemon-reload
- 重启 Docker
systemctl restart docker
- 查看源
docker info | grep -i Mirror
 Registry Mirrors:
  https://xxx.mirror.aliyuncs.com/
  https://mirror.ccs.tencentyun.com/
- 下载vllm镜像
docker pull quay.io/ascend/vllm-ascend:v0.11.0rc0
vllm 推理部署 Qwen3-0.6B
- docker-compose.yaml
version: '3.8'
services:
  vllm-ascend:
    image: quay.io/ascend/vllm-ascend:v0.11.0rc0
    container_name: vllm-Qwen3-0.6B
    devices:
      # 配置第8张卡单独运行
      - /dev/davinci7
      - /dev/davinci_manager
      - /dev/devmm_svm
      - /dev/hisi_hdc
    volumes:
      - /usr/local/dcmi:/usr/local/dcmi
      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
      - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
      - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
      - /etc/ascend_install.info:/etc/ascend_install.info
      - /data2/models/Qwen3-0.6B:/data/model
    ports:
      - "8100:8000"
    restart: unless-stopped
    stdin_open: true
    tty: true
    command: >
      vllm serve /data/model
      --served-model-name Qwen3-0.6B
      --tensor-parallel-size 1
      --dtype float16
      --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
      --max-num-seqs 4
      --max-model-len 2048
      --gpu-memory-utilization 0.8
      --trust_remote_code
- chat/completions
curl --location 'http://localhost:8100/v1/chat/completions' \
--header 'Content-Type: application/json' \
--data '{"model": "Qwen3-0.6B","messages": [{"role": "user","content": "你好,你是谁,简单自我介绍一下"}],"top_p": 0.95,"stream": true,"stream_options": {"include_usage": true,"continuous_usage_stats": true}
}'
