How to install and use the EchoMimicV2 digital human
Steps to install EchoMimic V2 (just follow the installation instructions on GitHub exactly):
https://github.com/antgroup/echomimic_v2
Add the following Python code for background switching:
import argparse
import os
import random
from datetime import datetime
from pathlib import Path
from typing import List
import time
import math
import cv2
import numpy as np
import torch
import torchvision
import torch.nn.functional as F
from diffusers import AutoencoderKL, DDIMScheduler
from einops import repeat
from omegaconf import OmegaConf
from PIL import Image

from src.models.unet_2d_condition import UNet2DConditionModel
from src.models.unet_3d_emo import EMOUNet3DConditionModel
from src.models.whisper.audio2feature import load_audio_model
from src.pipelines.pipeline_echomimicv2_acc import EchoMimicV2Pipeline
from src.utils.util import get_fps, read_frames, save_videos_grid
from src.utils.dwpose_util import draw_pose_select_v2
import sys
from src.models.pose_encoder import PoseEncoder
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeVideoClip, ImageClip, concatenate_videoclips

ffmpeg_path = os.getenv('FFMPEG_PATH')
if ffmpeg_path is None:
    print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
elif ffmpeg_path not in os.getenv('PATH'):print("add ffmpeg to path")os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"def parse_args():parser = argparse.ArgumentParser()parser.add_argument("--config", type=str, default="./configs/prompts/infer_acc.yaml")parser.add_argument("-W", type=int, default=768)parser.add_argument("-H", type=int, default=768)parser.add_argument("-L", type=int, default=240)parser.add_argument("--seed", type=int, default=420)parser.add_argument("--context_frames", type=int, default=12)parser.add_argument("--context_overlap", type=int, default=3)parser.add_argument("--motion_sync", type=int, default=1)parser.add_argument("--cfg", type=float, default=1.0)parser.add_argument("--steps", type=int, default=6)parser.add_argument("--sample_rate", type=int, default=16000)parser.add_argument("--fps", type=int, default=24)parser.add_argument("--device", type=str, default="cuda")parser.add_argument("--ref_images_dir", type=str, default=f'./assets/halfbody_demo/refimag')parser.add_argument("--audio_dir", type=str, default='./assets/halfbody_demo/audio')parser.add_argument("--pose_dir", type=str, default="./assets/halfbody_demo/pose")parser.add_argument("--refimg_name", type=str, default='natural_bk_openhand/0035.png')parser.add_argument("--audio_name", type=str, default='chinese/echomimicv2_woman.wav')parser.add_argument("--pose_name", type=str, default="01")# 分段处理相关参数parser.add_argument("--segment_duration", type=int, default=10, help="每个视频片段的时长(秒)")parser.add_argument("--concatenate_segments", action="store_true", help="是否将所有片段合并成一个完整视频")parser.add_argument("--keep_segments", action="store_true", help="保留分段文件(当启用合并时)")# 背景合成相关参数parser.add_argument("--background_image", type=str, default=None, help="背景图片路径")parser.add_argument("--avatar_position", type=str, default="right", choices=["left", "right", "center"], help="数字人水平位置")parser.add_argument("--avatar_vertical_position", type=str, default="middle", choices=["top", "middle", "bottom"], help="数字人垂直位置")parser.add_argument("--avatar_horizontal_offset", type=int, default=0, help="数字人水平偏移量(像素,正值向右,负值向左)")parser.add_argument("--avatar_vertical_offset", type=int, default=0, help="数字人垂直偏移量(像素,正值向下,负值向上)")parser.add_argument("--avatar_scale", type=float, default=1.0, help="数字人缩放微调比例(基于2/3高度)")parser.add_argument("--final_width", type=int, default=1920, help="最终视频宽度")parser.add_argument("--final_height", type=int, default=1080, help="最终视频高度")parser.add_argument("--enable_chromakey", action="store_true", default=True, help="启用绿幕抠图(默认开启)")parser.add_argument("--chromakey_threshold", type=float, default=0.15, help="绿幕抠图阈值")parser.add_argument("--chromakey_method", type=str, default="simple", choices=["simple", "enhanced"], help="抠图方法:simple简单/enhanced增强")parser.add_argument("--avatar_margin_ratio", type=float, default=0.25, help="数字人占用屏幕宽度的比例(右侧预留区域)")parser.add_argument("--debug_chromakey", action="store_true", help="调试模式:保存抠图遮罩用于检查")args = parser.parse_args()return argsdef create_composite_video(avatar_video_path, background_image_path, output_path,avatar_position="right", avatar_vertical_position="middle",avatar_horizontal_offset=0, avatar_vertical_offset=0,avatar_scale=1.0, final_width=1920, final_height=1080,enable_chromakey=True, chromakey_threshold=0.15,chromakey_method="simple", avatar_margin_ratio=0.25,debug_chromakey=False):"""将数字人视频与背景图片合成"""print(f"开始合成视频...")print(f"数字人视频: {avatar_video_path}")print(f"背景图片: {background_image_path}")print(f"绿幕抠图: {'启用' if enable_chromakey else '禁用'}")print(f"抠图方法: 
{chromakey_method}")print(f"数字人位置: {avatar_position}-{avatar_vertical_position}")if avatar_horizontal_offset != 0 or avatar_vertical_offset != 0:print(f"位置偏移: 水平{avatar_horizontal_offset}px, 垂直{avatar_vertical_offset}px")try:# 加载数字人视频avatar_clip = VideoFileClip(avatar_video_path)print(f"数字人视频时长: {avatar_clip.duration}秒")print(f"原始数字人尺寸: {avatar_clip.w}x{avatar_clip.h}")# 处理背景图片 - 使用ImageClip直接创建视频剪辑background_clip = ImageClip(background_image_path, duration=avatar_clip.duration)background_clip = background_clip.resize((final_width, final_height))# 计算数字人的预留区域宽度avatar_area_width = int(final_width * avatar_margin_ratio)# 计算数字人缩放比例,使其高度为最终视频高度的2/3target_avatar_height = int(final_height * 2 / 3 * avatar_scale)# 计算宽度,确保不超过预留区域scale_factor_height = target_avatar_height / avatar_clip.hpotential_width = int(avatar_clip.w * scale_factor_height)# 如果宽度超过预留区域,按宽度缩放if potential_width > avatar_area_width * 0.9: # 留10%边距scale_factor = (avatar_area_width * 0.9) / avatar_clip.wavatar_width = int(avatar_clip.w * scale_factor)avatar_height = int(avatar_clip.h * scale_factor)else:scale_factor = scale_factor_heightavatar_width = potential_widthavatar_height = target_avatar_height# 缩放数字人视频avatar_clip_resized = avatar_clip.resize((avatar_width, avatar_height))print(f"预留区域宽度: {avatar_area_width}px (屏幕宽度的{avatar_margin_ratio:.1%})")print(f"数字人实际尺寸: {avatar_width}x{avatar_height}")print(f"缩放比例: {scale_factor:.2f}")# 默认启用绿幕抠图去除背景if enable_chromakey:print("正在去除数字人背景...")print(f"抠图阈值: {chromakey_threshold}")print(f"抠图方法: {chromakey_method}")try:# 根据选择的方法进行抠图if chromakey_method == "enhanced":avatar_clip_resized = create_enhanced_chromakey_clip(avatar_clip_resized, chromakey_threshold)elif chromakey_method == "simple":avatar_clip_resized = create_simple_chromakey_clip(avatar_clip_resized, chromakey_threshold)else:# 使用原始方法作为备选avatar_clip_resized = apply_chromakey(avatar_clip_resized, chromakey_threshold)print("背景去除完成,数字人现在具有透明背景")except Exception as e:print(f"绿幕抠图失败: {e}")print("尝试使用简单抠图方法...")try:avatar_clip_resized = create_simple_chromakey_clip(avatar_clip_resized, chromakey_threshold)print("简单抠图方法成功")except Exception as e2:print(f"简单抠图也失败: {e2}")print("跳过绿幕抠图,使用原始视频")# 调试模式:保存抠图后的第一帧用于检查if debug_chromakey:try:debug_frame = avatar_clip_resized.get_frame(1)debug_path = output_path.replace('.mp4', '_debug_chromakey.png')Image.fromarray(debug_frame.astype('uint8')).save(debug_path)print(f"调试图片已保存: {debug_path}")except Exception as e:print(f"保存调试图片失败: {e}")# 计算数字人水平位置margin = 30 # 最小边距if avatar_position == "left":x_position = marginelif avatar_position == "right":# 右侧定位:在右侧预留区域内居中right_area_start = final_width - avatar_area_widthx_position = right_area_start + (avatar_area_width - avatar_width) // 2else: # centerx_position = (final_width - avatar_width) // 2# 计算数字人垂直位置if avatar_vertical_position == "top":y_position = marginelif avatar_vertical_position == "bottom":y_position = final_height - avatar_height - marginelse: # middley_position = (final_height - avatar_height) // 2# 应用位置偏移x_position += avatar_horizontal_offsety_position += avatar_vertical_offset# 确保位置不会超出边界x_position = max(0, min(x_position, final_width - avatar_width))y_position = max(0, min(y_position, final_height - avatar_height))print(f"数字人最终位置: ({x_position}, {y_position})")print(f"数字人占用区域: x: {x_position}-{x_position + avatar_width}, y: {y_position}-{y_position + avatar_height}")if avatar_position == "right":print(f"背景讲义安全区域: x: 0-{final_width - avatar_area_width}")# 设置数字人位置avatar_clip_positioned = avatar_clip_resized.set_position((x_position, 
y_position))# 合成视频print("正在合成视频...")final_clip = CompositeVideoClip([background_clip, avatar_clip_positioned],size=(final_width, final_height))# 保持原音频final_clip = final_clip.set_audio(avatar_clip.audio)# 输出视频print("正在写入视频文件...")final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac",threads=4, preset="medium",ffmpeg_params=["-crf", "23"]) # 提高视频质量# 关闭剪辑释放内存avatar_clip.close()background_clip.close()final_clip.close()print(f"视频合成完成: {output_path}")print(f"最终尺寸: {final_width}x{final_height}")print(f"数字人位置: {avatar_position}-{avatar_vertical_position}")return output_pathexcept Exception as e:print(f"视频合成过程中出错: {str(e)}")import tracebacktraceback.print_exc()return Nonedef apply_chromakey(video_clip, threshold=0.15):"""应用绿幕抠图效果,创建真正的透明背景"""def make_mask(get_frame, t):"""创建遮罩,白色保留,黑色透明"""frame = get_frame(t)# 转换为HSV色彩空间hsv = cv2.cvtColor(frame, cv2.COLOR_RGB2HSV)# 定义多个绿色范围以适应不同的绿幕条件# 主要绿色范围lower_green1 = np.array([40, 50, 50])upper_green1 = np.array([80, 255, 255])# 较暗的绿色lower_green2 = np.array([35, 30, 30])upper_green2 = np.array([85, 255, 200])# 较亮的绿色lower_green3 = np.array([45, 60, 60])upper_green3 = np.array([75, 255, 255])# 创建多个绿色遮罩mask1 = cv2.inRange(hsv, lower_green1, upper_green1)mask2 = cv2.inRange(hsv, lower_green2, upper_green2)mask3 = cv2.inRange(hsv, lower_green3, upper_green3)# 合并所有绿色遮罩green_mask = cv2.bitwise_or(mask1, mask2)green_mask = cv2.bitwise_or(green_mask, mask3)# 形态学操作去除小噪点kernel = np.ones((3,3), np.uint8)green_mask = cv2.morphologyEx(green_mask, cv2.MORPH_OPEN, kernel)green_mask = cv2.morphologyEx(green_mask, cv2.MORPH_CLOSE, kernel)# 边缘腐蚀,去除绿色边缘残留kernel = np.ones((2,2), np.uint8)green_mask = cv2.erode(green_mask, kernel, iterations=1)# 边缘平滑(轻微高斯模糊)green_mask = cv2.GaussianBlur(green_mask, (3, 3), 0)# 反转遮罩:绿色区域为黑色(0),非绿色区域为白色(255)final_mask = cv2.bitwise_not(green_mask)# 转换为0-1范围的浮点数final_mask = final_mask.astype(np.float64) / 255.0return final_mask# 创建带遮罩的视频剪辑 - 使用正确的MoviePy APIfrom moviepy.video.VideoClip import VideoClip# 创建mask clipmask_clip = VideoClip(make_frame=make_mask, duration=video_clip.duration)mask_clip = mask_clip.set_ismask(True) # 重要:标记为maskreturn video_clip.set_mask(mask_clip)def create_enhanced_chromakey_clip(video_clip, threshold=0.15):"""增强版绿幕抠图,提供更好的边缘处理"""def make_enhanced_mask(t):"""创建增强版遮罩"""frame = video_clip.get_frame(t)# 预处理:轻微降噪frame = cv2.bilateralFilter(frame, 5, 50, 50)# 转换色彩空间hsv = cv2.cvtColor(frame, cv2.COLOR_RGB2HSV)lab = cv2.cvtColor(frame, cv2.COLOR_RGB2LAB)# HSV绿色检测lower_green_hsv = np.array([35, 40, 40])upper_green_hsv = np.array([85, 255, 255])mask_hsv = cv2.inRange(hsv, lower_green_hsv, upper_green_hsv)# LAB色彩空间的A通道绿色检测(A通道:绿色为负值)a_channel = lab[:,:,1]mask_lab = (a_channel < 127 - threshold * 127).astype(np.uint8) * 255# 结合两种遮罩combined_mask = cv2.bitwise_or(mask_hsv, mask_lab)# 形态学操作kernel = np.ones((4,4), np.uint8)combined_mask = cv2.morphologyEx(combined_mask, cv2.MORPH_CLOSE, kernel)combined_mask = cv2.morphologyEx(combined_mask, cv2.MORPH_OPEN, kernel)# 边缘腐蚀去除绿边kernel = np.ones((2,2), np.uint8)combined_mask = cv2.erode(combined_mask, kernel, iterations=1)# 边缘羽化combined_mask = cv2.GaussianBlur(combined_mask, (5, 5), 1.5)# 反转遮罩并归一化inverted = 255 - combined_maskreturn inverted.astype(np.float64) / 255.0# 创建mask clip - 使用正确的MoviePy APIfrom moviepy.video.VideoClip import VideoClipmask_clip = VideoClip(make_frame=make_enhanced_mask, duration=video_clip.duration)mask_clip = mask_clip.set_ismask(True) # 重要:标记为maskreturn video_clip.set_mask(mask_clip)def create_simple_chromakey_clip(video_clip, 
threshold=0.15):"""简单可靠的绿幕抠图方法"""def make_simple_mask(t):frame = video_clip.get_frame(t)# 转换为HSVhsv = cv2.cvtColor(frame, cv2.COLOR_RGB2HSV)# 绿色范围lower_green = np.array([40, 40, 40])upper_green = np.array([80, 255, 255])# 创建遮罩mask = cv2.inRange(hsv, lower_green, upper_green)# 简单的形态学操作kernel = np.ones((3,3), np.uint8)mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)# 边缘平滑mask = cv2.GaussianBlur(mask, (3, 3), 0)# 反转并归一化mask = 255 - maskmask = mask.astype(np.float64) / 255.0return maskfrom moviepy.video.VideoClip import VideoClipmask_clip = VideoClip(make_frame=make_simple_mask, duration=video_clip.duration)mask_clip = mask_clip.set_ismask(True)return video_clip.set_mask(mask_clip)def detect_background_content_area(background_image_path, avatar_area_ratio=0.25):"""检测背景图片的内容区域,确保数字人不遮挡重要内容"""try:# 读取背景图片img = cv2.imread(background_image_path)if img is None:return None# 转换为灰度图gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)# 检测边缘edges = cv2.Canny(gray, 50, 150)# 计算每个区域的边缘密度height, width = gray.shapeavatar_width = int(width * avatar_area_ratio)# 右侧区域的边缘密度right_area = edges[:, -avatar_width:]right_density = np.sum(right_area) / right_area.size# 左侧区域的边缘密度left_area = edges[:, :avatar_width]left_density = np.sum(left_area) / left_area.size# 返回内容密度较低的一侧作为推荐位置if right_density < left_density:return "right"else:return "left"except Exception as e:print(f"背景内容检测失败: {e}")return "right" # 默认返回右侧def split_audio_and_poses(audio_path, pose_dir, segment_duration, fps):"""将音频和pose数据按时间段分割Args:audio_path: 音频文件路径pose_dir: pose文件夹路径segment_duration: 每段时长(秒)fps: 帧率Returns:segments: [(start_time, end_time, start_frame, end_frame), ...]"""# 获取音频总时长audio_clip = AudioFileClip(audio_path)total_duration = audio_clip.durationaudio_clip.close()# 获取pose文件数量pose_files = len(os.listdir(pose_dir))pose_duration = pose_files / fps # pose数据对应的时长# 计算分段信息segments = []current_time = 0while current_time < total_duration:# 计算当前段的结束时间end_time = min(current_time + segment_duration, total_duration)# 计算对应的帧索引(不受pose文件数量限制)start_frame = int(current_time * fps)end_frame = int(end_time * fps)# 计算实际需要的帧数frame_count = end_frame - start_framesegments.append((current_time, end_time, start_frame, end_frame, frame_count))current_time = end_timeprint(f"音频总时长: {total_duration:.2f}秒")print(f"Pose文件数量: {pose_files} (对应 {pose_duration:.2f}秒)")print(f"将分割为 {len(segments)} 个片段,每段约 {segment_duration} 秒")print(f"Pose文件将循环使用以覆盖完整音频")return segmentsdef process_single_segment(pipe, args, config, weight_dtype, device,ref_img_pil, audio_path, pose_dir, segment_info,save_name, segment_idx):"""处理单个音频片段,生成对应的数字人视频Args:segment_info: (start_time, end_time, start_frame, end_frame, frame_count)segment_idx: 片段索引Returns:生成的视频文件路径"""start_time, end_time, start_frame, end_frame, frame_count = segment_infosegment_duration = end_time - start_timeprint(f"\n=== 处理片段 {segment_idx + 1} ===")print(f"时间范围: {start_time:.2f}s - {end_time:.2f}s")print(f"帧范围: {start_frame} - {end_frame}")print(f"片段时长: {segment_duration:.2f}s")print(f"片段帧数: {frame_count}")# 获取pose文件数量,用于循环pose_files = len(os.listdir(pose_dir))print(f"可用pose文件: {pose_files} 个,将循环使用")# 创建音频片段full_audio_clip = AudioFileClip(audio_path)segment_audio_clip = full_audio_clip.subclip(start_time, end_time)# 临时保存音频片段segment_audio_path = f"{save_name}_segment_{segment_idx:03d}_audio.wav"segment_audio_clip.write_audiofile(segment_audio_path, verbose=False, logger=None)# 准备pose数据 - 循环使用pose文件pose_list = []for i in range(frame_count):# 计算当前帧在原始序列中的索引original_frame_idx = 
start_frame + i# 使用模运算循环使用pose文件pose_idx = original_frame_idx % pose_filestgt_musk = np.zeros((args.W, args.H, 3)).astype('uint8')tgt_musk_path = os.path.join(pose_dir, f"{pose_idx}.npy")if os.path.exists(tgt_musk_path):detected_pose = np.load(tgt_musk_path, allow_pickle=True).tolist()imh_new, imw_new, rb, re, cb, ce = detected_pose['draw_pose_params']im = draw_pose_select_v2(detected_pose, imh_new, imw_new, ref_w=800)im = np.transpose(np.array(im), (1, 2, 0))tgt_musk[rb:re, cb:ce, :] = imelse:print(f"Warning: Pose file {tgt_musk_path} not found, using empty pose")tgt_musk_pil = Image.fromarray(np.array(tgt_musk)).convert('RGB')pose_list.append(torch.Tensor(np.array(tgt_musk_pil)).to(dtype=weight_dtype, device=device).permute(2, 0, 1) / 255.0)# 显示循环使用信息(仅前几帧和循环点)if i < 5 or pose_idx == 0:print(f" 帧 {i+1}/{frame_count}: 使用pose文件 {pose_idx}.npy")poses_tensor = torch.stack(pose_list, dim=1).unsqueeze(0)# 设置随机种子(为了一致性,基于片段索引)if args.seed is not None and args.seed > -1:segment_seed = args.seed + segment_idxelse:segment_seed = random.randint(100, 1000000)generator = torch.manual_seed(segment_seed)print(f"开始生成片段 {segment_idx + 1} 的数字人视频...")print(f"使用种子: {segment_seed}")# 生成数字人视频video = pipe(ref_img_pil,segment_audio_path,poses_tensor,args.W,args.H,frame_count,args.steps,args.cfg,generator=generator,audio_sample_rate=args.sample_rate,context_frames=args.context_frames,fps=args.fps,context_overlap=args.context_overlap,start_idx=0).videos# 保存数字人视频segment_avatar_path = f"{save_name}_segment_{segment_idx:03d}_avatar.mp4"save_videos_grid(video,segment_avatar_path,n_rows=1,fps=args.fps,)# 添加音频video_clip = VideoFileClip(segment_avatar_path)video_with_audio = video_clip.set_audio(segment_audio_clip)segment_final_path = f"{save_name}_segment_{segment_idx:03d}.mp4"video_with_audio.write_videofile(segment_final_path, codec="libx264", audio_codec="aac",threads=2, verbose=False, logger=None)# 清理临时文件video_clip.close()video_with_audio.close()segment_audio_clip.close()full_audio_clip.close()if os.path.exists(segment_audio_path):os.remove(segment_audio_path)if os.path.exists(segment_avatar_path):os.remove(segment_avatar_path)print(f"片段 {segment_idx + 1} 生成完成: {segment_final_path}")return segment_final_pathdef main():args = parse_args()config = OmegaConf.load(args.config)if config.weight_dtype == "fp16":weight_dtype = torch.float16else:weight_dtype = torch.float32device = args.deviceif device.__contains__("cuda") and not torch.cuda.is_available():device = "cpu"inference_config_path = config.inference_configinfer_config = OmegaConf.load(inference_config_path)############# model_init started #############print("正在初始化模型...")## vae initvae = AutoencoderKL.from_pretrained(config.pretrained_vae_path,).to("cuda", dtype=weight_dtype)## reference net initreference_unet = UNet2DConditionModel.from_pretrained(config.pretrained_base_model_path,subfolder="unet",).to(dtype=weight_dtype, device=device)reference_unet.load_state_dict(torch.load(config.reference_unet_path, map_location="cpu"),)## denoising net initif os.path.exists(config.motion_module_path):### stage1 + stage2denoising_unet = EMOUNet3DConditionModel.from_pretrained_2d(config.pretrained_base_model_path,config.motion_module_path,subfolder="unet",unet_additional_kwargs=infer_config.unet_additional_kwargs,).to(dtype=weight_dtype, device=device)else:### only stage1denoising_unet = EMOUNet3DConditionModel.from_pretrained_2d(config.pretrained_base_model_path,"",subfolder="unet",unet_additional_kwargs={"use_motion_module": False,"unet_use_temporal_attention": 
False,"cross_attention_dim": infer_config.unet_additional_kwargs.cross_attention_dim}).to(dtype=weight_dtype, device=device)denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"),strict=False)## face locator initpose_net = PoseEncoder(320, conditioning_channels=3, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda")pose_net.load_state_dict(torch.load(config.pose_encoder_path))### load audio processor paramsaudio_processor = load_audio_model(model_path=config.audio_model_path, device=device)############# model_init finished #############width, height = args.W, args.Hsched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)scheduler = DDIMScheduler(**sched_kwargs)pipe = EchoMimicV2Pipeline(vae=vae,reference_unet=reference_unet,denoising_unet=denoising_unet,audio_guider=audio_processor,pose_encoder=pose_net,scheduler=scheduler,)pipe = pipe.to("cuda", dtype=weight_dtype)print("模型初始化完成")date_str = datetime.now().strftime("%Y%m%d")time_str = datetime.now().strftime("%H%M")save_dir_name = f"{time_str}--step_{args.steps}-{args.W}x{args.H}--cfg_{args.cfg}--segments"save_dir = Path(f"output/{date_str}/{save_dir_name}")save_dir.mkdir(exist_ok=True, parents=True)for ref_image_path in config["test_cases"].keys():for file_path in config["test_cases"][ref_image_path]:if ".wav" in file_path:audio_path = file_pathelse:pose_dir = file_pathref_name = Path(ref_image_path).stemaudio_name = Path(audio_path).stemfinal_fps = args.fpsinputs_dict = {"refimg": ref_image_path,"audio": audio_path,"pose": pose_dir,}print('Pose:', inputs_dict['pose'])print('Reference:', inputs_dict['refimg'])print('Audio:', inputs_dict['audio'])if args.background_image:print('Background:', args.background_image)save_path = Path(f"{save_dir}/{ref_name}")save_path.mkdir(exist_ok=True, parents=True)save_name = f"{save_path}/{ref_name}-a-{audio_name}"ref_img_pil = Image.open(ref_image_path).convert("RGB")# 分割音频和pose数据segments = split_audio_and_poses(audio_path, pose_dir, args.segment_duration, args.fps)if not segments:print("没有有效的片段可以处理")continue# 处理每个片段segment_paths = []for segment_idx, segment_info in enumerate(segments):try:segment_path = process_single_segment(pipe, args, config, weight_dtype, device,ref_img_pil, audio_path, pose_dir, segment_info,save_name, segment_idx)segment_paths.append(segment_path)# 清理GPU内存torch.cuda.empty_cache()except Exception as e:print(f"处理片段 {segment_idx + 1} 时出错: {e}")import tracebacktraceback.print_exc()continueprint(f"\n所有片段处理完成,共生成 {len(segment_paths)} 个视频文件")# 如果有背景图片,对每个片段进行合成final_segment_paths = []if args.background_image and os.path.exists(args.background_image):print("\n开始对每个片段进行背景合成...")# 智能检测最佳数字人位置(可选)recommended_position = detect_background_content_area(args.background_image, args.avatar_margin_ratio)if recommended_position != args.avatar_position:print(f"建议将数字人放在{recommended_position}侧以避免遮挡讲义内容")for i, segment_path in enumerate(segment_paths):if os.path.exists(segment_path):composite_path = segment_path.replace(".mp4", "_composite.mp4")result_path = 
create_composite_video(segment_path,args.background_image,composite_path,avatar_position=args.avatar_position,avatar_vertical_position=args.avatar_vertical_position,avatar_horizontal_offset=args.avatar_horizontal_offset,avatar_vertical_offset=args.avatar_vertical_offset,avatar_scale=args.avatar_scale,final_width=args.final_width,final_height=args.final_height,enable_chromakey=args.enable_chromakey,chromakey_threshold=args.chromakey_threshold,chromakey_method=args.chromakey_method,avatar_margin_ratio=args.avatar_margin_ratio,debug_chromakey=args.debug_chromakey)if result_path:final_segment_paths.append(result_path)print(f"片段 {i+1} 合成完成: {composite_path}")# 可选:删除原始片段文件if not args.keep_segments:os.remove(segment_path)else:print(f"片段 {i+1} 合成失败,保留原始文件")final_segment_paths.append(segment_path)else:final_segment_paths = segment_paths# 如果启用了合并功能,将所有片段合并成一个完整视频if args.concatenate_segments and len(final_segment_paths) > 1:print(f"\n开始合并 {len(final_segment_paths)} 个片段...")try:clips = []for path in final_segment_paths:if os.path.exists(path):clip = VideoFileClip(path)clips.append(clip)if clips:final_clip = concatenate_videoclips(clips)merged_path = f"{save_name}_complete.mp4"final_clip.write_videofile(merged_path, codec="libx264", audio_codec="aac",threads=4, preset="medium")print(f"合并完成: {merged_path}")# 关闭所有剪辑for clip in clips:clip.close()final_clip.close()# 如果不需要保留片段文件,删除它们if not args.keep_segments:for path in final_segment_paths:if os.path.exists(path):os.remove(path)print("片段文件已清理")else:print("没有有效的视频片段可以合并")except Exception as e:print(f"合并视频时出错: {e}")import tracebacktraceback.print_exc()print(f"\n处理完成!")if args.concatenate_segments:print(f"最终合并视频: {save_name}_complete.mp4")else:print(f"生成的片段文件: {len(final_segment_paths)} 个")for i, path in enumerate(final_segment_paths):print(f" 片段 {i+1}: {path}")if __name__ == "__main__":main()
Run the EchoMimic V2 inference script:
python infer_acc_new3_segments.py --config='./configs/prompts/infer_acc.yaml' --background_image './open.png' --avatar_position right --avatar_vertical_position top --avatar_horizontal_offset -10 --avatar_vertical_offset 50 --avatar_scale 0.85 --chromakey_method simple --final_width 1920 --final_height 1080 --segment_duration 10
Code walkthrough:
What this code does
It is an inference script for EchoMimic V2:
Given a reference portrait (ref image), a speech clip (wav, 16 kHz), and a sequence of body poses (*.npy files in a pose directory), it splits the work along the audio timeline into segments and runs audio-driven, pose-conditioned portrait generation for each segment, producing a series of short video clips (each carrying its slice of the original audio). Optionally, each clip is chroma-keyed and overlaid onto a background image, and all clips are concatenated into one complete video.
⸻
Inputs and dependencies (what you need to prepare)
- Runtime environment (GPU recommended)
• Python 3.10+ (3.10 or 3.11 both work)
• NVIDIA GPU + CUDA (strongly recommended; the script calls .to("cuda") in several places, so CPU-only runs will error out or be extremely slow)
• FFmpeg (moviepy needs it to write video); the script expects the FFMPEG_PATH environment variable to be set
• Main Python dependencies:
• torch, torchvision
• diffusers
• einops
• omegaconf
• numpy, opencv-python (cv2)
• Pillow
• moviepy
• Modules bundled with the project itself: src.models.*, src.pipelines.*, src.utils.*
Tip: if there is no ready-made requirements.txt, you can start with:
pip install torch torchvision diffusers einops omegaconf numpy opencv-python pillow moviepy
- Models and config files (critical)
• A main config ./configs/prompts/infer_acc.yaml (passed via --config; this is the default)
It must contain:
• weight_dtype: fp16 or fp32
• pretrained_vae_path
• pretrained_base_model_path
• reference_unet_path (a .pt / .bin weight file)
• motion_module_path (may be an empty string, meaning stage-1 only)
• denoising_unet_path
• pose_encoder_path
• audio_model_path
• inference_config (points to a second YAML containing noise_scheduler_kwargs, unet_additional_kwargs, etc.)
• test_cases: a dictionary whose keys are reference image paths and whose values are lists that must contain one .wav file and one pose directory (how the script consumes it is sketched after the two examples below). For example:
test_cases:
  ./assets/halfbody_demo/refimag/natural_bk_openhand/0035.png:
    - ./assets/halfbody_demo/audio/chinese/echomimicv2_woman.wav
    - ./assets/halfbody_demo/pose/01
• An inference hyperparameter config (the file referenced by inference_config above), for example:
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "scaled_linear"
unet_additional_kwargs:
  use_motion_module: true
  unet_use_temporal_attention: true
  cross_attention_dim: 1024
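For orientation, this is roughly how main() consumes the two files above: the test_cases mapping is walked to find the wav and the pose directory, and noise_scheduler_kwargs is fed straight into DDIMScheduler (condensed from the script):

from omegaconf import OmegaConf
from diffusers import DDIMScheduler

config = OmegaConf.load("./configs/prompts/infer_acc.yaml")
infer_config = OmegaConf.load(config.inference_config)

# test_cases: key = reference image path, value = [wav path, pose directory]
for ref_image_path in config["test_cases"].keys():
    for file_path in config["test_cases"][ref_image_path]:
        if ".wav" in file_path:
            audio_path = file_path
        else:
            pose_dir = file_path

# the scheduler is built directly from noise_scheduler_kwargs
sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
scheduler = DDIMScheduler(**sched_kwargs)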
- Data layout
Matching the defaults/example:
assets/halfbody_demo/
  refimag/natural_bk_openhand/0035.png   # reference portrait
  audio/chinese/echomimicv2_woman.wav    # speech (16 kHz recommended)
  pose/01/                               # pose sequence
    0.npy
    1.npy
    2.npy
    ...
• pose/*.npy: each file is a dictionary; the script reads its draw_pose_params and renders a pose image with draw_pose_select_v2 to use as the conditioning input (see the short check after this list).
• Background image (optional): any resolution; it is resized to --final_width x --final_height at compositing time.
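To verify that your pose files are in the expected format, you can load and render one the same way process_single_segment does (a minimal sketch; the path and the 768x768 canvas are just the defaults):

import numpy as np
from PIL import Image
from src.utils.dwpose_util import draw_pose_select_v2

W = H = 768
tgt_musk = np.zeros((W, H, 3)).astype('uint8')
detected_pose = np.load("./assets/halfbody_demo/pose/01/0.npy", allow_pickle=True).tolist()
# each .npy stores a dict; draw_pose_params gives the render size and the paste region
imh_new, imw_new, rb, re, cb, ce = detected_pose['draw_pose_params']
im = draw_pose_select_v2(detected_pose, imh_new, imw_new, ref_w=800)
im = np.transpose(np.array(im), (1, 2, 0))
tgt_musk[rb:re, cb:ce, :] = im
Image.fromarray(tgt_musk).save("pose_preview.png")  # visual sanity check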
⸻
Overall script flow (important)
1. Parse arguments (resolution, steps, CFG, FPS, device, segment duration, whether to composite a background, whether to concatenate, etc.).
2. Check for FFmpeg and inject it into PATH (via the FFMPEG_PATH environment variable).
3. Load configs: the main YAML plus the inference YAML.
4. Load the models:
• AutoencoderKL (VAE)
• UNet2DConditionModel (reference_unet)
• EMOUNet3DConditionModel (denoising_unet; optionally with the motion module)
• PoseEncoder
• The audio feature model via load_audio_model
• The DDIMScheduler noise scheduler
• Assemble everything into an EchoMimicV2Pipeline
5. Iterate over test_cases to obtain (ref_img, audio, pose_dir) triples.
6. Split the audio and poses (split_audio_and_poses):
• Slice the audio into chunks of --segment_duration (default 10 seconds).
• Pose frame count = FPS × segment duration; if there are not enough pose .npy files they are reused cyclically (modulo indexing; see the sketch after this list).
7. Generate each segment (process_single_segment):
• Cut that segment's audio out to *_segment_###_audio.wav
• Build that segment's poses_tensor (cycling through the pose files)
• Call pipe(...) to generate the video tensor and write it to *_segment_###_avatar.mp4
• Re-attach the segment audio and write the final segment *_segment_###.mp4
8. Optional background compositing (create_composite_video):
• Chroma-key each segment (--enable_chromakey, on by default); three implementations: simple / enhanced / apply_chromakey (the original fallback)
• Overlay the avatar onto the background according to --avatar_position (left/right/center) and --avatar_margin_ratio (which reserves space on the right for slides)
9. Optional concatenation (--concatenate_segments):
• Use moviepy's concatenate_videoclips to merge all segments into one *_complete.mp4
10. Output:
• Directory: output/YYYYMMDD/HHMM--step_{steps}-{W}x{H}--cfg_{cfg}--segments/{ref_name}/
• Files: *_complete.mp4 if merging is enabled, otherwise one mp4 per segment.
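Steps 6-7 boil down to a little arithmetic: segment boundaries come from the audio duration, and pose frames simply wrap around when the audio outlasts the pose sequence. A condensed sketch of what split_audio_and_poses and process_single_segment compute (paths are the demo defaults):

import os
from moviepy.editor import AudioFileClip

audio_path = "./assets/halfbody_demo/audio/chinese/echomimicv2_woman.wav"
pose_dir = "./assets/halfbody_demo/pose/01"
fps, segment_duration = 24, 10

audio_clip = AudioFileClip(audio_path)
total_duration = audio_clip.duration        # total speech length in seconds
audio_clip.close()
pose_files = len(os.listdir(pose_dir))      # number of available pose frames

segments, t = [], 0.0
while t < total_duration:
    end = min(t + segment_duration, total_duration)
    start_frame, end_frame = int(t * fps), int(end * fps)
    segments.append((t, end, start_frame, end_frame, end_frame - start_frame))
    t = end

# within a segment, pose indices wrap around via modulo
start_time, end_time, start_frame, end_frame, frame_count = segments[0]
pose_indices = [(start_frame + i) % pose_files for i in range(frame_count)]  # -> f"{idx}.npy"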
⸻
Quick start (run in this order)
1. Create and activate an environment (example)
venv or conda is recommended
conda create -n echomimic python=3.10 -y
conda activate echomimic
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121  # pick the index URL that matches your CUDA version
pip install diffusers einops omegaconf numpy opencv-python pillow moviepy
2. Set up FFmpeg
# assuming the static ffmpeg build is extracted to ./ffmpeg-4.4-amd64-static
export FFMPEG_PATH=./ffmpeg-4.4-amd64-static
export PATH=$FFMPEG_PATH:$PATH
ffmpeg -version  # seeing the version banner is enough
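For reference, the script itself only reads FFMPEG_PATH and prepends it to PATH (this is the check at the top of the script), so the exports above are all it needs:

import os

ffmpeg_path = os.getenv('FFMPEG_PATH')
if ffmpeg_path is None:
    print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
elif ffmpeg_path not in os.getenv('PATH'):
    print("add ffmpeg to path")
    os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"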
3. Confirm/write the config files
• ./configs/prompts/infer_acc.yaml (fill in every model weight path as in the example above)
• the inference_config file holds the scheduler and the extra UNet parameters (example above)
4. Prepare the data (reference image, wav audio, pose directory)
5. Standard run command (minimal working version)
python infer_acc_new3_segments.py \
  --config ./configs/prompts/infer_acc.yaml \
  -W 768 -H 768 --steps 6 --cfg 1.0 --fps 24 \
  --segment_duration 10 \
  --device cuda
6. Add background compositing and concatenation (full production version)
python infer_acc_new3_segments.py \
  --config ./configs/prompts/infer_acc.yaml \
  -W 768 -H 768 --steps 12 --cfg 1.2 --fps 24 \
  --segment_duration 10 \
  --concatenate_segments \
  --background_image ./assets/bg/slide_bg.png \
  --avatar_position right \
  --avatar_vertical_position middle \
  --final_width 1920 --final_height 1080 \
  --enable_chromakey \
  --chromakey_method enhanced \
  --avatar_margin_ratio 0.25 \
  --device cuda
After the run finishes, look in output/YYYYMMDD/…/{ref_name}/ for *_complete.mp4 or the per-segment *_segment_###.mp4 files.
⸻
Common parameter cheat sheet (only the most important ones)
• --config: the main YAML, which defines the model weight paths, test_cases, etc. (most important)
• -W/-H: generated frame resolution (default 768×768)
• --steps: number of diffusion steps (more steps, finer detail, slower)
• --cfg: classifier-free guidance scale (>1 follows the conditioning more closely)
• --fps: output frame rate, mapped one-to-one onto the pose frame sequence
• --sample_rate: audio sample rate (default 16000)
• --context_frames / --context_overlap: temporal context window and overlap (affects consistency over long sequences; a conceptual sketch follows this list)
• --segment_duration: seconds per segment (default 10 s)
• --concatenate_segments: concatenate all segments into a single video
• --keep_segments: keep the per-segment files after concatenation (by default, intermediate files are deleted where appropriate)
• --background_image: if provided, composite onto this background (with chroma keying)
• --avatar_position: left/right/center (right is the usual choice when slides need room)
• --avatar_margin_ratio: fraction of screen width reserved for the avatar on the right (default 0.25)
• --enable_chromakey / --chromakey_method: chroma-key toggle and method (simple/enhanced)
• --debug_chromakey: save a frame of the keyed clip so you can tune the parameters
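The exact temporal windowing lives inside EchoMimicV2Pipeline, so take this only as a conceptual illustration of what context_frames/context_overlap mean: consecutive windows of 12 frames that share 3 frames.

# Conceptual illustration only; the real scheduling is inside EchoMimicV2Pipeline.
context_frames, context_overlap, total_frames = 12, 3, 40
stride = context_frames - context_overlap
windows = [list(range(s, min(s + context_frames, total_frames)))
           for s in range(0, total_frames, stride)]
print(windows[0])  # [0, 1, ..., 11]
print(windows[1])  # [9, 10, ..., 20]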
⸻
Output directory structure
output/20250922/1530--step_12-768x768--cfg_1.2--segments/{ref_stem}/
  {ref_stem}-a-{audio_stem}_segment_000.mp4
  {ref_stem}-a-{audio_stem}_segment_001.mp4
  ...
  {ref_stem}-a-{audio_stem}_complete.mp4   # if concatenation is enabled
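These names are built in main() roughly like this (args, ref_image_path and audio_path as in the script):

from datetime import datetime
from pathlib import Path

date_str = datetime.now().strftime("%Y%m%d")
time_str = datetime.now().strftime("%H%M")
save_dir_name = f"{time_str}--step_{args.steps}-{args.W}x{args.H}--cfg_{args.cfg}--segments"
save_dir = Path(f"output/{date_str}/{save_dir_name}")

ref_name = Path(ref_image_path).stem
audio_name = Path(audio_path).stem
save_path = Path(f"{save_dir}/{ref_name}")
save_name = f"{save_path}/{ref_name}-a-{audio_name}"   # segments append _segment_###.mp4 / _complete.mp4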
⸻
Key implementation details (for reading the code)
• Segmentation: split_audio_and_poses reads the total audio duration and the number of files in pose_dir, then uses fps to compute how many frames each segment needs; if the pose files run out they are reused cyclically (% pose_files). This lets arbitrarily long audio run to completion.
• Random seed: each segment uses seed = base_seed + segment_idx, so results stay reproducible.
• Compositing: create_composite_video will:
• Resize the background to the final resolution (default 1920×1080)
• Scale the avatar video proportionally so its height is 2/3 of the frame (fine-tune with --avatar_scale) and keep it inside the reserved right-hand area
• Chroma-key a mask (white = keep, black = transparent) and overlay the avatar onto the background (see the mask sketch after this list)
• Nudge the position with --avatar_horizontal_offset / --avatar_vertical_offset
• Smart position suggestion: detect_background_content_area roughly measures the edge density of the left and right regions and recommends placing the avatar on the side with less detail, so key slide content is not covered (advisory only, not enforced).
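The "simple" mask is essentially an inverted HSV green-range mask. Condensed from create_simple_chromakey_clip (the script then wraps this per-frame function in a MoviePy VideoClip, marks it with set_ismask(True) and attaches it via set_mask):

import cv2
import numpy as np

def simple_green_mask(frame):
    """frame: HxWx3 RGB uint8 -> float mask in [0, 1]; 1 = keep, 0 = transparent."""
    hsv = cv2.cvtColor(frame, cv2.COLOR_RGB2HSV)
    green = cv2.inRange(hsv, np.array([40, 40, 40]), np.array([80, 255, 255]))
    kernel = np.ones((3, 3), np.uint8)
    green = cv2.morphologyEx(green, cv2.MORPH_CLOSE, kernel)  # fill small holes in the green area
    green = cv2.morphologyEx(green, cv2.MORPH_OPEN, kernel)   # drop small green specks
    green = cv2.GaussianBlur(green, (3, 3), 0)                # soften the edge
    return (255 - green).astype(np.float64) / 255.0           # invert: green becomes transparent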
⸻
Common pitfalls & troubleshooting
1. No CUDA / cannot run on CPU
The code hard-codes "cuda" in a couple of places:
• vae = AutoencoderKL(...).to("cuda", dtype=...)
• pose_net = PoseEncoder(...).to(dtype=..., device="cuda")
If the machine has no GPU or CUDA is unavailable, these lines raise an error immediately.
Suggested fix (replace both with the device variable):
# before:
vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype)
# after:
vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype)

# before:
pose_net = PoseEncoder(...).to(dtype=weight_dtype, device="cuda")
# after:
pose_net = PoseEncoder(...).to(dtype=weight_dtype, device=device)
Note that earlier in the script there is already:
device = args.device
if device.__contains__("cuda") and not torch.cuda.is_available():
    device = "cpu"
Only with the fix above does the script actually fall back to CPU (and even then it will be very slow).
2. FFmpeg not configured
At startup you will see:
please download ffmpeg-static and export to FFMPEG_PATH
Download/extract it properly, then:
export FFMPEG_PATH=/path/to/ffmpeg-static
export PATH=$FFMPEG_PATH:$PATH
Then verify with ffmpeg -version.
3. Audio sample rate
16 kHz is recommended (--sample_rate 16000). If the source wav is not 16 kHz, moviepy can still handle it, but your audio2feature model may require 16 kHz input (check what load_audio_model expects).
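A quick way to check and, if needed, convert the sample rate with the moviepy that is already installed (a minimal sketch; the file names are placeholders):

from moviepy.editor import AudioFileClip

clip = AudioFileClip("input.wav")
print(clip.fps)                        # source sample rate in Hz
if clip.fps != 16000:
    clip.write_audiofile("input_16k.wav", fps=16000)   # resample to 16 kHz
clip.close()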
4. pose *.npy format mismatch
Each file must contain draw_pose_params and match your draw_pose_select_v2. If a file is missing, you will see:
Warning: Pose file ... not found, using empty pose
That frame then gets an empty condition and the output may look wrong.
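A small pre-flight check helps here; the helper below is hypothetical (not part of the script) but uses the same loading convention:

import os
import numpy as np

def check_pose_dir(pose_dir):
    """Hypothetical helper: warn about missing or malformed pose .npy files."""
    names = sorted(f for f in os.listdir(pose_dir) if f.endswith(".npy"))
    for name in names:
        pose = np.load(os.path.join(pose_dir, name), allow_pickle=True).tolist()
        if not isinstance(pose, dict) or 'draw_pose_params' not in pose:
            print(f"{name}: missing draw_pose_params")
    # the script indexes files as 0.npy, 1.npy, ..., so gaps in the numbering matter too
    missing = {f"{i}.npy" for i in range(len(names))} - set(names)
    if missing:
        print("missing indices:", sorted(missing))

check_pose_dir("./assets/halfbody_demo/pose/01")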
5. Out of GPU memory / OOM
• Use weight_dtype=fp16
• Lower -W / -H (resolution)
• Lower --steps
• Use shorter segments: --segment_duration 6
• Reduce --fps a bit (fewer frames per segment)
6. Green fringing / rough edges after keying
• Try --chromakey_method enhanced
• Lower or raise --chromakey_threshold (default 0.15)
• Turn on --debug_chromakey to inspect the saved frame and adjust the morphology kernels or thresholds
7. Audio/video out of sync after concatenation
• Make sure every segment uses the same fps (the script writes them out uniformly)
• Concatenate with the same codec/parameters for every segment (the script fixes these; see the reference snippet below)
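For reference, the concatenation step in the script is simply MoviePy's concatenate_videoclips over the per-segment files, written once with a single codec (final_segment_paths and save_name as in the script):

import os
from moviepy.editor import VideoFileClip, concatenate_videoclips

clips = [VideoFileClip(p) for p in final_segment_paths if os.path.exists(p)]
final_clip = concatenate_videoclips(clips)
final_clip.write_videofile(f"{save_name}_complete.mp4", codec="libx264", audio_codec="aac",
                           threads=4, preset="medium")
for clip in clips:
    clip.close()
final_clip.close()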
⸻
Cheat sheet: three typical commands
(A) Minimal run
python infer_acc_new3_segments.py --config ./configs/prompts/infer_acc.yaml --device cuda
(B) With background compositing (avatar on the right, slides on the left)
python infer_acc_new3_segments.py \
  --config ./configs/prompts/infer_acc.yaml \
  --background_image ./assets/bg/slide_bg.png \
  --avatar_position right --avatar_margin_ratio 0.28 \
  --enable_chromakey --chromakey_method enhanced \
  --final_width 1920 --final_height 1080 \
  --device cuda
(C) Concatenate all segments
python infer_acc_new3_segments.py \
  --config ./configs/prompts/infer_acc.yaml \
  --concatenate_segments \
  --device cuda