#train中调用的有dataset RIFE
#import os
import cv2
import math
import time
import torch
import torch.distributed as dist
import numpy as np
import random
import argparse#在model文件夹下的rife代码文件中导入model类
from model.RIFE import Model
#导入数据用于训练插帧模型
from dataset import *
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
#分布式训练
from torch.utils.data.distributed import DistributedSamplerdevice = torch.device("cuda")log_path = 'train_log'#针对于学习率 初始设置和后期的余弦退火
def get_learning_rate(step):if step < 2000:mul = step / 2000.return 3e-4 * mulelse:mul = np.cos((step - 2000) / (args.epoch * args.step_per_epoch - 2000.) * math.pi) * 0.5 + 0.5return (3e-4 - 3e-6) * mul + 3e-6#光流图像转化为rgb
def flow2rgb(flow_map_np):h, w, _ = flow_map_np.shapergb_map = np.ones((h, w, 3)).astype(np.float32)normalized_flow_map = flow_map_np / (np.abs(flow_map_np).max())rgb_map[:, :, 0] += normalized_flow_map[:, :, 0]rgb_map[:, :, 1] -= 0.5 * (normalized_flow_map[:, :, 0] + normalized_flow_map[:, :, 1])rgb_map[:, :, 2] += normalized_flow_map[:, :, 1]return rgb_map.clip(0, 1)#如果等于0 开启日志记录 也就是主进程
#训练和验证采取的数据大小是一致的话 应该不会产生问题
def train(model, local_rank):if local_rank == 0:writer = SummaryWriter('train')writer_val = SummaryWriter('validate')#非主进程不写入日志else:writer = Nonewriter_val = Nonestep = 0nr_eval = 0#这个'train'是传给vimeodataset中的init函数中的 也就是 dataset_name#此时执行init函数dataset = VimeoDataset('train')sampler = DistributedSampler(dataset)#使用你传入的 dataset 对象 每次从 sampler 中获取一组索引# 然后对每个索引调用 dataset.__getitem__(index) 来获取数据 固定接口# 把这些数据打包成一个 batch 返回 是那个张量 9train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=1, pin_memory=True, drop_last=True, sampler=sampler)args.step_per_epoch = train_data.__len__()#不是train 不是test 这个是在sequences中 找索引的话再tri_testlist中#意思就是train和va 都是从tri_trainlist中选择的 在train代码中二者都涉及的 tri_testlist就是纯为了测试dataset_val = VimeoDataset('validation')val_data = DataLoader(dataset_val, batch_size=16, pin_memory=True, num_workers=1)print('training...')#train_data val_datatime_stamp = time.time()for epoch in range(args.epoch):sampler.set_epoch(epoch)#对每一个训练的数据for i, data in enumerate(train_data):data_time_interval = time.time() - time_stamptime_stamp = time.time()#data指的是return torch.cat((img0, img1, gt), 0), timestepdata_gpu, timestep = datadata_gpu = data_gpu.to(device, non_blocking=True) / 255.timestep = timestep.to(device, non_blocking=True)#img0 是 img1 gt是img2 img1是img3 在拼接的时候是按照torch.cat((img0, img1, gt), 0), timestep#过程就是imgs进行生成中间帧 然后gt用于和生成的中间帧进行损失计算 看update的输入是imgs 和 gtimgs = data_gpu[:, :6]#将中间帧单独提取出来gt = data_gpu[:, 6:9]learning_rate = get_learning_rate(step) * args.world_size / 4#去看RIFE中的updata函数 一个是merged[2] 一个是日志信息#pred是最终的生成帧 调用updata进行模型参数更新#调用模型的 update() 方法,执行前向传播、损失计算、反向传播和参数更新pred, info = model.update(imgs, gt, learning_rate, training=True) # pass timestep if you are training RIFEmtrain_time_interval = time.time() - time_stamptime_stamp = time.time()#每隔200step记录一次if step % 200 == 1 and local_rank == 0:writer.add_scalar('learning_rate', learning_rate, step)writer.add_scalar('loss/l1', info['loss_l1'], step)writer.add_scalar('loss/tea', info['loss_tea'], step)writer.add_scalar('loss/distill', info['loss_distill'], step)#每隔1000步if step % 1000 == 1 and local_rank == 0:gt = (gt.permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')#从mask到flow1去RIFE中去看updata函数中的返回的是啥#这些都是日志文件mask = (torch.cat((info['mask'], info['mask_tea']), 3).permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')pred = (pred.permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')merged_img = (info['merged_tea'].permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')flow0 = info['flow'].permute(0, 2, 3, 1).detach().cpu().numpy()flow1 = info['flow_tea'].permute(0, 2, 3, 1).detach().cpu().numpy()for i in range(5):imgs = np.concatenate((merged_img[i], pred[i], gt[i]), 1)[:, :, ::-1]writer.add_image(str(i) + '/img', imgs, step, dataformats='HWC')writer.add_image(str(i) + '/flow', np.concatenate((flow2rgb(flow0[i]), flow2rgb(flow1[i])), 1), step, dataformats='HWC')writer.add_image(str(i) + '/mask', mask[i], step, dataformats='HWC')writer.flush()if local_rank == 0:print('epoch:{} {}/{} time:{:.2f}+{:.2f} loss_l1:{:.4e}'.format(epoch, i, args.step_per_epoch, data_time_interval, train_time_interval, info['loss_l1']))step += 1nr_eval += 1#调用下面的验证代码 evaluate#调用验证函数 evaluate(),在验证集上评估模型性能#保存当前模型权重到指定路径(只主进程保存)if nr_eval % 5 == 0:evaluate(model, val_data, step, local_rank, writer_val)#RIFE中Model类中的save_model方法model.save_model(log_path, local_rank) dist.barrier()#模型结构(比如卷积层)期望输入是 [B, 9, 960, 960]
# 验证时却给了 [B, 9, 1080, 1920]
# 所以报错:张量维度不匹配
def evaluate(model, val_data, nr_eval, local_rank, writer_val):loss_l1_list = []loss_distill_list = []loss_tea_list = []psnr_list = []psnr_list_teacher = []time_stamp = time.time()for i, data in enumerate(val_data):data_gpu, timestep = datadata_gpu = data_gpu.to(device, non_blocking=True) / 255. imgs = data_gpu[:, :6]gt = data_gpu[:, 6:9]with torch.no_grad():pred, info = model.update(imgs, gt, training=False)merged_img = info['merged_tea']loss_l1_list.append(info['loss_l1'].cpu().numpy())loss_tea_list.append(info['loss_tea'].cpu().numpy())loss_distill_list.append(info['loss_distill'].cpu().numpy())for j in range(gt.shape[0]):psnr = -10 * math.log10(torch.mean((gt[j] - pred[j]) * (gt[j] - pred[j])).cpu().data)psnr_list.append(psnr)psnr = -10 * math.log10(torch.mean((merged_img[j] - gt[j]) * (merged_img[j] - gt[j])).cpu().data)psnr_list_teacher.append(psnr)gt = (gt.permute(0, 2, 3, 1).cpu().numpy() * 255).astype('uint8')pred = (pred.permute(0, 2, 3, 1).cpu().numpy() * 255).astype('uint8')merged_img = (merged_img.permute(0, 2, 3, 1).cpu().numpy() * 255).astype('uint8')flow0 = info['flow'].permute(0, 2, 3, 1).cpu().numpy()flow1 = info['flow_tea'].permute(0, 2, 3, 1).cpu().numpy()if i == 0 and local_rank == 0:for j in range(10):imgs = np.concatenate((merged_img[j], pred[j], gt[j]), 1)[:, :, ::-1]writer_val.add_image(str(j) + '/img', imgs.copy(), nr_eval, dataformats='HWC')writer_val.add_image(str(j) + '/flow', flow2rgb(flow0[j][:, :, ::-1]), nr_eval, dataformats='HWC')eval_time_interval = time.time() - time_stampif local_rank != 0:returnwriter_val.add_scalar('psnr', np.array(psnr_list).mean(), nr_eval)writer_val.add_scalar('psnr_teacher', np.array(psnr_list_teacher).mean(), nr_eval)if __name__ == "__main__":parser = argparse.ArgumentParser()parser.add_argument('--epoch', default=1, type=int)parser.add_argument('--batch_size', default=16, type=int, help='minibatch size')parser.add_argument('--local_rank','--local-rank', default=-1, type=int, help='local rank')parser.add_argument('--world_size', default=1, type=int, help='world size')args = parser.parse_args()torch.distributed.init_process_group(backend="nccl", world_size=args.world_size)torch.cuda.set_device(args.local_rank)seed = 1234random.seed(seed)np.random.seed(seed)torch.manual_seed(seed)torch.cuda.manual_seed_all(seed)torch.backends.cudnn.benchmark = True#调用init函数 此时model可以调用这个类中的任意函数model = Model(args.local_rank)train(model, args.local_rank)