
Python Study Check-in: DAY 45 Introduction to TensorBoard

浙大疏锦行 - CSDN Blog

Knowledge points review:

  1. The history and underlying principles of TensorBoard
  2. Common TensorBoard operations (a minimal sketch follows right after this list)
  3. TensorBoard in practice on CIFAR-10: MLP and CNN models
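
As a quick reference for the common operations before the full assignment code, here is a minimal, self-contained sketch of the logging calls you will meet most often. The log-directory name, tag names, and fake values are placeholders chosen purely for illustration.

import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/quick_demo")    # everything below lands in this log directory

for step in range(100):
    fake_loss = 1.0 / (step + 1)                                 # stand-in for a real training loss
    writer.add_scalar("demo/loss", fake_loss, step)              # scalar curves (loss, accuracy, lr, ...)

writer.add_histogram("demo/weights", torch.randn(1000), 0)       # distributions of weights or gradients
writer.add_image("demo/image", torch.rand(3, 64, 64), 0)         # a single CHW image tensor
writer.add_text("demo/note", "hello tensorboard", 0)             # free-form notes
writer.close()

# Launch the dashboard afterwards with: tensorboard --logdir=runs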

The results are shown below; they are well suited for padding out the page count of a group-meeting report:

Assignment: fine-tune ResNet18 on CIFAR-10 and use TensorBoard to monitor the training process.

PS:

  1. TensorBoard and torch versions are not always compatible; if you hit errors, try creating a fresh environment. When launching TensorBoard, first activate the corresponding environment in cmd with conda activate xxx, then use cd to move into the directory that holds the log files (no action needed if you are already in the right place).
  2. The TensorBoard code does take some memorization, but classic deep-learning code is essentially boilerplate: see it often enough and it becomes second nature. It is far less demanding than material that requires real reasoning, such as graduate-entrance-exam mathematics.
  3. With today's AI tools you really only need to finish the simplest possible demo first, then ask the AI to add the TensorBoard logging calls for you. The core skills are knowing what information TensorBoard can record and how to read the visualized results; treat the AI as a memory assistant and let it pull up the corresponding code when you need it (a minimal integration sketch follows right after this list).
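
To make point 3 concrete, below is a minimal, self-contained sketch of where the logging calls typically slot into an ordinary training loop. The tiny linear model, the random data, and the tag names are made up for illustration only; the point is the handful of SummaryWriter calls around the existing loop.

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

# A deliberately tiny stand-in for "your simplest demo": random data and a linear model.
x = torch.randn(256, 10)
y = torch.randn(256, 1)
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

writer = SummaryWriter("runs/minimal_loop")          # 1) create the writer once

for step in range(200):
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
    writer.add_scalar("train/loss", loss.item(), step)              # 2) log the loss at every step
    if step % 50 == 0:
        writer.add_histogram("weights/linear", model.weight, step)  # 3) occasionally log a distribution

writer.close()                                       # 4) always close the writer at the end
# View the curves with: tensorboard --logdir=runs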

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter # import TensorBoard's core SummaryWriter class
import matplotlib.pyplot as plt
import os
import time
from tqdm import tqdm
import torchvision  # make sure torchvision is imported so we can use make_grid

# --- Step 1: data loaders (unchanged) ---
def get_cifar10_loaders(batch_size=128):
    """Build the CIFAR-10 data loaders, with data augmentation on the training set."""
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),  # ResNet is usually pretrained on 224x224 images
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # ImageNet normalization statistics
    ])
    test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
    test_dataset = datasets.CIFAR10(root='./data', train=False, transform=test_transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    return train_loader, test_loader

# --- Step 2: model creation and freeze/unfreeze helpers (unchanged) ---
def create_resnet18(pretrained=True, num_classes=10):
    """Create a ResNet18 and replace its classification head."""
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None)
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)
    return model

def set_freeze_state(model, freeze=True):
    """Freeze or unfreeze the feature-extraction layers of the model."""
    print(f"--- {'Freezing' if freeze else 'Unfreezing'} the feature-extraction layers ---")
    for name, param in model.named_parameters():
        if 'fc' not in name:  # only the final fully connected layer stays trainable
            param.requires_grad = not freeze

# --- Step 3: training/evaluation driver instrumented with TensorBoard ---
def train_with_tensorboard(model, device, train_loader, test_loader, epochs, freeze_epochs, writer):
    """Full training loop monitored with TensorBoard."""
    # Initialize the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    # At first, only optimize the parameters that are not frozen
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

    # --- Initial TensorBoard records ---
    print("Logging initial information to TensorBoard...")
    dataiter = iter(train_loader)
    images, _ = next(dataiter)
    writer.add_graph(model, images.to(device))  # log the model graph
    img_grid = torchvision.utils.make_grid(images[:16])  # preview 16 images
    writer.add_image('CIFAR-10 sample images', img_grid)
    print("✅ Initial information logged.")

    # Start training
    global_step = 0
    for epoch in range(1, epochs + 1):
        # --- Unfreeze control ---
        if epoch == freeze_epochs + 1:
            set_freeze_state(model, freeze=False)
            # After unfreezing, rebuild the optimizer over all parameters
            optimizer = optim.Adam(model.parameters(), lr=1e-4)  # use a smaller learning rate for full fine-tuning
            # Rebuild the scheduler too, otherwise it would keep stepping the old optimizer
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)
            print("Optimizer updated to include all parameters; learning rate lowered.")

        # --- Training phase ---
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0
        loop = tqdm(train_loader, desc=f"Epoch [{epoch}/{epochs}] Training", leave=False)
        for data, target in loop:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.size(0)
            _, pred = output.max(1)
            train_correct += pred.eq(target).sum().item()
            train_total += data.size(0)
            writer.add_scalar('Train/Batch_Loss', loss.item(), global_step)
            global_step += 1
            loop.set_postfix(loss=loss.item())
        loop.close()

        # Log epoch-level training metrics
        avg_train_loss = train_loss / train_total
        avg_train_acc = 100. * train_correct / train_total
        writer.add_scalar('Train/Epoch_Loss', avg_train_loss, epoch)
        writer.add_scalar('Train/Epoch_Accuracy', avg_train_acc, epoch)

        # --- Evaluation phase ---
        model.eval()
        test_loss, test_correct, test_total = 0, 0, 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(output, target)
                test_loss += loss.item() * data.size(0)
                _, pred = output.max(1)
                test_correct += pred.eq(target).sum().item()
                test_total += data.size(0)

        # Log epoch-level test metrics
        avg_test_loss = test_loss / test_total
        avg_test_acc = 100. * test_correct / test_total
        writer.add_scalar('Test/Epoch_Loss', avg_test_loss, epoch)
        writer.add_scalar('Test/Epoch_Accuracy', avg_test_acc, epoch)

        # Log histograms of weights and gradients (once per epoch)
        for name, param in model.named_parameters():
            writer.add_histogram(f'Weights/{name}', param, epoch)
            if param.grad is not None:
                writer.add_histogram(f'Gradients/{name}', param.grad, epoch)

        # Step the learning-rate scheduler and log the current learning rate
        scheduler.step(avg_test_loss)
        writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], epoch)

        print(f"Epoch {epoch} done | train accuracy: {avg_train_acc:.2f}% | test accuracy: {avg_test_acc:.2f}%")

# --- Step 4: main entry point ---
if __name__ == "__main__":
    # --- Configuration ---
    EPOCHS = 15
    FREEZE_EPOCHS = 5  # train 5 epochs with the backbone frozen, then 10 epochs unfrozen
    BATCH_SIZE = 64
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- TensorBoard initialization ---
    log_dir = "runs/resnet18_finetune_cifar10"
    version = 1
    while os.path.exists(f"{log_dir}_v{version}"):
        version += 1
    log_dir = f"{log_dir}_v{version}"
    writer = SummaryWriter(log_dir)
    print(f"TensorBoard logs will be saved to: {log_dir}")

    # --- Run the experiment ---
    train_loader, test_loader = get_cifar10_loaders(batch_size=BATCH_SIZE)
    model = create_resnet18(pretrained=True).to(DEVICE)
    set_freeze_state(model, freeze=True)  # start with the backbone frozen

    print("\n--- Start fine-tuning ResNet18 ---")
    print("After training, run `tensorboard --logdir=runs` in a terminal to view the visualizations.")
    train_with_tensorboard(model, DEVICE, train_loader, test_loader, EPOCHS, FREEZE_EPOCHS, writer)

    writer.close()  # close the writer
    print("\n✅ Training finished; TensorBoard logs saved.")
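
Once the run is under way, the Scalars tab is the first place to look: putting Train/Epoch_Accuracy, Test/Epoch_Accuracy, and Train/Learning_Rate side by side typically shows a visible change around the epoch where the backbone is unfrozen and the learning rate drops. The Images tab holds the CIFAR-10 sample grid, the Graphs tab shows the ResNet18 structure recorded by add_graph, and the Histograms tab contains the per-layer weight and gradient distributions logged each epoch.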
