基于ResNet50的手写符号识别系统
基于ResNet50的手写符号识别系统
项目概述
本项目实现了两个手写符号识别模型:
- ABCD字母识别模型:用于识别手写的A、B、C、D四个字母
- ✓×符号识别模型:用于识别手写的对勾(✓)和叉号(×)
两个模型均基于ResNet50预训练模型,采用迁移学习方式进行微调训练。项目包含完整的数据预处理、模型训练与评估、模型部署和预测的全流程实现。
数据集预处理
ABCD字母数据集
import tensorflow as tf
import os
import numpy as np
from sklearn.model_selection import train_test_split
import random# 数据路径定义
# Dataset locations
root_dir = "C:/Users/asus/Desktop/judge"  # root directory that holds the raw dataset
output_root = "balanced_emnist_abcd"
train_root, test_root = (
    os.path.join(output_root, split) for split in ("train", "test")
)

# Create the output directories up front
os.makedirs(train_root, exist_ok=True)
os.makedirs(test_root, exist_ok=True)

# Characters to process and the label assigned to each of them
target_chars = list("ABCD")  # class sub-folders to process
target_labels = list(range(10, 14))  # labels paired with the characters above (set per dataset)

# Every image is brought to 64x64
target_image_size = (64, 64)

# Colour-inversion helper
def invert_image(image):
    """Invert image colours so that black becomes white and white becomes black.

    Args:
        image: Pixel data supporting ``255 - image`` (scalar, array or tensor).
            The subtraction from 255 means a 0-255 value range is expected.

    Returns:
        The inverted pixel data, ``255 - image``.
    """
    return 255 - image


# Random horizontal/vertical stretch (at most 7% of the original size)
def random_scale(image):
    """Resample the image at a random scale, then resize it back to the target size.

    One factor is drawn uniformly from [0.93, 1.07] and applied to both axes;
    the scaled image is then resized back to ``target_image_size``.

    NOTE(review): both axes share one factor and the image is resized straight
    back, so the net effect looks like interpolation smoothing rather than a
    geometric stretch -- confirm whether independent per-axis factors were
    intended.

    Args:
        image: Image tensor accepted by ``tf.image.resize``.

    Returns:
        The image resized to ``target_image_size``.
    """
    scale = random.uniform(0.93, 1.07)
    scaled_size = [int(side * scale) for side in target_image_size]
    image = tf.image.resize(image, scaled_size)
    return tf.image.resize(image, target_image_size)


def preprocess_image(img_path, label, invert=False, scale=False):
    """Load a PNG file and prepare it: resize, optional inversion/rescale, normalise.

    Args:
        img_path: Path of the PNG file (anything ``tf.io.read_file`` accepts).
        label: Class label; passed through unchanged.
        invert: When true, invert the colours with ``invert_image``.
        scale: When true, apply ``random_scale``.

    Returns:
        Tuple ``(img, label, img_path)`` where ``img`` is a float32 tensor of
        size ``target_image_size`` with 3 channels, divided by 255.
    """
    img = tf.io.read_file(img_path)
    img = tf.image.decode_png(img, channels=3)  # always decode to RGB
    img = tf.image.resize(img, target_image_size)

    if invert:
        img = invert_image(img)
    if scale:
        img = random_scale(img)

    img = tf.cast(img, tf.float32) / 255.0  # normalise to the [0, 1] range
    return img, label, img_path


# Image-saving helper
def save_image(image, label, img_path, target_folder, suffix=""):
    """Save ``image`` as ``<target_folder>/<label>/<stem>[_<suffix>]<ext>``.

    Args:
        image: Image data accepted by ``tf.keras.preprocessing.image.save_img``.
        label: Class label used as the sub-folder name; a Python int or a
            scalar tensor.
        img_path: Source path as a string tensor, ``bytes`` or ``str``; only
            its base name is used.
        target_folder: Root output folder.
        suffix: Optional tag appended to the file stem (used for augmented
            copies). When empty, the original file name is kept as-is.
    """
    # Accept tensors as well as plain Python values.
    if hasattr(img_path, "numpy"):
        img_path = img_path.numpy()
    if isinstance(img_path, bytes):
        img_path = img_path.decode('utf-8')

    # str() of a tensor is its repr ("tf.Tensor(10, shape=(), dtype=int64)"),
    # which would become the class folder name; extract the plain value first.
    if hasattr(label, "numpy"):
        label = label.numpy()
    if hasattr(label, "item"):
        label = label.item()

    stem, ext = os.path.splitext(os.path.basename(img_path))
    img_name = f"{stem}_{suffix}{ext}" if suffix else f"{stem}{ext}"

    class_dir = os.path.join(target_folder, str(label))
    os.makedirs(class_dir, exist_ok=True)
    tf.keras.preprocessing.image.save_img(os.path.join(class_dir, img_name), image)


# Walk through every class folder and process its images
# For every target character: list its PNG files, split them 80/20, write five
# augmented copies of each training image and one processed copy of each test image.
for target_char, target_label in zip(target_chars, target_labels):
    print(f"\nProcessing {target_char} images...")
    target_dir = os.path.join(root_dir, target_char)  # class sub-folder

    # Sorted so that the seeded split below is reproducible across file systems.
    image_paths = sorted(
        os.path.join(target_dir, fname)
        for fname in os.listdir(target_dir)
        if fname.endswith('.png')
    )
    if len(image_paths) < 2:
        # train_test_split cannot produce two non-empty parts from fewer than 2 samples.
        print(f"Skipping {target_char}: found {len(image_paths)} PNG image(s), need at least 2.")
        continue

    image_label_pairs = [(img_path, target_label) for img_path in image_paths]

    # 80% training / 20% test
    train_pairs, test_pairs = train_test_split(image_label_pairs, test_size=0.2, random_state=42)

    # The datasets only turn (path, label) pairs into tensors; preprocessing is
    # run eagerly in the loops below so that no preprocessed result is discarded.
    pair_signature = (
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.int64),
    )
    train_dataset = tf.data.Dataset.from_generator(lambda: train_pairs, output_signature=pair_signature)
    test_dataset = tf.data.Dataset.from_generator(lambda: test_pairs, output_signature=pair_signature)

    # Five augmented versions per training image; each call draws a new random scale.
    for img_path, label in train_dataset:
        for i in range(5):
            processed_image, processed_label, processed_img_path = preprocess_image(
                img_path, label, invert=True, scale=True)
            save_image(processed_image, processed_label, processed_img_path, train_root,
                       suffix=f"enhanced_{i}")

    # Test images are only inverted and resized.
    for img_path, label in test_dataset:
        image, label, img_path = preprocess_image(img_path, label, invert=True)
        save_image(image, label, img_path, test_root)

    print(f"Images for {target_char} saved successfully.")
预处理步骤:
- 图像尺寸统一调整为64×64
- 颜色反转处理,使得黑底白字变为白底黑字
- 随机拉伸增强,最大不超过原始图片的7%
- 将处理后的数据分为训练集(80%)和测试集(20%)
- 对训练数据进行数据增强,每个原始图像生成5个增强版本
- 对增强数据应用标准化处理
数据增强技术:
- 随机拉伸
- 颜色反转
- 尺寸调整
✓×符号数据集
import tensorflow as tf
import os
import numpy as np
from sklearn.model_selection import train_test_split
import random# 数据路径定义
# Dataset locations
root_dir = "D:/Grad/judge"  # root directory that holds the raw dataset
output_root = "balanced_emnist_duicuo"
train_root, test_root = (
    os.path.join(output_root, split) for split in ("train", "test")
)

# Create the output directories up front
os.makedirs(train_root, exist_ok=True)
os.makedirs(test_root, exist_ok=True)

# Target symbols and their labels
target_chars = ['dui', 'cuo']  # folders holding the ✓ and × samples
target_labels = list(range(len(target_chars)))  # 0 stands for ✓, 1 stands for ×

# Every image is brought to 64x64
target_image_size = (64, 64)

# Random horizontal/vertical stretch (at most 7% of the original size)
def random_scale(image):
    """Resample the image at a random scale, then resize it back to the target size.

    One factor is drawn uniformly from [0.93, 1.07] and applied to both axes;
    the scaled image is then resized back to ``target_image_size``.

    NOTE(review): both axes share one factor and the image is resized straight
    back, so the net effect looks like interpolation smoothing rather than a
    geometric stretch -- confirm whether independent per-axis factors were
    intended.

    Args:
        image: Image tensor accepted by ``tf.image.resize``.

    Returns:
        The image resized to ``target_image_size``.
    """
    scale = random.uniform(0.93, 1.07)
    scaled_size = [int(side * scale) for side in target_image_size]
    image = tf.image.resize(image, scaled_size)
    return tf.image.resize(image, target_image_size)


def rotate_left_90(image):
    """Rotate the whole image by three quarter turns via ``tf.image.rot90(k=3)``.

    NOTE(review): ``tf.image.rot90`` rotates counter-clockwise ``k`` times, so
    ``k=3`` is 270 degrees counter-clockwise, i.e. 90 degrees clockwise. The
    function name says "left" -- confirm which direction is intended.
    """
    return tf.image.rot90(image, k=3)


# Random crop
def random_crop(image):
    """Centre-crop the image to a randomly chosen 90%-100% of the target size.

    The crop ratio is drawn uniformly from [0.9, 1.0]. The crop itself is done
    with ``tf.image.resize_with_crop_or_pad``, which crops (or pads) around the
    centre, so only the crop size is random, not its position. The result is
    NOT resized back to ``target_image_size``.

    Args:
        image: Image tensor accepted by ``tf.image.resize_with_crop_or_pad``.

    Returns:
        The cropped (or padded) image.
    """
    crop_size = random.uniform(0.9, 1.0)
    crop_height = int(target_image_size[0] * crop_size)
    crop_width = int(target_image_size[1] * crop_size)
    return tf.image.resize_with_crop_or_pad(
        image, target_height=crop_height, target_width=crop_width)


# Symbol zoom
def zoom_symbol(image):
    """Enlarge the image by a random factor and centre-crop it to the target size.

    The factor is drawn uniformly from [1.2, 1.5]. The image is resized to the
    enlarged dimensions and then brought back to ``target_image_size`` with
    ``tf.image.resize_with_crop_or_pad``.

    Args:
        image: Image tensor whose ``shape`` unpacks as (height, width, channels).

    Returns:
        The zoomed image at ``target_image_size``.
    """
    scale_factor = random.uniform(1.2, 1.5)
    height, width, _ = image.shape
    enlarged_size = (int(height * scale_factor), int(width * scale_factor))

    image = tf.image.resize(image, enlarged_size)
    return tf.image.resize_with_crop_or_pad(
        image,
        target_height=target_image_size[0],
        target_width=target_image_size[1],
    )


# Colour adjustment
def random_color_jitter(image):
    """Randomly perturb brightness, contrast and saturation of the image.

    Brightness is shifted by a delta of at most 0.2, contrast and saturation
    are each scaled by a factor in [0.7, 1.3].

    Args:
        image: RGB image tensor.

    Returns:
        The jittered image.
    """
    image = tf.image.random_brightness(image, max_delta=0.2)
    image = tf.image.random_contrast(image, lower=0.7, upper=1.3)
    image = tf.image.random_saturation(image, lower=0.7, upper=1.3)
    return image


# "Gaussian blur" step
def random_gaussian_blur(image):
    """Apply a mild random contrast adjustment (factor in [0.8, 1.2]).

    NOTE(review): despite its name, this function performs no blurring; it
    only calls ``tf.image.random_contrast``.
    """
    return tf.image.random_contrast(image, lower=0.8, upper=1.2)


def preprocess_image(img_path, label, scale=False, rotate=False, jitter=True, blur=False, zoom=False):
    """Load a PNG file, resize it and apply the selected augmentations.

    Args:
        img_path: Path of the PNG file (anything ``tf.io.read_file`` accepts).
        label: Class label; passed through unchanged.
        scale: Apply ``random_scale``.
        rotate: Apply ``rotate_left_90`` (also prints a debug message).
        jitter: Apply ``random_color_jitter``. Enabled by default.
        blur: Apply ``random_gaussian_blur``.
        zoom: Apply ``zoom_symbol``.

    Returns:
        Tuple ``(img, label, img_path)`` where ``img`` is a float32 tensor of
        size ``target_image_size`` with 3 channels and values in [0, 1].
    """
    img = tf.io.read_file(img_path)
    img = tf.image.decode_png(img, channels=3)  # always decode to RGB
    img = tf.image.resize(img, target_image_size)

    # Normalise BEFORE augmenting: random_brightness adds an absolute delta
    # (max_delta=0.2), which is negligible on a 0-255 image but meaningful on
    # a [0, 1] image.
    img = tf.cast(img, tf.float32) / 255.0

    if scale:
        img = random_scale(img)
    if rotate:
        print(f"Rotating image {img_path}...")  # debug output
        img = rotate_left_90(img)
    if jitter:
        img = random_color_jitter(img)
    if blur:
        img = random_gaussian_blur(img)
    if zoom:
        img = zoom_symbol(img)

    # Brightness/contrast changes can push values outside the valid range.
    img = tf.clip_by_value(img, 0.0, 1.0)
    return img, label, img_path


# Image-saving helper
def save_image(image, label, img_path, target_folder, suffix=""):
    """Save ``image`` as ``<target_folder>/<label>/<stem>[_<suffix>]<ext>``.

    Args:
        image: Image data accepted by ``tf.keras.preprocessing.image.save_img``.
        label: Class label used as the sub-folder name; a Python int or a
            scalar tensor.
        img_path: Source path as a string tensor, ``bytes`` or ``str``; only
            its base name is used.
        target_folder: Root output folder.
        suffix: Optional tag appended to the file stem (used for augmented
            copies). When empty, the original file name is kept as-is.
    """
    # Accept tensors as well as plain Python values.
    if hasattr(img_path, "numpy"):
        img_path = img_path.numpy()
    if isinstance(img_path, bytes):
        img_path = img_path.decode('utf-8')

    # str() of a tensor is its repr ("tf.Tensor(0, shape=(), dtype=int64)"),
    # which would become the class folder name; extract the plain value first.
    if hasattr(label, "numpy"):
        label = label.numpy()
    if hasattr(label, "item"):
        label = label.item()

    stem, ext = os.path.splitext(os.path.basename(img_path))
    img_name = f"{stem}_{suffix}{ext}" if suffix else f"{stem}{ext}"

    class_dir = os.path.join(target_folder, str(label))
    os.makedirs(class_dir, exist_ok=True)
    tf.keras.preprocessing.image.save_img(os.path.join(class_dir, img_name), image)


# Process the data of every symbol
# For every symbol: list its PNG files, split them 80/20, write four augmented
# copies of each training image and one processed copy of each test image.
for target_char, target_label in zip(target_chars, target_labels):
    print(f"\nProcessing {target_char} images...")
    target_dir = os.path.join(root_dir, target_char)  # class sub-folder

    # Sorted so that the seeded split below is reproducible across file systems.
    image_paths = sorted(
        os.path.join(target_dir, fname)
        for fname in os.listdir(target_dir)
        if fname.endswith('.png')
    )
    if len(image_paths) < 2:
        # train_test_split cannot produce two non-empty parts from fewer than 2 samples.
        print(f"Skipping {target_char}: found {len(image_paths)} PNG image(s), need at least 2.")
        continue

    image_label_pairs = [(img_path, target_label) for img_path in image_paths]

    # 80% training / 20% test
    train_pairs, test_pairs = train_test_split(image_label_pairs, test_size=0.2, random_state=42)

    # The datasets only turn (path, label) pairs into tensors; preprocessing is
    # run eagerly in the loops below so that no preprocessed result is discarded.
    pair_signature = (
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.int64),
    )
    train_dataset = tf.data.Dataset.from_generator(lambda: train_pairs, output_signature=pair_signature)
    test_dataset = tf.data.Dataset.from_generator(lambda: test_pairs, output_signature=pair_signature)

    # NOTE(review): the saved training images are rotated for BOTH symbols
    # (rotate=True), which is what the previous version of this loop produced
    # as well. If rotation should be disabled for 'dui', pass
    # rotate=(target_char != 'dui') here -- confirm the intended behaviour.
    for img_path, label in train_dataset:
        for i in range(4):  # four augmented versions per training image
            processed_image, processed_label, processed_img_path = preprocess_image(
                img_path, label, scale=True, rotate=True, jitter=True, blur=True, zoom=True)
            save_image(processed_image, processed_label, processed_img_path, train_root,
                       suffix=f"enhanced_{i}")

    # NOTE(review): preprocess_image enables colour jitter by default, so the
    # saved test images are randomly jittered too -- confirm this is wanted.
    for img_path, label in test_dataset:
        processed_image, processed_label, processed_img_path = preprocess_image(
            img_path, label, rotate=True)
        save_image(processed_image, processed_label, processed_img_path, test_root)

    print(f"Images for {target_char} saved successfully.")
预处理步骤:
- 图像尺寸统一调整为64×64
- 按符号类型应用不同的增强策略:
  - 对勾(✓)符号:颜色调整、缩放、模糊处理、放大
  - 叉号(×)符号:额外应用了旋转增强(注意:上面代码中实际保存增强图像的循环对两类符号都传入了 rotate=True)
- 将处理后的数据分为训练集(80%)和测试集(20%)
- 对训练数据进行更丰富的数据增强,每个原始图像生成4个增强版本
数据增强技术:
- 随机水平拉伸
- 旋转处理
- 随机裁剪(已定义 random_crop 函数,但上面的 preprocess_image 流程并未调用它)
- 符号放大
- 颜色调整
- 高斯模糊去噪(上面的 random_gaussian_blur 实现仅做对比度微调,并未进行实际的模糊处理)
模型架构与训练
ABCD字母识别模型
模型架构:
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torchvision import models, transforms
import time
import os
from torchvision.datasets import ImageFolder
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np# 配置中文字体
# Use a font with CJK glyphs so the Chinese plot labels render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # draw the minus sign correctly with this font

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
batch_size = 64
lr = 0.0005  # initial learning rate
total_epoch = 50  # number of training epochs
momentum = 0.9  # not passed to the Adam optimizer created below
patience = 10  # early-stopping patience (epochs)

# Data paths
train_dir = "balanced_emnist_abcd/train"
test_dir = "balanced_emnist_abcd/test"

# Preprocessing: resize, convert to tensor, normalise.
# Each transform sits on its own line so that no inline comment can swallow
# the transforms that follow it.
train_transform = transforms.Compose([
    transforms.Resize((64, 64)),  # uniform size
    transforms.ToTensor(),  # to tensor
    transforms.Normalize((0.5,), (0.5,)),  # normalise
])

test_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),  # normalise
])

# Load the training and test sets
train_dataset = ImageFolder(root=train_dir, transform=train_transform)
test_dataset = ImageFolder(root=test_dir, transform=test_transform)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Pre-trained ResNet-50 with a new classification head
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.BatchNorm1d(512),
    nn.Linear(512, 4),  # 4 output classes (A, B, C, D)
)

# Move the model to the selected device
model = model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Learning-rate scheduler. In 'min' mode the value passed to scheduler.step()
# is expected to decrease while training improves.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

# Early-stopping state
best_accuracy = 0.0
epochs_without_improvement = 0

# Checkpoint directory
checkpoint_dir = './checkpoints'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Best-model saving helper
def save_model(epoch, model, accuracy, best_accuracy):
    """Save the model weights when ``accuracy`` beats ``best_accuracy``.

    Args:
        epoch: Zero-based epoch index; the file is named with ``epoch + 1``.
        model: Model whose ``state_dict()`` is written.
        accuracy: Accuracy reached in this epoch.
        best_accuracy: Best accuracy seen so far.

    Returns:
        The updated best accuracy (``accuracy`` if it was an improvement,
        otherwise the unchanged ``best_accuracy``).
    """
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        checkpoint_path = os.path.join(checkpoint_dir, f'best_model_epoch_{epoch + 1}.pth')
        torch.save(model.state_dict(), checkpoint_path)
    return best_accuracy


# Training function
def train(epoch):
    """Run one training epoch over ``train_loader``.

    Args:
        epoch: Zero-based epoch index (used for log messages only).

    Returns:
        Tuple ``(mean_batch_loss, train_accuracy_percent)``.
    """
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Forward pass and loss
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Running training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{total_epoch}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    train_accuracy = 100 * correct_train / total_train
    mean_loss = running_loss / len(train_loader)
    print(f"训练准确率 (Epoch {epoch + 1}): {train_accuracy:.2f}%")
    print(f"Epoch [{epoch + 1}], 平均损失: {mean_loss:.4f}")
    return mean_loss, train_accuracy


# Evaluation function
def test(epoch):
    """Evaluate the model on ``test_loader`` and update the early-stopping state.

    Prints accuracy and weighted precision/recall/F1, shows the confusion
    matrix and one ROC curve per class, delegates checkpointing to
    ``save_model`` and updates the module-level ``best_accuracy`` and
    ``epochs_without_improvement``.

    Args:
        epoch: Zero-based epoch index (used for log messages and checkpoints).

    Returns:
        Tuple ``(mean_test_loss, accuracy_percent)``.
    """
    global best_accuracy, epochs_without_improvement
    model.eval()
    correct, total = 0, 0
    test_loss = 0.0
    all_labels = []
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)

            test_loss += criterion(outputs, labels).item()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Keep labels, predictions and class probabilities for the metrics below.
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())
            all_probs.extend(torch.softmax(outputs, dim=1).cpu().numpy())

    accuracy = 100 * correct / total
    print(f"测试准确率 (Epoch {epoch + 1}): {accuracy:.2f}%")

    # Weighted precision, recall and F1
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"精确率 (Precision): {precision:.4f}")
    print(f"召回率 (Recall): {recall:.4f}")
    print(f"F1分数 (F1 Score): {f1:.4f}")

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['A', 'B', 'C', 'D'],
                yticklabels=['A', 'B', 'C', 'D'])
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.title('混淆矩阵')
    plt.show()

    # One-vs-rest ROC/AUC per class (A=0, B=1, C=2, D=3)
    all_labels_bin = label_binarize(all_labels, classes=[0, 1, 2, 3])
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(4):
        fpr[i], tpr[i], _ = roc_curve(all_labels_bin[:, i], [prob[i] for prob in all_probs])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure(figsize=(6, 5))
    for i in range(4):
        plt.plot(fpr[i], tpr[i], lw=2, label='类别 %d (AUC = %0.2f)' % (i, roc_auc[i]))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假阳性率')
    plt.ylabel('真正率')
    plt.title('多类接收者操作特征 (ROC) 曲线')
    plt.legend(loc="lower right")
    plt.show()

    # Early stopping: decide whether this epoch improved BEFORE best_accuracy
    # is overwritten. Comparing afterwards can never be true, so the counter
    # would grow every epoch and training would always stop after `patience`
    # epochs.
    improved = accuracy > best_accuracy
    best_accuracy = save_model(epoch, model, accuracy, best_accuracy)
    if improved:
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    return test_loss / len(test_loader), accuracy


# Training and evaluation driver
def main():
    """Train for up to ``total_epoch`` epochs with LR scheduling and early stopping.

    After every epoch the model is evaluated, the losses are recorded and the
    learning-rate scheduler is stepped. Training stops early once
    ``epochs_without_improvement`` reaches ``patience``. Finally the training
    and test loss curves are plotted.
    """
    start_time = time.time()

    # Loss history for the curves
    train_losses = []
    test_losses = []

    for epoch in range(total_epoch):
        train_loss, train_accuracy = train(epoch)
        test_loss, accuracy = test(epoch)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        # The scheduler is created in 'min' mode, so it must monitor a value
        # that decreases as the model improves: the test loss. Feeding it the
        # accuracy would lower the learning rate whenever accuracy keeps rising.
        scheduler.step(test_loss)

        # Early stopping after `patience` epochs without improvement
        if epochs_without_improvement >= patience:
            print("由于没有提升,提前停止训练。")
            break

    end_time = time.time()
    print(f"训练完成,总耗时 {(end_time - start_time):.2f} 秒。")

    # Loss curves
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label="训练损失")
    plt.plot(range(1, len(test_losses) + 1), test_losses, label="测试损失")
    plt.xlabel('轮次 (Epoch)')
    plt.ylabel('损失 (Loss)')
    plt.title('训练与测试损失曲线')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()
训练参数:
- 批次大小:64
- 初始学习率:0.0005
- 优化器:Adam
- 损失函数:交叉熵损失
- 早停策略:10轮准确率无提升则停止
- 学习率调度:ReduceLROnPlateau
评估指标:
- 准确率(Accuracy)
- 精确率(Precision)
- 召回率(Recall)
- F1分数
- 混淆矩阵
- 多类AUC-ROC曲线
✓×符号识别模型
模型架构:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torchvision import transforms, models
from torchvision.datasets import ImageFolder
import os
import time
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import seaborn as sns
import numpy as np# 配置中文字体
# Use a font with CJK glyphs so the Chinese plot labels render correctly
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False  # draw the minus sign correctly with this font

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
batch_size = 32
lr = 0.0005  # initial learning rate
total_epoch = 50  # number of training epochs
momentum = 0.9  # not passed to the Adam optimizer created below
patience = 10  # early-stopping patience (epochs)
weight_decay = 0.0001  # L2 regularisation against over-fitting
dropout_rate = 0.5  # dropout probability against over-fitting

# Data paths (two-class dataset)
train_dir = "balanced_emnist_duicuo/train"
test_dir = "balanced_emnist_duicuo/test"

# Augmentation and preprocessing.
# Each transform sits on its own line so that no inline comment can swallow
# the transforms that follow it.
train_transform = transforms.Compose([
    transforms.Resize((64, 64)),  # uniform size
    transforms.RandomHorizontalFlip(),  # random horizontal flip
    transforms.RandomRotation(20),  # random rotation
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),  # random colour jitter
    transforms.RandomAffine(20, scale=(0.8, 1.2)),  # random affine transform (scaling)
    transforms.ToTensor(),  # to tensor
    transforms.Normalize((0.5,), (0.5,)),  # normalise
])

test_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),  # normalise
])

# Load the training and test sets
train_dataset = ImageFolder(root=train_dir, transform=train_transform)
test_dataset = ImageFolder(root=test_dir, transform=test_transform)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Pre-trained ResNet-50. IMAGENET1K_V1 is the weight set that the deprecated
# `pretrained=True` flag selected, so the starting weights are unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 512),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(512, 2),  # two output classes
)

# Move the model to the selected device
model = model.to(device)

# Loss function and optimizer (Adam with L2 regularisation)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Learning-rate scheduler. In 'min' mode the value passed to scheduler.step()
# is expected to decrease while training improves.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

# Early-stopping state
best_accuracy = 0.0
epochs_without_improvement = 0

# Checkpoint directory for the two-class model
checkpoint_dir = './checkpoints_2class'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Best-model saving helper
def save_model(epoch, model, accuracy, best_accuracy):
    """Save the model weights when ``accuracy`` beats ``best_accuracy``.

    Args:
        epoch: Zero-based epoch index; the file is named with ``epoch + 1``.
        model: Model whose ``state_dict()`` is written.
        accuracy: Accuracy reached in this epoch.
        best_accuracy: Best accuracy seen so far.

    Returns:
        The updated best accuracy (``accuracy`` if it was an improvement,
        otherwise the unchanged ``best_accuracy``).
    """
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        checkpoint_path = os.path.join(checkpoint_dir, f'best_model_epoch_{epoch + 1}.pth')
        torch.save(model.state_dict(), checkpoint_path)
    return best_accuracy


# Training function
def train(epoch):
    """Run one training epoch over ``train_loader``.

    Args:
        epoch: Zero-based epoch index (used for log messages only).

    Returns:
        Tuple ``(mean_batch_loss, train_accuracy_percent)``.
    """
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Forward pass and loss
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Running training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{total_epoch}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    train_accuracy = 100 * correct_train / total_train
    mean_loss = running_loss / len(train_loader)
    print(f"训练准确率 (Epoch {epoch + 1}): {train_accuracy:.2f}%")
    print(f"Epoch [{epoch + 1}], 平均损失: {mean_loss:.4f}")
    return mean_loss, train_accuracy


# Evaluation function
def test(epoch):
    """Evaluate the model on ``test_loader`` and update the early-stopping state.

    Prints accuracy and weighted precision/recall/F1, shows the confusion
    matrix and the ROC curve (class 1 treated as the positive class),
    delegates checkpointing to ``save_model`` and updates the module-level
    ``best_accuracy`` and ``epochs_without_improvement``.

    Args:
        epoch: Zero-based epoch index (used for log messages and checkpoints).

    Returns:
        Tuple ``(mean_test_loss, accuracy_percent)``.
    """
    global best_accuracy, epochs_without_improvement
    model.eval()
    correct, total = 0, 0
    test_loss = 0.0
    all_labels = []
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)

            test_loss += criterion(outputs, labels).item()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Keep labels, predictions and class probabilities for the metrics below.
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())
            all_probs.extend(torch.softmax(outputs, dim=1).cpu().numpy())

    accuracy = 100 * correct / total
    print(f"测试准确率 (Epoch {epoch + 1}): {accuracy:.2f}%")

    # Weighted precision, recall and F1
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"精确率 (Precision): {precision:.4f}")
    print(f"召回率 (Recall): {recall:.4f}")
    print(f"F1分数 (F1 Score): {f1:.4f}")

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['✓', '×'], yticklabels=['✓', '×'])
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.title('混淆矩阵')
    plt.show()

    # ROC/AUC using the probability of class 1 as the positive score
    fpr, tpr, _ = roc_curve(all_labels, [prob[1] for prob in all_probs])
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC曲线 (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假阳性率')
    plt.ylabel('真正率')
    plt.title('接收者操作特征 (ROC)')
    plt.legend(loc="lower right")
    plt.show()

    # Early stopping: decide whether this epoch improved BEFORE best_accuracy
    # is overwritten. Comparing afterwards can never be true, so the counter
    # would grow every epoch and training would always stop after `patience`
    # epochs.
    improved = accuracy > best_accuracy
    best_accuracy = save_model(epoch, model, accuracy, best_accuracy)
    if improved:
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    return test_loss / len(test_loader), accuracy


# Training and evaluation driver
def main():
    """Train for up to ``total_epoch`` epochs with LR scheduling and early stopping.

    After every epoch the model is evaluated, losses and accuracies are
    recorded and the learning-rate scheduler is stepped. Training stops early
    once ``epochs_without_improvement`` reaches ``patience``. Finally the loss
    and accuracy curves are plotted.
    """
    start_time = time.time()

    # Histories for the curves
    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(total_epoch):
        train_loss, train_accuracy = train(epoch)
        test_loss, accuracy = test(epoch)

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(accuracy)

        # The scheduler is created in 'min' mode, so it must monitor a value
        # that decreases as the model improves: the test loss. Feeding it the
        # accuracy would lower the learning rate whenever accuracy keeps rising.
        scheduler.step(test_loss)

        # Early stopping after `patience` epochs without improvement
        if epochs_without_improvement >= patience:
            print("由于没有提升,提前停止训练。")
            break

    end_time = time.time()
    print(f"训练完成,总耗时 {(end_time - start_time):.2f} 秒。")

    # Loss curves
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label="训练损失")
    plt.plot(range(1, len(test_losses) + 1), test_losses, label="测试损失")
    plt.xlabel('轮次 (Epoch)')
    plt.ylabel('损失 (Loss)')
    plt.title('训练与测试损失曲线')
    plt.legend()
    plt.show()

    # Accuracy curves
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label="训练准确率")
    plt.plot(range(1, len(test_accuracies) + 1), test_accuracies, label="测试准确率")
    plt.xlabel('轮次 (Epoch)')
    plt.ylabel('准确率 (Accuracy)')
    plt.title('训练与测试准确率曲线')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()
训练参数:
- 批次大小:32
- 初始学习率:0.0005
- 优化器:Adam(带L2正则化)
- 正则化参数:0.0001
- 损失函数:交叉熵损失
- Dropout率:0.5
- 早停策略:10轮准确率无提升则停止
- 学习率调度:ReduceLROnPlateau
评估指标:
- 准确率(Accuracy)
- 精确率(Precision)
- 召回率(Recall)
- F1分数
- 混淆矩阵
- 二分类AUC-ROC曲线
模型预测与应用
ABCD字母预测(test2.py)
预处理与预测流程:
- 加载训练好的模型
- 图像预处理:
  - 顺时针旋转90度
  - 调整为64×64大小
  - 转换为RGB格式
  - 标准化处理
- 模型推理获取预测类别
使用示例:
import torch
from torchvision import transforms, models
from PIL import Image
import os# 配置设备
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebuild the architecture used for training. No pre-trained weights are
# requested because every parameter is overwritten by the checkpoint below.
model = models.resnet50(weights=None)
model.fc = torch.nn.Sequential(
    torch.nn.Linear(model.fc.in_features, 512),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.3),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 4),  # 4 output classes (A, B, C, D)
)

# Load the best checkpoint
checkpoint_path = './checkpoints/best_model_epoch_2.pth'  # adjust to the actual path
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model = model.to(device)
model.eval()  # evaluation mode

# Image transform. Each step sits on its own line so that no inline comment
# can swallow the steps that follow it.
transform = transforms.Compose([
    transforms.Lambda(lambda x: x.rotate(-90, expand=True)),  # rotate 90 degrees clockwise
    transforms.Resize((64, 64)),
    transforms.Lambda(lambda x: x.convert("RGB")),  # force RGB
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Directory with the images to classify
image_dir = r"./exam_13"  # adjust to the actual path
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')
# First 20 image files, in sorted order so the selection is deterministic;
# non-image entries are skipped because they cannot be opened as images.
image_files = sorted(
    name for name in os.listdir(image_dir)
    if name.lower().endswith(image_extensions)
)[:20]
# Rotate the first image and save it as a visual sample
if image_files:
    first_image_path = os.path.join(image_dir, image_files[0])
    original_image = Image.open(first_image_path)
    # Rotate 90 degrees clockwise (same rotation as the inference transform).
    # Convert to RGB first-class output: JPEG cannot store an alpha channel or
    # a palette, so saving an RGBA/P image as .jpg would raise an error.
    rotated_image = original_image.rotate(-90, expand=True).convert("RGB")
    save_path = os.path.join(os.getcwd(), "rotated_sample.jpg")
    rotated_image.save(save_path)
    print(f"已将旋转后的第一张图片保存到: {save_path}")
# Class index -> letter
class_labels = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}

# Classify every selected image
for image_name in image_files:
    image_path = os.path.join(image_dir, image_name)

    # Open as RGB (avoids problems with RGBA input), preprocess and add the batch dimension
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)

    # Inference
    with torch.no_grad():
        outputs = model(image)
        _, predicted = torch.max(outputs.data, 1)  # index of the highest score

    # Report the result
    predicted_label = predicted.item()
    print(f"图片 {image_name} 的预测结果是:{class_labels[predicted_label]}")
✓×符号预测
预处理与预测流程:
- 加载训练好的二分类模型
- 图像预处理:
  - 调整为64×64大小
  - 标准化处理
- 模型推理获取预测类别
- 将数字标签(0,1)映射为符号(✓,×)
使用示例:
from ye import model, device
import torch
from torchvision import models, transforms
from PIL import Image


def predict_image(image_path):
    """Classify one image as ✓ or × with the two-class model and print the result.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The predicted symbol, ``'✓'`` or ``'×'``.
    """
    # Reload the trained weights; map_location lets a checkpoint saved on a
    # GPU load on a CPU-only machine.
    state_dict = torch.load('./checkpoints_2class/best_model_epoch_7.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # Preprocessing: resize, convert to tensor, normalise
    transform = transforms.Compose([
        transforms.Resize((64, 64)),  # uniform size
        transforms.ToTensor(),  # to tensor
        transforms.Normalize((0.5,), (0.5,)),  # normalise
    ])

    img = Image.open(image_path).convert('RGB')
    img = transform(img).unsqueeze(0).to(device)  # add the batch dimension

    # Inference
    with torch.no_grad():
        outputs = model(img)
        _, predicted = torch.max(outputs, 1)

    # The model has two outputs: index 0 is ✓ and index 1 is ×, matching the
    # labels used when the dataset was built (0: ✓, 1: ×). The four-letter list
    # used here before belongs to the A/B/C/D model.
    class_names = ['✓', '×']
    predicted_class = class_names[predicted.item()]
    print(f"Predicted Class: {predicted_class}")
    return predicted_class


if __name__ == "__main__":
    predict_image("./exam_13/cell_021.png")
模型性能与评估
两个模型均采用了以下方法进行性能评估:
- 准确率/损失曲线:展示训练过程中模型性能的变化趋势
- 混淆矩阵:直观显示分类结果,识别模型的强项和弱项
- ROC曲线和AUC指标:评估模型的判别能力
- 精确率、召回率和F1分数:全面评估模型性能
技术难点与解决方案
- 数据预处理挑战:
  - 手写符号多样性大,样式变化多
  - 解决方案:应用多种数据增强技术,提高模型泛化能力
- 图像质量问题:
  - 手写图像可能存在模糊、RGBA格式等问题
  - 解决方案:统一转换为RGB格式,应用适当的预处理步骤
- 旋转角度校正:
  - 手写图像方向可能不一致
  - 解决方案:根据需要添加旋转预处理,确保图像方向一致
- 类别不平衡:
  - 不同字母或符号的样本数量可能不均衡
  - 解决方案:数据增强时对少数类别进行更多增强
未来改进方向
- 增加更多字母和符号类别,扩展识别范围
- 尝试其他深度学习模型架构,如EfficientNet或Vision Transformer
- 添加检测模块,实现自动定位和识别
- 开发移动端应用,实现实时手写识别
- 优化模型大小,提高推理速度
总结
本项目成功实现了两个手写符号识别模型,分别针对ABCD字母和✓×符号。通过迁移学习和精心设计的数据预处理流程,模型展现出良好的识别性能。此外,项目提供了完整的训练、评估和预测代码,可以作为其他手写符号识别任务的参考和基础。