当前位置：首页 > news >正文

从零构建CNN：框架与自定义实现对比

news 2025/8/23 19:46:53

文章目录

- 引言
- 项目结构
- 一、代码结构解析
  - 1.1 训练流程控制 (main.py)
  - 1.2 PyTorch实现的CNN模型 (cnn_pytorch.py)
  - 1.3 自定义实现CNN模型 (cnn_custom.py)
- 二、关键算法细节剖析
  - 2.1 卷积操作
  - 2.2 自定义实现卷积层
  - 2.3 ReLU与池化
  - 2.4 全连接层
- 总结

引言

卷积神经网络 (Convolutional Neural Network, CNN) 是图像识别和处理中的核心技术，特别在计算机视觉任务中广泛应用。本文通过在一个简单的图像分类任务中对比PyTorch实现与自定义实现两种方案，解析CNN的关键技术细节。

项目结构

.
├── README.md
├── cnn_pytorch.py
├── cnn_custom.py
└── main.py

项目地址：https://github.com/tangpan360/cnn-from-scratch.git

一、代码结构解析

1.1 训练流程控制 (main.py)

# main.py
import torch
import torch.optim as optim
from cnn_pytorch import SimpleCNN  # 导入模型类
from cnn_custom import SimpleCNNCustom  # 导入自定义模型

# Hyperparameters
batch_size = 4  # images per mini-batch
channels = 2  # input channels
height = 32  # image height in pixels
width = 32  # image width in pixels
num_classes = 10  # number of target classes
epochs = 10  # full passes over the training set
learning_rate = 0.001  # step size handed to Adam

# Random tensors stand in for a real training set.
num_samples = 100
train_data = torch.randn(num_samples, channels, height, width)
train_labels = torch.randint(0, num_classes, (num_samples,))

# Swap the two lines below to train the PyTorch reference model instead
# of the hand-written one.
# model = SimpleCNN(num_classes=num_classes)
model = SimpleCNNCustom(num_classes=num_classes)

# Cross-entropy loss computed from the model outputs and integer labels.
criterion = torch.nn.CrossEntropyLoss()

# Adam over every parameter the model exposes.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: each epoch visits the whole dataset once, in shuffled
# mini-batches of `batch_size` samples.
for epoch in range(epochs):
    model.train()  # switch to training mode

    # A fresh permutation per epoch so batches differ between epochs.
    order = torch.randperm(num_samples)
    running_loss = 0.0

    for start in range(0, num_samples, batch_size):
        batch_idx = order[start:start + batch_size]
        inputs = train_data[batch_idx]
        labels = train_labels[batch_idx]

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight by the actual batch size so a short final batch does not
        # skew the epoch mean.
        running_loss += loss.item() * inputs.size(0)

    # Report the mean per-sample loss of the epoch.
    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / num_samples:.4f}")

1.2 PyTorch实现的CNN模型 (cnn_pytorch.py)

# cnn_pytorch.py
import torch
import torch.nn as nn

# 定义包含两层卷积的 CNN 模型
class SimpleCNN(nn.Module):
    """Two-block CNN classifier built from stock ``torch.nn`` layers.

    Each block is ``Conv2d(3x3, stride 1, padding 1) -> ReLU ->
    MaxPool2d(2x2, stride 2)``. The convolution keeps the spatial size and
    the pooling halves it (rounding down), so after two blocks every spatial
    dimension is ``size // 4``. The flattened features feed one linear layer
    that emits ``num_classes`` values per sample.

    Args:
        num_classes: Number of outputs of the final linear layer.
        in_channels: Number of channels of the input images.
        input_size: ``(height, width)`` of the input images; used to size the
            linear layer. Both values must be at least 4.

    Raises:
        ValueError: If either entry of ``input_size`` is smaller than 4, which
            would leave no features after the two pooling steps.
    """

    def __init__(self, num_classes=10, in_channels=2, input_size=(32, 32)):
        super().__init__()
        height, width = input_size
        if height < 4 or width < 4:
            raise ValueError(
                f"input_size must be at least (4, 4), got {tuple(input_size)}"
            )

        # Block 1: in_channels -> 16 feature maps.
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=16,
                               kernel_size=3, stride=1, padding=1)
        # Block 2: 16 -> 32 feature maps.
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32,
                               kernel_size=3, stride=1, padding=1)

        # Stateless layers shared by both blocks.
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Two 2x2 poolings shrink each spatial dimension to size // 4.
        self.fc1 = nn.Linear(32 * (height // 4) * (width // 4), num_classes)

    def forward(self, x):
        """Map a ``(batch, in_channels, height, width)`` tensor to class scores.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # Block 1: convolution -> ReLU -> pooling.
        x = self.pool(self.relu(self.conv1(x)))

        # Block 2: convolution -> ReLU -> pooling.
        x = self.pool(self.relu(self.conv2(x)))

        # Collapse everything but the batch dimension for the linear layer.
        x = torch.flatten(x, start_dim=1)
        return self.fc1(x)

1.3 自定义实现CNN模型 (cnn_custom.py)

# cnn_custom.py
import torch
import torch.nn as nn
import torch.nn.functional as F


# 手动实现卷积操作
class Conv2dCustom(nn.Module):
    """2-D convolution (cross-correlation) computed with explicit Python loops.

    The layer slides a square ``kernel_size x kernel_size`` window over the
    zero-padded input and, for every output position, multiplies the window
    with every kernel element-wise and sums over input channels and the two
    kernel axes. It is written for readability, not speed.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of kernels, i.e. channels of the output tensor.
        kernel_size: Side length of the square kernel.
        stride: Step between neighbouring windows, used for both axes.
        padding: Number of zeros added to each side of both spatial axes.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        # Allocate uninitialised storage; the Xavier-uniform call right below
        # overwrites every element, so no separate random fill is needed.
        self.weight = nn.Parameter(
            torch.empty(out_channels, in_channels, kernel_size, kernel_size)
        )
        nn.init.xavier_uniform_(self.weight)
        self.bias = nn.Parameter(torch.zeros(out_channels))  # bias starts at zero
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        """Convolve ``x`` of shape ``(batch, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, out_channels, out_height, out_width)``
            where ``out = (size + 2 * padding - kernel_size) // stride + 1``.

        Raises:
            RuntimeError: If the channel count of ``x`` differs from the
                ``in_channels`` the layer was built with.
        """
        batch_size, in_channels, height, width = x.size()
        out_channels, expected_channels, kernel_size, _ = self.weight.size()

        # Without this check a single-channel input would silently broadcast
        # against a multi-channel kernel and produce a meaningless result.
        if in_channels != expected_channels:
            raise RuntimeError(
                f"expected input with {expected_channels} channels, "
                f"got {in_channels} channels instead"
            )

        # Output spatial size.
        out_height = (height + 2 * self.padding - kernel_size) // self.stride + 1
        out_width = (width + 2 * self.padding - kernel_size) // self.stride + 1

        # Zero-pad left/right/top/bottom by the same amount.
        x_padded = F.pad(x, (self.padding, self.padding, self.padding, self.padding))

        # Loop-invariant: (1, out_channels, in_channels, k, k) for broadcasting.
        weight = self.weight.unsqueeze(0)

        # Every position is written in the loop below, so the buffer needs no
        # initial value. Its dtype follows the promoted input/weight dtype so
        # float64 (or other) computations are not truncated to float32.
        out = torch.empty(
            batch_size, out_channels, out_height, out_width,
            dtype=torch.result_type(x, self.weight), device=x.device,
        )
        for i in range(out_height):
            h_start = i * self.stride
            h_end = h_start + kernel_size
            for j in range(out_width):
                w_start = j * self.stride
                w_end = w_start + kernel_size

                # Window: (batch, 1, in_channels, k, k) after unsqueeze.
                window = x_padded[:, :, h_start:h_end, w_start:w_end].unsqueeze(1)

                # Multiply with every kernel and reduce over channels and
                # both kernel axes, leaving (batch, out_channels).
                out[:, :, i, j] = (window * weight).sum(dim=(2, 3, 4)) + self.bias

        return out


# 自定义 ReLU 激活函数
class ReLUCustom(nn.Module):
    """Element-wise rectified linear unit: non-positive entries become zero.

    The result keeps the dtype and device of the input. NaN entries are passed
    through unchanged because ``NaN <= 0`` is false. The gradient is 1 for
    positive entries and 0 everywhere else, including exactly at zero.
    """

    def forward(self, x):
        """Return a tensor shaped like ``x`` with non-positive values zeroed."""
        # zeros_like inherits dtype and device from x, so no per-call scalar
        # tensor has to be created and moved to the right device.
        return torch.where(x <= 0, torch.zeros_like(x), x)


# 自定义最大池化层
class MaxPool2dCustom(nn.Module):
    """2-D max pooling computed with explicit Python loops.

    Args:
        kernel_size: Side length of the square pooling window.
        stride: Step between neighbouring windows; defaults to ``kernel_size``
            when ``None``.
        padding: Number of padding elements added to each side of both
            spatial axes before pooling.
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size
        self.padding = padding

    def forward(self, x):
        """Pool ``x`` of shape ``(batch, channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, channels, out_height, out_width)`` with
            the dtype of ``x``, where
            ``out = (size + 2 * padding - kernel_size) // stride + 1``.
        """
        batch_size, in_channels, height, width = x.size()

        # Output spatial size.
        out_height = (height + 2 * self.padding - self.kernel_size) // self.stride + 1
        out_width = (width + 2 * self.padding - self.kernel_size) // self.stride + 1

        x_padded = x
        if self.padding > 0:
            # Padding must never win the max. Zero padding would beat every
            # negative activation, so pad with the smallest representable
            # value instead: -inf for floats, the dtype minimum for integers.
            if x.is_floating_point():
                fill = float("-inf")
            else:
                fill = torch.iinfo(x.dtype).min
            x_padded = F.pad(
                x,
                (self.padding, self.padding, self.padding, self.padding),
                value=fill,
            )

        # Every position is written below; new_empty keeps x's dtype/device.
        out = x.new_empty((batch_size, in_channels, out_height, out_width))
        for i in range(out_height):
            h_start = i * self.stride
            h_end = h_start + self.kernel_size
            for j in range(out_width):
                w_start = j * self.stride
                w_end = w_start + self.kernel_size

                # Maximum over both spatial axes of the current window.
                window = x_padded[:, :, h_start:h_end, w_start:w_end]
                out[:, :, i, j] = window.amax(dim=(2, 3))
        return out


# 自定义全连接层
class LinearCustom(nn.Module):
    """Fully connected layer computing ``flatten(x) @ weight + bias``.

    The weight is stored as ``(in_features, out_features)``, i.e. transposed
    relative to ``nn.Linear``, so the forward pass needs no transpose.

    Args:
        in_features: Number of values per sample after flattening.
        out_features: Number of values produced per sample.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        # Kaiming-normal initialisation with the ReLU gain.
        # NOTE(review): PyTorch derives fan_out from dimension 0 of a 2-D
        # tensor, so with this (in, out) layout mode='fan_out' appears to
        # scale by in_features -- confirm this is the intended variance.
        kernel = torch.empty(in_features, out_features)
        nn.init.kaiming_normal_(kernel, mode='fan_out', nonlinearity='relu')
        self.weight = nn.Parameter(kernel)
        # Bias starts at zero.
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x):
        """Flatten every non-batch dimension of ``x`` and apply the affine map.

        Returns:
            Tensor of shape ``(batch, out_features)``.
        """
        # Keep the batch axis, merge all remaining axes into one.
        flat = x.view(x.size(0), -1)
        return flat @ self.weight + self.bias


# 定义自定义 CNN 模型
class SimpleCNNCustom(nn.Module):
    """Two-block CNN classifier assembled from the hand-written layers.

    Each block is ``Conv2dCustom(3x3, padding 1) -> ReLUCustom ->
    MaxPool2dCustom(2x2, stride 2)``. The convolution keeps the spatial size
    and the pooling halves it (rounding down), so after two blocks every
    spatial dimension is ``size // 4``. The flattened features feed one
    ``LinearCustom`` layer that emits ``num_classes`` values per sample.

    Args:
        num_classes: Number of outputs of the final linear layer.
        in_channels: Number of channels of the input images.
        input_size: ``(height, width)`` of the input images; used to size the
            linear layer. Both values must be at least 4.

    Raises:
        ValueError: If either entry of ``input_size`` is smaller than 4, which
            would leave no features after the two pooling steps.
    """

    def __init__(self, num_classes=10, in_channels=2, input_size=(32, 32)):
        super().__init__()
        height, width = input_size
        if height < 4 or width < 4:
            raise ValueError(
                f"input_size must be at least (4, 4), got {tuple(input_size)}"
            )

        # Block 1: in_channels -> 16 feature maps.
        self.conv1 = Conv2dCustom(in_channels=in_channels, out_channels=16,
                                  kernel_size=3, padding=1)
        # Block 2: 16 -> 32 feature maps.
        self.conv2 = Conv2dCustom(in_channels=16, out_channels=32,
                                  kernel_size=3, padding=1)

        # Stateless layers shared by both blocks.
        self.relu = ReLUCustom()
        self.pool = MaxPool2dCustom(kernel_size=2, stride=2)

        # Two 2x2 poolings shrink each spatial dimension to size // 4.
        self.fc1 = LinearCustom(32 * (height // 4) * (width // 4), num_classes)

    def forward(self, x):
        """Map a ``(batch, in_channels, height, width)`` tensor to class scores.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # Block 1: convolution -> activation -> pooling.
        x = self.pool(self.relu(self.conv1(x)))

        # Block 2: convolution -> activation -> pooling.
        x = self.pool(self.relu(self.conv2(x)))

        # Collapse everything but the batch dimension.
        x = x.view(x.size(0), -1)

        return self.fc1(x)