Convolutional Neural Networks (CNN)
1. Objectives and Requirements
- Understand how convolutional neural networks are applied in image processing;
- Build and train a basic CNN to solve an image classification problem.
2. Experiment Content
On the image classification problem posed by the public CIFAR-10 dataset, build a convolutional neural network containing convolutional layers, activation functions, pooling layers, and fully connected layers; try different optimizers and hyperparameter settings; and record experimental results such as the model's performance on the training and validation sets. The specific steps are as follows:
- Download the public CIFAR-10 image classification dataset.
- Analyze and briefly describe the characteristics of the dataset.
- Build a basic CNN containing convolutional layers, activation functions, pooling layers, and fully connected layers.
- Initialize the network parameters and train the model.
- Try different optimizers and tune the hyperparameters.
- Observe and record the model's performance on the training and validation sets.
- Evaluate and test the model.
3. Experimental Method
Download the CIFAR-10 dataset as shown below: set the download parameter to True, so that the dataset is downloaded automatically when it is not found under the specified root.
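A minimal sketch of that call (root='./CIFAR-10' matches the path used in the full code below; with download=True the files are only fetched if they are missing):

import torchvision

train_dataset = torchvision.datasets.CIFAR10(root='./CIFAR-10', train=True, download=True)
test_dataset = torchvision.datasets.CIFAR10(root='./CIFAR-10', train=False, download=True)
print(len(train_dataset), len(test_dataset))  # 50000 10000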
CIFAR-10 is a widely used computer vision dataset for image classification tasks. It contains color images in 10 classes, with 6,000 images of 32x32 pixels per class (at 32x32 pixels, even the human eye struggles to recognize the objects). The 10 classes are: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck.
Build a CNN (not the final model) as a first attempt at this classification task; a shape trace for these layers follows the list:
Layer 1: convolution; the input is an RGB image, so 3 input channels, 32 output channels, 3x3 kernel, followed by a ReLU activation.
Layer 2: max pooling, 2x2 kernel, stride 2.
Layer 3: convolution, 32 input channels, 64 output channels, 3x3 kernel, followed by a ReLU activation.
Layer 4: max pooling, 2x2 kernel, stride 2.
Layer 5: fully connected, input dimension 64*6*6, output dimension 64, followed by a ReLU activation.
Layer 6: fully connected, input dimension 64, output dimension 10 (one per class).
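Where does the 64*6*6 input size of the first fully connected layer come from? A quick shape trace (a sketch; the layer settings match the model defined below):

import torch
import torch.nn as nn

x = torch.randn(1, 3, 32, 32)  # one fake CIFAR-10 image
x = nn.Conv2d(3, 32, 3)(x)     # -> [1, 32, 30, 30]  (32 - 3 + 1, no padding)
x = nn.MaxPool2d(2, 2)(x)      # -> [1, 32, 15, 15]
x = nn.Conv2d(32, 64, 3)(x)    # -> [1, 64, 13, 13]
x = nn.MaxPool2d(2, 2)(x)      # -> [1, 64, 6, 6]   (floor(13 / 2) = 6)
print(x.shape)                 # torch.Size([1, 64, 6, 6]), hence fc1 input 64*6*6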
Write a training function and a test function, with SGD (stochastic gradient descent) as the optimizer and cross-entropy as the loss function; set the number of training epochs, choose the batch size, print progress at suitable intervals, and visualize the results.
4. Detailed Model Design and Results
- Model design and results
Import the required libraries; matplotlib is used for visualization, to observe how the metrics change.
import numpy as np
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
from matplotlib import pyplot as plt
The CNN model is defined as follows, with the parameters described above.
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.fc1 = nn.Linear(64 * 6 * 6, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 6 * 6)  # flatten for the fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
Define a train_model function for training. It takes the model, the training data loader, the test data loader (accuracy is measured on it after every epoch), the loss function, the optimizer, and the number of epochs. It also records the average training loss of each epoch and the test accuracy measured after each epoch.
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs):
    model.to(device)
    train_losses = []
    test_accuracies = []
    for epoch in range(epochs):
        model.train()  # test_model switches to eval mode, so switch back each epoch
        running_loss = 0.0
        epoch_loss = 0.0  # accumulated over the whole epoch for the loss curve
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            epoch_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        # measure test accuracy after each epoch
        test_accuracy = test_model(model, test_loader)
        train_losses.append(epoch_loss / len(train_loader))
        test_accuracies.append(test_accuracy)
    return train_losses, test_accuracies
Define a test_model function that takes the model and the test set, switches the model to evaluation mode, disables gradient computation, and returns the model's accuracy on the test set.
def test_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print('Accuracy of the network on the test images: %d %%' % accuracy)
    return accuracy
Define a visualization function that plots the training loss and the test accuracy to observe how they evolve, and also shows four sample images together with their predicted and true labels.
def visualize(train_losses, test_accuracies, images, labels, predicted_labels):
    # plot the training loss curve
    plt.figure()
    plt.plot(train_losses, color='blue', label='Training Loss')
    for i, loss in enumerate(train_losses):
        plt.scatter(i, loss, color='red')  # mark each loss point
        plt.text(i, loss, f'{loss:.5f}', fontsize=9, va='bottom', ha='center', color='black')  # annotate with the value
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.legend()
    plt.show()
    # plot the test accuracy curve
    plt.figure()
    plt.plot(test_accuracies, color='green', label='Test Accuracy')
    for i, acc in enumerate(test_accuracies):
        plt.scatter(i, acc, color='orange')  # mark each accuracy point
        plt.text(i, acc, f'{acc:.2f}%', fontsize=9, va='bottom', ha='center', color='black')  # annotate with the value
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Test Accuracy')
    plt.legend()
    plt.show()
    # CIFAR-10 class names
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    # show the first 4 samples of the batch with their labels
    fig, axes = plt.subplots(1, 4, figsize=(10, 3))
    for i, ax in enumerate(axes):
        img = images[i].cpu() / 2 + 0.5  # unnormalize from [-1, 1] back to [0, 1] for display
        ax.imshow(np.transpose(img.numpy(), (1, 2, 0)))
        ax.set_title(f'Real: {classes[labels[i].item()]}\nPredicted: {classes[predicted_labels[i].item()]}')
        ax.axis('off')
    plt.show()
The dataset preprocessing is standard: convert the images to tensors and normalize each channel to the range [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),                                  # uint8 [0, 255] -> float [0.0, 1.0]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # [0, 1] -> [-1, 1] per channel
])
train_dataset = torchvision.datasets.CIFAR10(root='./CIFAR-10', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='./CIFAR-10', train=False, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)
Instantiate the model (on the GPU if one is available), define the cross-entropy loss and the SGD optimizer, train for 5 epochs, and then take 4 images for visualization.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # train on GPU if available
model = CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
train_losses, test_accuracies = train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5)
# take one batch of test samples for visualization
images, labels = next(iter(test_loader))
outputs = model(images.to(device))
_, predicted = torch.max(outputs, 1)
visualize(train_losses, test_accuracies, images, labels, predicted)
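The experiment plan also calls for trying different optimizers; a minimal sketch of swapping SGD for Adam (lr=0.001 here is just the common default, not a tuned value):

optimizer = optim.Adam(model.parameters(), lr=0.001)  # adaptive per-parameter learning rates
train_losses, test_accuracies = train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5)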
The results are shown above: the accuracy stays below 70%, which is not satisfactory. Let's try increasing the network depth.
The modified network (deeper, with more parameters) is as follows, again trained for 5 epochs:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1, 1)   # 3x3 kernel, stride 1, padding 1 keeps the spatial size
        self.conv2 = nn.Conv2d(32, 64, 3, 1, 1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 3, 1, 1)
        self.conv4 = nn.Conv2d(128, 128, 3, 1, 1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bn2 = nn.BatchNorm2d(128)
        self.fc1 = nn.Linear(128 * 8 * 8, 64)    # 32x32 -> 16x16 -> 8x8 after the two poolings
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.pool1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.pool2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = x.view(-1, 128 * 8 * 8)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
The accuracy indeed improves considerably, reaching 76.48%, which shows that increasing the network depth and adjusting the network structure is an effective approach.
- Model highlights
Compared with the version before the change, this model is relatively deep, containing multiple convolutional and pooling layers, which helps extract complex features from the input images. The revised network uses Batch Normalization layers, which help speed up convergence and improve the model's stability and generalization. Max pooling layers reduce the spatial size of the feature maps, cutting the number of parameters and the amount of computation. The last few layers are fully connected: the feature maps are flattened and then classified.
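To put a number on "more parameters", a small sketch that counts trainable parameters (the totals in the comment are what the layer shapes above imply; treat them as approximate):

def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_params(CNN()))  # roughly 775k for this deeper network, vs. roughly 168k for the first model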
5. Summary
This experiment was carried out on the CIFAR-10 dataset. Although the sample count is large enough, the images are only 32x32 pixels, which is fine for testing a simple CNN but has little practical significance: real-world images are almost never this blurry. To test networks of real practical value, datasets such as VOC or COCO are more appropriate.
This experiment also showed how powerful convolution is for feature extraction, and from studying the related material, convolutional neural networks are comparatively interpretable. Convolutional layers are usually paired with pooling layers, which may upsample or downsample; the pooling used in this experiment is max pooling for downsampling, and average pooling is another solid option. For the final multi-class classification, the tensor must be flattened before it can enter the fully connected layers; fully connected networks remain a major tool for classification tasks.
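For instance, trying average pooling in the first model would be a one-line change (a sketch only; not something measured in this experiment):

self.pool = nn.AvgPool2d(2, 2)  # replaces nn.MaxPool2d(2, 2); averages each 2x2 window instead of taking the max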
This experiment could also be extended to evaluate each class separately and see which class trips up the CNN the most (i.e., which class has the lowest accuracy), as sketched below. Compared with the handwritten digit dataset of Experiment 1, a CNN there would certainly achieve very good results with just a few convolutional layers.
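A minimal sketch of that per-class evaluation, reusing test_loader and device from the code above and the classes tuple from the visualize function:

def per_class_accuracy(model, test_loader, classes):
    correct = [0] * len(classes)
    total = [0] * len(classes)
    model.eval()
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            _, predicted = torch.max(model(images), 1)
            for label, pred in zip(labels, predicted):
                total[label.item()] += 1
                correct[label.item()] += int(label.item() == pred.item())
    for name, c, t in zip(classes, correct, total):
        print(f'{name}: {100 * c / t:.1f}%')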
In this experiment, every round of training and testing took at least 15 minutes, and that was with an NVIDIA GPU, a small dataset, and a simple network structure. This shows how demanding deep learning is on compute; no wonder NVIDIA's Jensen Huang is raking it in.
Appendix: complete code
import numpy as np
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
from matplotlib import pyplot as plt
class CNN(nn.Module):
def __init__(self):
super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1, 1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 3, 1, 1)
        self.conv4 = nn.Conv2d(128, 128, 3, 1, 1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bn2 = nn.BatchNorm2d(128)
        self.fc1 = nn.Linear(128 * 8 * 8, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 10)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.pool1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.conv3(x)
x = self.conv4(x)
x = self.pool2(x)
x = self.bn2(x)
x = F.relu(x)
        x = x.view(-1, 128 * 8 * 8)
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = F.relu(x)
x = self.fc3(x)
return x
# Train the model
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs):
    model.to(device)
    train_losses = []
    test_accuracies = []
    for epoch in range(epochs):
        model.train()  # test_model switches to eval mode, so switch back each epoch
        running_loss = 0.0
        epoch_loss = 0.0  # accumulated over the whole epoch for the loss curve
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            epoch_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        # measure test accuracy after each epoch
        test_accuracy = test_model(model, test_loader)
        train_losses.append(epoch_loss / len(train_loader))
        test_accuracies.append(test_accuracy)
    return train_losses, test_accuracies
# Test the model
def test_model(model, test_loader):
model.eval()
correct = 0
total = 0
with torch.no_grad():
for data in test_loader:
images, labels = data[0].to(device), data[1].to(device) # Move data to GPU
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print('Accuracy of the network on the test images: %d %%' % accuracy)
return accuracy
# Visualization function
def visualize(train_losses, test_accuracies, images, labels, predicted_labels):
    # plot the training loss curve
    plt.figure()
    plt.plot(train_losses, color='blue', label='Training Loss')
    for i, loss in enumerate(train_losses):
        plt.scatter(i, loss, color='red')  # mark each loss point
        plt.text(i, loss, f'{loss:.5f}', fontsize=9, va='bottom', ha='center', color='black')  # annotate with the value
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.legend()
    plt.show()
    # plot the test accuracy curve
    plt.figure()
    plt.plot(test_accuracies, color='green', label='Test Accuracy')
    for i, acc in enumerate(test_accuracies):
        plt.scatter(i, acc, color='orange')  # mark each accuracy point
        plt.text(i, acc, f'{acc:.2f}%', fontsize=9, va='bottom', ha='center', color='black')  # annotate with the value
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Test Accuracy')
    plt.legend()
    plt.show()
    # CIFAR-10 class names
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    # show the first 4 samples of the batch with their labels
    fig, axes = plt.subplots(1, 4, figsize=(10, 3))
    for i, ax in enumerate(axes):
        img = images[i].cpu() / 2 + 0.5  # unnormalize from [-1, 1] back to [0, 1] for display
        ax.imshow(np.transpose(img.numpy(), (1, 2, 0)))
        ax.set_title(f'Real: {classes[labels[i].item()]}\nPredicted: {classes[predicted_labels[i].item()]}')
        ax.axis('off')
    plt.show()
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
train_dataset = torchvision.datasets.CIFAR10(root='./CIFAR-10', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='./CIFAR-10', train=False, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
train_losses, test_accuracies = train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5)
# take one batch of test samples for visualization
images, labels = next(iter(test_loader))
outputs = model(images.to(device))
_, predicted = torch.max(outputs, 1)
visualize(train_losses, test_accuracies, images, labels, predicted)