Machine Learning hw03: CNN
Assignment Description
Classify food images with Convolutional Neural Networks.
The food images in the dataset were collected from the web and cover 11 classes:
Bread, Dairy product, Dessert, Egg, Fried food, Meat, Noodles/Pasta, Rice, Seafood, Soup, Vegetable/Fruit
Each class is represented by a single number.
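Presumably the classes map to the digits 0-10 in the order listed above; that ordering is an assumption here, and the authoritative label is the numeric prefix in each training filename. As a reference:

# Assumed label-to-name mapping, following the order of the class list above
label_names = {
    0: 'Bread', 1: 'Dairy product', 2: 'Dessert', 3: 'Egg',
    4: 'Fried food', 5: 'Meat', 6: 'Noodles/Pasta', 7: 'Rice',
    8: 'Seafood', 9: 'Soup', 10: 'Vegetable/Fruit',
}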
Data Description
Dataset sizes:
- Training set: 9,866 images
- Validation set: 3,430 images
- Testing set: 3,347 images
Images in the training and validation folders are named [class]_[index].jpg; the image sizes vary.
Images in the testing folder are named [index].jpg; the image sizes vary.
The saved prediction file has two columns: the first is Id, the second is Category.
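For illustration, the first few rows of a submission could look like this (the Category values here are invented):

Id,Category
0,3
1,5
2,10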
Procedure
Import the required libraries
Installing cv2:
pip install opencv-python
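A quick way to confirm the installation succeeded:

# Sanity check: OpenCV should import cleanly and report its version
import cv2
print(cv2.__version__)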
# Import the required libraries
import os
import numpy as np
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import time
Reading the images
Read the images with OpenCV (cv2) and store them in numpy arrays.
# Define a function that reads the images
def readfile(path, label):
    # label is a boolean indicating whether to also return the labels y
    image_dir = sorted(os.listdir(path))  # os.listdir() lists the files in the folder; sorted() sorts any iterable
    # image_dir holds all image filenames
    x = np.zeros((len(image_dir), 128, 128, 3), dtype = np.uint8)  # uint8 (range 0-255) is the standard dtype for image data (RGB, grayscale, etc.)
    # x holds the images, each stored as 128 (height) x 128 (width) x 3 (color channels)
    y = np.zeros((len(image_dir)), dtype = np.uint8)
    # y holds the labels, one per image
    for i, file in enumerate(image_dir):  # enumerate() pairs each element with its index
        img = cv2.imread(os.path.join(path, file))  # os.path.join() builds the full image path; cv2.imread(filepath) reads the file
        x[i, :, :] = cv2.resize(img, (128, 128))
        # cv2.resize() rescales images of varying sizes to a uniform 128 (height) x 128 (width)
        if label:
            y[i] = int(file.split('_')[0])  # the class number is the filename prefix
            # str.split('_')[0] is the substring before the first '_';
            # more generally, str.split('_')[n] is the substring between the n-th and (n+1)-th '_'
    if label:
        return x, y
    else:
        return x
# Read the training set, validation set, and testing set with readfile()
workspace_dir = './hw3/food-11'
print('Reading data')
train_x, train_y = readfile(os.path.join(workspace_dir, 'training'), True)
print('Size of training data = {}'.format(len(train_x)))
val_x, val_y = readfile(os.path.join(workspace_dir, 'validation'), True)
print('Size of validation data = {}'.format(len(val_x)))
test_x = readfile(os.path.join(workspace_dir, 'testing'), False)
print('Size of testing data = {}'.format(len(test_x)))
Reading data
Size of training data = 9866
Size of validation data = 3430
Size of testing data = 3347
Defining the Dataset
In PyTorch, data can be wrapped with the Dataset and DataLoader classes from torch.utils.data.
A Dataset must override two methods:
- __len__: returns the size of the dataset
- __getitem__: defines how the dataset returns a sample when indexed with []
# Preprocess the images with transforms
# during training, augment the data with random rotations and horizontal flips (data augmentation)
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),  # randomly flip the image horizontally
    transforms.RandomRotation(15),      # randomly rotate the image by up to 15 degrees
    transforms.ToTensor(),              # convert to a Tensor and normalize values to [0, 1]
])
# no data augmentation at test time
test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])
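As a quick sanity check (assuming train_x from the readfile step above), applying train_transform to one raw uint8 image should produce a [3, 128, 128] float tensor with values in [0, 1]:

# Apply the training transform to a single image and inspect the result
sample = train_transform(train_x[0])
print(sample.shape)  # torch.Size([3, 128, 128])
print(sample.min().item(), sample.max().item())  # both within [0, 1]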
class ImgDataset(Dataset):
    def __init__(self, x, y = None, transform = None):
        self.x = x
        self.y = y
        # labels must be LongTensor for CrossEntropyLoss
        if y is not None:
            self.y = torch.LongTensor(y)
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        X = self.x[index]
        if self.transform is not None:
            X = self.transform(X)
        if self.y is not None:
            Y = self.y[index]
            return X, Y
        else:
            return X
batch_size = 128
train_set = ImgDataset(train_x, train_y, train_transform)
val_set = ImgDataset(val_x, val_y, test_transform)
train_loader = DataLoader(train_set, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_set, batch_size = batch_size, shuffle = False)
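To confirm what the loaders yield, it is worth inspecting one batch (a quick check, not part of the original notes):

# One batch from train_loader: images [batch_size, 3, 128, 128], labels [batch_size]
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)  # torch.Size([128, 3, 128, 128]) torch.Size([128])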
Defining the model
Each convolutional block consists of a convolution layer (Conv), batch normalization (BatchNorm), a ReLU activation, and max pooling (MaxPool).
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        # input dimensions: [3, 128, 128]
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        # torch.nn.MaxPool2d(kernel_size, stride, padding)
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),    # output [64, 128, 128]
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),        # output [64, 64, 64]

            nn.Conv2d(64, 128, 3, 1, 1),  # output [128, 64, 64]
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),        # output [128, 32, 32]

            nn.Conv2d(128, 256, 3, 1, 1), # output [256, 32, 32]
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),        # output [256, 16, 16]

            nn.Conv2d(256, 512, 3, 1, 1), # output [512, 16, 16]
            nn.BatchNorm2d(512),          # must match the 512 output channels above
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),        # output [512, 8, 8]

            nn.Conv2d(512, 512, 3, 1, 1), # output [512, 8, 8]
            nn.BatchNorm2d(512),          # must match the 512 output channels above
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),        # output [512, 4, 4]
        )
        # fully connected feed-forward classifier head
        self.fc = nn.Sequential(
            nn.Linear(512*4*4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 11)  # 11 output classes
        )

    def forward(self, x):
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)  # flatten to one dimension per sample
        return self.fc(out)
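Each of the five MaxPool2d layers halves the spatial size, so a 128x128 input ends up as 512 feature maps of 4x4, matching the 512*4*4 input of the first Linear layer. A quick shape check (a sketch, run on CPU with an untrained model):

# Dummy batch of two 3x128x128 images; the output should be [2, 11]
dummy = torch.zeros(2, 3, 128, 128)
print(Classifier()(dummy).shape)  # torch.Size([2, 11])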
Training
Train on the training set, and use the validation set to select the best parameters.
model = Classifier().cuda()
loss = nn.CrossEntropyLoss()  # classification loss: cross-entropy
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)  # Adam optimizer
num_epoch = 30  # number of training epochs

for epoch in range(num_epoch):
    epoch_start_time = time.time()
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    model.train()  # make sure the model is in training mode
    # the same model object serves both phases: model.train() and model.eval()
    # switch it between the training and evaluation phases
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()  # zero the parameter gradients held by the optimizer
        train_pred = model(data[0].cuda())  # predicted distribution; this invokes the model's forward function
        batch_loss = loss(train_pred, data[1].cuda())  # compute the loss (prediction and label must be on the same device)
        batch_loss.backward()  # backpropagation: compute each parameter's gradient
        optimizer.step()  # update the parameters with the computed gradients
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis = 1) == data[1].numpy())
        train_loss += batch_loss.item()

    # validation
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            val_pred = model(data[0].cuda())
            batch_loss = loss(val_pred, data[1].cuda())
            val_acc += np.sum(np.argmax(val_pred.cpu().data.numpy(), axis = 1) == data[1].numpy())
            val_loss += batch_loss.item()

    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' %
          (epoch + 1, num_epoch, time.time() - epoch_start_time,
           train_acc / len(train_set), train_loss / len(train_set),
           val_acc / len(val_set), val_loss / len(val_set)))
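The loop above tracks validation accuracy but never saves a checkpoint. A minimal sketch of the model-selection step (the best_acc variable and the 'best_model.pth' filename are illustrative additions, not from the original notes):

best_acc = 0.0  # initialize once, before the epoch loop

# ...at the end of each epoch, after val_acc has been accumulated:
if val_acc / len(val_set) > best_acc:
    best_acc = val_acc / len(val_set)
    torch.save(model.state_dict(), 'best_model.pth')  # keep the best weights so far

# ...after all epochs, restore the best checkpoint before testing:
model.load_state_dict(torch.load('best_model.pth'))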
Testing
Run the trained model on the test set.
test_set = ImgDataset(test_x, transform = test_transform)
test_loader = DataLoader(test_set, batch_size = batch_size, shuffle = False)

model.eval()
prediction = []
with torch.no_grad():
    for i, data in enumerate(test_loader):
        test_pred = model(data.cuda())
        # the index of the highest predicted probability is the predicted food class
        test_label = np.argmax(test_pred.cpu().data.numpy(), axis = 1)
        for y in test_label:
            prediction.append(y)

with open('predict.csv', 'w') as f:
    f.write('Id,Category\n')
    for i, y in enumerate(prediction):
        f.write('{},{}\n'.format(i, y))
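Since pandas is imported at the top but never used, the same file can equivalently be written with it (an alternative sketch, not the original approach):

# Write the same two-column submission with pandas
df = pd.DataFrame({'Id': range(len(prediction)), 'Category': prediction})
df.to_csv('predict.csv', index = False)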