Reproducing the paper LEARNED STEP SIZE QUANTIZATION (LSQ)

This post shows how to implement a quantized convolutional neural network, covering the quantization of both weights and activations and training on the MNIST dataset. The network uses a custom quantization op and quantized convolution layers, and learns a separate scale (step size) per layer to improve accuracy and speed up convergence. Training and test loss/accuracy are reported along the way.
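
At the core is a fake-quantization round trip: divide by a learned step size (scale), clamp to the signed 8-bit range [-127, 127], round, and multiply back by the step size. A minimal sketch of that round trip (the helpers mirror the quant/dequant functions in the code below; the tensor values and the scale of 0.01 are made up for illustration):

```python
import torch

def quant(x, scale):
    # divide by the step size, clamp to [-127, 127], round to integer levels
    return torch.round(torch.clamp(x / scale, -127, 127))

def dequant(x, scale):
    # map the integer levels back to the original value range
    return x * scale

x = torch.tensor([0.032, -0.490, 5.000])   # made-up activations
scale = 0.01                               # made-up step size
q = quant(x, scale)                        # tensor([  3., -49., 127.])  (500 clamps to 127)
print(dequant(q, scale))                   # tensor([ 0.0300, -0.4900,  1.2700])
```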

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function

class Round(Function):
    """Round to the nearest integer, with a straight-through estimator for the gradient."""

    @staticmethod
    def forward(ctx, input):
        return torch.round(input)

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: treat round() as the identity in the backward pass.
        return grad_output.clone()


def quant(x, scale):
    # Divide by the step size, clamp to the signed 8-bit range [-127, 127], then round (STE).
    return Round.apply(torch.clamp(x / scale, -127, 127))


def dequant(x, scale):
    # Map the integer levels back to the original range ("fake quantization").
    return x * scale

# ********************* Quantized convolution (quantize both activations and weights, then convolve) *********************
class Conv2d_Q(nn.Conv2d):
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=True,
            first_layer=0,
    ):
        super().__init__(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias
        )
        # Learnable step sizes for weights and activations; initialized from the first batch.
        self.weight_scale = torch.nn.Parameter(torch.ones(1), requires_grad=True)
        self.activation_scale = torch.nn.Parameter(torch.ones(1), requires_grad=True)
        self.first_batch = 0
        self.first_layer = first_layer

    def forward(self, input):
        if self.first_batch == 0:
            # LSQ-style initialization on the first batch: s = 2 * E[|v|] / sqrt(Q_P), Q_P = 127.
            # Write into .data so the Parameter objects already registered with the optimizer
            # are kept; reassigning new Parameters here would detach them from the optimizer.
            self.activation_scale.data.fill_(2 * input.abs().mean().item() / 127.0 ** 0.5)
            self.weight_scale.data.fill_(2 * self.weight.abs().mean().item() / 127.0 ** 0.5)
            self.first_batch = 1
        # Fake-quantize activations (skipped for the first layer) and weights
        if not self.first_layer:
            input = dequant(quant(input, self.activation_scale), self.activation_scale)
        q_input = input
        q_weight = dequant(quant(self.weight, self.weight_scale), self.weight_scale)
        # Convolution with the fake-quantized operands
        output = F.conv2d(
            input=q_input,
            weight=q_weight,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups
        )
        return output


class QuanConv2d(nn.Module):
    def __init__(self, input_channels, output_channels,
                 kernel_size=-1, stride=-1, padding=-1, groups=1, last_relu=0, first_layer=0):
        super(QuanConv2d, self).__init__()
        self.last_relu = last_relu
        self.first_layer = first_layer
        self.q_conv = Conv2d_Q(input_channels, output_channels,
                               kernel_size=kernel_size, stride=stride, padding=padding, groups=groups,
                               first_layer=first_layer)
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        if not self.first_layer:
            x = self.relu(x)
        x = self.q_conv(x)
        x = self.bn(x)
        if self.last_relu:
            x = self.relu(x)
        return x


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.quan_model = nn.Sequential(
            QuanConv2d(1, 8, kernel_size=3, stride=1, padding=1, first_layer=1),
            nn.MaxPool2d(kernel_size=2, stride=2),

            QuanConv2d(8, 16, kernel_size=3, stride=1, padding=1),
            QuanConv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2),

            QuanConv2d(32, 10, kernel_size=3, stride=1, padding=1, last_relu=1),
            nn.AvgPool2d(kernel_size=7, stride=1, padding=0),
        )

    def forward(self, x):
        x = self.quan_model(x)
        x = x.view(x.size(0), -1)
        return x
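
As a quick sanity check of the shapes (a hypothetical snippet, assuming the classes above are in scope; the batch size of 4 is arbitrary):

```python
net = Net()
dummy = torch.randn(4, 1, 28, 28)   # a fake batch of MNIST-sized images
out = net(dummy)                    # the first forward pass also initializes the scales
print(out.shape)                    # torch.Size([4, 10]): 28 -> 14 -> 7 after the two max-pools,
                                    # then the 7x7 average pool and the flatten leave 10 logits
```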


import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

device = torch.device('cuda:0')


def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

def train(epoch):
    model.train()

    for batch_idx, (data, target) in enumerate(train_loader):
        # data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item(),
                optimizer.param_groups[0]['lr']))
    return


def test():
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            # data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()
    test_loss /= len(test_loader)
    acc = 100. * float(correct) / len(test_loader.dataset)

    print('test loss is {:.6f}, acc is {}'.format(test_loss, acc))


if __name__ == '__main__':
    setup_seed(int(time.time()))

    print('==> Preparing data..')
    train_dataset = torchvision.datasets.MNIST(root='../../data', train=True, transform=transforms.ToTensor(),
                                               download=True)
    test_dataset = torchvision.datasets.MNIST(root='../../data', train=False, transform=transforms.ToTensor())
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

    print('******Initializing model******')
    model = Net()
    #model.to(device)
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight.data)
            if m.bias is not None:
                m.bias.data.zero_()
        elif isinstance(m, nn.Linear):
            m.weight.data.normal_(0, 0.01)
            m.bias.data.zero_()


    criterion = nn.CrossEntropyLoss()


    base_lr = float(0.001)
    # Emulate LSQ's gradient scaling of the step sizes by giving each scale parameter a
    # learning rate of base_lr * g, with g = 1 / sqrt(N * Q_P) and Q_P = 127.
    param_dict = dict(model.named_parameters())
    params = []
    for key, value in param_dict.items():
        if key=='quan_model.0.q_conv.weight_scale':
            g=1/torch.sqrt(torch.tensor(127.0*8*1*3*3))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.0.q_conv.activation_scale':
            g=1/torch.sqrt(torch.tensor(127.0*128*1*28*28))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.2.q_conv.weight_scale':
            g = 1 / torch.sqrt(torch.tensor(127.0*16*8*3*3))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.2.q_conv.activation_scale':
            g = 1 / torch.sqrt(torch.tensor(127.0*8*14*14))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.3.q_conv.weight_scale':
            g = 1 / torch.sqrt(torch.tensor(127.0*32*16*3*3))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.3.q_conv.activation_scale':
            g = 1 / torch.sqrt(torch.tensor(127.0*16*14*14))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.5.q_conv.weight_scale':
            g = 1 / torch.sqrt(torch.tensor(127.0*10*32*3*3))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        elif key=='quan_model.5.q_conv.activation_scale':
            g = 1 / torch.sqrt(torch.tensor(127.0*32*7*7))
            params += [{'params': [value], 'lr': base_lr*g, 'weight_decay': 0.0}]
        else:
            params += [{'params': [value], 'lr': base_lr, 'weight_decay': 0.0}]

    optimizer = optim.Adam(params, lr=base_lr, weight_decay=0.0)

    for epoch in range(1, 10):
        train(epoch)
        test()
    






The scale parameters are initialized as in the LSQ paper (the original post showed the paper's initialization formula as an image here), and the gradient of each scale is multiplied by a factor g (also shown as an image) so that the model converges faster and reaches a higher accuracy. In this reproduction the gradient scaling is emulated by giving each scale parameter a per-parameter learning rate of base_lr * g in the optimizer.
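
For reference, the two relations the images showed, matching what the code above implements (this is my reading of the LSQ paper; v denotes the weights or the input activations of a layer, N_W and N_A the number of weight and activation elements used in the corresponding hard-coded product, and Q_P = 127 for signed 8-bit quantization):

$$
s_{\text{init}} = \frac{2\,\mathbb{E}\big[\,|v|\,\big]}{\sqrt{Q_P}}, \qquad
g_W = \frac{1}{\sqrt{N_W\,Q_P}}, \qquad
g_A = \frac{1}{\sqrt{N_A\,Q_P}}
$$

For example, the first layer's weight_scale uses N_W = 8·1·3·3 (its weight tensor) and its activation_scale uses N_A = 128·1·28·28 (a batch of 128 MNIST inputs), which are exactly the products appearing in the learning-rate groups above.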
