caffe全连接层（INNER_PRODUCT）源码注释与分析

最新推荐文章于 2019-10-23 16:27:36 发布

原创 · 最新推荐文章于 2019-10-23 16:27:36 发布 · 7.2k 阅读

5 ·

CC 4.0 BY-SA版权

文章标签：

#深度学习 #caffe #全连接

深度学习专栏收录该内容

3 篇文章

订阅专栏

本文详细介绍了Caffe框架中全连接层的具体实现原理。主要包括层的初始化过程、前向传播算法以及反向传播算法等内容。该层适用于处理输入特征向量并将其映射到输出特征向量。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

// Copyright 2014 BVLC and contributors.
//这是全连接层的实现
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/vision_layers.hpp"
#include "caffe/util/math_functions.hpp"

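// The layer is built around three methods: SetUp, Forward and Backward.
//   SetUp    initializes the layer parameters, i.e. the weights w and the bias b.
//   Forward  implements forward propagation.
//   Backward implements backward propagation.
//
// Dimension members used throughout this file:
//   M_  number of samples
//   K_  feature length of a single sample
//   N_  number of output neurons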
namespace caffe {

/**
 * @brief Configures the fully connected (inner product) layer.
 *
 * Reads num_output and bias_term from the layer's inner_product_param,
 * records the problem dimensions, shapes the top blob, creates and fills the
 * learnable blobs when none exist yet, and builds the vector of ones that is
 * used together with the bias.
 *
 * Dimensions recorded here:
 *   - M_: number of samples, bottom[0]->num()
 *   - K_: feature length of one sample, bottom[0]->count() / bottom[0]->num()
 *   - N_: number of output neurons (num_output)
 *
 * @param bottom Input blobs; exactly one is required.
 * @param top    Output blobs; exactly one is required. (*top)[0] is reshaped
 *               to (num, num_output, 1, 1).
 */
template <typename Dtype>
void InnerProductLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
                                     vector<Blob<Dtype>*>* top) {
  // The layer is only defined for one input blob and one output blob.
  CHECK_EQ(bottom.size(), 1) << "IP Layer takes a single blob as input.";
  CHECK_EQ(top->size(), 1) << "IP Layer takes a single blob as output.";

  // Layer configuration: output width and whether a bias term is used.
  const int output_count =
      this->layer_param_.inner_product_param().num_output();
  bias_term_ = this->layer_param_.inner_product_param().bias_term();

  // Work out the dimensions from the input blob.
  Blob<Dtype>* const input = bottom[0];
  M_ = input->num();
  // Everything in one sample is flattened into a single feature vector.
  // NOTE(review): presumably count() == num * channels * height * width.
  K_ = input->count() / input->num();
  N_ = output_count;

  // One row of output_count values per sample.
  (*top)[0]->Reshape(input->num(), output_count, 1, 1);

  if (this->blobs_.empty()) {
    // blobs_[0] holds the weights; blobs_[1] (only with a bias) holds the bias.
    this->blobs_.resize(bias_term_ ? 2 : 1);

    // Weight matrix: N_ rows (one per output neuron) by K_ columns.
    this->blobs_[0].reset(new Blob<Dtype>(1, 1, N_, K_));
    // Fill the weights using the filler described by weight_filler.
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
        this->layer_param_.inner_product_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());

    if (bias_term_) {
      // One bias value per output neuron, N_ in total.
      this->blobs_[1].reset(new Blob<Dtype>(1, 1, 1, N_));
      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
          this->layer_param_.inner_product_param().bias_filler()));
      bias_filler->Fill(this->blobs_[1].get());
    }
  } else {
    // Parameter blobs are already present, so leave them untouched.
    LOG(INFO) << "Skipping parameter initialization";
  }

  // The multiplier is only needed when a bias is added.
  if (!bias_term_) {
    return;
  }

  // Build a vector of M_ ones; multiplying it with the bias replicates the
  // bias across all samples.
  // NOTE(review): the original author notes that constructing SyncedMemory
  // only records the size and that memory is obtained on first access --
  // confirm against SyncedMemory.
  bias_multiplier_.reset(new SyncedMemory(M_ * sizeof(Dtype)));
  Dtype* const ones =
      reinterpret_cast<Dtype*>(bias_multiplier_->mutable_cpu_data());
  for (int sample = 0; sample < M_; ++sample) {
    ones[sample] = 1.;
  }
}

/**
 * @brief CPU forward pass of the fully connected layer: y = x * w^T + b.
 *
 * Operand shapes as handed to the GEMM calls:
 *   - x (bottom data): M_ x K_
 *   - w (weights)    : N_ x K_ in memory, multiplied in transposed form
 *   - b (bias)       : N_ values, treated as a 1 x N_ row
 *   - y (top data)   : M_ x N_, one row of N_ outputs per sample
 *
 * caffe_cpu_gemm is used in the BLAS GEMM sense,
 * C <- alpha * op(A) * op(B) + beta * C, where the first two arguments select
 * whether A and B are transposed. The weights and bias stay the same for all
 * samples of the batch.
 *
 * @param bottom Input blobs; bottom[0] supplies x.
 * @param top    Output blobs; (*top)[0] receives y.
 * @return Always Dtype(0).
 */
template <typename Dtype>
Dtype InnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
                                            vector<Blob<Dtype>*>* top) {
  const Dtype kOne = Dtype(1);
  const Dtype kZero = Dtype(0);

  const Dtype* const input = bottom[0]->cpu_data();
  Dtype* const output = (*top)[0]->mutable_cpu_data();
  const Dtype* const weights = this->blobs_[0]->cpu_data();

  // Step 1: y <- x * w^T   (M_ x K_ times K_ x N_; beta = 0 overwrites y).
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,
                        kOne, input, weights, kZero, output);

  if (!bias_term_) {
    return Dtype(0);
  }

  // Step 2: y <- ones * b + y   (M_ x 1 times 1 x N_; beta = 1 accumulates),
  // which adds the same bias row to every sample.
  const Dtype* const bias = this->blobs_[1]->cpu_data();
  const Dtype* const ones =
      reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1,
                        kOne, ones, bias, kOne, output);

  return Dtype(0);
}

/**
 * @brief CPU backward pass of the fully connected layer.
 *
 * "data" carries values and "diff" carries gradients. With top_diff of shape
 * M_ x N_ (one row per sample), the GEMM/GEMV calls compute:
 *   - weight diff (N_ x K_) = top_diff^T (N_ x M_) * bottom_data (M_ x K_)
 *   - bias diff   (N_)      = top_diff^T (N_ x M_) * ones (M_)
 *   - bottom diff (M_ x K_) = top_diff (M_ x N_) * weights (N_ x K_)
 *
 * Every result is written with beta = 0, i.e. it overwrites the destination
 * rather than accumulating into it.
 *
 * @param top            Output blobs; top[0] supplies top_diff.
 * @param propagate_down When false, the gradient with respect to the bottom
 *                       data is not computed.
 * @param bottom         Input blobs; (*bottom)[0] supplies the forward data
 *                       and receives the bottom diff.
 */
template <typename Dtype>
void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                                            const bool propagate_down,
                                            vector<Blob<Dtype>*>* bottom) {
  const Dtype kOne = Dtype(1);
  const Dtype kZero = Dtype(0);

  Blob<Dtype>* const input = (*bottom)[0];
  const Dtype* const output_grad = top[0]->cpu_diff();
  const Dtype* const input_data = input->cpu_data();

  // Gradient with respect to the weights.
  Dtype* const weight_grad = this->blobs_[0]->mutable_cpu_diff();
  caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, kOne,
                        output_grad, input_data, kZero, weight_grad);

  if (bias_term_) {
    // Gradient with respect to the bias: multiplying top_diff^T by the vector
    // of ones sums the gradient over all samples.
    const Dtype* const ones =
        reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data());
    Dtype* const bias_grad = this->blobs_[1]->mutable_cpu_diff();
    caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, kOne, output_grad, ones, kZero,
                          bias_grad);
  }

  if (!propagate_down) {
    return;
  }

  // Gradient with respect to the bottom data.
  const Dtype* const weights = this->blobs_[0]->cpu_data();
  Dtype* const input_grad = input->mutable_cpu_diff();
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, kOne,
                        output_grad, weights, kZero, input_grad);
}

// INSTANTIATE_CLASS is defined outside this file; presumably it explicitly
// instantiates the InnerProductLayer template for the supported Dtype types
// so the member definitions above are emitted in this translation unit --
// confirm against the macro definition.
INSTANTIATE_CLASS(InnerProductLayer);

}  // namespace caffe