ncnn Source Code Study (9): Common Operators (Part 2)

This article dissects several operators commonly seen in deep learning, including Reorg, ReLU, and HardSwish, explaining how each works and how it is implemented in the ncnn framework.

1. Reorg: rearrangement

This operator comes from YOLOv2. Like SSD, YOLOv2 concatenates feature maps of different sizes from different levels for multi-scale detection; the difference is that YOLOv2 uses reorg to make the spatial sizes match before concatenation. It works as follows:

Given an input of size 2W*2W from which we want W*W feature maps, each 2x2 block of the input contributes one element to each of four W*W sub-maps; sweeping the blocks from left to right and top to bottom produces 4 feature maps of size W*W. Multi-channel input is handled the same way, channel by channel. The corresponding ncnn source:

int Reorg::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    // compute the output feature map size
    int outw = w / stride;
    int outh = h / stride;
    // compute the number of output channels
    int outc = channels * stride * stride;

    // allocate memory for the output blob
    top_blob.create(outw, outh, outc, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        // process one input channel at a time
        const Mat m = bottom_blob.channel(q);

        // each stride*stride block is scattered across stride*stride output channels
        for (int sh = 0; sh < stride; sh++)
        {
            for (int sw = 0; sw < stride; sw++)
            {
                // each input channel maps to stride*stride output channels
                float* outptr = top_blob.channel(q*stride*stride + sh*stride + sw);

                for (int i = 0; i < outh; i++)
                {
                    // locate the source row and column offset for this output channel
                    const float* sptr = m.row(i*stride + sh) + sw;
                    for (int j = 0; j < outw; j++)
                    {
                        // copy the value
                        outptr[0] = sptr[0];

                        sptr += stride;
                        outptr++;
                    }
                }
            }
        }
    }

    return 0;
}

The logic is quite clear.
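To see the index mapping without ncnn's Mat plumbing, here is a minimal standalone sketch (a hypothetical helper on plain float arrays, single channel; not part of ncnn):

#include <cstdio>

// Minimal single-channel reorg sketch (hypothetical, not ncnn API):
// scatters each stride x stride block of in (h x w) into
// stride*stride separate output planes of size (h/stride) x (w/stride).
void reorg_plane(const float* in, float* out, int w, int h, int stride)
{
    const int outw = w / stride;
    const int outh = h / stride;
    for (int sh = 0; sh < stride; sh++)
        for (int sw = 0; sw < stride; sw++)
        {
            // output plane index for this (sh, sw) offset
            float* outptr = out + (sh * stride + sw) * outw * outh;
            for (int i = 0; i < outh; i++)
                for (int j = 0; j < outw; j++)
                    outptr[i * outw + j] = in[(i * stride + sh) * w + (j * stride + sw)];
        }
}

int main()
{
    // 4x4 input -> four 2x2 planes
    float in[16];
    for (int i = 0; i < 16; i++) in[i] = (float)i;
    float out[16];
    reorg_plane(in, out, 4, 4, 2);
    for (int p = 0; p < 4; p++)
        printf("plane %d: %g %g %g %g\n", p, out[p*4], out[p*4+1], out[p*4+2], out[p*4+3]);
    return 0;
}

Running it prints plane 0: 0 2 8 10, plane 1: 1 3 9 11, plane 2: 4 6 12 14, plane 3: 5 7 13 15, matching the take-one-element-per-2x2-block picture above.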

2. ReLU

This one is simple: keep values greater than 0 and set values less than 0 to 0. ncnn also reads a slope parameter; when slope is non-zero, values less than 0 are multiplied by slope instead of being zeroed, which gives the leaky-ReLU variant.
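Both branches can be written as one formula (slope = 0 recovers the plain ReLU):

f(x)=\left\{\begin{matrix} x & if\ x> 0 \\ slope*x & else \end{matrix}\right.

The code is: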

int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    if (bottom_top_blob.elemsize == 1u)
        return ReLU::forward_inplace_int8(bottom_top_blob, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // zero out the negative part
    if (slope == 0.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] = 0;
            }
        }
    }
    else
    {
        // leaky path: multiply the negative part by slope
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] *= slope;
            }
        }
    }

    return 0;
}

3. HardSwish

This activation comes from MobileNetV3. The standard formula is:

HardSwish(x)=\frac{x*ReLU6(x+3)}{6}

See reference [2] for details. Note that with its default parameters, ncnn's implementation appears to be a ReLU5/5 variant rather than the ReLU6 form.
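A quick check of why (assuming ncnn's defaults alpha = 0.2 and beta = 0.5, with the clipping bounds derived from them as lower = -beta/alpha and upper = (1-beta)/alpha):

lower=-\frac{beta}{alpha}=-2.5,\qquad upper=\frac{1-beta}{alpha}=2.5

and on [lower, upper] the kernel computes x*(alpha*x+beta)=\frac{x*(x+2.5)}{5}, i.e. x*ReLU5(x+2.5)/5, a ReLU5/5 version of hard-swish. The code is: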

int HardSwish::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            // below lower (default -2.5): output 0
            if (ptr[i] < lower)
                ptr[i] = 0.f;
            // above upper (default 2.5): leave x unchanged (hence the empty statement)
            else if (ptr[i] > upper) ;
            // otherwise: x * (alpha*x + beta), i.e. x * (0.2*x + 0.5) by default
            else
                ptr[i] = ptr[i] * (ptr[i] * alpha + beta);
        }
    }

    return 0;
}

4. Interp: interpolation

When resizing an image, pixel values must be interpolated. The main methods are nearest-neighbor, bilinear, and bicubic interpolation, all standard image-processing techniques.

Consider bilinear interpolation first; it is simply linear interpolation applied twice.

Suppose the values of four neighboring pixels P1, P2 (one row) and P3, P4 (the row below) are known, and we want the value at a point Pm between them. Let \Delta x_{1}+\Delta x_{2}=1 be the normalized horizontal offsets of Pm between the two columns, and \Delta y_{1}+\Delta y_{2}=1 the vertical offsets between the two rows. The first interpolation runs along the X direction:

Pm1=\Delta x_{1}*P_{2}+\Delta x_{2}*P_{1}

Pm2=\Delta x_{1}*P_{4}+\Delta x_{2}*P_{3}

The second interpolation runs along the Y direction:

Pm=\Delta y_{1}*Pm2+\Delta y_{2}*Pm1
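A minimal sketch of the idea (a hypothetical standalone helper for illustration, not taken from ncnn's Interp layer):

#include <cmath>
#include <cstdio>

// Bilinear sample of a single-channel image at fractional coords (x, y).
// Hypothetical helper; no border handling beyond clamping the neighbors.
float bilinear_at(const float* img, int w, int h, float x, float y)
{
    int x0 = (int)std::floor(x);
    int y0 = (int)std::floor(y);
    int x1 = x0 + 1 < w ? x0 + 1 : x0; // clamp right/bottom neighbors
    int y1 = y0 + 1 < h ? y0 + 1 : y0;
    float dx = x - x0; // \Delta x1: weight toward the right neighbor
    float dy = y - y0; // \Delta y1: weight toward the bottom neighbor

    // two interpolations along X ...
    float top    = dx * img[y0 * w + x1] + (1.f - dx) * img[y0 * w + x0];
    float bottom = dx * img[y1 * w + x1] + (1.f - dx) * img[y1 * w + x0];
    // ... then one along Y
    return dy * bottom + (1.f - dy) * top;
}

int main()
{
    const float img[4] = { 0.f, 1.f,    // 2x2 image: P1 P2
                           2.f, 3.f };  //            P3 P4
    printf("%g\n", bilinear_at(img, 2, 2, 0.5f, 0.5f)); // center -> 1.5
    return 0;
}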

5. Log: logarithm

This operator simply takes a logarithm, with the formula:

f(x)=log_{base}(shift + scale * x)
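When base is not -1 (i.e. not the natural log), the implementation relies on the change-of-base identity:

log_{base}(y)=\frac{ln(y)}{ln(base)}

which is why the code precomputes log_base_inv = 1.f / log(base).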

The code is:

int Log::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // base == -1 means natural log (base e)
    if (base == -1.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                // take the logarithm
                ptr[i] = log(shift + ptr[i] * scale);
            }
        }
    }
    else
    {
        float log_base_inv = 1.f / log(base);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            // take the logarithm in the given base via the precomputed factor
            for (int i=0; i<size; i++)
            {
                ptr[i] = log(shift + ptr[i] * scale) * log_base_inv;
            }
        }
    }

    return 0;
}

6. Noop: no operation (the input passes through unchanged)

7. LRN: local response normalization

There are two cases. The first is cross-channel normalization, where the region spans neighboring channels of a W x H x C Mat. For each position (x, y, i), take the local_size channels centered on channel i, sum their squared values, and use that sum to normalize the value at (x, y, i); see reference [3]:

b_{x,y}^{i}=p_{x,y}^{i}*(bias+\frac{alpha}{local\_size}\sum_{j=max(0,i-local\_size/2)}^{min(N-1,i+local\_size/2)}(p_{x,y}^{j})^{2})^{-beta}

The second case is within-channel normalization: in each channel, take a local_size x local_size spatial window around each position, sum the squared values, and normalize the center value with that sum.
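By analogy with the cross-channel formula, and matching alpha_div_size = alpha / (local_size * local_size) in the code below, the within-channel rule amounts to:

b_{x,y}^{i}=p_{x,y}^{i}*(bias+\frac{alpha}{local\_size^{2}}\sum_{(u,v)\in\Omega(x,y)}(p_{u,v}^{i})^{2})^{-beta}

where \Omega(x,y) is the local_size x local_size window centered at (x, y). The code is: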

int LRN::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    size_t elemsize = bottom_top_blob.elemsize;
    int size = w * h;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

        // square every element
        for (int i=0; i<size; i++)
        {
            outptr[i] = ptr[i] * ptr[i];
        }
    }

    // cross-channel normalization region
    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        // running sum of squares
        Mat square_sum;
        square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        // alpha = 1.0, local_size = 5
        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            // square sum
            float* ssptr = square_sum.channel(q);

            // window of local_size neighboring channels (e.g. 5)
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                // accumulate squared values over the local_size channel window
                for (int i=0; i<size; i++)
                {
                    ssptr[i] += sptr[i];
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            for (int i=0; i<size; i++)
            {
                // x * (bias + alpha/n * ss)^(-beta), e.g. x * (1 + 0.2*ss)^(-0.75)
                ptr[i] = ptr[i] * pow(bias + alpha_div_size * ssptr[i], -beta);
            }
        }
    }
    // within-channel: pad the square map so border windows are fully defined
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        int outw = w;
        int outh = h;

        Mat square_blob_bordered = square_blob;
        int pad = local_size / 2;
        if (pad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt_b);
            if (square_blob_bordered.empty())
                return -100;

            w = square_blob_bordered.w;
            h = square_blob_bordered.h;
        }

        // maxk = 25
        const int maxk = local_size * local_size;

        // alpha = 1.0
        const float alpha_div_size = alpha / maxk;

        // precompute the element offsets of the norm window
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;

            // gap between the end of one window row and the start of the next
            int gap = w - local_size;
            for (int i = 0; i < local_size; i++)
            {
                for (int j = 0; j < local_size; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            const Mat m = square_blob_bordered.channel(q);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    const float* sptr = m.row(i) + j;

                    float ss = 0.f;

                    // sum of squares within this norm window
                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[ space_ofs[k] ];
                        ss += val;
                    }

                    ptr[j] = ptr[j] * pow(bias + alpha_div_size * ss, -beta);
                }

                ptr += outw;
            }
        }
    }

    return 0;
}

8. MemoryData: output a stored Mat

This layer holds a constant Mat (loaded together with the model weights) and simply clones it into the output blob:

// clone the stored data Mat into the output blob
int MemoryData::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& top_blobs, const Option& opt) const
{
    Mat& top_blob = top_blobs[0];
    top_blob = data.clone(opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    return 0;
}

9. Normalize: normalization

Intuitively, each element is divided by the L2 norm of the group it belongs to; there are three cases:

Case 1: global — the norm over all W x H x C elements;

Case 2: per channel — the norm over the W x H elements of each channel;

Case 3: per spatial position — the norm over the C x 1 elements at each of the W x H positions.
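All three cases produce outputs of the same form; only the index set S over which the squared sum runs differs (written here for eps_mode == 0, the caffe/mxnet convention; the code also implements the pytorch and tensorflow eps variants):

out_{i}=\frac{scale*x_{i}}{\sqrt{\sum_{j\in S}x_{j}^{2}+eps}}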

The full code is:

int Normalize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // case 1: global normalization
    if (across_spatial && across_channel)
    {
        // square
        Mat square_sum_blob;
        square_sum_blob.create(channels, elemsize, opt.workspace_allocator);
        if (square_sum_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);

            // sum of squares within this channel
            float ssum = 0.f;
            for (int i=0; i<size; i++)
            {
                ssum += ptr[i] * ptr[i];
            }

            square_sum_blob[q] = ssum;
        }

        // total sum of squares over all channels
        float ssum = 0.f;
        for (int q=0; q<channels; q++)
        {
            ssum += square_sum_blob[q];
        }

        float a;
        // compute 1 / norm; the eps convention differs by framework
        if (eps_mode == 0) // caffe/mxnet
        {
            a = 1.f / sqrt(ssum + eps);
        }
        else if (eps_mode == 1) // pytorch
        {
            a = 1.f / std::max((float)sqrt(ssum), eps);
        }
        else //if (eps_mode == 2) // tensorflow
        {
            a = 1.f / sqrt(std::max(ssum, eps));
        }

        // divide by the norm
        // channel_shared: every channel uses scale_data[0]
        if (channel_shared)
        {
            float scale = a * scale_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * scale;
                }
            }
        }
        else
        {
            // per-channel scale
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                float scale = a * scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * scale;
                }
            }
        }

        return 0;
    }

    // case 2: per-channel normalization
    if (across_spatial && !across_channel)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float ssum = 0.f;
            for (int i=0; i<size; i++)
            {
                ssum += ptr[i] * ptr[i];
            }

            float a;
            if (eps_mode == 0) // caffe/mxnet
            {
                a = 1.f / sqrt(ssum + eps);
            }
            else if (eps_mode == 1) // pytorch
            {
                a = 1.f / std::max((float)sqrt(ssum), eps);
            }
            else //if (eps_mode == 2) // tensorflow
            {
                a = 1.f / sqrt(std::max(ssum, eps));
            }

            // scale for this channel
            float scale = a * (channel_shared ? scale_data[0] : scale_data[q]);

            // normalize
            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] * scale;
            }
        }

        return 0;
    }

    // case 3: normalize each spatial position across channels
    if (!across_spatial && across_channel)
    {
        // square sum, 1 / sqrt(ssum)
        Mat square_sum_blob;
        square_sum_blob.create(size, elemsize, opt.workspace_allocator);
        if (square_sum_blob.empty())
            return -100;

        // if the scale is shared across channels
        if (channel_shared)
        {
            float scale = scale_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<size; i++)
            {
                float ssum = 0.f;
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);
                    ssum += ptr[i] * ptr[i];
                }

                float a;
                if (eps_mode == 0) // caffe/mxnet
                {
                    a = 1.f / sqrt(ssum + eps);
                }
                else if (eps_mode == 1) // pytorch
                {
                    a = 1.f / std::max((float)sqrt(ssum), eps);
                }
                else //if (eps_mode == 2) // tensorflow
                {
                    a = 1.f / sqrt(std::max(ssum, eps));
                }

                square_sum_blob[i] = a * scale;
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * square_sum_blob[i];
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<size; i++)
            {
                float ssum = 0.f;
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);
                    ssum += ptr[i] * ptr[i];
                }

                float a;
                if (eps_mode == 0) // caffe/mxnet
                {
                    a = 1.f / sqrt(ssum + eps);
                }
                else if (eps_mode == 1) // pytorch
                {
                    a = 1.f / std::max((float)sqrt(ssum), eps);
                }
                else //if (eps_mode == 2) // tensorflow
                {
                    a = 1.f / sqrt(std::max(ssum, eps));
                }

                square_sum_blob[i] = a;
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                float scale = scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * square_sum_blob[i] * scale;
                }
            }
        }

        return 0;
    }

    return 0;
}

10. Permute: permutation

This rearranges the input Mat according to a given order:

For the 2-D case:

order_type

0 = w h: the default layout

1 = h w: reads element (h, w); traversal order w -> h

For the 3-D case:

order_type

0 = w h c: the default layout

1 = h w c: reads element (h, w, c); traversal order c -> w -> h

2 = w c h: reads element (w, c, h); traversal order h -> c -> w

3 = c w h: reads element (c, w, h); traversal order h -> w -> c

4 = h c w: reads element (h, c, w); traversal order w -> c -> h

5 = c h w: reads element (c, h, w); traversal order w -> h -> c

The code is:

// rearrange the input according to the given order
// the default storage order is w x h in 2-D and w x h x c in 3-D
int Permute::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    int dims = bottom_blob.dims;

    if (dims == 2)
    {
        // order_type
        // 0 = w h
        // 1 = h w

        if (order_type == 0)
        {
            top_blob = bottom_blob;
        }
        // w x h becomes h x w
        else if (order_type == 1)
        {
            top_blob.create(h, w, elemsize, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            const float* ptr = bottom_blob;
            float* outptr = top_blob;

            // write the transposed values
            for (int i = 0; i < w; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    outptr[i*h + j] = ptr[j*w + i];
                }
            }
        }

        return 0;
    }

    // order_type
    // 0 = w h c
    // 1 = h w c
    // 2 = w c h
    // 3 = c w h
    // 4 = h c w
    // 5 = c h w

    // 0. w h c: identity
    if (order_type == 0)
    {
        top_blob = bottom_blob;
    }
    // 1. h w c: reads element (h, w, c); traversal order c -> w -> h
    else if (order_type == 1)
    {
        top_blob.create(h, w, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < w; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    outptr[i*h + j] = ptr[j*w + i];
                }
            }
        }
    }
    // 2. w c h: reads element (w, c, h); traversal order h -> c -> w
    else if (order_type == 2)
    {
        top_blob.create(w, channels, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<h; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < channels; i++)
            {
                const float* ptr = bottom_blob.channel(i).row(q);

                for (int j = 0; j < w; j++)
                {
                    outptr[i*w + j] = ptr[j];
                }
            }
        }
    }
    // 3. c w h: reads element (c, w, h); traversal order h -> w -> c
    else if (order_type == 3)
    {
        top_blob.create(channels, w, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<h; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < w; i++)
            {
                for (int j = 0; j < channels; j++)
                {
                    const float* ptr = bottom_blob.channel(j).row(q);

                    outptr[i*channels + j] = ptr[i];
                }
            }
        }
    }
    // 4. h c w: reads element (h, c, w); traversal order w -> c -> h
    else if (order_type == 4)
    {
        top_blob.create(h, channels, w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<w; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < channels; i++)
            {
                const float* ptr = bottom_blob.channel(i);

                for (int j = 0; j < h; j++)
                {
                    outptr[i*h + j] = ptr[j*w + q];
                }
            }
        }
    }
    // 5. c h w: reads element (c, h, w); traversal order w -> h -> c
    else if (order_type == 5)
    {
        top_blob.create(channels, h, w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<w; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < h; i++)
            {
                for (int j = 0; j < channels; j++)
                {
                    const float* ptr = bottom_blob.channel(j);

                    outptr[i*channels + j] = ptr[i*w + q];
                }
            }
        }
    }

    return 0;
}

11. Power: elementwise power

The formula is:

f(x)=(shift+scale*x)^{power}

The code is:

int Power::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            ptr[i] = pow((shift + ptr[i] * scale), power);
        }
    }

    return 0;
}

12. PReLU

The formula is:

f(x)=\left\{\begin{matrix} x & if\ x> 0 \\ slope*x & else \end{matrix}\right.

The core code is:

if (ptr[i] < 0)
    ptr[i] *= slope;
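Unlike leaky ReLU's fixed slope, the PReLU slope is learned, and ncnn supports a per-channel slope. A minimal sketch of the per-channel case, patterned after the loops above (a hypothetical simplification assuming num_slope and slope_data as in ncnn's PReLU layer; the real forward_inplace also handles 1-D and 2-D inputs):

// Hypothetical simplified per-channel PReLU pass over a 3-D blob
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
    float* ptr = bottom_top_blob.channel(q);
    // each channel uses its own learned slope when num_slope > 1
    const float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

    for (int i = 0; i < size; i++)
    {
        if (ptr[i] < 0)
            ptr[i] *= slope;
    }
}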

13. MVN: mean-variance normalization

Subtract the mean and divide by the standard deviation, so the input becomes zero-mean with unit variance:

f(x_{i})=\frac{x_{i}-\mu}{\sigma}

There are two cases:

Case 1: per channel — compute each channel's mean, subtract it from that channel's elements, then compute each channel's standard deviation and divide by it;

Case 2: all channels together — compute the global mean, subtract it from every element, then compute the global standard deviation and divide every element by it.

The code is:

int MVN::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // prepare the per-channel sums
    Mat sum(channels, elemsize, opt.workspace_allocator);
    if (sum.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);

        float s = 0.f;
        for (int i=0; i<size; i++)
        {
            s += ptr[i];
        }

        sum[q] = s;
    }

    // across channels
    if (across_channels)
    {
        // compute the mean across all channels
        float mean = 0.f;
        for (int q=0; q<channels; q++)
        {
            mean += sum[q];
        }
        mean = mean / (channels * size);

        // subtract the global mean
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] - mean;
            }
        }
    }
    else
    {
        // each channel subtracts its own mean
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float mean = sum[q] / size;

            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] - mean;
            }
        }
    }

    // variance normalization
    if (normalize_variance)
    {
        // prepare squared sum per channel
        Mat sqsum(channels, elemsize, opt.workspace_allocator);
        if (sqsum.empty())
            return -100;

        // per-channel sum of squares (of the already mean-subtracted values)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = top_blob.channel(q);

            float s = 0.f;
            for (int i=0; i<size; i++)
            {
                s += ptr[i] * ptr[i];
            }

            sqsum[q] = s;
        }

        // across channels
        if (across_channels)
        {
            // compute squared mean across channels
            float sqmean = 0.f;
            for (int q=0; q<channels; q++)
            {
                sqmean += sqsum[q];
            }
            // mean of squares over all channels, i.e. the global variance
            sqmean = sqmean / (channels * size);

            // normalize variance
            float norm_var = sqrt(sqmean) + eps;
            float norm_var_inv = 1.f / norm_var;

            // apply normalize_variance: divide every element by the standard deviation
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = outptr[i] * norm_var_inv;
                }
            }
        }
        else
        {
            // apply normalize_variance per channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                float* outptr = top_blob.channel(q);
                float sqmean = sqsum[q] / size;
                float norm_var = sqrt(sqmean) + eps;
                float norm_var_inv = 1.f / norm_var;

                for (int i=0; i<size; i++)
                {
                    outptr[i] = outptr[i] * norm_var_inv;
                }
            }
        }

    }

    return 0;
}

14. PriorBox: generating prior boxes

There are two cases:

Case 1: only num_min_size sizes (min_sizes) and num_ratios aspect ratios (aspect_ratios) are provided. To keep the number of priors manageable, instead of w*h*num_min_size*num_ratios boxes, only w*h*(num_min_size+num_ratios-1) are generated, using this strategy (a sketch of it follows case 2 below):

(1) aspect_ratios[0] paired with every min_sizes[i], 0 <= i < num_min_size;

(2) min_sizes[0] paired with every remaining aspect_ratios[i], 1 <= i < num_ratios.

Case 2: num_min_size minimum sizes (min_sizes), num_max_size maximum sizes (max_sizes), and num_ratios aspect ratios (aspect_ratios) are all provided, producing w*h*(num_min_size + num_min_size * num_ratios + num_max_size) prior boxes.
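To make case 1 concrete, here is a minimal sketch for a single feature-map cell (a hypothetical standalone function, not ncnn's PriorBox::forward; it assumes the common SSD convention of scaling box sides by the square root of the aspect ratio and emitting normalized [xmin, ymin, xmax, ymax]):

#include <cmath>
#include <vector>

// Case-1 prior boxes for cell (i, j) of a w x h feature map over an
// image of size img_w x img_h. Hypothetical illustration of the
// num_min_size + num_ratios - 1 generation strategy.
static void priors_for_cell(int i, int j, int w, int h, int img_w, int img_h,
                            const std::vector<float>& min_sizes,
                            const std::vector<float>& aspect_ratios,
                            std::vector<float>& boxes)
{
    const float cx = (j + 0.5f) / w; // normalized cell center
    const float cy = (i + 0.5f) / h;

    auto push_box = [&](float bw, float bh) {
        boxes.push_back(cx - bw * 0.5f / img_w); // xmin
        boxes.push_back(cy - bh * 0.5f / img_h); // ymin
        boxes.push_back(cx + bw * 0.5f / img_w); // xmax
        boxes.push_back(cy + bh * 0.5f / img_h); // ymax
    };

    // (1) aspect_ratios[0] paired with every min_size
    for (float s : min_sizes)
        push_box(s * std::sqrt(aspect_ratios[0]), s / std::sqrt(aspect_ratios[0]));

    // (2) min_sizes[0] paired with the remaining aspect ratios
    for (size_t k = 1; k < aspect_ratios.size(); k++)
        push_box(min_sizes[0] * std::sqrt(aspect_ratios[k]),
                 min_sizes[0] / std::sqrt(aspect_ratios[k]));
}

Per cell this emits min_sizes.size() + aspect_ratios.size() - 1 boxes, matching the count above.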

15. Proposal: generating bounding boxes from prior anchors and the input

(1) Generate candidate object rectangles from the anchors and the network input;

(2) Clip every generated rectangle to the image boundary;

(3) Apply non-maximum suppression (NMS) to filter out overlapping boxes (see the IoU/NMS sketch after this list);

(4) Finally, store the surviving boxes in the output blob.
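Step (3) is the classic greedy NMS. A minimal self-contained sketch (hypothetical helper types, not ncnn's internal routine):

#include <algorithm>
#include <vector>

struct Box { float x0, y0, x1, y1, score; };

// Intersection-over-union of two axis-aligned boxes.
static float iou(const Box& a, const Box& b)
{
    const float ix = std::max(0.f, std::min(a.x1, b.x1) - std::max(a.x0, b.x0));
    const float iy = std::max(0.f, std::min(a.y1, b.y1) - std::max(a.y0, b.y0));
    const float inter = ix * iy;
    const float uni = (a.x1 - a.x0) * (a.y1 - a.y0)
                    + (b.x1 - b.x0) * (b.y1 - b.y0) - inter;
    return uni > 0.f ? inter / uni : 0.f;
}

// Greedy NMS: keep the highest-scoring box, drop boxes overlapping it
// by more than nms_thresh, then repeat on the remainder.
static std::vector<Box> nms(std::vector<Box> boxes, float nms_thresh)
{
    std::sort(boxes.begin(), boxes.end(),
              [](const Box& a, const Box& b) { return a.score > b.score; });

    std::vector<Box> kept;
    for (const Box& c : boxes)
    {
        bool suppressed = false;
        for (const Box& k : kept)
            if (iou(c, k) > nms_thresh) { suppressed = true; break; }
        if (!suppressed)
            kept.push_back(c);
    }
    return kept;
}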

~~~ To be continued ~~~

 

References:

[1] https://zhuanlan.zhihu.com/p/60213361

[2] https://zhuanlan.zhihu.com/p/67487416

[3] https://zhuanlan.zhihu.com/p/87117010
