5.9.5. NN API

4 月 ago

admin

26 minutes

5.9.5. NN API

仓颉 TensorBoost 中 NN 包提供优化器，网络层和损失函数的构造方法。

构造优化器、网络层或损失函数时，如果输入不符合要求，会抛出异常，有以下几种类型：

初始化数据类型不支持
初始化参数选项不支持
初始化参数值不符合要求
初始化 Tensor 的形状不符合要求

Cell 接口

Layer 和 Loss 结构都继承以下接口，如需使用，要导入 nn package (from CangjieTB import nn.*) 。

接口定义：

public interface CellUnary {
    operator func ()(input0: Tensor): Tensor
}

public interface CellBinary {
    operator func ()(input0: Tensor, input1: Tensor): Tensor
}

public interface CellTernary {
    operator func ()(input0: Tensor, input1: Tensor, input2: Tensor): Tensor
}

public interface CellQuaternarynary {
    operator func ()(input0: Tensor, input1: Tensor, input2: Tensor, input3: Tensor): Tensor
}

public interface CellQuintuplenaryToTuple {
    operator func ()(input0: Tensor, input1: Tensor, input2: Tensor, input3: Tensor, input4: Tensor): (Tensor, Tensor, Tensor)
}

public interface CellSevenTuplenary {
    operator func ()(input0: Tensor, input1: Tensor, input2: Tensor, input3: Tensor, input4: Tensor, input5: Tensor, input6: Tensor): Tensor
}

public interface CellUnaryToTuple {
    operator func ()(input0: Tensor): (Tensor, Tensor, Tensor)
}

public interface CellBinaryToTuple {
    operator func ()(input0: Tensor, input1: Tensor): (Tensor, Tensor, Tensor)
}

optimizer

仓颉 TensorBoost 的优化器使用 class 表示，用户只要导入 nn.optim package (from CangjieTB import nn.optim.*) 即可直接使用仓颉 TensorBoost 已经封装好的优化器。

优化器基类

public abstract class BaseOptim<T0> where T0 <: OptDiff  {
    public func getParameters()

    public func getOptParameters()

    public func update(gradients: T0): Tensor
}

所有优化器的基类。不要直接使用此类，请实例化它的子类。OptDiff 接口定义详见第四章：创建和收集 parameter。

优化器基类提供以下方法供用户使用：

名称	作用
getParameters	获取模型参数
getOptParameters	获取优化器参数
update	用于更新网络权重

其中 update 方法参数列表：

名称	含义
gradients	更新网络权重所需的梯度，类型与需要更新权重的网络模型一致。

Adam 优化器

public class AdamOptimizer<T0> <: BaseOptim<T0> where T0 <: OptDiff {
    public init(net: T0, learningRate!: Float32=0.001, beta1!: Float32=0.9, beta2!: Float32=0.999, eps!: Float32=0.00000001, weightDecay!: Float32 = 0.0, lossScale!: Float32 = 1.0, useLocking!: Bool = false, useNesterov!: Bool = false)

    public init(net: T0, learningRate: Tensor, beta1!: Float32=0.9, beta2!: Float32=0.999, eps!: Float32=0.00000001, weightDecay!: Float32 = 0.0,  lossScale!: Float32 = 1.0, useLocking!: Bool = false, useNesterov!: Bool = false)
}

使用的算子：

public func adam(parameter: Tensor, moment1: Tensor, moment2: Tensor, beta1Power: Tensor, beta2Power: Tensor, learningRate: Tensor, beta1: Tensor, beta2: Tensor, eps: Tensor, gradOut: Tensor, useLocking: Bool, useNesterov: Bool): Tensor

Adam(Adaptive Moment Estimation) 算法，是在论文 Adam: A Method for Stochastic Optimization 中提出的一种优化方法，利用梯度的一阶矩估计和二阶矩估计动态调整每个参数的学习率。公式如下：

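$$
\begin{aligned}
m &= \beta_1 * m + (1 - \beta_1) * g \\
v &= \beta_2 * v + (1 - \beta_2) * g * g \\
l &= \alpha * \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t} \\
w &= w - l * \frac{m}{\sqrt{v} + \epsilon}
\end{aligned}
$$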

m 表示第一个矩向量；v 表示第二个矩向量；g 表示梯度；l 表示比例因子；$\beta_1$、$\beta_2$ 表示 beta1、beta2；t 表示更新步骤；$\beta_1^t$、$\beta_2^t$ 表示 beta1Power 和 beta2Power；$\alpha$ 表示 learningRate；w 表示 parameter；$\epsilon$ 表示 eps。

构造方法参数列表：

名称	含义
net	需要更新权重的网络
learningRate	学习率，支持 Float32 的数字（>= 0.0）和非 Parameter 类型的 Tensor（必须为 1 维，且每个元素都>=0），Tensor 的 dtype 必须为 Float32。数字表示静态学习率，Tensor 表示动态学习率。默认值：1e-3
beta1	第一个矩估计的指数衰减率，Float32 类型，范围是 (0.0, 1.0)。默认值：0.9
beta2	第二个矩估计的指数衰减率，Float32 类型，范围是 (0.0, 1.0)。默认值：0.999
eps	加到分母增加数值稳定性的极小项，Float32 类型，大于 0。默认值：1e-8
weightDecay	权重衰减，Float32 类型（>= 0）。默认值：0.0
lossScale	loss 缩放，Float32 类型，大于 0.0，默认值：1.0
useLocking	是否加锁以让 tensor 不被更新。Bool 类型，默认值：false
useNesterov	是否使用 Nesterov Accelerated Gradient (NAG) 算法更新梯度。Bool 类型，默认值：false

支持平台：GPU

代码示例：

from CangjieTB import common.*
from CangjieTB import ops.*
from CangjieTB import nn.*
from CangjieTB import nn.optim.*
from CangjieTB import nn.loss.*
from CangjieTB import nn.layers.*
from CangjieTB import context.*
from CangjieTB import macros.*

@OptDifferentiable
public struct Net {
    let dense1: Dense

    init(dense1_: Dense) {
        dense1 = dense1_
    }

    @Differentiable
    operator func ()(input: Tensor): Tensor {
        input |> this.dense1
    }
}

@Differentiable[except: [lossFn, input, label]]
func train(net: Net, lossFn: SoftmaxCrossEntropyWithLogits, input: Tensor, label: Tensor): Tensor
{
    var output = net(input)
    var lossTensor = lossFn(output, label)
    return lossTensor
}

func gradient(net: Net, lossFn: SoftmaxCrossEntropyWithLogits, input: Tensor, label: Tensor): (Tensor, Net)
{
    var adj = @AdjointOf(train)
    var (loss, bp) = adj(net, lossFn, input, label)
    var gradout = bp(Tensor(Float32(1.0)))
    return (loss, gradout)
}

main(): Int64
{
    let weight_init = initialize(Array<Int64>([16,2048]), initType:InitType.NORMAL, dtype:FLOAT32)
    let dense = Dense(2048, 16, weight_init,has_bias: true)
    let net = Net(dense)
    
    var lrArrary: Array<Float32> = Array<Float32>([0.01, 0.008])
    var learningRate = Tensor(lrArrary, shape: [lrArrary.size])
    var optim = AdamOptimizer<Net>(net, learningRate)   

    let input = parameter(initialize(Array<Int64>([16,2048]), initType:InitType.NORMAL,dtype:FLOAT32), "data")
    let label = parameter(initialize(Array<Int64>([16,16]),initType:InitType.NORMAL,dtype:FLOAT32), "lable")
    var lossFn = SoftmaxCrossEntropyWithLogits(sparse: false)
    var num = 0
    var epoch = 1
    while (num < epoch) {
        var (loss, gradout) = gradient(net,lossFn, input, label)
        optim.update(gradout)  
        print(loss.getShape())
        print("\n")
        print(net.dense1.weight_.getShape())
        num++
    }
    return 0
}

输出为：

[]
[16, 2048]

Momentum 优化器

public class MomentumOptimizer<T0> <: BaseOptim<T0> where T0 <: OptDiff {

    public init(net: T0, learningRate: Float32, momentum: Float32, weightDecay!: Float32 = 0.0, lossScale!: Float32 = 1.0)

    public init(net: T0, learningRate: Tensor, momentum: Float32, weightDecay!: Float32 = 0.0, lossScale!: Float32 = 1.0)
}

使用的算子：

public func applyMomentum(parameter: Tensor, accumlation: Tensor, learningRate: Tensor, gradOut: Tensor, momentum: Tensor): Tensor

Momentum 算法，是在论文 On the importance of initialization and momentum in deep learning 中提出的一种优化方法，主要为了降低梯度更新的高敏感性，使得梯度更新方向趋于一致，加快收敛速度。公式如下：

$$accum = accum * momentum + grad$$

$$w = w - lr * accum$$

其中，$w$ 是待更新的参数，$grad$ 是更新到 $w$ 的梯度，accum 是 grad 的累加 tensor。

构造方法参数列表：

名称	含义
net	需要更新权重的网络
learningRate	学习率，支持 Float32 的数字（>= 0.0）和非 Parameter 类型的 Tensor（必须为 1 维，且每个元素都>=0），Tensor 的 dtype 必须为 Float32。数字表示静态学习率，Tensor 表示动态学习率
momentum	动量，Float32 类型（>= 0.0），用于更新梯度。
weightDecay	权重衰减，Float32 类型（>= 0.0）。默认值：0.0
lossScale	loss 缩放，Float32 类型，大于 0.0，默认值：1.0

支持平台：GPU

from CangjieTB import common.*
from CangjieTB import ops.*
from CangjieTB import nn.*
from CangjieTB import nn.optim.*
from CangjieTB import nn.loss.*
from CangjieTB import nn.layers.*
from CangjieTB import context.*
from CangjieTB import macros.*

@OptDifferentiable
public struct Net {
    let dense1: Dense

    init(dense1_: Dense) {
        dense1 = dense1_
    }

    @Differentiable
    operator func ()(input: Tensor): Tensor {
        input |> this.dense1
    }
}

@Differentiable[except: [lossFn, input, label]]
func train(net: Net, lossFn: SoftmaxCrossEntropyWithLogits, input: Tensor, label: Tensor): Tensor
{
    var output = net(input)
    var lossTensor = lossFn(output, label)
    return lossTensor
}

func gradient(net: Net, lossFn: SoftmaxCrossEntropyWithLogits, input: Tensor, label: Tensor): (Tensor, Net)
{
    var adj = @AdjointOf(train)
    var (loss, bp) = adj(net, lossFn, input, label)
    var gradout = bp(Tensor(Float32(1.0)))
    return (loss, gradout)
}

main(): Int64
{
    let weight_init = initialize(Array<Int64>([16,2048]), initType:InitType.NORMAL, dtype:FLOAT32)
    let dense = Dense(2048, 16, weight_init,has_bias: true)
    let net = Net(dense)
    var lrArrary: Array<Float32> = Array<Float32>([0.01, 0.008])
    var learningRate = Tensor(lrArrary, shape: [lrArrary.size])
    let momentum: Float32 = 0.9
    var optim = MomentumOptimizer<Net>(net, learningRate, momentum)   

    let input = parameter(initialize(Array<Int64>([16,2048]), initType:InitType.NORMAL,dtype:FLOAT32), "data")
    let label = parameter(initialize(Array<Int64>([16,16]),initType:InitType.NORMAL,dtype:FLOAT32), "lable")
    var lossFn = SoftmaxCrossEntropyWithLogits(sparse: false)
    var num = 0
    var epoch = 1
    while (num < epoch) {
        var (loss, gradout) = gradient(net,lossFn, input, label)
        optim.update(gradout)  
        print(loss.getShape())
        print("\n")
        print(net.dense1.weight_.getShape())
        num++
    }
    return 0
}

输出为：

[]
[16, 2048]

SGD 优化器

public class SGDOptimizer<T0> <: BaseOptim<T0> where T0 <: OptDiff {
    public init(net: T0, learningRate: Tensor, momentum!: Float32 = 0.0, dampening!: Float32 = 0.0, weightDecay!: Float32 = 0.0, nesterov!: Bool = false, lossScale!: Float32 = 1.0)

    public init(net: T0, learningRate!: Float32 = 0.1, momentum!: Float32 = 0.0, dampening!: Float32 = 0.0, weightDecay!: Float32 = 0.0, nesterov!: Bool = false, lossScale!: Float32 = 1.0)
}

使用的算子：

public func sgd(parameters: Tensor, gradient: Tensor, learningRate: Tensor, accum: Tensor, momentum: Tensor, stat: Tensor, dampening!: Float32 = 0.0, weightDecay!: Float32 = 0.0, nesterov!: Bool = false): Tensor

SGD 算法，实现随机梯度下降。momentum 是可选的。Nesterov 动量基于论文 On the importance of initialization and momentum in deep learning 中的公式：

$$v_{t+1} = u * v_t + gradient * (1 - dampening)$$

如果 nesterov 为 true：

$$p_{t+1} = p_t - lr * (gradient + u * v_{t+1})$$

如果 nesterov 为 false：

$$p_{t+1} = p_t - lr * v_{t+1}$$

在第一次执行时：

$$v_{t+1} = gradient$$

其中 p、v 和 u 分别表示 parameters、accum 和 momentum。

构造方法参数列表：

名称	含义
net	需要更新权重的网络
learningRate	学习率，支持 Float32 的数字（>= 0.0）和非 Parameter 类型的 Tensor（必须为 1 维，且每个元素都>=0），Tensor 的 dtype 必须为 Float32。数字表示静态学习率，Tensor 表示动态学习率
momentum	动量，Float32 类型（>= 0.0），用于更新梯度。默认值：0.0。
dampening	动量阻尼，Float32 类型（>= 0.0）。默认值：0.0。
weightDecay	权重衰减，Float32 类型（>= 0.0）。默认值：0.0。
nesterov	是否启用 Nesterov 动量，如果启用，dampening 必须为 0.0，Bool 类型。默认值：false。
lossScale	loss 缩放，Float32 类型，大于 0.0，默认值：1.0

支持平台：Ascend、GPU

from CangjieTB import common.*
from CangjieTB import ops.*
from CangjieTB import nn.*
from CangjieTB import nn.optim.*
from CangjieTB import nn.loss.*
from CangjieTB import nn.layers.*
from CangjieTB import context.*
from CangjieTB import macros.*

@OptDifferentiable
public struct Net {
    let dense1: Dense

    init(dense1_: Dense) {
        dense1 = dense1_
    }

    @Differentiable
    operator func ()(input: Tensor): Tensor {
        input |> this.dense1
    }
}

@Differentiable[except: [lossFn, input, label]]
func train(net: Net, lossFn: SoftmaxCrossEntropyWithLogits, input: Tensor, label: Tensor): Tensor
{
    var output = net(input)
    var lossTensor = lossFn(output, label)
    return lossTensor
}

func gradient(net: Net, lossFn: SoftmaxCrossEntropyWithLogits, input: Tensor, label: Tensor): (Tensor, Net)
{
    var adj = @AdjointOf(train)
    var (loss, bp) = adj(net, lossFn, input, label)
    var gradout = bp(Tensor(Float32(1.0)))
    return (loss, gradout)
}

main(): Int64
{
    let weight_init = initialize(Array<Int64>([16,2048]), initType:InitType.NORMAL, dtype:FLOAT32)
    let dense = Dense(2048, 16, weight_init,has_bias: true)
    let net = Net(dense)
    
    var lrArrary: Array<Float32> = Array<Float32>([0.01, 0.008])
    var learningRate = Tensor(lrArrary, shape: [lrArrary.size])
    var optim = SGDOptimizer<Net>(net, learningRate)   

    let input = parameter(initialize(Array<Int64>([16,2048]), initType:InitType.NORMAL,dtype:FLOAT32), "data")
    let label = parameter(initialize(Array<Int64>([16,16]),initType:InitType.NORMAL,dtype:FLOAT32), "lable")
    var lossFn = SoftmaxCrossEntropyWithLogits(sparse: false)
    var num = 0
    var epoch = 1
    while (num < epoch) {
        var (loss, gradout) = gradient(net,lossFn, input, label)
        optim.update(gradout)  
        print(loss.getShape())
        print("\n")
        print(net.dense1.weight_.getShape())
        num++
    }
    return 0
}

输出为：

[]
[16, 2048]

layer

仓颉 TensorBoost 的 Layer 使用 struct 表示，用户只要导入 nn.layers package (from CangjieTB import nn.layers.*) 即可直接使用仓颉 TensorBoost 已经封装好的 Layer 结构。

2D 卷积层

Layer 定义：

public struct Conv2d {
    public let weight_: Tensor
    public let bias_: Tensor
    public var in_channels_: Int64 = 0
    public var out_channels_: Int64 = 0
    public var kernel_sizes_: Array<Int64> = []
    public var strides_: Array<Int64> = []
    public var pad_mode_: String = ""
    public var padding_: Array<Int64> = [0, 0, 0, 0]
    public var dilations_: Array<Int64> = []
    public var group_: Int64 = 0
    public var has_bias_: Bool = false

    public init(in_channels: Int64, out_channels: Int64, kernel_size: Array<Int64>, stride: Array<Int64>, pad_mode: String, padding: Array<Int64>, dilation: Array<Int64>, group: Int64, has_bias: Bool, weight: Tensor, bias: Tensor)

    public init(in_channels: Int64, out_channels: Int64, kernel_size: Array<Int64>, stride!: Array<Int64> = Array<Int64>([1, 1]), pad_mode!: String = "same", padding!: Array<Int64> = Array<Int64>([0, 0, 0, 0]), dilation!: Array<Int64> = Array<Int64>([1, 1]), group!: Int64 = 1,has_bias!: Bool = false, weight_init!: InitType = InitType.NORMAL, bias_init!: InitType = InitType.ZERO)

    public  init(in_channels: Int64, out_channels: Int64, kernel_size: Array<Int64>, weight_init: BaseInitializer, bias_init: BaseInitializer, stride!: Array<Int64> = Array<Int64>([1, 1]), pad_mode!: String = "same", padding!: Array<Int64> = Array<Int64>([0, 0, 0, 0]),dilation!: Array<Int64> = Array<Int64>([1, 1]), group!: Int64 = 1, has_bias!: Bool = false)

    public init(in_channels: Int64, out_channels: Int64, kernel_size: Array<Int64>, weight_init: Tensor, bias_init: Tensor, stride!: Array<Int64> = Array<Int64>([1, 1]), pad_mode!: String = "same", padding!: Array<Int64> = Array<Int64>([0, 0, 0, 0]), dilation!: Array<Int64> = Array<Int64>([1, 1]), group!: Int64 = 1, has_bias!: Bool = false)
}

extend Conv2d <: CellUnary {
    @Differentiable
    public operator func ()(input: Tensor): Tensor
}

Conv2d 是 2D 卷积层。输入 tensor 的 shape 是 (N, C_in, H_in, W_in)，其中 N 是 batch_size，C_in 是 input channel number。对于每一个 batch 的数据 (C_in, H_in, W_in)，Conv2d 卷积层按照如下的公式进行计算。

$$out_j = \sum_{i=0}^{C_{in}-1} ccor(W_{ij}, X_i) + b_j$$

其中 ccor 是互相关运算符，$C_{in}$ 为输入通道数，$j$ 的取值范围为 $0$ 到 $C_{out} - 1$，$W_{ij}$ 对应第 j 个 filter 的第 i 个通道，$out_{j}$ 对应输出的第 j 个通道。$W_{ij}$ 的 shape 为 (kernel_size[0], kernel_size[1])，其中 kernel_size[0] 和 kernel_size[1] 是卷积核的高度和宽度。完整卷积核的 shape 为 ($C_{out}$, $C_{in}$ // group, kernel_size[0], kernel_size[1])，其中 group 代表 input 在 channel 维度上被分组的数目。

当 pad_mode 为 valid 时，输出的高和宽分别为：

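$$\left\lfloor 1 + \frac{H_{in} + padding[0] + padding[1] - kernel\_size[0] - (kernel\_size[0] - 1) \times (dilation[0] - 1)}{stride[0]} \right\rfloor$$

和

$$\left\lfloor 1 + \frac{W_{in} + padding[2] + padding[3] - kernel\_size[1] - (kernel\_size[1] - 1) \times (dilation[1] - 1)}{stride[1]} \right\rfloor$$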

参数列表：

参数名称	含义
in_channels	输入通道数，支持 Int64 类型，必须是大于 0 的数值。
out_channels	输出通道数，支持 Int64 类型，必须是大于 0 的数值。
kernel_size	滑窗大小，支持 Int64 类型 Array。长度为 2 的数组，值必须小于输入 shape 的宽和高并且大于 0。
stride	滑窗跳步，支持 Int64 类型 Array。长度为 2 的数组，值必须大于 0。默认值：Array([1, 1])
pad_mode	padding 模式，String 类型。可选值为："same"、"valid"、"pad"。默认值："same"。
same：采用补全的方式。输出的高度和宽度将与输入 x 相同。padding 的总数将在水平和垂直方向计算，并尽可能均匀地分布在顶部和底部，左侧和右侧。否则，最后一个额外的填充将从底部和右侧完成。如果设置了此模式，则 padding 必须为 Array([0, 0, 0, 0])。
valid：采用丢弃方式。输出的可能最大高度和宽度将在没有填充的情况下返回。多余的像素将被丢弃。如果设置了此模式，则 padding 必须为 Array([0, 0, 0, 0])。
pad：输入 x 两侧的隐式填充。填充的数量将被填充到输入张量边界。 padding 数组的元素必须大于或等于 0。
padding	输入的隐含 padding，支持 Int64 类型 Array。默认值：Array([0, 0, 0, 0])。
dilation	内核元素之间的间距，支持 Int64 类型 Array。默认值：Array([1, 1])。
group	权重分组数，支持 Int64 类型。默认值：1。
has_bias	是否有 bias，Bool 类型。默认值：false。
weight_init	权重初始化方式，支持三种方式：
1）支持 InitType 方式。可选值：ZERO、ONE、NORMAL。默认值：NORMAL。
2）BaseInitializer 随机初始化方式（RandomNormalInitializer，XavierUniformInitializer）
3）支持 FLOAT32 的 Tensor 初始化方式，非 Parameter 类型的 Tensor。
bias_init	bias 初始化方式，支持初始化方式同 weight_init

输入：

输入名称	含义
input	shape 是 (N, $C_{in}$, $H_{in}$, $W_{in}$) 的 Tensor，dtype 为 Float32

输出：

名称	含义
output	shape 是 (N, $C_{out}$, $H_{out}$, $W_{out}$) 的 Tensor，dtype 为 Float32

支持平台：Ascend、GPU、CPU

代码示例，权重初始化采用默认值，其他初始化方式参考 Tensor 的 API 的章节：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([32, 1, 32, 32]))
    let conv = Conv2d(1, 6, Array<Int64>([5, 5]))
    let output = conv(input)
    print(output.getShape())
    return 0
}

输出为：

[32, 6, 32, 32]

3D 卷积层

Layer 定义：

public struct Conv3d {
    public var inChannels_: Int64
    public var outChannels_: Int64
    public var kernelSizes_: Array<Int64>
    public var strides_: Array<Int64>
    public var padMode_: String
    public var padding_: Array<Int64>
    public var dilations_: Array<Int64>
    public var group_: Int64
    public var hasBias_: Bool
    public let weight_: Tensor
    public let bias_: Tensor

    public init(inChannels: Int64, outChannels: Int64, kernelSizes: Array<Int64>, strides: Array<Int64>, padMode: String, padding: Array<Int64>, dilations: Array<Int64>, group: Int64, hasBias: Bool, weightInit: Tensor, biasInit: Tensor)

    public init(inChannels: Int64, outChannels: Int64, kernelSizes!: Array<Int64> = [1, 1, 1], strides!: Array<Int64> = [1, 1, 1], padMode!: String = "same", padding!: Array<Int64> = [0, 0, 0, 0, 0, 0], dilations!: Array<Int64> = [1, 1, 1], group!: Int64 = 1, hasBias!: Bool = false, weightInit!: InitType = InitType.NORMAL, biasInit!: InitType = InitType.ZERO)

    public init (inChannels: Int64, outChannels: Int64, weightInit: BaseInitializer, biasInit: BaseInitializer, kernelSizes!: Array<Int64> = [1, 1, 1], strides!: Array<Int64> = [1, 1, 1], padMode!: String = "same", padding!: Array<Int64> = [0, 0, 0, 0, 0, 0], dilations!: Array<Int64> = [1, 1, 1], group!: Int64 = 1, hasBias!: Bool = false)

    public init (inChannels: Int64, outChannels: Int64, kernelSizes: Array<Int64>, strides: Array<Int64>, padMode: String, padding: Array<Int64>, dilations: Array<Int64>, group: Int64, hasBias: Bool, weightInit: Tensor, biasInit: Tensor)

}

extend Conv3d <: CellUnary {
    @Differentiable
    public operator func ()(input: Tensor): Tensor
}

Conv3d 是 3D 卷积层。输入 tensor 的 shape 是 (N, Cin, Din, Hin, Win)，其中 N 是 batchSize，Cin 是 input channel number。对于每一个 batch 的数据 (Cin, Din, Hin, Win)，Conv3d 卷积层按照如下的公式进行计算。

$$out(N_i, C_{out_j}) = bias(C_{out_j}) + \sum_{k=0}^{C_{in}-1} ccor(weight(C_{out_j}, k), X(N_i, k))$$

其中 ccor 是互相关运算符，$C_{in}$ 为输入通道数，$out_j$ 对应输出的第 $j$ 个通道，$j$ 的取值范围在 [$0$, $C_{out} - 1$] 内，$weight(C_{out_j}, k)$ 是 shape 为 (kernel_size[0], kernel_size[1], kernel_size[2]) 的卷积核切片，其中 kernel_size[0]、kernel_size[1]、kernel_size[2] 是卷积核的深度、高度和宽度。bias 为偏置参数，X 为输入 Tensor，完整卷积核的 shape 为 ($C_{out}$, $C_{in}$ // group, kernel_size[0], kernel_size[1], kernel_size[2])，其中 group 代表 X 在 channel 维度上被分组的数目。

当 padMode 为 valid 时，输出的深度、高度和宽度分别为：

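$$D_{out} = \left\lfloor 1 + \frac{D_{in} + padding[0] + padding[1] - (dilation[0] - 1) \times kernelSize[0] - 1}{stride[0]} \right\rfloor$$

$$H_{out} = \left\lfloor 1 + \frac{H_{in} + padding[2] + padding[3] - (dilation[1] - 1) \times kernelSize[1] - 1}{stride[1]} \right\rfloor$$

$$W_{out} = \left\lfloor 1 + \frac{W_{in} + padding[4] + padding[5] - (dilation[2] - 1) \times kernelSize[2] - 1}{stride[2]} \right\rfloor$$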

参数列表：

参数名称	含义
inChannels	输入通道数，支持 Int64 类型，必须是大于 0 的数值。
outChannels	输出通道数，支持 Int64 类型，必须是大于 0 的数值。
kernelSizes	滑窗大小，支持 Int64 类型 Array。长度为 3 的数组，值必须大于 0 且不大于 [Din, Hin, Win], 默认值：[1, 1, 1]。
strides	滑窗跳步，支持 Int64 类型 Array。长度为 3 的数组，取值必须大于 0。默认值：[1, 1, 1]
padMode	padding 模式，String 类型。可选值为："same"、"valid"、"pad"。默认值："same"。
same：采用补全的方式。输出的深度、高度和宽度分别与输入整除 stride 后的值相同。如果设置了此模式，则 padding 数组的元素必须等于 0。
valid：采用丢弃方式。输出的可能最大深度、高度和宽度将在没有填充的情况下返回。多余的像素将被丢弃。如果设置了此模式，则 padding 数组的元素必须等于 0。
pad：在输入的深度、高度和宽度方向上填充 padding 大小的 0。如果设置了此模式，则 padding 数组的元素必须大于或等于 0，且 padding[0] 和 padding[1] 小于 `(kernelSizes[0] - 1) * dilations[0] + 1`；padding[2] 和 padding[3] 小于 `(kernelSizes[1] - 1) * dilations[1] + 1`；padding[4] 和 padding[5] 小于 `(kernelSizes[2] - 1) * dilations[2] + 1`。
padding	输入的深度、高度和宽度方向上填充的数量, 支持 Int64 类型 Array。默认值：[0, 0, 0, 0, 0, 0]。
dilations	卷积核元素之间的间距，长度是 3 的数组, 支持 Int64 类型 Array。第 1 维仅支持取 1, 其余值需要大于等于 1。默认值：[1, 1, 1]。
group	权重分组数，支持 Int64 类型, 当前仅支持 1。默认值：1。
hasBias	是否有 bias，Bool 类型。默认值：false。
weightInit	权重初始化方式，支持三种方式：
1）支持 InitType 方式。可选值：ZERO、ONE、NORMAL。默认值：NORMAL。
2）BaseInitializer 随机初始化方式（RandomNormalInitializer，XavierUniformInitializer）
3）支持 FLOAT32 的 Tensor 初始化方式，非 Parameter 类型的 Tensor, shape 为 [outChannels, inChannels / group, kernelSizes[0], kernelSizes[1], kernelSizes[2]]。
biasInit	bias 初始化方式，支持初始化方式同 weightInit, 当使用 Tensor 初始化时, 支持 FLOAT32 类型, shape 为 [outChannels]。

输入：

输入名称	含义
input	shape 是 (N, $C_{in}$, $D_{in}$, $H_{in}$, $W_{in}$) 的 Tensor，支持 Float32 数据类型

输出：

名称	含义
output	shape 是 (N, $C_{out}$, $D_{out}$, $H_{out}$, $W_{out}$) 的 Tensor，数据类型与输入一致

支持平台：Ascend、GPU、CPU

代码示例，权重初始化采用默认值，其他初始化方式参考 Tensor 的 API 的章节：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([32, 1, 32, 32, 32]))
    let conv = Conv3d(1, 6, kernelSizes: [1, 1, 1])
    let output = conv(input)
    print(output.getShape())
    return 0
}

输出为：

[32, 6, 32, 32, 32]

BatchNorm2d 层

Layer 定义：

public struct BatchNorm2d {
    public let moving_mean_: Tensor
    public let moving_variance_: Tensor
    public let gamma_: Tensor
    public let beta_: Tensor
    public var num_features_: Int64
    public var eps_: Float32
    public var momentum_: Float32 = 0.0
    public var affine_: Bool = false
    public var use_batch_statistics_: Int64 = -1
    public var isTraining_: Bool = true

    public init(num_features__: Int64, eps__: Float32, momentum__: Float32, affine__: Bool, use_batch_statistics__: Int64, moving_mean__: Tensor, moving_variance__: Tensor, gamma__: Tensor, beta__: Tensor, isTraining__: Bool)

    public init(num_features: Int64, eps!: Float32 = 1e-5, momentum!: Float32 = 0.9, affine!: Bool = true, gamma_init!: InitType = InitType.ONE, beta_init!: InitType = InitType.ZERO, moving_mean_init!: InitType = InitType.ZERO, moving_var_init!: InitType = InitType.ONE, use_batch_statistics!: Int64 = -1, isTraining!: Bool = true)

    public mut func setTrain(mode: Bool)
}

extend BatchNorm2d <: CellUnary {
    public operator func ()(input: Tensor): Tensor
}

4D 输入上的批量归一化层。仓颉 TensorBoost 只支持 ‘NCHW’ 数据格式。

批量归一化广泛用于卷积网络中。它使用小批量（mini-batch）数据和学习到的参数对特征进行重新缩放和中心化，具体可参考如下公式。

$$y = \frac{x - E[x]}{\sqrt{Var[x] + \epsilon}} * \gamma + \beta$$

参数列表：

参数名称	含义
num_features	输入（N，C，H，W）中的 C
eps	分母中增加的值，以保证数值稳定性。默认值：1e-5
momentum	用于计算 running_mean 和 running_var 的动量，浮点型超参数。默认值：0.9
affine	设置为 true 时，可以学习 gamma 和 beta。默认值：true
gamma_init	gamma 权重的初始化。包括 ZERO，ONE 等。默认值：ONE
beta_init	beta 权重的初始化。包括 ZERO，ONE 等。默认值：ZERO
moving_mean_init	移动平均值的初始化。包括 ZERO，ONE 等。默认值：ZERO
moving_var_init	移动方差的初始化。包括 ZERO，ONE 等。默认值：ONE
use_batch_statistics	取 1 时，则使用当前批处理数据的平均值和方差值，并跟踪运行平均值和运行方差；取 0 时，则使用指定值的平均值和方差值，不跟踪统计值；取 -1 时，当 isTraining 为 true 时，则使用当前批处理数据的平均值和方差值，并跟踪运行平均值和运行方差，当 isTraining 为 false 时，则使用指定值的平均值和方差值，不跟踪统计值。必须为以上三种取值，默认值 -1。
isTraining	仅在 use_batch_statistics 为 -1 时有效，默认值 true ，可通过成员函数 setTrain(mode: Bool) 来设置。

输入：

输入名称	含义
input	shape 是 (N, C, H, W) 的 Tensor，支持 Float16/Float32 类型。

输出：

名称	含义
output	shape 是 (N, C, H, W) 的 Tensor

支持平台：GPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([1, 3, 224, 224]))
    let bn = BatchNorm2d(3)
    let output = bn(input)
    print(output.getShape())
    return 0
}

输出为：

[1, 3, 224, 224]

BatchNorm3d 层

Layer 定义：

public struct BatchNorm3d {
    public let movingMean_: Tensor
    public let movingVariance_: Tensor
    public let gamma_: Tensor
    public let beta_: Tensor
    public var numFeatures_: Int64
    public var eps_: Float32 = 1e-5
    public var momentum_: Float32 = 0.0
    public var affine_: Bool = false
    public var useBatchStatistics_: Int64 = 0
    public var isTraining_: Bool = true

    public init(numFeatures: Int64, eps!: Float32 = 1e-5, momentum!: Float32 = 0.9, affine!: Bool = true, gammaInit!: InitType = InitType.ONE, betaInit!: InitType = InitType.ZERO, movingMeanInit!: InitType = InitType.ZERO, movingVarianceInit!: InitType = InitType.ONE, useBatchStatistics!: Int64 = -1, isTraining!: Bool = true)

    public mut func setTrain(mode: Bool)
}

extend BatchNorm3d <: CellUnary {
    public operator func ()(input: Tensor): Tensor
}

5D 输入上的批量归一化层。仓颉 TensorBoost 只支持 ‘NCDHW’ 数据格式。

批量归一化广泛用于卷积网络中。它使用小批量（mini-batch）数据和学习到的参数对特征进行重新缩放和中心化，具体可参考如下公式。

$$y = \frac{x - E[x]}{\sqrt{Var[x] + \epsilon}} * \gamma + \beta$$

参数列表：

参数名称	含义
numFeatures	输入（N，C，D, H，W）中的 C
eps	分母中增加的值，以保证数值稳定性。默认值：1e-5
momentum	用于计算 running_mean 和 running_var 的动量，浮点型超参数。默认值：0.9
affine	设置为 true 时，可以学习 gamma 和 beta。默认值：true
gammaInit	gamma 权重的初始化。包括 ZERO，ONE 等。默认值：ONE
betaInit	beta 权重的初始化。包括 ZERO，ONE 等。默认值：ZERO
movingMeanInit	移动平均值的初始化。包括 ZERO，ONE 等。默认值：ZERO
movingVarianceInit	移动方差的初始化。包括 ZERO，ONE 等。默认值：ONE
useBatchStatistics	取 1 时，则使用当前批处理数据的平均值和方差值，并跟踪运行平均值和运行方差；取 0 时，则使用指定值的平均值和方差值，不跟踪统计值；取 -1 时，当 isTraining 为 true 时，则使用当前批处理数据的平均值和方差值，并跟踪运行平均值和运行方差，当 isTraining 为 false 时，则使用指定值的平均值和方差值，不跟踪统计值。必须为以上三种取值，默认值 -1。
isTraining	仅在 useBatchStatistics 为 -1 时有效，默认值 true ，可通过成员函数 setTrain(mode: Bool) 来设置。

输入：

输入名称	含义
input	shape 是 (N, C, D, H, W) 的 Tensor，支持 Float16/Float32 类型。

输出：

名称	含义
output	shape 是 (N, C, D, H, W) 的 Tensor

支持平台：GPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*
from CangjieTB import common.*

main(): Int64 {
    let net = BatchNorm3d(1)
    let inputX = Tensor(Array<Float32>([1.5, -1.0, -0.5, 2.5]), shape: Array<Int64>([1, 1, 2, 1, 2]))
    let output = net(inputX)
    print(output)
    return 0
}

输出为：

Tensor(shape=[1, 1, 2, 1, 2], dtype=Float32, value=
[[[[[ 6.11591339e-01 -1.13581240e+00]]
   [[-7.86331654e-01  1.31055284e+00]]]]])

Dense 层

Layer 定义：

public struct Dense {
    public let weight_: Tensor
    public let bias_: Tensor
    public var has_bias_: Bool
    public var activation_: String

    public init()

    public init(has_bias: Bool, activation: String, weight: Tensor, bias: Tensor)

    public init(in_channels: Int64, out_channels: Int64, weight_init!: InitType = InitType.NORMAL, bias_init!: InitType = InitType.ZERO,has_bias!: Bool = true, activation!: String = "NOACT")

    public init(in_channels: Int64, out_channels: Int64, weight_init: BaseInitializer, bias_init!: InitType = InitType.ZERO,has_bias!: Bool = true, activation!: String = "NOACT")

    public init(in_channels: Int64, out_channels: Int64, weight_init: Tensor, bias_init!: InitType = InitType.ZERO, has_bias!: Bool = true, activation!: String = "NOACT")
}

extend Dense <: CellUnary {
    public operator func ()(input: Tensor): Tensor
}

Dense 是全连接层。Dense 全连接层按照如下的公式进行计算。

$$outputs = activation(inputs * kernel + bias)$$

其中 activation 是 Dense 层指定的激活函数（目前支持的激活函数包括 “RELU”, “RELU6”, “TANH”, “GELU”, “SIGMOID” 和 “SWISH”），kernel 是 Dense 层的权重矩阵，bias 是 Dense 层的偏置向量。

后续计划支持的其他激活函数列表: ‘softmax’: Softmax, ‘logsoftmax’: LogSoftmax, ‘fast_gelu’: FastGelu, ‘elu’: ELU, ‘prelu’: PReLU, ‘leakyrelu’: LeakyReLU, ‘hardswish’: HSwish, ‘hsigmoid’: HSigmoid, ‘logsigmoid’: LogSigmoid

参数列表：

名称	含义
in_channels	输入通道数，支持 Int64 类型，必须是大于 0 的数值。
out_channels	输出通道数，支持 Int64 类型，必须是大于 0 的数值。
weight_init	权重初始化方式，支持三种方式：
1）支持 InitType 方式。可选值：ZERO、ONE、NORMAL。默认值：NORMAL。
2）BaseInitializer 随机初始化方式（RandomNormalInitializer，XavierUniformInitializer）
3）支持 FLOAT32 的 Tensor 初始化方式, 非 Parameter 类型的 Tensor
bias_init	bias 初始化方式，支持初始化方式同 weight_init
has_bias	是否有 bias。Bool 类型，默认值：true。
activation	Dense 层指定的激活函数，使用 String 类型表示。目前支持 "NOACT"、"RELU"、"RELU6"、"TANH"、"GELU"、"SIGMOID" 和 "SWISH"，后续会补充更多激活函数。默认值："NOACT"。

【备注】“SWISH” 按照如下的公式进行计算：

$$y = \frac{x}{e^{-x} + 1.0}$$

输入：

名称	含义
input	输入的 Tensor，必须大于或等于 2 维。

输出：

支持平台：Ascend、GPU、CPU

代码示例，权重初始化采用默认值，其他初始化方式参考 Tensor 的 API 的章节：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([9, 8, 32]))
    let dense = Dense(32, 16, has_bias: true, activation: "SWISH")
    let output = dense(input)
    print(output.getShape())
    return 0
}

输出为：

[9, 8, 16]

Dropout 层

Layer 定义：

public struct Dropout {
    public var keepProb_: Float32
    public var seed0_: Int64
    public var seed1_: Int64
    public var training_: Bool = true

    public init(keepProb!: Float32 = 0.5, training!: Bool = true)
}

extend Dropout <: CellUnary {
    public operator func ()(input: Tensor): Tensor
}

Dropout 是一种正则化手段，根据丢弃概率 1 − keepProb，在训练过程中随机将一些神经元输出设置为 0.0，通过阻止神经元节点间的相关性来减少过拟合，在推理过程中，此层返回与 input 相同的 Tensor。

参数列表：

参数名称	含义
keepProb	输入神经元保留率，取值在 (0.0, 1.0] 之间，默认为 0.5
training	是否处于 training 状态。当 Dropout Layer 处于 training 状态时会对输入进行随机的 drop，否则对输入不做处理，默认为 true

输入：

输入名称	含义
input	Float32 类型的 Tensor

输出：

名称	含义
output	Dropout 操作之后的结果，Float32 类型的 Tensor，shape 和输入相同

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([2, 8]))
    let dropoutLayer = Dropout(keepProb: 0.2)
    let output = dropoutLayer(input)
    print(output.getShape())
    return 0
}

输出为：

[2, 8]

Embedding 层

Layer 定义：

public struct Embedding {
    public let embeddingTable: Tensor
    public var vocabSize_: Int64 = 0
    public var embeddingSize_: Int64 = 0
    public var useOneHot_: Bool = false

    public init(vocabSize__!: Int64 = 0, embeddingSize__!: Int64 = 0, useOneHot__!: Bool = false, embeddingTable_!: Tensor = Tensor())

    public init(vocabSize: Int64, embeddingSize: Int64, initType: BaseInitializer, useOneHot!: Bool = false, paddingIdx!: Option<Int64> = None)

    public init(vocabSize: Int64, embeddingSize: Int64, initType!: InitType = InitType.NORMAL, useOneHot!: Bool = false, paddingIdx!: Option<Int64> = None)
}

extend Embedding <: CellUnary {
    public operator func ()(inputIds: Tensor): Tensor
}

Embedding Layer 是一个简单的查表层，记录了固定大小的嵌入向量。

该层常用于记录词向量，用索引来查词。layer 的输入是一系列的索引值，输出为索引值对应的词向量。

参数列表：

参数名称	含义
vocabSize	Int64 类型，嵌入向量的词库大小
embeddingSize	Int64 类型，每个嵌入向量的大小
initType	BaseInitializer 类型或 InitType 类型，layer 中嵌入向量的初始化方式，默认为正态分布。
useOneHot	Bool 类型，表明是否使用 One Hot 的方式来进行初始化，默认为 false
paddingIdx	Option<Int64> 类型，如果非 None，则表明该索引对应的是填充（padding）符号，其嵌入向量用全 0 的向量来表示。如果为 None 则无此特性

输入：

输入名称	含义
inputs	输入的索引 Tensor，shape 为(batchSize, inputLength)，dtype 是 Int32，索引值应在(0, vocabSize) 之间，否则索引所对应的输出向量为 0 向量。

输出：

名称	含义
outputs	嵌入向量 Tensor，shape 为(batchSize, inputLength, embeddingSize)，dtype 是 FLOAT32

支持平台：Ascend、GPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*
from CangjieTB import common.*

main(): Int64 {
    let indexs: Array<Int32> = Array<Int32>([4, 7, 5, 0, 3])
    let inputIds = parameter(Tensor(indexs, shape: Array<Int64>([5])), "inputIds")
    let embeddingSize: Int64 = 4
    let vocabSize: Int64 = 8
    let embeddingLayer = Embedding(vocabSize,
                                   embeddingSize,
                                   initType: InitType.XAVIERUNIFORM,
                                   useOneHot: false)
    let forwardOutput = embeddingLayer(inputIds)
    print(forwardOutput.getShape())
    return 0
}

输出为：

[5, 4]

Flatten 层

Layer 定义：

public struct Flatten {}

extend Flatten <: CellUnary {
    public operator func ()(input: Tensor): Tensor
}

Flatten 是数据展平层，按如下公式进行计算。

$$ \text{input shape}: (N, *dims) \\ \text{output shape}: (N, \prod dims) $$

输入：

名称	含义
input	shape 是 $(N, *dims)$ 的 Tensor

输出：

名称	含义
output	shape 是 $(N, \prod dims)$ 的 Tensor

支持平台：Ascend、GPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([32, 1, 32, 32]))
    let flatten= Flatten()
    let output = flatten(input)
    print(output.getShape())
    return 0
}

输出为：

[32, 1024]

LayerNorm 层

Layer 定义：

// LayerNorm layer: normalizes the input starting from a given axis, then scales
// the result by gamma and shifts it by beta.
public struct LayerNorm {
    // Trainable scale applied to the normalized output.
    public let gamma_: Tensor
    // Trainable shift applied to the normalized output.
    public let beta_: Tensor
    // First axis of the input to normalize over; documented range is [-1, rank(input)).
    public var beginNormAxis_: Int64 = -1
    // First axis the parameters apply to; documented range is [-1, rank(input)).
    public var beginParamsAxis_: Int64 = -1
    // Small constant added for numerical stability.
    public var epsilon_: Float32 = 1e-7

    // Builds the layer from explicit gamma / beta Tensors.
    // NOTE(review): the double-underscore parameter names presumably map one-to-one
    // onto the stored fields above; confirm against the implementation.
    public init(beginNormAxis__!: Int64 = -1, beginParamsAxis__!: Int64 = -1, epsilon__!: Float32 = 1e-7, gamma__!: Tensor = Tensor(), beta__!: Tensor = Tensor())

    // Builds the layer by initializing gamma and beta with shape normalizedShape,
    // using gammaInit (default InitType.ONE) and betaInit (default InitType.ZERO).
    public init(normalizedShape: Array<Int64>, beginNormAxis!: Int64 = -1, beginParamsAxis!: Int64 = -1, gammaInit!: InitType = InitType.ONE, betaInit!: InitType = InitType.ZERO, epsilon!: Float32 = 1e-7)
}

// Makes LayerNorm callable as a unary cell.
extend LayerNorm <: CellUnary {
    // input: Float32 Tensor. Returns a Tensor with the same shape and dtype as input.
    public operator func ()(input: Tensor): Tensor
}

根据给定的轴对输入的 Tensor 进行归一化。计算公式如下：

$$ y = \frac{x - mean}{\sqrt{variance + \epsilon}} * \gamma + \beta $$

其中，$\gamma$是$gamma$，$\beta$是$beta$，$\epsilon$是$epsilon$。

输入：

输入名称	含义
input	输入 Tensor，目前只支持 Float32 类型。

参数列表：

参数名称	含义
normalizedShape	Array<Int64> 类型，指定 gamma 和 beta 初始化时使用的 shape。
gammaInit	gamma 参数的初始化方式。gamma 可跟随网络进行训练，对输出结果进行缩放，支持 InitType 初始化方式，默认值是 InitType.ONE。
betaInit	beta 参数的初始化方式。beta 可跟随网络进行训练，对输出结果进行平移，支持 InitType 初始化方式，默认值是 InitType.ZERO。
gamma	gamma 参数初始化的 Tensor，可跟随网络进行训练，对输出结果进行缩放。
beta	beta 参数初始化的 Tensor，可跟随网络进行训练，对输出结果进行平移。
beginNormAxis	对输入开始执行归一化的维度，value 必须在 [-1, rank(input)) 区间中，默认值为 -1。
beginParamsAxis	对参数开始执行归一化的维度，value 必须在 [-1, rank(input)) 区间中，默认值为 -1。
epsilon	为了数值计算稳定而加入的小常数，默认值为 1e-7（即 0.0000001）。

输出：

输出名称	含义
output	layernorm 计算的结果 Tensor. 与输入 Tensor 具有相同的 shape 和 dtype

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import common.*
from CangjieTB import ops.*
from CangjieTB import nn.layers.*

// Example: apply LayerNorm to a random [2, 2, 3] Tensor and print the output shape.
main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([2, 2, 3]))
    // gamma and beta are initialized with shape [2, 3], matching the last two
    // dimensions of the input.
    let normalizedShape = Array<Int64>([2, 3])
    let layerNorm = LayerNorm(normalizedShape, beginNormAxis: 1, beginParamsAxis: 1, epsilon: 1e-7)
    let output = layerNorm(input)
    // Expected shape: [2, 2, 3] (LayerNorm keeps the input shape).
    print(output.getShape())
    return 0
}

输出为：

[2, 2, 3]

MaxPool2d 层

Layer 定义：

// MaxPool2d layer: 2-D max pooling over an (N, C, H, W) input.
public struct MaxPool2d {
    // Sliding-window size (kH, kW).
    public var kernel_sizes_: Array<Int64> = []
    // Sliding-window step along H and W.
    public var strides_: Array<Int64> = []
    // Padding mode; the documented values are "VALID" and "SAME".
    public var pad_mod_: String = ""

    // Defaults: 1x1 window, stride 1 in both directions, "VALID" padding.
    public init(kernel_size!: Array<Int64> = Array<Int64>([1, 1]), stride!: Array<Int64> = Array<Int64>([1, 1]), pad_mod!: String = "VALID")
}

// Makes MaxPool2d callable as a unary cell.
extend MaxPool2d <: CellUnary {
    // input: Float32 Tensor of shape (N, C, H, W).
    // Returns a Float32 Tensor of shape (N, C, H_out, W_out).
    public operator func ()(input: Tensor): Tensor
}

MaxPool2d 是最大值池化层，按照如下公式进行计算。

$$ output(N_i, C_j, h, w) = \max_{m=0,\ldots,kH-1} \max_{n=0,\ldots,kW-1} input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) $$

其中 input size 是 (N, C, H, W)，output size 是 (N, C, $H_{out}$, $W_{out}$)，kernel size 是 (kH, kW)。

参数列表：

名称	含义
kernel_size	Array<Int64>，滑窗大小。长度为 2 的数组，值必须小于输入 shape 的宽和高并且大于 0。默认值：Array<Int64>([1, 1])
stride	Array<Int64>，滑窗跳步。长度为 2 的数组，值必须大于 0。默认值：Array<Int64>([1, 1])
pad_mod	String，padding 模式，可选值："VALID"，"SAME"。默认值："VALID"。

输入：

名称	含义
input	shape 是 (N, C, H, W) 的 Tensor，dtype 只支持 Float32 类型.

输出：

名称	含义
output	shape 是 (N, C, $H_{out}$, $W_{out}$)，dtype 为 Float32 类型的 Tensor

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

// Example: run MaxPool2d with its default arguments on a [1, 2, 4, 4] Tensor.
main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([1, 2, 4, 4]))
    // Defaults are a 1x1 kernel, stride 1 and "VALID" padding.
    var maxPool = MaxPool2d()
    let output = maxPool(input)
    // Expected shape: [1, 2, 4, 4] (unchanged with the default 1x1 window).
    print(output.getShape())
    return 0
}

输出为：

[1, 2, 4, 4]

Relu 层

Layer 定义：

// Relu layer: has no parameters and no state.
public struct Relu {}

// Makes Relu callable as a unary cell computing output = max(0, input).
extend Relu <: CellUnary {
    public operator func ()(input: Tensor): Tensor
}

Relu 是 ReLU（Rectified Linear Unit）激活函数层，按照如下公式逐元素进行计算。

output=max(0,input)

输入：

名称	含义
input	输入 Tensor。

输出：

名称	含义
output	输出 Tensor，与输入 Tensor 具有相同的 shape。

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

// Example: apply Relu to a random [32, 6, 28, 28] Tensor and print the output shape.
main(): Int64 {
    let input = randomNormalTensor(Array<Int64>([32, 6, 28, 28]))
    let relu = Relu()
    let output = relu(input)
    // Expected shape: [32, 6, 28, 28] (same as the input).
    print(output.getShape())
    return 0
}

输出为：

[32, 6, 28, 28]

Softmax 层

Layer 定义：

// Softmax layer.
public struct Softmax {
    // Axes along which softmax is computed.
    public var axis_: Array<Int64>
    // Default axis is [-1], i.e. the last dimension.
    public init(axis!: Array<Int64> = Array<Int64>([-1]))
}

// Makes Softmax callable as a unary cell.
extend Softmax <: CellUnary {
    // input: Float32 Tensor. Returns a Tensor with the same shape and dtype as input.
    public operator func ()(input: Tensor): Tensor
}

Softmax 网络层，按照如下公式进行计算。

$$ softmax(x_i) = \frac{\exp(x_i)}{\sum_{j=0}^{n-1} \exp(x_j)} $$

参数列表：

名称	含义
axis	需要计算 softmax 的维度，其元素取值应在[-input.getShapeDims(), input.getShapeDims())之间，默认值为 -1，即计算最后一维。

输入：

名称	含义
input	输入 Tensor，dtype 是 Float32。

输出：

名称	含义
output	softmax 之后的结果 Tensor。与输入 Tensor 具有相同的 shape 和 dtype。

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import nn.layers.*
from CangjieTB import ops.*

// Example: apply Softmax over the last axis of a 2x3 Float32 Tensor.
main(): Int64 {
    let input = Tensor(Array<Float32>([3.41188848e-01,  2.06148401e-01,  1.33806154e-01, 2.96444386e-01,  4.45214391e-01,  2.77494937e-01]), shape: [2, 3])
    // Default axis is -1, so each row of three values is normalized independently.
    let softmax = Softmax()
    let output = softmax(input)
    // Each printed row sums to 1.
    print(output)
    return 0
}

输出为：

Tensor(shape=[2, 3], dtype=Float32, value=
[[ 3.72246891e-01  3.25224876e-01  3.02528232e-01]
 [ 3.18305582e-01  3.69363725e-01  3.12330663e-01]])

Loss

仓颉 TensorBoost 的 Loss 使用 struct 表示，用户只要导入 nn.loss package (from CangjieTB import nn.loss.*) 即可直接使用仓颉 TensorBoost 已经封装好的 Loss 结构。

BinaryCrossEntropy 层

Loss 定义：

// Binary cross-entropy loss between predictions x and targets y, weighted per element.
public struct BinaryCrossEntropy {
    // Reduction applied to the per-element losses: "none", "mean" or "sum".
    public let reduction: String

    // Stores the reduction mode; defaults to "mean".
    public init(reduction!: String = "mean") {
        this.reduction = reduction
    }

    // x: Float32 logits Tensor of shape (N, *); y and weight must match x in dtype and shape.
    // Returns a Tensor shaped like x when reduction is "none", otherwise a
    // one-element Tensor.
    public operator func ()(x: Tensor, y: Tensor, weight: Tensor): Tensor
}

使用的算子：

// Forward operator used by the BinaryCrossEntropy loss.
public func binaryCrossEntropy(x: Tensor, y: Tensor, weight: Tensor, reduction!: String = "mean"): Tensor

// NOTE(review): no return type or description is given here; presumably the
// adjoint (differentiation) counterpart of binaryCrossEntropy. Confirm.
public func adjointBinaryCrossEntropy(x: Tensor, y: Tensor, weight: Tensor, reduction!: String = "mean")

// NOTE(review): presumably computes the gradient with respect to x from the
// upstream gradient yGrad. Confirm against the operator documentation.
public func binaryCrossEntropyGrad(x: Tensor, yGrad: Tensor, y: Tensor, weight: Tensor, reduction!: String = "mean"): Tensor

计算 logits 和 labels 之间的二值交叉熵。令 logits 为 x，labels 为 y，weight 为 w，output 为 $\ell(x, y)$，则：

$$ L = \{l_1, \ldots, l_N\}^{\top}, \quad l_n = -w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log(1 - x_n) \right] $$

其中 $L$ 表示一个 batch 中所有样本的损失，$l_n$ 表示第 n 个样本的损失，n 的取值范围为 1 到 N，N 为 batch size。

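$$ \ell(x, y) = \begin{cases} L, & \text{if reduction} = \text{'none'}; \\ \operatorname{mean}(L), & \text{if reduction} = \text{'mean'}; \\ \operatorname{sum}(L), & \text{if reduction} = \text{'sum'}. \end{cases} $$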

参数列表：

名称	含义
reduction	指定缩减类型，类型为 String，它的值必须是 "none"、"mean"、"sum" 之一。默认值："mean"。

输入：

名称	含义
logits	输入 Tensor，dtype 为 Float32 类型，shape 是 (N, *)。
labels	标签 Tensor，dtype 与 shape 需要与 logits 一致。
weight	权重 Tensor，应用于每个 batch 元素损失的重新调整权重， dtype 与 shape 需要与 logits 一致。

输出：

名称	含义
output	输出 Tensor，如果 reduction 为 “none”，则输出是 shape 与 logits 一致的 Tensor，否则，输出为含有一个元素的 scalarTensor。

支持平台：Ascend、GPU

代码示例：

from CangjieTB import nn.*
from CangjieTB import common.*
from CangjieTB import ops.*
from CangjieTB import nn.loss.*

// Example: weighted binary cross-entropy over three elements with the default
// "mean" reduction.
main(): Int64
{
    let logits = parameter(Tensor(Array<Float32>([0.2000000030, 0.6999999881, 0.1000000015]), shape: Array<Int64>([3])), "logits")
    let labels = parameter(Tensor(Array<Float32>([0.0000000000, 1.0000000000, 0.0000000000]), shape: Array<Int64>([3])), "labels")
    let weight = parameter(Tensor(Array<Float32>([1.0000000000, 2.0000000000, 2.0000000000]), shape: Array<Int64>([3])), "weight")
    let loss = BinaryCrossEntropy()
    let output = loss(logits, labels, weight)
    // Mean of the weighted terms -log(0.8), -2*log(0.7), -2*log(0.9), about 0.3824.
    print(output)
    return 0
}

输出为：

Tensor(shape=[1], dtype=Float32, value= [ 3.82404834e-01])

BCEWithLogitsLoss 层

Loss 定义：

// Binary cross-entropy between sigmoid(logits) and labels.
public struct BCEWithLogitsLoss {
    // Reduction applied to the per-element losses: "none", "mean" or "sum".
    public var reduction: String
    // Per-batch weight; Float32, broadcastable to the shape of logits.
    public var weight: Tensor
    // Weight of positive examples; Float32, broadcastable to the shape of logits.
    public var posWeight: Tensor

    // weight and posWeight are required; reduction defaults to "mean".
    public init(weight: Tensor, posWeight: Tensor, reduction!: String  = "mean")
    // logits: Float32 Tensor of shape (N, *); labels must match logits in dtype and shape.
    // Returns a Tensor shaped like logits when reduction is "none", otherwise a
    // one-element Tensor.
    public operator func ()(logits: Tensor, labels: Tensor): Tensor
}

使用的算子：

// Forward operator used by the BCEWithLogitsLoss loss; weight and posWeight are
// optional here and default to an empty Tensor().
public func bceWithLogitsLoss(logits: Tensor, labels: Tensor, weight!: Tensor = Tensor(), posWeight!: Tensor = Tensor(), reduction!: String = "mean"): Tensor

// NOTE(review): no return type or description is given here; presumably the
// adjoint (differentiation) counterpart of bceWithLogitsLoss. Confirm.
public func adjointBceWithLogitsLoss(logits: Tensor, labels: Tensor, weight!: Tensor = Tensor(), posWeight!: Tensor = Tensor(), reduction!: String = "mean")

计算 logits 经过 sigmoid 激活函数处理后与 label 之间的二值交叉熵。

令 logits 为 X，label 为 Y, weight 为 W，output 为 L。计算公式如下：

$$ p_{ij} = sigmoid(X_{ij}) = \frac{1}{1 + e^{-X_{ij}}} $$

$$ L_{ij} = -\left[ Y_{ij} \cdot \log(p_{ij}) + (1 - Y_{ij}) \cdot \log(1 - p_{ij}) \right] $$

i 表示第 i 个 sample，j 表示分类，则

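$$ \ell(x, y) = \begin{cases} L, & \text{if reduction} = \text{'none'}; \\ \operatorname{mean}(L), & \text{if reduction} = \text{'mean'}; \\ \operatorname{sum}(L), & \text{if reduction} = \text{'sum'}. \end{cases} $$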

参数列表：

名称	含义
weight	每个 batch 的权重。shape 可以被广播到与 logits 一致。 dtype 为 float32。
posWeight	正例的权重。shape 可以广播到与 logits 一致。 dtype 为 float32。
reduction	指定缩减类型，类型为 String，它的值必须是 "none"、"mean"、"sum" 之一。默认值："mean"。

输入：

名称	含义
logits	输入 Tensor，dtype 为 Float32 类型，shape 是 (N, *)。* 号表示可以追加任意的维度。
labels	标签 Tensor，dtype 与 shape 需要与 logits 一致。

输出：

名称	含义
output	Tensor 或标量，如果 reduction 为 “none”，则输出是 Tensor 并且 shape 与 logits 一致, 否则，输出为含有一个元素的 Tensor。

支持平台：Ascend、GPU

代码示例：

from CangjieTB import nn.*
from CangjieTB import ops.*
from CangjieTB import common.*
from CangjieTB import nn.loss.*

// Example: BCEWithLogitsLoss on a 2x3 batch with unit weights and the default
// "mean" reduction.
main(): Int64 {
    let logits = Tensor(Array<Float32>([-0.8, 1.2, 0.7, -0.1, -0.4, 0.7]), shape: Array<Int64>([2, 3]))
    let labels = Tensor(Array<Float32>([0.3, 0.8, 1.2, -0.6, 0.1, 2.2]), shape: Array<Int64>([2, 3]))
    // weight and posWeight have shape [3] and are broadcast across the batch dimension.
    let weight = Tensor(Array<Float32>([1.0, 1.0, 1.0]), shape: Array<Int64>([3]))
    let posWeight = Tensor(Array<Float32>([1.0, 1.0, 1.0]), shape: Array<Int64>([3]))
    let loss = BCEWithLogitsLoss(weight, posWeight)
    let output = loss(logits, labels)
    // Prints a one-element Tensor holding the mean loss (about 0.3464).
    print(output)
    return 0
}

输出为：

Tensor(shape=[1], dtype=Float32, value= [ 3.46361160e-01])

L2Loss 层

Loss 定义：

// L2 loss: sum(x ** 2) / 2, computed without a square root.
public struct L2Loss {
    // No configuration is needed.
    public init() {}

    // x: Float32 Tensor. Returns a scalar Tensor with the same dtype as x.
    public operator func ()(x: Tensor): Tensor
}

使用的算子：

// Forward operator used by the L2Loss loss.
public func l2Loss(x: Tensor): Tensor

// NOTE(review): no return type or description is given here; presumably the
// adjoint (differentiation) counterpart of l2Loss. Confirm.
public func adjointL2Loss(x: Tensor)

计算输入张量 L2 范数平方的一半（即不进行 sqrt 运算），计算公式如下：

$$ loss = \frac{sum(x^2)}{2} $$

输入：

名称	含义
x	输入 Tensor, 支持 Float32 类型。

输出：

名称	含义
output	输出标量 Tensor，与 x 数据类型一致。

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import nn.*
from CangjieTB import ops.*
from CangjieTB import common.*
from CangjieTB import nn.loss.*

// Example: L2Loss of [1, 2, 3].
main(): Int64 {
    let x = parameter(Tensor(Array<Float32>([1.0, 2.0, 3.0]), shape: Array<Int64>([1, 3])), "x")
    let loss = L2Loss()
    let output = loss(x)
    // (1 + 4 + 9) / 2 = 7.
    print(output)
    return 0
}

输出为：

Tensor(shape=[], dtype=Float32, value= 7.00000000e+00)

NLLLoss 层

Loss 定义：

// Negative log-likelihood loss with per-class weights.
public struct NLLLoss {
    // Reduction applied to the per-sample losses: "none", "mean" or "sum".
    public let reduction: String

    // reduction defaults to "mean".
    public init(reduction!: String = "mean")

    // x: Float32 Tensor of shape (N, C); target: Int32 Tensor of shape (N);
    // weight: Float32 Tensor of shape (C).
    // Returns (loss, total_weight): loss has shape (N) when reduction is "none",
    // otherwise it is scalar; total_weight is a scalar with the dtype of weight.
    public operator func ()(x: Tensor, target: Tensor, weight: Tensor): (Tensor, Tensor)
}

使用的算子：

// Forward operator used by the NLLLoss loss; returns (loss, total_weight).
public func nllLoss(x: Tensor, target: Tensor, weight: Tensor, reduction!: String = "mean"): (Tensor, Tensor)

// NOTE(review): no return type or description is given here; presumably the
// adjoint (differentiation) counterpart of nllLoss. Confirm.
public func adjointNllLoss(x: Tensor, target: Tensor, weight: Tensor, reduction!: String = "mean")

// NOTE(review): presumably computes the gradient with respect to x from yGrad and
// the forward totalWeight; no return type is listed. Confirm.
public func nllLossGrad(x: Tensor, yGrad: Tensor, target: Tensor, weight: Tensor, totalWeight: Tensor, reduction!: String = "mean")

计算 labels 和 logits 的负对数似然损失，通常和 softmax、log 一起使用。

Tensor x(input) 是 shape 为 (N, C) 的张量，目标 Tensor t(target) 是 shape 为 (N) 的张量，其中 N 为 batch size，C 为类别数量。

对于每个 $N_i$ loss 的计算公式如下:

$$ l_n = -w_{t_n} x_{n, t_n}, \quad w_c = weight[c] \cdot 1 $$

nllloss 的计算方式有三种：none、mean、sum，默认方式是 mean，计算公式如下：

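$$ \ell(x, t) = \begin{cases} L = \{l_1, \ldots, l_N\}^{\top}, & \text{if reduction} = \text{'none'}; \\ \sum_{n=1}^{N} \frac{1}{\sum_{n=1}^{N} w_{t_n}} l_n, & \text{if reduction} = \text{'mean'}; \\ \sum_{n=1}^{N} l_n, & \text{if reduction} = \text{'sum'}. \end{cases} $$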

注意：目标分类是相互排斥的，即目标中只有一个分类是正的，但预测的概率不需要是排斥的。只要求预测的概率分布是有效的，概率分布和为 1。

参数列表：

参数名称	含义
reduction	指定输出使用的 reduction 方式，包括 none、mean、sum。默认值：mean

输入：

输入名称	含义
input	输入 Tensor，shape (N, C), dtype 是 FLOAT32
target	目标标签 Tensor，shape 为(N), dtype 是 INT32
weight	类别加权 Tensor，shape 为(C), dtype 是 FLOAT32

输出：

名称	含义
loss	loss Tensor，reduction 为 none 时，shape 为(N),否则为标量 Tensor，dtype 与 input 相同
total_weight	标量 Tensor，dtype 与 weight 相同

支持平台：Ascend、GPU

代码示例：

from CangjieTB import nn.loss.NLLLoss
from CangjieTB import ops.*
from CangjieTB import common.*

// Example: NLLLoss with "mean" reduction on 3 samples and 2 classes.
main(): Int64 {
    let input = parameter(Tensor(Array<Float32>([0.5488135219, 0.7151893377, 0.6027633548, 0.5448831916, 0.4236547947, 0.6458941102]), shape: Array<Int64>([3, 2])), "input")
    // Every sample targets class 0, so only weight[0] and column 0 of input contribute.
    let target = parameter(Tensor(Array<Int32>([0, 0, 0]), shape: Array<Int64>([3])), "target")
    let weight = parameter(Tensor(Array<Float32>([0.3834415078, 0.7917250395]), shape: Array<Int64>([2])), "weight")
    let nllloss = NLLLoss(reduction: "mean")
    let (loss, totalWeight) = nllloss(input, target, weight)
    // loss is the weighted mean of -input[n][0], about -0.5251.
    print(loss)
    // totalWeight is 3 * weight[0], about 1.1503.
    print(totalWeight)
    return 0
}

输出为：

Tensor(shape=[1], dtype=Float32, value= [-5.25077164e-01])
Tensor(shape=[1], dtype=Float32, value= [1.15032458e+00])

SmoothL1Loss 层

Loss 定义：

// Smooth L1 loss: quadratic where |logits - label| < beta, linear otherwise.
public struct SmoothL1Loss {
    // Point at which the loss switches from quadratic to linear.
    public let beta: Float32
    // How the per-element results are combined: "none", "mean" or "sum".
    public let reduction: String

    // Defaults: beta = 1.0, reduction = "none". Ascend supports only "none".
    public init(beta!: Float32 = 1.0, reduction!: String = "none")

    // logits: Float32 Tensor of shape (N, *); label must match logits in shape and dtype.
    public operator func ()(logits: Tensor, label: Tensor): Tensor
}

使用的算子：

// Forward operator used by the SmoothL1Loss loss.
public func smoothL1Loss(logits: Tensor, label: Tensor, beta!: Float32 = 1.0, reduction!: String = "none"): Tensor

// NOTE(review): no return type or description is given here; presumably the
// adjoint (differentiation) counterpart of smoothL1Loss. Confirm.
public func adjointSmoothL1Loss(logits: Tensor, label: Tensor, beta!: Float32 = 1.0, reduction!: String = "none")

计算 SmoothL1Loss 函数。在设置 beta 值之后，当 |x_i - y_i| < beta 时，损失为一个二次函数；否则为一个线性函数，从而缓解梯度爆炸。

SmoothL1Loss 可以看作是 L1Loss 的修改版，或者是 L1Loss 和 L2Loss 的组合。 L1Loss 计算两个输入张量之间的元素绝对差，而 L2Loss 计算两个输入张量之间的平方差。 L2Loss 通常会导致更快的收敛，但它对异常值的鲁棒性较差。

给定两个长度为 N 的输入 x、y，SmoothL1Loss 的描述如下：

$$ L_i = \begin{cases} \frac{0.5 (x_i - y_i)^2}{beta}, & \text{if } |x_i - y_i| < beta \\ |x_i - y_i| - 0.5 \cdot beta, & \text{otherwise.} \end{cases} $$

输出结果的计算方式有三种：none、mean、sum，默认方式是 none，计算公式如下：

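$$ L(x, y) = \begin{cases} \{L_1, \ldots, L_N\}^{\top}, & \text{if reduction} = \text{'none'}; \\ \frac{1}{N} \sum_{n=1}^{N} L_n, & \text{if reduction} = \text{'mean'}; \\ \sum_{n=1}^{N} L_n, & \text{if reduction} = \text{'sum'}. \end{cases} $$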

参数列表：

名称	含义
beta	用于控制分段函数从二次函数变为线性函数的点。默认值: 1.0, 类型: Float32。
reduction	指定应用于输出结果的计算方式，比如“none”、“mean”，“sum”，默认值：“none”。对于 Ascend 平台仅支持“none”

输入：

名称	含义
logits	输入 Tensor，shape (N,∗)，类型是 FLOAT32 的 Tensor
label	标签 Tensor，shape (N,∗)，shape 和 dtype 与 logits 一致

输出：

名称	含义
output	损失 Tensor，shape 和 dtype 与 logits 一致

支持平台：Ascend、GPU

代码示例：

from CangjieTB import nn.*
from CangjieTB import ops.*
from CangjieTB import common.*
from CangjieTB import nn.loss.*

// Example: SmoothL1Loss with the defaults (beta = 1.0, reduction = "none").
main(): Int64
{
    let logits = parameter(Tensor(Array<Float32>([1.0, 2.0, 3.0]), shape: Array<Int64>([3])), "x")
    let label = parameter(Tensor(Array<Float32>([1.0, 2.0, 2.0]), shape: Array<Int64>([3])), "y")
    let loss =SmoothL1Loss()
    let output = loss(logits,label)
    // Per-element losses are [0, 0, 0.5]; only the last element differs (by 1),
    // giving |1| - 0.5 * beta = 0.5.
    print(output)
    return 0
}

输出为：

Tensor(shape=[3], dtype=Float32, value=
[0.00000000e+00 0.00000000e+00 5.00000000e-01])

SoftmaxCrossEntropyWithLogits 层

Loss 定义：

// Softmax cross-entropy loss between logits and labels.
public struct SoftmaxCrossEntropyWithLogits <: CellBinary {
    // Whether labels use the sparse format; must be false on Ascend.
    public let sparse: Bool
    // Label value ignored when computing the loss.
    // NOTE(review): the init takes an Option<Int32> while this field is a plain
    // Int32; presumably None is stored as -1. Confirm.
    public var ignoreIndex: Int32 = -1

    // Defaults: sparse = false, ignoreIndex = None (all labels are used).
    // A non-None ignoreIndex requires sparse to be true.
    public init(sparse!: Bool = false, ignoreIndex!: Option<Int32> = None<Int32>)

    // logits: Float32 Tensor of shape (N, C). label: Int32 when sparse is true,
    // otherwise the same dtype and shape as logits.
    // Returns a Tensor with an empty shape holding the loss, with the dtype of logits.
    public operator func ()(logits: Tensor, label: Tensor): Tensor
}

使用的算子：

// Dense-label operator used by SoftmaxCrossEntropyWithLogits; returns two Tensors.
// NOTE(review): the meaning of the second Tensor is not described here; confirm.
public func softmaxCrossEntropyWithLogits(logits: Tensor, label: Tensor): (Tensor, Tensor)

// NOTE(review): no return type or description is given here; presumably the
// adjoint (differentiation) counterpart of softmaxCrossEntropyWithLogits. Confirm.
public func adjointSoftmaxCrossEntropyWithLogits(logits: Tensor, label: Tensor)

// Sparse-label operator used by SoftmaxCrossEntropyWithLogits.
// NOTE(review): the effect of isGrad is not described here; confirm.
public func sparseSoftmaxCrossEntropyWithLogits(logits: Tensor, label: Tensor, isGrad!: Bool = false): Tensor

// NOTE(review): no return type or description is given here; presumably the
// adjoint counterpart of sparseSoftmaxCrossEntropyWithLogits. Confirm.
public func adjointSparseSoftmaxCrossEntropyWithLogits(logits: Tensor, label: Tensor, isGrad!: Bool = false)

计算 softmax 交叉熵损失函数。

使用交叉熵损失测量输入（使用 softmax 函数计算）的概率和目标之间的分布误差，其中分类是互斥的（只有一个分类是正的）。

此函数的典型输入是每个分类的非标准化分数和目标。分数 Tensor x 是 shape 为 (N, C) 的张量，目标 Tensor t 是 shape 为 (N, C) 的张量，其中包含长度 C 的 one-hot 标签。

对于每个 $N_i$ loss 的计算公式如下:

$$ \ell(x_i, t_i) = -\log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right) = -x_{t_i} + \log\left(\sum_j \exp(x_j)\right) $$

$x_i$ 是一维的张量， $t_i$ 是标量.

注意：目标分类是相互排斥的，即目标中只有一个分类是正的，但预测的概率不需要是排斥的。只要求预测的概率分布是有效的，概率分布和为 1。

参数列表：

参数名称	含义
sparse	指定标签是否使用稀疏格式, Ascend 平台必须为 false。默认值：false
ignoreIndex	计算损失时忽略值为 ignoreIndex 的标签，默认为 None，即对所有标签值进行计算。不为 None 时，sparse 必须为 true，loss 值的计算与梯度传递只会考虑未忽略的维度。

输入：

输入名称	含义
logits	输入 Tensor，shape (N, C), Tensor 类型是 FLOAT32
label	标签 Tensor。如果 sparse 为 true，则 label 的 shape 为 (N)，dtype 为 INT32；如果 sparse 为 false，则 label 的 dtype 和 shape 必须与 logits 相同。

输出：

名称	含义
output	返回 shape 为空的 tensor，value 为损失值，dtype 与 logits 相同。

支持平台：Ascend、GPU、CPU

代码示例：

from CangjieTB import nn.loss.*
from CangjieTB import ops.*
from CangjieTB import common.*

// Example: sparse softmax cross-entropy on 10 samples.
main(): Int64 {
    // NOTE(review): logits has shape (10, 1), i.e. a single class whose only valid
    // index is 0, while every label below is 1. That looks out of range, and the
    // fixed printed value despite random logits is suspicious. Confirm the example.
    let logits = randomNormalTensor(Array<Int64>([10, 1]))
    let label = onesTensor(Array<Int64>([10]), dtype: INT32)
    let loss = SoftmaxCrossEntropyWithLogits(sparse: true)
    let output = loss(logits, label)
    print(output)
    return 0
}

输出为：

Tensor(shape=[], dtype=Float32, value= 1.38155103e+00)

Rate this post

阅读量: 159