欢迎来到尧图网

客户服务 关于我们

您的位置:首页 > 财经 > 金融 > 一步一步用numpy实现神经网络各种层

一步一步用numpy实现神经网络各种层

2024/10/24 6:30:11 来源:https://blog.csdn.net/u012897374/article/details/139655924  浏览:    关键词:一步一步用numpy实现神经网络各种层

1. 首先准备一下数据

if __name__ == "__main__":
    # Toy binary-classification set: each row is (feature_1, feature_2, label).
    data = np.array([
        [2, 1, 0],
        [2, 2, 0],
        [5, 4, 1],
        [4, 5, 1],
        [2, 3, 0],
        [3, 2, 0],
        [6, 5, 1],
        [4, 1, 0],
        [6, 3, 1],
        [7, 4, 1],
    ])
    x = data[:, :-1]  # every column but the last: features, shape (10, 2)
    y = data[:, -1]   # last column: integer class labels, shape (10,)
    for epoch in range(1000):
        ...  # placeholder: one training step per epoch goes here

2. 实现Softmax+CrossEntropy层

单独求softmax层有点麻烦, 将softmax+entropy一起求导更方便。

假设对于输入向量 $(x_1, x_2, x_3)$, 则对应的Loss为:

$$
\begin{align*}
L &= -\sum_{i=1}^{C} y_i \ln p_i \\
  &= -(y_1\ln p_1 + y_2\ln p_2 + y_3\ln p_3)
\end{align*}
$$

其中 $y_i$ 为ground truth, 为one-hot vector. $p_i$ 为输出概率。

$$
p_1=\frac{e^{x_1}}{e^{x_1}+e^{x_2}+e^{x_3}}\\
p_2=\frac{e^{x_2}}{e^{x_1}+e^{x_2}+e^{x_3}}\\
p_3=\frac{e^{x_3}}{e^{x_1}+e^{x_2}+e^{x_3}}
$$
则偏导为
$$
\begin{align*}
\frac{\partial L}{\partial x_1}
&= -y_1\frac{1}{p_1}*\frac{\partial p_1}{\partial x_1} - y_2\frac{1}{p_2}*\frac{\partial p_2}{\partial x_1} - y_3\frac{1}{p_3}*\frac{\partial p_3}{\partial x_1} \\
&= -y_1\frac{1}{p_1} * \frac{e^{x_1} * (e^{x_1}+e^{x_2}+e^{x_3})-e^{x_1}*e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} \\
&\quad\quad -y_2\frac{1}{p_2}*\frac{-e^{x_2}*e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} \\
&\quad\quad -y_3\frac{1}{p_3}*\frac{-e^{x_3}*e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} \\
&= -y_1\frac{1}{p_1}(p_1*p_2+p_1*p_3) \\
&\quad\quad -y_2\frac{1}{p_2}(-p_1*p_2) \\
&\quad\quad -y_3\frac{1}{p_3}(-p_1*p_3) \\
&= -y_1(p_2+p_3)+y_2*p_1+y_3*p_1 \\
&= -y_1(1-p_1)+y_2*p_1+y_3*p_1 \\
&= y_1(p_1-1)+y_2*p_1+y_3*p_1
\end{align*}
$$

同理:
$$
\frac{\partial L}{\partial x_2}=y_1*p_2+y_2*(p_2-1)+y_3*p_2\\
\frac{\partial L}{\partial x_3}=y_1*p_3+y_2*p_3+y_3*(p_3-1)
$$

当 $y_1=1$ 时, 对应的导数为 $(p_1-1, p_2, p_3)$. 当 $y_2=1$ 时, 对应的导数为: $(p_1, p_2-1, p_3)$.

例如求得概率为 $(0.2, 0.3, 0.5)$, label为 $(0, 0, 1)$, 则导数为 $(0.2, 0.3, -0.5)$

python代码为:

注意求softmax时需要np.exp(x-np.max(x, axis=1, keepdims=True))防止指数运算溢出。

class Softmax:
    """Fused softmax + cross-entropy layer.

    Computing both together gives the simple gradient
    ``dL/dx = (prob - one_hot(y)) / batch_size``.
    """

    def __init__(self, n_classes):
        """Store the number of classes; no gradient is cached yet."""
        self.n_classes = n_classes
        self.grad = None

    def forward(self, x, y):
        """Return ``(prob, loss)`` and cache the gradient w.r.t. ``x``.

        Args:
            x: logits, indexed as (batch, class).
            y: integer class index per sample, length ``batch``.

        Returns:
            prob: row-wise softmax of ``x``.
            loss: mean negative log-likelihood of the true classes.
        """
        # Subtract the row maximum so np.exp cannot overflow; softmax is
        # invariant to a per-row shift.
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)

        rows = np.arange(len(y))
        # Pick the probability assigned to the true class of every sample.
        loss = -np.sum(np.log(prob[rows, y])) / len(y)

        self.grad = prob.copy()
        self.grad[rows, y] -= 1
        # Downstream layers reduce over the batch with np.sum (not np.mean),
        # so the 1/batch factor of the mean loss is applied once, here.
        self.grad /= len(y)
        return prob, loss

    def backward(self):
        """Return the gradient cached by the last ``forward`` call."""
        return self.grad

3. 单独的CrossEntropy

python代码为:

class Entropy:
    """Stand-alone cross-entropy loss that takes probabilities as input."""

    def __init__(self, n_classes):
        """Store the number of classes used to build one-hot targets."""
        self.n_classes = n_classes
        self.grad = None

    def forward(self, x, y, eps=1e-12):
        """Compute the batch-mean cross-entropy and cache its gradient.

        Args:
            x: probabilities, shape (b, c).
            y: integer class indices, shape (b,).
            eps: lower bound applied to ``x`` before ``log`` and division,
                so a probability of exactly 0 cannot produce NaN/inf.

        Returns:
            Array of shape (c,): the batch-mean loss contribution of each
            class. Its sum is the scalar mean cross-entropy.
        """
        b = y.shape[0]
        one_hot_y = np.zeros((b, self.n_classes))
        one_hot_y[np.arange(b), y] = 1

        # Without clipping, 0 * log(0) and -0 / 0 evaluate to NaN.
        safe_x = np.clip(x, eps, None)

        # The loss is a mean over the batch, so its gradient carries the
        # same 1/b factor (downstream layers sum over the batch).
        self.grad = -one_hot_y / safe_x / b
        return np.mean(-one_hot_y * np.log(safe_x), axis=0)

    def backward(self):
        """Return dL/dx cached by the last ``forward`` call, shape (b, c)."""
        return self.grad

2. 单独的Softmax层

from einops import repeat, rearrange, einsum
class Softmax:
    """Stand-alone softmax layer with an explicit Jacobian backward pass."""

    def __init__(self):
        """Initialise the caches filled by ``forward`` and ``backward``."""
        self.output = None
        self.grad = None

    def forward(self, x):
        """Return the row-wise softmax of ``x`` (shape (b, c)) and cache it."""
        # Shift by the row maximum to keep np.exp from overflowing; the
        # result is unchanged because softmax is shift-invariant per row.
        x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.output = x_exp / np.sum(x_exp, axis=1, keepdims=True)
        return self.output

    def backward(self, prev_grad):
        """Propagate ``prev_grad`` (dL/d_output, shape (b, c)) to dL/dx.

        The per-sample Jacobian ``J[i, j] = s_i * (delta_ij - s_j)`` is
        stored in ``self.grad`` with shape (b, c, c).
        """
        s = self.output
        identity = np.eye(s.shape[1])
        # s[:, :, None] broadcasts s_i along j; s[:, None, :] broadcasts s_j
        # along i.
        self.grad = s[:, :, None] * (identity[None, :, :] - s[:, None, :])
        # Chain rule: dL/dx_j = sum_i dL/ds_i * J[i, j].
        return np.einsum('bi,bij->bj', prev_grad, self.grad)

3. Linear层

注意更新 $w$ 时用的是 $d_w$, 但是往上一层传递的是 $d_x$。因为上一层需要 $dL/d_{out}$, 而本层的输入 $x$ 即是上一层的输出, 所以 $dL/d_{out} = dL/dx$。

class Linear:
    """Fully connected layer ``y = x @ w + b`` with a built-in SGD step."""

    def __init__(self, in_channels, out_channels, lr):
        """Draw ``w`` and ``b`` uniformly from [0, 1) and store the step size."""
        self.lr = lr
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)

    def forward(self, x):
        """Return ``x @ w + b`` for ``x`` of shape (b, in_channels).

        The input is cached because ``backward`` needs it to compute dL/dw.
        """
        self.x = x
        return x @ self.w + self.b

    def backward(self, grad):
        """Apply one SGD update and return the gradient w.r.t. the input.

        Args:
            grad: dL/d_output, shape (b, out_channels).

        Returns:
            dL/dx, shape (b, in_channels).
        """
        # dx must use the weights from the forward pass, so compute it
        # before w is updated.
        dx = grad @ self.w.T
        dw = self.x.T @ grad
        db = np.sum(grad, axis=0)

        self.w -= self.lr * dw
        self.b -= self.lr * db

        # The previous layer needs dL/d_out; this layer's input x is that
        # layer's output, so dL/d_out == dL/dx.
        return dx

5. 完整训练代码

from einops import *
import numpy as np


class Softmax:
    """Fused softmax + cross-entropy layer with a train/inference switch."""

    def __init__(self, train=True):
        """Start with no cached gradient; ``train`` selects the forward mode."""
        self.grad = None
        self.train = train

    def forward(self, x, y):
        """Return ``(prob, loss)`` when training, otherwise only ``prob``.

        Args:
            x: logits, indexed as (batch, class).
            y: integer class index per sample; only read when ``self.train``.
        """
        # Shift by the row maximum so np.exp cannot overflow.
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)
        if not self.train:
            return prob

        rows = np.arange(len(y))
        loss = -np.sum(np.log(prob[rows, y])) / len(y)
        # Gradient of the mean loss w.r.t. the logits:
        # (prob - one_hot(y)) / batch_size.
        self.grad = prob.copy()
        self.grad[rows, y] -= 1
        self.grad /= len(y)
        return prob, loss

    def backward(self):
        """Return the gradient cached by the last training ``forward``."""
        return self.grad


class Linear:
    """Fully connected layer ``y = x @ w + b`` with a built-in SGD step."""

    def __init__(self, in_channels, out_channels, lr):
        """Draw ``w`` and ``b`` uniformly from [0, 1) and store the step size."""
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)
        self.lr = lr

    def forward(self, x):
        """Cache ``x`` (shape (b, in_channels)) and return ``x @ w + b``."""
        self.x = x
        return x @ self.w + self.b

    def backward(self, prev_grad):
        """Apply one SGD update and return dL/dx.

        Args:
            prev_grad: dL/d_output, shape (b, out_channels).

        Returns:
            dL/dx, shape (b, in_channels). This is what an upstream layer
            needs, not the weight gradient.
        """
        # Use the pre-update weights for the input gradient.
        dx = prev_grad @ self.w.T
        dw = self.x.T @ prev_grad
        self.w -= self.lr * dw
        self.b -= self.lr * np.sum(prev_grad, axis=0)
        return dx


class Network:
    """One ``Linear`` layer followed by the fused softmax/cross-entropy."""

    def __init__(self, in_channels, out_channels, n_classes, lr):
        """Build the two layers. ``n_classes`` is accepted but not used."""
        self.lr = lr
        self.linear = Linear(in_channels, out_channels, lr)
        self.softmax = Softmax()

    def forward(self, x, y=None):
        """Run both layers; the return value follows ``Softmax.forward``."""
        out = self.linear.forward(x)
        return self.softmax.forward(out, y)

    def backward(self):
        """Back-propagate through both layers, updating the linear weights."""
        grad = self.softmax.backward()
        return self.linear.backward(grad)


if __name__ == "__main__":
    # Toy binary-classification set: each row is (feature_1, feature_2, label).
    data = np.array([
        [2, 1, 0],
        [2, 2, 0],
        [5, 4, 1],
        [4, 5, 1],
        [2, 3, 0],
        [3, 2, 0],
        [6, 5, 1],
        [4, 1, 0],
        [6, 3, 1],
        [7, 4, 1],
    ])
    x = data[:, :-1]
    y = data[:, -1:].flatten()

    net = Network(2, 2, 2, 0.1)
    for epoch in range(500):
        prob, loss = net.forward(x, y)
        net.backward()
        print(loss)

    # Switch to inference mode: forward now returns probabilities only.
    net.softmax.train = False
    print(net.forward(np.array([[0, 0], [0, 4], [8, 6], [10, 10]])), y)

版权声明:

本网仅为发布的内容提供存储空间,不对发表、转载的内容提供任何形式的保证。凡本网注明“来源:XXX网络”的作品,均转载自其它媒体,著作权归作者所有,商业转载请联系作者获得授权,非商业转载请注明出处。

我们尊重并感谢每一位作者,均已注明文章来源和作者。如因作品内容、版权或其它问题,请及时与我们联系,联系邮箱:809451989@qq.com,投稿邮箱:809451989@qq.com