Code Examples
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import tensorflow as tf
# In[2]:
a = tf.constant(2)
b = tf.constant(3)
c = a + b
d = a * b
# In[5]:
c
# Note: this line does not actually compute c; to evaluate c we must run it inside a Session
# In[4]:
# Think of the graph as a model; the session runs it
sess = tf.Session()
print (sess.run(c))
print (sess.run(d))
# In[9]:
# Placeholders are the data interface: the way data is fed into the graph
a = tf.placeholder(tf.int16)
b = tf.placeholder(tf.int16)
# In[10]:
a
# In[7]:
add = tf.add(a, b)
mul = tf.multiply(a, b)
# Feed values as a dictionary (feed_dict)
print (sess.run(add, feed_dict={a: 2, b: 3}))
print (sess.run(mul, feed_dict={a: 2, b: 3}))
# In[11]:
matrix1 = tf.constant([[3., 3.]])
matrix2 = tf.constant([[2.],[2.]])
# Matrix multiplication: (2x1) times (1x2) yields a 2x2 matrix
product = tf.matmul(matrix2, matrix1)
print (sess.run(product))
# In[12]:
mat1 = tf.Variable(tf.random_normal([3, 2]))
mat2 = tf.Variable(tf.random_normal([2, 3]))
product = tf.matmul(mat1, mat2)
# In[13]:
m1 = [[1, 3], [2, 1], [0, 5]]
m2 = [[3, 2, 1], [1, 2, 3]]
# Print the matrix product; the variables take their values from feed_dict
print(sess.run(product, feed_dict={mat1: m1, mat2: m2}))
# In[14]:
import tensorflow as tf
import numpy
rng = numpy.random
# In[15]:
# Learning rate
learning_rate = 0.02
# Number of training epochs
training_epochs = 3000
display_step = 50
# In[16]:
train_X = numpy.asarray([3.3,4.4,5.5,6.71,6.93,4.168,9.779,6.182,7.59,2.167,
                         7.042,10.791,5.313,7.997,5.654,9.27,3.1])
train_Y = numpy.asarray([1.7,2.76,2.09,3.19,1.694,1.573,3.366,2.596,2.53,1.221,
                         2.827,3.465,1.65,2.904,2.42,2.94,1.3])
n_samples = train_X.shape[0]
# In[17]:
X = tf.placeholder("float")
Y = tf.placeholder("float")
# In[18]:
# Model parameters, randomly initialized
W = tf.Variable(rng.randn(), name="weight")
b = tf.Variable(rng.randn(), name="bias")
# In[19]:
# Construct a linear model (forward pass)
pred = tf.add(tf.multiply(X, W), b)
# In[20]:
# Mean squared error: the loss (drives the backward pass)
cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples)
# Gradient descent optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# In[21]:
# Initializing the variables
init = tf.global_variables_initializer()
# In[22]:
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Fit all training data
    for epoch in range(training_epochs):
        for (x, y) in zip(train_X, train_Y):
            sess.run(optimizer, feed_dict={X: x, Y: y})
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            c = sess.run(cost, feed_dict={X: train_X, Y: train_Y})
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c),
                  "W=", sess.run(W), "b=", sess.run(b))
    training_cost = sess.run(cost, feed_dict={X: train_X, Y: train_Y})
    print("Training cost=", training_cost, "W=", sess.run(W), "b=", sess.run(b), '\n')
    # The testing data
    test_X = numpy.asarray([6.83, 4.668, 8.9, 7.91, 5.7, 8.7, 3.1, 2.1])
    test_Y = numpy.asarray([1.84, 2.273, 3.2, 2.831, 2.92, 3.24, 1.35, 1.03])
    print("Testing...")
    testing_cost = sess.run(
        tf.reduce_sum(tf.pow(pred - Y, 2)) / (2 * test_X.shape[0]),
        feed_dict={X: test_X, Y: test_Y})  # same function as cost above
    print("Testing cost=", testing_cost)
    print("Absolute difference between training and testing cost:",
          abs(training_cost - testing_cost))
Linear Regression
A linear relationship describes the mapping from inputs to outputs.
Optimization method: gradient descent. As an analogy:
Height on the hillside: the loss
Position on the ground: the parameters
Lowest point of the hill: the minimal loss
Location of the lowest point: the target parameters
How to get there: walk downhill, i.e. gradient descent
How to find the direction: differentiate the height with respect to the ground position
Gradient Descent
1. Randomly initialize the parameters
2. Loop:
   run the forward pass to get a prediction;
   compare it with the ground truth to get the loss;
   differentiate the loss with respect to the parameters to get the gradients;
   update the parameters;
   stop when the loss is small enough (see the sketch below).
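A minimal NumPy sketch of this loop, fitting a one-variable linear model; the data, learning rate, and stopping threshold below are illustrative:
import numpy as np
# Toy data for y = 0.3 * x + 0.5 (illustrative values)
X = np.array([1.0, 2.0, 3.0, 4.0])
Y = 0.3 * X + 0.5
W, b = np.random.randn(), np.random.randn()  # 1. random initialization
lr = 0.05
for step in range(500):                      # 2. loop
    pred = W * X + b                         # forward pass
    loss = np.mean((pred - Y) ** 2)          # compare with ground truth
    dW = np.mean(2 * (pred - Y) * X)         # gradient w.r.t. W
    db = np.mean(2 * (pred - Y))             # gradient w.r.t. b
    W -= lr * dW                             # update the parameters
    b -= lr * db
    if loss < 1e-6:                          # stop when the loss is small enough
        break
print(W, b, loss)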
Limitations
Linear models can cleanly separate linearly distributed data, but they cannot fit nonlinear distributions.
From Linear to Nonlinear
Nonlinear activation functions
Criteria for judging an activation: 1. how the forward pass transforms the input
2. how much gradient is lost in the backward pass
1. sigmoid
y(x)=\mathrm{sigmoid}(x)=\frac{1}{1+e^{-x}}
y'(x)=y(x)(1-y(x))
Maps the input into [0, 1].
However, when the input is very large or very small there is almost no gradient: gradient saturation, also called the vanishing-gradient problem.
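A quick numerical check of this saturation effect using the derivative formula above; the sample inputs are illustrative:
import numpy as np
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
for v in [-20.0, -2.0, 0.0, 2.0, 20.0]:
    y = sigmoid(v)
    grad = y * (1 - y)   # y'(x) = y(x)(1 - y(x))
    print(v, y, grad)    # gradient is nearly 0 for large |x|: saturation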
2. tanh
f(x)=\tanh(x)=\frac{2}{1+e^{-2x}}-1
f'(x)=1-f^2(x)
Maps the input into [-1, 1].
Suffers from the same saturation problem as sigmoid.
3. ReLU
f(x)=\max(0,x)
Advantage 1: Krizhevsky et al. found that SGD converges much faster with ReLU than with sigmoid/tanh. A common explanation is that ReLU is linear on the positive side, so its gradient does not saturate.
Advantage 2: sigmoid/tanh require computing exponentials, which is expensive; ReLU only needs a threshold comparison to produce the activation.
Disadvantage 1: ReLU is "fragile" during training; a careless setup can cause neurons to "die" and stop activating permanently.
4. Leaky ReLU
f(x)=\max(0.01x,x)
Leaky ReLU was designed to fix the dying-ReLU problem. Unlike ReLU, for x<0 its value is not 0 but a line with a small slope (e.g. 0.01): f(x)=\mathbb{1}(x<0)\,ax+\mathbb{1}(x\ge 0)\,x, where a is a small constant. This keeps some of the values on the negative axis, so the negative-axis information is not lost entirely. a must not be too large, otherwise the function becomes essentially linear again.
Note: there is no multi-layer "linear regression network"; without nonlinear activations, any stack of linear layers collapses into a single layer.
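For reference, all four activations ship with TensorFlow; a small sketch comparing them on a few sample inputs (tf.nn.leaky_relu assumes TensorFlow >= 1.4; the inputs are illustrative):
import tensorflow as tf
# Sample inputs, including large magnitudes to expose saturation
x = tf.constant([-10.0, -1.0, 0.0, 1.0, 10.0])
acts = {
    "sigmoid": tf.nn.sigmoid(x),                    # squashes into (0, 1); saturates at both ends
    "tanh": tf.nn.tanh(x),                          # squashes into (-1, 1); also saturates
    "relu": tf.nn.relu(x),                          # max(0, x); zero gradient for x < 0
    "leaky_relu": tf.nn.leaky_relu(x, alpha=0.01),  # small slope for x < 0
}
with tf.Session() as sess:
    for name, op in acts.items():
        print(name, sess.run(op))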
Building a Neural Network
The value of each neuron is determined jointly by the previous layer's neuron values, the neuron's parameters W and b, and the activation function. The k-th neuron of layer n+1 is given by
z_{n+1,k}=\sum_{i=1}^{m}W_{n,k,i}\,x_{n,i}+b_{n,k}, a weighted sum over all values of the previous layer plus a bias;
y_{n+1,k}=\frac{1}{1+e^{-z_{n+1,k}}}, the next layer's value, obtained by passing z through the activation function (here a sigmoid).
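The two formulas written out for a single layer in NumPy; the layer sizes and sample values below are illustrative:
import numpy as np
def dense_sigmoid(x, W, b):
    # One fully connected layer: z = W x + b, then a sigmoid activation
    z = W @ x + b                    # weighted sum over the previous layer, plus bias
    return 1.0 / (1.0 + np.exp(-z))  # squash each z into (0, 1)
x = np.array([0.5, -1.0, 2.0])  # previous-layer values x_{n,i} (3 neurons)
W = np.random.randn(2, 3)       # weights W_{n,k,i} (next layer has 2 neurons)
b = np.random.randn(2)          # biases b_{n,k}
print(dense_sigmoid(x, W, b))   # next-layer values y_{n+1,k}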
Neural Network Optimization
Chain rule
Computing the gradient: the loss is differentiated backwards through the network,
\frac{\partial loss}{\partial w}=\frac{\partial loss}{\partial y}\cdot\frac{\partial y}{\partial x}\cdot\frac{\partial x}{\partial w}
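A small sketch of the chain rule in action: tf.gradients multiplies the local derivatives along the path from the loss back to w (the constants are illustrative):
import tensorflow as tf
w = tf.Variable(2.0)
x = 3.0 * w        # x depends on w
y = tf.square(x)   # y depends on x
loss = 5.0 * y     # loss depends on y
# Chain rule: dloss/dw = (dloss/dy) * (dy/dx) * (dx/dw) = 5 * 2x * 3 = 180
grad = tf.gradients(loss, w)[0]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grad))  # 180.0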
Code example: a multilayer perceptron on MNIST
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets(".", one_hot=True)
import tensorflow as tf
# Parameters
learning_rate = 0.001
training_epochs = 30
# Number of samples per training batch
batch_size = 100
display_step = 1
# Network Parameters
# Number of neurons per layer (the layer width); the network is 2 hidden layers deep
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 512 # 2nd layer number of features
# Input dimensionality
n_input = 784 # MNIST data input (img shape: 28*28)
# Output: 10 classes
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
# Create model
def multilayer_perceptron(x, weights, biases):
    # First hidden layer with ReLU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Second hidden layer with ReLU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # A dropout layer could be added here to reduce overfitting:
    # drop_out = tf.nn.dropout(layer_2, 0.75)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer
# Store layer weights & biases
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}
# Construct the model output
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=",
                  "{:.9f}".format(avg_cost))
    print("Optimization Finished!")
    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
Neural Network Components
Loss function
The loss measures the difference between the predicted value and the ground truth.
1. softmax
\sigma(z)_j=\frac{e^{z_j}}{\sum_{k=1}^{K}e^{z_k}}
Softmax is monotonically increasing: the larger the input, the larger the output, so it emphasizes the largest value.
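A minimal NumPy sketch of softmax; subtracting the maximum is a standard numerical-stability trick, and the logits are illustrative:
import numpy as np
def softmax(z):
    e = np.exp(z - np.max(z))  # shift for stability; the result is unchanged
    return e / e.sum()
print(softmax(np.array([1.0, 2.0, 5.0])))  # the largest logit dominates the output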
2. cross entropy
L(w)=-\frac{1}{N}\sum_{n=1}^{N}\left[y_n\log\hat{y}_n+(1-y_n)\log(1-\hat{y}_n)\right]
The larger the error, the larger the gradient, so the parameters w are adjusted faster and training speeds up; the predictions lie in [0, 1].
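A minimal NumPy sketch of this binary cross-entropy loss, showing that a larger error yields a larger loss; the clipping epsilon and sample predictions are illustrative:
import numpy as np
def binary_cross_entropy(y_true, y_pred, eps=1e-12):
    y_pred = np.clip(y_pred, eps, 1 - eps)  # avoid log(0)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
y_true = np.array([1.0, 0.0, 1.0])
print(binary_cross_entropy(y_true, np.array([0.9, 0.1, 0.8])))  # good predictions: small loss
print(binary_cross_entropy(y_true, np.array([0.2, 0.9, 0.3])))  # bad predictions: large loss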
3. Custom loss functions
Learning rate and momentum
Plain update: x += -learning_rate * dx
Momentum update:
v = mu * v - learning_rate * dx
x += v
Momentum tends to find a better descent direction than simply raising the learning rate.
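The two update rules side by side on the toy objective f(x) = x^2, whose gradient is dx = 2x; the learning rate and mu values are illustrative:
learning_rate, mu = 0.1, 0.9
x_plain = 5.0
x_mom, v = 5.0, 0.0
for _ in range(20):
    dx = 2 * x_plain
    x_plain += -learning_rate * dx   # plain gradient step
    dx = 2 * x_mom
    v = mu * v - learning_rate * dx  # velocity accumulates past gradients
    x_mom += v                       # momentum step
print(x_plain, x_mom)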
Overfitting
Countermeasures (a TensorFlow sketch follows below):
1. Regularization: penalize large weights so the parameters stay balanced (weight decay)
2. Dropout: randomly zero out some activations, which also has a parameter-balancing effect
3. Fine-tuning: keep most of the parameters frozen and update only a few
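A sketch of these remedies in TensorFlow 1.x; the L2 scale, keep probability, shapes, and the choice of which variables stay trainable are all illustrative (dropout also appeared, commented out, in the MLP example above):
import tensorflow as tf
W = tf.Variable(tf.random_normal([256, 10]))  # a layer's weights (illustrative shape)
data_loss = tf.constant(1.0)                  # stand-in for the task loss
# 1. Regularization / weight decay: penalize large weights
l2_penalty = 0.01 * tf.nn.l2_loss(W)
total_loss = data_loss + l2_penalty
# 2. Dropout: randomly zero out 25% of the activations
h = tf.random_normal([32, 256])               # stand-in activations
h_dropped = tf.nn.dropout(h, keep_prob=0.75)
# 3. Fine-tuning: pass only the variables to be updated to the optimizer
trainable = [W]                               # everything else stays frozen
train_op = tf.train.AdamOptimizer(0.001).minimize(total_loss, var_list=trainable)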