FBDQL的相关注释与目标追踪想法记录

最新推荐文章于 2023-08-20 11:35:47 发布

weixin_30449453

最新推荐文章于 2023-08-20 11:35:47 发布

阅读量159

点赞数

文章标签：人工智能 python

原文链接：http://www.cnblogs.com/yy1921rz/p/10963905.html

版权

#!/usr/bin/env python
from __future__ import print_function
import tensorflow as tf
import cv2  # OpenCV, used for frame preprocessing
import sys
sys.path.append("game/")
import wrapped_flappy_bird as game
import random
import numpy as np
from collections import deque  # used as the experience replay memory
GAME = 'bird' # the name of the game being played for log files
ACTIONS = 2 # number of valid actions
GAMMA = 0.99 # decay rate of past observations (discount factor)
OBSERVE = 100000. # timesteps to observe before training: for the first 100,000 steps the agent only interacts with the environment and fills the replay memory; gradient updates start afterwards
EXPLORE = 2000000. # frames over which to anneal epsilon (length of the "explore" phase that follows the observe phase)

FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.0001 # starting value of epsilon (same as FINAL_EPSILON here, so epsilon stays constant)

REPLAY_MEMORY = 50000 # number of previous transitions to remember
# Experience replay: addresses correlated samples and a non-stationary data distribution. Every transition the agent observes while interacting with the environment is stored in the replay memory, and training draws random samples from it.
BATCH = 32 # size of minibatch
# Mini-batch training is gradient descent at heart. There are two basic ways to update the parameters:
# 1. Batch Gradient Descent: go through the whole data set to compute the loss once, then compute the gradient with respect to every parameter and update.
# 2. Stochastic Gradient Descent: compute the loss and update the parameters after every single sample.
# 3. Mini-batch Gradient Descent (a compromise between 1 and 2): split the data into batches and update once per batch. The samples of a batch jointly determine the gradient direction, which reduces randomness, while a batch is much smaller than the full data set, which keeps the computation cheap.

FRAME_PER_ACTION = 1

def weight_variable(shape):
    """Create a trainable weight tensor of the given shape.

    The initial values are drawn from a truncated normal distribution
    with a standard deviation of 0.01.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))
def bias_variable(shape):
    """Create a trainable bias tensor of the given shape, filled with 0.01."""
    return tf.Variable(tf.constant(0.01, shape=shape))
def conv2d(x, W, stride):
    """Convolve x with kernel W using the same stride on both spatial axes.

    "SAME" padding is used, and the batch and channel strides are fixed to 1.
    """
    step = [1, stride, stride, 1]
    return tf.nn.conv2d(x, W, strides=step, padding="SAME")
def max_pool_2x2(x):
    """Apply 2x2 max pooling with stride 2 and "SAME" padding to x."""
    return tf.nn.max_pool(
        x,
        ksize=[1, 2, 2, 1],
        strides=[1, 2, 2, 1],
        padding="SAME",
    )
def createNetwork():
    """Build the convolutional Q-network graph.

    Spatial sizes, given the "SAME" padding used by conv2d/max_pool_2x2:
    80x80 input -> conv1 (stride 4) 20x20x32 -> 2x2 max pool 10x10x32
    -> conv2 (stride 2) 5x5x64 -> conv3 (stride 1) 5x5x64, which is
    flattened to 5 * 5 * 64 = 1600 features.

    Returns:
        A tuple (s, readout, h_fc1):
        s       -- float placeholder of shape [None, 80, 80, 4] for the state.
        readout -- linear output of shape [None, ACTIONS], one Q-value per action.
        h_fc1   -- ReLU output of the 512-unit fully connected layer.
    """
    # NOTE(review): the variables below are created without explicit names, so
    # saved checkpoints presumably identify them by creation order -- verify
    # before reordering these calls.

    # weights and biases of the three convolutional layers
    W_conv1 = weight_variable([8, 8, 4, 32])
    b_conv1 = bias_variable([32])
    W_conv2 = weight_variable([4, 4, 32, 64])
    b_conv2 = bias_variable([64])
    W_conv3 = weight_variable([3, 3, 64, 64])
    b_conv3 = bias_variable([64])

    # weights and biases of the two fully connected layers
    W_fc1 = weight_variable([1600, 512])
    b_fc1 = bias_variable([512])
    W_fc2 = weight_variable([512, ACTIONS])
    b_fc2 = bias_variable([ACTIONS])
    # input layer: a batch of 80x80 images with 4 channels
    s = tf.placeholder("float", [None, 80, 80, 4])
    # hidden layers
    h_conv1 = tf.nn.relu(conv2d(s, W_conv1, 4) + b_conv1)  # output of the first convolutional layer
    h_pool1 = max_pool_2x2(h_conv1)  # output of the first (and only active) pooling layer
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, 2) + b_conv2)
    #h_pool2 = max_pool_2x2(h_conv2)  (pooling after conv2 is disabled)
    h_conv3 = tf.nn.relu(conv2d(h_conv2, W_conv3, 1) + b_conv3)
    #h_pool3 = max_pool_2x2(h_conv3)  (pooling after conv3 is disabled)
    #h_pool3_flat = tf.reshape(h_pool3, [-1, 256])
    h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])  # flatten each sample to 1600 features for the fully connected layer; the batch dimension is inferred (-1)
    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)  # output of the first fully connected layer
    # readout layer
    readout = tf.matmul(h_fc1, W_fc2) + b_fc2  # final network output: one Q-value per action (ACTIONS columns)
    # return the state placeholder, the Q-value output and the hidden fully connected layer
    return s, readout, h_fc1
def _preprocess_frame(frame):
    """Resize a raw game frame to 80x80, convert it to grayscale and binarize it (0 / 255)."""
    gray = cv2.cvtColor(cv2.resize(frame, (80, 80)), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return binary


def trainNetwork(s, readout, h_fc1, sess):
    """Run the deep Q-learning loop against the Flappy Bird emulator.

    The loop never terminates on its own; it runs until the process is
    interrupted. Every 10000 steps the network is saved under
    "saved_networks/".

    Args:
        s: state placeholder returned by createNetwork.
        readout: Q-value output tensor returned by createNetwork.
        h_fc1: hidden fully connected layer returned by createNetwork; only
            referenced by the disabled logging code at the end of the loop.
        sess: TensorFlow session used to initialise, restore and save the
            variables. The .eval()/.run() calls below do not pass a session
            explicitly, so it must also be the default session.
    """
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])  # one-hot mask of the action that was taken
    y = tf.placeholder("float", [None])  # target Q-values
    # Q-value of the taken action: mask the readout with the one-hot action and sum
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))  # mean squared error against the target
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory; maxlen makes the deque
    # drop the oldest transition automatically once REPLAY_MEMORY is reached
    D = deque(maxlen=REPLAY_MEMORY)

    # The log files are only written by the disabled logging code at the end of
    # the loop, but they are still opened (and therefore created/truncated) as
    # before. The with-block guarantees they are closed when the loop is
    # interrupted.
    with open("logs_" + GAME + "/readout.txt", 'w') as a_file, \
            open("logs_" + GAME + "/hidden.txt", 'w') as h_file:
        # get the first state by doing nothing and preprocess the image to 80x80x4
        do_nothing = np.zeros(ACTIONS)
        do_nothing[0] = 1  # index 0 is the "do nothing" action
        x_t, _, _ = game_state.frame_step(do_nothing)
        x_t = _preprocess_frame(x_t)
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)  # repeat the first frame on all 4 channels

        # saving and loading networks
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

        # start training
        epsilon = INITIAL_EPSILON  # probability of picking a random action
        t = 0
        while True:
            # choose an action epsilon greedily
            readout_t = readout.eval(feed_dict={s: [s_t]})[0]  # Q-values of the current state
            a_t = np.zeros([ACTIONS])
            action_index = 0  # default: do nothing (used on frames where no action is chosen)
            if t % FRAME_PER_ACTION == 0:
                if random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(ACTIONS)
                else:
                    action_index = np.argmax(readout_t)  # greedy action
            # Use the same index for the executed action and for the log line
            # below, so the printed ACTION is always the action that was taken.
            a_t[action_index] = 1

            # scale down epsilon to reduce the share of random actions over time
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            # run the selected action and observe next state and reward
            x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = np.reshape(_preprocess_frame(x_t1_colored), (80, 80, 1))
            # newest frame goes to channel 0, the three most recent old frames follow
            s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))

            # only train if done observing
            if t > OBSERVE:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                # get the batch variables
                s_j_batch = [d[0] for d in minibatch]
                a_batch = [d[1] for d in minibatch]
                s_j1_batch = [d[3] for d in minibatch]

                # Q-values of the successor states
                readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
                # target: the reward alone for terminal transitions, otherwise
                # reward + GAMMA * max_a' Q(s', a')
                y_batch = [
                    r_j if done_j else r_j + GAMMA * np.max(q_j1)
                    for (_, _, r_j, _, done_j), q_j1 in zip(minibatch, readout_j1_batch)
                ]

                # perform gradient step
                train_step.run(feed_dict={
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch,
                })

            # update the old values
            s_t = s_t1
            t += 1

            # save progress every 10000 iterations
            if t % 10000 == 0:
                saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

            # print info: which of the three phases (observe / explore / train) we are in
            if t <= OBSERVE:
                state = "observe"
            elif t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"

            # TIMESTEP is the number of interactions with the environment so far
            print("TIMESTEP", t, "/ STATE", state,
                  "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,
                  "/ Q_MAX %e" % np.max(readout_t))

            # write info to files (disabled)
            # if t % 10000 <= 100:
            #     a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            #     h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
            #     cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
def playGame():
    """Open an interactive TensorFlow session, build the Q-network and train it."""
    session = tf.InteractiveSession()
    state_input, q_values, hidden_fc = createNetwork()
    trainNetwork(state_input, q_values, hidden_fc, session)
def main():
    """Script entry point: start the game and the training loop."""
    playGame()
# Start training only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

#图像灰度化和二值化的作用参考：https://blog.csdn.net/chenyuping333/article/details/81483593
#卷积网络模块的作用是根据当前状态来预测当前应该执行的最佳动作，其中当前状态是用4帧连续的游戏窗口图像来表示的，最佳动作是指Q值最大的动作。
#强化学习模块的作用是对卷积网络的权值和偏置进行学习训练，使其实现一个从游戏当前状态到动作Q值的映射。
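#Flappy Bird的学习训练过程比较简单：折扣因子GAMMA取为0.99，每次从经验池中随机选择32个样本（每个样本包括当前状态 $S_{t_i}$、该状态下的动作 $A_{t_i}$、动作对应的奖励 $R_{t_i+1}$、下一个状态 $S_{t_i+1}$ 以及游戏是否结束的标志），据此估计32个目标Q值（$i=1,2,\dots,32$），估计公式为 $Q(S_{t_i}, A_{t_i}) \leftarrow R_{t_i+1} + 0.99\,\max_{A'} Q(S_{t_i+1}, A')$；若该样本对应游戏结束，则目标值只取 $R_{t_i+1}$。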
#其中，最后32个Q值用来近似计算梯度，并根据梯度下降算法更新网络参数Theta。

疑问：
1.他在每次与环境交互后会在经验池中存入样本，那么他目前的动作与训练就不是实时的，而是在训练时从经验池中取出先前的样本然后再计算Q值，然后通过经验池中随机选取的这32个样本得到的Q值通过梯度下降算法来更新神经网络的参数。

与小车代码的连接：
小车的自动驾驶是通过收集大量数据记录标签经过训练后得到模型，在自动驾驶时通过摄像头实时返回的照片与之前模型中的前后左右所分类的特征进行比对，然后找到概率最大的操作来使小车行进。

目标追踪：
我们在摄像头的使用方面可以参照小车中的相关做法。但在FBDQL中，游戏本身有一个判断机制：无碰撞无穿越、穿越正对管道间隙、发生碰撞游戏结束，这三种情况可以对应三种动作奖赏值，从而可以引入强化学习模型；而在目标追踪中，摄像头自身没有判断机制，或者说我们应该加一个判断机制，比如object刚好在正中央时reward为+1，object不在视野范围内时为-1，其他虽然在视野范围内但偏离中心的情况为+0.1。是否需要另一个模型来提取object是否在正中央的特征？另外，距离的远近也关乎着小车的前进与后退，我们如何通过该模型解决距离问题？

代码转载自https://github.com/yenchenlin/DeepLearningFlappyBird

转载于:https://www.cnblogs.com/yy1921rz/p/10963905.html