The DQN pipeline and a TensorFlow implementation

The training loop, step by step (a small sketch of steps 3 and 4 follows this list):

1. Step the game to collect the current state, action, reward, next state, and done flag.
2. Feed the current state through the Q network and pick out the Q value of the action that was taken.
3. Feed the next state through the target network and take the maximum of its Q values.
4. Form the target Q value: Q = current reward + discounted maximum Q of the next state; if the episode has ended, the target is just the current reward.
5. Take the loss between the target Q value and the Q value predicted for the current state-action pair.
6. Play the game, choosing the action with the largest Q value (with some random exploration).
7. Write the freshly collected transitions back into the experience pool, replacing exactly the ones that were sampled.
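As a minimal NumPy-only sketch of steps 3 and 4 (the arrays here are made-up example inputs, not part of the implementation below):

import numpy as np

def td_targets(rewards, next_q_values, dones, discount=0.9):
    # Maximum over actions of the target network's Q values for the next states.
    max_next_q = next_q_values.max(axis=1)
    # On terminal steps the target is just the reward; otherwise reward + discounted max Q.
    return np.where(dones, rewards, rewards + discount * max_next_q)

rewards = np.array([1.0, 1.0])
next_q = np.array([[0.5, 2.0],   # non-terminal step
                   [0.3, 0.1]])  # terminal step
dones = np.array([False, True])
print(td_targets(rewards, next_q, dones))  # [2.8 1. ]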

import tensorflow as tf
import gym
import numpy as np
import time

# Online Q network: produces the predicted Q values.
class QNet:

    def __init__(self):

        # CartPole-v0 observations have 4 dimensions; two hidden layers of 30 units.
        self.w1 = tf.Variable(tf.truncated_normal([4, 30], stddev=0.1))
        self.b1 = tf.Variable(tf.zeros([30]))

        self.w2 = tf.Variable(tf.truncated_normal([30, 30], stddev=0.1))
        self.b2 = tf.Variable(tf.zeros([30]))

        # Output layer: one Q value per action (CartPole has 2 discrete actions).
        self.w3 = tf.Variable(tf.truncated_normal([30, 2], stddev=0.1))
        self.b3 = tf.Variable(tf.zeros([2]))


    def forward(self,observation):
        y = tf.nn.relu(tf.matmul(observation,self.w1)+self.b1)
        y = tf.nn.relu(tf.matmul(y, self.w2) + self.b2)
        y = tf.matmul(y, self.w3) + self.b3

        return y
# Target network: provides the Q values used to build the training target.
class TargetQNet:

    def __init__(self):
        self.w1 = tf.Variable(tf.truncated_normal([4, 30], stddev=0.1))
        self.b1 = tf.Variable(tf.zeros([30]))

        self.w2 = tf.Variable(tf.truncated_normal([30, 30], stddev=0.1))
        self.b2 = tf.Variable(tf.zeros([30]))

        self.w3 = tf.Variable(tf.truncated_normal([30, 2], stddev=0.1))
        self.b3 = tf.Variable(tf.zeros([2]))

    def forward(self,next_observation):
        y = tf.nn.relu(tf.matmul(next_observation,self.w1)+self.b1)
        y = tf.nn.relu(tf.matmul(y, self.w2) + self.b2)
        y = tf.matmul(y, self.w3) + self.b3

        return y
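Since QNet and TargetQNet are structurally identical, the duplication can be removed by defining one network class and instantiating it twice. A refactoring sketch (the name QNetwork is mine, not from the original code):

class QNetwork:
    def __init__(self):
        self.w1 = tf.Variable(tf.truncated_normal([4, 30], stddev=0.1))
        self.b1 = tf.Variable(tf.zeros([30]))
        self.w2 = tf.Variable(tf.truncated_normal([30, 30], stddev=0.1))
        self.b2 = tf.Variable(tf.zeros([30]))
        self.w3 = tf.Variable(tf.truncated_normal([30, 2], stddev=0.1))
        self.b3 = tf.Variable(tf.zeros([2]))

    def forward(self, x):
        y = tf.nn.relu(tf.matmul(x, self.w1) + self.b1)
        y = tf.nn.relu(tf.matmul(y, self.w2) + self.b2)
        return tf.matmul(y, self.w3) + self.b3

# In Net.__init__ one would then write:
# self.qNet = QNetwork()
# self.targetQNet = QNetwork()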

class Net:

    def __init__(self):
        # Placeholders for a batch of transitions (s, a, r, s', done).
        self.observation = tf.placeholder(dtype=tf.float32, shape=[None, 4])
        self.action = tf.placeholder(dtype=tf.int32, shape=[None, 1])
        self.reward = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        self.next_observation = tf.placeholder(dtype=tf.float32, shape=[None, 4])
        self.done = tf.placeholder(dtype=tf.bool, shape=[None])

        self.qNet = QNet()
        self.targetQNet = TargetQNet()

    def forward(self, discount):
        # Q values for the current states (one per action).
        self.pre_qs = self.qNet.forward(self.observation)
        # Pick out the Q value of the action that was actually taken, via a one-hot mask.
        self.pre_q = tf.expand_dims(
            tf.reduce_sum(tf.multiply(tf.squeeze(tf.one_hot(self.action, 2), axis=1), self.pre_qs), axis=1),
            axis=1)

        # Target network's Q values for the next states: Q(t+1).
        self.next_qs = self.targetQNet.forward(self.next_observation)
        # Maximum over actions: max Q(t+1).
        self.next_q = tf.expand_dims(tf.reduce_max(self.next_qs, axis=1), axis=1)

        # Target Q value: just the reward on terminal steps, otherwise Q(t) = r(t) + discount * max Q(t+1).
        # stop_gradient keeps the loss from updating the target network's weights.
        self.target_q = tf.stop_gradient(
            tf.where(self.done, self.reward, self.reward + discount * self.next_q))


    def play(self):
        self.qs = self.qNet.forward(self.observation)
        # The index of the largest Q value is the greedy action.
        return tf.argmax(self.qs, axis=1)


    def backward(self):
        # Mean squared error between the target Q value and the predicted Q value.
        self.loss = tf.reduce_mean((self.target_q - self.pre_q) ** 2)
        # RMSProp tends to train fairly smoothly here.
        self.optimizer = tf.train.RMSPropOptimizer(0.01).minimize(self.loss)

    def copy_params(self):
        # Copy the online Q network's weights into the target network.
        return [
            tf.assign(self.targetQNet.w1,self.qNet.w1),
            tf.assign(self.targetQNet.w2, self.qNet.w2),
            tf.assign(self.targetQNet.w3, self.qNet.w3),
            tf.assign(self.targetQNet.b1, self.qNet.b1),
            tf.assign(self.targetQNet.b2, self.qNet.b2),
            tf.assign(self.targetQNet.b3, self.qNet.b3),
        ]
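For reference, the six assign ops can be generated by iterating over the parameter names instead of listing them by hand; a compact sketch assuming the attribute names used above:

    def copy_params_compact(self):
        # Equivalent to copy_params: one assign per weight/bias pair, grouped into a single op.
        names = ['w1', 'b1', 'w2', 'b2', 'w3', 'b3']
        return tf.group(*[tf.assign(getattr(self.targetQNet, n), getattr(self.qNet, n))
                          for n in names])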

class Game:

    def __init__(self):
        self.env = gym.make('CartPole-v0')

        # Experience pool (replay buffer) used for training.
        self.experience_pool = []
        # Initial state of the game.
        self.observation = self.env.reset()

        # Pre-fill the pool with 10000 transitions from random play.
        for i in range(10000):
            # Sample actions at random at first (the agent doesn't yet know how to act).
            action = self.env.action_space.sample()
            # Step the environment: next state, reward, done flag, info.
            next_observation, reward, done, info = self.env.step(action)
            # Store the transition as [S_t, R_t, A_t, S_{t+1}, done].
            self.experience_pool.append([self.observation, reward, action, next_observation, done])
            if done:  # done=True means the episode ended.
                # Reset the game.
                self.observation = self.env.reset()
            else:
                self.observation = next_observation
    # Sample a batch of experiences.
    def get_experiences(self, batch_size):
        experiences = []
        idxs = []
        for _ in range(batch_size):
            # Sample uniformly at random to break temporal correlation.
            idx = np.random.randint(0, len(self.experience_pool))
            idxs.append(idx)  # Index of the sampled experience.
            experiences.append(self.experience_pool[idx])  # The experience itself.
        # idxs records which slots were sampled, so fresh experience can overwrite them later.
        return idxs, experiences
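The sampling loop can also be vectorized with NumPy; a small drop-in sketch for Game.get_experiences (note it samples with replacement, just like the loop above):

    def get_experiences(self, batch_size):
        idxs = np.random.randint(0, len(self.experience_pool), size=batch_size)
        experiences = [self.experience_pool[i] for i in idxs]
        return list(idxs), experiences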

    def reset(self):
        return self.env.reset()

    def render(self):
        self.env.render()

if __name__ == '__main__':
    game = Game()

    net = Net()
    net.forward(0.9)  # Discount factor 0.9.
    net.backward()
    copy_op = net.copy_params()
    run_action_op = net.play()  # Greedy-action op used when playing the game.

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        batch_size = 200  # Train on 200 sampled experiences per step.

        explore = 0.1  # Exploration rate: large early on, decayed towards a small floor.
        for k in range(10000000):
            idxs, experiences = game.get_experiences(batch_size)
            # Unpack the batch into per-field lists for the feed_dict.
            observations = []
            rewards = []
            actions = []
            next_observations = []
            dones = []

            for experience in experiences:
                observations.append(experience[0])
                rewards.append([experience[1]])
                actions.append([experience[2]])
                next_observations.append(experience[3])
                dones.append(experience[4])


            if k % 10 == 0:
                print("-------------------------------------- copy params -----------------------------------")
                sess.run(copy_op)
                # time.sleep(2)


            _loss,_ = sess.run([net.loss,net.optimizer],feed_dict={
                net.observation: observations,
                net.action: actions,
                net.reward: rewards,
                net.next_observation: next_observations,
                net.done:dones
            })

            # Decay the exploration rate, with a small floor.
            explore -= 0.0001
            if explore < 0.0001:
                explore = 0.0001

            print("********************************************", _loss, "********************************",explore)

            count = 0
            run_observation = game.reset()
            # Put back exactly as many fresh transitions as were sampled from the pool.
            for idx in idxs:
                if k > 500:
                    game.render()  # After 500 training iterations, render to watch the agent.

                # With probability `explore`, take a random action (exploration).
                if np.random.rand() < explore:
                    run_action = np.random.randint(0, 2)
                else:  # Otherwise take the greedy (max-Q) action.
                    run_action = sess.run(run_action_op, feed_dict={
                        net.observation: [run_observation]
                    })[0]
                # Collect the new transition.
                run_next_observation, run_reward, run_done, run_info = game.env.step(run_action)
                # Overwrite the old experience at the sampled slot.
                game.experience_pool[idx] = [run_observation, run_reward, run_action, run_next_observation, run_done]
                if run_done:
                    run_observation = game.reset()
                    count += 1
                else:
                    run_observation = run_next_observation
            print("episodes finished while refilling the pool:", count)

 
