【demo】DDQN playing CartPole in gym

import gym
from contents.OpenAI_gym.my_rl_brain import Double_DQN_Agent

my_env = gym.make("CartPole-v0")

my_env = my_env.unwrapped


print("observation:",my_env.observation_space)
print("observation_max",my_env.observation_space.high)
print("observation_mim",my_env.observation_space.low)
print("actions:",my_env.action_space)

my_agent = Double_DQN_Agent(dim_observation=my_env.observation_space.shape[0],
                            dim_action=my_env.action_space.n,
                            reward_decay=0.95,
                            learning_rate=0.05,
                            batch_size=32,
                            memory_size=800,
                            epsilon=0.7,
                            output_graph=True)

total_steps = 0

for i in range(1000):

    observation = my_env.reset()

    epi_reward = 0

    while True:

        my_env.render()

        action = my_agent.choose_action(observation)

        observation_, reward, done, info  = my_env.step(action)

        # shaped reward: larger when the cart is near the centre of the track (r1)
        # and the pole is close to upright (r2)
        x, x_dot, theta, theta_dot = observation_
        r1 = (my_env.x_threshold - abs(x)) / my_env.x_threshold - 0.8
        r2 = (my_env.theta_threshold_radians - abs(theta)) / my_env.theta_threshold_radians - 0.5
        reward = 0.6*r1 + 0.4*r2

        epi_reward = epi_reward + reward

        my_agent.store_transition(observation, action, reward, observation_)

        total_steps = total_steps + 1

        # start learning only after enough transitions have been collected
        if total_steps > 1000:
            my_agent.learning()

        if done:
            break

        observation = observation_

    my_agent.episode = my_agent.episode + 1  # used by the agent's epsilon schedule in learning()

    print("当前第%s轮episode,总reward为%s,当前轮的贪婪值:%s" % (i, epi_reward, my_agent.epsilon))

my_agent.plot_cost()


# ---------------- my_rl_brain.py: the Double_DQN_Agent imported by the script above ----------------

import tensorflow as tf
import numpy as np
import pandas as pd
import random


class Double_DQN_Agent:

    def __init__(self, dim_observation, dim_action, reward_decay, learning_rate, batch_size, memory_size, epsilon, output_graph):
        self.dim_observation = dim_observation
        self.dim_action = dim_action
        self.gamma = reward_decay
        self.memory_size = memory_size
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.episode = 0
        self.epsilon = epsilon
        self.epsilon_base = epsilon

        # one transition per row: s, a, r, s_
        self.memory = np.zeros((memory_size, dim_observation*2 + 2))

        print("dim_observation", dim_observation)
        print("dim_action", dim_action)

        self.transition_count = 0
        self.step_counter = 0
        self.cost_list = []

        # build the graph first, then look up the parameters of the two networks
        self.build_network()
        self.target_params = tf.get_collection("target_network_para")
        self.evaluate_params = tf.get_collection("evaluate_network_para")

        # ops that copy the evaluate_network parameters into the target_network
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(self.target_params, self.evaluate_params)]

        self.sess = tf.Session()

        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())




    def replace_target(self):
        # copy the evaluate_network parameters into the target_network (lagged update)
        self.sess.run(self.replace_target_op)


    def build_network(self):

        #--------------------------------------- evaluate_network (trained every learning step) ---------------------------------------

        # inputs: current state and the TD target computed in learning()
        self.state = tf.placeholder(tf.float32, [None, self.dim_observation], name="input_state")
        self.q_target = tf.placeholder(tf.float32, [None, self.dim_action], name="input_q_target")

        # network structure and parameter collections

        with tf.variable_scope("evaluate_network"):
            c_names,n_l1,w_initializer,b_initializer = ["evaluate_network_para",tf.GraphKeys.GLOBAL_VARIABLES],\
                                                       10,\
                                                       tf.random_normal_initializer(0.0,0.3),\
                                                       tf.constant_initializer(0.2)

            with tf.variable_scope("l1"):
                w1 = tf.get_variable("w1", [self.dim_observation,n_l1],initializer=w_initializer,collections=c_names)
                b1 = tf.get_variable("b1",[1,n_l1],initializer=b_initializer,collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.state,w1) + b1)

            with tf.variable_scope("l2"):
                w2 = tf.get_variable("w2",[n_l1,self.dim_action],initializer=w_initializer,collections=c_names)
                b2 = tf.get_variable("b2",[1,self.dim_action],initializer=b_initializer,collections=c_names )
                self.q_eval = tf.matmul(l1,w2) + b2

        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_eval,self.q_target))

        with tf.variable_scope("train"):
            self.train_op = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

        #--------------------------------------- target_network (lagged copy of evaluate_network) ---------------------------------------

        self.state_next = tf.placeholder(tf.float32,[None,self.dim_observation],name = "input_next_state")
        with tf.variable_scope("target_network"):
            c_names,n_l1,w_initializer,b_initializer = ["target_network_para",tf.GraphKeys.GLOBAL_VARIABLES],10,tf.random_normal_initializer(0.,0.3),tf.constant_initializer(0.2)

            with tf.variable_scope("l1"):
                w1 = tf.get_variable("w1", [self.dim_observation, n_l1], initializer=w_initializer,
                                     collections=c_names)
                b1 = tf.get_variable("b1", [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.state_next,w1) + b1)

            with tf.variable_scope("l2"):
                w2 = tf.get_variable("w1", [n_l1, self.dim_action], initializer=w_initializer,
                                     collections=c_names)
                b2 = tf.get_variable("b2", [1, self.dim_action], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1,w2) + b2


    def store_transition(self, s, a, r, s_):
        # flatten one transition into a single memory row: s, a, r, s_
        item = np.hstack((s, [a, r], s_))
        # overwrite the oldest transition once the memory is full
        index = self.transition_count % self.memory_size
        self.memory[index, :] = item
        self.transition_count = self.transition_count + 1

    def choose_action(self, observation):
        random_value = random.random()
        observation = observation[np.newaxis, :]
        if random_value < self.epsilon:  # with probability epsilon pick the greedy (max-Q) action
            action_values = self.sess.run(self.q_eval, feed_dict={self.state: observation})
            action = np.argmax(action_values)
        else:
            # otherwise explore with a uniformly random action
            action = np.random.randint(0, self.dim_action)
        return action


    def learning(self):

        # after the first 90 episodes, anneal epsilon towards 1 (act more greedily over time)
        if self.episode > 90:
            self.epsilon = 1 - (1 - self.epsilon_base) * (400 / (400 + self.episode))
        else:
            self.epsilon = self.epsilon_base

        if self.transition_count < self.batch_size:
            print("Not enough experience yet, keep exploring")
            return

        # every 50 learning steps, copy the evaluate_network parameters to the target_network
        if self.step_counter % 50 == 0:
            self.replace_target()
        self.step_counter = self.step_counter + 1

        # sample a batch of transitions, only from the part of the memory that is already filled
        if self.batch_size <= self.transition_count and self.transition_count <= self.memory_size:
            sample_index = np.random.choice(self.transition_count, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)

        batch = self.memory[sample_index, :]

        #  each row of the batch has the format: s, a, r, s_
        #  learning procedure: split the batch into s, a, r, s_
        #    (1) feed s into the frequently updated evaluate_network to get q_evaluate
        #    (2) feed s_ into both networks: the evaluate_network picks the next action,
        #        the lagged target_network scores it, giving q_target = r + gamma * q_next

        s = batch[:, :self.dim_observation]
        a = batch[:, self.dim_observation].astype(int)
        r = batch[:, self.dim_observation + 1]
        s_ = batch[:, -self.dim_observation:]

        q_next, q_evaluate = self.sess.run([self.q_next, self.q_eval],
                                           feed_dict={self.state_next: s_, self.state: s})

        # Double DQN: the evaluate_network selects the next action, the target_network evaluates it
        q_eval_next = self.sess.run(self.q_eval, feed_dict={self.state: s_})
        next_action = np.argmax(q_eval_next, axis=1)

        # rewrite q_target = r + gamma * q_next only at the actions actually taken,
        # so the squared error is non-zero only for those actions
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target = q_evaluate.copy()
        q_target[batch_index, a] = r + self.gamma * q_next[batch_index, next_action]

        # train only the evaluate_network
        _, cost = self.sess.run([self.train_op, self.loss],
                                feed_dict={self.q_target: q_target, self.state: s})

        self.cost_list.append(cost)


    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_list)), self.cost_list)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

  • To be clear: DQN's input is a continuous observation, while the number of actions is fixed and discrete and is already embedded in the network structure (one output per action). In other words, DQN handles the class of RL problems with continuous states and a fixed, discrete action set.
  • Double-DQN has two networks: an evaluate-network and a target-network.
    • The loss (the TD error) comes from q_evaluate - (r + gamma*q_next); see the sketch after this list.
    • That loss immediately updates the evaluate-network that produced q_evaluate; the target-network is updated with a lag, by copying the parameters over every few time steps.
  • DQN never truly converges; training only makes the oscillations much smaller [intuition].
  • DQN is a "value-function approximation" method: for any given state, a function computes that state's state-value and its action-values.
    • The catch: when a single state has a very large number of actions, the network becomes very hard to train; in the extreme case of continuous (infinitely many) actions, the approach fails completely. That is where policy-gradient methods are needed.
    • Value-based methods all boil down to picking the single action with the highest value, i.e. a deterministic policy. But the optimal policy is often stochastic, and then value-based policies break down.
    • Value-based RL solves many problems, but when actions are continuous, observations are only partially observable, or a stochastic policy fits better than a deterministic one, the value-based approach struggles. [This is exactly why policy-based methods were introduced.]
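The loss bullet above can be made concrete with a small standalone NumPy sketch (a minimal illustration, not part of the script: the toy Q-values and rewards below are invented, and the names q_eval_next / q_next / next_action simply mirror the learning() method above). It contrasts the vanilla DQN target, where the target-network both selects and evaluates the next action, with the Double-DQN target, where the evaluate-network selects and the lagged target-network evaluates, which is what reduces Q-value over-estimation.

import numpy as np

# toy Q-values for a batch of 2 transitions, 2 actions (CartPole: push left / push right)
q_eval_next = np.array([[2.0, 1.0],    # evaluate_network's Q(s_, .)
                        [0.5, 3.0]])
q_next      = np.array([[0.9, 1.5],    # target_network's Q(s_, .)
                        [2.0, 0.7]])
r = np.array([1.0, 1.0])               # rewards from the sampled batch
gamma = 0.95

# vanilla DQN target: max over the target_network's own Q-values
dqn_target = r + gamma * q_next.max(axis=1)
# = [1 + 0.95*1.5, 1 + 0.95*2.0] = [2.425, 2.9]

# Double DQN target: evaluate_network picks the action, target_network scores it
next_action = q_eval_next.argmax(axis=1)                     # actions [0, 1]
ddqn_target = r + gamma * q_next[np.arange(2), next_action]
# = [1 + 0.95*0.9, 1 + 0.95*0.7] = [1.855, 1.665]

print(dqn_target, ddqn_target)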