Deep Reinforcement Learning: Reproducing the DDPG Algorithm

DDPG (Deep Deterministic Policy Gradient) is a deep reinforcement learning algorithm for continuous control, proposed in 2016 by Timothy P. Lillicrap et al. (Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, et al., "Continuous Control with Deep Reinforcement Learning", 2016). It combines an actor-critic architecture with the key ideas of DQN (experience replay and target networks), which addresses the instability and slow convergence of plain actor-critic training, and it has since become a classic deep reinforcement learning algorithm. See the paper for the full derivation; the following blog post is also a useful reference:
https://blog.csdn.net/qq_30615903/article/details/80776715?ops_request_misc=&request_id=&biz_id=102&utm_term=DDPG&utm_medium=distribute.pc_search_result.none-task-blog-2allsobaiduweb~default-0-80776715

The algorithm framework is as follows:
[Figure: DDPG algorithm framework (pseudocode from the paper)]
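
In brief, with critic Q(s, a | θ^Q), actor μ(s | θ^μ), target networks Q′ and μ′, soft-update rate τ, and a mini-batch of N transitions (s_i, a_i, r_i, s_{i+1}) drawn from the replay buffer, the updates implemented by the code below are (in LaTeX notation):

y_i = r_i + \gamma \, Q'\big(s_{i+1}, \mu'(s_{i+1} \mid \theta^{\mu'}) \mid \theta^{Q'}\big)

L = \frac{1}{N} \sum_i \big( y_i - Q(s_i, a_i \mid \theta^Q) \big)^2

\nabla_{\theta^\mu} J \approx \frac{1}{N} \sum_i \nabla_a Q(s, a \mid \theta^Q) \big|_{s=s_i,\, a=\mu(s_i)} \, \nabla_{\theta^\mu} \mu(s \mid \theta^\mu) \big|_{s=s_i}

\theta' \leftarrow \tau \theta + (1 - \tau) \theta'
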
Implementation Scenario

This post reproduces the algorithm from the paper on the gym environment "Pendulum-v0". Pendulum-v0 is a simple inverted-pendulum swing-up task: the goal is to apply torque so that the pendulum ends up balanced upright.
[Figure: the Pendulum-v0 environment]
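
As a quick check of the interface (an illustrative snippet, not part of the reproduction itself): the observation is a 3-dimensional vector and the action is a single torque bounded to [-2, 2]. These are the values behind input_dims=[3], n_actions=1, and env.action_space.high used further below.

import gym

env = gym.make('Pendulum-v0')
print(env.observation_space.shape)  # (3,): cos(theta), sin(theta), angular velocity
print(env.action_space.shape)       # (1,): a single joint torque
print(env.action_space.high)        # [2.]: actions lie in [-2, 2]
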
Platform
The code has been adapted to run on Windows. Editor: VSCode; Python 3.6.2; TensorFlow 1.4.0.

Implementation

Import the required packages

import os
import tensorflow as tf
import numpy as np 

Exploration noise

class OUActionNoise(object):
    # Ornstein-Uhlenbeck process: temporally correlated noise for exploration
    # in continuous action spaces, with the parameters used in the paper
    # (theta = 0.15, sigma = 0.2)
    def __init__(self,mu,sigma=0.2,theta=0.15,dt=1e-2,x0=None):
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.sigma = sigma
        self.x0 = x0
        self.reset()

    def __call__(self):
        # one Euler-Maruyama step of the OU process:
        # x_t = x_{t-1} + theta*(mu - x_{t-1})*dt + sigma*sqrt(dt)*N(0, 1)
        x = self.x_prev + self.theta*(self.mu - self.x_prev)*self.dt +\
            self.sigma*np.sqrt(self.dt)*np.random.normal(size=self.mu.shape)
            self.sigma*np.sqrt(self.dt)*np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
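
A minimal usage sketch (illustrative only): calling the noise object repeatedly produces temporally correlated samples that drift around mu, which is why the agent below simply adds self.noise() to the actor's deterministic output at every step.

# illustrative sketch: sample a few correlated noise values for a 1-D action
noise = OUActionNoise(mu=np.zeros(1))
print([float(noise()) for _ in range(5)])   # successive values change smoothly rather than independently
noise.reset()                               # restart the process from x0 (here: zeros)
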

Replay buffer

class ReplayBuffer(object):
    # fixed-size experience replay; the oldest transitions are overwritten once the buffer is full
    def __init__(self,max_size,input_shape,n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size,*input_shape))
        self.new_state_memory = np.zeros((self.mem_size,*input_shape))
        self.action_memory = np.zeros((self.mem_size,n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size,dtype=np.float32)

    def store_transition(self,state,action,reward,state_,done):
        index = self.mem_cntr%self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1-int(done)   # stored as 1 - done so terminal states mask out the bootstrap term
        self.mem_cntr += 1

    def sample_buffer(self,batch_size):
        max_mem = min(self.mem_cntr,self.mem_size)
        batch = np.random.choice(max_mem,batch_size)   # uniform random indices over the filled part of the buffer

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        new_states = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        terminal = self.terminal_memory[batch]

        return states,actions,new_states,rewards,terminal
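
A short round-trip sketch (again illustrative, using the Pendulum-v0 shapes from later in the post) shows what the buffer stores and returns:

# illustrative sketch: store dummy transitions and draw a batch
buffer = ReplayBuffer(max_size=100000, input_shape=[3], n_actions=1)
for _ in range(64):
    buffer.store_transition(np.random.randn(3), np.array([0.1]), -1.0,
                            np.random.randn(3), False)
states, actions, new_states, rewards, terminal = buffer.sample_buffer(64)
print(states.shape, actions.shape, rewards.shape)   # (64, 3) (64, 1) (64,)
print(terminal[:3])                                 # 1.0 entries, because done was False
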

Actor

class Actor(object):
    def __init__(self,lr,n_actions,name,input_dims,sess,fc1_dims,
                 fc2_dims,action_bound,batch_size=64,chkpt_dir = "DDPG_paper_reproduction"):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.name = name
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.sess = sess
        self.batch_size = batch_size
        self.action_bound = action_bound
        self.ckpt_dir = chkpt_dir
        self.build_network()
        self.params = tf.trainable_variables(scope=self.name)
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir,name+'_ddpg.ckpt')

        # deterministic policy gradient: chain dQ/da (fed in through action_gradient)
        # with dmu/dtheta; the minus sign turns gradient ascent on Q into a
        # minimization problem for AdamOptimizer
        self.unnormalized_actor_gradients = tf.gradients(self.mu,self.params,
                             -self.action_gradient)
        # average the gradient over the mini-batch
        self.actor_gradients = list(map(lambda x: tf.div(x,self.batch_size),
                                 self.unnormalized_actor_gradients))
        self.optimize = tf.train.AdamOptimizer(self.lr).\
            apply_gradients(zip(self.actor_gradients,self.params))

    def build_network(self):
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32,
                                        shape=[None,*self.input_dims],
                                        name = 'inputs')
            # dQ/da, computed by the critic and fed in during the actor update
            self.action_gradient = tf.placeholder(tf.float32,
                                                    shape = [None,self.n_actions])
            f1 = 1/np.sqrt(self.fc1_dims)
            dense1 = tf.layers.dense(self.input,units=self.fc1_dims,
                                        kernel_initializer=tf.random_uniform_initializer(-f1,f1),
                                        bias_initializer=tf.random_uniform_initializer(-f1,f1))
            batch1 = tf.layers.batch_normalization(dense1)
            layer1_activation = tf.nn.relu(batch1)

            f2 = 1/np.sqrt(self.fc2_dims)
            dense2 = tf.layers.dense(layer1_activation,units=self.fc2_dims,
                                        kernel_initializer=tf.random_uniform_initializer(minval=-f2,maxval=f2),
                                        bias_initializer=tf.random_uniform_initializer(minval=-f2,maxval=f2))
            batch2 = tf.layers.batch_normalization(dense2)
            layer2_activation = tf.nn.relu(batch2)

            f3 = 0.003   # final layer initialized in [-3e-3, 3e-3], as in the paper
            mu = tf.layers.dense(layer2_activation,units=self.n_actions,
                                        kernel_initializer=tf.random_uniform_initializer(minval=-f3,maxval=f3),
                                        bias_initializer=tf.random_uniform_initializer(-f3,f3))
            mu = tf.nn.tanh(mu)
            # tanh bounds the output to [-1, 1]; scale it to the environment's action range
            self.mu = tf.multiply(mu,self.action_bound)

    def predict(self,inputs):
        return self.sess.run(self.mu,feed_dict={self.input:inputs})

    def train(self,inputs,gradients):
        self.sess.run(self.optimize,
                        feed_dict = {self.input:inputs,
                                    self.action_gradient:gradients})

    def save_checkpoint(self):
        print('...saving checkpoint...')
        self.saver.save(self.sess,self.checkpoint_file)

    def load_checkpoint(self):
        print('...loading checkpoint...')
        self.saver.restore(self.sess,self.checkpoint_file)

Critic

class Critic(object):
    def __init__(self,lr,n_actions,name,input_dims,sess,fc1_dims,
                 fc2_dims,batch_size=64,chkpt_dir = "DDPG_paper_reproduction"):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.name = name
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.sess = sess
        self.batch_size = batch_size
        self.ckpt_dir = chkpt_dir
        self.build_network()
        self.params = tf.trainable_variables(scope=name)
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir,name+'_ddpg.ckpt')

        # the critic is trained by minimizing the MSE between q_target and q
        self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        # dQ/da, passed to the actor for the deterministic policy gradient
        self.action_gradients = tf.gradients(self.q,self.actions)

    def build_network(self):
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32,
                                        shape=[None,*self.input_dims],
                                        name = 'inputs')
            self.actions = tf.placeholder(tf.float32,
                                         shape = [None,self.n_actions],
                                         name='actions')
            self.q_target = tf.placeholder(tf.float32,
                                         shape = [None,1],
                                         name='targets')
            f1 = 1/np.sqrt(self.fc1_dims)
            dense1 = tf.layers.dense(self.input,units=self.fc1_dims,
                                     kernel_initializer=tf.random_uniform_initializer(-f1,f1),
                                    bias_initializer=tf.random_uniform_initializer(-f1,f1))
            batch1 = tf.layers.batch_normalization(dense1)
            layer1_activation = tf.nn.relu(batch1)

            f2 = 1/np.sqrt(self.fc2_dims)
            dense2 = tf.layers.dense(layer1_activation,units=self.fc2_dims,
                                     kernel_initializer=tf.random_uniform_initializer(-f2,f2),
                                     bias_initializer=tf.random_uniform_initializer(-f2,f2))
            batch2 = tf.layers.batch_normalization(dense2)

            # actions enter the network at the second hidden layer, as in the paper
            action_in = tf.layers.dense(self.actions,units=self.fc2_dims)
            action_in = tf.nn.relu(action_in)

            state_actions = tf.add(batch2,action_in)
            state_actions = tf.nn.relu(state_actions)

            f3 = 0.003
            self.q = tf.layers.dense(state_actions,units=1,
                                     kernel_initializer=tf.random_uniform_initializer(-f3,f3),
                                     bias_initializer=tf.random_uniform_initializer(-f3,f3),
                                     kernel_regularizer=tf.keras.regularizers.l2(0.01))
            self.loss = tf.losses.mean_squared_error(self.q_target,self.q)

    def predict(self,inputs,actions):
        return self.sess.run(self.q,
                                feed_dict={self.input:inputs,
                                        self.actions:actions})

    def train(self,inputs,actions,q_target):
        self.sess.run(self.optimize,
                        feed_dict = {self.input:inputs,
                                    self.actions:actions,
                                    self.q_target:q_target})

    def get_action_gradients(self,inputs,actions):
        return self.sess.run(self.action_gradients,
                                feed_dict={self.input:inputs,
                                        self.actions:actions})

    def save_checkpoint(self):
        print('...saving checkpoint...')
        self.saver.save(self.sess,self.checkpoint_file)

    def load_checkpoint(self):
        print('...loading checkpoint...')
        self.saver.restore(self.sess,self.checkpoint_file)

Agent

class Agent(object):
    def __init__(self,alpha,beta,input_dims,tau,env,gamma=0.99,
                 n_actions=2,max_size=100000,layer1_size=400,layer2_size=300,
                 batch_size=64):
        self.gamma=gamma
        self.tau=tau
        self.memory = ReplayBuffer(max_size,input_dims,n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.actor = Actor(alpha,n_actions,'Actor',input_dims,self.sess,
                            layer1_size,layer2_size,env.action_space.high)

        self.critic = Critic(beta,n_actions,'Critic',input_dims,self.sess,
                             layer1_size,layer2_size)

        self.target_actor = Actor(alpha,n_actions,'TargetActor',input_dims,
                                  self.sess,layer1_size,layer2_size,
                                  env.action_space.high)

        self.target_critic = Critic(beta,n_actions,'TargetCritic',input_dims,
                                    self.sess,layer1_size,layer2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # soft-update ops: target_params <- tau*online_params + (1 - tau)*target_params
        self.update_critic = \
            [self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i],self.tau)\
            +tf.multiply(self.target_critic.params[i],1-self.tau))
            for i in range(len(self.target_critic.params))]

        self.update_actor = \
            [self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i],self.tau)\
            +tf.multiply(self.target_actor.params[i],1-self.tau))
            for i in range(len(self.target_actor.params))]

        self.sess.run(tf.global_variables_initializer())

        self.update_network_parameters(first=True)

    def update_network_parameters(self,first=False):
        if first:
            # hard copy on the first call so the target networks start out identical
            # to the online networks (tau was baked into the soft-update ops as a
            # Python constant, so it cannot be changed at run time)
            self.sess.run([t.assign(s) for t, s in
                           zip(self.target_critic.params, self.critic.params)])
            self.sess.run([t.assign(s) for t, s in
                           zip(self.target_actor.params, self.actor.params)])
        else:
            # soft update with the configured tau
            self.target_critic.sess.run(self.update_critic)
            self.target_actor.sess.run(self.update_actor)

    def remember(self,state,action,reward,new_state,done):
        self.memory.store_transition(state,action,reward,new_state,done)

    def choose_action(self,state):
        state = state[np.newaxis,:]
        mu = self.actor.predict(state)
        noise = self.noise()
        mu_prime = mu + noise

        return mu_prime[0]

    def learn(self):
        # wait until at least one full batch of transitions has been collected
        if self.memory.mem_cntr < self.batch_size:
            return
        state,action,new_state,reward,done = \
            self.memory.sample_buffer(self.batch_size)

        # Q'(s', mu'(s')) from the target networks
        critic_value = self.target_critic.predict(new_state,
                                    self.target_actor.predict(new_state))

        # critic target y = r + gamma * Q'(s', mu'(s')); 'done' was stored as 1 - done,
        # so terminal transitions drop the bootstrap term
        target = []
        for j in range(self.batch_size):
            target.append(reward[j]+self.gamma*critic_value[j]*done[j])
        target = np.reshape(target,(self.batch_size,1))

        self.critic.train(state,action,target)

        # actor update: feed dQ/da (evaluated at a = mu(s)) into the policy gradient
        a_outs = self.actor.predict(state)
        grads = self.critic.get_action_gradients(state,a_outs)
        self.actor.train(state,grads[0])

        self.update_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

# Test script: uses gym and the classes defined above

from ddpg_tf_orig import Agent   # the classes above are saved in ddpg_tf_orig.py
import numpy as np 
import gym


if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    obs = env.reset()
    agent = Agent(alpha=0.0001,beta=0.001,input_dims=[3],tau=0.001,env=env,
                batch_size=64,layer1_size=400,layer2_size=300,n_actions=1)

    score_history = []
    np.random.seed(0)
    for i in range(1000):
        obs = env.reset()
        env.render()   # draws only one frame per episode here; move it inside the while loop to watch full rollouts
        done = False
        score = 0
        while not done:
            act = agent.choose_action(obs)
            new_state,reward,done,info = env.step(act)
            agent.remember(obs,act,reward,new_state,int(done))
            agent.learn()
            score += reward
            obs = new_state
        score_history.append(score)
        print('episode',i,'score % .2f' % score,
              '100 game average % .2f' % np.mean(score_history[-100:]))
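
To see whether training converges, it is convenient to plot score_history after the loop. A minimal sketch (not part of the original script; it assumes matplotlib is available, and the output file name is arbitrary):

# illustrative sketch: plot the per-episode return and its 100-episode moving average
import matplotlib.pyplot as plt

running_avg = [np.mean(score_history[max(0, j-99):j+1]) for j in range(len(score_history))]
plt.plot(score_history, alpha=0.4, label='episode score')
plt.plot(running_avg, label='100-episode average')
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.savefig('pendulum_ddpg_scores.png')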

That concludes the DDPG reproduction.
