[Reinforcement Learning] A Curiosity-Driven A3C Agent in OpenAI Gym
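
The code below is a standard A3C setup (a shared encoder with an actor head and a critic head) extended with a forward-model "predictor" head that maps (state, action) to a predicted next state. The squared prediction error is added to the environment reward as an intrinsic "curiosity" bonus, which pushes the agent toward states its model cannot yet predict. A minimal numpy sketch of that bonus, with purely illustrative values (the real computation lives in worker.work below):

import numpy as np

def curiosity_bonus(predicted_next_state, actual_next_state):
    # intrinsic reward = squared error between the forward model's prediction
    # and the next state actually returned by the environment
    predicted_next_state = np.asarray(predicted_next_state, dtype=np.float32)
    actual_next_state = np.asarray(actual_next_state, dtype=np.float32)
    return float(np.sum(np.square(predicted_next_state - actual_next_state)))

print(curiosity_bonus([0.02, 0.01, 0.03, 0.00], [0.02, 0.15, 0.03, 0.20]))  # roughly 0.0596

The full network and worker implementation (A3C_network.py in the repo) follows, with the training entry point after it.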

import tensorflow as tf
import pandas as pd
import numpy as np
from config.config import GLOBAL_NET_SCOPE,ENTROPY_BETA,MAX__EPISODE,MAX_STEP_IN_EPISODE,UPDATE_ITER
from config.config import plus_global_episode_count,reset_global_episode_count,get_global_episode_count
from config.env_setup import take_action,reset_env,load_gym_env,show_or_not


class ACNetwork(object):

    def __init__(self,scope,global_ACNet = None,session = None,
                 optimizer_list = None,n_s = None ,n_a =None):
        # the global network only needs the input placeholders and the basic feed-forward graph; it is never trained directly
        self.sess = session
        self.n_s = n_s
        self.n_a = n_a

        if scope == GLOBAL_NET_SCOPE:
            with tf.variable_scope(scope):
                self.action = tf.placeholder(tf.int32 ,[None,],'Action_input')
                self.state = tf.placeholder(tf.float32 ,[None,self.n_s] ,'State_input' )
                _,_, self.global_public_params, self.global_actor_params,\
                self.global_critic_params,self.global_predictor_params,_= self._build_net(scope)
        else:
            if optimizer_list: # optimizers for the actor, the critic, and the shared layer
                self.OPT_Actor  = optimizer_list[0]
                self.OPT_Critic = optimizer_list[1]
                self.OPT_Shared = optimizer_list[2]
            else:
                raise ValueError("optimizer_list must be provided for a local (worker) network")
            with tf.variable_scope(scope):
                # the local network is trained, so it needs the following inputs
                self.state = tf.placeholder(tf.float32 ,[None,self.n_s] ,'State_input')
                self.action = tf.placeholder(tf.int32 ,[None,],'Action_input')
                self.v_target = tf.placeholder(tf.float32,[None,1],'Value_target')
                self.state_next = tf.placeholder(tf.float32,[None,self.n_s],'State_next')

                self.action_pro,self.v_estimated,self.local_public_params,\
                self.local_actor_params,self.local_critic_params ,\
                self.local_predictor_params,self.state_predicted = self._build_net(scope)

                # critic network loss: squared TD error
                with tf.name_scope('critic_loss'):
                    self.td_error = tf.subtract(self.v_target,self.v_estimated ,name = "td_error")
                    self.critic_loss = tf.reduce_mean(tf.square(self.td_error))
                    self.critic_grad = tf.gradients(self.critic_loss ,self.local_critic_params)
                # actor network loss
                with tf.name_scope('actor_loss'):
                    self.advantage = self.td_error
                    # probability of the action actually taken: mask with a one-hot vector and sum along axis=1
                    prob = tf.reduce_sum( self.action_pro * tf.one_hot(self.action,self.n_a,dtype=tf.float32) , axis=1 , keepdims=True)
                    # if we wrote experience = prob * advantage directly, backprop through the actor loss
                    # would also push gradients into the critic's parameters via the advantage term,
                    # so the advantage is wrapped in tf.stop_gradient and treated as a constant here
                    experience = prob*tf.stop_gradient(self.advantage)
                    """
                    The actor update aims to increase p(action) * advantage. A common trick is to change this
                    objective to p(action) * advantage + entropy. The idea: maximizing the original objective alone
                    tends to lock the policy onto a fixed action and weakens exploration, whereas also increasing the
                    entropy of the action distribution keeps the agent exploring the environment more broadly.
                    """
                    entropy = -tf.reduce_sum(self.action_pro*tf.log(self.action_pro + 1e-5),axis=1,keepdims=True)  # epsilon avoids log(0)
                    # maximize  experience + ENTROPY_BETA * entropy  <=>  minimize its negative
                    self.actor_loss = -tf.reduce_mean(experience + ENTROPY_BETA*entropy)
                    #self.actor_loss = -tf.reduce_mean(experience)   # variant without the entropy bonus
                    self.actor_grad = tf.gradients(self.actor_loss,self.local_actor_params)
                with tf.name_scope('shared_loss'):
                    self.public_grads_from_actor = tf.gradients(self.actor_loss , self.local_public_params)
                    self.public_grads_from_critic = tf.gradients(self.critic_loss , self.local_public_params)
                with tf.name_scope('predictor_loss'):
                    self.predictor_loss = tf.losses.mean_squared_error(self.state_predicted,self.state_next)
                    self.predictor_grad = tf.gradients(self.predictor_loss,self.local_predictor_params)
                # each local network can also push its gradients to the global network and pull the latest global parameters back into itself
                with tf.name_scope('sync'): # sync with the global network
                    with tf.name_scope('pull'):
                        # x.assign(y) copies the value of y into x
                        self.pull_actor_params_op = [ local_params.assign(global_params) for local_params,global_params in zip(self.local_actor_params,global_ACNet.global_actor_params)]
                        self.pull_critic_params_op = [ local_params.assign(global_params) for local_params,global_params in zip(self.local_critic_params,global_ACNet.global_critic_params)]
                        self.pull_shared_params_op = [ local_params.assign(global_params) for local_params,global_params in zip(self.local_public_params,global_ACNet.global_public_params)]
                        self.pull_predictor_params_op = [ local_params.assign(global_params) for local_params,global_params in zip(self.local_predictor_params,global_ACNet.global_predictor_params)]
                    with tf.name_scope('push'):   # apply the local gradients to the global parameters
                        # OPT_Actor / OPT_Critic / OPT_Shared update the matching global parameter groups
                        # (the shared-layer optimizer is reused for the state predictor)
                        self.update_global_actor  = self.OPT_Actor.apply_gradients(zip(self.actor_grad ,global_ACNet.global_actor_params))
                        self.update_global_critic = self.OPT_Critic.apply_gradients(zip(self.critic_grad ,global_ACNet.global_critic_params))
                        self.update_global_shared_from_actor = self.OPT_Shared.apply_gradients(zip(self.public_grads_from_actor ,global_ACNet.global_public_params))
                        self.update_global_shared_from_critic = self.OPT_Shared.apply_gradients(zip(self.public_grads_from_critic ,global_ACNet.global_public_params))
                        self.update_global_predictor = self.OPT_Shared.apply_gradients(zip(self.predictor_grad ,global_ACNet.global_predictor_params))
    # builds the graph and returns: action distribution, state value, shared params, actor params, critic params, predictor params, predicted next state
    def _build_net(self,scope):
        general_initializer = tf.random_normal_initializer(0.0,0.1)
        with tf.variable_scope('Shared_layer'):
            state_encoder = tf.layers.dense(inputs=self.state,units=200,activation=tf.nn.relu,kernel_initializer=general_initializer,name = 'shared_dense')
        with tf.variable_scope('Actor_layer'):
            action_pro = tf.layers.dense(inputs=state_encoder,units=self.n_a,activation=tf.nn.softmax,kernel_initializer=general_initializer ,name = 'actor_dense')
        with tf.variable_scope('Critic_layer'):
            state_value = tf.layers.dense(inputs=state_encoder,units=1,kernel_initializer=general_initializer,name='critic_dense')
        with tf.variable_scope('State_predictor'):
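            # forward model ("predictor") head: concatenate the state with a one-hot encoding of the
            # taken action and regress the next state; its prediction error drives the curiosity reward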
            joint_state = tf.concat([self.state,tf.one_hot(self.action,self.n_a,dtype=tf.float32)],1)
            dense_layer = tf.layers.dense(inputs=joint_state ,units=300,activation=tf.nn.relu,kernel_initializer=general_initializer)
            state_predicted = tf.layers.dense(inputs=dense_layer ,units=self.n_s,kernel_initializer=general_initializer)
        public_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES , scope = scope+'/Shared_layer')
        actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES , scope = scope+'/Actor_layer')
        critic_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES , scope = scope +'/Critic_layer')
        predictor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES , scope = scope +'/State_predictor')
        return action_pro , state_value ,public_params ,actor_params ,critic_params ,predictor_params,state_predicted


    # apply the local network's gradients to the global network's parameters
    def update_global_params(self,feed_dict):
        self.sess.run([self.update_global_actor , self.update_global_critic ,
                       self.update_global_shared_from_actor,
                       self.update_global_shared_from_critic,
                       self.update_global_predictor],feed_dict)

    # fetch the global parameters and overwrite the local parameters with them
    def pull_params_from_global(self):
        self.sess.run([self.pull_shared_params_op,
                       self.pull_actor_params_op,
                       self.pull_critic_params_op,
                       self.pull_predictor_params_op])

    def make_prediction(self,feed_dict):
        state_predicted = self.sess.run([self.state_predicted],feed_dict)
        return state_predicted

    # sample an action from the policy's current action distribution
    def choose_action(self,s):
        action_prob = self.sess.run(self.action_pro , feed_dict= {self.state: s[np.newaxis, :]}  )

        # print(s)
        action = np.random.choice(range(action_prob.shape[1]),p=action_prob.ravel())
        return action

class worker(object):

    def __init__(self,name,global_ACNet,sess,optimizer_list,
                 coordinator,env = None,n_a = None,n_s = None,
                 whe_show = None):
        self.threading_coordinator = coordinator
        self.env = env
        self.name = name
        self.sess = sess
        self.AC_Net = ACNetwork(name,global_ACNet = global_ACNet,session = sess,optimizer_list = optimizer_list,n_a=n_a,n_s=n_s)
        show_or_not(env,whe_show)

    def compute_v_target(self,buffer_v_target,buffer_reward,v_next):  # buffer_v_target,buffer_reward,v_next
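        # n-step bootstrapped targets: walk the rewards backwards and apply
        # V_target(t) = r_t + gamma * V_target(t+1), seeded with the bootstrap value v_next
        # (v_next is 0 when the episode terminated)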
        gamma = 0.99
        for r in buffer_reward[::-1]:  # reverse buffer r
            v_next = r + gamma * v_next
            buffer_v_target.append(v_next)  # buffer_v_target ends up the same length as buffer_reward
        buffer_v_target.reverse()

        # print("---------")
        # print("buffer reward",buffer_reward)
        # print("v_next",v_next)
        # print('v_target',buffer_v_target)
        return buffer_v_target

    def work(self):
        global  GLOBAL_REWARD_SUM
        total_step = 0  # total number of environment steps taken by this worker
        buffer_state,buffer_action,buffer_reward,buffer_state_next = [],[],[],[]
        while not self.threading_coordinator.should_stop() and get_global_episode_count() <= MAX__EPISODE:
            state = reset_env(self.env)
            reward_in_episode = 0
            curiosity_reward_in_episode = 0
            step_in_episode = 0
            while True:
                action = self.AC_Net.choose_action(state)
                state_next,reward,done,info = take_action(self.env,state,action)
                feed_dict_of_predictor = {
                    self.AC_Net.action: np.array([action]),
                    self.AC_Net.state: np.array([state]),
                }
                try_prediction = self.AC_Net.make_prediction(feed_dict_of_predictor)
                #print("prediction------", try_prediction[0][0], '\n', "ground_truth----", state_next, '\n', "******************************")
                # intrinsic curiosity reward: squared error between the predicted next state
                # and the next state actually returned by the environment
                curiosity_reward = 0
                for i in range(self.AC_Net.n_s):
                    curiosity_reward = curiosity_reward + pow(try_prediction[0][0][i] - state_next[i], 2)
                reward = reward + curiosity_reward
                curiosity_reward_in_episode = curiosity_reward_in_episode + curiosity_reward
                step_in_episode = step_in_episode + 1
                reward_in_episode = reward_in_episode + reward
                buffer_action.append(action)
                buffer_reward.append(reward)
                buffer_state.append(state)
                buffer_state_next.append(state_next)
                if total_step%UPDATE_ITER == 0 or done:
                    # time to update: build the self.state / self.action / self.v_target batch to feed into the local network
                    # given rewards r_{t+1} ... r_{t+n}, compute V_target_{t+1} ... V_target_{t+n}
                    # (see the standalone sketch of this recursion after the worker class)
                    if done:
                        v_next = 0
                    else:
                        v_next = self.sess.run(self.AC_Net.v_estimated,{self.AC_Net.state:state_next[np.newaxis,:]})[0,0]
                    buffer_v_target = []
                    # print(buffer_reward)
                    buffer_v_target = self.compute_v_target(buffer_v_target,buffer_reward,v_next)
                    buffer_state,buffer_action,buffer_v_target,buffer_state_next = np.array(buffer_state),np.array(buffer_action),np.vstack(buffer_v_target),np.array(buffer_state_next)
                    feed_dict = {
                        self.AC_Net.state : buffer_state,
                        self.AC_Net.action : buffer_action,
                        self.AC_Net.v_target : buffer_v_target,
                        self.AC_Net.state_next : buffer_state_next
                    }
                    feed_dict_of_predictor = {
                        self.AC_Net.action: buffer_action,
                        self.AC_Net.state: buffer_state,
                    }
                    #state_predicted = self.AC_Net.make_prediction(feed_dict_of_predictor)
                    # print("-----------------------")
                    # print("state_predicted",state_predicted)
                    # print("buffer_state_next",buffer_state_next)
                    # print("------------------------")
                    # with the batch fed in, push the local gradients to update the *global* parameters
                    self.AC_Net.update_global_params(feed_dict)
                    # after the global network is updated, pull the new global parameters back into the local network
                    self.AC_Net.pull_params_from_global()
                    buffer_state, buffer_action, buffer_reward,buffer_state_next = [], [], [],[]
                    # print(buffer_reward)
                state = state_next
                plus_global_episode_count()
                total_step = total_step + 1
                #print(total_step)
                if done or step_in_episode>=MAX_STEP_IN_EPISODE:
                    # the step count is logged in place of the raw environment reward
                    print(
                        self.name,
                        " , Episode ", get_global_episode_count(),
                        "| env reward: %s" % step_in_episode,
                        "| curiosity reward: %s" % curiosity_reward_in_episode,
                        "| total reward: %s" % reward_in_episode
                    )
                    plus_global_episode_count()
                    step_in_episode = 0
                    curiosity_reward_in_episode = 0
                    break
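
To make the v_target computation concrete, here is a small standalone sketch of the same backward recursion with made-up numbers (purely illustrative; gamma matches the 0.99 hard-coded in compute_v_target):

import numpy as np

def discounted_targets(rewards, v_bootstrap, gamma=0.99):
    # same recursion as worker.compute_v_target: V_target(t) = r_t + gamma * V_target(t+1)
    targets = []
    running = v_bootstrap
    for r in reversed(rewards):
        running = r + gamma * running
        targets.append(running)
    targets.reverse()
    return np.vstack(targets)  # shape [len(rewards), 1], matching the v_target placeholder

print(discounted_targets([1.0, 1.0, 1.0], v_bootstrap=0.5))  # roughly [[3.455], [2.480], [1.495]]

The second file below is the training entry point: it builds the global network and the three RMSProp optimizers, then spawns N_WORKERS worker threads on CartPole-v0 and joins them with a tf.train.Coordinator.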





from cusiroty_driven_agent.A3C_network import ACNetwork,worker
from config.config import GLOBAL_NET_SCOPE,LR_Actor,LR_Critic,LR_Shared
import tensorflow as tf
import threading
from config.config import plus_global_episode_count,reset_global_episode_count,get_global_episode_count
from config.env_setup import load_gym_env,get_dim


if __name__ =="__main__":

    N_Action,N_State= get_dim('CartPole-v0')

    reset_global_episode_count()

    N_WORKERS = 10

    SESS = tf.Session()

    with tf.device("/cpu:0"):
        OPT_Actor = tf.train.RMSPropOptimizer(LR_Actor,name = "RMSPropActor")
        OPT_Critic = tf.train.RMSPropOptimizer(LR_Critic,name = "RMSPropCritic")
        OPT_Shared = tf.train.RMSPropOptimizer(LR_Shared,name = "RMSPropShared")

        OPT_LIST = [OPT_Actor,OPT_Critic,OPT_Shared]

        GLOBAL_ACNet = ACNetwork(session=SESS,scope=GLOBAL_NET_SCOPE,n_s=N_State,n_a=N_Action)

        COORD = tf.train.Coordinator()

        # create multiple workers in the computation graph
        workers = []
        for i in range(N_WORKERS):
            env = load_gym_env('CartPole-v0')
            worker_name = "No.%s_worker"%i
            if i == 5:  # only this worker shows its environment
                whe_show = True
            else:
                whe_show = False
            workers.append(worker(name=worker_name,global_ACNet=GLOBAL_ACNet,
                                  sess = SESS,optimizer_list=OPT_LIST
                                  ,coordinator=COORD,env = env,
                                  n_s=N_State,n_a=N_Action,
                                  whe_show=whe_show))

        SESS.run(tf.global_variables_initializer())

        workers_threads = []

        # create one thread per worker
        for w in workers:  # avoid shadowing the worker class
            t = threading.Thread(target=w.work)
            workers_threads.append(t)

        # start the threads
        for thread in workers_threads:
            print("Starting thread:", thread)
            thread.start()

        COORD.join(workers_threads)

        reset_global_episode_count()

        print("All worker threads finished!")

GitHub: https://github.com/wangyouzhuo/RL_demo/tree/master/cusiroty_driven_agent
