AC TensorFlow 2.0

The AC (Actor-Critic) algorithm, implemented with TensorFlow 2.0.

import tensorflow as tf
import numpy as np
import gym
from matplotlib import pyplot as plt


def AC_run(AC_agent=None, episode=1000):
    # AC_agent is expected to be the module that defines the AC class below; build the agent from it
    AC_agent = AC_agent.AC(n_actions=2, n_features=4)
    # written against the classic gym API: env.reset() returns only the observation and env.step() returns 4 values
    env = gym.make('CartPole-v1')
    score = []
    for i_episode in range(episode):
        # reset the environment at the start of each episode
        observation = env.reset()
        done = False
        t = 0
        while not done:
            env.render()
            action = AC_agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            # reward shaping: penalize large pole angles so learning converges faster
            r2 = - abs(theta) * 5
            # update the critic first to obtain the TD error, then use that value to update the actor
            td_loss = AC_agent.critic_learn(s=observation, r=reward + r2, s_=observation_)
            AC_agent.actor_learn(s=observation, a=action, td_loss=td_loss)
            observation = observation_
            t += 1
        print("Episode finished after {} time steps".format(t + 1))
        score.append(t + 1)
        if (i_episode + 1) % 100 == 0:
            plt.plot(score)  # plot the episode scores
            # plt.draw()
            plt.savefig(f"RL_algorithm_package/img/AC_score_episode_{i_episode + 1}.png")


class AC:
    """
    The AC network is fairly hard to train to convergence; it may take several runs, and the result also depends heavily on the reward function.
    During training the agent may perform well partway through and then degrade sharply afterwards. Save the network parameters periodically and keep the best set (see the checkpointing sketch after the class).
    """
    def __init__(self, n_features, n_actions, gamma=0.9, learning_rate=0.01):
        self.gamma = gamma
        self.n_features = n_features
        self.n_actions = n_actions
        # build the actor and critic networks
        self.actor_model = self.actor_net_init()
        self.critic_model = self.critic_net_init()
        self.learning_rate = learning_rate
        # one Adam optimizer is shared by both networks; Adam creates separate slot variables per weight, so this works
        self.opt = tf.keras.optimizers.Adam(self.learning_rate)

    def actor_net_init(self):
        inputs = tf.keras.Input(shape=(self.n_features,))
        dense = tf.keras.layers.Dense(32, activation='relu')(inputs)
        out_put = tf.keras.layers.Dense(self.n_actions, activation='softmax')(dense)
        actor_model = tf.keras.Model(inputs, out_put)
        return actor_model

    def critic_net_init(self):
        inputs = tf.keras.Input(shape=(self.n_features,))
        dense = tf.keras.layers.Dense(32, activation='relu')(inputs)
        # note: this is not the same network as the actor - there is no softmax activation here, and the output dimension is 1, the estimated value of the state
        out_put = tf.keras.layers.Dense(1)(dense)
        critic_model = tf.keras.Model(inputs, out_put)
        return critic_model

    def choose_action(self, s):
        s = s.reshape(1, self.n_features)
        # the actor outputs a probability distribution over actions; sample from it
        action_value = self.actor_model.predict(s)
        a = np.random.choice(a=np.arange(self.n_actions), p=action_value[0])
        return a

    def critic_learn(self, s, r, s_):
        s = s.reshape(1, self.n_features)
        s_ = s_.reshape(1, self.n_features)
        with tf.GradientTape() as tape:
            # the critic update is similar to the network update in DQN
            V_s_ = self.critic_model(s_)
            # TD target; stop_gradient keeps the target fixed so gradients only flow through V(s)
            V_target = tf.stop_gradient(self.gamma * V_s_ + r)
            V_s = self.critic_model(s)
            # MSE between the TD target and V(s) is the critic loss
            loss = tf.losses.mse(V_target, V_s)
        gradients = tape.gradient(loss, self.critic_model.trainable_variables)
        self.opt.apply_gradients(zip(gradients, self.critic_model.trainable_variables))
        # return the TD error; the actor update uses it as the advantage estimate
        return V_target - V_s

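    # In short, the two updates fit together as:
    #   td_error    = r + gamma * V(s') - V(s)      (computed and returned by critic_learn above)
    #   critic loss = td_error ** 2                 (the mean squared error above)
    #   actor loss  = - td_error * log pi(a|s)      (computed in actor_learn below)
    # so the critic's TD error acts as the advantage estimate that weights the policy gradient.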
    def actor_learn(self, s, a, td_loss):
        s = s.reshape(1, self.n_features)
        with tf.GradientTape() as tape:
            action_p = self.actor_model(s)
            a_one_hot = tf.one_hot(a, self.n_actions)
            # log-probability of the action that was actually taken (small epsilon for numerical stability)
            log_action = tf.reduce_sum(a_one_hot * tf.math.log(action_p + 1e-8), axis=1)
            # negate the loss because we want to maximize the expected return; td_loss comes from the critic and is a constant here
            loss = - tf.reduce_mean(td_loss * log_action)
        gradients = tape.gradient(loss, self.actor_model.trainable_variables)
        self.opt.apply_gradients(zip(gradients, self.actor_model.trainable_variables))
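
For reference, the AC_agent parameter of AC_run is the module that defines the AC class, not an agent instance. A minimal sketch of an entry point under that assumption (passing the current module via sys.modules is just one convenient option):

if __name__ == "__main__":
    import sys
    # pass the current module itself, since AC_run calls AC_agent.AC(...) to build the agent
    AC_run(AC_agent=sys.modules[__name__], episode=1000)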

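As the class docstring notes, performance can collapse partway through training, so it is worth checkpointing the networks and keeping the best weights. A minimal sketch, assuming a writable directory RL_algorithm_package/ckpt/ (the path and the save_if_best helper are illustrative names):

best_score = 0

def save_if_best(agent, episode_score):
    # keep the weights of the best episode seen so far; both models are plain tf.keras Models
    global best_score
    if episode_score > best_score:
        best_score = episode_score
        agent.actor_model.save_weights("RL_algorithm_package/ckpt/actor_best.h5")
        agent.critic_model.save_weights("RL_algorithm_package/ckpt/critic_best.h5")

Calling save_if_best(AC_agent, t + 1) right after score.append(t + 1) in AC_run does the bookkeeping, and load_weights restores the saved parameters later.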