DDPG in TensorFlow 2.0

A TensorFlow 2.0 implementation of the DDPG algorithm.
For a detailed walkthrough of the algorithm, see the accompanying DDPG analysis post.
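
For quick reference, the core updates implemented in the code below are, in standard DDPG notation (a sketch added here for orientation, not part of the linked analysis):

    critic loss:     L(θ_Q) = E[ (r + γ · Q'(s', μ'(s')) − Q(s, a))² ]
    actor objective: maximise E[ Q(s, μ(s)) ]  (implemented as minimising −mean(Q))
    soft update:     θ_target ← τ · θ_pred + (1 − τ) · θ_target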

import tensorflow as tf
import numpy as np
import pandas as pd
import gym
from matplotlib import pyplot as plt
import os

# Random seed
SEED = 65535

ENV = gym.make('Pendulum-v1')
# Environment globals (this code targets the classic Gym API, pre-0.26); they may differ for other environments
action_dim = ENV.action_space.shape[0]
observation_dim = ENV.observation_space.shape[0]
action_span = ENV.action_space.high


class DDPG:
    def __init__(self,
                 n_features,
                 n_actions_dim,
                 gamma=0.9,
                 mode='train',
                 update_param_n=1,
                 actor_learning_rate=0.001,
                 critic_learning_rate=0.001,
                 soft_tau=0.1,
                 explore_span=3,
                 learning_rate=0.001,
                 experience_pool_size=1000,
                 batch_size=64,
                 ):
        # Random seed
        np.random.seed(SEED)
        tf.random.set_seed(SEED)
        # RL hyperparameters
        self.gamma = gamma
        self.explore_span = explore_span  # exploration noise scale (stddev of the Gaussian action noise); larger means more exploration
        # Network-related settings
        self.n_features = n_features  # dimensionality of the state features
        self.n_actions_dim = n_actions_dim  # dimensionality of the action
        self.actor_learning_rate = actor_learning_rate  # separate optimizers/learning rates could be set per network; these two parameters are not used here
        self.critic_learning_rate = critic_learning_rate
        self.learning_rate = learning_rate
        self.soft_tau = soft_tau  # soft-update ratio applied to the new (prediction-network) weights
        self.learn_time = 0  # number of learning steps taken
        self.update_param_n = update_param_n  # update the target networks every n learning steps
        self.mode = mode  # mode: 'train' or 'test'
        # Build the four networks and synchronise their initial parameters
        self.critic_pred_model = self.critic_init(critic_trainable=True, name='pred')
        self.critic_target_model = self.critic_init(critic_trainable=False, name='target')
        self.critic_param_replace()
        self.actor_pred_model = self.actor_init(actor_trainable=True, name='pred')
        self.actor_target_model = self.actor_init(actor_trainable=False, name='target')
        self.actor_param_replace()
        self.opt = tf.keras.optimizers.Adam(self.learning_rate)
        # Replay-buffer parameters
        self.experience_pool_size = experience_pool_size  # replay-buffer capacity
        self.experience_length = self.n_features * 2 + self.n_actions_dim + 1 + 1  # length of one experience: s, a, r, s_, done
        self.experience_pool_is_full = False
        self.experience_pool_can_learn = False
        self.experience_pool = pd.DataFrame(np.zeros([self.experience_pool_size, self.experience_length]))  # create the replay buffer
        self.experience_pool_index = 0  # current write index into the replay buffer
        self.batch_size = batch_size  # batch size

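    # Replay-buffer row layout (one experience per DataFrame row):
    # | s (n_features) | a (n_actions_dim) | r (1) | s_ (n_features) | done (1) |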
    def experience_pool_store(self, s, a, r, s_, done):
        """
        存储经验
        :param s: 状态
        :param a: 动作
        :param r: 回报
        :param s_: 下一个状态
        :param done: 是否完成游戏
        :return:
        """
        experience = []
        for i in range(self.experience_length):
            if i < self.n_features:
                experience.append(s[i])
            elif self.n_features <= i < self.n_features + self.n_actions_dim:
                experience.append(a[i - self.n_features])
            elif self.n_features + self.n_actions_dim <= i < self.n_features + self.n_actions_dim + 1:
                experience.append(r)
            elif self.n_features + self.n_actions_dim + 1 <= i < self.n_features * 2 + self.n_actions_dim + 1:
                experience.append(s_[i - self.n_features - self.n_actions_dim - 1])
            else:
                experience.append(done)
        self.experience_pool.loc[self.experience_pool_index] = experience
        self.experience_pool_index += 1
        # Check whether learning can start and whether the buffer is full
        if self.experience_pool_index >= self.batch_size:
            self.experience_pool_can_learn = True
        if self.experience_pool_index == self.experience_pool_size:
            self.experience_pool_is_full = True
            self.experience_pool_index = 0

    def critic_init(self, critic_trainable, name):
        """
        critic 网络定义,s,a 输入,Q(s,a)输出。这里与AC网络不同,AC网络输出的是V(s)
        :param name: 网络名称
        :param critic_trainable: 是否可以被训练,target网络是不能被训练的,设置False,预测网络设置为True
        :return: critic 网络模型
        """
        # 多输入网络的定义法
        input_s = tf.keras.Input(shape=(self.n_features,))
        input_a = tf.keras.Input(shape=(self.n_actions_dim,))
        inputs = tf.concat([input_s, input_a], axis=-1)
        dense1 = tf.keras.layers.Dense(32, activation='relu')(inputs)
        out_put = tf.keras.layers.Dense(1)(dense1)
        critic_model = tf.keras.Model(inputs=[input_s, input_a],
                                      outputs=out_put,
                                      trainable=critic_trainable,
                                      name='critic_' + name)
        return critic_model

    def actor_init(self, actor_trainable, name):
        """
        actor 网络定义,输入s,输出动作a
        :param name: 网络名称
        :param actor_trainable: 是否可以被训练,target网络是不能被训练的,设置False,预测网络设置为True
        :return: actor 网络模型
        """
        # 多输入网络的定义法
        input_s = tf.keras.Input(shape=(self.n_features,))
        dense1 = tf.keras.layers.Dense(32, activation='relu')(input_s)
        # tanh squashes the output to [-1, 1]; it then has to be scaled to the action range.
        out_put = tf.keras.layers.Dense(self.n_actions_dim, activation='tanh')(dense1)
        out_put = tf.keras.layers.Lambda(lambda x: x * np.array(action_span))(out_put)
        actor_model = tf.keras.Model(inputs=input_s, outputs=out_put, trainable=actor_trainable, name='actor_' + name)
        return actor_model

    def choose_action(self, s):
        s = s.reshape(1, self.n_features)
        a = self.actor_pred_model.predict(np.array(s))
        if self.mode == 'train':
            # During evaluation this noise can be dropped and a[0] returned directly; it is added here during training so the agent explores the environment sufficiently
            action = np.clip(np.random.normal(a[0], self.explore_span), -action_span, action_span)
            return action
        elif self.mode == 'test':
            return a[0]

    def DDPG_learn(self):
        """
        在这里进行两个网络的更新
        :return:
        """
        if not self.experience_pool_can_learn:
            return
        elif not self.experience_pool_is_full:
            data_pool = self.experience_pool.loc[:self.experience_pool_index - 1, :].sample(self.batch_size)
        else:
            data_pool = self.experience_pool.sample(self.batch_size)
        exp_s = np.array(data_pool.loc[:, :self.n_features - 1])
        exp_a = np.array(data_pool.loc[:, self.n_features:self.n_features + self.n_actions_dim - 1]).reshape(
            self.batch_size, self.n_actions_dim)
        exp_r = np.array(data_pool.loc[:, self.n_features + self.n_actions_dim]).reshape(self.batch_size, 1)
        exp_s_ = np.array(
            data_pool.loc[:, self.n_features + self.n_actions_dim + 1:self.n_features * 2 + self.n_actions_dim])
        # done is stored but not used in the update below
        exp_done = np.array(data_pool.loc[:, self.n_features * 2 + self.n_actions_dim + 1]).reshape(self.batch_size, 1)
        # Actor update: maximise Q(s, actor(s)), i.e. minimise its negative mean
        with tf.GradientTape() as tape:
            a = self.actor_pred_model(exp_s)
            Q_pred = self.critic_pred_model([exp_s, a])
            loss_actor = -tf.reduce_mean(Q_pred)  # negated so that gradient descent maximises Q
        actor_gradients = tape.gradient(loss_actor, self.actor_pred_model.trainable_variables)
        self.opt.apply_gradients(zip(actor_gradients, self.actor_pred_model.trainable_variables))
        # Critic update: move Q(s, a) towards the TD target r + gamma * Q'(s_, actor'(s_))
        with tf.GradientTape() as tape:
            a_ = self.actor_target_model(exp_s_)
            Q_pred_critic = self.critic_pred_model([exp_s, exp_a])
            Q_target_critic = exp_r + self.gamma * self.critic_target_model([exp_s_, a_])
            loss_critic = tf.reduce_mean(tf.keras.losses.mse(Q_target_critic, Q_pred_critic))
        critic_gradients = tape.gradient(loss_critic, self.critic_pred_model.trainable_variables)
        self.opt.apply_gradients(zip(critic_gradients, self.critic_pred_model.trainable_variables))
        self.learn_time += 1
        # Soft-update the target-network parameters every update_param_n learning steps
        if self.learn_time == self.update_param_n:
            self.soft_param_update(self.critic_target_model, self.critic_pred_model)
            self.soft_param_update(self.actor_target_model, self.actor_pred_model)
            self.learn_time = 0

    def soft_param_update(self, target_model, pred_model):
        """
        采用软更新的方式进行参数的更新,不采用DQN中的直接赋值操作,也可以采用别的软更新方式来实现。
        :param pred_model: 预测网络
        :param target_model: 目标网络
        """
        param_target = target_model.get_weights()
        param_pred = pred_model.get_weights()
        for i in range(len(param_target)):
            param_target[i] = param_target[i] * (1 - self.soft_tau)
            param_pred[i] = param_pred[i] * self.soft_tau
        param = np.add(param_pred, param_target)
        target_model.set_weights(param)

    def critic_param_replace(self):
        """
        替换critic网络的参数
        """
        self.critic_target_model.set_weights(self.critic_pred_model.get_weights())

    def actor_param_replace(self):
        """
        替换actor网络的参数
        """
        self.actor_target_model.set_weights(self.actor_pred_model.get_weights())

    def save_model(self, episode):
        """
        save trained weights
        :return: None
        """
        if not os.path.exists('model'):
            os.makedirs('model')
        self.actor_pred_model.save(f'model/ddpg_actor_pred_model_{episode}_episode.h5')
        self.actor_target_model.save(f'model/ddpg_actor_target_model_{episode}_episode.h5')
        self.critic_pred_model.save(f'model/ddpg_critic_pred_model_{episode}_episode.h5')
        self.critic_target_model.save(f'model/ddpg_critic_target_model_{episode}_episode.h5')

    def load_model(self, episode):
        """
        load trained weights
        :return: None
        """
        self.actor_pred_model = tf.keras.models.load_model(f'model/ddpg_actor_pred_model_{episode}_episode.h5')
        self.actor_target_model = tf.keras.models.load_model(f'model/ddpg_actor_target_model_{episode}_episode.h5')
        self.critic_pred_model = tf.keras.models.load_model(f'model/ddpg_critic_pred_model_{episode}_episode.h5')
        self.critic_target_model = tf.keras.models.load_model(f'model/ddpg_critic_target_model_{episode}_episode.h5')


def DDPG_train(episode=300):
    DDPG_agent = DDPG(n_features=observation_dim,
                      n_actions_dim=action_dim,
                      batch_size=64,
                      mode='train',
                      experience_pool_size=640)
    ENV.seed(SEED)
    score = []
    if not os.path.exists('img'):
        os.makedirs('img')
    for i_episode in range(episode):
        # Reset the environment
        observation = ENV.reset()
        score_one_episode = 0
        for t in range(500):
            # Render the environment
            ENV.render()
            # Choose an action
            action = DDPG_agent.choose_action(observation)
            observation_, reward, done, info = ENV.step(action)
            # Store the experience
            DDPG_agent.experience_pool_store(s=observation, a=action, r=reward, s_=observation_, done=done)
            # Learn; the overall flow is similar to DQN
            DDPG_agent.DDPG_learn()
            observation = observation_
            score_one_episode += reward
            if done:
                score.append(score_one_episode)
                print(f"the game is finished,episode is {i_episode}, the score is {score_one_episode}")
                break
        if (i_episode + 1) % 100 == 0:
            plt.plot(score)  # plot the score curve
            DDPG_agent.explore_span = DDPG_agent.explore_span / 2  # decay the exploration noise
            # plt.draw()
            DDPG_agent.save_model(i_episode + 1)
            plt.savefig(
                f"img/DDPG_score_train_episode_{i_episode + 1}.png")


def DDPG_test(episode=300):
    DDPG_agent = DDPG(n_features=observation_dim,
                      n_actions_dim=action_dim,
                      batch_size=64,
                      mode='test',
                      experience_pool_size=640)
    DDPG_agent.load_model(episode=300)
    ENV.seed(SEED)
    score = []
    for i_episode in range(episode):
        # Reset the environment
        observation = ENV.reset()
        score_one_episode = 0
        for t in range(500):
            # Render the environment
            ENV.render()
            action = DDPG_agent.choose_action(observation)
            observation_, reward, done, info = ENV.step(action)
            observation = observation_
            score_one_episode += reward
            if done:
                score.append(score_one_episode)
                print(f"the game is finished,episode is {i_episode}, the score is {score_one_episode}")
                break
        if (i_episode + 1) % 100 == 0:
            plt.plot(score)  # plot the score curve
            # plt.draw()
            plt.savefig(
                f"img/DDPG_score_test_{i_episode + 1}.png")


if __name__ == '__main__':
    DDPG_train(episode=300)
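    # To evaluate instead of train, DDPG_test() defined above could be called here;
    # it loads the checkpoint saved after 300 training episodes and runs the policy without exploration noise.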

The training results are shown below:
[Figure: training score curve]
