Reinforcement Learning Notes: DQN

In the Q-learning algorithm, the action values are stored in a table (a matrix) that holds the value of every action in every state. This tabular approach only works when both the states and the actions of the environment are discrete; when the number of states or actions is very large it no longer scales, and we need function approximation to estimate the Q-values instead.
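
For contrast, here is a minimal sketch of the tabular case; the state/action counts and hyperparameters below are made up purely for illustration.

import numpy as np

# Hypothetical toy setting: 16 discrete states, 4 discrete actions.
N_STATES, N_ACTIONS = 16, 4
ALPHA, GAMMA = 0.1, 0.99

# The entire action-value function fits in one table.
Q = np.zeros((N_STATES, N_ACTIONS))

def q_learning_update(s, a, r, s_next, done):
    # Tabular Q-learning update:
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    bootstrap = 0.0 if done else Q[s_next].max()
    Q[s, a] += ALPHA * (r + GAMMA * bootstrap - Q[s, a])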

The DQN algorithm can be used to solve problems with continuous states and discrete actions:

the CartPole environment, for example, has a continuous state and discrete actions.
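
This can be checked directly from the environment's spaces (a quick sketch, using the gym version pinned at the end of this note):

import gym

env = gym.make("CartPole-v1")
print(env.observation_space)  # Box(..., (4,), float32): a 4-dimensional continuous state
print(env.action_space)       # Discrete(2): two discrete actions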

When every dimension of the state is continuous, a table can no longer enumerate the states, and the common remedy is function approximation. A neural network has strong expressive power, so we can use one to represent the function Q. If the actions are continuous (infinitely many), the network takes both the state and the action as input and outputs a scalar: the value of taking that action in that state. If the actions are discrete (finitely many), besides the approach used for continuous actions, we can also feed only the state into the network and have it output the Q-value of every action at once.
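
A minimal sketch of the two network shapes just described; state_dim, action_dim and the hidden width are hypothetical placeholder sizes.

import torch
import torch.nn as nn

state_dim, action_dim, hidden = 4, 2, 64  # hypothetical sizes

# Continuous (infinite) actions: Q(s, a) -> scalar; state and action are concatenated.
q_sa = nn.Sequential(
    nn.Linear(state_dim + action_dim, hidden),
    nn.ReLU(),
    nn.Linear(hidden, 1),
)

# Discrete (finite) actions: Q(s, .) -> one value per action; only the state goes in.
q_s = nn.Sequential(
    nn.Linear(state_dim, hidden),
    nn.ReLU(),
    nn.Linear(hidden, action_dim),
)

s = torch.randn(1, state_dim)
a = torch.randn(1, action_dim)
print(q_sa(torch.cat([s, a], dim=1)).shape)  # torch.Size([1, 1])
print(q_s(s).shape)                          # torch.Size([1, 2])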

Since DQN is an off-policy algorithm, we can use an ε-greedy policy to balance exploration and exploitation while collecting data, store the collected transitions, and reuse them in later training. DQN also relies on two very important components, experience replay and the target network, which help it achieve stable, strong performance.
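
Concretely, writing Q_\theta for the online network and Q_{\theta^-} for the target network, each sampled transition (s, a, r, done, s') is regressed toward the TD target that the training loop below computes:

y = r + \gamma \, (1 - \text{done}) \, \max_{a'} Q_{\theta^-}(s', a'), \qquad L(\theta) = \mathrm{SmoothL1}\big(Q_\theta(s, a),\, y\big)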

agent.py

import random

import numpy as np
import torch
import torch.nn as nn

class Replaymemory:
    def __init__(self,n_s,n_a): # state dimension and number of actions
        self.n_s = n_s
        self.n_a = n_a
        self.MEMORY_SIZE = 1000
        self.BATCH_SIZE = 64 # sample 64 transitions per mini-batch

        # pre-allocate the replay buffer arrays (contents are overwritten before being read)
        self.all_s = np.empty(shape=(self.MEMORY_SIZE,self.n_s),dtype=np.float32)
        self.all_a = np.empty(self.MEMORY_SIZE,dtype=np.uint8)
        self.all_r = np.empty(self.MEMORY_SIZE,dtype=np.float32)
        self.all_done = np.empty(self.MEMORY_SIZE,dtype=np.uint8)
        self.all_s_ = np.empty(shape=(self.MEMORY_SIZE,self.n_s),dtype=np.float32)
        self.t_memo = 0 # write pointer (wraps around at MEMORY_SIZE)
        self.t_max = 0  # number of valid entries stored so far

    def add_memo(self,s,a,r,done,s_):
        self.all_s[self.t_memo] = s
        self.all_a[self.t_memo] = a
        self.all_r[self.t_memo] = r
        self.all_done[self.t_memo] = done
        self.all_s_[self.t_memo] = s_
        self.t_max = max(self.t_max,self.t_memo+1) # number of filled slots, capped at MEMORY_SIZE
        self.t_memo = (self.t_memo + 1)%self.MEMORY_SIZE

    def sample(self):

        if self.t_max > self.BATCH_SIZE:
            idxes = random.sample(range(self.t_max),self.BATCH_SIZE) # draw BATCH_SIZE distinct indices from the filled part of the buffer (t_max caps at MEMORY_SIZE = 1000)
        else:
            idxes = range(0,self.t_max)

        batch_s = []
        batch_a = []
        batch_r = []
        batch_done = []
        batch_s_ = []

        for idx in idxes:
            batch_s.append(self.all_s[idx])
            batch_a.append(self.all_a[idx])
            batch_r.append(self.all_r[idx])
            batch_done.append(self.all_done[idx])
            batch_s_.append(self.all_s_[idx])

        batch_s_tensor = torch.as_tensor(np.asarray(batch_s),dtype=torch.float32)
        batch_a_tensor = torch.as_tensor(np.asarray(batch_a),dtype=torch.int64).unsqueeze(-1)
        batch_r_tensor = torch.as_tensor(np.asarray(batch_r),dtype=torch.float32).unsqueeze(-1)
        batch_done_tensor = torch.as_tensor(np.asarray(batch_done),dtype=torch.float32).unsqueeze(-1)
        batch_s__tensor = torch.as_tensor(np.asarray(batch_s_),dtype=torch.float32)

        return batch_s_tensor, batch_a_tensor, batch_r_tensor, batch_done_tensor, batch_s__tensor


class DQN(nn.Module):
    def __init__(self,n_input,n_output):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(in_features=n_input,out_features=88),
            nn.Tanh(),
            nn.Linear(in_features=88,out_features=n_output)
        )

    def forward(self,x): # forward pass
        return self.net(x)

    def act(self,obs):
        obs_tensor = torch.as_tensor(obs,dtype=torch.float32)
        q_value = self(obs_tensor.unsqueeze(0)) # add a batch dimension, shape (1, n_input)
        max_q_idx = torch.argmax(input=q_value)
        action = max_q_idx.detach().item()

        return action



class Agent:
    def __init__(self,n_input,n_output):
        self.n_input = n_input
        self.n_output = n_output

        self.GAMMA = 0.99 # discount factor
        self.learning_rate = 1e-3

        self.memo = Replaymemory(self.n_input,self.n_output) # experience replay buffer

        self.online_net = DQN(self.n_input,self.n_output) # online network, updated every training step
        self.target_net = DQN(self.n_input,self.n_output) # target network, synced periodically from the online network

        self.optimizer = torch.optim.Adam(self.online_net.parameters(),lr=self.learning_rate) # only the online network is optimized
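
A quick sanity check of the classes above (a minimal sketch; the 4-dimensional state and 2 actions are CartPole's, and agent.py is assumed to be on the import path):

import numpy as np
from agent import Agent

agent = Agent(n_input=4, n_output=2)
s = np.zeros(4, dtype=np.float32)

a = agent.online_net.act(s)               # greedy action, an int in {0, 1}
agent.memo.add_memo(s, a, 1.0, False, s)  # store one dummy transition
batch_s, batch_a, batch_r, batch_done, batch_s_ = agent.memo.sample()
print(batch_s.shape, batch_a.shape)       # torch.Size([1, 4]) torch.Size([1, 1])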

rl.py
import random

import gym
import numpy as np
import torch
from torch import nn
from agent import Agent

env = gym.make("CartPole-v1")
s = env.reset()

EPSILON_DECAY = 10000
EPSILON_START = 1.0
EPSILON_END = 0.02
TARGET_UPDATE_FREQUENCY = 10 # sync the target network every 10 episodes
n_episode = 5000 # number of episodes
n_time_step = 1000 # max steps per episode

n_state = len(s)
n_action = env.action_space.n

agent = Agent(n_input=n_state,n_output=n_action)

REWARD_BUFFER = np.empty(shape=n_episode) # pre-allocate storage for each episode's total reward
for episode_i in range(n_episode):
    episode_reward = 0
    for step_i in range(n_time_step):
        epsilon = np.interp(episode_i * n_time_step + step_i,[0,EPSILON_DECAY],[EPSILON_START,EPSILON_END])
        random_sample = random.random()

        if random_sample <= epsilon:
            a = env.action_space.sample()
        else:
            a = agent.online_net.act(s) # greedy action from the online network

        s_, r, done, info = env.step(a)
        agent.memo.add_memo(s,a,r,done,s_) # store the transition in the replay buffer
        s = s_ # the next state becomes the current state
        episode_reward += r # accumulate the episode reward

        if done:
            s = env.reset() # reset the environment
            REWARD_BUFFER[episode_i] = episode_reward
            break

        batch_s, batch_a, batch_r, batch_done, batch_s_ = agent.memo.sample() # sample a mini-batch from the replay buffer


        # compute TD targets from the target network (no gradients needed here)
        with torch.no_grad():
            target_q_values = agent.target_net(batch_s_) # Q-values of the next states
            max_target_q_values = target_q_values.max(dim=1,keepdim=True)[0] # max over next-state actions
            targets = batch_r + agent.GAMMA * (1-batch_done) * max_target_q_values # (1 - done) drops the bootstrap term at episode end

        # compute the Q-values of the actions actually taken
        q_values = agent.online_net(batch_s)
        a_q_values = torch.gather(input=q_values,dim=1,index=batch_a) # select Q(s,a) for the stored actions

        # compute the loss between predicted Q-values and TD targets
        loss = nn.functional.smooth_l1_loss(a_q_values,targets)

        # gradient descent step on the online network
        agent.optimizer.zero_grad()
        loss.backward()
        agent.optimizer.step()

    if episode_i % TARGET_UPDATE_FREQUENCY == 0: # sync target_net with online_net every TARGET_UPDATE_FREQUENCY episodes
        agent.target_net.load_state_dict(agent.online_net.state_dict())

        # show the training progress
        print("Episode:{}".format(episode_i))
        print("Avg. Reward:{}".format(np.mean(REWARD_BUFFER[:episode_i + 1])))

Dependency: gym==0.25.0 (the code uses the old Gym API, where env.reset() returns only the observation and env.step() returns four values).
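
If a newer gym (>= 0.26) or gymnasium is installed instead, the reset/step signatures changed; a minimal adaptation sketch (assuming the gymnasium package) looks like this:

import gymnasium as gym

env = gym.make("CartPole-v1")
s, info = env.reset()                              # reset() now also returns an info dict
a = env.action_space.sample()
s_, r, terminated, truncated, info = env.step(a)   # step() now returns five values
done = terminated or truncated                     # combine the two termination flags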
