Playing CartPole with a Deep Q-Network (DQN) Based on Reinforcement Learning

1. Environment setup: this example uses gym version 0.26.2
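
A quick way to confirm the setup is a version check like the one below (a minimal sketch, assuming a plain pip environment; the classic_control extra is included because gym 0.26 needs pygame to render CartPole):

# Shell: install the pinned version together with the rendering dependency
pip install "gym[classic_control]==0.26.2"

# Python: verify the installed version
import gym
print(gym.__version__)  # expected: 0.26.2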

2. Writing the network code

# Import the required libraries
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random


# Define the DQN network
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        # Three fully connected layers
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


# Define the DQN agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # experience replay buffer
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = DQN(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        # Store a transition in the replay buffer
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Select an action with an epsilon-greedy policy
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            act_values = self.model(state)
        return int(torch.argmax(act_values).item())

    def replay(self, batch_size):
        # Sample a random minibatch from the replay buffer and learn from it
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_state = torch.FloatTensor(next_state).unsqueeze(0).to(self.device)
                with torch.no_grad():
                    # Bellman target: r + gamma * max_a' Q(s', a')
                    target = reward + self.gamma * self.model(next_state).max().item()
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            # Build the TD target vector without tracking gradients, so the loss
            # backpropagates only through the prediction, not through the target
            with torch.no_grad():
                target_f = self.model(state)
                target_f[0][action] = target
            self.optimizer.zero_grad()
            loss = nn.MSELoss()(self.model(state), target_f)
            loss.backward()
            self.optimizer.step()
        # Decay the exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        # Load the model weights (map_location lets a GPU-trained checkpoint load on CPU)
        self.model.load_state_dict(torch.load(name, map_location=self.device))

    def save(self, name):
        # Save the model weights
        torch.save(self.model.state_dict(), name)


# Training function
def train_dqn():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    episodes = 1000
    batch_size = 32

    for e in range(episodes):
        state, _ = env.reset()  # Reset the environment; in gym 0.26, reset() returns (observation, info)
        for time in range(500):
            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)  # gym 0.26: step() returns 5 values (see the API smoke test after the listing)
            done = terminated or truncated
            reward = reward if not terminated else -10  # extra penalty when the pole actually falls (not on mere truncation)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print(f"episode: {e}/{episodes}, score: {time}, epsilon: {agent.epsilon:.2}")
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        if e % 100 == 0:
            agent.save(f"cartpole-dqn-{e}.pth")  # Save the model every 100 episodes


# Play the game with a trained model
def play_cartpole():
    env = gym.make('CartPole-v1', render_mode='human')  # render_mode is required in gym 0.26 for a window to appear
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    agent.load("cartpole-dqn-900.pth")  # Load the trained model
    agent.epsilon = 0.0  # act greedily; otherwise epsilon is still 1.0 and the agent would act randomly

    for e in range(10):  # Play 10 episodes
        state, _ = env.reset()
        for time in range(500):
            env.render()
            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state
            if done:
                print(f"episode: {e}, score: {time}")
                break
    env.close()

if __name__ == '__main__':
    # To train the model, uncomment the following line
    # train_dqn()
    # To play with a trained model (requires a saved checkpoint, e.g. cartpole-dqn-900.pth), run:
    play_cartpole()
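
gym 0.26 changed the core API: reset() now returns (observation, info) and step() returns (observation, reward, terminated, truncated, info). Before launching a long training run, it can be worth running a short random-policy smoke test such as the sketch below. It only exercises the environment API, involves no learning, and the printed return gives a baseline (a random policy usually lasts only a few dozen steps) against which a trained agent in play_cartpole() can be judged.

import gym

env = gym.make('CartPole-v1')
state, info = env.reset(seed=0)         # reset() returns (observation, info)
total_reward, done = 0.0, False
while not done:
    action = env.action_space.sample()  # pick a random action
    state, reward, terminated, truncated, info = env.step(action)  # 5 return values
    total_reward += reward
    done = terminated or truncated      # an episode ends on either condition
print(f"random-policy episode return: {total_reward}")
env.close()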

For more details, see:

https://zhuanlan.zhihu.com/p/29283993
https://zhuanlan.zhihu.com/p/29213893
