Python Implementation of DQN

1 PyTorch

import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size, memory_size=100000, batch_size=32, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, lr=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.lr = lr
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = self.build_model().to(self.device)
        self.target_network = self.build_model().to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.update_target_network()  # start with the target network in sync with the Q-network

    def build_model(self):
        model = nn.Sequential(
            nn.Linear(self.state_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, self.action_size)
        )
        return model

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        state = torch.from_numpy(state).float().to(self.device)
        with torch.no_grad():
            q_values = self.q_network(state)
        return q_values.argmax().item()

    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states = torch.from_numpy(np.vstack([x[0] for x in minibatch])).float().to(self.device)
        actions = torch.from_numpy(np.array([x[1] for x in minibatch])).long().to(self.device)
        rewards = torch.from_numpy(np.array([x[2] for x in minibatch])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([x[3] for x in minibatch])).float().to(self.device)
        dones = torch.from_numpy(np.array([x[4] for x in minibatch]).astype(np.uint8)).float().to(self.device)
        q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_network(next_states).max(1)[0]
        expected_q_values = rewards + (1 - dones) * self.gamma * next_q_values
        loss = F.mse_loss(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)

    def save_model(self, filename):
        torch.save(self.q_network.state_dict(), filename)

    def load_model(self, filename):
        self.q_network.load_state_dict(torch.load(filename))

def train(env, agent, episodes, max_steps):
    scores = deque(maxlen=100)
    for i_episode in range(1, episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            score += reward
            agent.learn()
            if done:
                break
        scores.append(score)
        agent.update_epsilon()
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)), end="")
        if np.mean(scores) >= 200:
            print('\nEnvironment solved in {} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)))
            agent.save_model('dqn.pth')
            break
        if i_episode % 20 == 0:
            agent.update_target_network()

    return scores

env = gym.make('CartPole-v1')
agent = DQNAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.n)
scores = train(env, agent, episodes=1000, max_steps=1000)

In the code above, the train() function trains the agent using the given DQNAgent object and an OpenAI Gym environment. When training finishes, it returns the deque holding the scores of the most recent 100 episodes. During training, the target network is refreshed every fixed number of episodes (here every 20), and the exploration rate ε is decayed at the specified rate after each episode.
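With the defaults above (ε starting at 1.0, `epsilon_decay=0.995`, floor 0.01), the exploration rate reaches its floor after roughly 920 episodes; a quick arithmetic check:

```python
import math

# episodes until 1.0 * 0.995**n first drops to the 0.01 floor
n = math.ceil(math.log(0.01) / math.log(0.995))
print(n)  # 919
```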

This trains an agent on the CartPole-v1 environment for up to 1000 episodes, with at most 1000 steps per episode. If the agent's average score over the recent episodes reaches 200, training stops and the final Q-network weights are saved to a file named dqn.pth. Note that the script uses the classic Gym API (env.reset() returning only the observation and env.step() returning four values), i.e. gym versions before 0.26; with gymnasium or newer gym the reset/step calls need to be adapted.
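Below is a minimal sketch of reloading the saved weights and evaluating the trained agent, assuming the same environment and classic Gym API as in the script above:

```python
# Evaluation sketch: restore the saved weights and act greedily (no exploration).
eval_agent = DQNAgent(state_size=env.observation_space.shape[0],
                      action_size=env.action_space.n)
eval_agent.load_model('dqn.pth')
eval_agent.epsilon = 0.0  # disable epsilon-greedy exploration

for _ in range(5):
    state = env.reset()
    score, done = 0, False
    while not done:
        action = eval_agent.choose_action(state)
        state, reward, done, _ = env.step(action)
        score += reward
    print('evaluation score:', score)
```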

2 TensorFlow

import tensorflow as tf
import numpy as np
import random

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


Running example

import gym

# Define the environment and the agent
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Train the agent
batch_size = 32
num_episodes = 1000
for e in range(num_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    score = 0
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        score += reward
    print("episode: {}/{}, score: {}".format(e, num_episodes, score))
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)
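
The agent also defines save()/load() wrappers around Keras save_weights()/load_weights(); a minimal sketch of persisting the trained weights and restoring them into a fresh agent (the filename is just a placeholder):

```python
# Save the trained weights, then restore them into a new agent for greedy evaluation.
agent.save('dqn_cartpole.h5')  # placeholder filename (HDF5 weight file)

eval_agent = DQNAgent(state_size, action_size)
eval_agent.load('dqn_cartpole.h5')
eval_agent.epsilon = eval_agent.epsilon_min  # keep exploration at its minimum
```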

3 A CNN-based DQN for image observations (TensorFlow)

DQN is a deep reinforcement learning algorithm that uses a neural network to learn an optimal policy directly from game observations. Below is a walkthrough of a simple CNN-based DQN, suited to image inputs such as stacked Atari frames.

1. Import the required libraries

```python
import random

import numpy as np
import tensorflow as tf
```

2. Define the environment and DQN hyperparameters

```python
num_actions = 4
state_size = (84, 84, 4)   # 84x84 grayscale frames, 4 frames stacked
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 1000000
batch_size = 32
memory_size = 1000000
learning_rate = 0.00025
```

3. Define the Q-network

```python
def create_q_network():
    input_layer = tf.keras.layers.Input(shape=state_size)
    conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input_layer)
    conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(conv1)
    conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(conv2)
    flatten = tf.keras.layers.Flatten()(conv3)
    fc1 = tf.keras.layers.Dense(512, activation='relu')(flatten)
    output_layer = tf.keras.layers.Dense(num_actions)(fc1)
    model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
    return model
```

4. Define the experience replay buffer

```python
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        samples = zip(*random.sample(self.memory, batch_size))
        return map(lambda x: np.array(x), samples)
```

5. Define the training loop

```python
model = create_q_network()
memory = ReplayMemory(memory_size)

state = env.reset()
state = np.stack([state] * 4, axis=2)  # build the initial 4-frame stack

for step in range(num_steps):
    # epsilon-greedy action selection
    if np.random.rand() < epsilon:
        action = np.random.randint(num_actions)
    else:
        q_values = model.predict(np.expand_dims(state, axis=0), verbose=0)
        action = np.argmax(q_values)

    next_state, reward, done, _ = env.step(action)
    next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
    memory.push(state, action, reward, next_state, done)

    # start a new episode when the current one ends
    if done:
        state = env.reset()
        state = np.stack([state] * 4, axis=2)
    else:
        state = next_state

    if len(memory.memory) > batch_size:
        states, actions, rewards, next_states, dones = memory.sample(batch_size)
        targets = model.predict(states, verbose=0)
        q_next = np.max(model.predict(next_states, verbose=0), axis=1)
        targets[range(batch_size), actions] = rewards + (1 - dones) * gamma * q_next
        model.train_on_batch(states, targets)

    if epsilon > epsilon_min:
        epsilon -= (epsilon - epsilon_min) / epsilon_decay
```

This is a simple CNN-based DQN; for more detail, see dedicated deep reinforcement learning tutorials and reference implementations.
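
One gap in step 5: `env` and `num_steps` are never defined in the walkthrough. A minimal, hypothetical setup that matches the assumed 84×84 grayscale observations (the environment id, wrapper, and step budget are illustrative choices, not part of the original code):

```python
import gym

# Hypothetical environment setup: an Atari game preprocessed to 84x84 grayscale
# frames (requires the Atari dependencies, e.g. ale-py and the game ROMs).
env = gym.make('BreakoutNoFrameskip-v4')
env = gym.wrappers.AtariPreprocessing(env, screen_size=84, grayscale_obs=True)

num_steps = 1_000_000  # illustrative training budget in environment steps
```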
