Playing Atari Games with DQN

First, import the required libraries:

import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime

Initialize the game environment:

env = gym.make("MsPacman-v0")
n_outputs = env.action_space.n
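
As a quick sanity check (a minimal sketch, not part of the original code), you can print the sizes of the action and observation spaces; for MsPacman-v0 there are 9 discrete actions and the raw frames are RGB images of shape (210, 160, 3):

print(env.action_space.n)           # number of discrete actions
print(env.observation_space.shape)  # shape of the raw game frame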

Now, define a preprocess_observation function for preprocessing the input game screen. Here, we downsample the image and convert it to grayscale:

color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):

    # Crop and downsample the image
    img = obs[1:176:2, ::2]

    # Convert the image to grayscale
    img = img.mean(axis=2)

    # Improve the image contrast
    img[img==color] = 0

    # Normalize the image to the range -1 to +1
    img = (img - 128) / 128

    return img.reshape(88,80,1)
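
To verify the preprocessing (a minimal sketch, assuming the environment created above), run one raw frame through the function and inspect the result:

obs = env.reset()
print(obs.shape)                              # raw RGB frame, e.g. (210, 160, 3)
state = preprocess_observation(obs)
print(state.shape, state.min(), state.max())  # (88, 80, 1), values roughly in [-1, 1]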

Now, define a q_network function to build the Q network. The input to the Q network is the game state X. The network consists of three convolutional layers with same padding, followed by a fully connected layer:

tf.reset_default_graph()

def q_network(X, name_scope):
    
    # Initialize the weight initializer for all layers
    initializer = tf.contrib.layers.variance_scaling_initializer()

    with tf.variable_scope(name_scope) as scope: 

        # Build the three convolutional layers
        layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=initializer) 
        tf.summary.histogram('layer_1',layer_1)
        
        layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_2',layer_2)
        
        layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_3',layer_3)
        
        # Flatten the output of layer_3 before feeding it into the fully connected layer
        flat = flatten(layer_3)

        fc = fully_connected(flat, num_outputs=128, weights_initializer=initializer)
        tf.summary.histogram('fc',fc)
        
        output = fully_connected(fc, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram('output',output)
        

        # vars stores the network's parameters, i.e., its trainable weights
        vars = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)} 
        return vars, output
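
For reference (a quick shape calculation under the settings above), with same padding an 88×80 input becomes 22×20 after layer_1 (stride 4), 11×10 after layer_2 (stride 2), and stays 11×10 after layer_3 (stride 1), so flatten produces a vector of 11 × 10 × 64 = 7,040 features per state.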

Next, define an epsilon_greedy function that implements the epsilon-greedy policy. Under this policy, the best action is selected with probability 1-epsilon and a random action with probability epsilon.
Since we do not want to explore forever, we use a decaying epsilon-greedy policy, in which the value of epsilon decays over time, so that as training progresses the policy increasingly exploits the best action:

epsilon = 0.5
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 500000

def epsilon_greedy(action, step):
    p = np.random.random(1).squeeze()
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if p < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action
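
To get a feel for how the exploration rate decays (a small illustrative check, not part of the training code), the same schedule can be evaluated at a few step counts:

for step in [0, 100000, 250000, 500000, 1000000]:
    eps = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    print(step, round(eps, 3))      # 1.0, 0.81, 0.525, 0.05, 0.05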

Now, initialize an experience replay buffer of length 20,000, which holds the agent's experiences.

buffer_len = 20000
exp_buffer = deque(maxlen=buffer_len)

We store all of the agent's experiences (state, action, reward, and so on) in the experience replay buffer, and sample minibatches of experiences to train the network:

def sample_memories(batch_size):
    # Pick batch_size random indices from the buffer
    perm_batch = np.random.permutation(len(exp_buffer))[:batch_size]
    mem = np.array(exp_buffer)[perm_batch]
    # Columns: state, action, next state, reward, done
    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]
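
As a usage sketch (dummy data only, assuming the older NumPy that ships with TensorFlow 1.x, where np.array builds an object array from these mixed rows), each buffer entry is a list [state, action, next_state, reward, done], and sample_memories returns one column per field:

dummy_state = np.zeros((88, 80, 1))
for _ in range(100):
    exp_buffer.append([dummy_state, 0, dummy_state, 0.0, False])

o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(48)
print(len(o_obs), o_obs[0].shape)   # 48 (88, 80, 1)

exp_buffer.clear()                  # discard the dummy transitions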

Next, define all of the hyperparameters. Here, steps_train = 4 means the network is trained once every four steps, copy_steps controls how often the weights of one network are copied into the other, and start_steps is the number of initial steps used purely to collect experience before training begins:

num_episodes = 800
batch_size = 48
input_shape = (None, 88, 80, 1)
learning_rate = 0.001
X_shape = (None, 88, 80, 1)
discount_factor = 0.97

global_step = 0
copy_steps = 100
steps_train = 4
start_steps = 2000
logdir = 'logs'
tf.reset_default_graph()

Now, define the placeholder for the input, that is, the game state:

X = tf.placeholder(tf.float32, shape=X_shape)

Define a boolean placeholder, in_training_mode, to indicate whether the network is being trained:

in_training_mode = tf.placeholder(tf.bool)

Build the main Q network, which takes the input X and produces Q values for every action in that state:

mainQ, mainQ_outputs = q_network(X, 'mainQ')

In the same way, build the target Q network:

targetQ, targetQ_outputs = q_network(X, 'targetQ')

Define a placeholder for the actions taken, and compute Q_action, the Q value of each chosen action:

X_action = tf.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)
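
The tf.one_hot mask selects, for every sample in the batch, the Q value of the action that was actually taken. The same idea in plain NumPy (a standalone illustration, not part of the graph):

q = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])       # Q values for 2 states and 3 actions
a = np.array([2, 0])                  # actions taken in those states
mask = np.eye(3)[a]                   # one-hot rows: [[0,0,1],[1,0,0]]
print(np.sum(q * mask, axis=-1))      # [3. 4.] -> Q value of the chosen action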

Define an operation to copy the parameters of one Q network into the other. As written, copy_target_to_main assigns the weights of the network under the targetQ scope (the one being trained here) to the main Q network:

copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)

Define a placeholder for the output, that is, the target value y:

y = tf.placeholder(tf.float32, shape=(None,1))

Then compute the loss, which is the mean squared error between the target value and the predicted value:

loss = tf.reduce_mean(tf.square(y - Q_action))
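
In the training loop further below, y is filled with the Bellman target y = r + γ · max_a' Q(s', a') for non-terminal transitions (and simply y = r when done is true), so this loss is the mean squared error between the bootstrapped target and the predicted Q value of the chosen action.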

Use AdamOptimizer to minimize the loss:

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

Set up the log files for visualization in TensorBoard:

loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
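
Once some summaries have been written during training, they can be viewed by pointing TensorBoard at the same log directory (run from a shell):

tensorboard --logdir logs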

Now, start the TensorFlow session and run the model:

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    
    # For each episode
    for i in range(num_episodes):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []

        # While the state is not a terminal state
        while not done:

            #env.render()

            # Get the preprocessed game screen
            obs = preprocess_observation(obs)

            # Feed the game screen to the network and get the Q values for each action
            actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            # Get the action with the highest Q value
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1

            # Select an action using the epsilon-greedy policy
            action = epsilon_greedy(action, global_step)

            # Perform the action, move to the next state, next_obs, and receive a reward
            next_obs, reward, done, _ = env.step(action)

            # Store this transition as an experience in the replay buffer
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])
            
            # After a certain number of steps, train the Q network with samples from the replay buffer
            if global_step % steps_train == 0 and global_step > start_steps:

                # Sample a batch of experiences
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # States
                o_obs = [x for x in o_obs]

                # Next states
                o_next_obs = [x for x in o_next_obs]

                # Q values of the next states
                next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})

                # Target value: reward plus the discounted maximum Q value of the next state
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done)

                # Merge all summaries and write them to the log file
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                file_writer.add_summary(mrg_summary, global_step)

                # Now train the network and compute the loss
                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                episodic_loss.append(train_loss)

            # After a certain number of steps, copy the weights of the trained (targetQ-scope) network into the main Q network
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                
            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
        
        print('Epoch', epoch, 'Reward', episodic_reward)