[Reinforcement Learning] Automating Atari game play (for reference only)

Problem description:

        Chapter 8 of the reference book, "Atari Games with Deep Q Network", no longer runs as printed because the libraries it depends on have changed too much between versions, so I modified the source code directly to get the program to execute. Owing to my limited ability, the code in this chapter merely runs end to end; it does not train a working agent.
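
Environment note: the adapted code below assumes TensorFlow 2.x driven through the tf.compat.v1 API, plus an older Gym release in which env.reset() returns only the observation and env.step() returns four values. A minimal setup sketch (the exact packages to pin are my assumption, not something given in the book):

pip install tensorflow
pip install "gym[atari]"      # a pre-0.26 Gym API is assumed; the Atari ROMs may need to be installed separately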

Code:

'''Building an agent to play Atari games'''
import numpy as np
import gym
import tensorflow as tf
from collections import deque, Counter

# preprocess_observation: crop and shrink the raw game screen and convert it to
# greyscale before it is fed to the network.

color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):

    # crop and downsample the 210x160 RGB frame to 88x80
    img = obs[1:176:2, ::2]

    # convert the image to greyscale
    img = img.mean(axis=2)

    # (the book additionally improves contrast here with: img[img == color] = 0)

    # scale the pixel values (the book's original scaling; note the result is not
    # strictly within [-1, +1])
    img = (img - 128) / 128 - 1

    return img.reshape(88, 80, 1)
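
# A quick sanity check of the preprocessing step (a hedged sketch, assuming the old
# Gym API in which env.reset() returns only the observation array):
#
#   frame = preprocess_observation(gym.make("MsPacman-v0").reset())
#   print(frame.shape)   # expected: (88, 80, 1)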

env = gym.make("MsPacman-v0")
n_outputs = env.action_space.n
# env.render()
# print(n_outputs)
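# For MsPacman-v0 the minimal action set gives n_outputs == 9 (NOOP plus eight joystick
# directions); the full Atari action set would be 18.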


# define a q_network function for building our Q network. The input to our Q network
# will be the game state X. The tf.compat.v1 graph APIs are used from here on, so eager
# execution is switched off before any placeholders or variable scopes are created.
tf.compat.v1.reset_default_graph()
tf.compat.v1.disable_eager_execution()

# NOTE: this is the stub that replaced the book's convolutional Q network so that the
# script runs on current library versions. It builds no usable layers and returns a
# constant output of 32.0, which is why the agent never actually learns.
def q_network(X, name_scope):

    initializer = tf.keras.initializers.VarianceScaling()

    with tf.compat.v1.variable_scope(name_scope) as scope:

        # these layers are only constructed; they are never applied to X
        flat = tf.keras.layers.Flatten()
        fc = tf.keras.layers.Dense(units=10, kernel_initializer=initializer)

        # collect the trainable variables created in this scope (empty for the stub)
        vars = {v.name[len(scope.name):]: v
                for v in tf.compat.v1.get_collection(
                    key=tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}

        # constant stand-in for the Q values
        output = 32.0
        return vars, output
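
# For reference, a hedged sketch (my port, not the author's code) of what the book's
# original convolutional Q network looks like when written against tf.compat.v1.layers.
# The layer sizes follow the book's chapter 8 listing; the function is defined here only
# for comparison and is not called anywhere below.
def q_network_conv(X, name_scope):
    initializer = tf.compat.v1.variance_scaling_initializer()
    with tf.compat.v1.variable_scope(name_scope) as scope:
        # three convolutional layers over the 88x80x1 preprocessed frame
        layer_1 = tf.compat.v1.layers.conv2d(X, filters=32, kernel_size=(8, 8), strides=4,
                                             padding='same', activation=tf.nn.relu,
                                             kernel_initializer=initializer)
        layer_2 = tf.compat.v1.layers.conv2d(layer_1, filters=64, kernel_size=(4, 4), strides=2,
                                             padding='same', activation=tf.nn.relu,
                                             kernel_initializer=initializer)
        layer_3 = tf.compat.v1.layers.conv2d(layer_2, filters=64, kernel_size=(3, 3), strides=1,
                                             padding='same', activation=tf.nn.relu,
                                             kernel_initializer=initializer)
        # flatten and map to one Q value per action
        flat = tf.compat.v1.layers.flatten(layer_3)
        fc = tf.compat.v1.layers.dense(flat, units=128, activation=tf.nn.relu,
                                       kernel_initializer=initializer)
        output = tf.compat.v1.layers.dense(fc, units=n_outputs,
                                           kernel_initializer=initializer)
        vars = {v.name[len(scope.name):]: v
                for v in tf.compat.v1.get_collection(
                    key=tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
        return vars, output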



# define an epsilon_greedy function: with probability epsilon a random action is
# returned, otherwise the greedy action passed in is kept. Epsilon decays linearly
# from eps_max to eps_min over eps_decay_steps environment steps.

epsilon = 0.5
eps_min = 0.005
eps_max = 1.0
eps_decay_steps = 500000

def epsilon_greedy(action, step):
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action
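
# Worked example of the decay schedule above:
#   step = 0       -> epsilon = 1.0           (pure exploration)
#   step = 250000  -> epsilon ~= 0.5025
#   step >= 500000 -> epsilon = eps_min = 0.005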
        
# store the agent's experience, i.e. (state, action, next_state, reward, done), in the
# experience replay buffer, and sample minibatches from it for training the network.

buffer_len = 20000
exp_buffer = deque(maxlen=buffer_len)


# sample a random minibatch of transitions from the replay buffer
def sample_memories(batch_size):
    perm_batch = np.random.permutation(len(exp_buffer))[:batch_size]
    mem = np.array(exp_buffer)[perm_batch]

    return mem[:, 0], mem[:, 1], mem[:, 2], mem[:, 3], mem[:, 4]
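
# Hedged usage sketch: once the buffer holds at least batch_size transitions,
#   o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(48)
# returns five arrays of length 48, one per field of the stored transition.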



# define the network hyperparameters
num_episodes = 800
batch_size = 48
input_shape = (None, 88, 80, 1)
learning_rate = 0.001
X_shape = (None, 88, 80, 1)
discount_factor = 0.97

global_step = 0
copy_steps = 100
steps_train = 4
start_steps = 2000
logdir = 'logs'

# define a placeholder for the input game state
X = tf.compat.v1.placeholder(tf.float32, shape=X_shape)

# define a boolean placeholder used to toggle training mode
in_training_mode = tf.compat.v1.placeholder(tf.bool)



# build the main and target Q networks (both use the stub q_network defined above)
mainQ, mainQ_outputs = q_network(X, "mainQ")
targetQ, targetQ_outputs = q_network(X, "targetQ")



# define a placeholder for the chosen action
X_action = tf.compat.v1.placeholder(tf.int32, shape=(None,))

# Q value of the chosen action: mask the network output with a one-hot of the action
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keepdims=True)

# ops that copy the target network parameters onto the main network (a no-op here,
# because the stub q_network creates no trainable variables)
copy_op = [tf.compat.v1.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)


# define a placeholder for the target (TD) value y
y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1))

# the loss is the mean squared difference between the target value and the predicted Q value
loss = tf.reduce_mean(tf.square(y - Q_action))
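
# For reference, the value that would be fed into y in the book's version is the
# one-step TD target
#   y = reward + discount_factor * max_a' Q_target(next_state, a') * (1 - done)
# so the loss above is the standard DQN objective E[(y - Q(state, action))^2].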

# we use adam optimizer for minimizing the loss
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
#training_op = optimizer.minimize(loss)
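# training_op stays commented out on purpose: with the stub q_network the loss does not
# depend on any trainable variables, so optimizer.minimize(loss) would fail. With a real
# Q network (e.g. the convolutional sketch above) it would simply be
#   training_op = optimizer.minimize(loss)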

init = tf.compat.v1.global_variables_initializer()


#loss_summary = tf.compat.v1.summary.scalar('LOSS', loss)
merge_summary = tf.compat.v1.summary.merge_all()

file_writer = tf.compat.v1.summary.FileWriter(logdir, tf.compat.v1.get_default_graph())



with tf.compat.v1.Session() as sess:
    init.run()
    
    # for each episode
    for i in range(num_episodes):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter() 
        episodic_loss = []

        # while the state is not the terminal state
        while not done:


            env.render()
        
            # get the preprocessed game screen
            obs = preprocess_observation(obs)

            # feed the game screen and get the Q values for each action; with the stub
            # network this is just the constant output, so the greedy action below is
            # always 0 and exploration comes entirely from the epsilon-greedy step
            #actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})
            actions = np.array([mainQ_outputs], dtype=np.float32)

            # get the action
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1

            # select the action using epsilon greedy policy
            action = epsilon_greedy(action, global_step)
            
            # now perform the action and move to the next state, next_obs, receive reward
            next_obs, reward, done, _ = env.step(action)

            # store this transition as an experience in the replay buffer
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])
            
            # after certain steps, we would train our Q network with samples from the experience replay buffer
            if global_step % steps_train == 0 and global_step > start_steps:

                # sample experience
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # states
                o_obs = [x for x in o_obs]

                # next states
                o_next_obs = [x for x in o_next_obs]

                # next Q values; with the stub network the eval call is replaced by its
                # constant output
                #next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})
                next_act = mainQ_outputs

                # target values; the book's TD target is kept commented out for reference,
                # the placeholder below only keeps the script running
                #y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done)
                y_batch = discount_factor * next_act

                # merge all summaries and write to the file (disabled along with training)
                #mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                mrg_summary = merge_summary
                #file_writer.add_summary(mrg_summary, global_step)

                # training is disabled as well, see the note next to training_op above
                #train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                #episodic_loss.append(train_loss)
            
            # at regular intervals, synchronise the main and target networks by running the copy op defined above
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                
            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
        
        print('Epoch', epoch, 'Reward', episodic_reward)
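
    # optional cleanup (not in the book's listing): flush the summary writer and close
    # the environment once all episodes have run
    file_writer.close()

env.close()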

Screenshot:

Reference:

《Hands-on Reinforcement Learning with Python. Master Reinforcement and Deep Reinforcement Learning using OpenAI Gym and TensorFlow》

