I worked through the DQN maze example from MorvanZhou's Python reinforcement learning tutorials
and rewrote the agent myself in PyTorch.
1. Main function
"""
主函数
"""
from maze_env import Maze
from RL_Brain import DQNAgent
def run_maze():
step = 0
for episode in range(300):
observation = env.reset()
while True:
env.render()
action = RL.choose_action(observation)
observation_, reward, done = env.step(action)
RL.store_transition(observation, action, reward, observation_)
if (step > 200) and (step % 5 == 0):
RL.learn()
observation = observation_
step += 1
if done:
break
print('game over')
env.destroy()
if __name__ == '__main__':
env = Maze()
RL = DQNAgent(env.n_actions,env.n_features,
learning_rate = 0.01,
reward_decay = 0.9,
e_greedy = 0.9,
replace_target_iter = 200,
memory_size = 2000,
output_graph=True
)
env.after(100,run_maze)
env.mainloop()
RL.plot_cost()
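The same loop also runs without redrawing the window every step. Below is a small variant I find handy (my own sketch, not part of the tutorial): it reuses the exact Maze/DQNAgent interfaces above and keeps the 200-step warm-up and learn-every-5-steps schedule, but skips env.render(). Like run_maze, it relies on the global env and RL, so you would schedule it with env.after(100, run_maze_quiet) instead of run_maze.

def run_maze_quiet(episodes=300):
    """Same training loop as run_maze, but without per-step rendering."""
    step = 0
    for episode in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):   # same warm-up / update schedule as run_maze
                RL.learn()
            observation = observation_
            step += 1
    print('game over')
    env.destroy()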
2. Environment
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.n_features = 2
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
# hell2_center = origin + np.array([UNIT, UNIT * 2])
# self.hell2 = self.canvas.create_rectangle(
# hell2_center[0] - 15, hell2_center[1] - 15,
# hell2_center[0] + 15, hell2_center[1] + 15,
# fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.1)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
next_coords = self.canvas.coords(self.rect) # next state
# reward function
if next_coords == self.canvas.coords(self.oval):
reward = 1
done = True
elif next_coords in [self.canvas.coords(self.hell1)]:
reward = -1
done = True
else:
reward = 0
done = False
s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
return s_, reward, done
def render(self):
# time.sleep(0.01)
self.update()
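Before plugging in the agent, it can be worth driving the environment by itself with random actions to confirm that the observation, reward, and done signals behave as expected. A minimal check along those lines (my own sketch, assuming only the Maze class above and the same Tkinter after/mainloop pattern as the main script):

import numpy as np
from maze_env import Maze

if __name__ == '__main__':
    env = Maze()

    def random_rollout():
        observation = env.reset()
        done = False
        while not done:
            env.render()
            # actions follow the step() convention: 0=up, 1=down, 2=right, 3=left
            action = np.random.randint(0, env.n_actions)
            observation, reward, done = env.step(action)
            print(observation, reward, done)
        env.destroy()

    env.after(100, random_rollout)
    env.mainloop()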
3. RL_Brain
"""
agent代码
"""
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
np.random.seed(42)
torch.manual_seed(2)
class Network(nn.Module):
def __init__(self,n_features,n_actions,n_neuron=10):
super(Network, self).__init__()
self.net = nn.Sequential(
nn.Linear(in_features=n_features, out_features=n_neuron, bias=True),
nn.ReLU(),
nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True)
)
def forward(self,s):
s = s.float()
q = self.net(s)
return q
class DQNAgent:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.01,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=300,
memory_size=500,
batch_size = 32,
e_greedy_increment = None,
output_graph=True
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.e_greedy_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
self.learn_step_counter = 0
self.memory = pd.DataFrame(np.zeros((self.memory_size, self.n_features * 2 + 2)))
self.eval_net = Network(self.n_features,self.n_actions)
self.target_net = Network(self.n_features,self.n_actions)
self.loss_function = nn.MSELoss()
self.optimizer = torch.optim.Adam(self.eval_net.parameters(),lr=self.lr)
self.cost_his = []
def store_transition(self,s,a,r,s_):
# 检查对象是否包含对应的属性 没有 则创建
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
# 保证数据类型一致
transition = np.hstack((s,[r,a],s_))
# 覆盖旧的经验
index = self.memory_counter % self.memory_size
self.memory.iloc[index, :] = transition
self.memory_counter += 1
def choose_action(self,observation):
# 将一维向量转化为二维矩阵
observation = observation[np.newaxis,:]
if np.random.uniform() < self.epsilon:
s = torch.tensor(observation)
actions_value = self.eval_net(s)
action = [np.argmax(actions_value.detach().numpy())][0]
else:
action = np.random.randint(0,self.n_actions)
return action
def replace_target_params(self):
self.target_net.load_state_dict(self.eval_net.state_dict())
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0 :
self.replace_target_params()
print('\ntarget params replaced\n')
# 更清晰的写法(功能等效)
if self.memory_counter > self.memory_size:
batch_memory = self.memory.sample(self.batch_size)
else:
batch_memory = self.memory.iloc[:self.memory_counter].sample(
self.batch_size, replace=True
)
s = torch.tensor(batch_memory.iloc[:,:self.n_features].values)
s_ = torch.tensor(batch_memory.iloc[:,-self.n_features:].values)
q_eval = self.eval_net(s)
q_next = self.target_net(s_)
q_target = q_eval.clone()
batch_index = np.arange(self.batch_size,dtype=np.int32)
eval_act_index = batch_memory.iloc[:,self.n_features+1].values.astype(int)
reward = batch_memory.iloc[:,self.n_features].values
# 注意pandas和pytorch的value用法不同 前者是返回数组 后者返回最大值
q_target[batch_index, eval_act_index] = torch.tensor(reward).float() + self.gamma * q_next.max(dim= 1).values.float()
loss = self.loss_function(q_target,q_eval)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.cost_his.append(loss.detach().numpy())
self.epsilon = self.epsilon + self.e_greedy_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
def plot_cost(self):
plt.figure()
plt.plot(np.arange(len(self.cost_his)),self.cost_his)
plt.show()
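As a quick check that the agent class runs end to end, you can feed it synthetic transitions without any environment. A minimal smoke test (my own sketch; the shapes simply mirror the maze setup of n_features=2 and n_actions=4, and the random rewards are meaningless):

if __name__ == '__main__':
    agent = DQNAgent(n_actions=4, n_features=2, memory_size=200, batch_size=32)

    # fill the replay memory with random transitions
    for _ in range(300):
        s = np.random.uniform(-1, 1, size=2)
        a = np.random.randint(0, 4)
        r = np.random.choice([0, 0, 0, 1, -1])
        s_ = np.random.uniform(-1, 1, size=2)
        agent.store_transition(s, a, r, s_)

    # run a few learning steps and plot the loss curve
    for _ in range(100):
        agent.learn()
    agent.plot_cost()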