视频链接:Double-DQN与Dueling-Network
续 课程15:DQN完成捕食者游戏
博主参考 keras-rl强化学习示例,进行了修改,自己写对应方向强化学习代码的时候也可以参考keras库,比较简单。
同时强化学习游戏领域用的比较多,其他方向看起来都没太大成果,对于博主设置的捕食者游戏来说,利用keras进行深度强化学习,平均奖励在-100左右(吃到食物25,移动-1,碰到敌人-300)甚至不如Q learning利用q table效果好,可能是nb_steps=100000,步骤数太大加上奖励固定,碰到敌人惩罚太高,以至于没有学到很好的规律,对于强化学习来讲,奖励的设置非常重要。
下面是详细代码:
# %%
import numpy as np
import cv2
from PIL import Image
import time
import pickle
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
# %%
EPISODES = 30000 # 局数
SHOW_EVERY = 3000 # 定义每隔多少局展示一次图像
epsilon = 0.6
EPS_DECAY = 0.9998
DISCOUNT = 0.95
LEARNING_RATE = 0.1
# %%
# 智能体的类,有其 位置信息 和 动作函数
class Cube:
def __init__(self,size): # 随机初始化位置坐标
self.size = size
self.x = np.random.randint(0, self.size-1)
self.y = np.random.randint(0, self.size-1)
def __str__(self):
return f'{self.x},{self.y}'
def __sub__(self, other):
return (self.x-other.x,self.y- other.y)
def __eq__(self, other) -> bool:
return self.x == other.x and self.y ==other.y #两个变量是否重叠
def action(self,choise):
if choise == 0:
self.move(x=1,y=1)
elif choise == 1:
self.move(x=-1, y=1)
elif choise == 2:
self.move(x=1, y=-1)
elif choise == 3:
self.move(x=-1, y=-1)
elif choise == 4:
self.move(x=0, y=1)
elif choise == 5:
self.move(x=0, y=-1)
elif choise == 6:
self.move(x=1, y=0)
elif choise == 7:
self.move(x=-1, y=0)
elif choise == 8:
self.move(x=0, y=0)
def move(self,x=False, y=False):
if not x:
self.x += np.random.randint(-1, 2)
else:
self.x += x
if not y:
self.y += np.random.randint(-1, 2)
else:
self.y += y
if self.x< 0:
self.x = 0
if self.x> self.size -1:
self.x = self.size-1
if self.y< 0:
self.y = 0
if self.y> self.size -1:
self.y = self.size-1
# %%
class envCube():#环境类
SIZE = 10 # 游戏区域的大小
OBSERVATION_SPACE_VALUES = (SIZE,SIZE,3)
# OBSERVATION_SPACE_VALUES=(4,)
ACTION_SPACE_VALUES = 9
RETURN_IMAGE = True #True就返回图像
FOOD_REWARD = 25 # agent获得食物的奖励
ENEMY_PENALITY = 300 # 遇上对手的惩罚
MOVE_PENALITY = 1 # 每移动一步的惩罚
# 设定三个部分的颜色分别是蓝、绿、红
d = {1:(255,0,0), # blue
2:(0,255,0), # green
3:(0,0,255)} # red
PLAYER_N = 1
FOOD_N = 2
ENEMY_N = 3
def reset(self):
self.player = Cube(self.SIZE)
self.food = Cube(self.SIZE)
while self.food == self.player:
self.food = Cube(self.SIZE)
self.enemy = Cube(self.SIZE)
while self.enemy == self.player:
self.enemy = Cube(self.SIZE)
if self.RETURN_IMAGE:
observation = np.array(self.get_image())/255 #标准化的操作
else:
observation = (self.player-self.food)+(self.player-self.enemy)
self.episode_step = 0 #玩游戏的步骤
return observation
def step(self,action):
self.episode_step += 1
self.player.action(action)
self.food.move()
self.enemy.move()
if self.RETURN_IMAGE:
new_observation = np.array(self.get_image())
else:
new_observation = (self.player-self.food)+(self.player-self.enemy)
if self.player == self.food:
reward = self.FOOD_REWARD
elif self.player == self.enemy:
reward = -self.ENEMY_PENALITY
else:
reward = -self.MOVE_PENALITY
done = False
if self.player == self.food or self.player == self.enemy or self.episode_step >=200:
done = True #吃到食物,遇到敌人,超过步数,就完成训练
return new_observation,reward,done,{}
def render(self,mode='human'):#画面渲染函数
img = self.get_image()
img = img.resize((800,800))
cv2.imshow('',np.array(img))
cv2.waitKey(1)
def get_image(self):
env = np.zeros((self.SIZE,self.SIZE,3),dtype= np.uint8)
env[self.food.x][self.food.y] = self.d[self.FOOD_N]
env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
img = Image.fromarray(env,'RGB')
return img
def get_qtable(self,qtable_name=None):
if qtable_name is None: # 如果没有实现提供,就随机初始化一个Q表格
q_table = {}
for x1 in range(-self.SIZE+1,self.SIZE):#PLAYER-FOOD
for y1 in range(-self.SIZE + 1, self.SIZE):
for x2 in range(-self.SIZE + 1, self.SIZE):#PLAYER-ENEMY
for y2 in range(-self.SIZE + 1, self.SIZE):
q_table[(x1,y1,x2,y2)] = [np.random.uniform(-5,0) for i in range(self.ACTION_SPACE_VALUES)]
else: # 提供了,就使用提供的Q表格
with open(qtable_name,'rb') as f:
q_table= pickle.load(f)
return q_table
# %%
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten,Conv2D
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent #pip install keras.rl2 安装rl库
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import FileLogger,ModelIntervalCheckpoint
env = envCube()
# %%
def build_model(status,nb_actions):
model = Sequential()
model.add(Conv2D(32,(3,3),activation='relu',input_shape=(1,) + status))#卷积核3*3,stride 1*1
model.add(Conv2D(32,(3,3),activation='relu'))
model.add(Flatten(input_shape=(1,) + status)) #不是图像,直接进入平滑层,x1,x2,y1,y2 4个数值拉平
model.add(Dense(32,activation='relu')) #过滤器,两个连接层
model.add(Dense(32,activation='relu'))
model.add(Dense(nb_actions,activation='linear')) # 激活层,nb_actions 输出动作的数量,SIZE=9
return model
# %%
model = build_model(env.OBSERVATION_SPACE_VALUES,env.ACTION_SPACE_VALUES)
print(model.summary()) #输出模型的相关信息,平滑层4个输入
print(f'(通道数*(卷积3*卷积3)+1)*32=896')
# %%
def build_agent(model,nb_actions):
memory = SequentialMemory(limit=50000,window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model,nb_actions=nb_actions,memory=memory,nb_steps_warmup=1000,
enable_double_dqn=True,target_model_update=5000,policy=policy)
dqn.compile(Adam(lr=1e-3),metrics=['mae'])
return dqn
# %%
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
def build_duel_agent(model,nb_actions):
memory = SequentialMemory(limit=50000,window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
nb_steps=500000) #线性退火policy,大于epsilon选最大Q,小于选随机选择
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,#热身1000步
enable_dueling_network=True, dueling_type='avg', policy=policy)
dqn.compile(Adam(lr=1e-4),metrics=['mae'])
return dqn
# %%
dqn_duel = build_agent(model,env.ACTION_SPACE_VALUES) #构建duel_DQN Agent
# %%
checkpoint_weights_filename = './results/dqn_weights_{step}.h5f'
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)]
dqn_duel.fit(env, nb_steps=100000, visualize=False, verbose=1,callbacks=callbacks) #fit拟合,nb_steps训练的epochs
# %%
dqn = build_agent(model,env.ACTION_SPACE_VALUES) #构建DQN Agent
print(dqn)
# %%
dqn.fit(env, nb_steps=100000, visualize=False, verbose=1) #fit拟合,nb_steps训练的epochs
#visualize 显示画面,verbose是否详细显示日志,每10000次输出一个Interval,训练时间比较长
model.save('./results/model_DDQN_results.h5') #作者用的dqn.save_weights我无法保存到.h5f文件,所以换一种参数保存方式
# %%
dqn.save_weights(f'./results/dqn_weights.h5f',overwrite=True)
# %%
scores = dqn.test(env,nb_episodes=20,visualize=True)
print(np.mean(scores.history['episode_reward']))
# %%
del dqn,model #装载本地保存DQN权重参数,重新运行前面的环境,但是不训练即不执行dqn.fit
# %%
dqn.load_weights('./results/model_results.h5')
# %%
scores = dqn.test(env,nb_episodes=20,visualize=True)
print(np.mean(scores.history['episode_reward']))