Lesson 16: Solving the Predator Game with a Dueling Network

Video link: Double-DQN and Dueling-Network
Lesson 15: Solving the Predator Game with DQN
This post adapts the keras-rl reinforcement-learning examples. If you want to write RL code for your own problem, the keras-rl library is a fairly simple place to start.
Reinforcement learning is used mostly in games; other application areas have not shown results of comparable impact. For the predator game built here, deep reinforcement learning with keras-rl reaches an average reward of around -100 (eating the food: +25, each move: -1, hitting the enemy: -300), which is actually worse than tabular Q-learning with a Q-table. A likely reason is that nb_steps=100000 is large while the rewards are fixed and the -300 enemy penalty is very harsh, so the agent never learns a good policy. For reinforcement learning, the design of the reward function matters a great deal.
The full code follows:

# %%
import numpy as np
import cv2
from PIL import Image
import time
import pickle
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

# %%
EPISODES = 30000    # number of episodes (leftover Q-learning hyperparameters; the keras-rl agents below do not use them)
SHOW_EVERY = 3000   # render an episode every this many episodes
epsilon = 0.6
EPS_DECAY = 0.9998
DISCOUNT = 0.95
LEARNING_RATE = 0.1


# %%
# Agent/entity class: holds a grid position plus action and move methods
class Cube:
    def __init__(self, size):  # randomly initialize the position
        self.size = size
        self.x = np.random.randint(0, self.size-1)
        self.y = np.random.randint(0, self.size-1)
    def __str__(self):
        return f'{self.x},{self.y}'
    def __sub__(self, other):
        return (self.x - other.x, self.y - other.y)
    def __eq__(self, other) -> bool:
        return self.x == other.x and self.y == other.y  # do the two cubes occupy the same cell?
    def action(self, choice):
        # Map each of the 9 discrete actions to an (x, y) step:
        # the 4 diagonals, the 4 axis directions, and standing still.
        moves = [(1, 1), (-1, 1), (1, -1), (-1, -1),
                 (0, 1), (0, -1), (1, 0), (-1, 0), (0, 0)]
        dx, dy = moves[choice]
        self.move(x=dx, y=dy)
    def move(self, x=None, y=None):
        # Move by the given offset; if an axis is not specified, move randomly on it.
        # (Using None instead of False so that an explicit 0 means "stay still" on that axis.)
        if x is None:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x
        if y is None:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # Clamp the position to the grid boundaries.
        if self.x < 0:
            self.x = 0
        elif self.x > self.size - 1:
            self.x = self.size - 1
        if self.y < 0:
            self.y = 0
        elif self.y > self.size - 1:
            self.y = self.size - 1
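
To make the relative-coordinate observation concrete: the overloaded __sub__ returns the (dx, dy) offset between two cubes, which is what the non-image observation mode concatenates. A quick hypothetical check (positions are random, so the printed values will differ):

# %%
# Hypothetical sanity check of Cube arithmetic (not part of the original notebook).
player = Cube(10)
food = Cube(10)
print(player, food)     # e.g. "3,7 5,2"
print(player - food)    # the (dx, dy) offset, e.g. (-2, 5)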

# %%
class envCube:  # environment class
    SIZE = 10           # size of the game grid
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)
    # OBSERVATION_SPACE_VALUES = (4,)
    ACTION_SPACE_VALUES = 9
    RETURN_IMAGE = True  # if True, observations are images instead of relative coordinates

    FOOD_REWARD = 25      # reward for reaching the food
    ENEMY_PENALITY = 300  # penalty for running into the enemy
    MOVE_PENALITY = 1     # penalty for every move

    # Colors of the three entities: blue, green and red (cv2 displays arrays in BGR order)
    d = {1: (255, 0, 0),  # blue
         2: (0, 255, 0),  # green
         3: (0, 0, 255)}  # red
    PLAYER_N = 1
    FOOD_N = 2
    ENEMY_N = 3

    def reset(self):
        self.player = Cube(self.SIZE)
        self.food = Cube(self.SIZE)
        while self.food == self.player:
            self.food = Cube(self.SIZE)

        self.enemy = Cube(self.SIZE)
        while self.enemy == self.player:
            self.enemy = Cube(self.SIZE)
        if self.RETURN_IMAGE:
            observation = np.array(self.get_image())/255  # normalize pixel values to [0, 1]
        else:
            observation = (self.player-self.food)+(self.player-self.enemy)
        self.episode_step = 0  # step counter for the current episode
        return observation
    
    def step(self,action):
        self.episode_step += 1
        self.player.action(action)
        self.food.move()
        self.enemy.move()
        if self.RETURN_IMAGE:
            new_observation = np.array(self.get_image())/255  # normalize, consistent with reset()
        else:
            new_observation = (self.player-self.food)+(self.player-self.enemy)

        if self.player == self.food:
            reward = self.FOOD_REWARD
        elif self.player == self.enemy:
            reward = -self.ENEMY_PENALITY
        else:
            reward = -self.MOVE_PENALITY
        
        done = False
        if self.player == self.food or self.player == self.enemy or self.episode_step >= 200:
            done = True  # episode ends on eating the food, hitting the enemy, or exceeding 200 steps
        return new_observation, reward, done, {}
    
    def render(self, mode='human'):  # render the current frame with OpenCV
        img = self.get_image()
        img = img.resize((800,800))
        cv2.imshow('',np.array(img))
        cv2.waitKey(1)

    def get_image(self):
        env = np.zeros((self.SIZE,self.SIZE,3),dtype= np.uint8)
        env[self.food.x][self.food.y] = self.d[self.FOOD_N]
        env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
        env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
        img = Image.fromarray(env,'RGB')
        return img
    def get_qtable(self, qtable_name=None):
        # Only needed for the tabular Q-learning version of this game; the keras-rl agents below do not call it.
        if qtable_name is None:   # if no table is supplied, initialize a random Q-table
            q_table = {}
            for x1 in range(-self.SIZE+1, self.SIZE):              # player - food offsets
                for y1 in range(-self.SIZE + 1, self.SIZE):
                    for x2 in range(-self.SIZE + 1, self.SIZE):    # player - enemy offsets
                        for y2 in range(-self.SIZE + 1, self.SIZE):
                            q_table[(x1, y1, x2, y2)] = [np.random.uniform(-5, 0) for i in range(self.ACTION_SPACE_VALUES)]
        else:                     # otherwise load the provided Q-table
            with open(qtable_name, 'rb') as f:
                q_table = pickle.load(f)
        return q_table
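
Before wiring the environment into keras-rl, it can be sanity-checked with a short random rollout; the same spot is also convenient for experimenting with the reward constants discussed above (e.g. a milder enemy penalty). This is a hypothetical snippet, not part of the original notebook:

# %%
# Hypothetical smoke test: play one episode with a random policy.
test_env = envCube()
# test_env.ENEMY_PENALITY = 100   # optional experiment: soften the enemy penalty
obs = test_env.reset()
done, total_reward = False, 0
while not done:
    action = np.random.randint(0, test_env.ACTION_SPACE_VALUES)
    obs, reward, done, _ = test_env.step(action)
    total_reward += reward
print('random-policy episode reward:', total_reward, 'steps:', test_env.episode_step)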
    

# %%
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent  # pip install keras-rl2 to get the rl package
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import FileLogger,ModelIntervalCheckpoint
env = envCube()

# %%
def build_model(status, nb_actions):
    model = Sequential()
    # keras-rl prepends a window dimension (window_length=1), so reshape (1, H, W, C) back to (H, W, C)
    # before the convolution layers.
    model.add(Reshape(status, input_shape=(1,) + status))
    model.add(Conv2D(32, (3, 3), activation='relu'))  # 3x3 kernels, stride 1x1
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(Flatten())  # for the non-image observation (x1, y1, x2, y2), the conv layers would be dropped and Flatten applied directly
    model.add(Dense(32, activation='relu'))   # two fully connected layers
    model.add(Dense(32, activation='relu'))

    model.add(Dense(nb_actions, activation='linear'))  # linear output layer, one Q-value per action (nb_actions = 9)
    return model

# %%
model = build_model(env.OBSERVATION_SPACE_VALUES, env.ACTION_SPACE_VALUES)
print(model.summary())  # print the model architecture
# First Conv2D parameter count: (input channels * 3*3 kernel + 1 bias) * 32 filters = (3*9 + 1) * 32 = 896
print('(channels * (3*3) + 1) * 32 = 896')

# %%
def build_agent(model, nb_actions):
    # Double-DQN agent: Boltzmann exploration, target network synced every 5000 steps.
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model,nb_actions=nb_actions,memory=memory,nb_steps_warmup=1000,
                   enable_double_dqn=True,target_model_update=5000,policy=policy)
    dqn.compile(Adam(lr=1e-3),metrics=['mae'])
    return dqn
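
With enable_double_dqn=True, keras-rl selects the greedy next action with the online network and evaluates it with the target network, which reduces Q-value over-estimation. The following is only an illustrative numpy sketch of that target computation, not keras-rl's internal code; q_online_next and q_target_next are assumed to hold the two networks' Q-values for a batch of next states:

# %%
# Illustrative Double-DQN target computation (assumed inputs, not keras-rl internals).
def double_dqn_targets(rewards, dones, q_online_next, q_target_next, gamma=0.95):
    # The online network picks the greedy next action ...
    best_actions = np.argmax(q_online_next, axis=1)
    # ... and the target network evaluates that action.
    evaluated = q_target_next[np.arange(len(best_actions)), best_actions]
    # Standard bootstrapped target, zeroed out for terminal transitions.
    return rewards + gamma * (1.0 - dones) * evaluated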

# %%
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
def build_duel_agent(model, nb_actions):
    memory = SequentialMemory(limit=50000, window_length=1)
    # Linearly annealed epsilon-greedy policy: eps decays from 1.0 to 0.1 over nb_steps;
    # with probability eps a random action is taken, otherwise the action with the highest Q-value.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                                  nb_steps=500000)

    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,  # 1000 warm-up steps
                   enable_dueling_network=True, dueling_type='avg', policy=policy)
    dqn.compile(Adam(lr=1e-4), metrics=['mae'])
    return dqn
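
enable_dueling_network=True makes keras-rl rebuild the model's last layer into a dueling head internally. The sketch below is only an illustration of what dueling_type='avg' computes, namely Q(s,a) = V(s) + A(s,a) - mean_a A(s,a); it is not the code keras-rl actually inserts:

# %%
# Illustrative dueling head (assumption: a feature vector of size feature_dim feeds the head).
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

def build_dueling_head_sketch(feature_dim, nb_actions):
    features = Input(shape=(feature_dim,))
    value = Dense(1, activation='linear')(features)               # state value V(s)
    advantage = Dense(nb_actions, activation='linear')(features)  # advantages A(s, a)
    # 'avg' dueling type: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    q_values = Lambda(lambda va: va[0] + va[1] - K.mean(va[1], axis=1, keepdims=True))([value, advantage])
    return Model(inputs=features, outputs=q_values)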

# %%
dqn_duel = build_duel_agent(model, env.ACTION_SPACE_VALUES)  # build the dueling-DQN agent

# %%
checkpoint_weights_filename = './results/dqn_weights_{step}.h5f'
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)]
dqn_duel.fit(env, nb_steps=100000, visualize=False, verbose=1, callbacks=callbacks)  # nb_steps is the total number of training steps
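
FileLogger is imported above but never used; for a future training run, it can be added to the callbacks list to also write training metrics to disk. A hedged sketch (the log path is arbitrary):

# %%
# Optional (for a future run): also log metrics to JSON with the FileLogger imported above.
extra_callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000),
                   FileLogger('./results/dqn_duel_log.json')]
# dqn_duel.fit(env, nb_steps=100000, visualize=False, verbose=1, callbacks=extra_callbacks)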

# %%
dqn = build_agent(model, env.ACTION_SPACE_VALUES)  # build the double-DQN agent (note: it reuses the same model object trained above)
print(dqn)

# %%
dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)  # nb_steps is the total number of training steps
# visualize toggles rendering and verbose controls logging detail; progress is reported every 10000 steps, and training takes a while.
model.save('./results/model_DDQN_results.h5')  # the original example used dqn.save_weights; saving to .h5f failed for me, so the full model is saved instead
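
Because the full model was saved with model.save, it can later be restored without rebuilding the architecture by hand; a short hedged sketch:

# %%
# Optional: restore the full saved model later instead of rebuilding it.
from tensorflow.keras.models import load_model
restored_model = load_model('./results/model_DDQN_results.h5')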

# %%
dqn.save_weights(f'./results/dqn_weights.h5f',overwrite=True)

# %%
scores = dqn.test(env,nb_episodes=20,visualize=True)
print(np.mean(scores.history['episode_reward']))
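
matplotlib was imported at the top but never used; a small sketch for visualizing the per-episode test rewards returned by dqn.test:

# %%
# Plot the rewards of the 20 test episodes (uses the matplotlib import at the top).
episode_rewards = scores.history['episode_reward']
plt.plot(range(1, len(episode_rewards) + 1), episode_rewards, marker='o')
plt.xlabel('test episode')
plt.ylabel('episode reward')
plt.title('DQN test performance')
plt.show()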

# %%
del dqn, model  # to reload the saved weights, re-run the earlier cells to rebuild the environment, model and agent, but skip dqn.fit

# %%
dqn.load_weights('./results/dqn_weights.h5f')  # load the weights saved by dqn.save_weights above

# %%
scores = dqn.test(env,nb_episodes=20,visualize=True)
print(np.mean(scores.history['episode_reward']))
