Implementing a Maze Environment and DQN

The environment
A 5×5 maze in which position (4,3) is the exit; the obstacles are at (1,1), (1,2), (1,3), (1,4), (3,2), (3,3) and (4,2).
[Figure: the maze]
The action space has 4 actions, of course: up, down, left, right.
Straight to the code:

import numpy as np

import sys
if sys.version_info.major == 2:
    import Tkinter as tk    # Python 2
else:
    import tkinter as tk    # Python 3

class Maze(tk.Tk, object):
    def __init__(self, state_x, state_y, frequency):
        super(Maze, self).__init__()
        self.action = [0, 1, 2, 3]
        self.n_actions = len(self.action)
        self.n_features = 2

        self.state_min_x = 0
        self.state_max_x = 4
        self.state_min_y = 0
        self.state_max_y = 4
        self.frequency = frequency   # step counter for the current episode

        self.block = np.zeros((self.state_max_x + 1, self.state_max_y + 1))   # 1 marks an obstacle

        self.block[1][1] = 1
        self.block[1][2] = 1
        self.block[1][3] = 1
        self.block[1][4] = 1
        self.block[3][2] = 1
        self.block[3][3] = 1
        self.block[4][2] = 1

        self.state_x = state_x
        self.state_y = state_y

        # exit position
        self.out_x = 4
        self.out_y = 3

    def reset(self):
        self.update()
        # np.random.randint excludes its upper bound, so max+1 is needed to
        # make the last row/column reachable as a start cell; resample until
        # the cell is neither an obstacle nor the exit itself
        while True:
            self.state_x = np.random.randint(self.state_min_x, self.state_max_x + 1)
            self.state_y = np.random.randint(self.state_min_y, self.state_max_y + 1)
            if self.block[self.state_x][self.state_y] == 0 \
                    and (self.state_x, self.state_y) != (self.out_x, self.out_y):
                break
        if self.frequency != 0:
            print("reset" + str(self.frequency))
        self.frequency = 0
        return np.array([self.state_x, self.state_y])

    def step(self, action):
        judge = 1
        self.frequency += 1
        if action == 0:    # up
            if self.state_y < self.state_max_y and self.block[self.state_x][self.state_y + 1] == 0:
                self.state_y += 1
            else:
                judge = 0  # blocked by the border or an obstacle: stay put
        elif action == 1:  # down
            if self.state_y > self.state_min_y and self.block[self.state_x][self.state_y - 1] == 0:
                self.state_y -= 1
            else:
                judge = 0
        elif action == 2:  # right
            if self.state_x < self.state_max_x and self.block[self.state_x + 1][self.state_y] == 0:
                self.state_x += 1
            else:
                judge = 0
        elif action == 3:  # left
            if self.state_x > self.state_min_x and self.block[self.state_x - 1][self.state_y] == 0:
                self.state_x -= 1
            else:
                judge = 0
        if judge == 0:     # an invalid move is penalized with -1
            return np.array([self.state_x, self.state_y]), -1, self.is_done()
        return np.array([self.state_x, self.state_y]), self.get_reward(), self.is_done()

    def render(self):
        return 0

    def is_done(self):
        if self.state_x == self.out_x and self.state_y == self.out_y:
            print("finish")
            return True
        return False

    def get_reward(self):
        if self.state_x == self.out_x and self.state_y == self.out_y:
            return 1
        return 0
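
Before wiring in the DQN, a quick sanity check of the environment API is useful. A minimal sketch, assuming the class above is saved as env_maze.py (the module name the driver below imports); note that Maze subclasses tk.Tk, so this opens an empty window:

from env_maze import Maze

env = Maze(0, 0, 0)
obs = env.reset()                # random, non-blocked start cell
print("start:", obs)
for action in range(4):          # 0=up, 1=down, 2=right, 3=left
    obs, reward, done = env.step(action)
    print(action, obs, reward, done)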

The reward setup here seems questionable (convergence is very slow), but the logic itself should be fine.
The DQN implementation is copied straight from code on GitHub:
DQN_modified.py / RL_brain.py
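
Those two files are not reproduced in this post. For a self-contained picture, below is a minimal sketch of the interface the driver assumes (choose_action, store_transition, learn), written with tf.keras instead of the original TF1 graph code. The constructor arguments mirror the call in the driver; the 20-unit hidden layer and batch_size=32 are my assumptions, not taken from the linked files.

import numpy as np
import tensorflow as tf

class DeepQNetwork:
    """Minimal DQN sketch: replay memory, epsilon-greedy policy, target network."""
    def __init__(self, n_actions, n_features, learning_rate=0.01,
                 reward_decay=0.9, e_greedy=0.9, replace_target_iter=200,
                 memory_size=2000, batch_size=32):
        self.n_actions = n_actions
        self.n_features = n_features
        self.gamma = reward_decay
        self.epsilon = e_greedy          # probability of acting greedily
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.learn_step_counter = 0
        self.memory_counter = 0
        # each row stores one transition: [s, a, r, s_]
        self.memory = np.zeros((memory_size, n_features * 2 + 2))
        self.eval_net = self._build_net(learning_rate)
        self.target_net = self._build_net(learning_rate)

    def _build_net(self, lr):
        model = tf.keras.Sequential([
            tf.keras.Input(shape=(self.n_features,)),
            tf.keras.layers.Dense(20, activation='relu'),
            tf.keras.layers.Dense(self.n_actions)      # one Q-value per action
        ])
        model.compile(optimizer=tf.keras.optimizers.RMSprop(lr), loss='mse')
        return model

    def choose_action(self, observation):
        if np.random.uniform() < self.epsilon:         # exploit
            q = self.eval_net.predict(observation[np.newaxis, :], verbose=0)
            return int(np.argmax(q))
        return np.random.randint(self.n_actions)       # explore

    def store_transition(self, s, a, r, s_):
        index = self.memory_counter % self.memory_size  # ring buffer
        self.memory[index, :] = np.hstack((s, [a, r], s_))
        self.memory_counter += 1

    def learn(self):
        # periodically copy eval-net weights into the frozen target net
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.set_weights(self.eval_net.get_weights())
        n = min(self.memory_counter, self.memory_size)
        batch = self.memory[np.random.choice(n, self.batch_size)]
        s = batch[:, :self.n_features]
        a = batch[:, self.n_features].astype(int)
        r = batch[:, self.n_features + 1]
        s_ = batch[:, -self.n_features:]
        # TD target: r + gamma * max_a' Q_target(s', a')
        q_target = self.eval_net.predict(s, verbose=0)
        q_next = self.target_net.predict(s_, verbose=0)
        q_target[np.arange(self.batch_size), a] = r + self.gamma * q_next.max(axis=1)
        self.eval_net.fit(s, q_target, verbose=0)
        self.learn_step_counter += 1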

The driver code is as follows:

from RL_brain import DeepQNetwork
from env_maze import Maze

def work():
    step = 0
    for _ in range(1000):
        # initial observation
        observation = env.reset()

        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            RL.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()

if __name__ == '__main__':
    # create the Maze environment: start at (0, 0), step counter 0
    env = Maze(0, 0, 0)
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, work)
    env.mainloop()

Since I did not yet know what evaluation metric to use, I simply took frequency (the number of moves made before reaching the exit) as the metric, and it is shockingly slow:
the first arrival at the exit took 168 moves, and the second took 224659...
finish
reset168
finish
reset224659
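
A more standard metric would be the number of steps per episode, plotted over training as a learning curve. A sketch of how work() could record it (matplotlib is an extra dependency, not part of the original setup):

import matplotlib.pyplot as plt

def work():
    steps_per_episode = []             # learning curve data
    step = 0
    for episode in range(1000):
        observation = env.reset()
        ep_steps = 0
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            step += 1
            ep_steps += 1
            if done:
                steps_per_episode.append(ep_steps)
                break
    plt.plot(steps_per_episode)        # should trend downward as the DQN learns
    plt.xlabel('episode')
    plt.ylabel('steps to exit')
    plt.show()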

So I modified the get_reward function, which effectively lays the rewards out in advance along the final route:

    def get_reward(self):
        if self.state_x == self.out_x and self.state_y == self.out_y:
            return 1
        elif self.state_x == 4 and self.state_y == 4:
            return 0.9
        elif self.state_x == 3 and self.state_y == 4:
            return 0.8
        elif self.state_x == 2 and self.state_y == 4:
            return 0.7
        elif self.state_x == 2 and self.state_y == 3:
            return 0.6
        elif self.state_x == 2 and self.state_y == 2:
            return 0.5
        elif self.state_x == 2 and self.state_y == 1:
            return 0.4
        return 0

After 3000 training episodes it basically finds the route within about 10 moves. Admittedly this is cheating a bit, but at least it shows the experiment itself is sound; it just converges slowly.
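
A less hand-crafted alternative would be distance-based reward shaping: penalize every non-goal cell in proportion to its Manhattan distance from the exit, so there is a gradient toward the goal everywhere instead of only along one hand-picked route. A sketch, with an arbitrary 0.01 coefficient (not what this post actually used):

    def get_reward(self):
        # 1 at the exit; elsewhere a penalty that shrinks near the goal
        if self.state_x == self.out_x and self.state_y == self.out_y:
            return 1
        dist = abs(self.state_x - self.out_x) + abs(self.state_y - self.out_y)
        return -0.01 * dist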
Next, what if some of the hand-set reward values are wrong? For example:

    def get_reward(self):
        if self.state_x == self.out_x and self.state_y == self.out_y:
            return 1
        elif self.state_x == 4 and self.state_y == 4:
            return 0.8
        elif self.state_x == 3 and self.state_y == 4:
            return 0.6
        elif self.state_x == 2 and self.state_y == 4:
            return 0.4
        elif self.state_x == 2 and self.state_y == 3:
            return 0.7
        elif self.state_x == 2 and self.state_y == 2:
            return 0.3
        elif self.state_x == 2 and self.state_y == 1:
            return 0.4
        return 0

It still converges. Hmm, time to go back and study the fundamentals properly.
