[Reinforcement Learning] Finding the Optimal Path with Q-learning

Problem

(Figure: a 7×10 windy gridworld. The start is at (3, 0) and the goal G at (3, 7); columns 3-5 and 8 have an upward wind of strength 1, columns 6-7 of strength 2, matching the wind array built in the code below.)
and try extending the set of moves to:

    number_move = 4 : action set ← ↑ → ↓
    number_move = 8 : action set ← ↑ → ↓ ↖ ↗ ↘ ↙
    number_move = 9 : action set ← ↑ → ↓ ↖ ↗ ↘ ↙ + stop
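
These three action sets are just prefixes of one fixed 9-element move list, so switching between them is a single slice. A minimal sketch (the (dx, dy) pairs match move_list in the code below; dx is the row offset, and positive dx means "up" in the printed maps):

    moves = [(1, 0), (-1, 0), (0, 1), (0, -1), (1, 1), (-1, 1), (1, -1), (-1, -1), (0, 0)]
    print(moves[:4])   # up, down, right, left
    print(moves[:8])   # plus the four diagonals
    print(moves[:9])   # plus stay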

Finding the Optimal Path with Q-learning
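
Before the code, it helps to state the update rule being implemented. The tabular Q-learning update (the same expression that appears in the class docstring below) is

    Q(s, a) ← Q(s, a) + α · ( r + γ · max_a' Q(s', a') − Q(s, a) )

where α is the learning rate a, γ is the discount rate r, s' is the cell reached after applying the chosen move plus the wind, and the reward is −1 per step and +1 at the goal G.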

import numpy as np

class Flybrid:
    '''
    In this question, Q-learning is used to solve the problem.
    The update rule is
        self.Q[x,y,action] = self.Q[x,y,action] + self.a*(self.reward[move_end_x,move_end_y] +
                             self.r * np.max(self.Q[move_end_x,move_end_y]) - self.Q[x,y,action])
    '''
    def __init__(self, a=0.1, e=0.3, r=0.9, number_move=9, start=(3, 0), G=(3, 7)):
        self.a = a  # learning rate
        self.e = e  # epsilon for the e-greedy policy
        self.r = r  # discount rate
        self.start_x, self.start_y = start
        self.G_x, self.G_y = G
        # build wind: upward wind strength per column
        self.wind = np.zeros((7, 10))
        self.wind[:, 3:6] = 1
        self.wind[:, 6:8] = 2
        self.wind[:, 8] = 1
        print(self.wind)
        # build reward: -1 per step, +1 at the goal
        self.reward = np.zeros((7, 10)) - 1
        self.reward[self.G_x, self.G_y] = 1
        print(self.reward)
        # build the initial Q table: one value per (row, col, action)
        self.Q = np.zeros((7, 10, number_move))
        '''
        0 = go up
        1 = go down
        2 = go right
        3 = go left
        4 = go up-right
        5 = go down-right
        6 = go up-left
        7 = go down-left
        8 = stay
        '''
        self.number_move = number_move
        # the full 9-move list; the action set is its first number_move entries
        move_list = [(1, 0), (-1, 0), (0, 1), (0, -1), (1, 1), (-1, 1), (1, -1), (-1, -1), (0, 0)]
        self.move_list = move_list[:number_move]
        self.row = self.wind.shape[0]
        self.col = self.wind.shape[1]

    def converge(self, local):
        # clamp a position back inside the grid after a move
        x, y = local
        if x > self.row - 1:
            x = self.row - 1
        if x < 0:
            x = 0
        if y > self.col - 1:
            y = self.col - 1
        if y < 0:
            y = 0
        return x, y

    def move(self, x, y, move):
        # apply the chosen move, clamp, then apply the upward wind of the
        # destination column and clamp again
        move_no_wind = (x + self.move_list[move][0], y + self.move_list[move][1])
        new_x, new_y = self.converge(move_no_wind)
        move_result = (new_x + self.wind[new_x, new_y], new_y)
        result_x, result_y = self.converge(move_result)
        return int(result_x), int(result_y)

    def greedy_action(self, x, y):
        # e-greedy: the current best action gets probability 1 - e,
        # the remaining number_move - 1 actions share e equally
        p_list = np.ones(self.number_move) * self.e / (self.number_move - 1)
        p_list[np.argmax(self.Q[x, y])] = 1 - self.e
        take_action = np.random.choice(list(range(self.number_move)), p=p_list)
        return take_action

    def iter(self, max_iter=10000):
        # run max_iter training episodes, each from the start to the goal
        for episode in range(max_iter):
            x, y = self.start_x, self.start_y
            while (x, y) != (self.G_x, self.G_y):
                action = self.greedy_action(x, y)
                move_end_x, move_end_y = self.move(x, y, action)
                # Q-learning update (off-policy: bootstraps on the max over
                # the next state's actions, not on the action actually taken)
                self.Q[x, y, action] = self.Q[x, y, action] + self.a * (
                    self.reward[move_end_x, move_end_y]
                    + self.r * np.max(self.Q[move_end_x, move_end_y])
                    - self.Q[x, y, action])
                x, y = move_end_x, move_end_y

    def get_result(self):
        # greedy policy: index of the best action at every cell
        best_q = np.argmax(self.Q, -1)

        print()
        for i in range(self.row - 1, -1, -1):
            print(best_q[i])

        # follow the greedy policy from the start and mark the path with 1s
        tract = np.zeros((self.row, self.col))
        x, y = self.start_x, self.start_y
        step = 0
        while (x, y) != (self.G_x, self.G_y):
            tract[x, y] = 1
            x, y = self.move(x, y, best_q[x, y])
            step += 1
        tract[x, y] = 1
        print()
        print('min_step = ', step)
        for i in range(self.row - 1, -1, -1):
            print(tract[i])

def main():
    '''
    number_move = 4 : action set ← ↑ → ↓
    number_move = 8 : action set ← ↑ → ↓ ↖ ↗ ↘ ↙
    number_move = 9 : action set ← ↑ → ↓ ↖ ↗ ↘ ↙ + stop
    a               : learning rate
    e               : epsilon for the e-greedy policy
    r               : discount rate
    start           : start position
    G               : goal position
    return: the best-action map per cell, the minimum number of steps,
            and the trajectory map (path cells marked 1)
    '''
    fly = Flybrid(a=0.1, e=0.3, r=0.9, number_move=9, start=(3, 0), G=(3, 7))
    fly.iter()
    fly.get_result()

if __name__ == '__main__':
    main()
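
As a quick sanity check of the e-greedy distribution that greedy_action builds (a standalone sketch, not part of the original program): with e = 0.3 and 9 actions, the greedy action is picked with probability 1 − 0.3 = 0.7 and each of the other 8 actions with 0.3 / 8 = 0.0375, so the probabilities sum to 1.

    import numpy as np

    e, n = 0.3, 9
    p = np.ones(n) * e / (n - 1)
    p[0] = 1 - e                      # suppose action 0 is the current argmax
    assert abs(p.sum() - 1.0) < 1e-12
    print(p)                          # [0.7  0.0375 ... 0.0375]

To reproduce the three variants from the problem statement, change number_move in main() to 4, 8, or 9.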
    
#--------- Result (one sample run) ------------------
[[0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]
 [0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]
 [0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]
 [0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]
 [0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]
 [0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]
 [0. 0. 0. 1. 1. 1. 2. 2. 1. 0.]]
 
[7 7 1 7 4 2 2 4 5 1]
[1 7 1 7 7 5 4 5 5 1]
[1 5 1 7 7 4 5 5 5 1]
[5 1 7 7 7 5 5 0 5 1]
[2 5 1 7 7 5 5 0 7 7]
[4 2 5 5 5 5 0 0 3 3]
[2 2 2 0 0 0 0 0 0 6]

min_step =  7
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 1. 1. 1. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
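
(For comparison: in Sutton and Barto's version of this windy gridworld, the optimal episode takes 15 steps with the 4-action set and 7 steps once diagonal "king's" moves are allowed, so the 7-step path found here with number_move = 9 is consistent. Note, though, that this implementation applies the wind of the destination column rather than the current column, a small variation on the book's dynamics.)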