Solving a Maze with Q-Learning
Project page: https://caw.guaik.io/d/23-q-learning
# Import the required packages
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import matplotlib.cm as cm
%matplotlib inline
# Initial state of the maze
# Declare the figure size and the figure handle
fig = plt.figure(figsize=(6, 6))
ax = plt.gca()
# Draw the red walls
plt.plot([1, 1], [4, 3], color='red', linewidth=2)
plt.plot([1, 2], [3, 3], color='red', linewidth=2)
plt.plot([2, 2], [3, 2], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.plot([1, 1], [2, 1], color='red', linewidth=2)
plt.plot([1, 2], [5, 5], color='red', linewidth=2)
plt.plot([2, 2], [5, 4], color='red', linewidth=2)
plt.plot([2, 3], [4, 4], color='red', linewidth=2)
plt.plot([3, 3], [4, 3], color='red', linewidth=2)
plt.plot([3, 4], [3, 3], color='red', linewidth=2)
plt.plot([2, 2], [1, 0], color='red', linewidth=2)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)
plt.plot([3, 3], [1, 2], color='red', linewidth=2)
plt.plot([3, 4], [2, 2], color='red', linewidth=2)
plt.plot([4, 4], [2, 1], color='red', linewidth=2)
plt.plot([3, 3], [6, 5], color='red', linewidth=2)
plt.plot([4, 4], [6, 4], color='red', linewidth=2)
plt.plot([5, 5], [5, 4], color='red', linewidth=2)
plt.plot([5, 6], [5, 5], color='red', linewidth=2)
plt.plot([5, 6], [3, 3], color='red', linewidth=2)
plt.plot([5, 6], [2, 2], color='red', linewidth=2)
plt.plot([5, 6], [1, 1], color='red', linewidth=2)
# Draw the state labels S0-S35
s = 0
for i in range(0, 6):
    for j in range(0, 6):
        plt.text(0.5 + j, 5.5 - i, 'S' + str(s), size=14, ha='center')
        s += 1
plt.text(0.5, 5.3, 'START', ha='center')
plt.text(5.5, 0.3, 'GOAL', ha='center')
# Set the plotting range
ax.set_xlim(0, 6)
ax.set_ylim(0, 6)
plt.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=False, right=False, left=False, labelleft=False)
# Mark the current position S0 with a green circle
line, = ax.plot([0.5], [5.5], marker='o', color='g', markersize=40)
# Set the initial value theta_0 of the parameter θ, which determines the initial policy
# Rows are states 0-34; columns are the movement directions up, right, down, left
theta_0 = np.array([
[np.nan, 1, 1, np.nan], # S0
[np.nan, 1, np.nan, 1], # S1
[np.nan, np.nan, 1, 1], # S2
[np.nan, np.nan, 1, np.nan], # S3
[np.nan, 1, 1, np.nan], # S4
[np.nan, np.nan, np.nan, 1], # S5
[1, 1, 1, np.nan], # S6
[np.nan, np.nan, 1, 1], # S7
[1, 1, np.nan, np.nan], # S8
[1, np.nan, 1, 1], # S9
[1, np.nan, 1, np.nan], # S10
[np.nan, np.nan, 1, np.nan], # S11
[1, np.nan, 1, np.nan], # S12
[1, 1, np.nan, np.nan], # S13
[np.nan, np.nan, 1, 1], # S14
[1, 1, np.nan, np.nan], # S15
[1, 1, 1, 1], # S16
[1, np.nan, np.nan, 1], # S17
[1, 1, 1, np.nan], # S18
[np.nan, np.nan, np.nan, 1], # S19
[1, 1, 1, np.nan], # S20
[np.nan, 1, np.nan, 1], # S21
[1, 1, 1, 1], # S22
[np.nan, np.nan, np.nan, 1], # S23
[1, np.nan, 1, np.nan], # S24
[np.nan, 1, 1, np.nan], # S25
[1, np.nan, np.nan, 1], # S26
[np.nan, np.nan, 1, np.nan], # S27
[1, 1, 1, np.nan], # S28
[np.nan, np.nan, np.nan, 1], # S29
[1, 1, np.nan, np.nan], # S30
[1, np.nan, np.nan, 1], # S31
[np.nan, 1, np.nan, np.nan], # S32
[1, 1, np.nan, 1], # S33
[1, 1, np.nan, 1], # S34
# [1, np.nan, np.nan, np.nan], # S35
])
# Set the initial action-value function
[a, b] = theta_0.shape
Q = np.random.rand(a, b) * theta_0 * 0.1  # multiplying by theta_0 sets the wall directions to nan
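# Optional sanity check: multiplying by nan propagates nan, so each row of Q
# keeps nan exactly where theta_0 marks a wall. For S0 only "right" and "down"
# are valid moves, so Q[0] looks like [nan, 0.0x, 0.0x, nan] (the values are random).
print(Q[0])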
# Convert the policy parameter theta_0 into a random policy
def simple_convert_into_pi_from_theta(theta):
    ''' Simply compute the ratio of each action '''
    [m, n] = theta.shape
    pi = np.zeros((m, n))
    for i in range(0, m):
        pi[i, :] = theta[i, :] / np.nansum(theta[i, :])
    pi = np.nan_to_num(pi)
    return pi
# Compute the initial random policy pi_0
pi_0 = simple_convert_into_pi_from_theta(theta_0)
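# As a quick illustration: state S0 only allows "right" and "down"
# (theta_0[0] = [nan, 1, 1, nan]), so the random policy splits the probability
# evenly between them: pi_0[0] = [0. , 0.5, 0.5, 0. ].
print(pi_0[0])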
# Implement the epsilon-greedy method
def get_action(s, Q, epsilon, pi_0):
    direction = ['up', 'right', 'down', 'left']
    # Choose the action
    if np.random.rand() < epsilon:
        # With probability epsilon, act randomly according to pi_0
        next_direction = np.random.choice(direction, p=pi_0[s, :])
    else:
        # Otherwise take the action with the largest Q value
        next_direction = direction[np.nanargmax(Q[s, :])]
    # Map the direction to an action index
    if next_direction == 'up':
        action = 0
    elif next_direction == 'right':
        action = 1
    elif next_direction == 'down':
        action = 2
    elif next_direction == 'left':
        action = 3
    return action
def get_s_next(s, a, Q, epsilon, pi_0):
    direction = ['up', 'right', 'down', 'left']
    next_direction = direction[a]
    # Determine the next state from the chosen action
    if next_direction == 'up':
        s_next = s - 6
    elif next_direction == 'right':
        s_next = s + 1
    elif next_direction == 'down':
        s_next = s + 6
    elif next_direction == 'left':
        s_next = s - 1
    return s_next
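# A minimal usage sketch (epsilon = 0.5 here is just an illustrative value):
# sample one epsilon-greedy action at S0 and compute the resulting state.
a_demo = get_action(0, Q, 0.5, pi_0)          # 1 (right) or 2 (down) at S0
s_demo = get_s_next(0, a_demo, Q, 0.5, pi_0)  # S1 if moving right, S6 if moving down
print(a_demo, s_demo)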
# Update the action-value function Q with the Q-learning rule
def Q_learning(s, a, r, s_next, Q, eta, gamma):
    if s_next == 35:  # the goal state has no successor, so drop the bootstrap term
        Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
    else:
        Q[s, a] = Q[s, a] + eta * (r + gamma * np.nanmax(Q[s_next, :]) - Q[s, a])
    return Q
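# The update above is the standard Q-learning rule
#     Q(s, a) <- Q(s, a) + eta * (r + gamma * max_a' Q(s_next, a') - Q(s, a)),
# with the bootstrap term dropped at the terminal state S35. Worked example with
# eta = 0.1 and gamma = 0.9: if Q[s, a] = 0, r = 0 and max Q[s_next, :] = 0.5,
# the new value is 0 + 0.1 * (0 + 0.9 * 0.5 - 0) = 0.045.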
# Define a function that solves the maze with Q-learning and returns the
# state/action history together with the updated Q
def goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi):
    s = 0  # start state
    a = a_next = get_action(s, Q, epsilon, pi)  # initial action
    s_a_history = [[0, np.nan]]  # record the agent's sequence of states and actions
    while (1):
        a = a_next  # update the action
        # Store the action taken in the current state
        s_a_history[-1][1] = a
        # Get the valid next state
        s_next = get_s_next(s, a, Q, epsilon, pi)
        # Append the next state; its action is still unknown, so use nan
        s_a_history.append([s_next, np.nan])
        if s_next == 35:
            r = 1  # reward for reaching the goal
            a_next = np.nan
        else:
            r = 0
            a_next = get_action(s_next, Q, epsilon, pi)
        Q = Q_learning(s, a, r, s_next, Q, eta, gamma)
        if s_next == 35:
            break
        else:
            s = s_next
    return [s_a_history, Q]
eta = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.5  # initial value of epsilon for the epsilon-greedy method
v = np.nanmax(Q, axis=1)  # maximum action value of each state
is_continue = True
episode = 1
V = []  # store the state values of each episode
V.append(np.nanmax(Q, axis=1))
while is_continue:
    print('Episode: ' + str(episode))
    # Gradually decrease epsilon so the policy becomes greedier
    epsilon = epsilon / 2
    # Solve the maze with Q-learning, obtaining the trajectory and the updated Q
    [s_a_history, Q] = goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi_0)
    new_v = np.nanmax(Q, axis=1)  # maximum action value of each state
    print(np.sum(np.abs(new_v - v)))  # change in the state values
    v = new_v
    V.append(v)
    print('Number of steps to solve the maze: ' + str(len(s_a_history) - 1))
    # Repeat for 100 episodes
    episode = episode + 1
    if episode > 100:
        break
print(Q)
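# Optionally, the greedy policy implied by the learned Q can be read off by
# taking the best action in each state (nanargmax ignores the nan wall entries):
directions = ['up', 'right', 'down', 'left']
greedy_policy = [directions[np.nanargmax(Q[s_i, :])] for s_i in range(Q.shape[0])]
print(greedy_policy)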
# Create the maze-walking animation
def init():
    line.set_data([], [])
    return (line,)

def animate(i):
    ''' Content of each frame '''
    state = s_a_history[i][0]  # the state at frame i
    x = (state % 6) + 0.5      # x coordinate of the state
    y = 5.5 - int(state / 6)   # y coordinate of the state
    line.set_data([x], [y])
    return (line,)
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(s_a_history), interval=100, repeat=False)
HTML(anim.to_jshtml())
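# If the environment cannot display JavaScript animations, the result can also
# be written to a file instead (assumes the pillow package is installed; the
# file name is just an example):
# anim.save('maze_q_learning.gif', writer='pillow')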
# Create an animation of how the action values evolve during Q-learning training
# def init():
#     line.set_data([], [])
#     return (line,)
# def animate(i):
#     for n in range(0, 6):
#         for j in range(0, 6):
#             if n == 5 and j == 5:
#                 line, = ax.plot([0.5 + j], [5.5 - n], marker="s", color=cm.jet(1.0), markersize=40)
#             else:
#                 line, = ax.plot([0.5 + j], [5.5 - n], marker="s", color=cm.jet(V[i][n*6+j]), markersize=40)
#     return (line,)
# anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(V), interval=100, repeat=False)
# HTML(anim.to_jshtml())