Q-learning: introduction and implementation source code

I'll skip the introduction (it's 2023, is anyone still reading this?). For background, see:

Q学习(Q-learning)简单理解 (qq_39429669's blog on CSDN)
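
The update rule implemented in the code below is the table-based Q-learning update, with a discount factor gamma = 0.8 and no separate learning rate:

    Q(state, action) = R(state, action) + gamma * max Q(next_state, ·)

Room 5 is the goal: every move that reaches it earns a reward of 100, and a training episode ends as soon as state 5 is reached.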

The code is as follows:

# -*- encoding: utf-8 -*-
'''
@File    :   QLearning.py   
@Contact :   blmeng@foxmail.com
@License :   (C)Copyright yunji.com 
 
@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
2023/1/12 13:00   blmeng                 None
'''

import numpy as np
import random


def leanQ():
    """
    Train the Q table from the door/reward matrix of the rooms.
    Returns: the trained Q table, for example (values vary slightly per run):
        Q = [[  0.           0.           0.           0.         399.99892416   0.        ]
             [  0.           0.           0.         319.99913933   0.         499.9986552 ]
             [  0.           0.           0.         319.99913933   0.           0.        ]
             [  0.         399.99892416 255.99931146   0.         399.99892416   0.        ]
             [319.99913933   0.           0.         319.99913933   0.         499.9986552 ]
             [  0.         399.99831901   0.           0.         399.9986552  499.99831901]]
    """
    # Initialize the Q table with zeros (6 states x 6 actions)
    Q = np.zeros((6, 6))

    # Reward matrix R
    R = np.array([[-1, -1, -1, -1, 0, -1],
                   [-1, -1, -1, 0, -1, 100],
                   [-1, -1, -1, 0, -1, -1],
                   [-1, 0, 0, -1, 0, -1],
                   [0, -1, -1, 0, -1, 100],
                   [-1, 0, -1, -1, 0, 100]])
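    # In R, states 0-5 are the rooms and room 5 is the goal. R[s, a] = -1 means
    # there is no door from room s to room a, 0 means a door exists, and 100 is
    # the reward for any move that reaches room 5. Reading the matrix, the
    # available doors are: 0-4, 1-3, 1-5, 2-3, 3-4, 4-5, and 5-5.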

    # Learning parameter: discount factor
    gamma = 0.8

    # Training
    for i in range(1000):
        # For each episode, start from a randomly chosen state (room)
        state = random.randint(0, 5)
        while True:
            # Collect every action that is allowed in the current state
            r_pos_action = []
            for action in range(6):
                if R[state, action] >= 0:
                    r_pos_action.append(action)
            # Randomly pick one feasible action (the next room)
            next_state = random.choice(r_pos_action)
            # Update the Q table: Q(s, a) = R(s, a) + gamma * max Q(s', :)
            Q[state, next_state] = R[state, next_state] + gamma * Q[next_state].max()
            state = next_state
            # Reaching room 5 means the episode is done
            if state == 5:
                break
    return Q
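
# Sanity check on the example values in the leanQ docstring (a worked calculation,
# not part of the original post): with gamma = 0.8 the Q entries converge towards
#   Q(1,5), Q(4,5), Q(5,5)                  -> 100 / (1 - 0.8) = 500
#   Q(0,4), Q(3,1), Q(3,4), Q(5,1), Q(5,4)  -> 0 + 0.8 * 500   = 400
#   Q(1,3), Q(2,3), Q(4,0), Q(4,3)          -> 0 + 0.8 * 400   = 320
#   Q(3,2)                                  -> 0 + 0.8 * 320   = 256
# which matches the ~499.99 / 399.99 / 319.99 / 255.99 entries shown above.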


# Extract the optimal route from the Q table
def evalNet(Q_matrix, roomStart):
    """
    Walk the Q table greedily to get the optimal route.
    Args:
        Q_matrix:  the trained Q table
        roomStart: the starting room
    Returns:
        the list of rooms visited from roomStart until room 5 is reached
    """
    bestAction = roomStart
    actionList = [bestAction]
    # Keep going until room 5 is reached (len(actionList) < 100 guards against infinite loops)
    while bestAction != 5 and len(actionList) < 100:
        bestQ = -float('inf')
        # Among all exits of the current room, pick the door with the largest Q value;
        # if several doors tie, the first one found is kept (strict ">" comparison)
        for i, q in enumerate(Q_matrix[bestAction]):
            if q > bestQ:
                bestQ = q
                # Move to that room
                bestAction = i
        # Record the chosen room
        actionList.append(bestAction)
    return actionList
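

# A minimal alternative sketch (not part of the original post): the greedy loop in
# evalNet can also be written with np.argmax, which likewise returns the first
# index when several entries tie, matching the strict ">" comparison above.
def evalNetArgmax(Q_matrix, roomStart):
    path = [roomStart]
    while path[-1] != 5 and len(path) < 100:
        path.append(int(np.argmax(Q_matrix[path[-1]])))
    return path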


if __name__ == '__main__':
    Q = leanQ()
    print(Q)

    # Starting room
    roomStart = 0
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [0, 4, 5] == workRoomList

    roomStart = 1
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [1, 5] == workRoomList

    roomStart = 2
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [2, 3, 1, 5] == workRoomList

    roomStart = 3
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [3, 1, 5] == workRoomList

    roomStart = 4
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [4, 5] == workRoomList

    roomStart = 5
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [5] == workRoomList
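
For reference, a run of this script should first print the trained Q table (close to the matrix shown in the leanQ docstring; the exact values vary slightly between runs) and then the six greedy routes, which are exactly the paths the assert statements check:

Path starting from room 0: [0, 4, 5]
Path starting from room 1: [1, 5]
Path starting from room 2: [2, 3, 1, 5]
Path starting from room 3: [3, 1, 5]
Path starting from room 4: [4, 5]
Path starting from room 5: [5]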
