Q-learning: introduction and implementation source code

I'll skip the introduction (it's 2023, is anyone still reading this?). For background, see:

Q学习(Q-learning)简单理解 (qq_39429669's blog on CSDN)
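
The update rule implemented in the code below is the table-based Q-learning update, with a discount factor gamma = 0.8 and no separate learning rate:

    Q(state, action) = R(state, action) + gamma * max Q(next_state, ·)

Room 5 is the goal: every move that reaches it earns a reward of 100, and a training episode ends as soon as state 5 is reached.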

The code is as follows:

# -*- encoding: utf-8 -*-
'''
@File    :   QLearning.py   
@Contact :   blmeng@foxmail.com
@License :   (C)Copyright yunji.com 
 
@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
2023/1/12 13:00   blmeng                 None
'''

import numpy as np
import random


def leanQ():
    """
    Train the Q table from the door/reward matrix of the rooms.
    Returns: the trained Q table, for example (values vary slightly per run):
        Q = [[  0.           0.           0.           0.         399.99892416   0.        ]
             [  0.           0.           0.         319.99913933   0.         499.9986552 ]
             [  0.           0.           0.         319.99913933   0.           0.        ]
             [  0.         399.99892416 255.99931146   0.         399.99892416   0.        ]
             [319.99913933   0.           0.         319.99913933   0.         499.9986552 ]
             [  0.         399.99831901   0.           0.         399.9986552  499.99831901]]
    """
    # Initialize the Q table with zeros (6 states x 6 actions)
    Q = np.zeros((6, 6))

    # Reward matrix R
    R = np.array([[-1, -1, -1, -1, 0, -1],
                   [-1, -1, -1, 0, -1, 100],
                   [-1, -1, -1, 0, -1, -1],
                   [-1, 0, 0, -1, 0, -1],
                   [0, -1, -1, 0, -1, 100],
                   [-1, 0, -1, -1, 0, 100]])
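    # In R, states 0-5 are the rooms and room 5 is the goal. R[s, a] = -1 means
    # there is no door from room s to room a, 0 means a door exists, and 100 is
    # the reward for any move that reaches room 5. Reading the matrix, the
    # available doors are: 0-4, 1-3, 1-5, 2-3, 3-4, 4-5, and 5-5.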

    # Learning parameter: discount factor
    gamma = 0.8

    # Training
    for i in range(1000):
        # For each episode, start from a randomly chosen state (room)
        state = random.randint(0, 5)
        while True:
            # Collect every action that is allowed in the current state
            r_pos_action = []
            for action in range(6):
                if R[state, action] >= 0:
                    r_pos_action.append(action)
            # Randomly pick one feasible action (the next room)
            next_state = random.choice(r_pos_action)
            # Update the Q table: Q(s, a) = R(s, a) + gamma * max Q(s', :)
            Q[state, next_state] = R[state, next_state] + gamma * Q[next_state].max()
            state = next_state
            # Reaching room 5 means the episode is done
            if state == 5:
                break
    return Q
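
# Sanity check on the example values in the leanQ docstring (a worked calculation,
# not part of the original post): with gamma = 0.8 the Q entries converge towards
#   Q(1,5), Q(4,5), Q(5,5)                  -> 100 / (1 - 0.8) = 500
#   Q(0,4), Q(3,1), Q(3,4), Q(5,1), Q(5,4)  -> 0 + 0.8 * 500   = 400
#   Q(1,3), Q(2,3), Q(4,0), Q(4,3)          -> 0 + 0.8 * 400   = 320
#   Q(3,2)                                  -> 0 + 0.8 * 320   = 256
# which matches the ~499.99 / 399.99 / 319.99 / 255.99 entries shown above.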


# Extract the optimal route from the Q table
def evalNet(Q_matrix, roomStart):
    """
    Walk the Q table greedily to get the optimal route.
    Args:
        Q_matrix:  the trained Q table
        roomStart: the starting room
    Returns:
        the list of rooms visited from roomStart until room 5 is reached
    """
    bestAction = roomStart
    actionList = [bestAction]
    # Keep going until room 5 is reached (len(actionList) < 100 guards against infinite loops)
    while bestAction != 5 and len(actionList) < 100:
        bestQ = -float('inf')
        # Among all exits of the current room, pick the door with the largest Q value;
        # if several doors tie, the first one found is kept (strict ">" comparison)
        for i, q in enumerate(Q_matrix[bestAction]):
            if q > bestQ:
                bestQ = q
                # Move to that room
                bestAction = i
        # Record the chosen room
        actionList.append(bestAction)
    return actionList
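

# A minimal alternative sketch (not part of the original post): the greedy loop in
# evalNet can also be written with np.argmax, which likewise returns the first
# index when several entries tie, matching the strict ">" comparison above.
def evalNetArgmax(Q_matrix, roomStart):
    path = [roomStart]
    while path[-1] != 5 and len(path) < 100:
        path.append(int(np.argmax(Q_matrix[path[-1]])))
    return path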


if __name__ == '__main__':
    Q = leanQ()
    print(Q)

    # Starting room
    roomStart = 0
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [0, 4, 5] == workRoomList

    roomStart = 1
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [1, 5] == workRoomList

    roomStart = 2
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [2, 3, 1, 5] == workRoomList

    roomStart = 3
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [3, 1, 5] == workRoomList

    roomStart = 4
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [4, 5] == workRoomList

    roomStart = 5
    workRoomList = evalNet(Q, roomStart)
    print("Path starting from room {}:".format(roomStart), workRoomList)
    assert [5] == workRoomList
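
For reference, a run of this script should first print the trained Q table (close to the matrix shown in the leanQ docstring; the exact values vary slightly between runs) and then the six greedy routes, which are exactly the paths the assert statements check:

Path starting from room 0: [0, 4, 5]
Path starting from room 1: [1, 5]
Path starting from room 2: [2, 3, 1, 5]
Path starting from room 3: [3, 1, 5]
Path starting from room 4: [4, 5]
Path starting from room 5: [5]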
