介绍就不说了(2023年了还有人看吗?)
Q学习(Q-learning)简单理解_qq_39429669的博客-CSDN博客_q学习
代码如下
# -*- encoding: utf-8 -*-
'''
@File : QLearning.py
@Contact : blmeng@foxmail.com
@License : (C)Copyright yunji.com
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2023/1/12 13:00 blmeng None
'''
import numpy as np
import random
def leanQ(episodes=1000, gamma=0.8, seed=None):
    """Train a Q-table for the six-room task by purely random exploration.

    Each episode starts in a uniformly random room and repeatedly takes a
    uniformly random feasible move, applying the update
    ``Q[s, a] = R[s, a] + gamma * max(Q[a])`` after every move, until the
    agent arrives in room 5.

    Args:
        episodes: Number of training episodes to run (default 1000).
        gamma: Discount factor applied to the best Q-value of the next room
            (default 0.8).
        seed: Optional seed for a private random generator, making the run
            reproducible. When ``None`` the module-level ``random`` state is
            used, exactly as before.

    Returns:
        A 6x6 float ``numpy`` array; entry ``[s, a]`` is the learned value of
        moving from room ``s`` to room ``a``. Moves that are never feasible
        keep the value 0. With the defaults the largest entries approach 500,
        e.g. ``Q[1, 5]`` and ``Q[4, 5]`` are close to 500 and ``Q[0, 4]`` is
        close to 400.
    """
    # Reward matrix R[state, action]: a negative entry marks a move that is
    # not allowed, 0 is an allowed move, 100 is an allowed move into room 5.
    R = np.array([[-1, -1, -1, -1, 0, -1],
                  [-1, -1, -1, 0, -1, 100],
                  [-1, -1, -1, 0, -1, -1],
                  [-1, 0, 0, -1, 0, -1],
                  [0, -1, -1, 0, -1, 100],
                  [-1, 0, -1, -1, 0, 100]])
    n_states = R.shape[0]
    goal = 5

    Q = np.zeros(R.shape)

    # R never changes, so the feasible moves per room are computed once
    # instead of being rebuilt on every step.
    feasible = [[a for a in range(n_states) if R[s, a] >= 0]
                for s in range(n_states)]

    # A dedicated generator is only created when a seed is requested, so
    # callers that seed the global ``random`` module keep their behaviour.
    rng = random if seed is None else random.Random(seed)

    for _ in range(episodes):
        state = rng.randint(0, n_states - 1)
        while True:
            # The chosen action is the index of the room moved into.
            next_state = rng.choice(feasible[state])
            Q[state, next_state] = (R[state, next_state]
                                    + gamma * Q[next_state].max())
            state = next_state
            # The episode ends as soon as the agent has moved into the goal.
            if state == goal:
                break
    return Q
# Extract the greedy (highest-value) path from the Q-table; the path is returned, not printed
def evalNet(Q_matrix, roomStart, goalRoom=5, maxPathLen=100):
    """Follow the greedy policy of a Q-table from a starting room.

    At every step the room with the largest Q-value in the current room's row
    is entered. When several rooms share the largest value, the one with the
    lowest index is chosen.

    Args:
        Q_matrix: Q-table indexable as ``Q_matrix[room][next_room]`` (a list
            of lists or a 2-D array).
        roomStart: Index of the room the walk starts in.
        goalRoom: Index of the room at which the walk stops (default 5).
        maxPathLen: Upper bound on the length of the returned path; guards
            against endless loops when the table never leads to the goal
            (default 100).

    Returns:
        The list of visited room indices, beginning with ``roomStart``. It
        ends with ``goalRoom`` unless the length cap was reached first.
    """
    currentRoom = roomStart
    path = [currentRoom]
    while currentRoom != goalRoom and len(path) < maxPathLen:
        row = Q_matrix[currentRoom]
        # max() returns the first maximal index, matching a strict ">" scan.
        currentRoom = max(range(len(row)), key=lambda room: row[room])
        path.append(currentRoom)
    return path
if __name__ == '__main__':
    # Train the Q-table and show it.
    Q = leanQ()
    print(Q)

    # Acceptable greedy paths to room 5 for every starting room.
    # From room 3 the routes through room 1 and through room 4 have the same
    # length, and their learned Q-values only agree approximately after a
    # finite number of random updates, so either route is a correct answer
    # (the same holds for room 2, which must pass through room 3).
    expectedPaths = {
        0: ([0, 4, 5],),
        1: ([1, 5],),
        2: ([2, 3, 1, 5], [2, 3, 4, 5]),
        3: ([3, 1, 5], [3, 4, 5]),
        4: ([4, 5],),
        5: ([5],),
    }
    for roomStart, acceptable in expectedPaths.items():
        workRoomList = evalNet(Q, roomStart)
        print("所在房间为{}时候,路径:".format(roomStart), workRoomList)
        assert workRoomList in acceptable