#作业,不完全原创
一、实验目的
(1)掌握基于贪心策略的Q-learning算法的基本原理及过程
(2)掌握使用Python实现基于贪心策略的Q-learning算法
(3)使用基于贪心策略的Q-learning算法使机器人走出迷宫
二、实验环境
(1)中文Windows 10操作系统
(2)Python3
三、实验内容
(1)掌握基于贪心策略的Q-learning算法的基本原理及过程
(2)程序框架设置
(3)程序详细实现
(4)程序测试与性能分析
# coding=utf-8
# Author :SuY
# @Time : 2021/6/10 0010 21:10
# @File : q-learning.py
# @Software: PyCharm
# @contact: suyang9185@foxmail.com
# Description:
"""
机器人九宫格Python3语言实现, 基于贪心策略的q-learning算法.
"""
# 正文
#coding:utf-8
import random
# Step size used when blending the old Q-value with the new estimate.
alpha = 0.5
# Discount multiplier applied to the value of the next state.
gamma = 0.99
# State name -> [row, col] on the 3x3 grid ('s1' top-left ... 's9' bottom-right).
# 'sd' is the state assigned when a move leaves the grid; it carries the
# sentinel coordinates [-1, -1].
status = {'s1':[0,0],'s2':[0,1],'s3':[0,2],'s4':[1,0],'s5':[1,1],'s6':[1,2],'s7':[2,0],'s8':[2,1],'s9':[2,2],'sd':[-1,-1]}
# Action index -> move name: 0 is "下" (down), 1 is "右" (right).
actions = {0:"下",1:"右"}
def init(rows=3, cols=3):
    """Build the initial Q-table.

    The table is a ``rows`` x ``cols`` nested list.  Every cell holds its own
    two-element list ``[1, -2]``: one Q-value per action index (0 and 1).

    Args:
        rows: Number of grid rows (defaults to 3, the original grid size).
        cols: Number of grid columns (defaults to 3).

    Returns:
        A list of ``rows`` lists, each containing ``cols`` independent
        ``[1, -2]`` lists.
    """
    initial_q = [1, -2]
    # Copy the template for every cell so that updating one state's
    # Q-values never leaks into another state.
    return [[list(initial_q) for _ in range(cols)] for _ in range(rows)]
def reword(s):
    """Return the immediate reward for arriving in state ``s``.

    ``s`` is looked up in the module-level ``status`` table to obtain its
    ``[row, col]`` pair.  The reward is 1 when ``row * 3 + col + 1`` equals 9
    (the bottom-right cell of the 3x3 grid), -1 when ``row + col`` equals -2
    (the ``[-1, -1]`` sentinel stored for ``'sd'``), and 0 otherwise.

    Raises:
        KeyError: If ``s`` is not a key of ``status``.
    """
    position = status[s]
    row, col = position[0], position[1]
    if row * 3 + col + 1 == 9:
        return 1
    if row + col == -2:
        return -1
    return 0
def choise_action(s, q):
    """Pick the greedy action for state ``s``.

    Looks up the ``[row, col]`` of ``s`` in ``status``, reads that cell of the
    Q-table ``q`` and selects the action with the largest Q-value (the first
    one on ties, because ``list.index`` returns the first match).

    Note: for ``'sd'`` the coordinates are ``[-1, -1]``, so Python's negative
    indexing reads the last cell of the last row of ``q``.

    Returns:
        A tuple ``(action_index, s, action_name, q_max)``.
    """
    position = status[s]
    cell_values = q[position[0]][position[1]]
    # The largest Q-value in the cell decides which action is taken.
    best_value = max(cell_values)
    best_index = cell_values.index(best_value)
    return (best_index, s, actions[best_index], best_value)
def choise_action_random(s, q):
    """Pick an exploratory (non-greedy) action for state ``s``.

    The greedy action for the cell is determined first; the returned action
    is then drawn uniformly from all *other* action indices.  If there is no
    other action to choose from, the greedy action is returned instead.

    Returns:
        A tuple ``(action_index, s, action_name, q_max)`` where
        ``action_index`` and ``action_name`` both describe the action that was
        actually chosen, and ``q_max`` is the largest Q-value stored for the
        cell (not necessarily the Q-value of the chosen action).
    """
    position = status[s]
    cell_values = q[position[0]][position[1]]
    q_max = max(cell_values)
    greedy_index = cell_values.index(q_max)
    candidates = [i for i in range(len(actions)) if i != greedy_index]
    # Return the index of the action that is really taken.  Reporting the
    # greedy index here would make the caller move with one action while
    # updating the Q-value of a different one.
    action_index = random.choice(candidates) if candidates else greedy_index
    return (action_index, s, actions[action_index], q_max)
def next_state(row, col, move, size=3):
    """Return the name of the state reached by taking ``move`` from a cell.

    Args:
        row: Zero-based row of the current cell.
        col: Zero-based column of the current cell.
        move: ``"下"`` moves one row down; any other value moves one column
            to the right.
        size: Side length of the square grid (3 for the 3x3 grid).

    Returns:
        ``'s<k>'`` with ``k = row * size + col + 1`` for the new cell, or
        ``'sd'`` when the move leaves the grid.
    """
    if move == "下":
        row += 1
    else:
        col += 1
    # Check row and column separately: testing only the flattened cell number
    # would let a "right" move from the last column wrap onto the next row.
    if row >= size or col >= size:
        return 's' + 'd'
    return 's' + str(row * size + col + 1)


def main(episodes=120, greed=0.1):
    """Train the Q-table with an epsilon-greedy policy and print progress.

    Each episode starts in ``'s1'`` and runs until the goal ``'s9'`` or the
    out-of-grid state ``'sd'`` is reached.  With probability ``1 - greed`` the
    greedy action is taken, otherwise an exploratory one.

    Args:
        episodes: Maximum number of episodes to run.
        greed: Probability of taking an exploratory action.
    """
    q = init()
    for _ in range(episodes):
        print(q)
        s = 's1'
        my_way = []
        while s != 's9' and s != 'sd':
            s_index = status[s]
            if random.random() < (1 - greed):
                action = choise_action(s, q)
            else:
                action = choise_action_random(s, q)
            my_way.append(action[1:3])

            s = next_state(s_index[0], s_index[1], action[2])
            if s != 'sd':
                print(s)

            # Terminal states have no future return.  Looking them up in the
            # table would be wrong: 'sd' maps to [-1, -1], which indexes the
            # goal cell through negative indexing.
            if s == 's9' or s == 'sd':
                next_value = 0
            else:
                next_value = choise_action(s, q)[-1]

            # Q-learning update; the discount is applied exactly once.
            old_value = q[s_index[0]][s_index[1]][action[0]]
            q[s_index[0]][s_index[1]][action[0]] = old_value + alpha * (
                reword(s) + gamma * next_value - old_value
            )

        # NOTE(review): the nesting of this check was lost together with the
        # file's indentation; it is read here as "stop training after the
        # first episode that reaches the goal" - confirm against the
        # intended behaviour.
        if s == 's9':
            print('the last way:', my_way)
            break


if __name__ == '__main__':
    main()