""" 马尔可夫决策过程即 Markov Decision Process,简称 MDP,是强化学习的核心概念。"""
""" 马尔可夫奖励过程回报 """
import numpy as np

np.random.seed(0)

# State-transition matrix of the Markov reward process (6 states s1..s6).
# BUG FIX: row 1 originally read [0.9,0,1,0.0,0.0,0.0,0.0] — seven entries,
# because "0,1" was a typo for "0.1". The ragged matrix cannot form a valid
# 6x6 array; corrected to match the matrix re-defined later in this file.
P = np.array([
    [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],
    [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],
    [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
])
# Reward received in each state s1..s6.
rewards = [-1, -2, -2, 10, 1, 0]
# Discount factor.
gamma = 0.5
# Given a state sequence, compute the return obtained from a starting index
# (initial state) through the end of the sequence (terminal state).
def compute_return(start_index, chain, gamma, rewards=(-1, -2, -2, 10, 1, 0)):
    """Return the discounted return of `chain` starting at `start_index`.

    Args:
        start_index: index into `chain` where accumulation starts.
        chain: sequence of 1-based state indices, e.g. [1, 2, 3, 6].
        gamma: discount factor.
        rewards: per-state rewards indexed by state-1. Defaults to the MRP
            rewards used throughout this script; generalized from the
            original's implicit dependence on the module-level `rewards`.

    Returns:
        The discounted return G as a number.
    """
    G = 0
    # Accumulate backwards so each step applies gamma exactly once:
    # G_t = r_t + gamma * G_{t+1}.
    for i in reversed(range(start_index, len(chain))):
        G = gamma * G + rewards[chain[i] - 1]
    return G
# Example episode: s1 -> s2 -> s3 -> s6; compute its return from the start.
chain = [1, 2, 3, 6]
start_index = 0
G = compute_return(start_index, chain, gamma)
print("根据本序列计算得到回报为:%s。" % G)
""" 马尔可夫奖励过程价值函数 """
P=[[0.9,0.1,0.0,0.0,0.0,0.0],
[0.5,0.0,0.5,0.0,0.0,0.0],
[0.0,0.0,0.0,0.6,0.0,0.4],
[0.0,0.0,0.0,0.0,0.3,0.7],
[0.0,0.2,0.3,0.5,0.0,0.0],
[0.0,0.0,0.0,0.0,0.0,1.0]]
P=np.array(P)
gamma=0.5
# rewards=[-2.01950168,-2.21451846,1.16142785,10.53809283,3.58728554,0.0]
rewards=[-1,-2,-2,10,1,0]
def compute(P, rewards, gamma, state_num=None):
    """Solve the Bellman equation of an MRP in closed form.

    V = (I - gamma * P)^{-1} R, where R is the column vector of rewards.

    Args:
        P: (n, n) state-transition matrix (NumPy array).
        rewards: length-n per-state rewards (list or array).
        gamma: discount factor; |gamma| < 1 guarantees I - gamma*P is
            invertible.
        state_num: number of states n. Defaults to P's row count
            (backward-compatible generalization; callers may still pass it).

    Returns:
        (n, 1) column vector of state values.
    """
    if state_num is None:
        state_num = P.shape[0]
    # Write rewards as a column vector.
    R = np.array(rewards).reshape((-1, 1))
    # np.linalg.solve is numerically preferable to forming the explicit
    # inverse; mathematically identical to inv(I - gamma*P) @ R.
    return np.linalg.solve(np.eye(state_num) - gamma * P, R)
# Solve the MRP state values analytically and print them.
value=compute(P,rewards=rewards,gamma=gamma,state_num=6)
print("MRP的状态价值分别为:\n",value)
""" 采用贝尔曼方程进行简单验证 """
""" 对于状态S4来说,当gamma=0.5时 """
""" V(s4)=r(s4)+gamma*V(s5)*P(s5|s4)+gamma*V(s6)*P(s6|s4) =10+0.5*0.3*3.58728554+0.5*0.7*0=10.53809283~=10.54"""
# Sanity check via the Bellman equation for state s4 with gamma = 0.5:
# V(s4) = r(s4) + gamma * (P(s5|s4)*V(s5) + P(s6|s4)*V(s6))
#       = 10 + 0.5*0.3*3.58728554 + 0.5*0.7*0 = 10.53809283 ≈ 10.54
""" 马尔可夫决策过程,即Markov decision process,简称MDP """
S=["s1","s2","s3","s4","s5","s6"]
A=["保持s1","前往s1","前往s2","前往s3","前往s4","前往s5","概率前往"]#动作集合
P={"s1-保持s1-s1":1.0,"s1-前往s2-s2":1.0,"s2-前往s1-s1":1.0,"s2-前往s3-s3":1.0,"s3-前往s4-s4":1.0,"s3-前往s5-s5":1.0,"s4-前往s5-s5":1.0,"s4-概率前往-s2":0.2,"s4-概率前往-s3":0.4,"s4-概率前往-s4":0.4}#状态转移函数
R={"s1-保持s1":-1,"s1-前往s2":0,"s2-前往s1":-1,"s2-前往s3":-2,"s3-前往s4":-2,"s3-前往s5":0,"s4-前往s5":10,"s4-概率前往":1}#奖励函数
gamma=0.5#折扣因子
MDP=(S,A,P,R,gamma)
#策略一,随机策略
pi_1={"s1-保持s1":0.5,"s1-前往s2":0.5,"s2-前往s1":0.5,"s2-前往s3":0.5,"s3-前往s4":0.5,"s3-前往s5":0.5,"s4-前往s5":0.5,"s4-概率前往":0.5}
#策略二
pi_2={"s1-保持s1":0.6,"s1-前往s2":0.4,"s2-前往s1":0.3,"s2-前往s3":0.7,"s3-前往s4":0.5,"s3-前往s5":0.5,"s4-前往s5":0.1,"s4-概率前往":0.9}
# Concatenate two strings with "-" so they can be used as keys of the
# P and R dictionaries defined above.
def join(str1, str2):
    """Return `str1` and `str2` joined by a hyphen, e.g. join("s1", "a") -> "s1-a"."""
    return "-".join((str1, str2))
# Using the relation between an MDP and an MRP, convert the MDP (under the
# policy above) into an MRP; the converted transition matrix and reward
# function are given directly.
P_from_mdp_to_mrp = np.array([
    [0.5, 0.5, 0.0, 0.0, 0.0],
    [0.5, 0.0, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.5, 0.5],
    [0.0, 0.1, 0.2, 0.2, 0.5],
    [0.0, 0.0, 0.0, 0.0, 1.0],
])
R_from_mdp_to_mrp = [-0.5, -1.5, -1.0, 5.5, 0]
V = compute(P_from_mdp_to_mrp, R_from_mdp_to_mrp, gamma=gamma, state_num=5)
print("MRP的状态价值分别为:\n", V)
# The closed-form MRP solution does not scale well when the state/action
# sets are large.
# 强化学习学习程序笔记记录二(马尔可夫决策过程)
# (Reinforcement-learning study notes, part 2: Markov decision processes;
#  first published 2024-07-25 18:10:33. Commented out — bare prose here was
#  a syntax error in the Python file.)