In the post on MDP principles and solution methods, we mentioned that there are two common ways to solve an MDP. The previous post covered a Python implementation of the value iteration algorithm; in this post we continue with the other MDP solution algorithm, policy iteration. First, let's review how policy iteration works, as shown in the figure below:
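In outline, policy iteration alternates two steps until the policy stops changing. The following is only a rough sketch of the idea (not the exact pseudocode from the figure), written as comments:

# Sketch of policy iteration (gamma is the discount factor):
# repeat:
#   1. Policy evaluation: given the current policy pi, compute its utility U
#        U(s) = R(s) + gamma * sum_{s'} P(s' | s, pi(s)) * U(s')
#   2. Policy improvement: make pi greedy with respect to U
#        pi(s) = argmax_a  sum_{s'} P(s' | s, a) * U(s')
# until pi is unchanged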
Next comes the implementation. The first step is the same as in the value iteration algorithm: define the state-transition probabilities:
import numpy as np
# Define the state-transition matrices, one per action
upprobolity= [[0.1,0.1,0,0,0.8,0,0,0,0,0,0,0],
[0.1,0.8,0.1,0,0,0,0,0,0,0,0,0],
[0,0.1,0,0.1,0,0,0.8,0,0,0,0,0],
[0,0,0.1,0.1,0,0,0,0.8,0,0,0,0],
[0,0,0,0,0.2,0,0,0,0.8,0,0,0],
[0,0,0,0,0.1,0,0.1,0,0,0.8,0,0],
[0,0,0,0,0,0,0.1,0.1,0,0,0.8,0],
[0,0,0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0.9,0.1,0,0],
[0,0,0,0,0,0,0,0,0.1,0.8,0.1,0],
[0,0,0,0,0,0,0,0,0,0.1,0.8,0.1],
[0,0,0,0,0,0,0,0,0,0,0,0]]
downprobolity = [[0.9,0.1,0,0,0,0,0,0,0,0,0,0],
[0.1,0.8,0.1,0,0,0,0,0,0,0,0,0],
[0,0.1,0.8,0.1,0,0,0,0,0,0,0,0],
[0,0,0.1,0.9,0,0,0,0,0,0,0,0],
[0.8,0,0,0,0.2,0,0,0,0,0,0,0],
[0,0.8,0,0,0.1,0,0.1,0,0,0,0,0],
[0,0,0.8,0,0,0,0.1,0.1,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0.8,0,0,0,0.1,0.1,0,0],
[0,0,0,0,0,0,0,0.8,0.1,0.1,0,0],
[0,0,0,0,0,0,0.8,0,0.1,0,0.1,0],
[0,0,0,0,0,0,0,0,0,0,0,0]]
leftprobolity = [[0.9,0,0,0,0.1,0,0,0,0,0,0,0],
[0.8,0.2,0,0,0,0,0,0,0,0,0,0],
[0,0.8,0.1,0,0,0,0.1,0,0,0,0,0],
[0,0,0.8,0.1,0,0,0,0.1,0,0,0,0],
[0.1,0,0,0,0.8,0,0,0,0.1,0,0,0],
[0,0.1,0,0,0.8,0,0,0,0,0.1,0,0],
[0,0,0.1,0,0,0,0.8,0,0,0,0.1,0],
[0,0,0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0.1,0,0,0,0.9,0,0,0],
[0,0,0,0,0,0,0,0,0.8,0.2,0,0],
[0,0,0,0,0,0,0.1,0,0,0.8,0.1,0],
[0,0,0,0,0,0,0,0,0,0,0,0]]
rightprobolity = [[0.1,0.8,0,0,0.1,0,0,0,0,0,0,0],
[0,0.2,0.8,0,0,0,0,0,0,0,0,0],
[0,0,0.1,0.8,0,0,0.1,0,0,0,0,0],
[0,0,0,0.9,0,0,0,0.1,0,0,0,0],
[0.1,0,0,0,0.8,0,0,0,0.1,0,0,0],
[0,0.1,0,0,0,0,0.8,0,0,0.1,0,0],
[0,0,0.1,0,0,0,0,0.8,0,0,0.1,0],
[0,0,0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0.1,0,0,0,0.1,0.8,0,0],
[0,0,0,0,0,0,0,0,0,0.2,0.8,0],
[0,0,0,0,0,0,0.1,0,0,0,0.1,0.8],
[0,0,0,0,0,0,0,0,0,0,0,0]]
probobility = [upprobolity, downprobolity, leftprobolity, rightprobolity]
P = np.array(probobility)
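As a quick sanity check (not in the original post), P has shape (4, 12, 12) and is indexed as P[action, state, next_state]. Every row should sum to 1, except the rows of the two terminal states (8 and 12), which were deliberately left as all zeros:

print(P.shape)                   # (4, 12, 12): P[action, state, next_state]
print(P.sum(axis=2).round(2))    # each row sums to 1, except states 8 and 12 (terminal), which are all zeros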
Next, initialize the variables:
States = [1,2,3,4,5,6,7,8,9,10,11,12]
Actions = ['up', 'down', 'left', 'right']
Rewards = [-0.04,-0.04, -0.04, -0.04,
-0.04,-10000,-0.04, -1,
-0.04,-0.04, -0.04, 1]
ActionIDs = [0,1,2,3]  # correspond to up, down, left, right respectively
U = np.zeros(12)
r = 0.9  # discount factor
pi = np.random.choice(ActionIDs, 12)  # pick a random action for each of the 12 states
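Just to see the starting point, the random initial policy can be printed the same way the final result is printed later (purely illustrative):

print([Actions[a] for a in pi])  # e.g. ['down', 'left', ...]; differs on every run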
Here we use modified policy iteration to implement the policy evaluation step; the code is as follows:
# k is an iteration count we set ourselves, used to approximate U
def policy_evaluation(pi, U, k=20):
    """Return an updated utility mapping U from each state in the MDP to its
    utility, using an approximation (modified policy iteration)."""
    for i in range(k):
        for s in States:
            actionID = pi[s-1]  # action prescribed by the current policy in state s
            index = s-1
            U[s-1] = Rewards[s-1] + r*((P[actionID, index, :] * U).sum())
    return U
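The same update can also be written in a more compact, vectorized form. The sketch below updates all 12 states simultaneously instead of in place, so it is not byte-for-byte identical to the loop above, but it converges to the same utilities under the same data layout:

def policy_evaluation_vectorized(pi, U, k=20):
    """Modified policy evaluation, updating all 12 states at once."""
    R = np.array(Rewards)
    for _ in range(k):
        # P[pi, np.arange(12), :] picks, for every state, the transition row of its current action
        U = R + r * (P[pi, np.arange(12), :] @ U)
    return U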
Finally, we can solve for the result:
while True:
    U = policy_evaluation(pi, U)
    unchanged = True
    for s in States:
        index = s-1
        # expected utility of taking each of the four actions in state s
        action_rewards = (P[:, index, :] * U).sum(axis=1)
        actionID = np.argmax(action_rewards)  # actionID is one of the ActionIDs (0-3)
        if actionID != pi[s-1]:
            pi[s-1] = actionID
            unchanged = False
    if unchanged:
        print([Actions[acid] for acid in pi])
        break
'''
output: ['right', 'right', 'up', 'left',
'up', 'right', 'up', 'up',
'right', 'right', 'right', 'up']
'''
The result is visualized as follows:
Note that the figure above differs from the value iteration result in one place: here the optimal action in grid cell (1,1) is right rather than up.
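If you want to check the result without the figure, the policy can also be printed as a 3x4 grid of arrows. The little helper below assumes that states 1-4 form the bottom row, 5-8 the middle row and 9-12 the top row (consistent with the Rewards vector: the obstacle at state 6, the -1 pit at state 8 and the +1 goal at state 12); the arrows dictionary is only for illustration:

arrows = {'up': '^', 'down': 'v', 'left': '<', 'right': '>'}
for row in (States[8:12], States[4:8], States[0:4]):  # print the top row first
    print(' '.join(arrows[Actions[pi[s-1]]] for s in row))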
Postscript:
Well, that wraps up the three blog posts on MDPs. I actually started out intending to write about reinforcement learning, but found that many posts online don't explain these topics very clearly, so I went back and worked through the relevant chapters of the original book, Artificial Intelligence: A Modern Approach, myself (there is also a Python implementation of the book's code online; if you're interested, click here). On the whole this helped me sort out some of the points that had confused me when reading other people's articles, although I don't think the first post of the series, the one on the theory, turned out very well. Ah well, it is what it is for now! Next, building on MDPs, I'll write a few more posts on reinforcement learning. Finally, once more, the links to the earlier posts in the MDP series: