I have recently been studying reinforcement learning, so I am writing down my notes and takeaways here. My knowledge is still limited, so mistakes are inevitable; corrections and feedback are very welcome.
1. Model-Free Policy Evaluation
Model-free policy evaluation means computing the state values under a given policy when part of the Markov decision process (for example, the transition probabilities and rewards) is unknown.
There are two main approaches to model-free policy evaluation: the Monte Carlo method and temporal-difference learning.
2. The Monte Carlo Method
The Monte Carlo method estimates expectations from samples, so we can generate sample trajectories and use them to estimate state values.
Following the policy π whose state values we want to evaluate, we explore the environment and collect a complete "state-action-reward" sequence of the form s_1, a_1, r_1, s_2, a_2, r_2, ..., s_T, a_T, r_T, which ends when a terminal state is reached.
Then, for every occurrence of a state s_t in the sequence, we compute the discounted return from that point onward: G_t = r_t + γ r_{t+1} + γ² r_{t+2} + ... + γ^(T-t) r_T.
Finally, we go over all states and estimate each state's value as the average of the returns observed for it: V(s) ≈ (sum of the returns G_t recorded at s) / N(s), where N(s) counts how many times s appeared in the samples.
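As a tiny illustration of the return computation (a sketch with made-up rewards and γ, not the values of the robot example below), the returns of one episode can be obtained with a single backward pass:

# Minimal sketch: backward computation of discounted returns for one episode.
# The rewards and gamma below are illustrative values, not from the robot example.
gamma = 0.8
rewards = [0.0, -1.0, 1.0]      # r_1, r_2, r_3 of a hypothetical 3-step episode

returns = [0.0] * len(rewards)
g = 0.0
for t in reversed(range(len(rewards))):
    g = rewards[t] + gamma * g  # G_t = r_t + gamma * G_{t+1}
    returns[t] = g

print(returns)                  # approximately [-0.16, -0.2, 1.0]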
3. Temporal-Difference Learning
The Monte Carlo method is a perfectly good model-free policy evaluation method, but it can run into trouble in practice: it requires the policy to produce complete "state-action-reward" sequences, yet not every problem has a terminal state, so the Monte Carlo method cannot always be applied.
To address this, temporal-difference (TD) learning was proposed. TD learning exploits the Markov property and only uses information from the next step. It lets the system explore under the guidance of the policy and updates the state values at every step of the exploration, using the following rule: V(s) ← V(s) + α (r + γ V(s') - V(s))
Here s is the current state, s' is the next state, r is the reward received on this step, γ is the discount factor, and α is the learning rate.
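For example, with illustrative numbers (not taken from the example below): if V(s) = 0, the agent receives r = 1, the next state currently has V(s') = 0.5, and γ = 0.8, α = 0.2, then the update gives V(s) = 0 + 0.2 × (1 + 0.8 × 0.5 - 0) = 0.28.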
4. Code
Below, using the same example as before of a robot searching for the triangle, we use the Monte Carlo method and TD learning to compute the value of each state under a random policy.
The code is as follows:
—Mdp.py
# -*- coding:utf-8 -*-
import random


class Mdp:
    def __init__(self):
        # States
        self.state = [1, 2, 3, 4, 5, 6, 7, 8]
        # Terminal states
        self.terminalstate = dict()
        self.terminalstate[6] = True
        self.terminalstate[7] = True
        self.terminalstate[8] = True
        # Actions: north, east, south, west
        self.action = ["n", "e", "s", "w"]
        # Rewards, keyed by "state_action"
        self.reward = dict()
        self.reward["2_s"] = -1
        self.reward["3_s"] = -1
        self.reward["4_s"] = 1
        # State transitions, keyed by "state_action"
        self.t = dict()
        self.t["1_e"] = 2
        self.t["2_w"] = 1
        self.t["2_e"] = 3
        self.t["2_s"] = 6
        self.t["3_w"] = 2
        self.t["3_e"] = 4
        self.t["3_s"] = 7
        self.t["4_w"] = 3
        self.t["4_e"] = 5
        self.t["4_s"] = 8
        self.t["5_w"] = 4
        # Discount factor gamma
        self.gamma = 0.8

    def transform(self, state, action):
        # Take one step from state with action; return (is_terminal, next_state, reward).
        if state in self.terminalstate:
            return True, state, 0
        key = "%d_%s" % (state, action)
        if key in self.t:
            next_state = self.t[key]
        else:
            next_state = state
        is_terminal = next_state in self.terminalstate
        reward = 0.0
        if key in self.reward:
            reward = self.reward[key]
        return is_terminal, next_state, reward

    def generate_randompi_sample(self, num):
        # Generate num episodes under a uniformly random policy.
        state_sample = []
        action_sample = []
        reward_sample = []
        for _ in range(num):
            tmp_state = []
            tmp_action = []
            tmp_reward = []
            s = random.choice(self.state)
            is_terminal = False
            while not is_terminal:
                a = random.choice(self.action)
                is_terminal, s1, reward = self.transform(s, a)
                tmp_state.append(s)
                tmp_action.append(a)
                tmp_reward.append(reward)
                s = s1
            state_sample.append(tmp_state)
            action_sample.append(tmp_action)
            reward_sample.append(tmp_reward)
        return state_sample, action_sample, reward_sample
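To make the "state-action-reward" sequence from section 2 concrete, here is a small usage sketch of the Mdp class (a hypothetical helper script, not one of the files above); the printed episode differs from run to run because the policy is random:

# Usage sketch of the Mdp class defined above (hypothetical, for illustration only).
from Mdp import Mdp

mdp = Mdp()
states, actions, rewards = mdp.generate_randompi_sample(1)

# One possible episode (the actual output is random), for example:
#   states:  [3, 3, 2, 2]
#   actions: ['n', 'w', 'n', 's']
#   rewards: [0.0, 0.0, 0.0, -1]
print(states[0])
print(actions[0])
print(rewards[0])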
—Policy_Evaluation_MC.py
# -*- coding:utf-8 -*-
from Mdp import *


def policy_evaluation_mc(state_sample, action_sample, reward_sample, mdp):
    # Every-visit Monte Carlo policy evaluation.
    V = dict()
    N = dict()
    for state in mdp.state:
        V[state] = 0.0
        N[state] = 0

    for iter1 in range(len(state_sample)):
        # Backward pass: g becomes the discounted return of the whole episode.
        g = 0.0
        for iter2 in range(len(state_sample[iter1]) - 1, -1, -1):
            g *= mdp.gamma
            g += reward_sample[iter1][iter2]

        # Forward pass: accumulate the return observed from each visited state.
        for iter2 in range(len(state_sample[iter1])):
            s = state_sample[iter1][iter2]
            V[s] += g
            N[s] += 1
            # Shift the return so it starts from the next state:
            # G_{t+1} = (G_t - r_t) / gamma
            g = g - reward_sample[iter1][iter2]
            g = g / mdp.gamma

    # Average the accumulated returns for every visited state.
    for state in V:
        if N[state] > 0:
            V[state] = V[state] / N[state]
    return V


if __name__ == "__main__":
    mdp = Mdp()
    state_sample, action_sample, reward_sample = mdp.generate_randompi_sample(1000000)
    v = policy_evaluation_mc(state_sample, action_sample, reward_sample, mdp)
    print("policy_evaluation_mc:")
    for state in range(1, 6):
        print("%d: %f" % (state, v[state]))
—Policy_Evaluation_TD.py
# -*- coding:utf-8 -*-
from Mdp import *


def policy_evaluation_td(state_sample, action_sample, reward_sample, mdp, alpha):
    # TD(0) policy evaluation with learning rate alpha.
    V = dict()
    for state in mdp.state:
        V[state] = 0.0

    for iter1 in range(len(state_sample)):
        for iter2 in range(len(state_sample[iter1])):
            state = state_sample[iter1][iter2]
            # Value of the next state; 0 if this is the last step of the episode.
            if iter2 < len(state_sample[iter1]) - 1:
                next_v = V[state_sample[iter1][iter2 + 1]]
            else:
                next_v = 0.0
            # TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
            V[state] += alpha * (reward_sample[iter1][iter2] + mdp.gamma * next_v - V[state])
    return V


if __name__ == "__main__":
    mdp = Mdp()
    state_sample, action_sample, reward_sample = mdp.generate_randompi_sample(1000000)
    v = policy_evaluation_td(state_sample, action_sample, reward_sample, mdp, 0.2)
    print("policy_evaluation_td:")
    for state in range(1, 6):
        print("%d: %f" % (state, v[state]))
Program output (because of the randomness involved, the results differ slightly from run to run):