Reading Notes: Algorithms for Decision Making (11)

IV. State Uncertainty (2)

Even in finite settings, exact solutions to POMDP problems are often hard to compute, so it is reasonable to consider approximate solution methods. This part proceeds from offline approximations to online approximations, which is the usual progression for approximation methods.


3. Offline Belief-State Planning

3.1 QMDP

A simple offline approximation technique is QMDP, which is based on the action-value function of the corresponding fully observable MDP. Its core iteration is

$$\alpha_{a}^{(k+1)}(s) = R(s,a) + \gamma \sum_{s'} T(s' \mid s, a) \max_{a'} \alpha_{a'}^{(k)}(s').$$

struct QMDP
	k_max # maximum number of iterations
end

function update(𝒫::POMDP, M::QMDP, Γ)
	𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
	Γ′ = [[R(s,a) + γ*sum(T(s,a,s′)*maximum(α′[j] for α′ in Γ)
			for (j,s′) in enumerate(𝒮)) for s in 𝒮] for a in 𝒜]
	return Γ′
end

function solve(M::QMDP, 𝒫::POMDP)
	Γ = [zeros(length(𝒫.𝒮)) for a in 𝒫.𝒜]
	Γ = alphavector_iteration(𝒫, M, Γ)
	return AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
end
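The `solve` functions in this part hand the initial alpha vectors to a helper `alphavector_iteration` that is not reproduced in these notes. A minimal sketch consistent with how it is called here (it simply applies the method-specific `update` rule `k_max` times) would be:

function alphavector_iteration(𝒫::POMDP, M, Γ)
	# repeatedly apply the method's update rule (QMDP, FastInformedBound, PBVI, ...)
	for k in 1:M.k_max
		Γ = update(𝒫, M, Γ)
	end
	return Γ
end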
3.1.1 Fast Informed Bound

The fast informed bound takes the observation model into account and therefore yields a tighter upper bound than QMDP. Its core iteration is

$$\alpha_{a}^{(k+1)}(s) = R(s,a) + \gamma \sum_{o} \max_{a'} \sum_{s'} O(o \mid a, s')\, T(s' \mid s, a)\, \alpha_{a'}^{(k)}(s').$$

struct FastInformedBound
	k_max # maximum number of iterations
end

function update(𝒫::POMDP, M::FastInformedBound, Γ)
	𝒮, 𝒜, 𝒪, R, T, O, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O, 𝒫.γ
	Γ′ = [[R(s, a) + γ*sum(maximum(sum(O(a,s′,o)*T(s,a,s′)*α′[j]
			for (j,s′) in enumerate(𝒮)) for α′ in Γ) for o in 𝒪) for s in 𝒮] for a in 𝒜]
	return Γ′
end

function solve(M::FastInformedBound, 𝒫::POMDP)
	Γ = [zeros(length(𝒫.𝒮)) for a in 𝒫.𝒜]
	Γ = alphavector_iteration(𝒫, M, Γ)
	return AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
end
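A hedged usage sketch (assuming a POMDP instance `𝒫` constructed elsewhere, and assuming `AlphaVectorPolicy` stores its alpha vectors in a field `Γ`, as the constructor above suggests): both methods yield upper bounds on the optimal value function, with the fast informed bound being the tighter of the two.

πqmdp = solve(QMDP(100), 𝒫)
πfib = solve(FastInformedBound(100), 𝒫)
b = fill(1/length(𝒫.𝒮), length(𝒫.𝒮)) # uniform belief
# upper bounds on U*(b); the FIB value is no larger than the QMDP value
maximum(α⋅b for α in πqmdp.Γ), maximum(α⋅b for α in πfib.Γ)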
3.1.2 Fast Lower Bounds
  • A common lower bound is the best-action worst-state (BAWS) lower bound (written out as a formula after this list).
    function baws_lowerbound(𝒫::POMDP)
    	𝒮, 𝒜, R, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.γ
    	r = maximum(minimum(R(s, a) for s in 𝒮) for a in 𝒜) / (1-γ)
    	α = fill(r, length(𝒮))
    	return α
    end
    
  • The blind lower bound algorithm, which has a structure similar to QMDP: each alpha vector corresponds to blindly committing to a single action.
    function blind_lowerbound(𝒫, k_max)
    	𝒮, 𝒜, T, R, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
    	Q(s,a,α) = R(s,a) + γ*sum(T(s,a,s′)*α[j] for (j,s′) in enumerate(𝒮))
    	Γ = [baws_lowerbound(𝒫) for a in 𝒜]
    	for k in 1:k_max
    		Γ = [[Q(s,a,α) for s in 𝒮] for (α,a) in zip(Γ, 𝒜)]
    	end
    	return Γ
    end
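Written as a formula (for the BAWS item above), the bound fills every entry of a single alpha vector with the discounted return obtained by repeating the best action from the worst state:

$$\underline{r} = \frac{\max_{a}\min_{s} R(s,a)}{1-\gamma}, \qquad \alpha(s) = \underline{r} \quad \text{for all } s.$$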
    

3.2 Point-Based Value Iteration (PBVI)

Point-based value iteration maintains a finite set of belief points $B$ and performs alpha-vector backups only at those points, producing a lower bound on the optimal value function; for more detail, see the PBVI literature.
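The `backup` function below implements the point-based backup at a belief $b$: for every action $a$ and observation $o$ it selects the best existing alpha vector at the updated belief, combines them into a new alpha vector for $a$, and finally keeps the alpha vector that is best at $b$:

$$\alpha_{a,o} = \arg\max_{\alpha \in \Gamma} \alpha^{\top} \mathrm{Update}(b,a,o), \qquad \alpha_{a}(s) = R(s,a) + \gamma \sum_{o}\sum_{s'} O(o \mid a, s')\, T(s' \mid s, a)\, \alpha_{a,o}(s'), \qquad \alpha_b = \arg\max_{\alpha_a} \alpha_a^{\top} b.$$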

struct PointBasedValueIteration
	B # set of belief points
	k_max # maximum number of iterations
end

function backup(𝒫::POMDP, Γ, b)
	𝒮, 𝒜, 𝒪, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.γ
	R, T, O = 𝒫.R, 𝒫.T, 𝒫.O
	Γa = []
	for a in 𝒜
		Γao = []
		for o in 𝒪
			b′ = update(b, 𝒫, a, o)
			push!(Γao, argmax(α->α⋅b′, Γ))
		end
		α = [R(s, a) + γ*sum(sum(T(s, a, s′)*O(a, s′, o)*Γao[i][j]
				for (j,s′) in enumerate(𝒮)) for (i,o) in enumerate(𝒪)) for s in 𝒮]
		push!(Γa, α)
	end
	return argmax(α->α⋅b, Γa)
end

function update(𝒫::POMDP, M::PointBasedValueIteration, Γ)
	return [backup(𝒫, Γ, b) for b in M.B]
end

function solve(M::PointBasedValueIteration, 𝒫)
	Γ = fill(baws_lowerbound(𝒫), length(𝒫.𝒜))
	Γ = alphavector_iteration(𝒫, M, Γ)
	return LookaheadAlphaVectorPolicy(𝒫, Γ)
end
3.2.1 Randomized PBVI
struct RandomizedPointBasedValueIteration
	B # set of belief points
	k_max # maximum number of iterations
end

function update(𝒫::POMDP, M::RandomizedPointBasedValueIteration, Γ)
	Γ′, B′ = [], copy(M.B)
	while !isempty(B′)
		b = rand(B′)
		α = argmax(α->α⋅b, Γ)
		α′ = backup(𝒫, Γ, b)
		if α′⋅b ≥ α⋅b
			push!(Γ′, α′)
		else
			push!(Γ′, α)
		end
		filter!(b->maximum(α⋅b for α in Γ′) <
		maximum(α⋅b for α in Γ), B′)
	end
	return Γ′
end

function solve(M::RandomizedPointBasedValueIteration, 𝒫)
	Γ = [baws_lowerbound(𝒫)]
	Γ = alphavector_iteration(𝒫, M, Γ)
	return LookaheadAlphaVectorPolicy(𝒫, Γ)
end

3.3 Sawtooth Upper Bound

This method represents the value function with a basis; specifically, the basis is a set of belief-utility pairs, and the value at any other belief is obtained by a sawtooth-shaped interpolation that upper-bounds the optimal value function.

struct SawtoothPolicy
	𝒫 # POMDP problem
	V # dictionary mapping beliefs to utilities
end

function basis(𝒫)
	n = length(𝒫.𝒮)
	e(i) = [j == i ? 1.0 : 0.0 for j in 1:n]
	return [e(i) for i in 1:n]
end

function utility(π::SawtoothPolicy, b)
	𝒫, V = π.𝒫, π.V
	if haskey(V, b)
		return V[b]
	end
	n = length(𝒫.𝒮)
	E = basis(𝒫)
	u = sum(V[E[i]] * b[i] for i in 1:n)
	for (b′, u′) in V
		if b′ ∉ E
			i = argmax([norm(b-e, 1) - norm(b′-e, 1) for e in E])
			w = [norm(b - e, 1) for e in E]
			w[i] = norm(b - b′, 1)
			w /= sum(w)
			w = [1 - wi for wi in w]
			α = [V[e] for e in E]
			α[i] = u′
			u = min(u, w⋅α)
		end
	end
	return u
end

(π::SawtoothPolicy)(b) = greedy(π, b).a
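A hedged usage sketch: seed the sawtooth representation with utilities at the basis (corner) beliefs, for example taken from a fast informed bound policy, and query the resulting upper bound at an arbitrary belief. This mirrors how `solve(::SawtoothHeuristicSearch, ...)` in Section 3.3.2 initializes its upper bound; the belief and iteration count here are illustrative.

πfib = solve(FastInformedBound(10), 𝒫)
V = Dict(e => utility(πfib, e) for e in basis(𝒫)) # utilities at the corner beliefs
πhi = SawtoothPolicy(𝒫, V)
b = fill(1/length(𝒫.𝒮), length(𝒫.𝒮)) # uniform belief
utility(πhi, b) # sawtooth upper bound at b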
3.3.1 Point Selection

Both PBVI and the sawtooth iteration algorithm require a set of belief points $B$. The following functions expand such a set by simulating steps from the existing beliefs.

function randstep(𝒫::POMDP, b, a)
	s = rand(SetCategorical(𝒫.𝒮, b))
	s′, r, o = 𝒫.TRO(s, a)
	b′ = update(b, 𝒫, a, o)
	return b′, r
end

function random_belief_expansion(𝒫, B)
	B′ = copy(B)
	for b in B
		a = rand(𝒫.𝒜)
		b′, r = randstep(𝒫, b, a)
		push!(B′, b′)
	end
	return unique!(B′)
end

function exploratory_belief_expansion(𝒫, B)
	B′ = copy(B)
	for b in B
		best = (b=copy(b), d=0.0)
		for a in 𝒫.𝒜
			b′, r = randstep(𝒫, b, a)
			d = minimum(norm(b - b′, 1) for b in B′)
			if d > best.d
				best = (b=b′, d=d)
			end
		end
		push!(B′, best.b)
	end
	return unique!(B′)
end
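A hedged usage sketch combining expansion with PBVI: grow a belief set by a few rounds of exploratory expansion starting from an initial belief, then run point-based value iteration on the expanded set (the uniform initial belief and iteration counts are illustrative assumptions).

B = [fill(1/length(𝒫.𝒮), length(𝒫.𝒮))] # start from the uniform belief
B = exploratory_belief_expansion(𝒫, B) # one round of expansion; repeat as desired
B = exploratory_belief_expansion(𝒫, B)
π = solve(PointBasedValueIteration(B, 30), 𝒫)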
3.3.2 Sawtooth Heuristic Search
struct SawtoothHeuristicSearch
	b # initial belief
	δ # gap threshold
	d # depth
	k_max # maximum number of iterations
	k_fib # number of iterations for fast informed bound
end

function explore!(M::SawtoothHeuristicSearch, 𝒫, πhi, πlo, b, d=0)
	𝒮, 𝒜, 𝒪, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.γ
	ϵ(b′) = utility(πhi, b′) - utility(πlo, b′)
	if d ≥ M.d || ϵ(b) ≤ M.δ / γ^d
		return
	end
	a = πhi(b)
	o = argmax(o -> ϵ(update(b, 𝒫, a, o)), 𝒪)
	b′ = update(b, 𝒫, a, o)
	explore!(M, 𝒫, πhi, πlo, b′, d+1)
	if b′ ∉ basis(𝒫)
		πhi.V[b′] = greedy(πhi, b′).u
	end
	push!(πlo.Γ, backup(𝒫, πlo.Γ, b′))
end

function solve(M::SawtoothHeuristicSearch, 𝒫::POMDP)
	πfib = solve(FastInformedBound(M.k_fib), 𝒫)
	Vhi = Dict(e => utility(πfib, e) for e in basis(𝒫))
	πhi = SawtoothPolicy(𝒫, Vhi)
	πlo = LookaheadAlphaVectorPolicy(𝒫, [baws_lowerbound(𝒫)])
	for i in 1:M.k_max
		explore!(M, 𝒫, πhi, πlo, M.b)
		if utility(πhi, M.b) - utility(πlo, M.b) < M.δ
			break
		end
	end
	return πlo
end

4. Online Belief-State Planning

Online methods derive from tree search. Although they perform a substantial amount of computation at every decision step, they are effective for high-dimensional problems.

  • A simple online strategy is one-step lookahead: it considers every action available from the current belief and estimates its expected value with an approximate value function (a minimal sketch follows this list).
  • Forward search can produce better policies, but its computational complexity grows rapidly with the search horizon.
  • Branch and bound is a more efficient variant of forward search; by maintaining upper and lower bounds on the value function, it can avoid searching certain paths.
  • Sparse sampling is an approximation that reduces the computational burden of iterating over the space of all possible observations.
  • Monte Carlo tree search can be adapted to POMDPs by operating on histories rather than states.
  • Determinized sparse tree search uses particle beliefs with determinized observations, which greatly reduces the size of the search tree.
  • Heuristic search chooses action-observation pairs intelligently, exploring regions where the gap between the upper and lower bounds it maintains on the value function is large.
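As a minimal sketch of the one-step lookahead item above, the greedy action at a belief can be computed by evaluating each action with an approximate utility function `U(b)` (for example a sawtooth upper bound or an alpha-vector lower bound). It assumes the belief-update function `update(b, 𝒫, a, o)` used earlier; the name `greedy_action` is introduced here only for illustration.

function lookahead(𝒫::POMDP, U, b::Vector, a)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	r = sum(R(s,a)*b[i] for (i,s) in enumerate(𝒮)) # expected immediate reward under b
	Posa(o,s,a) = sum(O(a,s′,o)*T(s,a,s′) for s′ in 𝒮) # P(o | s, a)
	Poba(o,b,a) = sum(b[i]*Posa(o,s,a) for (i,s) in enumerate(𝒮)) # P(o | b, a)
	return r + γ*sum(Poba(o,b,a)*U(update(b, 𝒫, a, o)) for o in 𝒪)
end

greedy_action(𝒫, U, b) = argmax(a -> lookahead(𝒫, U, b, a), 𝒫.𝒜)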

5. Controller Abstractions

Controller representations of POMDP policies allow a policy to maintain its own internal state. Compared with the preceding methods, these representations can improve scalability.

5.1 Controllers

A controller is a policy representation that maintains its own internal state. Concretely, it is a graph over a finite set of nodes $X$, with an action-selection distribution $\psi(a \mid x)$ and a successor-selection distribution $\eta(x' \mid x, a, o)$.

mutable struct ControllerPolicy
	𝒫 # problem
	X # set of controller nodes
	ψ # action selection distribution
	η # successor selection distribution
end

function (π::ControllerPolicy)(x)
	𝒜, ψ = π.𝒫.𝒜, π.ψ
	dist = [ψ[x, a] for a in 𝒜]
	return rand(SetCategorical(𝒜, dist))
end

function update(π::ControllerPolicy, x, a, o)
	X, η = π.X, π.η
	dist = [η[x, a, o, x′] for x′ in X]
	return rand(SetCategorical(X, dist))
end

The utility of following a controller policy can be computed by forming an MDP whose state space is $X \times S$. Iterative policy evaluation then sweeps over all node-state pairs $(x, s)$.
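The backup applied when node $x$ is active, which is exactly what `utility(π, U, x, s)` below computes, is

$$U(x,s) = \sum_{a}\psi(a \mid x)\Big[R(s,a) + \gamma \sum_{s'} T(s' \mid s,a) \sum_{o} O(o \mid a, s') \sum_{x'} \eta(x' \mid x,a,o)\, U(x',s')\Big].$$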

function utility(π::ControllerPolicy, U, x, s)
	𝒮, 𝒜, 𝒪 = π.𝒫.𝒮, π.𝒫.𝒜, π.𝒫.𝒪
	T, O, R, γ = π.𝒫.T, π.𝒫.O, π.𝒫.R, π.𝒫.γ
	X, ψ, η = π.X, π.ψ, π.η
	U′(a,s′,o) = sum(η[x,a,o,x′]*U[x′,s′] for x′ in X)
	U′(a,s′) = T(s,a,s′)*sum(O(a,s′,o)*U′(a,s′,o) for o in 𝒪)
	U′(a) = R(s,a) + γ*sum(U′(a,s′) for s′ in 𝒮)
	return sum(ψ[x,a]*U′(a) for a in 𝒜)
end

function iterative_policy_evaluation(π::ControllerPolicy, k_max)
	𝒮, X = π.𝒫.𝒮, π.X
	U = Dict((x, s) => 0.0 for x in X, s in 𝒮)
	for k in 1:k_max
		U = Dict((x, s) => utility(π, U, x, s) for x in X, s in 𝒮)
	end
	return U
end
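A hedged usage sketch: the utility of following the controller `π` from node $x_1 = 1$ under an initial belief `b` is the belief-weighted sum of the evaluated table (the belief and the iteration count are illustrative assumptions).

U = iterative_policy_evaluation(π, 100)
u = sum(b[i]*U[(1, s)] for (i,s) in enumerate(π.𝒫.𝒮)) # expected utility from node 1 under b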

5.2 Implementation Examples

5.2.1 Policy Iteration

Policy iteration starts from an arbitrary initial controller and then alternates between policy evaluation and policy improvement (followed by pruning of dominated nodes).

struct ControllerPolicyIteration
	k_max # number of iterations
	eval_max # number of evaluation iterations
end

function solve(M::ControllerPolicyIteration, 𝒫::POMDP)
	𝒜, 𝒪, k_max, eval_max = 𝒫.𝒜, 𝒫.𝒪, M.k_max, M.eval_max
	X = [1]
	ψ = Dict((x, a) => 1.0 / length(𝒜) for x in X, a in 𝒜)
	η = Dict((x, a, o, x′) => 1.0 for x in X, a in 𝒜, o in 𝒪, x′ in X)
	π = ControllerPolicy(𝒫, X, ψ, η)
	for i in 1:k_max
		prevX = copy(π.X)
		U = iterative_policy_evaluation(π, eval_max)
		policy_improvement!(π, U, prevX)
		prune!(π, U, prevX)
	end
	return π
end

function policy_improvement!(π::ControllerPolicy, U, prevX)
	𝒮, 𝒜, 𝒪 = π.𝒫.𝒮, π.𝒫.𝒜, π.𝒫.𝒪
	X, ψ, η = π.X, π.ψ, π.η
	repeatX𝒪 = fill(X, length(𝒪))
	assign𝒜X′ = vec(collect(product(𝒜, repeatX𝒪...)))
	for ax′ in assign𝒜X′
		x, a = maximum(X) + 1, ax′[1]
		push!(X, x)
		successor(o) = ax′[findfirst(isequal(o), 𝒪) + 1]
		U′(o,s′) = U[successor(o), s′]
		for s in 𝒮
			U[x, s] = lookahead(π.𝒫, U′, s, a)
		end
		for a′ in 𝒜
			ψ[x, a′] = a′ == a ? 1.0 : 0.0
			for (o, x′) in product(𝒪, prevX)
				η[x, a′, o, x′] = x′ == successor(o) ? 1.0 : 0.0
			end
		end
	end
	for (x, a, o, x′) in product(X, 𝒜, 𝒪, X)
		if !haskey(η, (x, a, o, x′))
			η[x, a, o, x′] = 0.0
		end
	end
end
5.2.2 Nonlinear Programming

The policy improvement problem can be cast as a single, large nonlinear program.
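Concretely, the program constructed by `solve` below (matching its objective and constraints) is

$$\begin{aligned} \max_{U,\psi,\eta}\quad & \sum_{s} b(s)\, U(x_1, s) \\ \text{s.t.}\quad & U(x,s) = \sum_{a}\psi(a \mid x)\Big[R(s,a) + \gamma \sum_{s'} T(s' \mid s,a) \sum_{o} O(o \mid a,s') \sum_{x'} \eta(x' \mid x,a,o)\, U(x',s')\Big] \quad \forall x,s \\ & \sum_{a}\psi(a \mid x) = 1 \;\; \forall x, \qquad \sum_{x'}\eta(x' \mid x,a,o) = 1 \;\; \forall x,a,o, \qquad \psi \ge 0,\; \eta \ge 0. \end{aligned}$$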

struct NonlinearProgramming
	b # initial belief
	ℓ # number of nodes
end

function tensorform(𝒫::POMDP)
	𝒮, 𝒜, 𝒪, R, T, O = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O
	𝒮′ = eachindex(𝒮)
	𝒜′ = eachindex(𝒜)
	𝒪′ = eachindex(𝒪)
	R′ = [R(s,a) for s in 𝒮, a in 𝒜]
	T′ = [T(s,a,s′) for s in 𝒮, a in 𝒜, s′ in 𝒮]
	O′ = [O(a,s′,o) for a in 𝒜, s′ in 𝒮, o in 𝒪]
	return 𝒮′, 𝒜′, 𝒪′, R′, T′, O′
end

using JuMP, Ipopt # assumed to be available; the controller NLP below is built with JuMP and solved with Ipopt

function solve(M::NonlinearProgramming, 𝒫::POMDP)
	x1, X = 1, collect(1:M.ℓ)
	𝒫, γ, b = 𝒫, 𝒫.γ, M.b
	𝒮, 𝒜, 𝒪, R, T, O = tensorform(𝒫)
	model = Model(Ipopt.Optimizer)
	@variable(model, U[X,𝒮])
	@variable(model, ψ[X,𝒜] ≥ 0)
	@variable(model, η[X,𝒜,𝒪,X] ≥ 0)
	@objective(model, Max, b⋅U[x1,:])
	@NLconstraint(model, [x=X,s=𝒮],
	U[x,s] == (sum(ψ[x,a]*(R[s,a] + γ*sum(T[s,a,s′]*sum(O[a,s′,o]
		*sum(η[x,a,o,x′]*U[x′,s′] for x′ in X) for o in 𝒪) for s′ in 𝒮)) for a in 𝒜)))
	@constraint(model, [x=X], sum(ψ[x,:]) == 1)
	@constraint(model, [x=X,a=𝒜,o=𝒪], sum(η[x,a,o,:]) == 1)
	optimize!(model)
	ψ′, η′ = value.(ψ), value.(η)
	return ControllerPolicy(𝒫, X,
	Dict((x, 𝒫.𝒜[a]) => ψ′[x, a] for x in X, a in 𝒜),
	Dict((x, 𝒫.𝒜[a], 𝒫.𝒪[o], x′) => η′[x, a, o, x′] 
		for x in X, a in 𝒜, o in 𝒪, x′ in X))
end
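A hedged usage sketch (the uniform initial belief and the controller size of three nodes are illustrative assumptions):

b = fill(1/length(𝒫.𝒮), length(𝒫.𝒮)) # initial belief
π = solve(NonlinearProgramming(b, 3), 𝒫) # fixed-size controller with three nodes
x = 1 # start in the first controller node
a = π(x) # sample an action from ψ(⋅ | x)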

Summary

From single decisions to sequential decisions, the whole development consists of introducing iteration and then making that iteration better. From deterministic states to model uncertainty and state uncertainty, it is a process of using parameterized representations to approximate the problem and thereby obtain policy solutions.
