Reading Notes: Algorithms for Decision Making (10)


IV. State Uncertainty (1)

The preceding parts dealt with two kinds of uncertainty: uncertainty in the outcome state and uncertainty in the model. This part extends the treatment to uncertainty in the state itself. Specifically, the observations the agent receives are only probabilistically related to the state rather than revealing it exactly. Such problems can be modeled as partially observable Markov decision processes (POMDPs). Solving a POMDP optimally is difficult in general, so additional approximation strategies are needed.


1. Beliefs

A common approach to solving a POMDP is to infer a belief distribution over the underlying state at the current time step and then apply a policy that maps beliefs to actions. The following discusses both parametric and nonparametric belief representations.

In a POMDP, the agent receives an observation from an observation space $\mathcal{O}$. Given the action $a$ taken and the resulting state $s'$, the probability of observing $o$ is $O(o \mid a, s')$.

struct POMDP
	γ # discount factor
	𝒮 # state space
	𝒜 # action space
	𝒪 # observation space
	T # transition function
	R # reward function
	O # observation function
	TRO # sample transition, reward, and observation
end
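
As a running illustration (this tiny problem is my own, not from the book), here is a hypothetical two-state instance in this representation, with T, R, and O supplied as plain functions returning probabilities and rewards; the TRO sampler is not needed for the filters below and is left as `nothing`.

# Hypothetical two-state maintenance problem: a machine is :healthy or :faulty,
# the agent can :ignore or :repair, and it observes :ok or :alarm.
𝒮 = [:healthy, :faulty]
𝒜 = [:ignore, :repair]
𝒪 = [:ok, :alarm]

function T(s, a, s′)
	if a == :repair
		return s′ == :healthy ? 0.95 : 0.05
	elseif s == :healthy
		return s′ == :healthy ? 0.9 : 0.1
	else
		return s′ == :faulty ? 1.0 : 0.0
	end
end

R(s, a) = (s == :faulty ? -10.0 : 0.0) + (a == :repair ? -1.0 : 0.0)

function O(a, s′, o)
	p_alarm = s′ == :faulty ? 0.8 : 0.1
	return o == :alarm ? p_alarm : 1.0 - p_alarm
end

𝒫 = POMDP(0.9, 𝒮, 𝒜, 𝒪, T, R, O, nothing)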

Recursive Bayesian estimation can be used to infer and update the belief over the current state, given the most recent action and observation. We write $b(s)$ for the probability assigned to state $s$; a particular belief $b$ belongs to the belief space $\mathcal{B}$.

1.1 Discrete State Filter

If both the state space and the observation space are finite, a discrete state filter can perform this inference exactly.

using LinearAlgebra   # provides normalize!; later blocks also use I, cholesky, and ⋅

function update(b::Vector{Float64}, 𝒫, a, o)
	𝒮, T, O = 𝒫.𝒮, 𝒫.T, 𝒫.O
	b′ = similar(b)
	for (i′, s′) in enumerate(𝒮)
		po = O(a, s′, o)
		b′[i′] = po * sum(T(s, a, s′) * b[i] for (i, s) in enumerate(𝒮))
	end
	if sum(b′) ≈ 0.0
		fill!(b′, 1) # impossible observation: fall back to a uniform belief
	end
	return normalize!(b′, 1)
end
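
Continuing the hypothetical two-state problem above, a single update after acting :ignore and observing :alarm shifts probability mass toward the faulty state:

b = [0.5, 0.5]                      # uniform belief over [:healthy, :faulty]
b′ = update(b, 𝒫, :ignore, :alarm)  # ≈ [0.09, 0.91]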

The success of belief updating depends on having accurate observation and transition models. When these models are not well known, it is advisable to use simplified models with more dispersed distributions, which helps prevent the overconfidence that makes the state estimate brittle.

1.2 Kalman Filter

Under certain assumptions about the system, the Kalman filter provides a simple iterative solution to the continuous belief update
$$b'(s') \propto O(o \mid a, s') \int T(s' \mid s, a)\, b(s)\, \mathrm{d}s.$$

Assuming $T$ and $O$ are linear Gaussian and $b$ is Gaussian,
$$\begin{aligned}
T(s' \mid s, a) &= \mathcal{N}(s' \mid \mathbf{T}_s s + \mathbf{T}_a a, \Sigma_s),\\
O(o \mid s') &= \mathcal{N}(o \mid \mathbf{O}_s s', \Sigma_o),\\
b(s) &= \mathcal{N}(s \mid \mu_b, \Sigma_b),
\end{aligned}$$
the Kalman filter performs the belief update with a predict step followed by a correction step, as implemented below.

struct KalmanFilter
	μb # mean vector
	Σb # covariance matrix
end

function update(b::KalmanFilter, 𝒫, a, o)
	μb, Σb = b.μb, b.Σb
	Ts, Ta, Os = 𝒫.Ts, 𝒫.Ta, 𝒫.Os
	Σs, Σo = 𝒫.Σs, 𝒫.Σo
	# predict step
	μp = Ts*μb + Ta*a
	Σp = Ts*Σb*Ts' + Σs
	# update step
	Σpo = Σp*Os'
	K = Σpo/(Os*Σp*Os' + Σo)
	μb′ = μp + K*(o - Os*μp)
	Σb′ = (I - K*Os)*Σp
	return KalmanFilter(μb′, Σb′)
end
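
As a quick sanity check (the container type and field names below are my own assumptions, mirroring the fields the update above reads from 𝒫), a scalar linear-Gaussian system can be filtered as follows:

# Hypothetical container for a scalar linear-Gaussian problem.
struct LinearGaussianProblem
	Ts # state-transition matrix
	Ta # control-input matrix
	Os # observation matrix
	Σs # transition noise covariance
	Σo # observation noise covariance
end

𝒫_lin = LinearGaussianProblem(fill(1.0,1,1), fill(0.1,1,1), fill(1.0,1,1),
                              fill(0.01,1,1), fill(0.25,1,1))

b = KalmanFilter([0.0], fill(1.0,1,1))  # broad initial belief
b′ = update(b, 𝒫_lin, [1.0], [0.3])     # take action a = [1.0], observe o = [0.3]
# b′.μb lies between the predicted mean (0.1) and the observation (0.3);
# b′.Σb is smaller than the prior covariance
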
1.2.1 Extended Kalman Filter

The extended Kalman filter (EKF) is a straightforward extension of the Kalman filter to nonlinear dynamics with Gaussian noise:
$$\begin{aligned}
T(s' \mid s, a) &= \mathcal{N}(s' \mid f_T(s, a), \Sigma_s),\\
O(o \mid s') &= \mathcal{N}(o \mid f_O(s'), \Sigma_o),\\
b(s) &= \mathcal{N}(s \mid \mu_b, \Sigma_b).
\end{aligned}$$
The nonlinear functions $f_T$ and $f_O$ are linearized about the current mean using their Jacobians at each update.

struct ExtendedKalmanFilter
	μb # mean vector
	Σb # covariance matrix
end

import ForwardDiff: jacobian
function update(b::ExtendedKalmanFilter, 𝒫, a, o)
	μb, Σb = b.μb, b.Σb
	fT, fO = 𝒫.fT, 𝒫.fO
	Σs, Σo = 𝒫.Σs, 𝒫.Σo
	# predict
	μp = fT(μb, a)
	Ts = jacobian(s->fT(s, a), μb)
	Os = jacobian(fO, μp)
	Σp = Ts*Σb*Ts' + Σs
	# update
	Σpo = Σp*Os'
	K = Σpo/(Os*Σp*Os' + Σo)
	μb′ = μp + K*(o - fO(μp))
	Σb′ = (I - K*Os)*Σp
	return ExtendedKalmanFilter(μb′, Σb′)
end
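
Similarly, a mildly nonlinear scalar system can be filtered with the EKF; the problem container and its contents here are again hypothetical, chosen only to match the fields the update reads:

# Hypothetical container for a nonlinear problem with Gaussian noise.
struct NonlinearProblem
	fT # nonlinear dynamics function
	fO # nonlinear observation function
	Σs # transition noise covariance
	Σo # observation noise covariance
end

fT(s, a) = s .+ 0.1 .* sin.(s) .+ a  # mildly nonlinear dynamics
fO(s′) = s′ .^ 2                     # quadratic observation
𝒫_nl = NonlinearProblem(fT, fO, fill(0.01,1,1), fill(0.05,1,1))

b = ExtendedKalmanFilter([1.0], fill(0.5,1,1))
b′ = update(b, 𝒫_nl, [0.0], [1.3])   # fT and fO are linearized around the mean
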
1.2.2 Unscented Kalman Filter (UKF)

The UKF also handles nonlinear problems with Gaussian noise [1]. The method is derivative-free and relies on a deterministic sampling strategy (sigma points) to approximate how a distribution is affected by a nonlinear transformation.

struct UnscentedKalmanFilter
	μb # mean vector
	Σb # covariance matrix
	λ # spread parameter
end

function unscented_transform(μ, Σ, f, λ, ws)
	n = length(μ)
	Δ = cholesky((n + λ) * Σ).L
	S = [μ]
	for i in 1:n
		push!(S, μ + Δ[:,i])
		push!(S, μ - Δ[:,i])
	end
	S′ = f.(S)
	μ′ = sum(w*s for (w,s) in zip(ws, S′))
	Σ′ = sum(w*(s - μ′)*(s - μ′)' for (w,s) in zip(ws, S′))
	return (μ′, Σ′, S, S′)
end

function update(b::UnscentedKalmanFilter, 𝒫, a, o)
	μb, Σb, λ = b.μb, b.Σb, b.λ
	fT, fO = 𝒫.fT, 𝒫.fO
	n = length(μb)
	ws = [λ / (n + λ); fill(1/(2(n + λ)), 2n)]
	# predict
	μp, Σp, Sp, Sp′ = unscented_transform(μb, Σb, s->fT(s,a), λ, ws)
	Σp += 𝒫.Σs
	# update
	μo, Σo, So, So′ = unscented_transform(μp, Σp, fO, λ, ws)
	Σo += 𝒫.Σo
	Σpo = sum(w*(s - μp)*(s′ - μo)' for (w,s,s′) in zip(ws, So, So′))
	K = Σpo / Σo
	μb′ = μp + K*(o - μo)
	Σb′ = Σp - K*Σo*K'
	return UnscentedKalmanFilter(μb′, Σb′, λ)
end
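
Reusing the hypothetical nonlinear problem 𝒫_nl from the EKF example, the UKF runs the same way; the only extra ingredient is the sigma-point spread parameter λ.

b = UnscentedKalmanFilter([1.0], fill(0.5,1,1), 2.0)
b′ = update(b, 𝒫_nl, [0.0], [1.3])  # sigma points are pushed through fT and fO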

1.3 Particle Filter

Discrete problems with large state spaces, and continuous problems whose dynamics are not well approximated by the Kalman filter's linear-Gaussian assumptions, require approximate techniques to represent and update the belief. A common approach is the particle filter, which represents the belief state by a collection of states; each state carried through the belief update is called a particle.

struct ParticleFilter
	states # vector of state samples
end

function update(b::ParticleFilter, 𝒫, a, o)
	T, O = 𝒫.T, 𝒫.O
	states = [rand(T(s, a)) for s in b.states]
	weights = [O(a, s′, o) for s′ in states]
	D = SetCategorical(states, weights)
	return ParticleFilter(rand(D, length(states)))
end
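
The update above relies on a SetCategorical distribution that is not defined in this excerpt. A minimal stand-in consistent with how it is used here (construct from elements and weights, sample with rand) is sketched below; the book's own definition may differ.

# Minimal stand-in for SetCategorical (assumption: the book defines it elsewhere).
# It stores elements with normalized weights and supports rand sampling.
struct SetCategorical{S}
	elements::Vector{S}      # candidate elements (may repeat)
	weights::Vector{Float64} # normalized weights
	function SetCategorical(elements::AbstractVector{S}, weights::AbstractVector{<:Real}) where S
		total = sum(weights)
		w = total > 0 ? weights ./ total : fill(1.0/length(elements), length(elements))
		return new{S}(collect(elements), collect(Float64, w))
	end
end

function Base.rand(D::SetCategorical)
	u, c = rand(), 0.0
	for (x, w) in zip(D.elements, D.weights)
		c += w
		u ≤ c && return x
	end
	return last(D.elements)
end

Base.rand(D::SetCategorical, n::Int) = [rand(D) for _ in 1:n]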

For discrete problems, rejection sampling can be layered on top: transitions are resampled until the simulated observation matches the one actually received.

struct RejectionParticleFilter
	states # vector of state samples
end

function update(b::RejectionParticleFilter, 𝒫, a, o)
	T, O = 𝒫.T, 𝒫.O
	states = similar(b.states)
	i = 1
	while i ≤ length(states)
		s = rand(b.states)
		s′ = rand(T(s,a))
		if rand(O(a,s′)) == o
			states[i] = s′
			i += 1
		end
	end
	return RejectionParticleFilter(states)
end

1.3.1 Particle Injection

For robot localization problems, a common practice is to inject particles drawn uniformly over all possible robot poses and weight them according to the current observation.

struct InjectionParticleFilter
	states # vector of state samples
	m_inject # number of samples to inject
	D_inject # injection distribution
end

function update(b::InjectionParticleFilter, 𝒫, a, o)
	T, O, m_inject, D_inject = 𝒫.T, 𝒫.O, b.m_inject, b.D_inject
	states = [rand(T(s, a)) for s in b.states]
	weights = [O(a, s′, o) for s′ in states]
	D = SetCategorical(states, weights)
	m = length(states)
	states = vcat(rand(D, m - m_inject), rand(D_inject, m_inject))
	return InjectionParticleFilter(states, m_inject, D_inject)
end

A more adaptive approach adjusts the number of injected particles by comparing fast and slow moving averages of the mean particle weight.

mutable struct AdaptiveInjectionParticleFilter
	states # vector of state samples
	w_slow # slow moving average
	w_fast # fast moving average
	α_slow # slow moving average parameter
	α_fast # fast moving average parameter
	ν # injection parameter
	D_inject # injection distribution
end

using Statistics: mean

function update(b::AdaptiveInjectionParticleFilter, 𝒫, a, o)
	T, O = 𝒫.T, 𝒫.O
	w_slow, w_fast, α_slow, α_fast, ν, D_inject = 
		b.w_slow, b.w_fast, b.α_slow, b.α_fast, b.ν, b.D_inject
	states = [rand(T(s, a)) for s in b.states]
	weights = [O(a, s′, o) for s′ in states]
	w_mean = mean(weights)
	w_slow += α_slow*(w_mean - w_slow)
	w_fast += α_fast*(w_mean - w_fast)
	m = length(states)
	m_inject = round(Int, m * max(0, 1.0 - ν*w_fast / w_slow))
	D = SetCategorical(states, weights)
	states = vcat(rand(D, m - m_inject), rand(D_inject, m_inject))
	b.w_slow, b.w_fast = w_slow, w_fast
	return AdaptiveInjectionParticleFilter(states,
	w_slow, w_fast, α_slow, α_fast, ν, D_inject)
end

2. Exact Belief-State Planning

Any POMDP can be viewed as an MDP whose states are beliefs, called the belief-state MDP. Its reward function depends on the belief and the action taken and is simply the expected reward,
$$R(b, a) = \sum_{s} R(s, a)\, b(s).$$
Its transition function is
$$\begin{aligned}
T(b' \mid b, a) &= P(b' \mid b, a)\\
&= \sum_{o}\sum_{s'}\sum_{s} P(b' \mid b, a, o)\, P(o \mid b, a, s, s')\, P(s' \mid b, s, a)\, b(s).
\end{aligned}$$
Because the state space of this MDP, the belief space, is continuous, solving the belief-state MDP exactly is challenging.
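
The belief reward above can be transcribed directly, following the conventions of the earlier filtering code (the helper name is mine, not the book's):

# Expected reward of taking action a from belief vector b (hypothetical helper).
belief_reward(𝒫, b, a) = sum(𝒫.R(s, a) * b[i] for (i, s) in enumerate(𝒫.𝒮))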

2.1 Conditional Plans

There are many ways to represent a POMDP policy; one is the conditional plan, which can be represented as a tree.

struct ConditionalPlan
	a # action to take at root
	subplans # dictionary mapping observations to subplans
end

ConditionalPlan(a) = ConditionalPlan(a, Dict())

(π::ConditionalPlan)() = π.a
(π::ConditionalPlan)(o) = π.subplans[o]

# compute expected utility 
function lookahead(𝒫::POMDP, U, s, a)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	u′ = sum(T(s,a,s′)*sum(O(a,s′,o)*U(o,s′) for o in 𝒪) for s′ in 𝒮)
	return R(s,a) + γ*u′
end

function evaluate_plan(𝒫::POMDP, π::ConditionalPlan, s)
	U(o,s′) = evaluate_plan(𝒫, π(o), s′)
	return isempty(π.subplans) ? 𝒫.R(s,π()) : lookahead(𝒫, U, s, π())
end
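
For the hypothetical two-state problem, a depth-two plan might ignore first and branch on the observation, repairing only after an alarm; evaluate_plan then returns its expected discounted value from a given state:

plan = ConditionalPlan(:ignore, Dict(
	:ok    => ConditionalPlan(:ignore),
	:alarm => ConditionalPlan(:repair)))
u = evaluate_plan(𝒫, plan, :healthy)  # expected value of this two-step plan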

2.2 Belief Utility

The utility of a belief $b$ under a policy $\pi$ can be computed as
$$U^{\pi}(b) = \sum_{s} b(s)\, U^{\pi}(s).$$

2.2.1 Alpha Vectors

This can be rewritten as
$$U^{\pi}(b) = \boldsymbol{\alpha}_{\pi}^{\top} \mathbf{b},$$
where $\boldsymbol{\alpha}_{\pi}$ is the alpha vector associated with $\pi$, containing the expected utility of following $\pi$ from each state.

function alphavector(𝒫::POMDP, π::ConditionalPlan)
	return [evaluate_plan(𝒫, π, s) for s in 𝒫.𝒮]
end

2.2.2 Alpha Vector Policy

Based on the alpha vector representation, a policy can be defined as follows:

struct AlphaVectorPolicy
	𝒫 # POMDP problem
	Γ # alpha vectors
	a # actions associated with alpha vectors
end

function utility(π::AlphaVectorPolicy, b)
	return maximum(α⋅b for α in π.Γ)
end

function (π::AlphaVectorPolicy)(b)
	i = argmax([α⋅b for α in π.Γ])
	return π.a[i]
end
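
Continuing the hypothetical example, the depth-one plans for each action yield one alpha vector each, and the resulting policy picks the action whose alpha vector maximizes α⋅b at the queried belief:

Γ = [alphavector(𝒫, ConditionalPlan(a)) for a in 𝒫.𝒜]
policy = AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
policy([0.2, 0.8])  # action whose alpha vector gives the highest value at this belief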

This representation can also be combined with one-step lookahead on the belief:

function lookahead(𝒫::POMDP, U, b::Vector, a)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	r = sum(R(s,a)*b[i] for (i,s) in enumerate(𝒮))
	Posa(o,s,a) = sum(O(a,s′,o)*T(s,a,s′) for s′ in 𝒮)
	Poba(o,b,a) = sum(b[i]*Posa(o,s,a) for (i,s) in enumerate(𝒮))
	return r + γ*sum(Poba(o,b,a)*U(update(b, 𝒫, a, o)) for o in 𝒪)
end

function greedy(𝒫::POMDP, U, b::Vector)
	u, a = findmax(a->lookahead(𝒫, U, b, a), 𝒫.𝒜)
	return (a=a, u=u)
end

struct LookaheadAlphaVectorPolicy
	𝒫 # POMDP problem
	Γ # alpha vectors
end

function utility(π::LookaheadAlphaVectorPolicy, b)
	return maximum(α⋅b for α in π.Γ)
end

function greedy(π, b)
	U(b) = utility(π, b)
	return greedy(π.𝒫, U, b)
end

(π::LookaheadAlphaVectorPolicy)(b) = greedy(π, b).a
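
The same alpha vectors can back a one-step lookahead policy, which evaluates every action by propagating the belief through the filter update for each possible observation:

policy = LookaheadAlphaVectorPolicy(𝒫, Γ)  # Γ as constructed in the previous example
policy([0.2, 0.8])                         # evaluate each action by lookahead and pick the best
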
2.2.3 Pruning

When there are many alpha vectors, pruning the dominated ones improves efficiency.

using JuMP, GLPK   # linear-programming model and solver used below

function find_maximal_belief(α, Γ)
	m = length(α)
	if isempty(Γ)
		return fill(1/m, m) # arbitrary belief
	end
	model = Model(GLPK.Optimizer)
	@variable(model, δ)
	@variable(model, b[i=1:m] ≥ 0)
	@constraint(model, sum(b) == 1.0)
	for a in Γ
		@constraint(model, (α-a)⋅b ≥ δ)
	end
	@objective(model, Max, δ)
	optimize!(model)
	return value(δ) > 0 ? value.(b) : nothing
end

function find_dominating(Γ)
	n = length(Γ)
	candidates, dominating = trues(n), falses(n)
	while any(candidates)
		i = findfirst(candidates)
		b = find_maximal_belief(Γ[i], Γ[dominating])
		if b === nothing
			candidates[i] = false
		else
			k = argmax([candidates[j] ? b⋅Γ[j] : -Inf for j in 1:n])
			candidates[k], dominating[k] = false, true
		end
	end
	return dominating
end

function prune(plans, Γ)
	d = find_dominating(Γ)
	return (plans[d], Γ[d])
end

2.3 Value Iteration

The value iteration algorithm for MDPs can be applied to POMDPs as well, operating on conditional plans and their alpha vectors.

# This version of value iteration is expressed in terms of
# conditional plans and alpha vectors

# one-step
function value_iteration(𝒫::POMDP, k_max)
	𝒮, 𝒜, R = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R
	plans = [ConditionalPlan(a) for a in 𝒜]
	Γ = [[R(s,a) for s in 𝒮] for a in 𝒜]
	plans, Γ = prune(plans, Γ)
	for k in 2:k_max
		plans, Γ = expand(plans, Γ, 𝒫)
		plans, Γ = prune(plans, Γ)
	end
	return (plans, Γ)
end

function solve(M::ValueIteration, 𝒫::POMDP)
	plans, Γ = value_iteration(𝒫, M.k_max)
	return LookaheadAlphaVectorPolicy(𝒫, Γ)
end

# expansion-step
function ConditionalPlan(𝒫::POMDP, a, plans)
	subplans = Dict(o=>π for (o, π) in zip(𝒫.𝒪, plans))
	return ConditionalPlan(a, subplans)
end

function combine_lookahead(𝒫::POMDP, s, a, Γo)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	U′(s′,i) = sum(O(a,s′,o)*α[i] for (o,α) in zip(𝒪,Γo))
	return R(s,a) + γ*sum(T(s,a,s′)*U′(s′,i) for (i,s′) in enumerate(𝒮))
end

function combine_alphavector(𝒫::POMDP, a, Γo)
	return [combine_lookahead(𝒫, s, a, Γo) for s in 𝒫.𝒮]
end

import Base.Iterators: product

function expand(plans, Γ, 𝒫)
	𝒮, 𝒜, 𝒪, T, O, R = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R
	plans′, Γ′ = [], []
	for a in 𝒜
		# iterate over all possible mappings from observations to plans
		for inds in product([eachindex(plans) for o in 𝒪]...)
			πo = plans[[inds...]]
			Γo = Γ[[inds...]]
			π = ConditionalPlan(𝒫, a, πo)
			α = combine_alphavector(𝒫, a, Γo)
			push!(plans′, π)
			push!(Γ′, α)
		end
	end
	return (plans′, Γ′)
end
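
The ValueIteration solver type referenced in solve above is not defined in this excerpt; a minimal sketch consistent with its use (a single k_max field), followed by a hypothetical run on the two-state problem:

# Minimal solver container (assumption: the book defines this elsewhere).
struct ValueIteration
	k_max # maximum number of iterations
end

policy = solve(ValueIteration(3), 𝒫)  # three-step conditional-plan value iteration
policy([0.5, 0.5])                    # query the resulting lookahead policy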


  1. S. J. Julier and J. K. Uhlmann, "Unscented Filtering and Nonlinear Estimation," Proceedings of the IEEE, vol. 92, no. 3, pp. 401–422, 2004.
