Reading Notes: Algorithms for Decision Making (10)


IV. State Uncertainty (1)

The preceding parts dealt with two kinds of uncertainty: uncertainty in the outcome state and uncertainty in the model. This part extends the treatment to uncertainty in the state itself. Specifically, the observations the agent receives are only probabilistically related to the state rather than revealing it exactly. Such problems can be modeled as partially observable Markov decision processes (POMDPs). Solving a POMDP optimally is difficult in general, so additional approximation strategies are needed.


1. Beliefs

A common approach to solving a POMDP is to infer a belief distribution over the underlying state at the current time step and then apply a policy that maps beliefs to actions. The following discusses both parametric and nonparametric belief representations.

In a POMDP, the agent receives an observation from an observation space $\mathcal{O}$. Given the action $a$ taken and the resulting state $s'$, the probability of observing $o$ is $O(o \mid a, s')$.

struct POMDP
	γ # discount factor
	𝒮 # state space
	𝒜 # action space
	𝒪 # observation space
	T # transition function
	R # reward function
	O # observation function
	TRO # sample transition, reward, and observation
end
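
As a running illustration (this tiny problem is my own, not from the book), here is a hypothetical two-state instance in this representation, with T, R, and O supplied as plain functions returning probabilities and rewards; the TRO sampler is not needed for the filters below and is left as `nothing`.

# Hypothetical two-state maintenance problem: a machine is :healthy or :faulty,
# the agent can :ignore or :repair, and it observes :ok or :alarm.
𝒮 = [:healthy, :faulty]
𝒜 = [:ignore, :repair]
𝒪 = [:ok, :alarm]

function T(s, a, s′)
	if a == :repair
		return s′ == :healthy ? 0.95 : 0.05
	elseif s == :healthy
		return s′ == :healthy ? 0.9 : 0.1
	else
		return s′ == :faulty ? 1.0 : 0.0
	end
end

R(s, a) = (s == :faulty ? -10.0 : 0.0) + (a == :repair ? -1.0 : 0.0)

function O(a, s′, o)
	p_alarm = s′ == :faulty ? 0.8 : 0.1
	return o == :alarm ? p_alarm : 1.0 - p_alarm
end

𝒫 = POMDP(0.9, 𝒮, 𝒜, 𝒪, T, R, O, nothing)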

Recursive Bayesian estimation can be used to infer and update the belief over the current state, given the most recent action and observation. We write $b(s)$ for the probability assigned to state $s$; a particular belief $b$ belongs to the belief space $\mathcal{B}$.

1.1 Discrete State Filter

If both the state space and the observation space are finite, a discrete state filter can perform this inference exactly.

using LinearAlgebra   # provides normalize!; later blocks also use I, cholesky, and ⋅

function update(b::Vector{Float64}, 𝒫, a, o)
	𝒮, T, O = 𝒫.𝒮, 𝒫.T, 𝒫.O
	b′ = similar(b)
	for (i′, s′) in enumerate(𝒮)
		po = O(a, s′, o)
		b′[i′] = po * sum(T(s, a, s′) * b[i] for (i, s) in enumerate(𝒮))
	end
	if sum(b′) ≈ 0.0
		fill!(b′, 1) # impossible observation: fall back to a uniform belief
	end
	return normalize!(b′, 1)
end
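
Continuing the hypothetical two-state problem above, a single update after acting :ignore and observing :alarm shifts probability mass toward the faulty state:

b = [0.5, 0.5]                      # uniform belief over [:healthy, :faulty]
b′ = update(b, 𝒫, :ignore, :alarm)  # ≈ [0.09, 0.91]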

The success of belief updating depends on having accurate observation and transition models. When these models are not well known, it is advisable to use simplified models with more dispersed distributions, which helps prevent the overconfidence that makes the state estimate brittle.

1.2 Kalman Filter

Under certain assumptions about the system, the Kalman filter provides a simple iterative solution to the continuous belief update
$$b'(s') \propto O(o \mid a, s') \int T(s' \mid s, a)\, b(s)\, \mathrm{d}s.$$

Assuming $T$ and $O$ are linear Gaussian and $b$ is Gaussian,
$$\begin{aligned}
T(s' \mid s, a) &= \mathcal{N}(s' \mid \mathbf{T}_s s + \mathbf{T}_a a, \Sigma_s),\\
O(o \mid s') &= \mathcal{N}(o \mid \mathbf{O}_s s', \Sigma_o),\\
b(s) &= \mathcal{N}(s \mid \mu_b, \Sigma_b),
\end{aligned}$$
the Kalman filter performs the belief update with a predict step followed by a correction step, as implemented below.

struct KalmanFilter
	μb # mean vector
	Σb # covariance matrix
end

function update(b::KalmanFilter, 𝒫, a, o)
	μb, Σb = b.μb, b.Σb
	Ts, Ta, Os = 𝒫.Ts, 𝒫.Ta, 𝒫.Os
	Σs, Σo = 𝒫.Σs, 𝒫.Σo
	# predict step
	μp = Ts*μb + Ta*a
	Σp = Ts*Σb*Ts' + Σs
	# update step
	Σpo = Σp*Os'
	K = Σpo/(Os*Σp*Os' + Σo)
	μb′ = μp + K*(o - Os*μp)
	Σb′ = (I - K*Os)*Σp
	return KalmanFilter(μb′, Σb′)
end
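
As a quick sanity check (the container type and field names below are my own assumptions, mirroring the fields the update above reads from 𝒫), a scalar linear-Gaussian system can be filtered as follows:

# Hypothetical container for a scalar linear-Gaussian problem.
struct LinearGaussianProblem
	Ts # state-transition matrix
	Ta # control-input matrix
	Os # observation matrix
	Σs # transition noise covariance
	Σo # observation noise covariance
end

𝒫_lin = LinearGaussianProblem(fill(1.0,1,1), fill(0.1,1,1), fill(1.0,1,1),
                              fill(0.01,1,1), fill(0.25,1,1))

b = KalmanFilter([0.0], fill(1.0,1,1))  # broad initial belief
b′ = update(b, 𝒫_lin, [1.0], [0.3])     # take action a = [1.0], observe o = [0.3]
# b′.μb lies between the predicted mean (0.1) and the observation (0.3);
# b′.Σb is smaller than the prior covariance
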
1.2.1 Extended Kalman Filter

The extended Kalman filter (EKF) is a straightforward extension of the Kalman filter to nonlinear dynamics with Gaussian noise:
$$\begin{aligned}
T(s' \mid s, a) &= \mathcal{N}(s' \mid f_T(s, a), \Sigma_s),\\
O(o \mid s') &= \mathcal{N}(o \mid f_O(s'), \Sigma_o),\\
b(s) &= \mathcal{N}(s \mid \mu_b, \Sigma_b).
\end{aligned}$$
The nonlinear functions $f_T$ and $f_O$ are linearized about the current mean using their Jacobians at each update.

struct ExtendedKalmanFilter
	μb # mean vector
	Σb # covariance matrix
end

import ForwardDiff: jacobian
function update(b::ExtendedKalmanFilter, 𝒫, a, o)
	μb, Σb = b.μb, b.Σb
	fT, fO = 𝒫.fT, 𝒫.fO
	Σs, Σo = 𝒫.Σs, 𝒫.Σo
	# predict
	μp = fT(μb, a)
	Ts = jacobian(s->fT(s, a), μb)
	Os = jacobian(fO, μp)
	Σp = Ts*Σb*Ts' + Σs
	# update
	Σpo = Σp*Os'
	K = Σpo/(Os*Σp*Os' + Σo)
	μb′ = μp + K*(o - fO(μp))
	Σb′ = (I - K*Os)*Σp
	return ExtendedKalmanFilter(μb′, Σb′)
end
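
Similarly, a mildly nonlinear scalar system can be filtered with the EKF; the problem container and its contents here are again hypothetical, chosen only to match the fields the update reads:

# Hypothetical container for a nonlinear problem with Gaussian noise.
struct NonlinearProblem
	fT # nonlinear dynamics function
	fO # nonlinear observation function
	Σs # transition noise covariance
	Σo # observation noise covariance
end

fT(s, a) = s .+ 0.1 .* sin.(s) .+ a  # mildly nonlinear dynamics
fO(s′) = s′ .^ 2                     # quadratic observation
𝒫_nl = NonlinearProblem(fT, fO, fill(0.01,1,1), fill(0.05,1,1))

b = ExtendedKalmanFilter([1.0], fill(0.5,1,1))
b′ = update(b, 𝒫_nl, [0.0], [1.3])   # fT and fO are linearized around the mean
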
1.2.2 Unscented Kalman Filter (UKF)

The UKF also handles nonlinear problems with Gaussian noise [1]. The method is derivative-free and relies on a deterministic sampling strategy (sigma points) to approximate how a distribution is affected by a nonlinear transformation.

struct UnscentedKalmanFilter
	μb # mean vector
	Σb # covariance matrix
	λ # spread parameter
end

function unscented_transform(μ, Σ, f, λ, ws)
	n = length(μ)
	Δ = cholesky((n + λ) * Σ).L
	S = [μ]
	for i in 1:n
		push!(S, μ + Δ[:,i])
		push!(S, μ - Δ[:,i])
	end
	S′ = f.(S)
	μ′ = sum(w*s for (w,s) in zip(ws, S′))
	Σ′ = sum(w*(s - μ′)*(s - μ′)' for (w,s) in zip(ws, S′))
	return (μ′, Σ′, S, S′)
end

function update(b::UnscentedKalmanFilter, 𝒫, a, o)
	μb, Σb, λ = b.μb, b.Σb, b.λ
	fT, fO = 𝒫.fT, 𝒫.fO
	n = length(μb)
	ws = [λ / (n + λ); fill(1/(2(n + λ)), 2n)]
	# predict
	μp, Σp, Sp, Sp′ = unscented_transform(μb, Σb, s->fT(s,a), λ, ws)
	Σp += 𝒫.Σs
	# update
	μo, Σo, So, So′ = unscented_transform(μp, Σp, fO, λ, ws)
	Σo += 𝒫.Σo
	Σpo = sum(w*(s - μp)*(s′ - μo)' for (w,s,s′) in zip(ws, So, So′))
	K = Σpo / Σo
	μb′ = μp + K*(o - μo)
	Σb′ = Σp - K*Σo*K'
	return UnscentedKalmanFilter(μb′, Σb′, λ)
end
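
Reusing the hypothetical nonlinear problem 𝒫_nl from the EKF example, the UKF runs the same way; the only extra ingredient is the sigma-point spread parameter λ.

b = UnscentedKalmanFilter([1.0], fill(0.5,1,1), 2.0)
b′ = update(b, 𝒫_nl, [0.0], [1.3])  # sigma points are pushed through fT and fO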

1.3 Particle Filter

Discrete problems with large state spaces, and continuous problems whose dynamics are not well approximated by the Kalman filter's linear-Gaussian assumptions, require approximate techniques to represent and update the belief. A common approach is the particle filter, which represents the belief state by a collection of states; each state carried through the belief update is called a particle.

struct ParticleFilter
	states # vector of state samples
end

function update(b::ParticleFilter, 𝒫, a, o)
	T, O = 𝒫.T, 𝒫.O
	states = [rand(T(s, a)) for s in b.states]
	weights = [O(a, s′, o) for s′ in states]
	D = SetCategorical(states, weights)
	return ParticleFilter(rand(D, length(states)))
end
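
The update above relies on a SetCategorical distribution that is not defined in this excerpt. A minimal stand-in consistent with how it is used here (construct from elements and weights, sample with rand) is sketched below; the book's own definition may differ.

# Minimal stand-in for SetCategorical (assumption: the book defines it elsewhere).
# It stores elements with normalized weights and supports rand sampling.
struct SetCategorical{S}
	elements::Vector{S}      # candidate elements (may repeat)
	weights::Vector{Float64} # normalized weights
	function SetCategorical(elements::AbstractVector{S}, weights::AbstractVector{<:Real}) where S
		total = sum(weights)
		w = total > 0 ? weights ./ total : fill(1.0/length(elements), length(elements))
		return new{S}(collect(elements), collect(Float64, w))
	end
end

function Base.rand(D::SetCategorical)
	u, c = rand(), 0.0
	for (x, w) in zip(D.elements, D.weights)
		c += w
		u ≤ c && return x
	end
	return last(D.elements)
end

Base.rand(D::SetCategorical, n::Int) = [rand(D) for _ in 1:n]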

For discrete problems, rejection sampling can be layered on top: transitions are resampled until the simulated observation matches the one actually received.

struct RejectionParticleFilter
	states # vector of state samples
end

function update(b::RejectionParticleFilter, 𝒫, a, o)
	T, O = 𝒫.T, 𝒫.O
	states = similar(b.states)
	i = 1
	while i ≤ length(states)
		s = rand(b.states)
		s′ = rand(T(s,a))
		if rand(O(a,s′)) == o
			states[i] = s′
			i += 1
		end
	end
	return RejectionParticleFilter(states)
end

1.3.1 Particle Injection

For robot localization problems, a common practice is to inject particles drawn uniformly over all possible robot poses and weight them according to the current observation.

struct InjectionParticleFilter
	states # vector of state samples
	m_inject # number of samples to inject
	D_inject # injection distribution
end

function update(b::InjectionParticleFilter, 𝒫, a, o)
	T, O, m_inject, D_inject = 𝒫.T, 𝒫.O, b.m_inject, b.D_inject
	states = [rand(T(s, a)) for s in b.states]
	weights = [O(a, s′, o) for s′ in states]
	D = SetCategorical(states, weights)
	m = length(states)
	states = vcat(rand(D, m - m_inject), rand(D_inject, m_inject))
	return InjectionParticleFilter(states, m_inject, D_inject)
end

A more adaptive approach adjusts the number of injected particles by comparing fast and slow moving averages of the mean particle weight.

mutable struct AdaptiveInjectionParticleFilter
	states # vector of state samples
	w_slow # slow moving average
	w_fast # fast moving average
	α_slow # slow moving average parameter
	α_fast # fast moving average parameter
	ν # injection parameter
	D_inject # injection distribution
end

using Statistics: mean

function update(b::AdaptiveInjectionParticleFilter, 𝒫, a, o)
	T, O = 𝒫.T, 𝒫.O
	w_slow, w_fast, α_slow, α_fast, ν, D_inject = 
		b.w_slow, b.w_fast, b.α_slow, b.α_fast, b.ν, b.D_inject
	states = [rand(T(s, a)) for s in b.states]
	weights = [O(a, s′, o) for s′ in states]
	w_mean = mean(weights)
	w_slow += α_slow*(w_mean - w_slow)
	w_fast += α_fast*(w_mean - w_fast)
	m = length(states)
	m_inject = round(Int, m * max(0, 1.0 - ν*w_fast / w_slow))
	D = SetCategorical(states, weights)
	states = vcat(rand(D, m - m_inject), rand(D_inject, m_inject))
	b.w_slow, b.w_fast = w_slow, w_fast
	return AdaptiveInjectionParticleFilter(states,
	w_slow, w_fast, α_slow, α_fast, ν, D_inject)
end

2. Exact Belief-State Planning

Any POMDP can be viewed as an MDP whose states are beliefs, called the belief-state MDP. Its reward function depends on the belief and the action taken and is simply the expected reward,
$$R(b, a) = \sum_{s} R(s, a)\, b(s).$$
Its transition function is
$$\begin{aligned}
T(b' \mid b, a) &= P(b' \mid b, a)\\
&= \sum_{o}\sum_{s'}\sum_{s} P(b' \mid b, a, o)\, P(o \mid b, a, s, s')\, P(s' \mid b, s, a)\, b(s).
\end{aligned}$$
Because the state space of this MDP, the belief space, is continuous, solving the belief-state MDP exactly is challenging.
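
The belief reward above can be transcribed directly, following the conventions of the earlier filtering code (the helper name is mine, not the book's):

# Expected reward of taking action a from belief vector b (hypothetical helper).
belief_reward(𝒫, b, a) = sum(𝒫.R(s, a) * b[i] for (i, s) in enumerate(𝒫.𝒮))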

2.1 Conditional Plans

There are many ways to represent a POMDP policy; one is the conditional plan, which can be represented as a tree.

struct ConditionalPlan
	a # action to take at root
	subplans # dictionary mapping observations to subplans
end

ConditionalPlan(a) = ConditionalPlan(a, Dict())

(π::ConditionalPlan)() = π.a
(π::ConditionalPlan)(o) = π.subplans[o]

# compute expected utility 
function lookahead(𝒫::POMDP, U, s, a)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	u′ = sum(T(s,a,s′)*sum(O(a,s′,o)*U(o,s′) for o in 𝒪) for s′ in 𝒮)
	return R(s,a) + γ*u′
end

function evaluate_plan(𝒫::POMDP, π::ConditionalPlan, s)
	U(o,s′) = evaluate_plan(𝒫, π(o), s′)
	return isempty(π.subplans) ? 𝒫.R(s,π()) : lookahead(𝒫, U, s, π())
end
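
For the hypothetical two-state problem, a depth-two plan might ignore first and branch on the observation, repairing only after an alarm; evaluate_plan then returns its expected discounted value from a given state:

plan = ConditionalPlan(:ignore, Dict(
	:ok    => ConditionalPlan(:ignore),
	:alarm => ConditionalPlan(:repair)))
u = evaluate_plan(𝒫, plan, :healthy)  # expected value of this two-step plan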

2.2 Belief Utility

The utility of a belief $b$ under a policy $\pi$ can be computed as
$$U^{\pi}(b) = \sum_{s} b(s)\, U^{\pi}(s).$$

2.2.1 Alpha Vectors

This can be rewritten as
$$U^{\pi}(b) = \boldsymbol{\alpha}_{\pi}^{\top} \mathbf{b},$$
where $\boldsymbol{\alpha}_{\pi}$ is the alpha vector associated with $\pi$, containing the expected utility of following $\pi$ from each state.

function alphavector(𝒫::POMDP, π::ConditionalPlan)
	return [evaluate_plan(𝒫, π, s) for s in 𝒫.𝒮]
end

2.2.2 Alpha Vector Policy

Based on the alpha vector representation, a policy can be defined as follows:

struct AlphaVectorPolicy
	𝒫 # POMDP problem
	Γ # alpha vectors
	a # actions associated with alpha vectors
end

function utility(π::AlphaVectorPolicy, b)
	return maximum(α⋅b for α in π.Γ)
end

function (π::AlphaVectorPolicy)(b)
	i = argmax([α⋅b for α in π.Γ])
	return π.a[i]
end
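
Continuing the hypothetical example, the depth-one plans for each action yield one alpha vector each, and the resulting policy picks the action whose alpha vector maximizes α⋅b at the queried belief:

Γ = [alphavector(𝒫, ConditionalPlan(a)) for a in 𝒫.𝒜]
policy = AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
policy([0.2, 0.8])  # action whose alpha vector gives the highest value at this belief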

This representation can also be combined with one-step lookahead on the belief:

function lookahead(𝒫::POMDP, U, b::Vector, a)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	r = sum(R(s,a)*b[i] for (i,s) in enumerate(𝒮))
	Posa(o,s,a) = sum(O(a,s′,o)*T(s,a,s′) for s′ in 𝒮)
	Poba(o,b,a) = sum(b[i]*Posa(o,s,a) for (i,s) in enumerate(𝒮))
	return r + γ*sum(Poba(o,b,a)*U(update(b, 𝒫, a, o)) for o in 𝒪)
end

function greedy(𝒫::POMDP, U, b::Vector)
	u, a = findmax(a->lookahead(𝒫, U, b, a), 𝒫.𝒜)
	return (a=a, u=u)
end

struct LookaheadAlphaVectorPolicy
	𝒫 # POMDP problem
	Γ # alpha vectors
end

function utility(π::LookaheadAlphaVectorPolicy, b)
	return maximum(α⋅b for α in π.Γ)
end

function greedy(π, b)
	U(b) = utility(π, b)
	return greedy(π.𝒫, U, b)
end

(π::LookaheadAlphaVectorPolicy)(b) = greedy(π, b).a
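
The same alpha vectors can back a one-step lookahead policy, which evaluates every action by propagating the belief through the filter update for each possible observation:

policy = LookaheadAlphaVectorPolicy(𝒫, Γ)  # Γ as constructed in the previous example
policy([0.2, 0.8])                         # evaluate each action by lookahead and pick the best
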
2.2.3 Pruning

When there are many alpha vectors, pruning the dominated ones improves efficiency.

using JuMP, GLPK   # linear-programming model and solver used below

function find_maximal_belief(α, Γ)
	m = length(α)
	if isempty(Γ)
		return fill(1/m, m) # arbitrary belief
	end
	model = Model(GLPK.Optimizer)
	@variable(model, δ)
	@variable(model, b[i=1:m] ≥ 0)
	@constraint(model, sum(b) == 1.0)
	for a in Γ
		@constraint(model, (α-a)⋅b ≥ δ)
	end
	@objective(model, Max, δ)
	optimize!(model)
	return value(δ) > 0 ? value.(b) : nothing
end

function find_dominating(Γ)
	n = length(Γ)
	candidates, dominating = trues(n), falses(n)
	while any(candidates)
		i = findfirst(candidates)
		b = find_maximal_belief(Γ[i], Γ[dominating])
		if b === nothing
			candidates[i] = false
		else
			k = argmax([candidates[j] ? b⋅Γ[j] : -Inf for j in 1:n])
			candidates[k], dominating[k] = false, true
		end
	end
	return dominating
end

function prune(plans, Γ)
	d = find_dominating(Γ)
	return (plans[d], Γ[d])
end

2.3 Value Iteration

The value iteration algorithm for MDPs can be applied to POMDPs as well, operating on conditional plans and their alpha vectors.

# This version of value iteration is expressed in terms of
# conditional plans and alpha vectors

# one-step
function value_iteration(𝒫::POMDP, k_max)
	𝒮, 𝒜, R = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R
	plans = [ConditionalPlan(a) for a in 𝒜]
	Γ = [[R(s,a) for s in 𝒮] for a in 𝒜]
	plans, Γ = prune(plans, Γ)
	for k in 2:k_max
		plans, Γ = expand(plans, Γ, 𝒫)
		plans, Γ = prune(plans, Γ)
	end
	return (plans, Γ)
end

function solve(M::ValueIteration, 𝒫::POMDP)
	plans, Γ = value_iteration(𝒫, M.k_max)
	return LookaheadAlphaVectorPolicy(𝒫, Γ)
end

# expansion-step
function ConditionalPlan(𝒫::POMDP, a, plans)
	subplans = Dict(o=>π for (o, π) in zip(𝒫.𝒪, plans))
	return ConditionalPlan(a, subplans)
end

function combine_lookahead(𝒫::POMDP, s, a, Γo)
	𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
	U′(s′,i) = sum(O(a,s′,o)*α[i] for (o,α) in zip(𝒪,Γo))
	return R(s,a) + γ*sum(T(s,a,s′)*U′(s′,i) for (i,s′) in enumerate(𝒮))
end

function combine_alphavector(𝒫::POMDP, a, Γo)
	return [combine_lookahead(𝒫, s, a, Γo) for s in 𝒫.𝒮]
end

import Base.Iterators: product

function expand(plans, Γ, 𝒫)
	𝒮, 𝒜, 𝒪, T, O, R = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R
	plans′, Γ′ = [], []
	for a in 𝒜
		# iterate over all possible mappings from observations to plans
		for inds in product([eachindex(plans) for o in 𝒪]...)
			πo = plans[[inds...]]
			Γo = Γ[[inds...]]
			π = ConditionalPlan(𝒫, a, πo)
			α = combine_alphavector(𝒫, a, Γo)
			push!(plans′, π)
			push!(Γ′, α)
		end
	end
	return (plans′, Γ′)
end
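
The ValueIteration solver type referenced in solve above is not defined in this excerpt; a minimal sketch consistent with its use (a single k_max field), followed by a hypothetical run on the two-state problem:

# Minimal solver container (assumption: the book defines this elsewhere).
struct ValueIteration
	k_max # maximum number of iterations
end

policy = solve(ValueIteration(3), 𝒫)  # three-step conditional-plan value iteration
policy([0.5, 0.5])                    # query the resulting lookahead policy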


  1. S. J. Julier and J. K. Uhlmann, "Unscented Filtering and Nonlinear Estimation," Proceedings of the IEEE, vol. 92, no. 3, pp. 401–422, 2004.
