读书笔记：Algorithms for Decision Making（6）-CSDN博客

本文链接：https://blog.csdn.net/u013229302/article/details/126877362

读书笔记：Algorithms for Decision Making

上一篇读书笔记：Algorithms for Decision Making（5）
下一篇读书笔记：Algorithms for Decision Making（7）

三、序列问题（2）

上文中提及的精确解方法适用于小型离散问题，对于较大状态空间的问题，计算精确解需要极大的内存量，因而考虑近似解的方法。常使用approximate dynamic programming的方法去寻求近似解，进而使用在线方法实现实时计算。

2. 近似值函数

2.1 参数化表示

记值函数的参数化表示为 $\mathcal{U}_{\theta} (s)$ 。

"""
    ApproximateValueIteration(Uθ, S, k_max)

Settings for approximate value iteration: the parameterized value function to
refine, the states at which backups are computed, and the iteration budget.
"""
struct ApproximateValueIteration
    # initial parameterized value function; must support `fit!`
    Uθ
    # set of discrete states for performing backups
    S
    # maximum number of iterations
    k_max
end

"""
    solve(M::ApproximateValueIteration, 𝒫::MDP)

Run `M.k_max` rounds of approximate value iteration on `𝒫`.

Each round calls `backup(𝒫, Uθ, s)` for every state `s` in `M.S` and then refits
`M.Uθ` in place to those values with `fit!`. Returns
`ValueFunctionPolicy(𝒫, Uθ)` built from the refitted value function.
"""
function solve(M::ApproximateValueIteration, 𝒫::MDP)
    Uθ = M.Uθ
    for _ in 1:M.k_max
        # All targets are computed from the current Uθ before it is refitted.
        targets = [backup(𝒫, Uθ, s) for s in M.S]
        fit!(Uθ, M.S, targets)
    end
    return ValueFunctionPolicy(𝒫, Uθ)
end

接下来提及的所有参数表示均可与上述逼近算法一起使用，且参数表示需要支持 $\mathcal{U}_{\theta}$ 的计算，以及将 $\mathcal{U}_{\theta}$ 拟合到 $S$ 中各点的效用估计。

参数化表示分为两类：

局部近似方法，其中 $\theta$ 对应于 $S$ 中状态的值。
全局近似方法，其中 $\theta$ 与 $S$ 中状态的值不直接相关。
但两者本质上都可以视为一个线性函数逼近，即 $\mathcal{U}_{\theta}(s) = \theta^{\rm T} \beta(s)$ 。

2.2 最邻近方法

"""
    NearestNeighborValueFunction(k, d, S, θ)

Local value-function approximation that keeps one value in `θ` per state in `S`
and evaluates a query state by averaging the values of its `k` closest states.
"""
mutable struct NearestNeighborValueFunction
    # number of neighbors
    k
    # distance function d(s, s′)
    d
    # set of discrete states
    S
    # vector of values at states in S
    θ
end

# Evaluate the approximation at `s`: compute the distance from `s` to every
# state in `Uθ.S`, take the `Uθ.k` smallest, and return the mean of the
# corresponding entries of `Uθ.θ`.
function (Uθ::NearestNeighborValueFunction)(s)
    k, d, θ = Uθ.k, Uθ.d, Uθ.θ
    distances = [d(s, s′) for s′ in Uθ.S]
    order = sortperm(distances)
    nearest = order[1:k]
    return mean(θ[j] for j in nearest)
end

# Store `U` as the new value vector. `S` is accepted for interface
# compatibility with the other `fit!` methods and is not read here.
function fit!(Uθ::NearestNeighborValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end

2.3 核光滑方法

"""
    LocallyWeightedValueFunction(k, S, θ)

Kernel-smoothing value function: the value at a query state is a weighted sum
of the values `θ` stored for the states `S`, with weights given by the kernel `k`.
"""
mutable struct LocallyWeightedValueFunction
    # kernel function k(s, s′)
    k
    # set of discrete states
    S
    # vector of values at states in S
    θ
end

# Evaluate the approximation at `s`: compute the kernel between `s` and every
# state in `Uθ.S`, scale those values to unit 1-norm, and return their dot
# product with `Uθ.θ`.
function (Uθ::LocallyWeightedValueFunction)(s)
    kernel = Uθ.k
    raw = [kernel(s, s′) for s′ in Uθ.S]
    weights = normalize(raw, 1)
    return dot(Uθ.θ, weights)
end

# Store `U` as the new value vector. `S` is accepted for interface
# compatibility with the other `fit!` methods and is not read here.
function fit!(Uθ::LocallyWeightedValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end

2.4 线性插值

在这里插入图片描述

"""
    MultilinearValueFunction(o, δ, θ)

Value function stored on a regular grid and evaluated by multilinear
interpolation between the vertices of the grid cell that encloses the query.
"""
mutable struct MultilinearValueFunction
    # position of lower-left corner
    o
    # vector of widths
    δ
    # grid values; indexed with one index per state dimension
    θ
end

# Evaluate the interpolant at state `s` (a vector with one entry per grid
# dimension). The result is the weighted sum of the 2^d vertex values of the
# enclosing cell, where each weight is the product over dimensions of the
# fractional distance to the opposite side of the cell.
#
# The cell index is clamped at both ends of every dimension. A query outside
# the grid (including one that lands marginally below `o` through round-off)
# is therefore extrapolated from the nearest boundary cell instead of indexing
# `θ` at 0 and throwing a BoundsError.
function (Uθ::MultilinearValueFunction)(s)
    o, δ, θ = Uθ.o, Uθ.δ, Uθ.θ
    # Position of s measured in cell widths from the lower-left corner
    Δ = (s - o) ./ δ
    # 1-based multidimensional index of the lower-left vertex of the cell
    i = clamp.(floor.(Int, Δ) .+ 1, 1, size(θ) .- 1)
    vertex_index = similar(i)
    d = length(s)
    u = 0.0
    for vertex in 0:2^d-1
        weight = 1.0
        for j in 1:d
            # Bit j of `vertex` picks the upper (set) or lower (clear) side
            # of the cell along dimension j.
            if vertex & (1 << (j-1)) > 0
                vertex_index[j] = i[j] + 1
                weight *= Δ[j] - i[j] + 1
            else
                vertex_index[j] = i[j]
                weight *= i[j] - Δ[j]
            end
        end
        u += θ[vertex_index...] * weight
    end
    return u
end

# Store `U` as the new grid values. `S` is accepted for interface
# compatibility with the other `fit!` methods and is not read here.
function fit!(Uθ::MultilinearValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end

2.5 单纯形插值

"""
    SimplexValueFunction(o, δ, θ)

Value function stored on a regular grid and evaluated by simplex
interpolation: the enclosing grid cell is split into simplexes and the query
is interpolated from the d+1 vertices of the simplex that contains it.
"""
mutable struct SimplexValueFunction
    # position of lower-left corner
    o
    # vector of widths
    δ
    # grid values; indexed with one index per state dimension
    θ
end

# Evaluate the interpolant at state `s` (a vector with one entry per grid
# dimension).
#
# The cell index is clamped at both ends of every dimension. A query outside
# the grid (including one that lands marginally below `o` through round-off)
# is therefore extrapolated from the nearest boundary cell instead of indexing
# `θ` at 0 and throwing a BoundsError.
function (Uθ::SimplexValueFunction)(s)
    Δ = (s - Uθ.o) ./ Uθ.δ
    # Multidimensional index of the upper-right vertex of the enclosing cell
    # (clamped lower-left index, plus one).
    i = clamp.(floor.(Int, Δ) .+ 1, 1, size(Uθ.θ) .- 1) .+ 1
    u = 0.0
    # Coordinates of s relative to the cell's lower-left vertex, in cell widths
    s′ = (s - (Uθ.o + Uθ.δ .* (i .- 2))) ./ Uθ.δ
    p = sortperm(s′) # increasing order
    w_tot = 0.0
    # Walk from the upper-right vertex towards the lower-left one, stepping
    # down one dimension at a time in order of increasing coordinate. Each
    # visited vertex receives the increment between consecutive coordinates.
    for j in p
        w = s′[j] - w_tot
        u += w * Uθ.θ[i...]
        i[j] -= 1
        w_tot += w
    end
    # `i` is now the lower-left vertex; it receives the remaining weight.
    u += (1 - w_tot) * Uθ.θ[i...]
    return u
end

# Store `U` as the new grid values. `S` is accepted for interface
# compatibility with the other `fit!` methods and is not read here.
function fit!(Uθ::SimplexValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end

2.6 线性回归与神经网络回归

下面介绍全局方法。线性回归将值函数表示为一组基函数的线性组合，即 $\mathcal{U}_{\theta}(s) = \theta^{\rm T} \beta(s)$；基函数本身不必是线性的，通常是状态的非线性函数。实现如下：

"""
    LinearRegressionValueFunction(β, θ)

Global value-function approximation that is linear in its parameters: the
value at a state is the dot product of the basis vector `β(s)` with `θ`.
"""
mutable struct LinearRegressionValueFunction
    # basis vector function
    β
    # vector of parameters
    θ
end

# Evaluate the approximation at `s` as the dot product of `β(s)` and `θ`.
function (Uθ::LinearRegressionValueFunction)(s)
    return Uθ.β(s) ⋅ Uθ.θ
end

# Refit `θ` so that the approximation matches the targets `U` at the states
# `S`, using the pseudoinverse of the design matrix. Returns `Uθ`.
function fit!(Uθ::LinearRegressionValueFunction, S, U)
    # Design matrix with one row β(s) per state. `reduce(hcat, …)` builds the
    # same matrix as `hcat(…...)` without splatting a collection whose length
    # is only known at run time.
    X = reduce(hcat, [Uθ.β(s) for s in S])'
    Uθ.θ = pinv(X) * U
    return Uθ
end

神经网络回归不必按照线性回归的要求构造一组适当的基函数。相反，使用神经网络来表示值函数。

3. 在线规划

3.1 滚动时域规划（Receding Horizon Planning）

预测控制的优化不是一次离线进行，而是随着采样时刻的前进反复地在线进行，故而该方法面临着确定滚动深度的问题。这种优化虽然得不到理想的全局最优解，但是反复对每一采样时刻的偏差进行优化计算，将可及时地校正控制过程中出现的各种复杂情况。

3.2 Lookahead with Rollouts

"""
    RolloutLookahead(𝒫, π, d)

Online policy that picks actions greedily with respect to values estimated by
rolling out the policy `π` for `d` steps on problem `𝒫`.
"""
struct RolloutLookahead
    # problem
    𝒫
    # rollout policy
    π
    # depth
    d
end

# Sample one transition of `𝒫` from state `s` under action `a` by calling
# `𝒫.TR(s, a)`. Callers in this file destructure the result as
# (next state, reward).
function randstep(𝒫::MDP, s, a)
    return 𝒫.TR(s, a)
end

"""
    rollout(𝒫, s, π, d)

Simulate policy `π` on `𝒫` for `d` steps starting from `s` and return the
discounted sum of the sampled rewards, where the reward at step `t` is
weighted by `𝒫.γ^(t-1)`. Returns `0.0` when `d` is less than 1.
"""
function rollout(𝒫, s, π, d)
    total = 0.0
    state = s
    for step in 1:d
        action = π(state)
        state, reward = randstep(𝒫, state, action)
        total += 𝒫.γ^(step - 1) * reward
    end
    return total
end

# Choose an action for state `s`: estimate the value of a state by a single
# rollout of `π.π` to depth `π.d`, then return the action field of the greedy
# result with respect to that estimate.
function (π::RolloutLookahead)(s)
    U(s) = rollout(π.𝒫, s, π.π, π.d)
    # `greedy` returns an (action, value) pair; only the action is returned.
    return greedy(π.𝒫, U, s).a
end

3.3 正向搜索（Forward Search）

"""
    ForwardSearch(𝒫, d, U)

Online policy that expands every action of `𝒫` to depth `d` and evaluates the
states reached at that depth with `U`.
"""
struct ForwardSearch
    # problem
    𝒫
    # depth
    d
    # value function at depth d
    U
end

"""
    forward_search(𝒫, s, d, U)

Search all action sequences of length `d` from state `s` and return a named
tuple `(a, u)` holding the best first action and its value.

When `d ≤ 0` the result is `(a=nothing, u=U(s))`. Otherwise each action in
`𝒫.𝒜` is scored with `lookahead`, using a recursive depth `d-1` search as the
value of successor states; the first action with the strictly largest score
wins. If `𝒫.𝒜` is empty the result is `(a=nothing, u=-Inf)`.
"""
function forward_search(𝒫, s, d, U)
    d ≤ 0 && return (a=nothing, u=U(s))
    # Value of a successor state: best value found one level deeper.
    next_value(s′) = forward_search(𝒫, s′, d - 1, U).u
    best_a, best_u = nothing, -Inf
    for a in 𝒫.𝒜
        u = lookahead(𝒫, next_value, s, a)
        if u > best_u
            best_a, best_u = a, u
        end
    end
    return (a=best_a, u=best_u)
end

# Choose an action for state `s`: run `forward_search` with the stored
# problem, depth and leaf value function, and return the action it selects.
function (π::ForwardSearch)(s)
    plan = forward_search(π.𝒫, s, π.d, π.U)
    return plan.a
end

3.4 分支定界方法（Branch and Bound）

"""
    BranchAndBound(𝒫, d, Ulo, Qhi)

Online policy that searches to depth `d` like forward search but uses the
bounds `Ulo` and `Qhi` to skip actions that cannot improve on the best found.
"""
struct BranchAndBound
    # problem
    𝒫
    # depth
    d
    # lower bound on value function at depth d
    Ulo
    # upper bound on action value function
    Qhi
end

"""
    branch_and_bound(𝒫, s, d, Ulo, Qhi)

Depth-`d` search from state `s` with pruning; returns a named tuple `(a, u)`
holding the best first action and its value.

When `d ≤ 0` the result is `(a=nothing, u=Ulo(s))`. Otherwise the actions in
`𝒫.𝒜` are visited in decreasing order of `Qhi(s, a)` and scored with
`lookahead`, using a recursive depth `d-1` search as the value of successor
states. The scan stops at the first action whose upper bound is below the best
value found so far. If `𝒫.𝒜` is empty the result is `(a=nothing, u=-Inf)`.
"""
function branch_and_bound(𝒫, s, d, Ulo, Qhi)
    d ≤ 0 && return (a=nothing, u=Ulo(s))
    next_value(s′) = branch_and_bound(𝒫, s′, d - 1, Ulo, Qhi).u
    best_a, best_u = nothing, -Inf
    ordered = sort(𝒫.𝒜, by=a->Qhi(s,a), rev=true)
    for a in ordered
        # Actions are sorted by upper bound, so once one bound falls below
        # the incumbent every remaining action can be pruned as well.
        Qhi(s, a) < best_u && break
        u = lookahead(𝒫, next_value, s, a)
        if u > best_u
            best_a, best_u = a, u
        end
    end
    return (a=best_a, u=best_u)
end

# Choose an action for state `s`: run `branch_and_bound` with the stored
# problem, depth and bounds, and return the action it selects.
function (π::BranchAndBound)(s)
    plan = branch_and_bound(π.𝒫, s, π.d, π.Ulo, π.Qhi)
    return plan.a
end

3.5 稀疏采样

"""
    SparseSampling(𝒫, d, m, U)

Online policy that searches to depth `d`, estimating each action's value from
`m` sampled transitions and evaluating states at depth `d` with `U`.
"""
struct SparseSampling
    # problem
    𝒫
    # depth
    d
    # number of samples
    m
    # value function at depth d
    U
end

"""
    sparse_sampling(𝒫, s, d, m, U)

Depth-`d` sampled search from state `s`; returns a named tuple `(a, u)`
holding the best first action and its estimated value.

When `d ≤ 0` the result is `(a=nothing, u=U(s))`. Otherwise, for each action
in `𝒫.𝒜`, `m` transitions are drawn with `randstep` and the action's value is
the average of `r + 𝒫.γ * u′`, where `u′` is the value returned by a recursive
depth `d-1` search from the sampled successor. The first action with the
strictly largest estimate wins.
"""
function sparse_sampling(𝒫, s, d, m, U)
    d ≤ 0 && return (a=nothing, u=U(s))
    best_a, best_u = nothing, -Inf
    for a in 𝒫.𝒜
        estimate = 0.0
        for _ in 1:m
            s′, r = randstep(𝒫, s, a)
            u′ = sparse_sampling(𝒫, s′, d - 1, m, U).u
            # Accumulate the running average one sample at a time.
            estimate += (r + 𝒫.γ*u′) / m
        end
        if estimate > best_u
            best_a, best_u = a, estimate
        end
    end
    return (a=best_a, u=best_u)
end

# Choose an action for state `s`: run `sparse_sampling` with the stored
# problem, depth, sample count and leaf value function, and return the action
# it selects.
function (π::SparseSampling)(s)
    plan = sparse_sampling(π.𝒫, s, π.d, π.m, π.U)
    return plan.a
end

3.6 蒙特卡罗树搜索

"""
    MonteCarloTreeSearch(𝒫, N, Q, d, m, c, U)

State and settings for Monte Carlo tree search on problem `𝒫`: visit counts
`N`, action value estimates `Q`, search depth `d`, number of simulations `m`,
exploration constant `c`, and value function estimate `U`.
"""
struct MonteCarloTreeSearch
    # problem
    𝒫
    # visit counts
    N
    # action value estimates
    Q
    # depth
    d
    # number of simulations
    m
    # exploration constant
    c
    # value function estimate
    U
end

# Choose an action for state `s`: call `simulate!(π, s)` `π.m` times, then
# return the action in `π.𝒫.𝒜` with the largest entry `π.Q[(s, a)]`.
# NOTE(review): the `simulate!(π, s)` method is not defined in this excerpt;
# presumably it is what fills `π.Q` and `π.N` — confirm.
function (π::MonteCarloTreeSearch)(s)
    for _ in 1:π.m
        simulate!(π, s)
    end
    q(a) = π.Q[(s, a)]
    return argmax(q, π.𝒫.𝒜)
end

3.7 启发式搜索

"""
    HeuristicSearch(𝒫, Uhi, d, m)

Online policy that starts from the upper bound `Uhi` on the value function and
refines it with `m` greedy simulations of depth `d` on problem `𝒫`.
"""
struct HeuristicSearch
    # problem
    𝒫
    # upper bound on value function
    Uhi
    # depth
    d
    # number of simulations
    m
end

# Run one greedy simulation of `π.d` steps from state `s`, updating `U` in
# place. At each step the greedy action and value for the current state are
# computed, the value is written to `U` at that state, and the next state is
# sampled with `rand` from `𝒫.T(state, a)`. Returns `nothing`.
function simulate!(π::HeuristicSearch, U, s)
    problem = π.𝒫
    state = s
    for _ in 1:π.d
        a, u = greedy(problem, U, state)
        U[state] = u
        state = rand(problem.T(state, a))
    end
    return nothing
end

# Choose an action for state `s`: initialise a value table from `π.Uhi` over
# every state in `π.𝒫.𝒮`, refine it with `π.m` calls to `simulate!`, and
# return the greedy action at `s` under the refined table.
function (π::HeuristicSearch)(s)
    U = [π.Uhi(x) for x in π.𝒫.𝒮]
    for _ in 1:π.m
        simulate!(π, U, s)
    end
    choice = greedy(π.𝒫, U, s)
    return choice.a
end

3.8 标签启发式搜索

"""
    LabeledHeuristicSearch(𝒫, Uhi, d, δ)

Online policy that refines the upper bound `Uhi` on the value function with
simulations of depth `d` on problem `𝒫`, labelling states as solved using the
gap threshold `δ`.
"""
struct LabeledHeuristicSearch
    # problem
    𝒫
    # upper bound on value function
    Uhi
    # depth
    d
    # gap threshold
    δ
end

# Choose an action for state `s`: initialise a value table from `π.Uhi` over
# every state in `π.𝒫.𝒮`, keep simulating until `s` has been added to the
# `solved` set, and return the greedy action at `s` under the refined table.
# NOTE(review): the four-argument `simulate!(π, U, solved, s)` method is not
# defined in this excerpt; presumably it is what adds states to `solved` —
# confirm.
function (π::LabeledHeuristicSearch)(s)
    # The state space is read from the problem stored on the policy; no
    # variable named `𝒫` is bound inside this method.
    U, solved = [π.Uhi(s) for s in π.𝒫.𝒮], Set()
    while s ∉ solved
        simulate!(π, U, solved, s)
    end
    return greedy(π.𝒫, U, s).a
end

3.9 开环规划/model predictive control

开环规划可提供最佳闭环规划的满意近似，同时通过避免对未来信息的获取进行推理提高了计算效率。过程可表示为 $\max_{a_{1:d}} \mathcal{U}(a_{1:d})$，其中 $\mathcal{U}(a_{1:d})$ 是执行动作序列 $a_{1:d}$ 时的期望回报。

确定性模型预测控制
$\begin{align*} & \max_{a_{1:d}, s_{2:d}} \qquad \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \qquad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$
鲁棒模型预测控制
$\begin{align*} & \max_{a_{1:d}} \qquad \min_{s_{2:d}} \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \quad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$
多预测模型预测控制
$\begin{align*} & \max_{a_{1:d}^{(1:m)}, s_{2:d}^{(1:m)}} \qquad \frac{1}{m} \sum_{i=1}^{m}\sum_{k = 1}^{d} \gamma^{k} R(s_{k}^{(i)}, a_{k}^{(i)}) \\ & {\rm s.t.} \qquad \qquad s_{k+1}^{(i)} = T_{i}(s_{k}^{(i)}, a_{k}^{(i)}), \ k \in 1:d-1, i \in 1:m, \\ & \quad \qquad \qquad \ \ a_{1}^{(i)} = a_{1}^{(j)}, \qquad \qquad i, j \in 1:m. \end{align*}$