读书笔记:Algorithms for Decision Making(6)

读书笔记:Algorithms for Decision Making

上一篇 读书笔记:Algorithms for Decision Making(5)
下一篇 读书笔记:Algorithms for Decision Making(7)



三、序列问题(2)

上文中提及的精确解方法适用于小型离散问题,对于较大状态空间的问题,计算精确解需要极大的内存量,因而考虑近似解的方法。常使用approximate dynamic programming的方法去寻求近似解,进而使用在线方法实现实时计算。


2. 近似值函数

2.1 参数化表示

记值函数的参数化表示为 $\mathcal{U}_{\theta}(s)$。

# Settings for approximate value iteration; consumed by `solve`.
# Immutable: `solve` only reads these fields (the value function object
# itself is updated through `fit!`).
struct ApproximateValueIteration
	Uθ # initial parameterized value function that supports fit!
	S # set of discrete states for performing backups
	k_max # maximum number of iterations
end

# Run approximate value iteration on problem `𝒫`.
# Each of the `M.k_max` rounds backs up the current value estimate at every
# state in `M.S` and then refits `M.Uθ` to those backed-up utilities via
# `fit!`. Returns a `ValueFunctionPolicy` built from `𝒫` and the fitted
# value function.
function solve(M::ApproximateValueIteration, 𝒫::MDP)
	value_fn = M.Uθ
	states = M.S
	for _ in 1:M.k_max
		# Utility targets for this round, one per state in `states`.
		targets = [backup(𝒫, value_fn, s) for s in states]
		fit!(value_fn, states, targets)
	end
	return ValueFunctionPolicy(𝒫, value_fn)
end

接下来提及的所有参数表示均可与上述逼近算法一起使用,且参数表示需要支持 $\mathcal{U}_{\theta}$ 的计算,以及 $\mathcal{U}_{\theta}$ 与 $S$ 中各点效用估计的拟合。

参数化表示分为两类:

  • 局部近似方法,其中 $\theta$ 对应于 $S$ 中状态的值。
  • 全局近似方法,其中 $\theta$ 与 $S$ 中状态的值不直接相关。
    但两者本质上都可以视为一个线性函数逼近,即 $\mathcal{U}_{\theta}(s) = \theta^{\rm T} \beta(s)$。

2.2 最邻近方法

# Value function approximated from the nearest stored states.
# Declared mutable because `fit!` reassigns the `θ` field.
mutable struct NearestNeighborValueFunction
	k # number of neighbors
	d # distance function d(s, s′)
	S # set of discrete states
	θ # vector of values at states in S
end

# Evaluate the approximation at `s`: the mean of the stored values `Uθ.θ`
# at the `Uθ.k` states of `Uθ.S` with the smallest distance `Uθ.d(s, s′)`.
function (Uθ::NearestNeighborValueFunction)(s)
	distances = [Uθ.d(s, s′) for s′ in Uθ.S]
	# Indices of the k closest states, nearest first.
	nearest = sortperm(distances)[1:Uθ.k]
	values = Uθ.θ
	return mean(values[i] for i in nearest)
end

# Store the utilities `U` as the new parameter vector and return the
# updated value function. The `S` argument is accepted but not used here.
fit!(Uθ::NearestNeighborValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.3 核光滑方法

# Value function approximated by kernel-weighted averaging of stored values.
# Declared mutable because `fit!` reassigns the `θ` field.
mutable struct LocallyWeightedValueFunction
	k # kernel function k(s, s′)
	S # set of discrete states
	θ # vector of values at states in S
end

# Evaluate the approximation at `s`: kernel values `Uθ.k(s, s′)` over the
# states in `Uθ.S` are scaled to unit 1-norm and used as weights in a dot
# product with the stored values `Uθ.θ`.
function (Uθ::LocallyWeightedValueFunction)(s)
	kernel_values = [Uθ.k(s, s′) for s′ in Uθ.S]
	weights = normalize(kernel_values, 1)
	return dot(Uθ.θ, weights)
end

# Store the utilities `U` as the new parameter vector and return the
# updated value function. The `S` argument is accepted but not used here.
fit!(Uθ::LocallyWeightedValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.4 线性插值

在这里插入图片描述

# Value function defined by multilinear interpolation over a regular grid.
# Declared mutable because `fit!` reassigns the `θ` field.
# The evaluator indexes `θ` with one index per state dimension.
mutable struct MultilinearValueFunction
	o # position of lower-left corner
	δ # vector of widths
	θ # vector of values at states in S
end

# Evaluate the approximation at `s` by multilinear interpolation: a weighted
# sum of the grid values at the 2^d vertices of the cell containing `s`,
# where d = length(s).
function (Uθ::MultilinearValueFunction)(s)
	origin, widths, values = Uθ.o, Uθ.δ, Uθ.θ
	# Position of s in grid units, measured from the lower-left corner.
	Δ = (s - origin) ./ widths
	# 1-based index of the cell's lower-left vertex, capped so that the
	# `+ 1` neighbour below does not exceed the size of `values`.
	lower = min.(floor.(Int, Δ) .+ 1, size(values) .- 1)
	corner = similar(lower)
	ndim = length(s)
	total = 0.0
	for mask in 0:(2^ndim - 1)
		w = 1.0
		for j in 1:ndim
			# Bit j of `mask` picks the upper (set) or lower (clear) vertex on axis j.
			if (mask >> (j - 1)) & 1 == 1
				corner[j] = lower[j] + 1
				w *= Δ[j] - lower[j] + 1
			else
				corner[j] = lower[j]
				w *= lower[j] - Δ[j]
			end
		end
		total += values[corner...] * w
	end
	return total
end

# Store the utilities `U` as the new grid values and return the updated
# value function. The `S` argument is accepted but not used here.
fit!(Uθ::MultilinearValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.5 单纯形插值

# Value function defined by simplex interpolation over a regular grid.
# Declared mutable because `fit!` reassigns the `θ` field.
# The evaluator indexes `θ` with one index per state dimension.
mutable struct SimplexValueFunction
	o # position of lower-left corner
	δ # vector of widths
	θ # vector of values at states in S
end

# Evaluate the approximation at `s` by simplex interpolation: start at the
# upper-right vertex of the enclosing cell and step down one axis at a time,
# in increasing order of the point's offset within the cell, accumulating
# weighted grid values along the way.
function (Uθ::SimplexValueFunction)(s)
	o, δ, θ = Uθ.o, Uθ.δ, Uθ.θ
	Δ = (s - o) ./ δ
	# Multidimensional index of the upper-right vertex of the cell.
	corner = min.(floor.(Int, Δ) .+ 1, size(θ) .- 1) .+ 1
	# Offset of s from the cell's lower-left vertex, in units of cell width.
	offset = (s - (o + δ .* (corner .- 2))) ./ δ
	order = sortperm(offset) # increasing order
	value = 0.0
	used = 0.0 # total weight handed out so far
	for j in order
		w = offset[j] - used
		value += w * θ[corner...]
		corner[j] -= 1
		used += w
	end
	# The remaining weight goes to the last vertex reached.
	value += (1 - used) * θ[corner...]
	return value
end

# Store the utilities `U` as the new grid values and return the updated
# value function. The `S` argument is accepted but not used here.
fit!(Uθ::SimplexValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.6 线性回归与神经网络回归

下面介绍全局方法。线性回归需要一组基函数 $\beta(s)$(基函数本身不必是线性的),值函数表示为这些基函数的线性组合,即对参数 $\theta$ 是线性的,如下:

# Value function expressed as a dot product of basis features and parameters.
# Declared mutable because `fit!` reassigns the `θ` field.
mutable struct LinearRegressionValueFunction
	β # basis vector function
	θ # vector of parameters
end

# Evaluate the approximation at `s`: dot product of the basis features
# `Uθ.β(s)` with the parameter vector `Uθ.θ`.
(Uθ::LinearRegressionValueFunction)(s) = dot(Uθ.β(s), Uθ.θ)

# Fit the parameters to the utilities `U` observed at the states `S`.
# The basis features of each state form one row of the design matrix `X`;
# the parameters are set to pinv(X) * U. Returns the updated value function.
function fit!(Uθ::LinearRegressionValueFunction, S, U)
	features = [Uθ.β(s) for s in S]
	# Features are stacked as columns, then transposed so each state is a row.
	X = hcat(features...)'
	Uθ.θ = pinv(X) * U
	return Uθ
end

神经网络回归不必按照线性回归的要求构造一组适当的基函数。相反,使用神经网络来表示值函数。

3. 在线规划

3.1 滚动时域规划(Receding Horizon Planning)

预测控制的优化不是一次离线进行,而是随着采样时刻的前进反复地在线进行,故而该方法面临着确定滚动深度的问题。这种优化虽然得不到理想的全局最优解,但是反复对每一采样时刻的偏差进行优化计算,将可及时地校正控制过程中出现的各种复杂情况。

3.2 Lookahead with Rollouts

# Online policy that scores actions using rollouts of a fixed depth.
# Calling an instance on a state runs `rollout` with these fields.
struct RolloutLookahead
	𝒫 # problem
	π # rollout policy
	d # depth
end

# Take one step from state `s` under action `a` by calling the problem's
# `TR` field. Callers in this file unpack the result as (next state, reward).
function randstep(𝒫::MDP, s, a)
	return 𝒫.TR(s, a)
end

# Follow policy `π` from state `s` for `d` steps, stepping with `randstep`,
# and return the sum of rewards with step k (counting from 0) weighted by
# 𝒫.γ^k.
function rollout(𝒫, s, π, d)
	total = 0.0
	for k in 0:(d - 1)
		action = π(s)
		s, reward = randstep(𝒫, s, action)
		total += 𝒫.γ^k * reward
	end
	return total
end

# Choose an action at state `s`: build a utility estimate from a depth-`π.d`
# rollout of the policy `π.π`, pass it to `greedy`, and return the action
# field of the result.
function (π::RolloutLookahead)(s)
	# Each call runs a fresh rollout from the queried state.
	U(s) = rollout(π.𝒫, s, π.π, π.d)
	# Fixed: the original ended in a dangling `.` with no field name, which
	# does not parse. The other policies in this file return the `a` field.
	return greedy(π.𝒫, U, s).a
end

3.3 正向搜索(Forward Search)

# Online policy that picks actions by depth-limited forward search.
# Calling an instance on a state runs `forward_search` with these fields.
struct ForwardSearch
	𝒫 # problem
	d # depth
	U # value function at depth d
end

# Depth-limited exhaustive search from state `s`.
# Returns a named tuple `(a, u)`: the best action found and its value. At
# depth 0 the action is `nothing` and the value is `U(s)`. Ties keep the
# earlier action in `𝒫.𝒜` because the comparison is strict.
function forward_search(𝒫, s, d, U)
	d ≤ 0 && return (a=nothing, u=U(s))
	# Value of a successor state: search one level shallower from it.
	next_value = s′ -> forward_search(𝒫, s′, d - 1, U).u
	incumbent = (a=nothing, u=-Inf)
	for action in 𝒫.𝒜
		value = lookahead(𝒫, next_value, s, action)
		if value > incumbent.u
			incumbent = (a=action, u=value)
		end
	end
	return incumbent
end

# Choose an action at state `s`: the action field of the forward search
# result computed with the stored problem, depth and value function.
function (π::ForwardSearch)(s)
	result = forward_search(π.𝒫, s, π.d, π.U)
	return result.a
end

3.4 分支定界方法(Branch and Bound)

# Online policy that picks actions by depth-limited branch and bound.
# Calling an instance on a state runs `branch_and_bound` with these fields.
struct BranchAndBound
	𝒫 # problem
	d # depth
	Ulo # lower bound on value function at depth d
	Qhi # upper bound on action value function
end

# Depth-limited search from state `s` with pruning.
# Actions are tried in decreasing order of `Qhi(s, a)`; as soon as an
# action's bound falls below the best value found so far, the search
# returns. Returns a named tuple `(a, u)`; at depth 0 the action is
# `nothing` and the value is `Ulo(s)`.
function branch_and_bound(𝒫, s, d, Ulo, Qhi)
	d ≤ 0 && return (a=nothing, u=Ulo(s))
	# Value of a successor state: search one level shallower from it.
	next_value = s′ -> branch_and_bound(𝒫, s′, d - 1, Ulo, Qhi).u
	incumbent = (a=nothing, u=-Inf)
	ordered = sort(𝒫.𝒜, by=a -> Qhi(s, a), rev=true)
	for action in ordered
		# The list is sorted by bound, so no later action can do better either.
		Qhi(s, action) < incumbent.u && return incumbent
		value = lookahead(𝒫, next_value, s, action)
		if value > incumbent.u
			incumbent = (a=action, u=value)
		end
	end
	return incumbent
end

# Choose an action at state `s`: the action field of the branch-and-bound
# result computed with the stored problem, depth and bounds.
function (π::BranchAndBound)(s)
	result = branch_and_bound(π.𝒫, s, π.d, π.Ulo, π.Qhi)
	return result.a
end

3.5 稀疏采样

# Online policy that picks actions by depth-limited sparse sampling.
# Calling an instance on a state runs `sparse_sampling` with these fields.
struct SparseSampling
	𝒫 # problem
	d # depth
	m # number of samples
	U # value function at depth d
end

# Depth-limited search from state `s` that estimates each action's value
# from `m` transitions drawn with `randstep`.
# Returns a named tuple `(a, u)`; at depth 0 the action is `nothing` and the
# value is `U(s)`.
function sparse_sampling(𝒫, s, d, m, U)
	d ≤ 0 && return (a=nothing, u=U(s))
	best = (a=nothing, u=-Inf)
	for action in 𝒫.𝒜
		estimate = 0.0
		for _ in 1:m
			s′, reward = randstep(𝒫, s, action)
			# Only the value of the recursive result is needed, not its action.
			future = sparse_sampling(𝒫, s′, d - 1, m, U).u
			estimate += (reward + 𝒫.γ * future) / m
		end
		if estimate > best.u
			best = (a=action, u=estimate)
		end
	end
	return best
end

# Choose an action at state `s`: the action field of the sparse sampling
# result computed with the stored problem, depth, sample count and value
# function.
function (π::SparseSampling)(s)
	result = sparse_sampling(π.𝒫, s, π.d, π.m, π.U)
	return result.a
end

3.6 蒙特卡罗树搜索

# Online policy based on Monte Carlo tree search.
# The policy call indexes `Q` with `(state, action)` tuples.
struct MonteCarloTreeSearch
	𝒫 # problem
	N # visit counts
	Q # action value estimates
	d # depth
	m # number of simulations
	c # exploration constant
	U # value function estimate
end

# Choose an action at state `s`: run `π.m` simulations from `s`, then return
# the action in `π.𝒫.𝒜` with the largest stored estimate `π.Q[(s, a)]`.
function (π::MonteCarloTreeSearch)(s)
	for _ in 1:π.m
		simulate!(π, s)
	end
	return argmax(π.𝒫.𝒜) do a
		π.Q[(s, a)]
	end
end

3.7 启发式搜索

# Online policy based on heuristic search.
# The policy call initializes its utility estimates from `Uhi` and refines
# them with `m` simulations of depth `d`.
struct HeuristicSearch
	𝒫 # problem
	Uhi # upper bound on value function
	d # depth
	m # number of simulations
end

# Run one simulation of `π.d` steps from state `s`, updating `U` in place.
# At each step the greedy action and value for the current state are
# computed, the value is written to `U[s]`, and the next state is drawn
# from `𝒫.T(s, a)`. Returns `nothing`.
function simulate!(π::HeuristicSearch, U, s)
	problem = π.𝒫
	for _ in 1:π.d
		a, u = greedy(problem, U, s)
		U[s] = u
		s = rand(problem.T(s, a))
	end
	return nothing
end

# Choose an action at state `s`: start from the upper bound `π.Uhi` at every
# state in `π.𝒫.𝒮`, refine the estimates with `π.m` simulations from `s`,
# then return the greedy action at `s`.
function (π::HeuristicSearch)(s)
	problem = π.𝒫
	U = [π.Uhi(s′) for s′ in problem.𝒮]
	for _ in 1:π.m
		simulate!(π, U, s)
	end
	return greedy(problem, U, s).a
end

3.8 标签启发式搜索

# Online policy based on labeled heuristic search.
# The policy call initializes its utility estimates from `Uhi` and keeps
# simulating until the queried state is in its solved set.
struct LabeledHeuristicSearch
	𝒫 # problem
	Uhi # upper bound on value function
	d # depth
	δ # gap threshold
end

# Choose an action at state `s`: start from the upper bound `π.Uhi` at every
# state of the problem, call `simulate!` until `s` is in the solved set,
# then return the greedy action at `s`.
function (π::LabeledHeuristicSearch)(s)
	# Fixed: the states must come from the problem stored on the policy.
	# A bare `𝒫` is not bound inside this method and raises UndefVarError
	# unless an unrelated global of that name exists.
	U, solved = [π.Uhi(s) for s in π.𝒫.𝒮], Set()
	while s ∉ solved
		simulate!(π, U, solved, s)
	end
	return greedy(π.𝒫, U, s).a
end

3.9 开环规划/model predictive control

开环规划可提供最佳闭环规划的满意近似,同时通过避免对未来信息的获取进行推理提高了计算效率。过程可表示为 $\max_{a_{1:d}} \mathcal{U}(a_{1:d})$,其中 $\mathcal{U}(a_{1:d})$ 是执行动作序列 $a_{1:d}$ 时的期望回报。

  • 确定性模型预测控制
    $$\begin{align*} & \max_{a_{1:d}, s_{2:d}} \qquad \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \qquad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$$
  • 鲁棒模型预测控制
    $$\begin{align*} & \max_{a_{1:d}} \qquad \min_{s_{2:d}} \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \quad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$$
  • 多预测模型预测控制
    $$\begin{align*} & \max_{a_{1:d}^{(1:m)}, s_{2:d}^{(1:m)}} \qquad \frac{1}{m} \sum_{i=1}^{m}\sum_{k = 1}^{d} \gamma^{k} R(s_{k}^{(i)}, a_{k}^{(i)}) \\ & {\rm s.t.} \qquad \qquad s_{k+1}^{(i)} = T_{i}(s_{k}^{(i)}, a_{k}^{(i)}), \ k \in 1:d-1, \ i \in 1:m, \\ & \quad \qquad \qquad \ \ a_{1}^{(i)} = a_{1}^{(j)}, \qquad \qquad i, j \in 1:m. \end{align*}$$

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值