读书笔记:Algorithms for Decision Making(7)

读书笔记:Algorithms for Decision Making

上一篇 读书笔记:Algorithms for Decision Making(6)
下一篇 读书笔记:Algorithms for Decision Making(8)



三、序列问题(3)

策略搜索直接在策略空间中进行搜索,而无需先计算值函数。策略空间的维数通常低于状态空间,因此往往可以更高效地搜索。本部分首先讨论在给定初始状态分布下估计策略价值的方法,然后讨论不依赖梯度估计的搜索方法以及策略梯度方法,最后介绍 Actor-Critic 方法,它利用值函数的估计来指导策略优化。


4. 策略搜索

4.1 近似策略评估

在已知初始状态分布 $b(s)$ 的情况下,可计算策略 $\pi$ 的预期折扣回报:$$\mathcal{U}(\pi) = \sum_{s} b(s)\, \mathcal{U}^{\pi}(s).$$ 当状态空间很大或连续时,可改写为关于轨迹 $\tau$ 的期望:$$\mathcal{U}(\pi) = \mathbb{E}_{\tau}[R(\tau)] = \int p_{\pi}(\tau) R(\tau) \, {\rm d}\tau.$$ 蒙特卡罗策略评估通过采样 $m$ 条轨迹并对其回报取平均来近似上述期望,即 $\mathcal{U}(\pi) \approx \frac{1}{m} \sum_{i=1}^{m} R(\tau^{(i)})$。

"""
    MonteCarloPolicyEvaluation(𝒫, b, d, m)

Settings for estimating the value of a policy by averaging sampled rollouts.
"""
struct MonteCarloPolicyEvaluation
    𝒫  # the decision problem being evaluated
    b  # distribution over initial states (sampled with `rand`)
    d  # rollout depth
    m  # how many rollouts are averaged
end

# Estimate the value of policy `π`: draw `U.m` initial states from `U.b`, pass each one
# to `rollout` together with the problem, the policy and the depth `U.d`, and return the
# mean of the rollout results.
function (U::MonteCarloPolicyEvaluation)(π)
    sample_return() = rollout(U.𝒫, rand(U.b), π, U.d)
    return mean(sample_return() for _ in 1:U.m)
end

# Evaluate a parameterized policy: bind the parameters `θ` so that `π(θ, s)` becomes a
# function of the state only, then reuse the single-argument evaluation method.
function (U::MonteCarloPolicyEvaluation)(π, θ)
    πθ(s) = π(θ, s)
    return U(πθ)
end

4.2 局部搜索

"""
    HookeJeevesPolicySearch(θ, α, c, ϵ)

Settings for the Hooke-Jeeves coordinate search over policy parameters.
"""
struct HookeJeevesPolicySearch
    θ  # starting parameter vector
    α  # initial step size
    c  # factor the step size is multiplied by when no step improves
    ϵ  # the search stops once the step size is no longer above this value
end

"""
    optimize(M::HookeJeevesPolicySearch, π, U)

Search for policy parameters that maximize `U(π, θ)` by coordinate-wise probing.

Starting from a copy of `M.θ` (the stored vector is not mutated), each pass evaluates a
step of `+α` and `-α` along every coordinate. If some probe is strictly better than the
current utility, the single best probe is accepted; otherwise the step size is
multiplied by `M.c`. The loop ends once the step size is no longer greater than `M.ϵ`.

Returns the final parameter vector.
"""
function optimize(M::HookeJeevesPolicySearch, π, U)
    θ = copy(M.θ)
    probe = similar(M.θ)
    step = M.α
    u_current = U(π, θ)
    dims = length(θ)
    while step > M.ϵ
        copyto!(probe, θ)
        best_i, best_sgn, best_u = 0, 0, u_current
        for i in 1:dims, sgn in (-1, 1)
            probe[i] = θ[i] + sgn*step
            u_probe = U(π, probe)
            if u_probe > best_u
                best_i, best_sgn, best_u = i, sgn, u_probe
            end
            # Undo the perturbation so only one coordinate differs per probe.
            probe[i] = θ[i]
        end
        if best_i == 0
            # No neighbor improved: shrink the step.
            step *= M.c
        else
            θ[best_i] += best_sgn*step
            u_current = best_u
        end
    end
    return θ
end

4.3 遗传算法

"""
    GeneticPolicySearch(θs, σ, m_elite, k_max)

Settings for a genetic search over policy parameters.
"""
struct GeneticPolicySearch
    θs       # starting population of parameter vectors
    σ        # standard deviation of the perturbation noise
    m_elite  # how many top-ranked samples may act as parents
    k_max    # number of generations
end

"""
    optimize(M::GeneticPolicySearch, π, U)

Run `M.k_max` generations of a genetic search that maximizes `U(π, θ)`.

In every generation the population is scored and ranked from best to worst. The next
population has the same size: all but one member are produced by picking one of the top
`M.m_elite` members uniformly at random and adding normal noise scaled by `M.σ`; the
best-scoring member is appended unchanged as the last element.

Returns the last element of the final population.
"""
function optimize(M::GeneticPolicySearch, π, U)
    population = M.θs
    dim = length(first(population))
    popsize = length(population)
    for _ in 1:M.k_max
        scores = [U(π, θ) for θ in population]
        ranking = sortperm(scores, rev=true)
        champion = population[ranking[1]]
        children = map(1:(popsize - 1)) do _
            parent = population[ranking[rand(1:M.m_elite)]]
            return parent + M.σ .* randn(dim)
        end
        # The best member of this generation survives unchanged, stored last.
        push!(children, champion)
        population = children
    end
    return last(population)
end

4.4 交叉熵方法

该方法想要寻找使策略期望效用最大的搜索分布参数:$$\psi^{\ast} = \arg\max_{\psi} \mathbb{E}_{\theta \sim p(\cdot \mid \psi)} [\mathcal{U}(\pi_{\theta})].$$ 事实上,直接求解上式很难实现,一般使用采样估计的方法迭代求解:每次迭代从当前分布中采样若干参数,保留效用最高的精英样本,再用这些样本重新拟合分布。

"""
    CrossEntropyPolicySearch(p, m, m_elite, k_max)

Settings for the cross-entropy method applied to policy parameters.
"""
struct CrossEntropyPolicySearch
    p        # starting search distribution over parameters
    m        # samples drawn per iteration
    m_elite  # number of best samples used to refit the distribution
    k_max    # number of iterations
end

"""
    optimize_dist(M::CrossEntropyPolicySearch, π, U)

Iteratively refit the search distribution toward high-utility parameters.

Each of the `M.k_max` iterations draws `M.m` samples (one per column) from the current
distribution, scores every column with `U(π, θ)`, keeps the `M.m_elite` highest-scoring
columns, and fits a distribution of the same type to them with `Distributions.fit`.

Returns the final distribution.
"""
function optimize_dist(M::CrossEntropyPolicySearch, π, U)
    dist = M.p
    for _ in 1:M.k_max
        samples = rand(dist, M.m)
        scores = [U(π, samples[:, col]) for col in 1:M.m]
        # `sortperm` is ascending, so the best samples are at the tail.
        ascending = sortperm(scores)
        elite_cols = ascending[(M.m - M.m_elite + 1):M.m]
        dist = Distributions.fit(typeof(dist), samples[:, elite_cols])
    end
    return dist
end

"""
    optimize(M, π, U)

Generic fallback for distribution-based searches: run `optimize_dist(M, π, U)` and
return the mode of the resulting distribution as the parameter estimate.
"""
optimize(M, π, U) = Distributions.mode(optimize_dist(M, π, U))

4.5 进化策略

"""
    EvolutionStrategies(D, ψ, ∇logp, m, α, k_max)

Settings for an evolution-strategies search over the parameters `ψ` of a search
distribution.
"""
struct EvolutionStrategies
    D      # builds a search distribution from its parameters, used as `D(ψ)`
    ψ      # starting parameters of the search distribution
    ∇logp  # gradient of the log search likelihood, called as `∇logp(ψ, θ)`
    m      # samples drawn per iteration
    α      # step factor applied to the gradient estimate
    k_max  # number of iterations
end

"""
    evolution_strategy_weights(m)

Return a length-`m` vector of rank-based weights.

The raw weight for rank `i` is `max(0, log(m/2 + 1) - log(i))`. The raw weights are
divided by their sum and then shifted down by `1/m`, so the result sums to zero (up to
rounding) with the largest weight at rank 1.
"""
function evolution_strategy_weights(m)
    raw = map(i -> max(0, log(m/2+1) - log(i)), 1:m)
    return raw ./ sum(raw) .- 1/m
end

"""
    optimize_dist(M::EvolutionStrategies, π, U)

Improve the search-distribution parameters `ψ` with rank-weighted gradient steps.

Each of the `M.k_max` iterations draws `M.m` samples (one per column) from `D(ψ)`,
scores every column with `U(π, θ)`, ranks the samples from best to worst, and pairs
them in that order with the weights from `evolution_strategy_weights(m)`. The weighted
sum of `∇logp(ψ, θ)` over the samples is the step direction, scaled by `M.α`.

Returns the distribution `D(ψ)` built from the final parameters.
"""
function optimize_dist(M::EvolutionStrategies, π, U)
    D, ψ, m, ∇logp, α = M.D, M.ψ, M.m, M.∇logp, M.α
    ws = evolution_strategy_weights(m)
    for k in 1:M.k_max
        θs = rand(D(ψ), m)
        us = [U(π, θs[:,i]) for i in 1:m]
        # Best sample first, so it is paired with the first (largest) weight.
        sp = sortperm(us, rev=true)
        # Rank-weighted estimate of the search gradient with respect to ψ.
        ∇ = sum(w.*∇logp(ψ, θs[:,i]) for (w,i) in zip(ws,sp))
        ψ += α.*∇
    end
    return D(ψ)
end

5. 策略梯度

5.1 有限差分

在策略优化的背景下,需要估计由 $\theta$ 参数化的策略的期望效用关于 $\theta$ 的梯度。有限差分法沿每个坐标方向施加微小扰动来近似各偏导数:$$\begin{align*} \nabla\mathcal{U}(\theta) & = \begin{bmatrix} \frac{\partial \mathcal{U}}{\partial \theta_{1}}(\theta), \cdots, \frac{\partial \mathcal{U}}{\partial \theta_{n}}(\theta) \end{bmatrix} \\ & \approx \begin{bmatrix} \frac{\mathcal{U}(\theta + \delta e^{(1)}) - \mathcal{U}(\theta)}{\delta}, \cdots, \frac{\mathcal{U}(\theta + \delta e^{(n)}) - \mathcal{U}(\theta)}{\delta} \end{bmatrix}, \end{align*}$$ 其中 $e^{(i)}$ 是第 $i$ 个标准基向量,$\delta > 0$ 为步长。

"""
    FiniteDifferenceGradient(𝒫, b, d, m, δ)

Settings for a finite-difference estimate of the policy gradient.
"""
struct FiniteDifferenceGradient
    𝒫  # the decision problem (its `γ` field is read as the discount factor)
    b  # distribution over initial states
    d  # simulation depth
    m  # trajectories averaged per utility estimate
    δ  # size of the perturbation along each coordinate
end

"""
    gradient(M::FiniteDifferenceGradient, π, θ)

Estimate the gradient of the expected discounted return with respect to `θ` using
forward differences.

For each coordinate `i`, the utility at `θ` perturbed by `M.δ` along `i` and the utility
at `θ` itself are each estimated as the mean discounted return of `M.m` fresh
simulations; the difference is divided by `M.δ`. Every utility estimate draws its own
samples, so the baseline `U(θ)` is re-estimated for each coordinate.

Returns a vector with one entry per coordinate of `θ`.
"""
function gradient(M::FiniteDifferenceGradient, π, θ)
    problem, δ = M.𝒫, M.δ
    γ = problem.γ
    dims = length(θ)
    step_along(i) = [i == k ? δ : 0.0 for k in 1:dims]
    discounted_return(τ) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ))
    function utility(ϑ)
        return mean(
            discounted_return(simulate(problem, rand(M.b), s -> π(ϑ, s), M.d))
            for _ in 1:M.m
        )
    end
    differences = map(i -> utility(θ + step_along(i)) - utility(θ), 1:dims)
    return differences ./ δ
end

5.2 回归梯度

"""
    RegressionGradient(𝒫, b, d, m, δ)

Settings for a regression-based estimate of the policy gradient.
"""
struct RegressionGradient
    𝒫  # the decision problem (its `γ` field is read as the discount factor)
    b  # distribution over initial states
    d  # simulation depth
    m  # number of random perturbations
    δ  # length of each perturbation vector
end

"""
    gradient(M::RegressionGradient, π, θ)

Estimate the policy gradient by least-squares regression on random perturbations.

`M.m` perturbation vectors are drawn as normal samples rescaled to 2-norm `M.δ`. For
each one, a single simulated discounted return at `θ + Δθ` minus a single simulated
return at `θ` gives the observed utility change. The gradient estimate is the
pseudoinverse of the matrix whose rows are the perturbations, applied to the vector of
utility changes.
"""
function gradient(M::RegressionGradient, π, θ)
    problem, δ = M.𝒫, M.δ
    γ = problem.γ
    perturbations = map(_ -> δ .* normalize(randn(length(θ)), 2), 1:M.m)
    discounted_return(τ) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ))
    function sampled_utility(ϑ)
        return discounted_return(simulate(problem, rand(M.b), s -> π(ϑ, s), M.d))
    end
    utility_changes = map(Δ -> sampled_utility(θ + Δ) - sampled_utility(θ), perturbations)
    # Columns of the hcat are perturbations; the adjoint puts one perturbation per row.
    return pinv(adjoint(reduce(hcat, perturbations))) * utility_changes
end

5.3 似然比

"""
    LikelihoodRatioGradient(𝒫, b, d, m, ∇logπ)

Settings for the likelihood-ratio estimate of the policy gradient.
"""
struct LikelihoodRatioGradient
    𝒫      # the decision problem (its `γ` field is read as the discount factor)
    b      # distribution over initial states
    d      # simulation depth
    m      # number of sampled trajectories
    ∇logπ  # gradient of the policy log-likelihood, called as `∇logπ(θ, a, s)`
end

"""
    gradient(M::LikelihoodRatioGradient, π, θ)

Estimate the policy gradient with the likelihood-ratio method.

For each of `M.m` simulated trajectories, the sum of `∇logπ(θ, a, s)` over all steps is
multiplied by the trajectory's total discounted return. The estimate is the mean of
these products.
"""
function gradient(M::LikelihoodRatioGradient, π, θ)
    problem, ∇logπ = M.𝒫, M.∇logπ
    γ = problem.γ
    policy(s) = π(θ, s)
    discounted_return(τ) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ))
    score(τ) = sum(∇logπ(θ, a, s) for (s, a) in τ)
    sample_gradient(τ) = score(τ)*discounted_return(τ)
    return mean(
        sample_gradient(simulate(problem, rand(M.b), policy, M.d)) for _ in 1:M.m
    )
end

5.4 Reward-to-Go

"""
    RewardToGoGradient(𝒫, b, d, m, ∇logπ)

Settings for the reward-to-go estimate of the policy gradient.
"""
struct RewardToGoGradient
    𝒫      # the decision problem (its `γ` field is read as the discount factor)
    b      # distribution over initial states
    d      # simulation depth
    m      # number of sampled trajectories
    ∇logπ  # gradient of the policy log-likelihood, called as `∇logπ(θ, a, s)`
end

"""
    gradient(M::RewardToGoGradient, π, θ)

Estimate the policy gradient using reward-to-go weighting.

For each of `M.m` simulated trajectories, every step's `∇logπ(θ, a, s)` is weighted by
the discounted sum of rewards from that step onward, where the reward at absolute step
`k` is discounted by `γ^(k-1)`. The estimate is the mean over trajectories of these
weighted sums.
"""
function gradient(M::RewardToGoGradient, π, θ)
    problem, depth, ∇logπ = M.𝒫, M.d, M.∇logπ
    γ = problem.γ
    policy(s) = π(θ, s)
    function reward_to_go(τ, j)
        # `zip(j:depth, ...)` keeps the absolute step index for discounting.
        return sum(r*γ^(k-1) for (k, (s, a, r)) in zip(j:depth, τ[j:end]))
    end
    function sample_gradient(τ)
        return sum(∇logπ(θ, a, s)*reward_to_go(τ, j) for (j, (s, a, r)) in enumerate(τ))
    end
    return mean(
        sample_gradient(simulate(problem, rand(M.b), policy, depth)) for _ in 1:M.m
    )
end

5.5 基线校正(Baseline Subtraction)

"""
    BaselineSubtractionGradient(𝒫, b, d, m, ∇logπ)

Settings for the likelihood-ratio policy gradient with baseline subtraction.
"""
struct BaselineSubtractionGradient
    𝒫      # the decision problem (its `γ` field is read as the discount factor)
    b      # distribution over initial states
    d      # simulation depth
    m      # number of sampled trajectories
    ∇logπ  # gradient of the policy log-likelihood, called as `∇logπ(θ, a, s)`
end

"""
    gradient(M::BaselineSubtractionGradient, π, θ)

Estimate the policy gradient with reward-to-go weighting and a per-component baseline.

`M.m` trajectories are simulated once and reused for both the baseline and the gradient.
With `ℓ(a, s, k) = ∇logπ(θ, a, s)*γ^(k-1)`, the baseline of a trajectory is the
component-wise ratio of the sum of `ℓ.^2 * R` to the sum of `ℓ.^2`, where `R(τ, k)` is
the discounted reward sum from step `k` onward. The baselines are averaged over
trajectories, and the gradient estimate is the mean over trajectories of the sum of
`ℓ .* (R .- baseline)`.
"""
function gradient(M::BaselineSubtractionGradient, π, θ)
    𝒫, b, d, m, ∇logπ, γ = M.𝒫, M.b, M.d, M.m, M.∇logπ, M.𝒫.γ
    πθ(s) = π(θ, s)
    # Discounted score term for the action taken at step k.
    ℓ(a, s, k) = ∇logπ(θ, a, s)*γ^(k-1)
    # Reward-to-go from step k; the discount exponent restarts at step k.
    R(τ, k) = sum(r*γ^(j-1) for (j,(s,a,r)) in enumerate(τ[k:end]))
    numer(τ) = sum(ℓ(a,s,k).^2*R(τ,k) for (k,(s,a,r)) in enumerate(τ))
    denom(τ) = sum(ℓ(a,s,k).^2 for (k,(s,a)) in enumerate(τ))
    base(τ) = numer(τ) ./ denom(τ)
    trajs = [simulate(𝒫, rand(b), πθ, d) for i in 1:m]
    rbase = mean(base(τ) for τ in trajs)
    ∇U(τ) = sum(ℓ(a,s,k).*(R(τ,k).-rbase) for (k,(s,a,r)) in enumerate(τ))
    return mean(∇U(τ) for τ in trajs)
end

6. Actor-Critic方法

6.1 Actor-Critic

具体来讲,由 $\theta$ 参数化的策略 $\pi_{\theta}$ 是 Actor;而由 $\phi$ 参数化的值函数 $\mathcal{U}_{\phi}(s)$、$\mathcal{Q}_{\phi}(s,a)$ 或优势函数 $\mathcal{A}_{\phi}(s,a)$ 则作为 Critic,用于评价并指导 Actor 的更新。

"""
    ActorCritic(𝒫, b, d, m, ∇logπ, U, ∇U)

Settings for a basic actor-critic gradient estimate.
"""
struct ActorCritic
    𝒫      # the decision problem (its `γ` field is read as the discount factor)
    b      # distribution over initial states
    d      # simulation depth
    m      # number of sampled trajectories
    ∇logπ  # gradient of the policy log-likelihood, called as `∇logπ(θ, a, s)`
    U      # parameterized value function, called as `U(ϕ, s)`
    ∇U     # gradient of the value function, called as `∇U(ϕ, s)`
end

"""
    gradient(M::ActorCritic, π, θ, ϕ)

Compute actor and critic gradient estimates from `M.m` simulated trajectories.

The actor term sums, over all steps but the last, `∇logπ(θ, a, s)` weighted by the
one-step advantage `r + γ*U(ϕ, s′) - U(ϕ, s)` and by `γ^(j-1)`. The critic term sums,
over all steps, `(U(ϕ, s) - R) * ∇U(ϕ, s)`, where `R` is the discounted reward sum from
that step onward.

Returns a 2-tuple: the mean actor term and the mean critic term over trajectories.
"""
function gradient(M::ActorCritic, π, θ, ϕ)
    problem, U, ∇U, ∇logπ = M.𝒫, M.U, M.∇U, M.∇logπ
    γ = problem.γ
    policy(s) = π(θ, s)
    reward_to_go(τ, j) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ[j:end]))
    function advantage(τ, j)
        r = τ[j][3]
        s′ = τ[j+1][1]
        return r + γ*U(ϕ, s′) - U(ϕ, τ[j][1])
    end
    function actor_term(τ)
        # The last step has no successor state, so it is left out.
        return sum(
            ∇logπ(θ, a, s)*advantage(τ, j)*γ^(j-1)
            for (j, (s, a, r)) in enumerate(τ[1:end-1])
        )
    end
    function critic_term(τ)
        return sum(
            (U(ϕ, s) - reward_to_go(τ, j))*∇U(ϕ, s) for (j, (s, a, r)) in enumerate(τ)
        )
    end
    trajs = [simulate(problem, rand(M.b), policy, M.d) for _ in 1:M.m]
    actor = mean(actor_term(τ) for τ in trajs)
    critic = mean(critic_term(τ) for τ in trajs)
    return actor, critic
end

6.2 广义的优势估计

该方法能够平衡偏差和方差。

"""
    GeneralizedAdvantageEstimation(𝒫, b, d, m, ∇logπ, U, ∇U, λ)

Settings for an actor-critic gradient estimate that uses generalized advantage
estimation.
"""
struct GeneralizedAdvantageEstimation
    𝒫      # the decision problem (its `γ` field is read as the discount factor)
    b      # distribution over initial states
    d      # simulation depth
    m      # number of sampled trajectories
    ∇logπ  # gradient of the policy log-likelihood, called as `∇logπ(θ, a, s)`
    U      # parameterized value function, called as `U(ϕ, s)`
    ∇U     # gradient of the value function, called as `∇U(ϕ, s)`
    λ      # weighting parameter in [0, 1]
end

"""
    gradient(M::GeneralizedAdvantageEstimation, π, θ, ϕ)

Compute actor and critic gradient estimates using generalized advantage estimation.

The advantage at step `j` is the sum, for `ℓ` from 1 to `d - j`, of the temporal
difference residual at step `j + ℓ - 1` weighted by `(γ*λ)^(ℓ-1)`. The actor term sums,
over all steps but the last, `∇logπ(θ, a, s)` times that advantage times `γ^(j-1)`. The
critic term sums, over all steps, `(U(ϕ, s) - R) * ∇U(ϕ, s)`, where `R` is the
discounted reward sum from that step onward.

Returns a 2-tuple: the mean actor term and the mean critic term over the `M.m`
simulated trajectories.
"""
function gradient(M::GeneralizedAdvantageEstimation, π, θ, ϕ)
    𝒫, b, d, m, ∇logπ = M.𝒫, M.b, M.d, M.m, M.∇logπ
    U, ∇U, γ, λ = M.U, M.∇U, M.𝒫.γ, M.λ
    πθ(s) = π(θ, s)
    R(τ,j) = sum(r*γ^(k-1) for (k,(s,a,r)) in enumerate(τ[j:end]))
    # One-step temporal difference residual at step j.
    δ(τ,j) = τ[j][3] + γ*U(ϕ,τ[j+1][1]) - U(ϕ,τ[j][1])
    # Exponentially weighted sum of the residuals from step j up to step d-1.
    A(τ,j) = sum((γ*λ)^(ℓ-1)*δ(τ, j+ℓ-1) for ℓ in 1:d-j)
    ∇Uθ(τ) = sum(∇logπ(θ,a,s)*A(τ,j)*γ^(j-1) for (j, (s,a,r)) in enumerate(τ[1:end-1]))
    ∇ℓϕ(τ) = sum((U(ϕ,s) - R(τ,j))*∇U(ϕ,s) for (j, (s,a,r)) in enumerate(τ))
    trajs = [simulate(𝒫, rand(b), πθ, d) for i in 1:m]
    return mean(∇Uθ(τ) for τ in trajs), mean(∇ℓϕ(τ) for τ in trajs)
end

6.3 确定性策略梯度

"""
    DeterministicPolicyGradient(𝒫, b, d, m, ∇π, Q, ∇Qϕ, ∇Qa, σ)

Settings for the deterministic policy gradient with a parameterized action-value critic.
"""
struct DeterministicPolicyGradient
    𝒫    # the decision problem (its `γ` field is read as the discount factor)
    b    # distribution over initial states
    d    # simulation depth
    m    # number of sampled trajectories
    ∇π   # gradient of the deterministic policy, called as `∇π(θ, s)`
    Q    # parameterized action-value function, called as `Q(ϕ, s, a)`
    ∇Qϕ  # gradient of `Q` with respect to ϕ
    ∇Qa  # gradient of `Q` with respect to the action
    σ    # scale of the noise added to the policy when simulating
end

"""
    gradient(M::DeterministicPolicyGradient, π, θ, ϕ)

Compute actor and critic gradient estimates for a deterministic policy `π(θ, s)`.

Trajectories are simulated with a noisy version of the policy (the action plus
`σ*randn()*I`). The actor term sums `∇π(θ, s) * ∇Qa(ϕ, s, π(θ, s)) * γ^(j-1)` over all
steps. The critic term sums, over all steps but the last, the residual
`r + γ*Q(ϕ, s′, a′) - Q(ϕ, s, a)` times `γ*∇Qϕ(ϕ, s′, a′) - ∇Qϕ(ϕ, s, a)`, where
`a′ = π(θ, s′)` is the noise-free action at the next state.

Returns a 2-tuple: the mean actor term and the mean critic term over the `M.m`
trajectories.
"""
function gradient(M::DeterministicPolicyGradient, π, θ, ϕ)
    problem, Q, ∇Qϕ, ∇Qa = M.𝒫, M.Q, M.∇Qϕ, M.∇Qa
    γ = problem.γ
    # Policy used only for collecting data: the deterministic action plus scaled noise.
    noisy_policy(s) = π(θ, s) + M.σ*randn()*I
    function actor_term(τ)
        return sum(
            M.∇π(θ, s)*∇Qa(ϕ, s, π(θ, s))*γ^(j-1) for (j, (s, a, r)) in enumerate(τ)
        )
    end
    function td_term(τ, j)
        s, a, r = τ[j]
        s′ = τ[j+1][1]
        a′ = π(θ, s′)
        residual = r + γ*Q(ϕ, s′, a′) - Q(ϕ, s, a)
        return residual*(γ*∇Qϕ(ϕ, s′, a′) - ∇Qϕ(ϕ, s, a))
    end
    # The last step has no successor state, so it contributes no term.
    critic_term(τ) = sum(td_term(τ, j) for j in 1:length(τ)-1)
    trajs = [simulate(problem, rand(M.b), noisy_policy, M.d) for _ in 1:M.m]
    actor = mean(actor_term(τ) for τ in trajs)
    critic = mean(critic_term(τ) for τ in trajs)
    return actor, critic
end

在线方法,如蒙特卡罗树搜索,可用于指导策略和值函数估计的优化。


总结

到目前为止,在序列决策问题的讨论中,都假设状态转移模型和奖励模型是已知的。然而,在许多问题中,这些模型并不确切已知,智能体必须通过经验来学习如何行动。解决这类存在模型不确定性的问题是强化学习领域的主题,也是接下来讨论的重点。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值