Reading Notes: Algorithms for Decision Making (9)


Previous: Reading Notes: Algorithms for Decision Making (8)
Next: Reading Notes: Algorithms for Decision Making (10)



III. Model Uncertainty (2)

In contrast to model-based methods, model-free methods do not require building explicit representations of the transition and reward functions; instead they model the value function directly. We then turn to imitation learning, which reconstructs a reward function from expert demonstrations.


3. Model-Free Methods

3.1 Incremental Estimation of the Mean

Many model-free methods estimate the action value function $Q(s,a)$ incrementally from samples. Suppose we only care about the current estimate of the mean of a single variable $X$ from $m$ samples:
$$\hat{x}_{m} = \frac{1}{m} \sum_{i=1}^{m} x^{(i)}.$$
The corresponding incremental update can be written as
$$\hat{x}_{m} = \hat{x}_{m-1} + \frac{1}{m}\left(x^{(m)} - \hat{x}_{m-1}\right).$$
Introducing a learning rate function generalizes this to
$$\hat{x}_{m} = \hat{x}_{m-1} + \alpha(m)\left(x^{(m)} - \hat{x}_{m-1}\right).$$
To guarantee convergence, $\alpha(m)$ must satisfy $\sum_{m} \alpha(m) = \infty$ and $\sum_{m} \alpha^{2}(m) < \infty$.

mutable struct IncrementalEstimate
	μ # mean estimate
	α # learning rate function
	m # number of updates
end

function update!(model::IncrementalEstimate, x)
	model.m += 1
	model.μ += model.α(model.m) * (x - model.μ)
	return model
end
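
A minimal usage sketch (my own, not from the book): with α(m) = 1/m the incremental estimate reproduces the exact sample mean, while a constant learning rate yields an exponentially weighted average that can track a nonstationary quantity.

est = IncrementalEstimate(0.0, m -> 1/m, 0)    # exact running mean
for x in [2.0, 4.0, 6.0]
	update!(est, x)
end
est.μ                                          # 4.0, the mean of the three samples

ema = IncrementalEstimate(0.0, m -> 0.1, 0)    # constant α: exponential moving average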

3.2 Q-Learning

mutable struct QLearning
	𝒮 # state space (assumes 1:nstates)
	𝒜 # action space (assumes 1:nactions)
	γ # discount
	Q # action value function
	α # learning rate
end

lookahead(model::QLearning, s, a) = model.Q[s,a]

function update!(model::QLearning, s, a, r, s′)
	γ, Q, α = model.γ, model.Q, model.α
	Q[s,a] += α*(r + γ*maximum(Q[s′,:]) - Q[s,a])
	return model
end
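
The update above implements the standard Q-learning rule $Q(s,a) \leftarrow Q(s,a) + \alpha\left(r + \gamma \max_{a'} Q(s',a') - Q(s,a)\right)$. A minimal interaction-loop sketch (my own, not from the book), assuming a hypothetical environment sampler step(s, a) that returns (r, s′):

function simulate!(model::QLearning, step, s; ϵ=0.1, n_steps=100)
	# ϵ-greedy exploration: random action with probability ϵ, greedy otherwise
	for t in 1:n_steps
		a = rand() < ϵ ? rand(model.𝒜) : argmax(a -> lookahead(model, s, a), model.𝒜)
		r, s′ = step(s, a)           # sample reward and next state from the environment
		update!(model, s, a, r, s′)
		s = s′
	end
	return model
end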

3.3 Sarsa

Sarsa is an alternative to Q-learning; the name comes from the fact that each step of the Q-function update uses the tuple $(s, a, r, s', a')$. Unlike Q-learning, the target uses the action actually taken in the next state, $Q(s,a) \leftarrow Q(s,a) + \alpha\left(r + \gamma Q(s',a') - Q(s,a)\right)$, which the implementation below applies one step in arrears using the stored experience tuple ℓ.

mutable struct Sarsa
	𝒮 # state space (assumes 1:nstates)
	𝒜 # action space (assumes 1:nactions)
	γ # discount
	Q # action value function
	α # learning rate
	ℓ # most recent experience tuple (s,a,r)
end

lookahead(model::Sarsa, s, a) = model.Q[s,a]

function update!(model::Sarsa, s, a, r, s′)
	if model.ℓ != nothing
		γ, Q, α, ℓ = model.γ, model.Q, model.α, model.ℓ
		model.Q[ℓ.s,ℓ.a] += α*(ℓ.r + γ*Q[s,a] - Q[ℓ.s,ℓ.a])
	end
	model.ℓ = (s=s, a=a, r=r)
	return model
end

3.4 Eligibility Traces

One drawback of Q-learning and Sarsa is that learning can be slow, especially when rewards are sparse. Both can be modified with eligibility traces, which propagate reward back to the states and actions that led to it and thereby improve learning efficiency. In the Sarsa(λ) implementation below, every visited state-action pair accumulates a trace N(s,a) that decays by a factor γλ at each step, and all pairs are updated in proportion to their trace.

mutable struct SarsaLambda
	𝒮 # state space (assumes 1:nstates)
	𝒜 # action space (assumes 1:nactions)
	γ # discount
	Q # action value function
	N # trace
	α # learning rate
	λ # trace decay rate
	ℓ # most recent experience tuple (s,a,r)
end

lookahead(model::SarsaLambda, s, a) = model.Q[s,a]

function update!(model::SarsaLambda, s, a, r, s′)
	if model.ℓ != nothing
		γ, λ, Q, α, ℓ = model.γ, model.λ, model.Q, model.α, model.ℓ
		model.N[ℓ.s,ℓ.a] += 1
		δ = ℓ.r + γ*Q[s,a] - Q[ℓ.s,ℓ.a]
		for s in model.𝒮
			for a in model.𝒜
				model.Q[s,a] += α*δ*model.N[s,a]
				model.N[s,a] *= γ*λ
			end
		end
	else
		model.N[:,:] .= 0.0
	end
	model.ℓ = (s=s, a=a, r=r)
	return model
end

Special care must be taken when applying this method to off-policy algorithms such as Q-learning, because they attempt to learn the value of the optimal policy rather than of the policy actually being followed.

3.5 Reward Shaping

Reward shaping [1] can also improve learning, and it is particularly useful for problems with sparse rewards. Concretely, the original reward function $R(s,a,s')$ is replaced by $R(s,a,s') + F(s,a,s')$, where the second term is an additional, hand-designed reward.
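
A common choice of $F$ is potential-based shaping, $F(s,a,s') = \gamma\,\Phi(s') - \Phi(s)$, which Ng et al. [1] show leaves the optimal policy unchanged. A minimal sketch (my own toy example, not from the book):

# Potential-based shaping: F(s,a,s′) = γ*Φ(s′) - Φ(s) preserves the optimal policy.
shaped_reward(R, Φ, γ) = (s, a, s′) -> R(s, a, s′) + γ*Φ(s′) - Φ(s)

# Toy 1-D chain: sparse reward only at state 10; potential = negative distance to the goal.
R(s, a, s′) = s′ == 10 ? 1.0 : 0.0
Φ(s) = -abs(10 - s)
R̃ = shaped_reward(R, Φ, 0.9)
R̃(8, 1, 9)    # 0.0 + 0.9*(-1) - (-2) = 1.1: progress toward the goal is now rewarded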

3.6 Action Value Function Approximation
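
When the state space is too large for a tabular Q, the action value function is parameterized as Q(θ, s, a) and θ is adjusted by gradient steps on the temporal-difference error, $\theta \leftarrow \theta + \alpha\left(r + \gamma \max_{a'} Q(\theta, s', a') - Q(\theta, s, a)\right)\nabla_{\theta} Q(\theta, s, a)$, with the gradient rescaled for stability, as in the implementation below.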

struct GradientQLearning
	𝒜 # action space (assumes 1:nactions)
	γ # discount
	Q # parameterized action value function Q(θ,s,a)
	∇Q # gradient of action value function
	θ # action value function parameter
	α # learning rate
end

function lookahead(model::GradientQLearning, s, a)
	return model.Q(model.θ, s,a)
end

function update!(model::GradientQLearning, s, a, r, s′)
	𝒜, γ, Q, θ, α = model.𝒜, model.γ, model.Q, model.θ, model.α
	u = maximum(Q(θ,s′,a′) for a′ in 𝒜)
	Δ = (r + γ*u - Q(θ,s,a))*model.∇Q(θ,s,a)
	θ[:] += α*scale_gradient(Δ, 1)
	return model
end
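
scale_gradient is not defined in this post; a definition consistent with how it is used here (clamping the gradient's L2 norm to at most 1) would be:

using LinearAlgebra: norm

# Rescale ∇ so that its L2 norm does not exceed L2max.
scale_gradient(∇, L2max) = min(L2max/norm(∇), 1)*∇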

The catastrophic forgetting experienced by Q-learning and Sarsa can be mitigated with experience replay.

struct ReplayGradientQLearning
	𝒜 # action space (assumes 1:nactions)
	γ # discount
	Q # parameterized action value function Q(θ,s,a)
	∇Q # gradient of action value function
	θ # action value function parameter
	α # learning rate
	buffer # circular memory buffer
	m # number of steps between gradient updates
	m_grad # batch size
end

function lookahead(model::ReplayGradientQLearning, s, a)
	return model.Q(model.θ, s,a)
end

function update!(model::ReplayGradientQLearning, s, a, r, s′)
	𝒜, γ, Q, θ, α = model.𝒜, model.γ, model.Q, model.θ, model.α
	buffer, m, m_grad = model.buffer, model.m, model.m_grad
	if isfull(buffer)
		U(s) = maximum(Q(θ,s,a) for a in 𝒜)
		∇Q(s,a,r,s′) = (r + γ*U(s′) - Q(θ,s,a))*model.∇Q(θ,s,a)
		Δ = mean(∇Q(s,a,r,s′) for (s,a,r,s′) in rand(buffer, m_grad))
		θ[:] += α*scale_gradient(Δ, 1)
		for i in 1:m # discard oldest experiences
			popfirst!(buffer)
		end
	else
		push!(buffer, (s,a,r,s′))
	end
	return model
end
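
The replay variant above assumes the buffer supports push!, isfull, popfirst!, and random sampling, and that mean is available. One compatible choice (my assumption; the post does not specify) is a fixed-capacity CircularBuffer from DataStructures.jl:

using DataStructures: CircularBuffer, isfull
using Statistics: mean

# Fixed-capacity replay memory of (s, a, r, s′) tuples.
buffer = CircularBuffer{Tuple{Int,Int,Float64,Int}}(10_000)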

4. Imitation Learning

4.1 Behavioral Cloning

A simple form of imitation learning treats the problem as supervised learning; this approach is called behavioral cloning. The policy parameters θ are fit by gradient ascent on the log-likelihood of the expert's state-action pairs in the data set D.

struct BehavioralCloning
	α # step size
	k_max # number of iterations
	∇logπ # log likelihood gradient
end

function optimize(M::BehavioralCloning, D, θ)
	α, k_max, ∇logπ = M.α, M.k_max, M.∇logπ
	for k in 1:k_max
		∇ = mean(∇logπ(θ, a, s) for (s,a) in D)
		θ += α*∇
	end
	return θ
end
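
As a concrete (assumed) instance of ∇logπ: for a softmax policy over discrete actions with a hypothetical feature map β(s), where π(θ, a, s) ∝ exp(θ[a,:]⋅β(s)), the log-likelihood gradient is:

# Log-likelihood gradient of a softmax policy: row a′ of the gradient is (1{a′=a} - p(a′|s)) * β(s)'.
function softmax_∇logπ(θ, a, s, β)
	x = β(s)                 # feature vector for state s
	p = exp.(θ*x)
	p ./= sum(p)             # action probabilities
	∇ = -p * x'              # gradient of the negative log-normalizer for every action row
	∇[a, :] .+= x            # plus the gradient of θ[a,:]⋅x for the taken action
	return ∇
end

# One would then pass ∇logπ = (θ, a, s) -> softmax_∇logπ(θ, a, s, β) to BehavioralCloning.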

The closer the expert demonstrations are to optimal, the better the resulting behavioral cloning policy performs, but the approach is prone to cascading errors. Although behavioral cloning is attractive for its simplicity, cascading errors cause it to perform poorly on many problems, especially when the policy must be run over long horizons.

4.2 Sequential Interactive Demonstration

One way to address cascading errors is to use additional expert input to correct the trained policy, an approach known as sequential interactive demonstration.

  • data set aggregation (DAgger): alternates between training the policy on the accumulated data set and rolling out the trained policy to generate new states, which the expert then labels (see the implementation below).

    struct DataSetAggregation
    	𝒫 # problem with unknown reward function
    	bc # behavioral cloning struct
    	k_max # number of iterations
    	m # number of rollouts per iteration
    	d # rollout depth
    	b # initial state distribution
    	πE # expert
    	πθ # parameterized policy
    end
    
    function optimize(M::DataSetAggregation, D, θ)
    	𝒫, bc, k_max, m = M.𝒫, M.bc, M.k_max, M.m
    	d, b, πE, πθ = M.d, M.b, M.πE, M.πθ
    	θ = optimize(bc, D, θ)
    	for k in 2:k_max
    		for i in 1:m
    			s = rand(b)
    			for j in 1:d
    				push!(D, (s, πE(s)))
    				a = rand(πθ(θ, s))
    				s = rand(𝒫.T(s, a))
    			end
    		end
    		θ = optimize(bc, D, θ)
    	end
    	return θ
    end
    
  • stochastic mixing iterative learning (SMILe): iteratively builds the policy by stochastically mixing in newly trained policies (see the implementation below).

    struct SMILe
    	𝒫 # problem with unknown reward
    	bc # Behavioral cloning struct
    	k_max # number of iterations
    	m # number of rollouts per iteration
    	d # rollout depth
    	b # initial state distribution
    	β # mixing scalar (e.g., d^-3)
    	πE # expert policy
    	πθ # parameterized policy
    end
    
    function optimize(M::SMILe, θ)
    	𝒫, bc, k_max, m = M.𝒫, M.bc, M.k_max, M.m
    	d, b, β, πE, πθ = M.d, M.b, M.β, M.πE, M.πθ
    	𝒜, T = 𝒫.𝒜, 𝒫.T
    	θs = []
    	π = s -> πE(s)
    	for k in 1:k_max
    		# execute latest π to get new data set D
    		D = []
    		for i in 1:m
    			s = rand(b)
    			for j in 1:d
    				push!(D, (s, πE(s)))
    				a = π(s)
    				s = rand(T(s, a))
    			end
    		end
    		# train new policy classifier
    		θ = optimize(bc, D, θ)
    		push!(θs, θ)
    		# compute a new policy mixture
    		Pπ = Categorical(normalize([(1-β)^(i-1) for i in 1:k], 1))
    		π = s -> begin
    			if rand() < (1-β)^(k-1)
    				return πE(s)
    			else
    				return rand(Categorical(πθ(θs[rand(Pπ)], s)))
    			end
    		end
    	end
    	Ps = normalize([(1-β)^(i-1) for i in 1:k_max], 1)
    	return Ps, θs
    end
    

4.3 Inverse Reinforcement Learning (IRL)

In inverse reinforcement learning (IRL), the expert is assumed to be optimizing an unknown reward function; starting from the expert's data set $\mathcal{D}$, we try to infer that reward function. Given the inferred reward function, the methods discussed earlier can be used to derive an optimal policy. Different parameterizations of the reward function lead to different concrete algorithms.

4.3.1 Maximum Margin IRL

This method attempts to find a policy whose binary feature frequencies match those observed in the expert data set [2].
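
Concretely, the discounted feature expectations of a policy are $\mu(\pi) = \mathbb{E}\left[\sum_{k=1}^{d} \gamma^{k-1} \beta(s_k, a_k)\right]$; feature_expectations below estimates this quantity from m rollouts, and the weighting step searches for reward weights ϕ under which the expert's feature expectations μE beat those of every policy found so far by the largest possible margin t.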

struct InverseReinforcementLearning
	𝒫 # problem
	b # initial state distribution
	d # depth
	m # number of samples
	π # parameterized policy
	β # binary feature mapping
	μE # expert feature expectations
	RL # reinforcement learning method
	ϵ # tolerance
end
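
The two optimization helpers below solve small programs with JuMP; the imports assumed (not shown in the post) are JuMP itself, a nonlinear-capable solver such as Ipopt, and the dot product from LinearAlgebra:

using JuMP, Ipopt
using LinearAlgebra: ⋅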

function feature_expectations(M::InverseReinforcementLearning, π)
	𝒫, b, m, d, β, γ = M.𝒫, M.b, M.m, M.d, M.β, M.𝒫.γ
	μ(τ) = sum(γ^(k-1)*β(s, a) for (k,(s,a)) in enumerate(τ))
	τs = [simulate(𝒫, rand(b), π, d) for i in 1:m]
	return mean(μ(τ) for τ in τs)
end

function calc_weighting(M::InverseReinforcementLearning, μs)
	μE = M.μE
	k = length(μE)
	model = Model(Ipopt.Optimizer)
	@variable(model, t)
	@variable(model, ϕ[1:k] ≥ 0)
	@objective(model, Max, t)
	for μ in μs
		@constraint(model, ϕ⋅μE ≥ ϕ⋅μ + t)
	end
	@constraint(model, ϕ⋅ϕ ≤ 1)
	optimize!(model)
	return (value(t), value.(ϕ))
end

function calc_policy_mixture(M::InverseReinforcementLearning, μs)
	μE = M.μE
	k = length(μs)
	model = Model(Ipopt.Optimizer)
	@variable(model, λ[1:k] ≥ 0)
	@objective(model, Min, (μE - sum(λ[i]*μs[i] for i in 1:k))⋅(μE - sum(λ[i]*μs[i] for i in 1:k)))
	@constraint(model, sum(λ) == 1)
	optimize!(model)
	return value.(λ)
end

function optimize(M::InverseReinforcementLearning, θ)
	π, ϵ, RL = M.π, M.ϵ, M.RL
	θs = [θ]
	μs = [feature_expectations(M, s->π(θ,s))]
	while true
		t, ϕ = calc_weighting(M, μs)
		if t ≤ ϵ
			break
		end
		copyto!(RL.ϕ, ϕ) # R(s,a) = ϕ⋅β(s,a)
		θ = optimize(RL, π, θ)
		push!(θs, θ)
		push!(μs, feature_expectations(M, s->π(θ,s)))
	end
	λ = calc_policy_mixture(M, μs)
	return λ, θs
end

4.3.2 Maximum Entropy IRL

This method formulates the search for the best reward parameters as a maximum likelihood estimation problem.
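
Concretely, expert trajectories are modeled as being exponentially more likely the higher their cumulative reward, $P(\tau) \propto \exp\left(\sum_{k} \gamma^{k-1} R_{\phi}(s_k, a_k)\right)$, and the reward parameters ϕ are chosen to maximize the likelihood of the expert data set $\mathcal{D}$; the gradient ascended in optimize below compares feature counts of the expert trajectories with those expected under the current policy's discounted state visitation distribution.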

struct MaximumEntropyIRL
	𝒫 # problem
	b # initial state distribution
	d # depth
	π # parameterized policy π(θ,s)
	Pπ # parameterized policy likelihood π(θ, a, s)
	∇R # reward function gradient
	RL # reinforcement learning method
	α # step size
	k_max # number of iterations
end

function discounted_state_visitations(M::MaximumEntropyIRL, θ)
	𝒫, b, d, Pπ = M.𝒫, M.b, M.d, M.Pπ
	𝒮, 𝒜, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.γ
	b_sk = zeros(length(𝒫.𝒮), d)
	b_sk[:,1] = [pdf(b, s) for s in 𝒮]
	for k in 2:d
		for (si′, s′) in enumerate(𝒮)
			b_sk[si′,k] = γ*sum(sum(b_sk[si,k-1]*Pπ(θ, a, s)*T(s, a, s′) 
					for (si,s) in enumerate(𝒮)) for a in 𝒜)
		end
	end
	return normalize!(vec(mean(b_sk, dims=2)),1)
end

function optimize(M::MaximumEntropyIRL, D, ϕ, θ)
	𝒫, π, Pπ, ∇R, RL, α, k_max = M.𝒫, M.π, M.Pπ, M.∇R, M.RL, M.α, M.k_max
	𝒮, 𝒜, γ, nD = 𝒫.𝒮, 𝒫.𝒜, 𝒫.γ, length(D)
	for k in 1:k_max
		copyto!(RL.ϕ, ϕ) # update parameters
		θ = optimize(RL, π, θ)
		b = discounted_state_visitations(M, θ)
		∇Rτ = τ -> sum(γ^(i-1)*∇R(ϕ,s,a) for (i,(s,a)) in enumerate(τ))
	∇f = sum(∇Rτ(τ) for τ in D) - nD*sum(b[si]*sum(Pπ(θ,a,s)*∇R(ϕ,s,a)
			for (ai,a) in enumerate(𝒜)) for (si, s) in enumerate(𝒮))
		ϕ += α*∇f
	end
	return ϕ, θ
end

4.4 Generative Adversarial Imitation Learning

In generative adversarial imitation learning, a discriminator is trained to distinguish expert state-action pairs from those produced by the learned policy, while the policy is optimized (by reinforcement learning, using the discriminator output as a reward signal) to fool the discriminator. [Figure: schematic of the adversarial training loop]


Summary

In many problems, these models are not known exactly, and how to act must be learned from experience, which requires balancing the use of what has already been learned against further exploration. Building on standard machine learning ideas, both model-based and model-free methods with a known reward function were discussed. Beyond that, with even weaker prior knowledge assumed, imitation learning (which seems closer to practice) was discussed. For me at least, IRL was new material.


1. A. Y. Ng, D. Harada, and S. Russell, “Policy invariance under reward transformations: Theory and application to reward shaping,” in International Conference on Machine Learning (ICML), 1999.

2. P. Abbeel and A. Y. Ng, “Apprenticeship learning via inverse reinforcement learning,” in International Conference on Machine Learning (ICML), 2004.
