Reading Notes: Algorithms for Decision Making (13)

V. Multi-Agent Systems (2)

This part extends simple games to a sequential context with multiple states. A Markov game can be viewed as a Markov decision process shared by multiple agents, each with its own reward function.


2. Sequential Problems

The structure of a Markov game is as follows:

struct MG
	γ # discount factor
	ℐ # agents
	𝒮 # state space
	𝒜 # joint action space
	T # transition function
	R # joint reward function
end
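
As a quick illustration (a hypothetical two-agent, two-state game invented here just to show the fields, not an example from the book), an MG might be constructed like this:

ℐ = [1, 2]                              # two agents
𝒮 = [1, 2]                              # two states
𝒜 = [[:stay, :move], [:stay, :move]]    # per-agent action spaces
# the state flips exactly when an odd number of agents choose :move
T(s, a, s′) = (isodd(count(==(:move), a)) == (s′ != s)) ? 1.0 : 0.0
# agent 1 prefers state 1, agent 2 prefers state 2; moving costs 0.1
R(s, a) = [(s == 1 ? 1.0 : 0.0) - (a[1] == :move ? 0.1 : 0.0),
           (s == 2 ? 1.0 : 0.0) - (a[2] == :move ? 0.1 : 0.0)]
mg = MG(0.9, ℐ, 𝒮, 𝒜, T, R)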

A Markov game policy is a mapping from states to simple game policies:

struct MGPolicy
	p # dictionary mapping states to simple game policies
	MGPolicy(p::Base.Generator) = new(Dict(p))
end

(πi::MGPolicy)(s, ai) = πi.p[s](ai)

(πi::SimpleGamePolicy)(s, ai) = πi(ai)
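
A minimal usage sketch, reusing the hypothetical mg above and assuming the SimpleGamePolicy constructor from the simple-games chapter normalizes its weights:

π1 = MGPolicy(s => SimpleGamePolicy(ai => 1.0 for ai in mg.𝒜[1]) for s in mg.𝒮)
π1(1, :stay)   # probability that agent 1 plays :stay in state 1 (0.5 here)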

probability(𝒫::MG, s, π, a) = prod(πj(s, aj) for (πj, aj) in zip(π, a))
reward(𝒫::MG, s, π, i) =
	sum(𝒫.R(s,a)[i]*probability(𝒫,s,π,a) for a in joint(𝒫.𝒜))
transition(𝒫::MG, s, π, s′) =
	sum(𝒫.T(s,a,s′)*probability(𝒫,s,π,a) for a in joint(𝒫.𝒜))

function policy_evaluation(𝒫::MG, π, i)
	𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
	p(s,a) = prod(πj(s, aj) for (πj, aj) in zip(π, a))
	R′ = [sum(R(s,a)[i]*p(s,a) for a in joint(𝒜)) for s in 𝒮]
	T′ = [sum(T(s,a,s′)*p(s,a) for a in joint(𝒜)) for s in 𝒮, s′ in 𝒮]
	return (I - γ*T′)\R′
end
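
The last line solves the linear system behind policy evaluation. Stacking the states into vectors and matrices, the utility of agent i under the joint policy π satisfies

$$
U^{\pi,i} = R^{\pi,i} + \gamma T^{\pi} U^{\pi,i}
\quad\Longrightarrow\quad
U^{\pi,i} = (I - \gamma T^{\pi})^{-1} R^{\pi,i},
$$

$$
R^{\pi,i}(s) = \sum_{a} R^i(s,a) \prod_j \pi^j(a^j \mid s),
\qquad
T^{\pi}(s' \mid s) = \sum_{a} T(s' \mid s, a) \prod_j \pi^j(a^j \mid s),
$$

which is exactly what R′, T′, and (I - γ*T′)\R′ compute above.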

2.1 Response Models

  • Best response (a usage sketch for both response models follows this list)
    function best_response(𝒫::MG, π, i)
    	𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
    	T′(s,ai,s′) = transition(𝒫, s, joint(π, SimpleGamePolicy(ai), i), s′)
    	R′(s,ai) = reward(𝒫, s, joint(π, SimpleGamePolicy(ai), i), i)
    	πi = solve(MDP(γ, 𝒮, 𝒜[i], T′, R′))
    	return MGPolicy(s => SimpleGamePolicy(πi(s)) for s in 𝒮)
    end
    
  • Softmax response
    function softmax_response(𝒫::MG, π, i, λ)
    	𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
    	T′(s,ai,s′) = transition(𝒫, s, joint(π, SimpleGamePolicy(ai), i), s′)
    	R′(s,ai) = reward(𝒫, s, joint(π, SimpleGamePolicy(ai), i), i)
    	mdp = MDP(γ, 𝒮, 𝒜[i], T′, R′)
    	πi = solve(mdp)
    	Q(s,a) = lookahead(mdp, πi.U, s, a)
    	p(s) = SimpleGamePolicy(a => exp(λ*Q(s,a)) for a in 𝒜[i])
    	return MGPolicy(s => p(s) for s in 𝒮)
    end
    
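A hedged usage sketch with the hypothetical mg from above, starting both agents from uniform policies; joint, solve(::MDP), and lookahead are assumed to be available from the earlier chapters:

π = [MGPolicy(s => SimpleGamePolicy(ai => 1.0 for ai in mg.𝒜[i]) for s in mg.𝒮) for i in mg.ℐ]
π1_best = best_response(mg, π, 1)          # deterministic best response for agent 1
π1_soft = softmax_response(mg, π, 1, 5.0)  # stochastic response with precision λ = 5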

2.2 Nash Equilibrium

The Nash equilibrium takes the same form as before and can be computed with nonlinear programming.
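
Written out for reference (reconstructed from the objective and constraints in the code below, not quoted from the book), the program is roughly

$$
\begin{aligned}
\min_{U,\pi}\quad & \sum_{i\in\mathcal{I}} \sum_{s\in\mathcal{S}} \Big( U^i(s) - \sum_{a} \prod_{j} \pi^j(a^j \mid s)\, Q^i(s,a) \Big) \\
\text{s.t.}\quad & U^i(s) \ge \sum_{a:\,a^i = a_i} \prod_{j \ne i} \pi^j(a^j \mid s)\, Q^i(s,a) && \forall i, s, a_i \\
& \sum_{a_i} \pi^i(a_i \mid s) = 1, \qquad \pi^i(a_i \mid s) \ge 0 && \forall i, s \\
\text{where}\quad & Q^i(s,a) = R^i(s,a) + \gamma \sum_{s'} T(s' \mid s, a)\, U^i(s').
\end{aligned}
$$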

function tensorform(𝒫::MG)
	ℐ, 𝒮, 𝒜, R, T = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T
	ℐ′ = eachindex(ℐ)
	𝒮′ = eachindex(𝒮)
	𝒜′ = [eachindex(𝒜[i]) for i in ℐ]
	R′ = [R(s,a) for s in 𝒮, a in joint(𝒜)]
	T′ = [T(s,a,s′) for s in 𝒮, a in joint(𝒜), s′ in 𝒮]
	return ℐ′, 𝒮′, 𝒜′, R′, T′
end

function solve(M::NashEquilibrium, 𝒫::MG)
	ℐ, 𝒮, 𝒜, R, T = tensorform(𝒫)
	𝒮′, 𝒜′, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.γ
	model = Model(Ipopt.Optimizer)
	@variable(model, U[ℐ, 𝒮])
	@variable(model, π[i=ℐ, 𝒮, ai=𝒜[i]] ≥ 0)
	@NLobjective(model, Min,
		sum(U[i,s] - sum(prod(π[j,s,a[j]] for j in ℐ)
			* (R[s,y][i] + γ*sum(T[s,y,s′]*U[i,s′] for s′ in 𝒮))
			for (y,a) in enumerate(joint(𝒜))) for i in ℐ, s in 𝒮))
	@NLconstraint(model, [i=ℐ, s=𝒮, ai=𝒜[i]],
		U[i,s] ≥ sum(
			prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : π[j,s,a[j]] for j in ℐ)
			* (R[s,y][i] + γ*sum(T[s,y,s′]*U[i,s′] for s′ in 𝒮))
			for (y,a) in enumerate(joint(𝒜))))
	@constraint(model, [i=ℐ, s=𝒮], sum(π[i,s,ai] for ai in 𝒜[i]) == 1)
	optimize!(model)
	π′ = value.(π)
	πi′(i,s) = SimpleGamePolicy(𝒜′[i][ai] => π′[i,s,ai] for ai in 𝒜[i])
	πi′(i) = MGPolicy(𝒮′[s] => πi′(i,s) for s in 𝒮)
	return [πi′(i) for i in ℐ]
end

2.3 Fictitious Play

mutable struct MGFictitiousPlay
	𝒫 # Markov game
	i # agent index
	Qi # state-action value estimates
	Ni # state-action counts
end

function MGFictitiousPlay(𝒫::MG, i)
	ℐ, 𝒮, 𝒜, R = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.R
	Qi = Dict((s, a) => R(s, a)[i] for s in 𝒮 for a in joint(𝒜))
	Ni = Dict((j, s, aj) => 1.0 for j in ℐ for s in 𝒮 for aj in 𝒜[j])
	return MGFictitiousPlay(𝒫, i, Qi, Ni)
end

function (πi::MGFictitiousPlay)(s)
	𝒫, i, Qi = πi.𝒫, πi.i, πi.Qi
	ℐ, 𝒮, 𝒜, T, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
	πi′(i,s) = SimpleGamePolicy(ai => πi.Ni[i,s,ai] for ai in 𝒜[i])
	πi′(i) = MGPolicy(s => πi′(i,s) for s in 𝒮)
	π = [πi′(i) for i in ℐ]
	U(s,π) = sum(πi.Qi[s,a]*probability(𝒫,s,π,a) for a in joint(𝒜))
	Q(s,π) = reward(𝒫,s,π,i) + γ*sum(transition(𝒫,s,π,s′)*U(s′,π) for s′ in 𝒮)
	Q(ai) = Q(s, joint(π, SimpleGamePolicy(ai), i))
	ai = argmax(Q, 𝒫.𝒜[πi.i])
	return SimpleGamePolicy(ai)
end

function update!(πi::MGFictitiousPlay, s, a, s′)
	𝒫, i, Qi = πi.𝒫, πi.i, πi.Qi
	ℐ, 𝒮, 𝒜, T, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
	for (j,aj) in enumerate(a)
		πi.Ni[j,s,aj] += 1
	end
	πi′(i,s) = SimpleGamePolicy(ai => πi.Ni[i,s,ai] for ai in 𝒜[i])
	πi′(i) = MGPolicy(s => πi′(i,s) for s in 𝒮)
	π = [πi′(i) for i in ℐ]
	U(π,s) = sum(πi.Qi[s,a]*probability(𝒫,s,π,a) for a in joint(𝒜))
	Q(s,a) = R(s,a)[i] + γ*sum(T(s,a,s′)*U(π,s′) for s′ in 𝒮)
	for a in joint(𝒜)
		πi.Qi[s,a] = Q(s,a)
	end
end
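
A hedged rollout sketch showing how these learners might be driven in simulation. It reuses the hypothetical mg from above, assumes agents are indexed 1..n, assumes SimpleGamePolicy stores its action distribution in a field p (as MGPolicy does), and introduces sample and run_fictitious_play as hypothetical helpers rather than functions from the book:

# draw a key of dictionary p with probability proportional to its value
function sample(p::Dict)
	r, cum = rand() * sum(values(p)), 0.0
	for (x, w) in p
		cum += w
		cum ≥ r && return x
	end
end

function run_fictitious_play(mg, k_max)
	πs = [MGFictitiousPlay(mg, i) for i in mg.ℐ]
	s = first(mg.𝒮)
	for t in 1:k_max
		a = Tuple(sample(πs[i](s).p) for i in mg.ℐ)             # joint action under current beliefs
		s′ = sample(Dict(s′ => mg.T(s, a, s′) for s′ in mg.𝒮))  # sample the next state from T
		foreach(πi -> update!(πi, s, a, s′), πs)                # every learner observes (s, a, s′)
		s = s′
	end
	return πs
end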

2.4 Gradient Ascent

mutable struct MGGradientAscent
	𝒫 # Markov game
	i # agent index
	t # time step
	Qi # state-action value estimates
	πi # current policy
end

function MGGradientAscent(𝒫::MG, i)
	ℐ, 𝒮, 𝒜 = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜
	Qi = Dict((s, a) => 0.0 for s in 𝒮, a in joint(𝒜))
	uniform() = Dict(s => SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i]) for s in 𝒮)
	return MGGradientAscent(𝒫, i, 1, Qi, uniform())
end

function (πi::MGGradientAscent)(s)
	𝒜i, t = πi.𝒫.𝒜[πi.i], πi.t
	ϵ = 1 / sqrt(t)
	πi′(ai) = ϵ/length(𝒜i) + (1-ϵ)*πi.πi[s](ai)
	return SimpleGamePolicy(ai => πi′(ai) for ai in 𝒜i)
end

function update!(πi::MGGradientAscent, s, a, s′)
	𝒫, i, t, Qi = πi.𝒫, πi.i, πi.t, πi.Qi
	ℐ, 𝒮, 𝒜i, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜[πi.i], 𝒫.R, 𝒫.γ
	jointπ(ai) = Tuple(j == i ? ai : a[j] for j in ℐ)
	α = 1 / sqrt(t)
	Qmax = maximum(Qi[s′, jointπ(ai)] for ai in 𝒜i)
	πi.Qi[s, a] += α * (R(s, a)[i] + γ * Qmax - Qi[s, a])
	u = [Qi[s, jointπ(ai)] for ai in 𝒜i]
	π′ = [πi.πi[s](ai) for ai in 𝒜i]
	π = project_to_simplex(π′ + u / sqrt(t))
	πi.t = t + 1
	πi.πi[s] = SimpleGamePolicy(ai => p for (ai, p) in zip(𝒜i, π))
end
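
project_to_simplex is used above but not defined in these notes; one common sketch (a sorting-based Euclidean projection onto the probability simplex, stated here as an assumption rather than the book's exact helper) is:

function project_to_simplex(y)
	u = sort(copy(y), rev=true)                                      # sort descending
	ρ = maximum(j for j in eachindex(u) if u[j] + (1 - sum(u[1:j]))/j > 0)
	δ = (1 - sum(u[1:ρ])) / ρ                                        # shift that renormalizes the kept entries
	return [max(yj + δ, 0.0) for yj in y]                            # clip and shift back onto the simplex
end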

2.5 Nash Q-Learning

mutable struct NashQLearning
	𝒫 # Markov game
	i # agent index
	Q # state-action value estimates
	N # history of actions performed
end

function NashQLearning(𝒫::MG, i)
	ℐ, 𝒮, 𝒜 = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜
	Q = Dict((j, s, a) => 0.0 for j in ℐ, s in 𝒮, a in joint(𝒜))
	N = Dict((s, a) => 1.0 for s in 𝒮, a in joint(𝒜))
	return NashQLearning(𝒫, i, Q, N)
end

function (πi::NashQLearning)(s)
	𝒫, i, Q, N = πi.𝒫, πi.i, πi.Q, πi.N
	ℐ, 𝒮, 𝒜, 𝒜i, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒜[πi.i], 𝒫.γ
	M = NashEquilibrium()
	𝒢 = SimpleGame(γ, ℐ, 𝒜, a -> [Q[j, s, a] for j in ℐ])
	π = solve(M, 𝒢)
	ϵ = 1 / sum(N[s, a] for a in joint(𝒜))
	πi′(ai) = ϵ/length(𝒜i) + (1-ϵ)*π[i](ai)
	return SimpleGamePolicy(ai => πi′(ai) for ai in 𝒜i)
end

function update!(πi::NashQLearning, s, a, s′)
	𝒫, ℐ, 𝒮, 𝒜, R, γ = πi.𝒫, πi.𝒫.ℐ, πi.𝒫.𝒮, πi.𝒫.𝒜, πi.𝒫.R, πi.𝒫.γ
	i, Q, N = πi.i, πi.Q, πi.N
	M = NashEquilibrium()
	𝒢 = SimpleGame(γ, ℐ, 𝒜, a′ -> [Q[j, s′, a′] for j in ℐ])
	π = solve(M, 𝒢)
	πi.N[s, a] += 1
	α = 1 / sqrt(N[s, a])
	for j in ℐ
		πi.Q[j,s,a] += α*(R(s,a)[j] + γ*utility(𝒢,π,j) - Q[j,s,a])
	end
end
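
These learners keep Q estimates for every agent and mix the Nash equilibrium of the stage game built from Q with uniform exploration that decays with the visit count N[s,a]. They can be driven by the same kind of rollout loop sketched for fictitious play, replacing the construction line in that hypothetical sketch with:

πs = [NashQLearning(mg, i) for i in mg.ℐ]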
