Reading Notes: Algorithms for Decision Making (12)


V. Multiagent Systems (1)

We now extend the core concepts of single-agent decision making to multiagent systems. In such systems, other agents can be modeled as potential allies or adversaries, and an agent can adjust to them over time.


1. Multiagent Reasoning

Multiagent reasoning is essentially a topic in game theory¹.

1.1 Simple Games

A simple game consists of the following components; a concrete instantiation is sketched after the component list below:

struct SimpleGame
	γ # discount factor
	ℐ # agents
	𝒜 # joint action space
	R # joint reward function
end

Specifically, each agent $i \in \mathcal{I}$ selects an action $a^{i} \in \mathcal{A}^{i}$ to maximize its own accumulated reward $r^{i} \in R^{i}$.

  • The joint action space $\mathcal{A} = \mathcal{A}^{1} \times \cdots \times \mathcal{A}^{k}$ contains all possible joint actions of the multiagent system.
  • A joint action is $\bm{a} = \left(a^{1}, \cdots, a^{k}\right) \in \mathcal{A}$.
  • The joint reward function $R(\bm{a})$ gives each agent's reward for the joint action.
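
As a concrete instantiation of these components, the prisoner's dilemma can be encoded directly as a SimpleGame. This is a hypothetical example for illustration; the game constant 𝒫, the action symbols, and the payoff values below are my own choices, not code from the book:

# Hypothetical example: the prisoner's dilemma as a SimpleGame.
# R returns a vector of rewards, one entry per agent; the payoff numbers are illustrative.
function prisoners_dilemma_reward(a)
	function r(me, other) # reward for an agent playing `me` against `other`
		if me == :cooperate
			return other == :cooperate ? -1.0 : -4.0
		else
			return other == :cooperate ? 0.0 : -3.0
		end
	end
	return [r(a[1], a[2]), r(a[2], a[1])]
end

𝒫 = SimpleGame(
	0.9,                                            # γ: discount factor
	1:2,                                            # ℐ: agent indices
	[[:cooperate, :defect], [:cooperate, :defect]], # 𝒜: per-agent action spaces
	prisoners_dilemma_reward                        # R: joint reward function
)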

A joint policy $\bm{\pi}$ specifies a probability distribution over the joint actions taken by the agents. In game theory, a deterministic policy is called a pure strategy and a stochastic policy is called a mixed strategy. From the perspective of agent $i$, the utility of a joint policy $\bm{\pi}$ is
$$\mathcal{U}^{i}(\bm{\pi}) = \sum_{\bm{a} \in \mathcal{A}} R^{i}(\bm{a}) \prod_{j \in \mathcal{I}} \pi^{j}(a^{j}).$$

struct SimpleGamePolicy
	p # dictionary mapping actions to probabilities
	function SimpleGamePolicy(p::Base.Generator)
		return SimpleGamePolicy(Dict(p))
	end
	
	function SimpleGamePolicy(p::Dict)
		vs = collect(values(p))
		vs ./= sum(vs)
		return new(Dict(k => v for (k,v) in zip(keys(p), vs)))
	end

	SimpleGamePolicy(ai) = new(Dict(ai => 1.0))
end

(πi::SimpleGamePolicy)(ai) = get(πi.p, ai, 0.0)

function (πi::SimpleGamePolicy)()
	D = SetCategorical(collect(keys(πi.p)), collect(values(πi.p)))
	return rand(D)
end
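
SetCategorical, used above for sampling, is not defined in this excerpt. A minimal sketch, assuming Distributions.jl and LinearAlgebra are available, could look like this:

using Distributions: Categorical
using LinearAlgebra: normalize

# Minimal sketch of SetCategorical: a categorical distribution over an
# arbitrary set of elements with the given (unnormalized) weights.
struct SetCategorical{S}
	elements::Vector{S} # the elements of the set
	distr::Categorical  # categorical distribution over their indices
	function SetCategorical(elements::AbstractVector{S}, weights::AbstractVector{<:Real}) where S
		return new{S}(collect(elements), Categorical(normalize(float.(weights), 1)))
	end
end

Base.rand(D::SetCategorical) = D.elements[rand(D.distr)]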

using Base.Iterators: product # Cartesian product used to enumerate joint actions

joint(X) = vec(collect(product(X...))) # all joint actions of the action spaces in X

joint(π, πi, i) = [i == j ? πi : πj for (j, πj) in enumerate(π)] # replace agent i's policy with πi

function utility(𝒫::SimpleGame, π, i)
	𝒜, R = 𝒫.𝒜, 𝒫.R
	p(a) = prod(πj(aj) for (πj, aj) in zip(π, a))
	return sum(R(a)[i]*p(a) for a in joint(𝒜))
end
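
With the hypothetical prisoner's dilemma 𝒫 defined earlier, the utility of a joint policy can be evaluated directly:

# Uniform mixed strategy for each agent, evaluated from agent 1's perspective.
π_uniform = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
utility(𝒫, π_uniform, 1) # expected reward for agent 1: (-1 - 4 + 0 - 3)/4 = -2.0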

1.2 Response Models

We first consider modeling the response of a single agent $i$ when the policies of the other agents are fixed.

function best_response(𝒫::SimpleGame, π, i)
	U(ai) = utility(𝒫, joint(π, SimpleGamePolicy(ai), i), i)
	ai = argmax(U, 𝒫.𝒜[i])
	return SimpleGamePolicy(ai)
end

function softmax_response(𝒫::SimpleGame, π, i, λ)
	𝒜i = 𝒫.𝒜[i]
	U(ai) = utility(𝒫, joint(π, SimpleGamePolicy(ai), i), i)
	return SimpleGamePolicy(ai => exp(λ*U(ai)) for ai in 𝒜i)
end
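
Continuing the hypothetical example, agent 1's responses to the uniform joint policy can be computed as follows; the precision λ = 0.5 is an arbitrary choice:

best_response(𝒫, π_uniform, 1)         # puts all probability on :defect
softmax_response(𝒫, π_uniform, 1, 0.5) # probabilities proportional to exp(λ * action utility)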

1.3 Nash Equilibrium

A Nash equilibrium is a joint policy in which no agent can improve its utility by unilaterally changing its own policy. Finding a Nash equilibrium can be cast as the following optimization problem:

$$
\begin{aligned}
\min_{\pi, \mathcal{U}} \quad & \sum_{i} \left(\mathcal{U}^{i} - \mathcal{U}^{i}(\pi)\right) \\
\text{s.t.} \quad & \mathcal{U}^{i} \geq \mathcal{U}^{i}\left(a^{i}, \pi^{-i}\right), \ \forall i, a^{i} \\
& \sum_{a^{i}} \pi^{i}\left(a^{i}\right) = 1, \ \forall i \\
& \pi^{i}\left(a^{i}\right) \geq 0, \ \forall i, a^{i}
\end{aligned}
$$

At the optimum, each $\mathcal{U}^{i}$ equals agent $i$'s best-response utility, so the objective is zero exactly when $\pi$ is a Nash equilibrium.

struct NashEquilibrium end

function tensorform(𝒫::SimpleGame)
	ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
	ℐ′ = eachindex(ℐ)
	𝒜′ = [eachindex(𝒜[i]) for i in ℐ]
	R′ = [R(a) for a in joint(𝒜)]
	return ℐ′, 𝒜′, R′
end

function solve(M::NashEquilibrium, 𝒫::SimpleGame)
	ℐ, 𝒜, R = tensorform(𝒫)
	model = Model(Ipopt.Optimizer)
	@variable(model, U[ℐ])
	@variable(model, π[i=ℐ, 𝒜[i]] ≥ 0)
	@NLobjective(model, Min,
		sum(U[i] - sum(prod(π[j,a[j]] for j in ℐ) * R[y][i]
		for (y,a) in enumerate(joint(𝒜))) for i in ℐ))
	@NLconstraint(model, [i=ℐ, ai=𝒜[i]],
		U[i] ≥ sum(
			prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : π[j,a[j]] for j in ℐ)
			* R[y][i] for (y,a) in enumerate(joint(𝒜))))
	@constraint(model, [i=ℐ], sum(π[i,ai] for ai in 𝒜[i]) == 1)
	optimize!(model)
	πi′(i) = SimpleGamePolicy(𝒫.𝒜[i][ai] => value(π[i,ai]) for ai in 𝒜[i])
	return [πi′(i) for i in ℐ]
end
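
The solver above relies on JuMP.jl with the Ipopt optimizer. A usage sketch on the hypothetical prisoner's dilemma, whose unique Nash equilibrium is mutual defection:

using JuMP, Ipopt # required by the NashEquilibrium solver above

π_nash = solve(NashEquilibrium(), 𝒫) # each policy puts (numerically) all probability on :defect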

1.3.1 Correlated Equilibrium

A correlated equilibrium generalizes the Nash equilibrium by optimizing a single joint distribution over joint actions rather than independent per-agent policies; no agent should benefit from deviating from the action recommended to it.

struct CorrelatedEquilibrium end

function solve(M::CorrelatedEquilibrium, 𝒫::SimpleGame)
	ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
	model = Model(Ipopt.Optimizer)
	@variable(model, π[joint(𝒜)] ≥ 0)
	@objective(model, Max, sum(sum(π[a]*R(a) for a in joint(𝒜))))
	@constraint(model, [i=ℐ, ai=𝒜[i], ai′=𝒜[i]],
		sum(R(a)[i]*π[a] for a in joint(𝒜) if a[i]==ai)
		≥ sum(R(joint(a,ai′,i))[i]*π[a] for a in joint(𝒜) if a[i]==ai))
	@constraint(model, sum(π) == 1)
	optimize!(model)
	return JointCorrelatedPolicy(a => value(π[a]) for a in joint(𝒜))
end
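
JointCorrelatedPolicy, returned above, is not defined in this excerpt. A minimal sketch modeled on SimpleGamePolicy, but over joint actions, might look like this:

# Minimal sketch of JointCorrelatedPolicy: a single distribution over joint actions.
struct JointCorrelatedPolicy
	p # dictionary mapping joint actions to probabilities
	JointCorrelatedPolicy(p::Base.Generator) = new(Dict(p))
end

(π::JointCorrelatedPolicy)(a) = get(π.p, a, 0.0) # probability of joint action a

function (π::JointCorrelatedPolicy)() # sample a joint action
	D = SetCategorical(collect(keys(π.p)), collect(values(π.p)))
	return rand(D)
end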

1.4 Iterated Best Response

Since computing a Nash equilibrium can be computationally demanding, an alternative is to iteratively apply best responses over a series of repeated games.

struct IteratedBestResponse
	k_max # number of iterations
	π # initial policy
end

function IteratedBestResponse(𝒫::SimpleGame, k_max)
	π = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
	return IteratedBestResponse(k_max, π)
end

function solve(M::IteratedBestResponse, 𝒫)
	π = M.π
	for k in 1:M.k_max
		π = [best_response(𝒫, π, i) for i in 𝒫.ℐ]
	end
	return π
end
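
A usage sketch on the hypothetical prisoner's dilemma, with an arbitrary iteration count:

M = IteratedBestResponse(𝒫, 10)
π_ibr = solve(M, 𝒫) # converges to mutual defection for this game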

1.4.1 Hierarchical Softmax

Hierarchical softmax models bounded rationality: a level-k agent computes a softmax response with precision λ to the policies of level k-1 agents, starting from an initial (here uniform) policy.

struct HierarchicalSoftmax
	λ # precision parameter
	k # level
	π # initial policy
end

function HierarchicalSoftmax(𝒫::SimpleGame, λ, k)
	π = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
	return HierarchicalSoftmax(λ, k, π)
end

function solve(M::HierarchicalSoftmax, 𝒫)
	π = M.π
	for k in 1:M.k
		π = [softmax_response(𝒫, π, i, M.λ) for i in 𝒫.ℐ]
	end
	return π
end
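
A usage sketch with arbitrary precision and depth values:

π_hs = solve(HierarchicalSoftmax(𝒫, 0.5, 5), 𝒫) # λ = 0.5, k = 5 levels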

1.5 Fictitious Play

Another way to compute policies for the different agents is to let them interact with one another in simulation and learn to respond optimally. In fictitious play, each agent keeps counts of the actions taken by the other agents, treats the normalized counts as estimates of their stochastic policies, and plays a best response to those estimates.

mutable struct FictitiousPlay
	𝒫 # simple game
	i # agent index
	N # array of action count dictionaries
	πi # current policy
end

function FictitiousPlay(𝒫::SimpleGame, i)
	N = [Dict(aj => 1 for aj in 𝒫.𝒜[j]) for j in 𝒫.ℐ]
	πi = SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i])
	return FictitiousPlay(𝒫, i, N, πi)
end

(πi::FictitiousPlay)() = πi.πi()

(πi::FictitiousPlay)(ai) = πi.πi(ai)

function update!(πi::FictitiousPlay, a)
	N, 𝒫, ℐ, i = πi.N, πi.𝒫, πi.𝒫.ℐ, πi.i
	# update the observed action counts for every agent
	for (j, aj) in enumerate(a)
		N[j][aj] += 1
	end
	# maximum-likelihood estimate of agent j's policy from its action counts
	p(j) = SimpleGamePolicy(aj => u/sum(values(N[j])) for (aj, u) in N[j])
	π = [p(j) for j in ℐ]
	πi.πi = best_response(𝒫, π, i)
end
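
Fictitious play agents learn from repeated interaction. The simulate helper below is my own illustration of such a loop, not code from the excerpt:

# Illustrative simulation loop: each round, every agent samples an action from
# its current policy and then observes the resulting joint action.
function simulate(𝒫::SimpleGame, π, k_max)
	for _ in 1:k_max
		a = [πi() for πi in π] # sample a joint action
		for πi in π
			update!(πi, a)     # each agent updates on the observed joint action
		end
	end
	return π
end

π_fp = [FictitiousPlay(𝒫, i) for i in 𝒫.ℐ]
simulate(𝒫, π_fp, 100)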

1.6 Gradient Ascent

Gradient ascent updates an agent's policy in the direction that increases its utility, using a step size that decays as $1/\sqrt{t}$ and projecting the result back onto the probability simplex.

mutable struct GradientAscent
	𝒫 # simple game
	i # agent index
	t # time step
	πi # current policy
end

function GradientAscent(𝒫::SimpleGame, i)
	uniform() = SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i])
	return GradientAscent(𝒫, i, 1, uniform())
end

(πi::GradientAscent)() = πi.πi()

(πi::GradientAscent)(ai) = πi.πi(ai)

function update!(πi::GradientAscent, a)
	𝒫, ℐ, 𝒜i, i, t = πi.𝒫, πi.𝒫.ℐ, πi.𝒫.𝒜[πi.i], πi.i, πi.t
	# joint policy in which the other agents deterministically repeat their
	# observed actions while agent i plays ai
	jointπ(ai) = [SimpleGamePolicy(j == i ? ai : a[j]) for j in ℐ]
	# utility of each of agent i's actions under jointπ (the gradient direction)
	r = [utility(𝒫, jointπ(ai), i) for ai in 𝒜i]
	π′ = [πi.πi(ai) for ai in 𝒜i]
	# gradient step with decaying step size 1/sqrt(t), projected back onto the simplex
	π = project_to_simplex(π′ + r / sqrt(t))
	πi.t = t + 1
	πi.πi = SimpleGamePolicy(ai => p for (ai, p) in zip(𝒜i, π))
end
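
project_to_simplex, used above, is not defined in this excerpt. A minimal sketch of the standard Euclidean projection onto the probability simplex:

# Minimal sketch of project_to_simplex: Euclidean projection of y onto
# the probability simplex {x : x ≥ 0, sum(x) = 1}.
function project_to_simplex(y)
	u = sort(y, rev=true)
	i = maximum([j for j in eachindex(u) if u[j] + (1 - sum(u[1:j]))/j > 0.0])
	δ = (1 - sum(u[1:i])) / i
	return [max(yj + δ, 0.0) for yj in y]
end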


  1. For foundations of game theory, see: [1] D. Fudenberg and J. Tirole, Game Theory. MIT Press, 1991. [2] R. B. Myerson, Game Theory: Analysis of Conflict. Harvard University Press, 1997. [3] Y. Shoham and K. Leyton-Brown, Multiagent Systems: Algorithmic, Game-Theoretic, and Logical Foundations. Cambridge University Press, 2009. ↩︎
