Nash Q-Learning
Paper: Nash Q-learning for general-sum stochastic games
Link: http://www.jmlr.org/papers/volume4/hu03a/hu03a.pdf
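
Nash Q-learning extends Q-learning to general-sum stochastic games: each agent keeps a Q-table for itself and one for its opponent, and bootstraps on the Nash equilibrium value of the next state's stage game instead of a max. The paper's update rule for agent $i$ (two-agent case, matching the code below) is:

$$Q^i_{t+1}(s, a^1, a^2) = (1-\alpha)\,Q^i_t(s, a^1, a^2) + \alpha\left[r^i_t + \gamma\,\mathrm{NashQ}^i_t(s')\right]$$

where $\mathrm{NashQ}^i_t(s') = \pi^1(s')\,Q^i_t(s')\,\pi^2(s')$ is agent $i$'s expected payoff at $s'$ under a Nash equilibrium $(\pi^1, \pi^2)$ of the stage game defined by the current Q-tables.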
Agent
```python
import numpy as np
import nashpy


class NashQLearner():
    def __init__(self,
                 alpha=0.1,
                 policy=None,
                 gamma=0.99,
                 ini_state="nonstate",
                 actions=None):
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.policy = policy    # action-selection policy (e.g. epsilon-greedy)
        self.actions = actions
        self.state = ini_state

        # q values (my and opponent), keyed by state, then by joint action
        self.q, self.q_o = {}, {}
        self.q[ini_state] = {}
        self.q_o[ini_state] = {}

        # nash q value (stage-game equilibrium value per state)
        self.nashq = {}
        self.nashq[ini_state] = 0

        # pi (my and opponent), initialized to uniform mixed strategies
        self.pi, self.pi_o = {}, {}
        self.pi[ini_state] = np.repeat(1.0 / len(self.actions), len(self.actions))
        self.pi_o[ini_state] = np.repeat(1.0 / len(self.actions), len(self.actions))

        self.previous_action = None
        self.reward_history = []
        self.pi_history = []
    def act(self, training=True):
        if training:
            # sample an action from the current mixed strategy pi
            action_id = self.policy.select_action(self.pi[self.state])
            action = self.actions[action_id]
            self.previous_action = action
        else:
            # greedy action w.r.t. the strategy for the current state
            action_id = self.policy.select_greedy_action(self.pi[self.state])
            action = self.actions[action_id]
        return action
    def observe(self, state="nonstate", reward=None, reward_o=None,
                opponent_action=None, is_learn=True):
        """
        observe next state and learn
        """
        if is_learn:
            self.check_new_state(state)  # if the state is new, extend the q tables
            self.learn(state, reward, reward_o, opponent_action)
    def learn(self, state, reward, reward_o, opponent_action):
        self.reward_history.append(reward)
        # update both Q-tables for the joint action that was just played
        self.q[state][(self.previous_action, opponent_action)] = \
            self.compute_q(state, reward, opponent_action, self.q)
        self.q_o[state][(self.previous_action, opponent_action)] = \
            self.compute_q(state, reward_o, opponent_action, self.q_o)
        # recompute the stage-game Nash equilibrium and its value
        self.pi[state], self.pi_o[state] = self.compute_pi(state)
        self.nashq[state] = self.compute_nashq(state)
        self.pi_history.append(self.pi[state][0])
    def compute_q(self, state, reward, opponent_action, q):
        # initialize the entry for a joint action seen for the first time
        if (self.previous_action, opponent_action) not in q[state]:
            q[state][(self.previous_action, opponent_action)] = 0.0
        q_old = q[state][(self.previous_action, opponent_action)]
        # Nash-Q update: Q <- (1 - alpha) * Q + alpha * (r + gamma * NashQ(s'))
        return q_old + self.alpha * (reward + self.gamma * self.nashq[state] - q_old)
```
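
For context, here is a minimal usage sketch of the agent in matching pennies. The `EpsilonGreedyPolicy` class and the game loop are illustrative assumptions (any object exposing `select_action(pi)` and `select_greedy_action(pi)` works), and the sketch relies on the remaining `NashQLearner` methods (`check_new_state`, `compute_pi`, `compute_nashq`) defined later:

```python
# Minimal sketch: two NashQLearner agents playing matching pennies.
# EpsilonGreedyPolicy is a hypothetical helper, not part of the class above.
import numpy as np

class EpsilonGreedyPolicy():
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon

    def select_action(self, pi):
        # explore uniformly with probability epsilon, otherwise sample from pi
        if np.random.rand() < self.epsilon:
            return np.random.randint(len(pi))
        return np.random.choice(len(pi), p=pi)

    def select_greedy_action(self, pi):
        return int(np.argmax(pi))

actions = [0, 1]  # heads / tails
agent1 = NashQLearner(policy=EpsilonGreedyPolicy(), actions=actions)
agent2 = NashQLearner(policy=EpsilonGreedyPolicy(), actions=actions)

for _ in range(10000):
    a1, a2 = agent1.act(), agent2.act()
    r1 = 1 if a1 == a2 else -1  # agent1 wins on a match; zero-sum payoffs
    agent1.observe(reward=r1, reward_o=-r1, opponent_action=a2)
    agent2.observe(reward=-r1, reward_o=r1, opponent_action=a1)
```

Matching pennies has a unique Nash equilibrium at the uniform mixed strategy, so if learning behaves as intended, the recorded `pi_history` of each agent should drift toward 0.5.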