class QLearningAgent(BaseAgent):
    """Tabular Q-learning agent with an epsilon-greedy behavior policy.

    Maintains a (num_states, num_actions) table of action-value estimates
    and updates it with the off-policy Q-learning TD target.
    """

    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
            agent_init_info (dict), the parameters used to initialize the agent.
            The dictionary contains:
            {
                num_states (int): The number of states,
                num_actions (int): The number of actions,
                epsilon (float): The epsilon parameter for exploration,
                step_size (float): The step-size,
                discount (float): The discount factor,
                seed (int): Seed for the agent's random-number generator,
            }
        """
        # Store the parameters provided in agent_init_info.
        self.num_actions = agent_init_info["num_actions"]
        self.num_states = agent_init_info["num_states"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        # BUG FIX: the original read the seed from an undefined name
        # `agent_info`, which raised NameError; the parameter is
        # `agent_init_info`.
        self.rand_generator = np.random.RandomState(agent_init_info["seed"])

        # Array of action-value estimates, initialized to zero.
        self.q = np.zeros((self.num_states, self.num_actions))

    def agent_start(self, observation):
        """The first method called when the episode starts, called after
        the environment starts.

        Args:
            observation (int): the state observation from the
                environment's env_start function.
        Returns:
            action (int): the first action the agent takes.
        """
        state = observation
        current_q = self.q[state, :]
        # Choose action using epsilon greedy.
        if self.rand_generator.rand() < self.epsilon:
            action = self.rand_generator.randint(self.num_actions)
        else:
            action = self.argmax(current_q)
        # Remember the transition start so the next step can update Q.
        self.prev_state = state
        self.prev_action = action
        return action

    def agent_step(self, reward, observation):
        """A step taken by the agent.

        Args:
            reward (float): the reward received for taking the last action taken
            observation (int): the state observation from the
                environment's step based on where the agent ended up after the
                last step.
        Returns:
            action (int): the action the agent is taking.
        """
        state = observation
        current_q = self.q[state, :]
        # Choose action using epsilon greedy.
        if self.rand_generator.rand() < self.epsilon:
            action = self.rand_generator.randint(self.num_actions)
        else:
            action = self.argmax(current_q)

        # Q-learning update: the off-policy target bootstraps from the
        # greedy (max) value of the new state, regardless of the action taken.
        target = reward + self.discount * np.max(self.q[state, :])
        self.q[self.prev_state, self.prev_action] += self.step_size * (
            target - self.q[self.prev_state, self.prev_action]
        )

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_end(self, reward):
        """Run when the agent terminates.

        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # Perform the last update in the episode: the terminal state's value
        # is 0 by definition, so the target is just the final reward.
        target = reward
        self.q[self.prev_state, self.prev_action] += self.step_size * (
            target - self.q[self.prev_state, self.prev_action]
        )

    def argmax(self, q_values):
        """argmax with random tie-breaking.

        Args:
            q_values (Numpy array): the array of action-values
        Returns:
            action (int): an action with the highest value
        """
        top = float("-inf")
        ties = []
        for i in range(len(q_values)):
            # A strictly larger value resets the tie list before it is
            # (re)filled by the equality check below.
            if q_values[i] > top:
                top = q_values[i]
                ties = []
            if q_values[i] == top:
                ties.append(i)
        return self.rand_generator.choice(ties)