# qlearningAgents.py
# ------------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).
from game import *
from learningAgents import ReinforcementAgent
from featureExtractors import *
import random,util,math
class QLearningAgent(ReinforcementAgent):
    """
    Tabular Q-Learning agent.

    Functions you should fill in:
      - computeValueFromQValues
      - computeActionFromQValues
      - getQValue
      - getAction
      - update

    Instance variables you have access to:
      - self.epsilon  (exploration prob)
      - self.alpha    (learning rate)
      - self.discount (discount rate)

    Functions you should use:
      - self.getLegalActions(state), which returns the legal actions for a state
    """

    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)
        "*** YOUR CODE HERE ***"
        # Q-table: maps (state, action) pairs to Q-values.
        # util.Counter defaults missing keys to 0, which gives the required
        # 0.0 initial Q-value for unseen (state, action) pairs.
        self.q_values = util.Counter()

    def getQValue(self, state, action):
        """
        Returns Q(state, action).
        Returns 0.0 if we have never seen the (state, action) pair,
        or the stored Q-node value otherwise.
        """
        "*** YOUR CODE HERE ***"
        # Counter returns 0 for unseen keys, satisfying the 0.0 default.
        return self.q_values[(state, action)]

    def computeValueFromQValues(self, state):
        """
        Returns max_action Q(state, action), where the max is taken over
        legal actions. If there are no legal actions (i.e. at the terminal
        state), returns 0.0.
        """
        "*** YOUR CODE HERE ***"
        legalActions = self.getLegalActions(state)
        if len(legalActions) == 0:
            # Terminal state: no actions available, value is 0 by definition.
            return 0.0
        return max(self.getQValue(state, action) for action in legalActions)

    def computeActionFromQValues(self, state):
        """
        Compute the best action to take in a state. If there are no legal
        actions (i.e. at the terminal state), returns None.
        """
        "*** YOUR CODE HERE ***"
        actions = self.getLegalActions(state)
        best_action = None
        # Scan all legal actions and keep the one with the largest Q-value.
        max_val = float('-inf')
        for action in actions:
            q_value = self.getQValue(state, action)
            if max_val < q_value:
                max_val = q_value
                best_action = action
        return best_action

    def getAction(self, state):
        """
        Compute the action to take in the current state. With probability
        self.epsilon we take a random action; otherwise we take the best
        policy action. If there are no legal actions (i.e. at the terminal
        state), returns None.

        HINT: You might want to use util.flipCoin(prob)
        HINT: To pick randomly from a list, use random.choice(list)
        """
        # Pick Action
        legalActions = self.getLegalActions(state)
        action = None
        "*** YOUR CODE HERE ***"
        # BUG FIX: guard the terminal state first. The original code called
        # random.choice(legalActions) on an empty list when the epsilon coin
        # came up heads, raising IndexError instead of returning None.
        if not legalActions:
            return None
        # Epsilon-greedy: explore with probability epsilon, exploit otherwise.
        if util.flipCoin(self.epsilon):
            return random.choice(legalActions)
        return self.getPolicy(state)

    def update(self, state, action, nextState, reward):
        """
        The parent class calls this to observe a
        state => action => nextState-and-reward transition.
        Perform the Q-value update here.

        NOTE: You should never call this function directly;
        it will be called on your behalf.
        """
        "*** YOUR CODE HERE ***"
        # Standard Q-learning update:
        #   Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        old_Q = self.getQValue(state, action)
        a = self.alpha
        r = reward
        g = self.discount
        if nextState:
            self.q_values[(state, action)] = (1 - a) * old_Q + a * (r + g * self.getValue(nextState))
        else:
            # No successor state: the target is just the immediate reward.
            self.q_values[(state, action)] = (1 - a) * old_Q + a * r

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)

    def getValue(self, state):
        return self.computeValueFromQValues(state)
class PacmanQAgent(QLearningAgent):
    "Exactly the same as QLearningAgent, but with different default parameters"

    def __init__(self, epsilon=0.05, gamma=0.8, alpha=0.2, numTraining=0, **args):
        """
        These default parameters can be changed from the pacman.py command
        line. For example, to change the exploration rate, try:
            python pacman.py -p PacmanQLearningAgent -a epsilon=0.1

        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor
        numTraining - number of training episodes, i.e. no learning after
                      these many episodes
        """
        # Fold the Pacman-specific defaults into the keyword dict before
        # handing everything to the generic Q-learning constructor.
        args.update(epsilon=epsilon, gamma=gamma, alpha=alpha,
                    numTraining=numTraining)
        self.index = 0  # This is always Pacman
        QLearningAgent.__init__(self, **args)

    def getAction(self, state):
        """
        Simply calls the getAction method of QLearningAgent and then informs
        the parent of the action taken for Pacman. Do not change or remove
        this method.
        """
        chosen = QLearningAgent.getAction(self, state)
        self.doAction(state, chosen)
        return chosen
class ApproximateQAgent(PacmanQAgent):
    """
    Approximate Q-learning agent.

    You should only have to overwrite getQValue and update.
    All other QLearningAgent functions should work as is.
    """

    def __init__(self, extractor='IdentityExtractor', **args):
        # Resolve the feature extractor class by name and instantiate it.
        self.featExtractor = util.lookup(extractor, globals())()
        PacmanQAgent.__init__(self, **args)
        # Weight vector: one weight per feature name, defaulting to 0.
        self.weights = util.Counter()

    def getWeights(self):
        return self.weights

    def getQValue(self, state, action):
        """
        Returns Q(state, action) = w * featureVector,
        where * is the dot-product operator.
        """
        "*** YOUR CODE HERE ***"
        # Extract the feature vector for this (state, action) pair,
        # then take the dot product with the learned weights.
        features = self.featExtractor.getFeatures(state, action)
        total = 0
        for i in features:
            total += features[i] * self.weights[i]
        return total

    def update(self, state, action, nextState, reward):
        """
        Update the weights based on the observed transition, using:
            diff = (r + gamma * V(s')) - Q(s, a)
            w_i <- w_i + alpha * diff * f_i(s, a)
        """
        "*** YOUR CODE HERE ***"
        # TD error between the bootstrapped target and the current estimate.
        diff = (reward + self.discount * self.getValue(nextState)) - self.getQValue(state, action)
        features = self.featExtractor.getFeatures(state, action)
        for i in features:
            self.weights[i] = self.weights[i] + self.alpha * diff * features[i]

    def final(self, state):
        "Called at the end of each game."
        # call the super-class final method
        PacmanQAgent.final(self, state)

        # did we finish training?
        if self.episodesSoFar == self.numTraining:
            # you might want to print your weights here for debugging
            "*** YOUR CODE HERE ***"
            # Debug summary printed once, when the last training episode ends.
            print("Approximate Q-Learning Summary")
            print("Learning rate(alpha) :{0}".format(self.alpha))
            # BUG FIX: ReinforcementAgent stores the discount factor as
            # self.discount; self.gamma does not exist (AttributeError).
            print("Discount rate(gamma) :{0}".format(self.discount))
            print("Exploration rate(epsilon) :{0}".format(self.epsilon))
            print("Training episodes :{0}".format(self.numTraining))
            print("=======Feature Weights=======")
            # BUG FIX: the original iterated over an undefined local name
            # `features` (NameError). Iterate the weight table instead.
            for i in self.weights:
                print("{0}:{1}".format(i, self.weights[i]))