Learning Python by Writing Code: CS188 Q-Learning

# qlearningAgents.py
# ------------------
# Licensing Information:  You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).


from game import *
from learningAgents import ReinforcementAgent
from featureExtractors import *

import random, util, math

class QLearningAgent(ReinforcementAgent):
    """
      Q-Learning Agent

      Functions you should fill in:
        - computeValueFromQValues
        - computeActionFromQValues
        - getQValue
        - getAction
        - update

      Instance variables you have access to
        - self.epsilon (exploration prob)
        - self.alpha (learning rate)
        - self.discount (discount rate)

      Functions you should use
        - self.getLegalActions(state)
          which returns legal actions for a state
    """
    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)

        "*** YOUR CODE HERE ***"
        # Store Q-values in a util.Counter keyed by (state, action);
        # unseen keys default to 0, which serves as the initial Q-value.
        self.q_values = util.Counter()

    def getQValue(self, state, action):
        """
          Returns Q(state,action)
          Should return 0.0 if we have never seen a state
          or the Q node value otherwise
        """
        "*** YOUR CODE HERE ***"
        # Return the stored Q-value; util.Counter yields 0.0 for
        # (state, action) pairs that have never been updated.
        return self.q_values[(state, action)]

    def computeValueFromQValues(self, state):
        """
          Returns max_action Q(state,action)
          where the max is over legal actions.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return a value of 0.0.
        """
        "*** YOUR CODE HERE ***"
        # Collect all legal actions; a terminal state has none.
        legalActions = self.getLegalActions(state)
        if len(legalActions) == 0:
            return 0.0
        tmp = util.Counter()
        for action in legalActions:
            tmp[action] = self.getQValue(state, action)
        # util.Counter is an enhanced dict: argMax() returns the key with the
        # largest value, so this gives the highest Q-value among legal actions.
        return tmp[tmp.argMax()]

    def computeActionFromQValues(self, state):
        """
          Compute the best action to take in a state.  Note that if there
          are no legal actions, which is the case at the terminal state,
          you should return None.
        """
        "*** YOUR CODE HERE ***"
        # Get all legal actions; if there are none we fall through and return None.
        actions = self.getLegalActions(state)
        best_action = None
        # Scan the actions and keep the one with the largest Q-value.
        max_val = float('-inf')
        for action in actions:
            q_value = self.q_values[(state, action)]
            if max_val < q_value:
                max_val = q_value
                best_action = action
        return best_action

    def getAction(self, state):
        """
          Compute the action to take in the current state.  With
          probability self.epsilon, we should take a random action and
          take the best policy action otherwise.  Note that if there are
          no legal actions, which is the case at the terminal state, you
          should choose None as the action.

          HINT: You might want to use util.flipCoin(prob)
          HINT: To pick randomly from a list, use random.choice(list)
        """
        # Pick Action
        legalActions = self.getLegalActions(state)
        action = None
        "*** YOUR CODE HERE ***"
        # At a terminal state there are no legal actions, so return None.
        if len(legalActions) == 0:
            return action
        # With probability epsilon take a random action (exploration), otherwise
        # follow the greedy policy.  This epsilon-greedy behaviour also covers Q7.
        explore = util.flipCoin(self.epsilon)
        if explore:
            return random.choice(legalActions)
        else:
            return self.getPolicy(state)

    def update(self, state, action, nextState, reward):
        """
          The parent class calls this to observe a
          state = action => nextState and reward transition.
          You should do your Q-Value update here

          NOTE: You should never call this function,
          it will be called on your behalf
        """
        "*** YOUR CODE HERE ***"
        # Step 1: fetch the old Q-value and rename the hyperparameters so the
        # update expressions below stay readable.
        old_Q = self.getQValue(state, action)
        a = self.alpha
        r = reward
        g = self.discount
        if nextState:
            self.q_values[(state, action)] = (1 - a) * old_Q + a * (r + g * self.getValue(nextState))
        else:
            self.q_values[(state, action)] = (1 - a) * old_Q + a * r
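
    # For reference, the branch above implements the standard tabular Q-learning
    # update (this notation is added here, it is not part of the assignment text):
    #     Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
    # where max_a' Q(s',a') is getValue(nextState), and the terminal-state branch
    # simply drops the discounted future-value term.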

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)

    def getValue(self, state):
        return self.computeValueFromQValues(state)

class PacmanQAgent(QLearningAgent):
    "Exactly the same as QLearningAgent, but with different default parameters"

    def __init__(self, epsilon=0.05, gamma=0.8, alpha=0.2, numTraining=0, **args):
        """
        These default parameters can be changed from the pacman.py command line.
        For example, to change the exploration rate, try:
            python pacman.py -p PacmanQLearningAgent -a epsilon=0.1

        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor
        numTraining - number of training episodes, i.e. no learning after these many episodes
        """
        args['epsilon'] = epsilon
        args['gamma'] = gamma
        args['alpha'] = alpha
        args['numTraining'] = numTraining
        self.index = 0  # This is always Pacman
        QLearningAgent.__init__(self, **args)

    def getAction(self, state):
        """
        Simply calls the getAction method of QLearningAgent and then
        informs parent of action for Pacman.  Do not change or remove this
        method.
        """
        action = QLearningAgent.getAction(self, state)
        self.doAction(state, action)
        return action
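
    # A typical way to exercise this agent, taken from the CS188 reinforcement
    # project (the exact flags may vary between versions of the assignment):
    #     python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid
    # trains for 2000 episodes and then plays 10 games with learning disabled.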

class ApproximateQAgent(PacmanQAgent):
    """
      ApproximateQLearningAgent

      You should only have to overwrite getQValue
      and update.  All other QLearningAgent functions
      should work as is.
    """
    def __init__(self, extractor='IdentityExtractor', **args):
        self.featExtractor = util.lookup(extractor, globals())()
        PacmanQAgent.__init__(self, **args)
        self.weights = util.Counter()

    def getWeights(self):
        return self.weights

    def getQValue(self, state, action):
        """
          Should return Q(state,action) = w * featureVector
          where * is the dotProduct operator
        """
        "*** YOUR CODE HERE ***"
        # Extract the feature vector for this (state, action) pair.
        features = self.featExtractor.getFeatures(state, action)
        # Take the dot product of the feature vector and the weight vector
        # to obtain the approximate Q-value.
        total = 0
        for i in features:
            total += features[i] * self.weights[i]
        return total
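
    # Put differently, the approximate Q-value is linear in the features
    # (a restatement of the docstring above, not new assignment material):
    #     Q(s,a) = sum_i w_i * f_i(s,a)
    # with f_i taken from the feature extractor and w_i from self.weights.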

    def update(self, state, action, nextState, reward):
        """
          Should update your weights based on transition
        """
        "*** YOUR CODE HERE ***"
        # Compute the correction term 'diff' from the formula, then use it to
        # update every weight in the weight vector.
        diff = (reward + self.discount * self.getValue(nextState)) - self.getQValue(state, action)
        features = self.featExtractor.getFeatures(state, action)
        for i in features:
            self.weights[i] = self.weights[i] + self.alpha * diff * features[i]
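
    # The loop above is the approximate Q-learning weight update:
    #     diff = (r + gamma * max_a' Q(s',a')) - Q(s,a)
    #     w_i  <- w_i + alpha * diff * f_i(s,a)
    # so every feature active in (s,a) is nudged in proportion to the TD error.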

    def final(self, state):
        "Called at the end of each game."
        # call the super-class final method
        PacmanQAgent.final(self, state)

        # did we finish training?
        if self.episodesSoFar == self.numTraining:
            # you might want to print your weights here for debugging
            "*** YOUR CODE HERE ***"
            # Print a short training summary; adapt this debug output as you like.
            print("Approximate Q-Learning Summary")
            print("Learning rate (alpha)     : {0}".format(self.alpha))
            print("Discount rate (gamma)     : {0}".format(self.discount))
            print("Exploration rate (epsilon): {0}".format(self.epsilon))
            print("Training episodes         : {0}".format(self.numTraining))
            print("======= Feature Weights =======")
            for i in self.weights:
                print("{0}: {1}".format(i, self.weights[i]))
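
To see the tabular update in isolation, here is a minimal, self-contained sketch of the same epsilon-greedy selection and Q-value update on a made-up two-action problem. The names (q_values, legal_actions, q_update) and the toy transition are mine, not part of the CS188 code; alpha, gamma and epsilon play the same roles as self.alpha, self.discount and self.epsilon above.

import random
from collections import defaultdict

q_values = defaultdict(float)          # (state, action) -> Q estimate, defaults to 0.0
alpha, gamma, epsilon = 0.5, 0.9, 0.1  # learning rate, discount, exploration rate

def legal_actions(state):
    return ['stay', 'move']            # every state offers the same two actions here

def epsilon_greedy(state):
    # With probability epsilon explore; otherwise exploit the current estimates.
    if random.random() < epsilon:
        return random.choice(legal_actions(state))
    return max(legal_actions(state), key=lambda a: q_values[(state, a)])

def q_update(state, action, next_state, reward):
    # Same rule as QLearningAgent.update:
    #   Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
    best_next = max(q_values[(next_state, a)] for a in legal_actions(next_state))
    sample = reward + gamma * best_next
    q_values[(state, action)] = (1 - alpha) * q_values[(state, action)] + alpha * sample

# One simulated transition: moving from state 'A' to state 'B' yields reward 1.
q_update('A', 'move', 'B', 1.0)
print(q_values[('A', 'move')])         # 0.5 after a single update
print(epsilon_greedy('A'))             # usually 'move', occasionally a random action

In the full project, the approximate agent is usually trained with a feature extractor, for example python pacman.py -p ApproximateQAgent -a extractor=SimpleExtractor -x 50 -n 60 -l mediumGrid (flags as in the CS188 reinforcement assignment; check your project version).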
