Learning Python by Coding: CS188 Markov Decision Processes (valueIterationAgents.py)

This post walks through ValueIterationAgent and its variants: synchronous, asynchronous (cyclic), and prioritized-sweeping value iteration for solving MDPs. It covers the discount factor and iteration count, and the core methods for computing Q-values, extracting a policy, and updating state values, showing how value iteration proceeds under the different update schedules. The material applies to reinforcement learning and decision-making settings.
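For reference, all of these agents implement the standard value-iteration backup. With $T(s,a,s')$ the transition probability, $R(s,a,s')$ the reward, and $\gamma$ the discount factor passed in as `discount`, one sweep updates every state by

$$
V_{k+1}(s) \;=\; \max_{a}\, Q_k(s,a), \qquad
Q_k(s,a) \;=\; \sum_{s'} T(s,a,s')\,\bigl[\,R(s,a,s') + \gamma\, V_k(s')\,\bigr],
$$

and the resulting policy is $\pi(s) = \arg\max_a Q(s,a)$. The annotated code follows.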

```python
# valueIterationAgents.py
# -----------------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).

import mdp, util

from learningAgents import ValueEstimationAgent
import collections


class ValueIterationAgent(ValueEstimationAgent):
    """
    * Please read learningAgents.py before reading this. *

    A ValueIterationAgent takes a Markov decision process (see mdp.py)
    on initialization and runs value iteration for a given number of
    iterations using the supplied discount factor.
    """

    def __init__(self, mdp, discount=0.9, iterations=100):
        """
        Your value iteration agent should take an mdp on construction,
        run the indicated number of iterations and then act according
        to the resulting policy.

        Some useful mdp methods you will use:
            mdp.getStates()
            mdp.getPossibleActions(state)
            mdp.getTransitionStatesAndProbs(state, action)
            mdp.getReward(state, action, nextState)
            mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.runValueIteration()

    def runValueIteration(self):
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(self.iterations):
            states = self.mdp.getStates()
            temp_counter = util.Counter()
            for state in states:
                maxVal = -float('inf')
                for action in self.mdp.getPossibleActions(state):
                    Q = self.computeQValueFromValues(state, action)
                    if Q > maxVal:
                        maxVal = Q
                # Only write maxVal if it was actually updated; a state with
                # no legal actions (e.g. a terminal state) keeps a value of 0.
                temp_counter[state] = maxVal if maxVal != -float('inf') else 0
            # Replace all values at once: a batch (synchronous) update.
            self.values = temp_counter

    def getValue(self, state):
        """Return the value of the state (computed in __init__)."""
        return self.values[state]

    def computeQValueFromValues(self, state, action):
        """
        Compute the Q-value of action in state from the
        value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        # Step 1: implement the Q-value computation. Iterate over the
        # possible successor states and accumulate the sum, following
        # the Q-value formula.
        total = 0
        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
            total += prob * (self.mdp.getReward(state, action, nextState)
                             + self.discount * self.getValue(nextState))
        return total
```
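As a quick sanity check on `computeQValueFromValues`, take a hypothetical transition model (the numbers below are made up purely for illustration): from state $s$, action $a$ reaches $s_1$ with probability $0.8$ and reward $1$, and $s_2$ with probability $0.2$ and reward $0$. With $\gamma = 0.9$, $V(s_1) = 10$ and $V(s_2) = 0$,

$$
Q(s,a) = 0.8\,(1 + 0.9 \cdot 10) + 0.2\,(0 + 0.9 \cdot 0) = 0.8 \cdot 10 = 8.0 .
$$

The remaining methods of the class simply read the greedy policy out of the stored values: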

```python
    def computeActionFromValues(self, state):
        """
        The policy is the best action in the given state according to
        the values currently stored in self.values.

        You may break ties any way you see fit. Note that if there are
        no legal actions, which is the case at the terminal state, you
        should return None.
        """
        "*** YOUR CODE HERE ***"
        # Initialization
        maxVal = -float('inf')
        bestAction = None
        # Compute the Q-value of every legal action in the current state
        # and keep the action with the largest Q-value (that maximum is
        # the state's V-value).
        for action in self.mdp.getPossibleActions(state):
            Q = self.computeQValueFromValues(state, action)
            if Q > maxVal:
                maxVal = Q
                bestAction = action
        return bestAction

    def getPolicy(self, state):
        return self.computeActionFromValues(state)

    def getAction(self, state):
        "Returns the policy at the state (no exploration)."
        return self.computeActionFromValues(state)

    def getQValue(self, state, action):
        return self.computeQValueFromValues(state, action)
```
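To see the batch update converge end to end, here is a minimal standalone sketch on a toy two-state MDP. Everything in it (the `transitions` dictionary, the `q_value` helper, and the numbers) is a hypothetical illustration that mirrors the logic of `runValueIteration` and `computeQValueFromValues` without depending on the course's `mdp.py` or `util.py`:

```python
# Standalone sketch of the same batch value-iteration update on a toy MDP.
# All names and numbers are hypothetical; the real agent uses mdp.py/util.py.

# transitions[state][action] = list of (nextState, probability, reward)
transitions = {
    'A': {'go': [('B', 1.0, 0.0)]},
    'B': {'go': [('TERMINAL', 1.0, 10.0)]},
    'TERMINAL': {},          # terminal state: no legal actions
}
discount = 0.9

def q_value(values, state, action):
    # Q(s,a) = sum over s' of T(s,a,s') * [R(s,a,s') + gamma * V(s')]
    return sum(prob * (reward + discount * values[nxt])
               for nxt, prob, reward in transitions[state][action])

values = {s: 0.0 for s in transitions}
for _ in range(100):                     # fixed number of iterations, as in the agent
    new_values = {}
    for s, actions in transitions.items():
        if actions:                      # non-terminal: best Q over legal actions
            new_values[s] = max(q_value(values, s, a) for a in actions)
        else:                            # terminal: value stays 0
            new_values[s] = 0.0
    values = new_values                  # batch (synchronous) replacement

print(values)   # roughly {'A': 9.0, 'B': 10.0, 'TERMINAL': 0.0}
```

The asynchronous variant below keeps the same backup but changes the update schedule: one state per iteration, updated in place.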

```python
class AsynchronousValueIterationAgent(ValueIterationAgent):
    """
    * Please read learningAgents.py before reading this. *

    An AsynchronousValueIterationAgent takes a Markov decision process
    (see mdp.py) on initialization and runs cyclic value iteration for
    a given number of iterations using the supplied discount factor.
    """

    def __init__(self, mdp, discount=0.9, iterations=1000):
        """
        Your cyclic value iteration agent should take an mdp on
        construction, run the indicated number of iterations, and then
        act according to the resulting policy. Each iteration updates
        the value of only one state, which cycles through the states
        list. If the chosen state is terminal, nothing happens in that
        iteration.

        Some useful mdp methods you will use:
            mdp.getStates()
            mdp.getPossibleActions(state)
            mdp.getTransitionStatesAndProbs(state, action)
            mdp.getReward(state)
            mdp.isTerminal(state)
        """
        ValueIterationAgent.__init__(self, mdp, discount, iterations)

    def runValueIteration(self):
        # Cache the list of MDP states.
        states = self.mdp.getStates()
        # The "asynchronous" strategy updates one state per iteration,
        # instead of updating every state in each iteration as above.
        for index in range(self.iterations):
            # Pick one state; the modulo keeps the index inside the
            # valid range, so the updates cycle through the state list.
            state = states[index % len(states)]
            # Only this state is updated, and the assignment requires
            # that terminal states be left untouched.
            if not self.mdp.isTerminal(state):
                # Compute the V-value the same way as before.
                maxVal = -float('inf')
                for action in self.mdp.getPossibleActions(state):
                    Q = self.computeQValueFromValues(state, action)
                    if Q > maxVal:
                        maxVal = Q
                # This updates a single state in place, rather than
                # replacing the values of all states at once.
                self.values[state] = maxVal
```
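Continuing the toy sketch above, the cyclic schedule would look like this (again purely illustrative, reusing the hypothetical `transitions` and `q_value` from the earlier sketch):

```python
# Cyclic (asynchronous) updates: one state per iteration, modified in place.
states = list(transitions.keys())
values = {s: 0.0 for s in states}
for index in range(1000):
    state = states[index % len(states)]   # cycle through the state list
    actions = transitions[state]
    if actions:                           # terminal states are skipped
        values[state] = max(q_value(values, state, a) for a in actions)
```

Because the values are overwritten in place, an update made early in a sweep is already visible to the states updated later in the same sweep. Prioritized sweeping, below, goes one step further and picks which state to update next based on how out of date its value is.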

```python
class PrioritizedSweepingValueIterationAgent(AsynchronousValueIterationAgent):
    """
    * Please read learningAgents.py before reading this. *

    A PrioritizedSweepingValueIterationAgent takes a Markov decision
    process (see mdp.py) on initialization and runs prioritized sweeping
    value iteration for a given number of iterations using the supplied
    parameters.
    """

    def __init__(self, mdp, discount=0.9, iterations=100, theta=1e-5):
        """
        Your prioritized sweeping value iteration agent should take an
        mdp on construction, run the indicated number of iterations, and
        then act according to the resulting policy.
        """
        self.theta = theta
        ValueIterationAgent.__init__(self, mdp, discount, iterations)

    def runValueIteration(self):
        # Compute the predecessors of every state.
        predecessors = {}
        # Walk over every state of the MDP; terminal states have no
        # outgoing actions, so they are skipped as sources.
        for state in self.mdp.getStates():
            if not self.mdp.isTerminal(state):
                for action in self.mdp.getPossibleActions(state):
                    # getTransitionStatesAndProbs yields the (nextState, prob)
                    # pairs needed to record predecessors.
                    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                        # Note that we store predecessor information, so the
                        # successor state is used as the dictionary key.
                        if nextState in predecessors:
                            predecessors[nextState].add(state)
                        else:
                            predecessors[nextState] = {state}

        # Initialize an empty priority queue.
        pq = util.PriorityQueue()

        # For every non-terminal state s...
        for s in self.mdp.getStates():
            if not self.mdp.isTerminal(s):
                # ...compute diff, the absolute difference between the
                # value of s and its highest Q-value.
                maxQ = -float('inf')
                for action in self.mdp.getPossibleActions(s):
                    Q = self.computeQValueFromValues(s, action)
                    if Q > maxQ:
                        maxQ = Q
                diff = abs(maxQ - self.values[s])
                # Push s onto the priority queue with priority -diff.
                pq.update(s, -diff)

        # Loop for the requested number of iterations.
        for _ in range(self.iterations):
            # Stop early if the queue is empty.
            if pq.isEmpty():
                break
            # Pop a state s off the priority queue.
            s = pq.pop()
            # Update the V-value of s (terminal states are left untouched).
            if not self.mdp.isTerminal(s):
                maxVal = -float('inf')
                for action in self.mdp.getPossibleActions(s):
                    Q = self.computeQValueFromValues(s, action)
                    if Q > maxVal:
                        maxVal = Q
                # Again, only this single state's value is updated.
                self.values[s] = maxVal
            # Then immediately visit every predecessor p of s...
            for p in predecessors[s]:
                # ...and compute diff, the absolute difference between the
                # value of p and the highest Q-value computed from p.
                maxQ = -float('inf')
                for action in self.mdp.getPossibleActions(p):
                    Q = self.computeQValueFromValues(p, action)
                    if Q > maxQ:
                        maxQ = Q
                diff = abs(maxQ - self.values[p])
                # If diff exceeds theta, push p with priority -diff.
                if diff > self.theta:
                    pq.update(p, -diff)
```
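`runValueIteration` above leans on `util.PriorityQueue` from the course's `util.py`. As a reading aid only, here is a minimal stand-in with the semantics the code relies on; the min-heap behaviour described here is an assumption about `util.PriorityQueue`, and the real project should of course use the provided class:

```python
import heapq
import itertools

class MinPriorityQueue:
    """Hypothetical stand-in for util.PriorityQueue, assuming min-heap
    semantics: pop() returns the item with the lowest priority, and
    update() keeps, for each item, the lowest priority pushed so far."""

    def __init__(self):
        self._heap = []
        self._best = {}                      # item -> lowest priority seen
        self._counter = itertools.count()    # tie-breaker for the heap

    def isEmpty(self):
        return not self._best

    def update(self, item, priority):
        # Insert the item, or lower its priority; never raise it.
        if item not in self._best or priority < self._best[item]:
            self._best[item] = priority
            heapq.heappush(self._heap, (priority, next(self._counter), item))

    def pop(self):
        # Skip stale heap entries left behind by earlier update() calls.
        while self._heap:
            priority, _, item = heapq.heappop(self._heap)
            if self._best.get(item) == priority:
                del self._best[item]
                return item
        raise IndexError('pop from an empty priority queue')
```

Pushing states with priority `-diff` and popping the minimum means the state with the largest Bellman error `diff` is processed first, and `theta` acts as the tolerance below which a predecessor is not re-queued.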
