class CliffWalkingEnv:
    def __init__(self, ncol=12, nrow=4):
        self.ncol = ncol  # number of columns in the grid world
        self.nrow = nrow  # number of rows in the grid world
        # transition matrix P[state][action] = [(p, next_state, reward, done)],
        # holding the next state and reward for each (state, action) pair
        self.P = self.createP()

    def createP(self):
        P = [[[] for i in range(4)] for j in range(self.ncol * self.nrow)]  # initialize
        # 4 actions: change[0]: up; change[1]: down; change[2]: left; change[3]: right.
        # Each entry is [column change, row change]; the origin (0,0) is the top-left corner.
        change = [[0, -1], [0, 1], [-1, 0], [1, 0]]
        for i in range(self.nrow):
            for j in range(self.ncol):
                for a in range(len(change)):
                    # in a cliff cell or the goal state, every action yields reward 0
                    if i == self.nrow - 1 and j > 0:
                        P[i * self.ncol + j][a] = [(1, i * self.ncol + j, 0, True)]
                        continue
                    next_x = min(self.ncol - 1, max(0, j + change[a][0]))
                    next_y = min(self.nrow - 1, max(0, i + change[a][1]))
                    next_state = next_y * self.ncol + next_x
                    reward = -1
                    done = False
                    # if the next position is a cliff cell or the goal, done = True
                    if next_y == self.nrow - 1 and next_x > 0:
                        done = True
                        if next_x != self.ncol - 1:  # stepping onto the cliff: reward -100
                            reward = -100
                    P[i * self.ncol + j][a] = [(1, next_state, reward, done)]
        return P
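# A quick sanity check of the transition model (an illustrative sketch added here,
# not part of the original code): in the default 12x4 grid, state 36 is the
# bottom-left start cell, and action 3 ("right") from it steps onto the cliff.
#
#   env = CliffWalkingEnv()
#   print(env.P[36][3])  # expected: [(1, 37, -100, True)]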
class ValueIteration:
    """ Value iteration algorithm """
    def __init__(self, env, theta, gamma):
        self.env = env
        self.theta = theta  # convergence threshold
        self.gamma = gamma  # discount factor
        self.v = [0] * (self.env.ncol * self.env.nrow)  # initialize all state values to 0
        self.pi = [None for i in range(self.env.ncol * self.env.nrow)]

    def value_iteration(self):
        count = 0
        while 1:
            max_diff = 0
            new_v = [0] * (self.env.ncol * self.env.nrow)
            for s in range(self.env.ncol * self.env.nrow):
                Qsa_list = []  # compute all Q(s, a) values for state s
                for a in range(4):
                    Qsa = 0
                    for res in self.env.P[s][a]:
                        p, next_state, reward, done = res
                        Qsa += p * (reward + self.gamma * self.v[next_state] * (1 - done))
                    Qsa_list.append(Qsa)
                new_v[s] = max(Qsa_list)
                max_diff = max(max_diff, abs(new_v[s] - self.v[s]))
            self.v = new_v
            if max_diff < self.theta:
                break
            count += 1
        print("Value iteration finished after %d rounds" % count)
        self.getpolicy()
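    # Explanatory note (added): the loop above is the synchronous Bellman
    # optimality backup
    #     V_{k+1}(s) = max_a sum_{s', r} p(s', r | s, a) * (r + gamma * V_k(s')),
    # applied to every state until the largest per-state change max_diff drops
    # below theta. The factor (1 - done) zeroes the bootstrap term for
    # transitions into terminal states.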
    def getpolicy(self):
        # after value iteration converges, derive a greedy policy from the value function
        for s in range(self.env.ncol * self.env.nrow):
            Qsa_list = []
            for a in range(4):
                Qsa = 0
                for res in self.env.P[s][a]:
                    p, next_state, reward, done = res
                    Qsa += p * (reward + self.gamma * self.v[next_state] * (1 - done))
                Qsa_list.append(Qsa)
            max_Qsa = max(Qsa_list)
            count_max_Qsa = Qsa_list.count(max_Qsa)
            # split probability uniformly over all actions that attain the maximum
            self.pi[s] = [1 / count_max_Qsa if p == max_Qsa else 0 for p in Qsa_list]
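# Tie-breaking in getpolicy (hypothetical numbers for illustration): if
# Qsa_list == [1.0, 1.0, 0.5, 0.2], two actions tie for the maximum, so
# self.pi[s] == [0.5, 0.5, 0, 0], i.e. probability is split uniformly over
# all greedy actions.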
def print_agent(agent, action_meaning, disaster=[], end=[]):
    print("State values:")
    for i in range(agent.env.nrow):
        for j in range(agent.env.ncol):
            print('%6.6s' % ('%.3f' % agent.v[i * agent.env.ncol + j]), end=' ')
        print()
    print("Policy:")
    for i in range(agent.env.nrow):
        for j in range(agent.env.ncol):
            # special states, e.g. the cliff cells in cliff walking
            if (i * agent.env.ncol + j) in disaster:
                print('****', end=' ')
            elif (i * agent.env.ncol + j) in end:  # goal state
                print('EEEE', end=' ')
            else:
                a = agent.pi[i * agent.env.ncol + j]
                pi_str = ''
                for k in range(len(action_meaning)):
                    pi_str += action_meaning[k] if a[k] > 0 else 'o'
                print(pi_str, end=' ')
        print()  # newline
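# How to read the policy grid (explanatory note): each state prints one glyph
# per action; 'o' marks an action with probability 0, so e.g. 'ooo→' means the
# greedy policy always moves right in that state.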
env = CliffWalkingEnv()
theta = 0.001
gamma = 0.9
action_meaning = ['↑', '↓', '←', '→']
agent = ValueIteration(env, theta, gamma)
agent.value_iteration()
print_agent(agent, action_meaning, list(range(37, 47)), [47])
""" 解决同样的训练问题,价值迭代总共进行了数十轮,而策略迭代中策略评估总共进行了数百轮,价值迭代中的循环次数远少于策略迭代。 """