采用贪婪策略的Q-learning算法:每走一步奖励为-1,撞到障碍物奖励为-10,到达终点奖励为+20。每一轮的终止条件是抵达障碍物或终点,共训练200轮(每轮结束后回到起点重新开始)。
Q(s,a)=Q(s,a)+alpha*(r+gama*max(Q(s',a'))-Q(s,a))
其中s'为执行动作a后到达的下一状态,max表示在s'下对所有动作a'取最大的Q值;alpha为学习率,gama为折扣因子,r为本步获得的奖励。
maze是地图:1为起点,2为终点,3为障碍,4为最终路径,0为初始值(空地);Q保存每个状态-动作对的价值(value)。
代码如下:
# --- Environment -------------------------------------------------------------
# Side length of the walkable area; init_map() builds a (size + 2)-wide grid
# so that a one-cell obstacle border surrounds it.
size = 8
# 2-D grid of cell codes, filled in by init_map().
maze = []
# Agent position as [x, y]; init_goal() overwrites it with the start cell.
pos = [0, 0]
# Start coordinates, also set by init_goal().
ini_x = ini_y = 0

# --- Rewards -----------------------------------------------------------------
reward_walk = -1    # ordinary step
reward_ob = -10     # stepping into an obstacle / the border
reward_goal = 20    # stepping onto the goal

# --- Q-learning --------------------------------------------------------------
# Q-table, filled in by init_Q().
Q = []
alpha = 0.5   # learning rate
gama = 0.9    # discount factor
def init_Q():
    """Reset the global Q-table ``Q`` to all zeros.

    After the call ``Q[row][col][a] == 0`` for every cell of the
    ``(size + 2) x (size + 2)`` grid and every action ``a`` in ``0..3``.
    Every cell gets its own independent list of four action values.
    """
    side = size + 2
    # Slice assignment rewrites the existing list in place, so calling this
    # twice resets the table instead of appending a second copy of the rows.
    Q[:] = [[[0] * 4 for _ in range(side)] for _ in range(side)]
def init_map():
    """Build the empty map in the global ``maze`` and return it.

    The grid is ``(size + 2) x (size + 2)``: the outermost ring of cells is
    set to 3 (obstacle) and every interior cell to 0.

    Returns:
        The global ``maze`` list (the same object, rebuilt in place).
    """
    side = size + 2
    last = side - 1
    # Slice assignment rewrites the existing list in place, so a second call
    # rebuilds the map instead of appending extra rows to it.
    maze[:] = [
        [3 if row in (0, last) or col in (0, last) else 0
         for col in range(side)]
        for row in range(side)
    ]
    return maze
def init_goal(maze, size, x, y, x1, y1):
    """Place the start and goal cells and record the start position.

    Args:
        maze: 2-D grid, indexed as ``maze[y][x]``.
        size: Unused; kept for call compatibility.
        x, y: Start cell coordinates; the cell is set to 1.
        x1, y1: Goal cell coordinates; the cell is set to 2.

    Side effects:
        Sets the global ``pos`` to ``[x, y]`` (in place) and the globals
        ``ini_x`` / ``ini_y`` to ``x`` / ``y``.

    Returns:
        The same ``maze`` object.
    """
    global ini_x, ini_y
    maze[y][x] = 1
    # Written second, so the goal wins if both coordinates coincide.
    maze[y1][x1] = 2
    # Mutate in place so existing references to ``pos`` see the new start.
    pos[:] = [x, y]
    ini_x, ini_y = x, y
    return maze
def init_obstacle(maze, size, x, y, x1, y1):
    """Mark a rectangular block of cells as obstacles (value 3).

    Sets ``maze[i][j] = 3`` for every ``i`` in ``x..x1`` and ``j`` in
    ``y..y1`` (both ranges inclusive). An empty range leaves ``maze``
    untouched.

    NOTE(review): the first index comes from the ``x`` range here, i.e. the
    rectangle is addressed as ``maze[x][y]``, whereas init_goal and reward
    address cells as ``maze[y][x]``. Behavior is kept as-is; confirm which
    orientation is intended.

    Args:
        maze: 2-D grid to modify in place.
        size: Unused; kept for call compatibility.
        x, x1: Inclusive range of the first index.
        y, y1: Inclusive range of the second index.

    Returns:
        The same ``maze`` object.
    """
    for i in range(x, x1 + 1):
        for j in range(y, y1 + 1):
            maze[i][j] = 3
    return maze
#a:0 1 2 3->left up right down
def reward(x,y,a):
if a==0:
if x<=1:
return reward_ob
else:
if maze[y][x-1]==3:
return reward_ob
else:
if maze[y][x-1]==2:
return reward_goal
else:
return reward_walk
if a==1:
if y<=1:
return reward_ob
else:
if maze[y-1][x]==3:
return reward_ob
else:
if maze[y-1][x]==2:
return reward_goal
else:
return reward_walk
if a==2:
if x>=size:
return reward_ob
else:
if maze[y][x+1]==3:
return reward_ob
else:
if maze[y][x+1]==2:
return reward_goal
else:
return reward_walk
if a==3:
if y>=size:
return reward_ob
else:
if maze[y+1][x]==3:
return reward_ob
else:
if maze[y+1][x]==2:
return reward_goal
else:
return reward_walk
def evo_Q(p):
    """Take one greedy step from ``p`` and apply the Q-learning update.

    The action with the highest Q-value in the current cell is chosen (ties
    go to the lowest action code, since ``list.index`` returns the first
    match). ``p`` is moved in place to the resulting cell, and the Q-value of
    the chosen action is updated with

        Q(s, a) += alpha * (r + gama * max(Q(s', .)) - Q(s, a))

    Args:
        p: Current position as a mutable ``[x, y]`` list; modified in place.
    """
    x, y = p[0], p[1]
    q_here = Q[y][x]
    best_a = q_here.index(max(q_here))

    # Same action encoding as reward(): 0 left, 1 up, 2 right, 3 down.
    dx, dy = ((-1, 0), (0, -1), (1, 0), (0, 1))[best_a]
    p[0] += dx
    p[1] += dy

    # The reward is evaluated for the move out of the *old* cell; the
    # bootstrap term uses the cell the agent has just moved into.
    target = reward(x, y, best_a) + gama * max(Q[p[1]][p[0]])
    q_here[best_a] += alpha * (target - q_here[best_a])
def better_Q(p, repeat_num=200):
    """Train the Q-table by running greedy episodes.

    Each episode repeatedly calls evo_Q until the agent stands on an obstacle
    (3) or the goal (2). The first episode starts from ``p``; every later
    episode starts from the recorded start cell ``(ini_x, ini_y)``.

    Args:
        p: Starting position ``[x, y]`` for the first episode. It is copied,
            so the caller's list is left untouched.
        repeat_num: Number of episodes to run (default 200).
    """
    # Work on a copy: evo_Q moves the position in place, and the caller's
    # list should not end up parked on the first episode's terminal cell.
    p = list(p)
    for _ in range(repeat_num):
        while maze[p[1]][p[0]] not in (2, 3):
            evo_Q(p)
        p = [ini_x, ini_y]
def show_path(solv):
    """Trace the greedy path through the global ``maze`` and print the grid.

    Starting from ``(ini_x, ini_y)``, repeatedly follows the highest-valued
    action in ``solv`` until an obstacle (3) or the goal (2) is reached.
    Every visited cell except the start cell (1) is marked 4. Finally each
    row of ``maze`` is printed on its own line.

    Args:
        solv: Q-table indexed as ``solv[y][x][action]``.
    """
    # Same action encoding as reward(): 0 left, 1 up, 2 right, 3 down.
    offsets = ((-1, 0), (0, -1), (1, 0), (0, 1))
    p = [ini_x, ini_y]
    seen = set()
    while maze[p[1]][p[0]] not in (2, 3):
        here = (p[0], p[1])
        if here in seen:
            # The greedy policy loops back on itself (e.g. Q has not
            # converged); stop instead of walking the cycle forever.
            break
        seen.add(here)

        if maze[p[1]][p[0]] != 1:
            maze[p[1]][p[0]] = 4
        q_here = solv[p[1]][p[0]]
        dx, dy = offsets[q_here.index(max(q_here))]
        p[0] += dx
        p[1] += dy

    for row in maze:
        # Single-argument call form prints identically on Python 2 and 3.
        print(row)
if __name__ == "__main__":
    # Build the bordered empty grid, then place start (x=6, y=1) and
    # goal (x=6, y=5).
    init_map()
    init_goal(maze, size, 6, 1, 6, 5)
    # Two obstacle strips: maze[2][2..8] and maze[2..6][4].
    init_obstacle(maze, size, 2, 2, 2, 8)
    init_obstacle(maze, size, 2, 4, 6, 4)
    # Train the Q-table from the start position, then print the greedy path.
    init_Q()
    better_Q(pos)
    show_path(Q)