蒙特卡洛方法解简易迷宫

import numpy as np

def transform(state, action):
    """Apply one move in the grid maze and report the outcome.

    The state index is decoded row-major using the module-level ``size``.
    A move is carried out only when the target cell lies inside the grid
    along the axis of motion and the module-level map ``p`` holds a truthy
    value there; otherwise the agent stays where it is.  Any action other
    than 'n', 'e' or 's' is handled as a move to the west.

    Returns a tuple ``(t, next_state, r)``: ``t`` is True when the resulting
    state equals the module-level ``success``, ``next_state`` is the
    resulting state index, and ``r`` is 1 on reaching ``success`` and 0
    otherwise.
    """
    row, col = divmod(state, size)

    if action == 'n':
        if row > 0 and p[row - 1, col]:
            row -= 1
    elif action == 'e':
        if col < size - 1 and p[row, col + 1]:
            col += 1
    elif action == 's':
        if row < size - 1 and p[row + 1, col]:
            row += 1
    elif col > 0 and p[row, col - 1]:
        # Fallback branch: every remaining action means "west".
        col -= 1

    next_state = row * size + col
    reached = bool(next_state == success)
    return reached, next_state, int(reached)

# When several actions share the maximum Q-value, one of them is picked at random.
def epsilon_greed(qs, state, epsilon=.8):
    """Choose an action for ``state`` from the Q-table ``qs``.

    A single random draw is compared with ``epsilon``: if the draw is not
    greater than ``epsilon`` an action is sampled from the module-level
    ``actions``; otherwise the action with the highest stored value is
    returned, with ties broken at random.  Entries of ``qs`` are looked up
    under the key '<state>_<action>' and count as 0 when absent or None.
    """
    greedy = np.random.rand() > epsilon
    if not greedy:
        return np.random.choice(actions)

    values = []
    for name in actions:
        stored = qs.get('{}_{}'.format(state, name))
        values.append(0 if stored is None else stored)

    # Indices of every action whose value equals the maximum.
    best = np.flatnonzero(np.asarray(values) == np.max(values))
    return actions[np.random.choice(best)]


# ---- Maze configuration ----------------------------------------------------
size = 5                        # grid is size x size; states are numbered row-major
success = 14                    # goal state
actions = ['n', 'e', 's', 'w']
# States an episode may not start from: the wall cells plus the goal itself.
s_over = [3, 8, 10, 11, 14, 22, 23, 24]

# p[y, x] == 1 marks a cell that can be entered, 0 marks a cell that cannot.
p = np.ones((size, size))
for blocked in s_over:
    p[blocked // size, blocked % size] = 0
# The goal is in s_over only to exclude it as a start state, so re-open it.
# Derived from `success` instead of a hard-coded cell so the two cannot drift.
p[success // size, success % size] = 1
print(p)

# Greedy action index per cell; cells that cannot be entered keep -1.
act = np.ones((size, size)) * -1

qs = {}     # '<state>_<action>' -> running mean of the returns seen so far
ns = {}     # '<state>_<action>' -> number of returns averaged into qs
ep = 300    # number of episodes still to generate
gama = .8   # discount factor applied to future rewards

# ---- Episode generation ------------------------------------------------------
s_list, r_list, a_list = [], [], []
while ep > 0:
    s = np.random.randint(size * size)
    if s in s_over:
        continue    # rejected start states do not consume an episode
    ep -= 1
    t = False
    s_tmp, r_tmp, a_tmp = [], [], []
    while not t:
        # The policy must be queried with the current state `s` (the original
        # passed the `actions` list here, producing keys that never exist).
        # qs is only filled after all episodes are collected, so every action
        # still ties at 0 at this point and the choice is uniformly random.
        a = epsilon_greed(qs, s)
        t, next_s, r = transform(s, a)
        s_tmp.append(s)
        r_tmp.append(r)
        a_tmp.append(a)
        s = next_s
        # No explicit break: `t` is True exactly when next_s == success.
    s_list.append(s_tmp)
    r_list.append(r_tmp)
    a_list.append(a_tmp)

# ---- Monte Carlo evaluation (every visit contributes a return) ----------------
for states, rewards, taken in zip(s_list, r_list, a_list):
    g = 0
    # Walk each episode backwards so the discounted return accumulates in one pass.
    for state_j, reward_j, action_j in zip(
            reversed(states), reversed(rewards), reversed(taken)):
        g = g * gama + reward_j
        key = '{}_{}'.format(state_j, action_j)
        ns[key] = ns.get(key, 0) + 1
        previous = qs.get(key, 0)
        # Incremental mean kept at full precision; rounding the stored value on
        # every update would discard contributions smaller than the rounding step.
        qs[key] = previous + (g - previous) / ns[key]

# ---- Report --------------------------------------------------------------------
# Order by numeric state, then action letter, and print four entries per line.
ordered_keys = sorted(
    qs, key=lambda k: (int(k.partition('_')[0]), k.partition('_')[2]))
for i, v in enumerate(ordered_keys):
    end = '\n' if i % 4 == 3 else '\t'
    print(v, ':', round(qs[v], 3), '\t', end=end)

# Extract the greedy action for every enterable cell; epsilon=-1 makes
# epsilon_greed always take its greedy branch.
for state in range(size * size):
    y, x = divmod(state, size)
    if p[y, x]:
        act[y, x] = actions.index(epsilon_greed(qs, state, epsilon=-1))
print(act)

[[1. 1. 1. 0. 1.]
[1. 1. 1. 0. 1.]
[0. 0. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 0. 0. 0.]]

[[ 2.  2.  2. -1.  2.]
 [ 1.  1.  2. -1.  2.]
 [-1. -1.  1.  1.  0.]
 [ 1.  1.  1.  1.  0.]
 [ 1.  0. -1. -1. -1.]]

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值