import numpy as np
def transform(state, action):
    """Apply one move on the grid and report where the agent lands.

    Reads the module-level ``size`` (grid side length), ``p`` (2-D array
    whose truthy entries are cells that may be entered) and ``success``
    (flat index of the rewarded cell).

    Args:
        state: Flat cell index; row is ``state // size``, column is
            ``state % size``.
        action: ``'n'``, ``'e'`` or ``'s'``; any other value is handled as a
            westward move.

    Returns:
        A ``(terminal, next_state, reward)`` tuple. ``terminal`` is True and
        ``reward`` is 1 exactly when ``next_state == success``; otherwise
        they are False and 0.
    """
    row, col = divmod(state, size)

    # A move only happens when it stays on the grid AND the target cell is
    # open; otherwise the agent stays where it is.
    if action == 'n':
        if row > 0 and p[row - 1, col]:
            row = row - 1
    elif action == 'e':
        if col < size - 1 and p[row, col + 1]:
            col = col + 1
    elif action == 's':
        if row < size - 1 and p[row + 1, col]:
            row = row + 1
    else:
        if col > 0 and p[row, col - 1]:
            col = col - 1

    landed = row * size + col
    reached = bool(landed == success)
    reward = 1 if reached else 0
    return reached, landed, reward
def epsilon_greed(qs, state, epsilon=.8):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A uniform random number is drawn; when it exceeds ``epsilon`` the action
    with the largest Q-value is returned (ties are broken uniformly at
    random), otherwise a uniformly random action is returned. Actions come
    from the module-level ``actions`` list.

    Args:
        qs: Mapping from ``'<state>_<action>'`` keys to Q-values. Missing
            keys (or keys mapped to None) count as 0.
        state: State whose action is being chosen.
        epsilon: Threshold for the random draw. A negative value makes the
            choice always greedy.

    Returns:
        One element of ``actions``.
    """
    if not (np.random.rand() > epsilon):
        return np.random.choice(actions)

    values = []
    for candidate in actions:
        q = qs.get('{}_{}'.format(state, candidate))
        values.append(0 if q is None else q)

    # Indices of every action that attains the maximum Q-value.
    best = np.flatnonzero(np.asarray(values) == np.max(values))
    return actions[np.random.choice(best)]
def sarsa(state, action, max_steps=100):
    """Run one SARSA episode and update the module-level Q-table in place.

    Starting from ``(state, action)``, repeatedly calls ``transform`` to
    step the environment and ``epsilon_greed`` to choose the following
    action, applying the update

        Q(s, a) += alpha * (target - Q(s, a))

    to the module-level ``qs`` dict (keys built with ``kstr``), using the
    module-level step size ``alpha`` and discount ``gama``.

    Args:
        state: Flat index of the starting cell.
        action: First action to take from ``state``.
        max_steps: Upper bound on the number of transitions in the episode.

    Returns:
        None. The episode ends when ``transform`` reports a terminal
        transition or after ``max_steps`` transitions.
    """
    for _ in range(max_steps):
        t, next_s, r = transform(state, action)
        key = kstr.format(state, action)

        if t:
            # Terminal transition: there is no successor action, so the
            # target is the reward alone. Bootstrapping from the stored
            # Q-values of the terminal cell would leak whatever was learned
            # by episodes that happened to start there.
            qs[key] += alpha * (r - qs[key])
            break

        next_a = epsilon_greed(qs, next_s)
        next_q = qs[kstr.format(next_s, next_a)]
        qs[key] += alpha * (r + gama * next_q - qs[key])

        state, action = next_s, next_a
# --- Environment -----------------------------------------------------------
size = 5                      # the grid is size x size; cells use flat indices
success = 14                  # flat index of the rewarded (terminal) cell
actions = ['n', 'e', 's', 'w']
s_over = [3, 8, 10, 11, 14, 22, 23, 24]   # flat indices marked as closed

# p[row, col] is 1 where a cell may be entered and 0 where it may not.
p = np.ones((size, size))
p.flat[s_over] = 0
# Cell (2, last column) is flat index 14, i.e. ``success``; it is listed in
# s_over above, so it is re-opened here.
p[2, -1] = 1
print(p)

# --- Learning parameters ---------------------------------------------------
ep = 1000                     # number of training iterations
gama = .8                     # discount applied to the next Q-value in sarsa
alpha = 0.1                   # step size of the Q-value update in sarsa
kstr = '{}_{}'                # Q-table key template: '<state>_<action>'

# Q-table with every (state, action) pair initialised to 0.
qs = dict.fromkeys(
    (kstr.format(s, a) for s in range(size * size) for a in actions),
    0,
)
ns = {}

# --- Training --------------------------------------------------------------
for _ in range(ep):
    state = np.random.randint(size * size)
    if p[state // size, state % size]:
        # Only open cells are used as episode starts.
        action = epsilon_greed(qs, state)
        sarsa(state, action)

# --- Report ----------------------------------------------------------------
qs = {key: round(value, 3) for key, value in qs.items()}
print(qs)

# Greedy action for each open cell (epsilon = -1 disables exploration).
for cell in range(size * size):
    if not p[cell // size, cell % size]:
        continue
    print(cell, ":", epsilon_greed(qs, cell, -1), end='\t')
# Sample output of the final policy printout (state : greedy action):
# 0 : e 1 : e 2 : s 4 : s
# 5 : e 6 : e 7 : s 9 : s
# 12 : e 13 : e 14 : w
# 15 : e 16 : e 17 : e 18 : e 19 : n
# 20 : n 21 : n