Q-learning一维寻宝实现
Q-learning
代码实现
import numpy as np
import pandas as pd
import time
# Fix the global NumPy RNG so every run explores in the same order.
np.random.seed(2)

# Length of the 1-D world; the treasure sits in the last cell.
N_STATES = 6
# Action labels; they double as the Q-table column names.
ACTIONS = ["left", "right"]
# Greedy probability: a uniform draw above this value triggers a random action.
# (The name is a misspelling of "epsilon" that other code refers to.)
EPCILON = 0.9
# Learning rate applied to the temporal-difference error.
ALPHA = 0.1
# Discount factor applied to the best next-state value.
GAMMA = 0.9
# Number of training episodes.
MAX_EPISODE = 13
# Seconds to pause after drawing each frame of the world.
FRESH_TIME = 0.3
def build_q_table(n_states, actions):
    """Create a Q-table with every entry initialised to zero.

    Args:
        n_states: Number of rows (one per state).
        actions: Sequence of action labels used as the column names.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_states, len(actions))`` filled
        with ``0.0``.
    """
    initial_values = np.zeros((n_states, len(actions)))
    return pd.DataFrame(initial_values, columns=actions)
def choose_action(state, table, epsilon=None):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    The action with the highest Q-value is taken unless a uniform random
    draw exceeds ``epsilon`` or the state is still unexplored (every
    Q-value in its row is zero); in those cases an action is drawn
    uniformly at random from the table's columns.

    Args:
        state: Integer row position of the current state in ``table``.
        table: Q-table as a DataFrame, one row per state and one column
            per action label.
        epsilon: Probability threshold for acting greedily. When omitted,
            the module-level ``EPCILON`` is used.

    Returns:
        The chosen action label (one of the column names of ``table``).
    """
    if epsilon is None:
        epsilon = EPCILON
    action_values = table.iloc[state, :]
    # ``Series.all() == 0`` is true as soon as *any* value is zero, which
    # forced a random move in every state that had learned only one action.
    # The intended test is "nothing has been learned for this state yet".
    unexplored = (action_values == 0).all()
    if np.random.uniform() > epsilon or unexplored:
        return np.random.choice(list(action_values.index))
    return action_values.idxmax()
def env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and report the outcome.

    Args:
        S: Current position as an integer cell index.
        A: Action label; ``"right"`` moves right, anything else moves left.

    Returns:
        A ``(next_state, reward)`` tuple. ``next_state`` is the string
        ``"terminal"`` (with reward 1) when moving right from cell
        ``N_STATES - 2``; otherwise it is the new integer position and the
        reward is 0.
    """
    if A != "right":
        # Moving left never pays out; cell 0 acts as a wall.
        next_state = S if S == 0 else S - 1
        return next_state, 0
    if S == N_STATES - 2:
        # One step right of the second-to-last cell reaches the treasure.
        return "terminal", 1
    return S + 1, 0
def print_env(S, episode, step_counter):
    """Draw the current frame of the 1-D world on a single console line.

    For an integer ``S`` the world is drawn as ``-`` cells with ``T`` in the
    last cell and ``o`` at position ``S``, followed by a pause of
    ``FRESH_TIME`` seconds. For ``S == 'terminal'`` a one-line episode
    summary is shown for two seconds and then erased.

    Args:
        S: Current position as an integer cell index, or ``'terminal'``.
        episode: Zero-based episode index (displayed one-based).
        step_counter: Number of steps taken so far in this episode.
    """
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        # Overwrite the whole summary with blanks. Writing a single space
        # left most of the text on screen, so the next (shorter) frame was
        # drawn on top of its remains.
        print('\r{}'.format(' ' * len(interaction)), end='')
        return

    env_list = ['-'] * (N_STATES - 1) + ['T']
    env_list[S] = 'o'
    interaction = ''.join(env_list)
    print('\r{}'.format(interaction), end='')
    time.sleep(FRESH_TIME)
def q_learning():
    """Train a Q-table on the 1-D world for ``MAX_EPISODE`` episodes.

    Every episode starts in cell 0 and runs until ``env_feedback`` reports
    the ``"terminal"`` state. After each step the visited (state, action)
    entry is moved towards its target by a fraction ``ALPHA`` and the
    world is redrawn.

    Returns:
        The Q-table (``pandas.DataFrame``) after the last episode.
    """
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODE):
        state = 0
        step_counter = 0
        print_env(state, episode, step_counter)
        while True:
            action = choose_action(state, q_table)
            next_state, reward = env_feedback(state, action)
            current_estimate = q_table.loc[state, action]
            finished = next_state == "terminal"
            if finished:
                # Nothing follows the terminal state, so the target is just
                # the immediate reward.
                target = reward
            else:
                best_next = q_table.iloc[next_state, :].max()
                target = reward + GAMMA * best_next
            q_table.loc[state, action] += ALPHA * (target - current_estimate)
            state = next_state
            step_counter += 1
            print_env(state, episode, step_counter)
            if finished:
                break
    return q_table
if __name__ == "__main__":
    # Run the training loop and show the learned Q-values.
    q_table = q_learning()
    print(q_table)