一些解释
- EPSILON(即控制探索与利用权衡的参数)的衰减方式:每个 episode 结束后乘以一个小于 1 的常数,也就是简单的指数衰减。
代码
import gym
import numpy as np
import random
import os
env = gym.make("CliffWalking-v0")
N_STATE = env.observation_space.n
N_ACTION = env.action_space.n
q_table = np.zeros((N_STATE,N_ACTION))
LR = 0.9
GAMMA = 0.9
#episilon = 0.9
#episilon_end = 0.5
tmp= env.reset()
print(tmp)
def choose_action(state,episilon):
#每次选择的话是根据EPISILON来进行选择动作
if random.random() > episilon :
a = np.argmax(q_table[state,:])
#print(a)
else:
a = np.random.randint(0,N_ACTION)
return a
def update(state,action,state_,reward):
max_q = max(q_table[state_ , :])
q_table[state][action] = q_table[state][action] + LR * (reward + GAMMA * max_q - q_table[state][action])
def main():
episilon = 0.9
for episode in range(500):
state = env.reset()
episode_reward = 0
while True:
action = choose_action(state,episilon)
state_,reward,done,info = env.step(action)
env.render()
update(state,action,state_,reward)
state = state_
episode_reward += reward
if done :
print("the {}th episode 's total reward is ".format(episode,episode_reward))
os.system("pause")
episilon = 0.95 * episilon
break
if __name__ == "__main__" :
main()