Class-based approach
Num  Observation            Min       Max
0    Cart Position          -4.8      4.8
1    Cart Velocity          -Inf      Inf
2    Pole Angle             -24 deg   24 deg
3    Pole Velocity At Tip   -Inf      Inf

Num  Action
0    Push cart to the left
1    Push cart to the right

Reward is 1 for every step taken, including the termination step.
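These bounds and action indices can be read directly from the environment object; below is a minimal sketch using the standard gym API (not part of the original scripts):

import gym

env = gym.make('CartPole-v1')
print(env.observation_space.low, env.observation_space.high)  # per-dimension bounds
print(env.action_space.n)                                     # number of discrete actions (2)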
# Import the required packages
import numpy as np
import matplotlib.pyplot as plt
import gym
# Environment name, discretization, GAMMA, ETA, max steps per episode and number of episodes
ENV = 'CartPole-v1'
NUM_DIGITIZED = 6 # number of bins per observation dimension
GAMMA = 0.99 # discount factor
ETA = 0.5 #learning rate
MAX_STEPS = 200 #steps for 1 episode
NUM_EPISODES = 2000 #number of episodes
# Agent: the actor, which owns a Brain and delegates action selection and Q-updates to it
class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)
    def update_Q_function(self, observation, action, reward, observation_next):
        self.brain.update_Q_table(observation, action, reward, observation_next)
    def get_action(self, observation, episode):
        action = self.brain.decide_action(observation, episode)
        return action
# Brain: the learner, which holds the Q-table and the decision logic
class Brain:
    # Initialize the Q-table with random values in [0, 1)
    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions  # the number of CartPole actions
        self.q_table = np.random.uniform(low=0, high=1, size=(NUM_DIGITIZED**num_states, num_actions))
    # Bin boundaries used to discretize one continuous observation dimension
    def bins(self, clip_min, clip_max, num):
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]
    # Convert a continuous observation into its discrete state index in the Q-table
    def digitize_state(self, observation):
        # get the discrete state, one of 6**4 = 1296 states in total
        cart_pos, cart_v, pole_angle, pole_v = observation
        digitized = [
            np.digitize(cart_pos, bins=self.bins(-2.4, 2.4, NUM_DIGITIZED)),
            np.digitize(cart_v, bins=self.bins(-3.0, 3.0, NUM_DIGITIZED)),
            np.digitize(pole_angle, bins=self.bins(-0.5, 0.5, NUM_DIGITIZED)),  # angle is measured in radians
            np.digitize(pole_v, bins=self.bins(-2.0, 2.0, NUM_DIGITIZED))
        ]
        return sum([x * (NUM_DIGITIZED**i) for i, x in enumerate(digitized)])
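    # Each dimension above is digitized to an integer in 0..5, so the sum is a
    # base-6 (mixed-radix) encoding: index = d0 + 6*d1 + 36*d2 + 216*d3,
    # giving a unique state index in the range 0..1295.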
    # Update the Q-table with one (state, action, reward, next state) transition
    def update_Q_table(self, observation, action, reward, observation_next):
        state = self.digitize_state(observation)
        state_next = self.digitize_state(observation_next)
        Max_Q_next = max(self.q_table[state_next][:])
        self.q_table[state, action] = self.q_table[state, action] + \
            ETA * (reward + GAMMA * Max_Q_next - self.q_table[state, action])
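    # This is the standard tabular Q-learning update:
    #   Q(s, a) <- Q(s, a) + ETA * (r + GAMMA * max_a' Q(s', a') - Q(s, a))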
    # Choose an action for the current observation
    def decide_action(self, observation, episode):
        # epsilon-greedy
        state = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (episode + 1))
        if epsilon <= np.random.uniform(0, 1):
            action = np.argmax(self.q_table[state][:])
        else:
            action = np.random.choice(self.num_actions)
        return action
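    # Note: epsilon = 0.5 / (episode + 1) starts at 0.5 and decays toward 0,
    # so the agent explores heavily at first and becomes nearly greedy later on.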
# Environment: builds the gym environment and the agent, and runs the training loop
class Environment:
    def __init__(self):
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.n
        self.agent = Agent(num_states, num_actions)
    def run(self):
        complete_episodes = 0  # count of consecutive successful episodes
        for episode in range(NUM_EPISODES):
            observation = self.env.reset()
            for step in range(MAX_STEPS):
                action = self.agent.get_action(observation, episode)
                observation_next, _, done, _ = self.env.step(action)
                if done:
                    if step < 195:
                        reward = -1  # penalize episodes that end before 195 steps
                        complete_episodes = 0  # reset the success counter
                    else:
                        reward = 1
                        complete_episodes += 1
                else:
                    reward = 0  # no intermediate reward until the episode ends
                self.agent.update_Q_function(observation, action, reward, observation_next)
                observation = observation_next
                if done:
                    print('{0} Episode: Finished after {1} time steps'.format(episode, step + 1))
                    break

q = Environment()
q.run()
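The script above only trains the agent; the following is a minimal sketch (a hypothetical helper, assuming the same old-style gym reset/step API used above) of how the learned Q-table could be checked with a single greedy rollout:

# Hypothetical evaluation helper: run one episode greedily with the trained Q-table
def evaluate(agent, env, max_steps=MAX_STEPS):
    observation = env.reset()
    for step in range(max_steps):
        state = agent.brain.digitize_state(observation)
        action = np.argmax(agent.brain.q_table[state])  # always greedy, no exploration
        observation, _, done, _ = env.step(action)
        if done:
            return step + 1
    return max_steps

# e.g. print(evaluate(q.agent, q.env))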
Function-based approach
# Balancing the inverted pendulum (CartPole) with tabular Q-learning
'''
Num  Observation            Min       Max
0    Cart Position          -4.8      4.8
1    Cart Velocity          -Inf      Inf
2    Pole Angle             -24 deg   24 deg
3    Pole Velocity At Tip   -Inf      Inf

Num  Action
0    Push cart to the left
1    Push cart to the right

Reward is 1 for every step taken, including the termination step.
'''
import numpy as np
import matplotlib.pyplot as plt
import gym
env = gym.make('CartPole-v0')
# print('Observation space = {}'.format(env.observation_space))
# print('Action space = {}'.format(env.action_space))
# print('Observation range = {} ~ {}'.format(env.observation_space.low,
#                                             env.observation_space.high))
# print('Number of actions = {}'.format(env.action_space.n))
# Each observation dimension is discretized into 6 bins, giving 6*6*6*6 = 1296
# states; with the two actions (push left / push right) the Q-table has shape 1296 x 2.
num_state = 6
# Bin boundaries for each observation dimension
p_lim = np.linspace(-2.4, 2.4, 6 + 1)[1:-1]
carv_lim = np.linspace(-3.0, 3.0, 6 + 1)[1:-1]
ang_lim = np.linspace(-0.5, 0.5, 6 + 1)[1:-1]
pole_lim = np.linspace(-2.0, 2.0, 6 + 1)[1:-1]
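# For example, p_lim = np.linspace(-2.4, 2.4, 7)[1:-1] evaluates to
# [-1.6, -0.8, 0.0, 0.8, 1.6]: the five interior boundaries that split the
# cart-position range into 6 bins, so np.digitize returns an index in 0..5.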
def digitize_state(observation):
    cart_pos, cart_v, pole_angle, pole_v = observation
    tag_s = [np.digitize(cart_pos, p_lim),
             np.digitize(cart_v, carv_lim),
             np.digitize(pole_angle, ang_lim),  # angle is measured in radians
             np.digitize(pole_v, pole_lim)]
    s = sum([x * (6**i) for i, x in enumerate(tag_s)])
    return s
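# Example: the all-zero observation (0., 0., 0., 0.) falls into bin 3 of every
# dimension, so digitize_state returns 3 + 3*6 + 3*36 + 3*216 = 777.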
ETA = 0.5     # learning rate
GAMMA = 0.99  # discount factor
q_table = np.random.uniform(low=0, high=1, size=(6**4, 2))  # 1296 states x 2 actions
def decide_action(observation, episode):
    # epsilon-greedy
    state = digitize_state(observation)
    epsilon = 0.5 * (1 / (episode + 1))
    if epsilon <= np.random.uniform(0, 1):
        action = np.argmax(q_table[state][:])
    else:
        action = np.random.choice(2)
    return action
def update_Q_table(observation, action, reward, observation_next):
    state = digitize_state(observation)
    state_next = digitize_state(observation_next)
    Max_Q_next = max(q_table[state_next][:])
    q_table[state, action] = q_table[state, action] + ETA * (reward + GAMMA * Max_Q_next - q_table[state, action])
env = gym.make('CartPole-v0')
complete_episodes=0
reward = 0.
step=0
episode=0
for episode in range(1000):  # 1000 episodes
    observation = env.reset()
    for step in range(200):
        action = decide_action(observation, episode)  # epsilon-greedy action for the current observation
        next_observation, _, done, _ = env.step(action)
        if done:  # the pole fell over or the step limit was reached
            if step < 195:
                reward = -1  # penalize episodes that end before 195 steps
                #complete_episodes = 0  # optionally reset the success counter on failure
            else:
                reward = 1
                complete_episodes += 1
        else:
            reward = 0  # no intermediate reward until the episode ends
        update_Q_table(observation, action, reward, next_observation)
        observation = next_observation
        if done:
            print('{0} Episode: Finished after {1} time steps'.format(episode, step + 1))
            break
print('Last episode ({0}) finished after {1} time steps'.format(episode, step + 1))
print('Number of successful episodes (>= 195 steps): {}'.format(complete_episodes))
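matplotlib is imported above but never used; below is a minimal sketch (with a hypothetical steps_history list, assuming one extra append inside the training loop) of how learning progress could be plotted:

# Hypothetical: record each episode length during training by appending
#     steps_history.append(step + 1)
# just before the break in the loop above.
steps_history = []

# ... training loop as above, now filling steps_history ...

plt.plot(steps_history)
plt.xlabel('episode')
plt.ylabel('steps per episode')
plt.show()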