CartPole-v1 Traditional Q-Learning

Class-based approach

    Num   Observation               Min             Max
    0     Cart Position             -4.8            4.8
    1     Cart Velocity             -Inf            Inf
    2     Pole Angle                -24 deg         24 deg
    3     Pole Velocity At Tip      -Inf            Inf

    Num   Action
    0     Push cart to the left
    1     Push cart to the right

Reward is 1 for every step taken, including the termination step.
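
Both versions below implement the same tabular Q-learning scheme. Each of the four continuous observation variables is discretized into NUM_DIGITIZED = 6 bins, giving 6^4 = 1296 discrete states, so the Q-table has shape 1296 x 2. After every step, the entry for the visited state-action pair is updated with the standard Q-learning rule

    Q(s, a) <- Q(s, a) + ETA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))

where ETA is the learning rate and GAMMA is the discount factor; this is exactly the update performed by update_Q_table below.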

# Import the required libraries
import numpy as np
import matplotlib.pyplot as plt
import gym

# Define the environment, state discretization, GAMMA, ETA, max steps, and number of episodes
ENV = 'CartPole-v1'
NUM_DIGITIZED = 6  # number of bins per observation variable
GAMMA = 0.99  # discount factor
ETA = 0.5  # learning rate
MAX_STEPS = 200  # steps per episode
NUM_EPISODES = 2000  # number of episodes

# The Agent class: holds a Brain and delegates action selection and Q-table updates to it
class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)

    def update_Q_function(self, observation, action, reward, observation_next):
        self.brain.update_Q_table(observation, action, reward, observation_next)

    def get_action(self, observation, episode):
        action = self.brain.decide_action(observation, episode)
        return action
# The Brain class: discretizes observations and learns the Q-table
class Brain:
    # Initialize the Q-table
    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions  # the number of CartPole actions

        # Q-table of size (NUM_DIGITIZED**num_states, num_actions) = (1296, 2), randomly initialized
        self.q_table = np.random.uniform(low=0, high=1, size=(NUM_DIGITIZED**num_states, num_actions))

    # Boundaries that split [clip_min, clip_max] into num equal-width bins
    def bins(self, clip_min, clip_max, num):
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]

    # Convert a continuous observation into its state index in the Q-table
    def digitize_state(self, observation):
        #get the discrete state in total 1296 states
        cart_pos, cart_v, pole_angle, pole_v = observation
        
        digitized = [
            np.digitize(cart_pos, bins=self.bins(-2.4, 2.4, NUM_DIGITIZED)),
            np.digitize(cart_v, bins=self.bins(-3.0, 3.0, NUM_DIGITIZED)),
            np.digitize(pole_angle, bins=self.bins(-0.5, 0.5, NUM_DIGITIZED)),  # pole angle is in radians
            np.digitize(pole_v, bins=self.bins(-2.0, 2.0, NUM_DIGITIZED))
        ]
        
        return sum([x * (NUM_DIGITIZED**i) for i, x in enumerate(digitized)])  # combine the four bin indices as a base-6 number
    # Q-learning update of the table
    def update_Q_table(self, observation, action, reward, observation_next):
        state = self.digitize_state(observation)
        state_next = self.digitize_state(observation_next)
        Max_Q_next = max(self.q_table[state_next][:])
        self.q_table[state, action] = self.q_table[state, action] + \
            ETA * (reward + GAMMA * Max_Q_next - self.q_table[state, action])

    # Choose an action with an epsilon-greedy policy
    def decide_action(self, observation, episode):
        #epsilon-greedy
        state = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (episode + 1))
        
        if epsilon <= np.random.uniform(0, 1):
            action = np.argmax(self.q_table[state][:])
        else:
            action = np.random.choice(self.num_actions)
            
        return action
# The Environment class: creates the gym environment and runs the training loop
class Environment:
    def __init__(self):
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0] 
        num_actions = self.env.action_space.n 
        self.agent = Agent(num_states, num_actions) 
    def run(self):
        complete_episodes = 0  # number of consecutive successful episodes

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()

            for step in range(MAX_STEPS):
                action = self.agent.get_action(observation, episode)
                observation_next, _, done, _ = self.env.step(action)

                if done:  # the pole fell or the step limit was reached
                    if step < 195:
                        reward = -1  # penalty: the episode ended before 195 steps
                        complete_episodes = 0
                    else:
                        reward = 1  # reward: the episode lasted long enough
                        complete_episodes += 1
                else:
                    reward = 0  # no intermediate reward

                self.agent.update_Q_function(observation, action, reward, observation_next)
                observation = observation_next

                if done:
                    print('{0} Episode: Finished after {1} time steps'.format(episode, step + 1))
                    break
q = Environment()
q.run()
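
As a side note, the base-6 indexing performed by digitize_state can be checked on its own. The following is a minimal standalone sketch; the observation values are made up purely for illustration:

import numpy as np

NUM_DIGITIZED = 6

def bins(clip_min, clip_max, num):
    # 5 interior boundaries that split [clip_min, clip_max] into 6 bins
    return np.linspace(clip_min, clip_max, num + 1)[1:-1]

# A made-up observation: (cart position, cart velocity, pole angle, pole velocity)
cart_pos, cart_v, pole_angle, pole_v = 0.1, -0.8, 0.02, 1.5

digitized = [
    np.digitize(cart_pos, bins(-2.4, 2.4, NUM_DIGITIZED)),
    np.digitize(cart_v, bins(-3.0, 3.0, NUM_DIGITIZED)),
    np.digitize(pole_angle, bins(-0.5, 0.5, NUM_DIGITIZED)),
    np.digitize(pole_v, bins(-2.0, 2.0, NUM_DIGITIZED)),
]
print(digitized)  # four bin indices, each in 0..5

# Combine the four indices into one base-6 number, so every combination
# maps to a unique row of the 1296-row Q-table
state = sum(x * (NUM_DIGITIZED ** i) for i, x in enumerate(digitized))
print(state)  # an integer in 0..1295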

Function-based approach

# CartPole (inverted pendulum) balancing model
'''
    Num   Observation               Min             Max
    0     Cart Position             -4.8            4.8
    1     Cart Velocity             -Inf            Inf
    2     Pole Angle                -24 deg         24 deg
    3     Pole Velocity At Tip      -Inf            Inf

    Num   Action
    0     Push cart to the left
    1     Push cart to the right

Reward is 1 for every step taken, including the termination step.
'''
import numpy as np
import matplotlib.pyplot as plt
import gym
env = gym.make('CartPole-v0')
# print('Observation space = {}'.format(env.observation_space))
# print('Action space = {}'.format(env.action_space))
# print('Observation range = {} ~ {}'.format(env.observation_space.low,
#         env.observation_space.high))
# print('Number of actions = {}'.format(env.action_space.n))
# Discretize each observation variable into 6 bins, giving 6*6*6*6 = 1296 states;
# there are 2 actions (left/right), so the Q-table has shape 1296*2

num_state = 6  # number of bins per observation variable

# Bin boundaries for each observation variable
p_lim = np.linspace(-2.4, 2.4, 6 + 1)[1:-1]
carv_lim = np.linspace(-3, 3, 6 + 1)[1:-1]
ang_lim = np.linspace(-0.5, 0.5, 6 + 1)[1:-1]
pole_lim = np.linspace(-2.0, 2.0, 6 + 1)[1:-1]



def digitize_state(observation):
    # Convert a continuous observation into a discrete state index (0..1295)
    cart_pos, cart_v, pole_angle, pole_v = observation
    tag_s = [np.digitize(cart_pos, p_lim),
             np.digitize(cart_v, carv_lim),
             np.digitize(pole_angle, ang_lim),  # pole angle is in radians
             np.digitize(pole_v, pole_lim)]

    s = sum([x * (6**i) for i, x in enumerate(tag_s)])
    return s

ETA = 0.5  # learning rate
GAMMA = 0.99  # discount factor


q_table = np.random.uniform(low=0, high=1, size=(6**4, 2))  # 1296 states x 2 actions, randomly initialized
def decide_action(observation, episode):
    # Epsilon-greedy policy: epsilon decays as episodes progress
    state = digitize_state(observation)
    epsilon = 0.5 * (1 / (episode + 1))

    if epsilon <= np.random.uniform(0, 1):
        action = np.argmax(q_table[state][:])  # exploit: best known action
    else:
        action = np.random.choice(2)  # explore: random action
    return action


def update_Q_table(observation, action, reward, observation_next):
    # Standard Q-learning update
    state = digitize_state(observation)
    state_next = digitize_state(observation_next)

    Max_Q_next = max(q_table[state_next][:])
    q_table[state, action] = q_table[state, action] + ETA * (reward + GAMMA * Max_Q_next - q_table[state, action])
env = gym.make('CartPole-v0')
complete_episodes = 0
reward = 0.
step = 0
episode = 0
for episode in range(1000):   # 1000 episodes
    observation = env.reset()
    for step in range(200):
        action = decide_action(observation, episode)  # epsilon-greedy action for the current observation
        next_observation, _, done, _ = env.step(action)
        if done:  # the pole fell or the 200-step limit was reached
            if step < 195:
                reward = -1  # penalty if the episode ends before 195 steps
                #complete_episodes = 0  # uncomment to reset the success counter on failure
            else:
                reward = 1
                complete_episodes += 1
        else:
            reward = 0   # no intermediate reward until the episode ends

        update_Q_table(observation, action, reward, next_observation)
        observation = next_observation
        if done:
            break

    print('{0} Episode: Finished after {1} time steps'.format(episode, step + 1))
print(complete_episodes)  # number of episodes that lasted at least 195 steps
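
After training, the learned q_table can be sanity-checked by running a few purely greedy episodes with no exploration. The sketch below reuses env, q_table, and digitize_state defined above and assumes the same classic gym step API (four return values); it is only an illustration, not part of the original script:

# Greedy evaluation of the learned Q-table (no epsilon-greedy exploration)
for test_episode in range(5):
    observation = env.reset()
    for step in range(200):
        state = digitize_state(observation)
        action = np.argmax(q_table[state][:])  # always pick the best known action
        observation, _, done, _ = env.step(action)
        if done:
            break
    print('Test episode {0}: lasted {1} steps'.format(test_episode, step + 1))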

 
