[Reinforcement Learning - 3] Q-Learning & SARSA

Update rules:

Q-Learning: Q(S,A) \leftarrow Q(S,A) + \alpha \left[ R + \gamma \max_{a} Q(S',a) - Q(S,A) \right]

SARSA: Q(S,A) \leftarrow Q(S,A) + \alpha \left[ R + \gamma Q(S',A') - Q(S,A) \right]

SARSA is on-policy: its target uses Q(S',A'), where A' is the next action A_{t+1} actually selected by the current (e.g. epsilon-greedy) behaviour policy.

Q-Learning is off-policy: its target uses \max_{a} Q(S',a), the value of the greedy action in S', regardless of which action the behaviour policy will actually take next.
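
A minimal sketch of the two TD targets, assuming a NumPy Q-table indexed as Q[state, action] (the function names here are illustrative, not part of the original code):


import numpy as np

def q_learning_target(Q, reward, next_state, gamma):
    # Off-policy: bootstrap from the best action available in the next state
    return reward + gamma * np.max(Q[next_state, :])

def sarsa_target(Q, reward, next_state, next_action, gamma):
    # On-policy: bootstrap from the action the behaviour policy actually chose
    return reward + gamma * Q[next_state, next_action]

# Either target is plugged into the same TD update:
# Q[s, a] += alpha * (target - Q[s, a])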

Q-Learning code:


import numpy as np
import gym
import random


env = gym.make("FrozenLake-v1", render_mode="rgb_array")  # the environment renders RGB images of the current state

## Build the Q-table
# Get action_size and state_size
action_size = env.action_space.n
state_size = env.observation_space.n
# Create the Q-table, initialised to all zeros, and print it
qtable = np.zeros((state_size, action_size))
print(qtable)

## Hyperparameters
total_episodes = 15000        # Total training episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max decisions per episode
gamma = 0.95                  # Discount rate applied to future rewards

# Exploration parameters
epsilon = 1.0                 # Exploration rate: probability of taking a random action
max_epsilon = 1.0             # Exploration probability at the start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate for the exploration probability

## Q-Learning training loop
# List of rewards
rewards = []  # total reward collected in each training episode

# For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment; in Gym >= 0.26, reset() returns (observation, info)
    state, _ = env.reset()
    step = 0
    done = False  # whether the current episode has finished
    total_rewards = 0

    for step in range(max_steps):
        # Choose an action a in the current world state (s)
        ## First we draw a random number in (0, 1); it decides whether the agent explores or exploits
        exp_exp_tradeoff = random.uniform(0, 1)

        '''
        When epsilon is high, the agent tends to explore and take random actions to discover the environment.
        As training proceeds and epsilon decays, the agent relies more and more on the learned Q values.
        This lets the agent gradually shift from exploration to exploitation as it gains experience.
        '''
        ## If this number > epsilon --> exploitation (take the action with the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])

        # Else (<= epsilon) --> exploration: take a random action
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        # In Gym >= 0.26, step() returns (observation, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state, :] : the Q values of all the actions we can take from the new state
        qtable[state, action] = qtable[state, action] + learning_rate * (
                    reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])  # Q-Learning update rule

        total_rewards += reward

        # Our new state is state
        state = new_state

        # If done (goal reached or fell into a hole): finish the episode
        if done:
            break

    # Reduce epsilon: as the agent gets familiar with the environment it needs less and less exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)  # exponential epsilon decay
    rewards.append(total_rewards)  # total reward the agent collected in this episode

print("Score over time: " + str(sum(rewards) / total_episodes)) # 打印训练期间所有事件的平均得分(总奖励)
'''sum(rewards): 计算所有事件中获得的所有奖励的总和。
/ total_episodes: 用奖励总和除以总集数(total_episodes)
'''
print(qtable)

## Play FrozenLake with the learned Q-table

env.reset()

for episode in range(5):
    state, _ = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        # Take the action (index) that has the maximum expected future reward given that state
        '''qtable[state, :]: the row of the Q-table for the current state,
           containing the Q values of all possible actions in that state.
           np.argmax(qtable[state, :]): the index of the largest Q value in that row,
           i.e. the action with the highest expected cumulative reward.
        '''
        action = np.argmax(qtable[state, :])

        new_state, reward, terminated, truncated, info = env.step(action)  # one step of agent-environment interaction
        done = terminated or truncated

        if done:
            # Here, we only render the last state (to see whether the agent reached the goal or fell into a hole)
            env.render()

            # Print the number of steps it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()
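
As an optional check (not part of the original script), the greedy policy implied by the learned Q-table can be printed as a grid of moves. This assumes the default 4x4 FrozenLake map and gym's action encoding (0 = left, 1 = down, 2 = right, 3 = up):


arrows = np.array(["<", "v", ">", "^"])      # FrozenLake action indices 0..3
greedy_policy = np.argmax(qtable, axis=1)    # best action in each state
print(arrows[greedy_policy].reshape(4, 4))   # one printed row per row of the lake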

SARSA code:


import numpy as np
import gym
import random


env = gym.make("FrozenLake-v1", render_mode="rgb_array")  # the environment renders RGB images of the current state

## Build the Q-table
# Get action_size and state_size
action_size = env.action_space.n
state_size = env.observation_space.n
# Create the Q-table, initialised to all zeros, and print it
qtable = np.zeros((state_size, action_size))
print(qtable)

## Hyperparameters
total_episodes = 15000        # Total training episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max decisions per episode
gamma = 0.95                  # Discount rate applied to future rewards

# Exploration parameters
epsilon = 1.0                 # Exploration rate: probability of taking a random action
max_epsilon = 1.0             # Exploration probability at the start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate for the exploration probability

## SARSA training loop
# List of rewards
rewards = []  # total reward collected in each training episode

# For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment; in Gym >= 0.26, reset() returns (observation, info)
    state, _ = env.reset()
    step = 0
    done = False  # whether the current episode has finished
    total_rewards = 0

    # When epsilon is high the agent mostly explores with random actions;
    # as epsilon decays it relies more and more on the learned Q values (exploitation).
    # Choose the first action a in state s with the epsilon-greedy behaviour policy
    if random.uniform(0, 1) > epsilon:
        action = np.argmax(qtable[state, :])
    else:
        action = env.action_space.sample()

    for step in range(max_steps):
        # Take the action (a) and observe the outcome state (s') and reward (r)
        # In Gym >= 0.26, step() returns (observation, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Choose the next action a' in s' with the SAME epsilon-greedy policy.
        # Using the action the behaviour policy will actually take is what makes SARSA on-policy.
        if random.uniform(0, 1) > epsilon:
            next_action = np.argmax(qtable[new_state, :])
        else:
            next_action = env.action_space.sample()

        # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (
                    reward + gamma * qtable[new_state, next_action] - qtable[state, action])  # SARSA update rule

        total_rewards += reward

        # Move on: s <- s', a <- a'
        state = new_state
        action = next_action

        # If done (goal reached or fell into a hole): finish the episode
        if done:
            break

    # Reduce epsilon: as the agent gets familiar with the environment it needs less and less exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)  # exponential epsilon decay
    rewards.append(total_rewards)  # total reward the agent collected in this episode

print("Score over time: " + str(sum(rewards) / total_episodes)) # 打印训练期间所有事件的平均得分(总奖励)
'''sum(rewards): 计算所有事件中获得的所有奖励的总和。
/ total_episodes: 用奖励总和除以总集数(total_episodes)
'''
print(qtable)

## Play FrozenLake with the SARSA Q-table

env.reset()

for episode in range(5):
    state, _ = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        # Take the action (index) that has the maximum expected future reward given that state
        '''qtable[state, :]: the row of the Q-table for the current state,
           containing the Q values of all possible actions in that state.
           np.argmax(qtable[state, :]): the index of the largest Q value in that row,
           i.e. the action with the highest expected cumulative reward.
        '''
        action = np.argmax(qtable[state, :])

        new_state, reward, terminated, truncated, info = env.step(action)  # one step of agent-environment interaction
        done = terminated or truncated

        if done:
            # Here, we only render the last state (to see whether the agent reached the goal or fell into a hole)
            env.render()

            # Print the number of steps it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()
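
One simple way (not in the original post) to compare the two agents is the success rate over, say, the last 1000 training episodes. In FrozenLake the per-episode reward is 1 only when the goal is reached, so the mean reward over a window of episodes is exactly the success rate:


window = 1000  # illustrative window size
print("Success rate over the last %d episodes: %.3f" % (window, np.mean(rewards[-window:])))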
