[Reinforcement Learning - 3] Q-Learning & SARSA

Update rules:

Q-Learning: Q(S,A) \leftarrow Q(S,A) + \alpha \left[ R + \gamma \max_{a} Q(S',a) - Q(S,A) \right]

SARSA: Q(S,A) \leftarrow Q(S,A) + \alpha \left[ R + \gamma Q(S',A') - Q(S,A) \right]

SARSA is on-policy: its target uses Q(S',A'), where A' is the next action A_{t+1} actually selected by the current (e.g. epsilon-greedy) behaviour policy.

Q-Learning is off-policy: its target uses \max_{a} Q(S',a), the value of the greedy action in S', regardless of which action the behaviour policy will actually take next.
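
A minimal sketch of the two TD targets, assuming a NumPy Q-table indexed as Q[state, action] (the function names here are illustrative, not part of the original code):


import numpy as np

def q_learning_target(Q, reward, next_state, gamma):
    # Off-policy: bootstrap from the best action available in the next state
    return reward + gamma * np.max(Q[next_state, :])

def sarsa_target(Q, reward, next_state, next_action, gamma):
    # On-policy: bootstrap from the action the behaviour policy actually chose
    return reward + gamma * Q[next_state, next_action]

# Either target is plugged into the same TD update:
# Q[s, a] += alpha * (target - Q[s, a])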

Q-Learning code:


import numpy as np
import gym
import random


env = gym.make("FrozenLake-v1", render_mode="rgb_array")  # the environment renders RGB images of the current state

## Build the Q-table
# Get action_size and state_size
action_size = env.action_space.n
state_size = env.observation_space.n
# Create the Q-table, initialised to all zeros, and print it
qtable = np.zeros((state_size, action_size))
print(qtable)

## Hyperparameters
total_episodes = 15000        # Total training episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max decisions per episode
gamma = 0.95                  # Discount rate applied to future rewards

# Exploration parameters
epsilon = 1.0                 # Exploration rate: probability of taking a random action
max_epsilon = 1.0             # Exploration probability at the start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate for the exploration probability

## Q-Learning training loop
# List of rewards
rewards = []  # total reward collected in each training episode

# For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment; in Gym >= 0.26, reset() returns (observation, info)
    state, _ = env.reset()
    step = 0
    done = False  # whether the current episode has finished
    total_rewards = 0

    for step in range(max_steps):
        # Choose an action a in the current world state (s)
        ## First we draw a random number in (0, 1); it decides whether the agent explores or exploits
        exp_exp_tradeoff = random.uniform(0, 1)

        '''
        When epsilon is high, the agent tends to explore and take random actions to discover the environment.
        As training proceeds and epsilon decays, the agent relies more and more on the learned Q values.
        This lets the agent gradually shift from exploration to exploitation as it gains experience.
        '''
        ## If this number > epsilon --> exploitation (take the action with the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])

        # Else (<= epsilon) --> exploration: take a random action
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        # In Gym >= 0.26, step() returns (observation, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state, :] : the Q values of all the actions we can take from the new state
        qtable[state, action] = qtable[state, action] + learning_rate * (
                    reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])  # Q-Learning update rule

        total_rewards += reward

        # Our new state is state
        state = new_state

        # If done (goal reached or fell into a hole): finish the episode
        if done:
            break

    # Reduce epsilon: as the agent gets familiar with the environment it needs less and less exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)  # exponential epsilon decay
    rewards.append(total_rewards)  # total reward the agent collected in this episode

print("Score over time: " + str(sum(rewards) / total_episodes)) # 打印训练期间所有事件的平均得分(总奖励)
'''sum(rewards): 计算所有事件中获得的所有奖励的总和。
/ total_episodes: 用奖励总和除以总集数(total_episodes)
'''
print(qtable)

## Play FrozenLake with the learned Q-table

env.reset()

for episode in range(5):
    state, _ = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        # Take the action (index) that has the maximum expected future reward given that state
        '''qtable[state, :]: the row of the Q-table for the current state,
           containing the Q values of all possible actions in that state.
           np.argmax(qtable[state, :]): the index of the largest Q value in that row,
           i.e. the action with the highest expected cumulative reward.
        '''
        action = np.argmax(qtable[state, :])

        new_state, reward, terminated, truncated, info = env.step(action)  # one step of agent-environment interaction
        done = terminated or truncated

        if done:
            # Here, we only render the last state (to see whether the agent reached the goal or fell into a hole)
            env.render()

            # Print the number of steps it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()
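
As an optional check (not part of the original script), the greedy policy implied by the learned Q-table can be printed as a grid of moves. This assumes the default 4x4 FrozenLake map and gym's action encoding (0 = left, 1 = down, 2 = right, 3 = up):


arrows = np.array(["<", "v", ">", "^"])      # FrozenLake action indices 0..3
greedy_policy = np.argmax(qtable, axis=1)    # best action in each state
print(arrows[greedy_policy].reshape(4, 4))   # one printed row per row of the lake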

SARSA code:


import numpy as np
import gym
import random


env = gym.make("FrozenLake-v1", render_mode="rgb_array")  # the environment renders RGB images of the current state

## Build the Q-table
# Get action_size and state_size
action_size = env.action_space.n
state_size = env.observation_space.n
# Create the Q-table, initialised to all zeros, and print it
qtable = np.zeros((state_size, action_size))
print(qtable)

## Hyperparameters
total_episodes = 15000        # Total training episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max decisions per episode
gamma = 0.95                  # Discount rate applied to future rewards

# Exploration parameters
epsilon = 1.0                 # Exploration rate: probability of taking a random action
max_epsilon = 1.0             # Exploration probability at the start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate for the exploration probability

## SARSA training loop
# List of rewards
rewards = []  # total reward collected in each training episode

# For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment; in Gym >= 0.26, reset() returns (observation, info)
    state, _ = env.reset()
    step = 0
    done = False  # whether the current episode has finished
    total_rewards = 0

    # When epsilon is high the agent mostly explores with random actions;
    # as epsilon decays it relies more and more on the learned Q values (exploitation).
    # Choose the first action a in state s with the epsilon-greedy behaviour policy
    if random.uniform(0, 1) > epsilon:
        action = np.argmax(qtable[state, :])
    else:
        action = env.action_space.sample()

    for step in range(max_steps):
        # Take the action (a) and observe the outcome state (s') and reward (r)
        # In Gym >= 0.26, step() returns (observation, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Choose the next action a' in s' with the SAME epsilon-greedy policy.
        # Using the action the behaviour policy will actually take is what makes SARSA on-policy.
        if random.uniform(0, 1) > epsilon:
            next_action = np.argmax(qtable[new_state, :])
        else:
            next_action = env.action_space.sample()

        # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (
                    reward + gamma * qtable[new_state, next_action] - qtable[state, action])  # SARSA update rule

        total_rewards += reward

        # Move on: s <- s', a <- a'
        state = new_state
        action = next_action

        # If done (goal reached or fell into a hole): finish the episode
        if done:
            break

    # Reduce epsilon: as the agent gets familiar with the environment it needs less and less exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)  # exponential epsilon decay
    rewards.append(total_rewards)  # total reward the agent collected in this episode

print("Score over time: " + str(sum(rewards) / total_episodes)) # 打印训练期间所有事件的平均得分(总奖励)
'''sum(rewards): 计算所有事件中获得的所有奖励的总和。
/ total_episodes: 用奖励总和除以总集数(total_episodes)
'''
print(qtable)

## Play FrozenLake with the SARSA Q-table

env.reset()

for episode in range(5):
    state, _ = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        # Take the action (index) that has the maximum expected future reward given that state
        '''qtable[state, :]: the row of the Q-table for the current state,
           containing the Q values of all possible actions in that state.
           np.argmax(qtable[state, :]): the index of the largest Q value in that row,
           i.e. the action with the highest expected cumulative reward.
        '''
        action = np.argmax(qtable[state, :])

        new_state, reward, terminated, truncated, info = env.step(action)  # one step of agent-environment interaction
        done = terminated or truncated

        if done:
            # Here, we only render the last state (to see whether the agent reached the goal or fell into a hole)
            env.render()

            # Print the number of steps it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()
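
One simple way (not in the original post) to compare the two agents is the success rate over, say, the last 1000 training episodes. In FrozenLake the per-episode reward is 1 only when the goal is reached, so the mean reward over a window of episodes is exactly the success rate:


window = 1000  # illustrative window size
print("Success rate over the last %d episodes: %.3f" % (window, np.mean(rewards[-window:])))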
