Q-learning例1探索者【学习笔记3】

最新推荐文章于 2023-03-16 13:27:56 发布

Alan_Lowe

最新推荐文章于 2023-03-16 13:27:56 发布

阅读量169

点赞数

分类专栏： # 强化学习文章标签： python Q-Learning

本文链接：https://blog.csdn.net/qq_45985728/article/details/120728905

版权

强化学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

Q-learning例1探索者

在一个长度为6的一维世界中，探索者最开始位于位置0，宝藏位于位置5。探索者可以向左或向右移动，他需要走多少步才能找到宝藏？

在初始阶段，随机地探索环境往往比固定的行为模式要好，这也是累积经验的阶段，因此我们希望探索者不要太贪婪。为此引入参数 EPSILON 来控制贪婪程度。EPSILON 可以随着探索时间不断提升（越来越贪婪），不过在这个例子中，我们把它固定为 EPSILON = 0.9：90% 的时间选择最优策略，10% 的时间用来探索。

1.预置值

import numpy as np
import pandas as pd
import time

np.random.seed(2) # fixed seed so every run is reproducible

N_states = 6 # width of the 1-dimensional world
Actions = ['left','right'] # actions available to the explorer
Epsilon = 0.9 # greediness: how often the best known action is chosen
# In the early stage, exploring the environment at random is often better than
# following a fixed behavior pattern; this is the experience-gathering phase,
# so the explorer should not be too greedy. Epsilon controls that degree of
# greed. It could be raised over time (greedier and greedier), but this example
# keeps it fixed at 0.9: 90% of the time pick the best action, 10% explore.
Alpha = 0.1 # learning rate
Gamma = 0.9 # discount factor applied to future rewards
Max_Episodes = 13 # maximum number of episodes
Fresh_Time = 0.3 # pause between moves, in seconds

2.建立q-table表

def build_q_table(n_states, actions):
    """Create the initial Q-table with every state-action value set to zero.

    Args:
        n_states: Number of rows, one per state.
        actions: Action names, used as the column labels.

    Returns:
        A pandas DataFrame of shape (n_states, len(actions)) filled with 0.0.

    With 6 states and the actions ['left', 'right'] the table starts as:

           left  right
        0   0.0    0.0
        1   0.0    0.0
        2   0.0    0.0
        3   0.0    0.0
        4   0.0    0.0
        5   0.0    0.0
    """
    zeros = np.zeros(shape=(n_states, len(actions)))
    return pd.DataFrame(data=zeros, columns=actions)

3.定义动作

def choose_action(state, q_table, epsilon=None, actions=None):
    """Choose the action to take in ``state`` using an epsilon-greedy rule.

    A random action is taken when a uniform random draw exceeds ``epsilon``
    or when every action value of this state is still zero (the state has
    not been learned yet). Otherwise the action with the largest value in
    the state's row is returned.

    Args:
        state: Integer row position of the current state in ``q_table``.
        q_table: DataFrame of action values; one row per state, one column
            per action.
        epsilon: Greediness threshold. Defaults to the module-level
            ``Epsilon`` when omitted.
        actions: Candidate actions for the random choice. Defaults to the
            module-level ``Actions`` when omitted.

    Returns:
        The name of the chosen action.
    """
    if epsilon is None:
        epsilon = Epsilon
    if actions is None:
        actions = Actions

    state_actions = q_table.iloc[state, :]  # all action values of this state
    # The "unexplored" test must be "all values are zero". The previous form
    # `state_actions.all() == 0` was true as soon as ANY value was zero, which
    # discarded a learned value whenever the other action was still at 0.
    if (np.random.uniform() > epsilon) or (state_actions == 0).all():
        action_name = np.random.choice(actions)  # explore
    else:
        action_name = state_actions.idxmax()  # exploit the best known action
    return action_name

4.环境反馈

def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and report what the environment does.

    Args:
        S: Current state (integer position).
        A: Action name; 'right' moves right, anything else moves left.

    Returns:
        A tuple ``(S_, R)`` of the next state and the reward. The next state
        is the string 'terminal' with reward 1 when moving right from state
        ``N_states - 2``; every other move yields reward 0.
    """
    if A != 'right':
        # Moving left: state 0 is the wall, so the explorer stays put there.
        return (S if S == 0 else S - 1), 0
    if S == N_states - 2:
        # One step right from here reaches the treasure and ends the episode.
        return 'terminal', 1
    return S + 1, 0

5.环境更新（不多看）

def update_env(S, episode, step_counter):
    """Redraw the one-line text view of the world on the console.

    For a regular state the world is printed as dashes with 'T' in the last
    cell and 'o' at the explorer's position, followed by a pause of
    ``Fresh_Time`` seconds. For the 'terminal' state the episode number and
    step count are printed, shown for 2 seconds, and the line is then blanked.

    Args:
        S: Current state (integer position) or the string 'terminal'.
        episode: Zero-based episode index; displayed as ``episode + 1``.
        step_counter: Number of steps taken so far in this episode.
    """
    if S != 'terminal':
        cells = ['-'] * (N_states - 1)
        cells.append('T')
        cells[S] = 'o'
        print('\r{}'.format(''.join(cells)), end='')
        time.sleep(Fresh_Time)
        return

    summary = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
    print('\r{}'.format(summary), end='')
    time.sleep(2)
    # Overwrite the summary with blanks so the next episode starts clean.
    print('\r                                ', end='')

6.强化学习

def rl():
    """Run the Q-learning training loop and return the learned Q-table.

    Runs ``Max_Episodes`` episodes. Each episode starts in state 0 and
    repeats choose-action / get-feedback / update until the environment
    reports the 'terminal' state. The display is refreshed after every step.

    Returns:
        The Q-table (DataFrame) after all episodes have been played.
    """
    q_table = build_q_table(N_states, Actions)
    for episode in range(Max_Episodes):
        step_counter = 0
        state = 0  # every episode starts at the leftmost position
        update_env(state, episode, step_counter)

        done = False
        while not done:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            current_estimate = q_table.loc[state, action]

            done = next_state == 'terminal'
            if done:
                # No future value beyond the terminal state.
                target = reward
            else:
                target = reward + Gamma * q_table.iloc[next_state, :].max()

            # Move the estimate a fraction Alpha towards the target.
            q_table.loc[state, action] = current_estimate + Alpha * (target - current_estimate)
            state = next_state

            step_counter += 1
            update_env(state, episode, step_counter)
    return q_table

接下来就可以开始探索了！

7.探索

# Training entry point: run the learning loop, then print the learned Q-table.
if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

全部代码：

"""
A simple example for Reinforcement Learning using table lookup Q-learning method.
An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location.
Run this program to see how the agent improves its strategy for finding the treasure.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""

import numpy as np
import pandas as pd
import time

np.random.seed(2)  # fixed seed so every run is reproducible


N_STATES = 6   # the length of the 1 dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9   # greedy policy: how often the best known action is chosen
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 13   # maximum episodes
FRESH_TIME = 0.3    # pause after drawing one move, in seconds


def build_q_table(n_states, actions):
    """Return a zero-initialised Q-table.

    Args:
        n_states: Number of states; one row is created per state.
        actions: Action names; one column is created per action.

    Returns:
        A pandas DataFrame of zeros with shape (n_states, len(actions)).
    """
    n_actions = len(actions)
    initial_values = np.zeros((n_states, n_actions))
    return pd.DataFrame(initial_values, columns=actions)


def choose_action(state, q_table):
    """Select an action for ``state`` with an epsilon-greedy rule.

    A random action from ``ACTIONS`` is returned when a uniform random draw
    exceeds ``EPSILON`` or when all action values of the state are zero.
    Otherwise the action whose value is largest is returned.

    Args:
        state: Integer row position of the current state in ``q_table``.
        q_table: DataFrame of action values, one column per action.

    Returns:
        The name of the chosen action.
    """
    action_values = q_table.iloc[state, :]
    explore = np.random.uniform() > EPSILON
    if explore or action_values.eq(0).all():
        # Non-greedy step, or nothing has been learned for this state yet.
        return np.random.choice(ACTIONS)
    # Greedy step; idxmax gives the column label of the largest value.
    return action_values.idxmax()


def get_env_feedback(S, A):
    """Return the environment's response to taking action ``A`` in state ``S``.

    Args:
        S: Current state (integer position).
        A: Action name; 'right' moves right, any other value moves left.

    Returns:
        ``(S_, R)``: the next state and the reward. Moving right from state
        ``N_STATES - 2`` gives ``('terminal', 1)``; all other moves give a
        reward of 0.
    """
    moving_right = A == 'right'
    if moving_right and S == N_STATES - 2:
        # The treasure is one step to the right: the episode terminates.
        return 'terminal', 1
    if moving_right:
        return S + 1, 0
    # Moving left; at state 0 the explorer hits the wall and stays in place.
    next_state = S if S == 0 else S - 1
    return next_state, 0


def update_env(S, episode, step_counter):
    """Print the current view of the environment on a single console line.

    A regular state is drawn as a row of '-' cells ending in 'T', with 'o'
    marking the agent, followed by a pause of ``FRESH_TIME`` seconds. The
    'terminal' state instead prints the episode number and step count, waits
    2 seconds, and then blanks the line.

    Args:
        S: Current state (integer position) or the string 'terminal'.
        episode: Zero-based episode index; shown as ``episode + 1``.
        step_counter: Number of steps taken so far in this episode.
    """
    world = ['-' for _ in range(N_STATES - 1)] + ['T']
    finished = S == 'terminal'
    if not finished:
        world[S] = 'o'
        line = ''.join(world)
        print('\r{}'.format(line), end='')
        time.sleep(FRESH_TIME)
    else:
        line = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(line), end='')
        time.sleep(2)
        # Clear the summary before the next episode is drawn.
        print('\r                                ', end='')


def rl():
    """Train the agent with tabular Q-learning and return the Q-table.

    Plays ``MAX_EPISODES`` episodes, each starting from state 0. In every
    step an action is chosen, the environment's feedback is read, and the
    value of the (state, action) pair is moved towards the target by a
    fraction ``ALPHA``. The console view is refreshed after each step.

    Returns:
        The Q-table (DataFrame) after the final episode.
    """
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        S = 0
        step_counter = 0
        update_env(S, episode, step_counter)

        is_terminated = False
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # next state and reward
            q_predict = q_table.loc[S, A]

            if S_ == 'terminal':
                # Terminal transition: the target is the reward alone.
                is_terminated = True
                q_target = R
            else:
                best_next = q_table.iloc[S_, :].max()
                q_target = R + GAMMA * best_next

            q_table.loc[S, A] = q_predict + ALPHA * (q_target - q_predict)
            S = S_  # move to the next state

            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table


if __name__ == "__main__":
    # Train the agent, then print the learned Q-table.
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

Alan_Lowe

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
Q-learning例1探索者【学习笔记3】

Q-learning例1探索者在一维空间中有一条长度为6的空间，最开始的时候探索者在0位置，宝藏在5位置，探索者在这个一维空间中可以向左向右走，他走多少步能找到宝藏？因为在初始阶段, 随机的探索环境, 往往比固定的行为模式要好,所以这也是累积经验的阶段, 我们希望探索者不会太贪婪，所以引入一个EPSILON 用来控制贪婪程度的。EPSILON 可以随着探索时间不断提升(越来越贪婪)，不过在这个例子中, 我们就固定成 EPSILON = 0.9, 90% 的时间是选择最优策略, 10% 的时间来探索.1
复制链接

扫一扫