Q-Learning

This article introduces Q-Learning, a value-based reinforcement learning algorithm that learns an optimal policy by repeatedly updating a Q-Table, the table that stores the value of every state-action pair. In the example, a simple environment is built in which a reward is given only when the agent moves right and reaches the rightmost point. By choosing actions, receiving feedback from the environment, and updating the Q-Table and the display, the algorithm iterates over many episodes until it converges. The code covers action selection, environment feedback, and the Q-Table update.


What is Q-Learning?

Q-Learning is a value-based algorithm in reinforcement learning; after enough training it should learn a converged table of action values, the Q-Table.

In Q-Learning we write the value function as Q(S, A). So far we have only considered a finite number of states and actions; in that setting the algorithm amounts to maintaining a Q-table and updating it step by step until it converges.

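For reference, the update performed at every step is the standard Q-learning rule, written here with the symbols the code below uses (α = ALPHA is the learning rate, γ = LAMBDA is the discount factor):

Q(S, A) \leftarrow Q(S, A) + \alpha \left[ R + \gamma \max_{a'} Q(S', a') - Q(S, A) \right]

The bracketed term is the temporal-difference error: the gap between the TD target R + γ max_{a'} Q(S', a') and the current estimate Q(S, A). The main loop below computes exactly these two quantities as q_target and q_predict.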

Example

1. Import packages

import numpy as np
import pandas as pd
import time
# set random seed for reproducibility
np.random.seed(2)
N_STATES = 6       # length of the 1D world (number of states)
ACTIONS = ['LEFT', 'RIGHT']  # available actions
EPSILON = 0.9      # greedy policy (probability of exploiting)
ALPHA = 0.1        # learning rate
LAMBDA = 0.9       # discount factor
MAX_EPISODES = 13  # maximum number of episodes
FRESH_TIME = 0.003 # refresh interval (seconds) when rendering

2. Define the Q-Table

def build_q_table(n_states, actions):
    # one row per state, one column per action, all Q-values initialized to zero
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),
        columns=actions,
    )
    print(table)
    return table

build_q_table(N_STATES,ACTIONS)

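This prints a 6 x 2 table of zeros with columns LEFT and RIGHT. Below is a minimal sketch of how individual entries are read and written later on, assuming the definitions above have been run; the value 0.5 is made up purely for illustration:

q_table = build_q_table(N_STATES, ACTIONS)
print(q_table.loc[0, 'RIGHT'])   # Q-value of taking RIGHT in state 0, initially 0.0
q_table.loc[0, 'RIGHT'] = 0.5    # entries are updated in place during learning
print(q_table.loc[0, 'RIGHT'])   # now 0.5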

3. Action selection

def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]  # Q-values of both actions in this state
    # explore with probability 1 - EPSILON, or when this state has never been updated
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()  # exploit: the action with the largest Q-value
    return action_name
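
A quick sanity check of the ε-greedy behaviour, assuming the definitions above have been run; the 0.5 written into the table is again a made-up value, used only to create a clear best action:

q_table = build_q_table(N_STATES, ACTIONS)
print(choose_action(0, q_table))    # random: every Q-value in state 0 is still zero

q_table.loc[0, 'RIGHT'] = 0.5       # pretend state 0 has already been learned a little
picks = [choose_action(0, q_table) for _ in range(1000)]
print(picks.count('RIGHT') / 1000)  # close to 0.95: greedy 90% of the time, plus half of the random 10%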

4. Environment feedback

# A reward is given only when the agent reaches the rightmost point
def get_env_feedback(S, A):
    if A == 'RIGHT':  # move right
        if S == N_STATES - 2:  # one step left of the rightmost point
            S_ = 'TERMINAL'  # the episode ends
            R = 1  # reward +1
        else:
            S_ = S + 1  # otherwise keep moving right
            R = 0  # no reward
    else:
        R = 0  # moving left never gives a reward
        if S == 0:  # already at the leftmost point
            S_ = S  # stay in place
        else:
            S_ = S - 1  # otherwise move one step left
    return S_, R
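
A few example transitions, just to make the dynamics concrete (assuming N_STATES = 6 as above):

print(get_env_feedback(4, 'RIGHT'))  # ('TERMINAL', 1): one step left of the goal, moving right ends the episode
print(get_env_feedback(2, 'RIGHT'))  # (3, 0): an ordinary step to the right, no reward
print(get_env_feedback(0, 'LEFT'))   # (0, 0): bumping into the left wall keeps the agent in place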

5. Render the environment

def update_env(S, episode, step_counter):
    env_list = ['_'] * (N_STATES - 1) + ['GOAL']  # the 1D world: '_____GOAL'
    if S == 'TERMINAL':
        interaction = 'Episode %s : total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
        print('\r                          ', end='')
    else:
        env_list[S] = '*'  # mark the agent's current position
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)

def main():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0  # reset to the leftmost point at the start of every episode
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S, A]  # current estimate Q(S, A)
            if S_ != 'TERMINAL':
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()  # TD target: R + gamma * max_a' Q(S', a')
            else:
                q_target = R  # a terminal state has no future value
                is_terminated = True

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # move Q(S, A) towards the TD target
            S = S_

            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table
            
            
if __name__ == "__main__":
    q_table = main()
    print(q_table)

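After the 13 episodes the RIGHT column should hold the larger values, so the learned greedy policy can be read straight off the table. A small sketch of extracting it, assuming main() has just been run (idxmax along the columns is just one convenient way to do this):

greedy_policy = q_table.idxmax(axis=1)  # for each state, the action with the highest Q-value
print(greedy_policy)                    # expected to be 'RIGHT' for every state in this example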

Putting it all together:

# coding: utf-8

# ## 1. Import packages


import numpy as np
import pandas as pd
import time




# set random seed for reproducibility
np.random.seed(2)


N_STATES = 6       # length of the 1D world (number of states)
ACTIONS = ['LEFT', 'RIGHT']  # available actions
EPSILON = 0.9      # greedy policy (probability of exploiting)
ALPHA = 0.1        # learning rate
LAMBDA = 0.9       # discount factor
MAX_EPISODES = 13  # maximum number of episodes
FRESH_TIME = 0.001 # refresh interval (seconds) when rendering


# ## 2. Define the Q-Table



def build_q_table(n_states, actions):
    # one row per state, one column per action, all Q-values initialized to zero
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),
        columns=actions,
    )
    print(table)
    return table

build_q_table(N_STATES,ACTIONS)


# ## 3. Action selection




def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]  # Q-values of both actions in this state
    # explore with probability 1 - EPSILON, or when this state has never been updated
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()  # exploit: the action with the largest Q-value
    return action_name


# ## 4. Environment feedback


# A reward is given only when the agent reaches the rightmost point
def get_env_feedback(S, A):
    if A == 'RIGHT':  # move right
        if S == N_STATES - 2:  # one step left of the rightmost point
            S_ = 'TERMINAL'  # the episode ends
            R = 1  # reward +1
        else:
            S_ = S + 1  # otherwise keep moving right
            R = 0  # no reward
    else:
        R = 0  # moving left never gives a reward
        if S == 0:  # already at the leftmost point
            S_ = S  # stay in place
        else:
            S_ = S - 1  # otherwise move one step left
    return S_, R


# ## 5. Render the environment

def update_env(S, episode, step_counter):
    env_list = ['_'] * (N_STATES - 1) + ['GOAL']  # the 1D world: '_____GOAL'
    if S == 'TERMINAL':
        interaction = 'Episode %s : total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
        print('\r                          ', end='')
    else:
        env_list[S] = '*'  # mark the agent's current position
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)




def main():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        print('episode:', episode)
        step_counter = 0
        S = 0  # reset to the leftmost point at the start of every episode
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S, A]  # current estimate Q(S, A)
            if S_ != 'TERMINAL':
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()  # TD target: R + gamma * max_a' Q(S', a')
            else:
                q_target = R  # a terminal state has no future value
                is_terminated = True

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # move Q(S, A) towards the TD target
            S = S_

            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table
            
            

if __name__ == "__main__":
    q_table = main()
    print(q_table)

