Machine Learning: Q-learning

Adapted from the 莫烦Python Q-learning tutorial.
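
This example is the classic 1D treasure hunt: an agent 'o' starts at the left end of a short line of cells and must walk to the treasure 'T' at the right end. Reaching 'T' gives a reward of 1 and ends the episode; every other move gives 0. The script keeps a table Q[state][action] and nudges it toward a target value after every step. As a reference sketch, written with the same names the code below uses, the two targets and the shared update are:

    # Q-learning (off-policy): bootstrap from the best next action
    q_reward = R + GAMMA * max_Value(q_table[S_])
    # Sarsa (on-policy): bootstrap from the next action chosen by the ε-greedy policy
    q_reward = R + GAMMA * q_table[S_][Anum_]
    # shared update toward the target
    q_table[S][Anum] = (1 - ALPHA) * q_table[S][Anum] + ALPHA * q_reward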

import numpy as np
import time

np.random.seed(2)  # reproducible


N_STATES = 6   # the length of the 1 dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9  # greedy policy: probability of choosing the greedy action
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 53   # maximum episodes
FRESH_TIME = 0.05    # refresh time for one move


def build_q_table(n_states, actions):
    # one row per state, one column per action, all initialised to zero
    table = [[0 for _ in range(len(actions))] for _ in range(n_states)]
    print(table)    # show the initial table
    return table

def list_Zero(lis):
    # return True if every entry in lis is zero
    for li in lis:
        if li != 0:
            return False
    return True

def max_Index(lis):
    # index of the (first) largest value in lis
    mmax = -999999
    index = 0
    for j in range(len(lis)):
        if lis[j] > mmax:
            mmax = lis[j]
            index = j
    return index

def max_Value(lis):
    # largest value in lis
    mmax = -999999
    for j in range(len(lis)):
        if lis[j] > mmax:
            mmax = lis[j]
    return mmax
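
# Side note: since numpy is already imported, these helpers behave like the
# following one-liners for the lists used here (non-empty, values well above
# -999999): max_Index(lis) == int(np.argmax(lis)), max_Value(lis) == max(lis),
# and list_Zero(lis) == all(li == 0 for li in lis).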


def choose_action(state, q_table):
    # This is how an action is chosen (ε-greedy)
    state_actions = q_table[state]
    # print(state_actions)
    if (np.random.uniform() > EPSILON) or list_Zero(state_actions):  # act non-greedily, or this state's actions have no value yet
        action_num = np.random.randint(0, len(ACTIONS))
        action_name = ACTIONS[action_num]
        # print(action_name)
    else:   # act greedily
        action_num = max_Index(state_actions)
        action_name = ACTIONS[action_num]

    return action_name, action_num
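
# With EPSILON = 0.9 the agent exploits (takes the best-known action) about 90%
# of the time and explores (takes a random action) about 10% of the time; it
# also acts randomly while all Q-values of the current state are still zero.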


def get_feedback(S, A):
    # This is how the agent interacts with the environment
    if A == 'right':    # move right
        if S == N_STATES - 2:   # next cell is the treasure: terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:   # move left
        R = 0
        if S == 0:
            S_ = S  # reach the wall
        else:
            S_ = S - 1
    return S_, R
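
# With N_STATES = 6 the agent occupies cells 0..4 and cell 5 holds the treasure:
# moving right from cell 4 ends the episode with reward 1, every other move gives
# reward 0, and moving left from cell 0 leaves the agent where it is.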


def update(S, episode, step_counter):
    # This is how the environment is rendered
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' is our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction))
        time.sleep(1)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def RL(method):
    # main part of RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    print(q_table)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update(S, episode, step_counter)

        while not is_terminated:
            A, Anum = choose_action(S, q_table)
            S_, R = get_feedback(S, A)  # take action & get next state and reward

            if S_ != 'terminal':
                if method == 'Qlearning':
                    # Q-learning target: bootstrap from the best action in the next state
                    q_reward = R + GAMMA * max_Value(q_table[S_])
                elif method == 'Sarsa':
                    # Sarsa target: bootstrap from the action the ε-greedy policy picks next
                    # (note: A_ is re-sampled on the next loop iteration rather than reused,
                    # a small simplification of textbook Sarsa)
                    A_, Anum_ = choose_action(S_, q_table)
                    q_reward = R + GAMMA * q_table[S_][Anum_]

            else:
                q_reward = R     # next state is terminal
                is_terminated = True    # terminate this episode

            q_table[S][Anum] = (1 - ALPHA) * q_table[S][Anum] + ALPHA * q_reward
            #print(q_table)
            S = S_  # move to next state
            update(S, episode, step_counter+1)
            step_counter += 1

    return q_table




if __name__ == "__main__":
    #q_table = RL('Qlearning')
    q_table = RL('Sarsa')
    print('\r\nQ-table:\n')
    print(q_table)
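
To compare the two methods, switch the commented-out line in __main__ and run the script twice. The returned q_table is a nested list with one row per state and one column per action in ACTIONS. A minimal sketch for printing it more readably (it assumes the q_table returned above):

    for s, (q_left, q_right) in enumerate(q_table):
        print('state %d: left=%.3f  right=%.3f' % (s, q_left, q_right))

In a run that has converged, the 'right' column should dominate in every state, since moving right is the only way to reach the reward.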