Reinforcement Learning Chapter 5 Example 5.5

In the book, Example 5.5 demonstrates the infinite variance of ordinary importance sampling in a specific case. I ran this experiment on my computer and got a similar result. Unfortunately, my computer does not have enough memory for 100,000,000 episodes over ten runs, so here I show the code and the result for 1,000,000 episodes over ten runs. Additionally, I tried weighted importance sampling, which converges very quickly; that result is shown here as well.
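For reference, with return $G_i$ and importance-sampling ratio $\rho_i$ from the $i$-th of $n$ episodes, the two estimators compared here are

$$V_{\text{ordinary}}(s) = \frac{\sum_{i=1}^{n} \rho_i G_i}{n}, \qquad V_{\text{weighted}}(s) = \frac{\sum_{i=1}^{n} \rho_i G_i}{\sum_{i=1}^{n} \rho_i}.$$

The infinite variance follows the book's own calculation: under the behavior policy, an episode of $k$ left actions that is terminated by the environment has probability $0.5^k \cdot 0.9^{k-1} \cdot 0.1$, return $G = 1$, and ratio $\rho = 2^k$ (episodes containing a right action have $\rho G = 0$), so

$$\mathbb{E}_b\big[(\rho G)^2\big] = \sum_{k=1}^{\infty} 0.1 \cdot 0.9^{k-1} \cdot 0.5^k \cdot (2^k)^2 = 0.2 \sum_{k=1}^{\infty} 1.8^{k-1} = \infty.$$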

#==================================================================
# Python3
# Copyright
# 2019 Ye Xiang (xiang_ye@outlook.com)
#==================================================================

import numpy as np
import matplotlib.pyplot as mplt_pyplt
from progressbar import ProgressBar
import threading

STATE_TRANSITION = 0
STATE_TERMINAL = 1

ACTION_LEFT = 1
ACTION_RIGHT = 0
class TrajectoryElement:
    # One step of an episode: the state, the action taken in it, and the
    # reward received for that action.
    def __init__(self, state = STATE_TRANSITION, action = ACTION_LEFT, reward = 0):
        self.state = state
        self.action = action
        self.reward = reward

class PolicyTester():
    def __init__(self):
        self.__sampling_data = []

    def __generate_trajectory(self):
        # Each element records one step: the state, the action taken in it,
        # and the reward received.  The trajectory therefore has exactly one
        # element per action, which __calculate_rho relies on.
        trajectory = []
        state = STATE_TRANSITION
        while state != STATE_TERMINAL:
            # The behavior policy picks left/right with equal probability.
            action = np.random.choice([ACTION_LEFT, ACTION_RIGHT])
            if action == ACTION_RIGHT:
                # 'right' terminates deterministically with reward 0.
                state = STATE_TERMINAL
                reward = 0
            else:
                # 'left' returns to s with probability 0.9 (reward 0) and
                # terminates with probability 0.1 (reward 1).
                state = np.random.choice([STATE_TRANSITION, STATE_TERMINAL], p = [0.9, 0.1])
                reward = 1 if state == STATE_TERMINAL else 0
            trajectory.append(TrajectoryElement(STATE_TRANSITION, action, reward))
        return trajectory
    
    def __calculate_rho(self, trajectory):
        # Importance-sampling ratio rho = prod(pi(a|s)) / prod(b(a|s)).
        # The target policy always picks 'left', so a single 'right' makes
        # pi = 0; the behavior policy contributes 0.5 per action.
        b = 1.0
        pi = 1.0
        for trj in trajectory:
            b *= 0.5
            if trj.action == ACTION_RIGHT:
                pi = 0.0
                break
        return pi / b

    def MC_off_policy_first_visit_ordinary(self, episodes):
        # Ordinary importance sampling: V(s) = sum(rho * G) / n.
        self.__sampling_data = []
        pb = ProgressBar().start()
        numerator_sum = 0.0
        for i in range(0, episodes):
            trajectory = self.__generate_trajectory()
            rho = self.__calculate_rho(trajectory)
            # gamma = 1 and intermediate rewards are 0, so the return G is
            # just the final reward.
            numerator_sum += rho * trajectory[-1].reward
            self.__sampling_data.append(numerator_sum / (i + 1))
            pb.update(int(i / episodes * 100))
        pb.update(100)

    def MC_off_policy_first_visit_weighted(self, episodes):
        # Weighted importance sampling: V(s) = sum(rho * G) / sum(rho).
        self.__sampling_data = []
        pb = ProgressBar().start()
        numerator_sum = 0.0
        denominator_sum = 0.0
        for i in range(0, episodes):
            trajectory = self.__generate_trajectory()
            rho = self.__calculate_rho(trajectory)
            numerator_sum += rho * trajectory[-1].reward
            denominator_sum += rho
            Vs = numerator_sum / denominator_sum if denominator_sum != 0.0 else 0.0
            self.__sampling_data.append(Vs)
            pb.update(int(i / episodes * 100))
        pb.update(100)

    @property
    def sampling_data(self):
        return self.__sampling_data

class TestThread_ordinary(threading.Thread):
    # Wraps one ordinary-importance-sampling run so that the ten runs can be
    # started concurrently.
    def __init__(self, test_instance = None, episodes = 10000):
        threading.Thread.__init__(self)
        self.__test_instance = test_instance
        self.__episodes = episodes
    def run(self):
        if self.__test_instance is None:
            return
        self.__test_instance.MC_off_policy_first_visit_ordinary(self.__episodes)


def plot_ordinary(episodes, run_times):
    # One thread per run.  (Under CPython's GIL the threads interleave rather
    # than run in parallel, but all runs still complete.)
    threads = []
    policy_testers = []
    for i in range(0, run_times):
        policy_tester = PolicyTester()
        thread = TestThread_ordinary(policy_tester, episodes)
        threads.append(thread)
        policy_testers.append(policy_tester)
        thread.start()

    for t in threads:
        t.join()

    for pt in policy_testers:
        mplt_pyplt.plot(pt.sampling_data)
    mplt_pyplt.xscale('log')
    mplt_pyplt.ylim(0, 3)
    mplt_pyplt.xlabel('Episodes (log scale)')
    mplt_pyplt.ylabel('Monte-Carlo estimate of ' + r'$v_\pi(s)$' + ' with ordinary importance sampling')
    mplt_pyplt.show()

def plot_weighted(episodes, run_times):
    for i in range(0, run_times):
        policy_tester = PolicyTester()
        policy_tester.MC_off_policy_first_visit_weighted(episodes)
        mplt_pyplt.plot(policy_tester.sampling_data)
    mplt_pyplt.xscale('log')
    mplt_pyplt.ylim(0, 2)
    mplt_pyplt.xlabel('Episodes (log scale)')
    mplt_pyplt.ylabel('Monte-Carlo estimate of ' + r'$v_\pi(s)$' + ' with weighted importance sampling')
    mplt_pyplt.show()

def set_priority(pid=None,priority=1):
    """ Set The Priority of a Windows Process.  Priority is a value between 0-5 where
        2 is normal priority.  Default sets the priority of the current
        python process but can take any valid process ID. """

    import win32api, win32process, win32con

    priorityclasses = [win32process.IDLE_PRIORITY_CLASS,
                       win32process.BELOW_NORMAL_PRIORITY_CLASS,
                       win32process.NORMAL_PRIORITY_CLASS,
                       win32process.ABOVE_NORMAL_PRIORITY_CLASS,
                       win32process.HIGH_PRIORITY_CLASS,
                       win32process.REALTIME_PRIORITY_CLASS]
    if pid is None:
        pid = win32api.GetCurrentProcessId()
    handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
    win32process.SetPriorityClass(handle, priorityclasses[priority])

if __name__ == '__main__':
    set_priority(priority = 4)   # Windows only; requires the pywin32 package
    plot_ordinary(1000000, 10)   # ten runs of 1,000,000 episodes each
    plot_weighted(10000, 10)     # ten runs of 10,000 episodes each
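The memory bottleneck mentioned at the top comes from storing one running estimate per episode: ten runs of 100,000,000 episodes would need several gigabytes for the curves alone. Since the x axis is logarithmic, a few hundred recorded points per run would look the same. Here is a minimal sketch of that idea (the helper name and the 500-point default are my own choices, not part of the code above):

import numpy as np

def log_spaced_indices(episodes, points = 500):
    # 1-based episode counts at which to record the running estimate,
    # roughly evenly spaced on a log axis, with duplicates removed.
    idx = np.logspace(0, np.log10(episodes), num = points).astype(np.int64)
    return set(np.unique(idx).tolist())

Inside the sampling loops, one would append the running estimate (together with i + 1 for the x axis) only when i + 1 is in this set, which keeps each curve to a few kilobytes no matter how many episodes are run.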


Figure 1. Infinite variance of ordinary importance sampling (ten runs)
Figure 2. Fast convergence of weighted importance sampling (ten runs)