Model-Free TD Control: Q-Learning
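
Q-learning is off-policy TD control: the agent follows an epsilon-greedy behavior policy, but its update bootstraps from the greedy action in the next state. The tabular update rule implemented below is

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$

with alpha = 0.1 and gamma = 0.9 in the Q_table class. The environment is the classic cliff-walking gridworld: every step gives reward -1, falling off the cliff gives -100 and ends the episode, and reaching T also ends it.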

import time
import random

class Env():
    def __init__(self, length, height):
        # define the height and length of the map
        self.length = length
        self.height = height
        # define the agent's start position
        self.x = 0
        self.y = 0

    def render(self, frames=50):
        for i in range(self.height):
            if i == 0: # the cliff occupies row 0
                line = ['S'] + ['x']*(self.length - 2) + ['T'] # 'S':start, 'T':terminal, 'x':the cliff
            else:
                line = ['.'] * self.length
            if self.x == i:
                line[self.y] = 'o' # mark the agent's position as 'o'
            print(''.join(line))
        print('\033['+str(self.height+1)+'A')  # move the cursor back up so the map redraws in place
        time.sleep(1.0 / frames)

    def step(self, action):
        """4 legal actions: 0:right, 1:left, 2:up, 3:down (row 0, the cliff row, is drawn at the top)."""
        change = [[0, 1], [0, -1], [-1, 0], [1, 0]]
        self.x = min(self.height - 1, max(0, self.x + change[action][0]))
        self.y = min(self.length - 1, max(0, self.y + change[action][1]))

        states = [self.x, self.y]
        reward = -1
        terminal = False
        if self.x == 0: # the agent is on the cliff row "SxxxxxT"
            if self.y > 0: # and not on the start position
                terminal = True
                if self.y != self.length - 1: # not at T either, so the agent fell off the cliff
                    reward = -100
        return reward, states, terminal

    def reset(self):
        self.x = 0
        self.y = 0

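# Sanity check of the dynamics (not in the original post): from the start
# state, action 0 moves right onto the cliff and terminates the episode.
#
#   >>> env = Env(length=12, height=4)
#   >>> env.step(0)
#   (-100, [0, 1], True)
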
class Q_table():
    def __init__(self, length, height, actions=4, alpha=0.1, gamma=0.9):
        self.table = [0] * actions * length * height # initialize all Q(s,a) to zero
        self.actions = actions
        self.length = length
        self.height = height
        self.alpha = alpha
        self.gamma = gamma

    def _index(self, a, x, y):
        """Return the index of Q([x,y], a) in Q_table."""
        return a * self.height * self.length + x * self.length + y
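    # Worked example (not in the original): with length=12 and height=4 the
    # table holds 4 * 4 * 12 = 192 entries, and Q([x=1, y=3], a=2) lives at
    # index 2*(4*12) + 1*12 + 3 = 111.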

    def _epsilon(self, num_episode):
        return 0.1
        # Decaying schedule for better convergence: epsilon starts around 0.2,
        # falls to 0.05 after 300 episodes, and approaches 0 as training continues.
        # return 20. / (num_episode + 100)

    def take_action(self, x, y, num_episode):
        """Epsilon-greedy action selection."""
        if random.random() < self._epsilon(num_episode):
            return random.randrange(self.actions)
        else:
            actions_value = [self.table[self._index(a, x, y)] for a in range(self.actions)]
            return actions_value.index(max(actions_value))

    def max_q(self, x, y):
        actions_value = [self.table[self._index(a, x, y)] for a in range(self.actions)]
        return max(actions_value)

    def update(self, a, s0, s1, r, is_terminated):
        # both s0, s1 have the form [x,y]
        q_predict = self.table[self._index(a, s0[0], s0[1])]
        if not is_terminated:
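            # Q-learning TD target: bootstrap from the value of the greedy
            # action in s1, whatever action the behavior policy takes next.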
            q_target = r + self.gamma * self.max_q(s1[0], s1[1])
        else:
            q_target = r
        self.table[self._index(a, s0[0], s0[1])] += self.alpha * (q_target - q_predict)

def cliff_walk():
    env = Env(length=12, height=4)
    table = Q_table(length=12, height=4)
    for num_episode in range(2000):
        # each iteration of this loop is one training episode
        episodic_reward = 0
        is_terminated = False
        s0 = [0, 0]
        while not is_terminated:
            # within one episode
            action = table.take_action(s0[0], s0[1], num_episode)
            r, s1, is_terminated = env.step(action)
            table.update(action, s0, s1, r, is_terminated)
            episodic_reward += r
            # env.render(frames=100)
            s0 = s1
        if num_episode % 10 == 0:
            print("Episode: {}, Score: {}".format(num_episode, episodic_reward))
        env.reset()

cliff_walk()

Episode: 0, Score: -100
Episode: 10, Score: -123
Episode: 20, Score: -290
Episode: 30, Score: -119
Episode: 40, Score: -166
Episode: 50, Score: -133
Episode: 60, Score: -109
Episode: 70, Score: -124
Episode: 80, Score: -60
Episode: 90, Score: -57
Episode: 100, Score: -152
Episode: 110, Score: -100
Episode: 120, Score: -108
Episode: 130, Score: -127
Episode: 140, Score: -112
Episode: 150, Score: -45
Episode: 160, Score: -140
Episode: 170, Score: -104
Episode: 180, Score: -115
Episode: 190, Score: -25
Episode: 200, Score: -28
Episode: 210, Score: -22
Episode: 220, Score: -19
Episode: 230, Score: -114
Episode: 240, Score: -17
Episode: 250, Score: -16
Episode: 260, Score: -26
Episode: 270, Score: -17
Episode: 280, Score: -116
Episode: 290, Score: -26
Episode: 300, Score: -100
Episode: 310, Score: -21
Episode: 320, Score: -17
Episode: 330, Score: -21
Episode: 340, Score: -23
Episode: 350, Score: -28
Episode: 360, Score: -31
Episode: 370, Score: -16
Episode: 380, Score: -13
Episode: 390, Score: -19
Episode: 400, Score: -15
Episode: 410, Score: -118
Episode: 420, Score: -13
Episode: 430, Score: -15
Episode: 440, Score: -13
Episode: 450, Score: -13
Episode: 460, Score: -13
Episode: 470, Score: -13
Episode: 480, Score: -105
Episode: 490, Score: -17
Episode: 500, Score: -103
Episode: 510, Score: -15
Episode: 520, Score: -102
Episode: 530, Score: -15
Episode: 540, Score: -13
Episode: 550, Score: -19
Episode: 560, Score: -107
Episode: 570, Score: -107
Episode: 580, Score: -18
Episode: 590, Score: -20
Episode: 600, Score: -110
Episode: 610, Score: -13
Episode: 620, Score: -14
Episode: 630, Score: -107
Episode: 640, Score: -110
Episode: 650, Score: -13
Episode: 660, Score: -105
Episode: 670, Score: -103
Episode: 680, Score: -104
Episode: 690, Score: -13
Episode: 700, Score: -33
Episode: 710, Score: -13
Episode: 720, Score: -107
Episode: 730, Score: -110
Episode: 740, Score: -15
Episode: 750, Score: -19
Episode: 760, Score: -105
Episode: 770, Score: -17
Episode: 780, Score: -14
Episode: 790, Score: -111
Episode: 800, Score: -109
Episode: 810, Score: -17
Episode: 820, Score: -102
Episode: 830, Score: -100
Episode: 840, Score: -103
Episode: 850, Score: -15
Episode: 860, Score: -15
Episode: 870, Score: -13
Episode: 880, Score: -107
Episode: 890, Score: -15
Episode: 900, Score: -13
Episode: 910, Score: -14
Episode: 920, Score: -13
Episode: 930, Score: -17
Episode: 940, Score: -15
Episode: 950, Score: -13
Episode: 960, Score: -15
Episode: 970, Score: -13
Episode: 980, Score: -100
Episode: 990, Score: -19
Episode: 1000, Score: -104
Episode: 1010, Score: -104
Episode: 1020, Score: -15
Episode: 1030, Score: -15
Episode: 1040, Score: -13
Episode: 1050, Score: -16
Episode: 1060, Score: -15
Episode: 1070, Score: -15
Episode: 1080, Score: -13
Episode: 1090, Score: -103
Episode: 1100, Score: -13
Episode: 1110, Score: -13
Episode: 1120, Score: -15
Episode: 1130, Score: -15
Episode: 1140, Score: -104
Episode: 1150, Score: -15
Episode: 1160, Score: -105
Episode: 1170, Score: -13
Episode: 1180, Score: -102
Episode: 1190, Score: -17
Episode: 1200, Score: -13
Episode: 1210, Score: -17
Episode: 1220, Score: -13
Episode: 1230, Score: -104
Episode: 1240, Score: -15
Episode: 1250, Score: -13
Episode: 1260, Score: -14
Episode: 1270, Score: -15
Episode: 1280, Score: -100
Episode: 1290, Score: -104
Episode: 1300, Score: -14
Episode: 1310, Score: -13
Episode: 1320, Score: -13
Episode: 1330, Score: -15
Episode: 1340, Score: -13
Episode: 1350, Score: -17
Episode: 1360, Score: -15
Episode: 1370, Score: -13
Episode: 1380, Score: -15
Episode: 1390, Score: -105
Episode: 1400, Score: -112
Episode: 1410, Score: -13
Episode: 1420, Score: -13
Episode: 1430, Score: -103
Episode: 1440, Score: -15
Episode: 1450, Score: -13
Episode: 1460, Score: -111
Episode: 1470, Score: -102
Episode: 1480, Score: -13
Episode: 1490, Score: -113
Episode: 1500, Score: -15
Episode: 1510, Score: -15
Episode: 1520, Score: -100
Episode: 1530, Score: -100
Episode: 1540, Score: -15
Episode: 1550, Score: -13
Episode: 1560, Score: -103
Episode: 1570, Score: -103
Episode: 1580, Score: -108
Episode: 1590, Score: -19
Episode: 1600, Score: -102
Episode: 1610, Score: -17
Episode: 1620, Score: -111
Episode: 1630, Score: -106
Episode: 1640, Score: -20
Episode: 1650, Score: -16
Episode: 1660, Score: -15
Episode: 1670, Score: -13
Episode: 1680, Score: -102
Episode: 1690, Score: -15
Episode: 1700, Score: -111
Episode: 1710, Score: -13
Episode: 1720, Score: -106
Episode: 1730, Score: -105
Episode: 1740, Score: -108
Episode: 1750, Score: -13
Episode: 1760, Score: -15
Episode: 1770, Score: -15
Episode: 1780, Score: -13
Episode: 1790, Score: -13
Episode: 1800, Score: -13
Episode: 1810, Score: -103
Episode: 1820, Score: -13
Episode: 1830, Score: -13
Episode: 1840, Score: -13
Episode: 1850, Score: -17
Episode: 1860, Score: -19
Episode: 1870, Score: -14
Episode: 1880, Score: -13
Episode: 1890, Score: -19
Episode: 1900, Score: -101
Episode: 1910, Score: -17
Episode: 1920, Score: -13
Episode: 1930, Score: -15
Episode: 1940, Score: -13
Episode: 1950, Score: -112
Episode: 1960, Score: -113
Episode: 1970, Score: -13
Episode: 1980, Score: -13
Episode: 1990, Score: -13
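
Two features of this log are worth noting. First, the best possible return is -13: the shortest safe path takes one step down off the cliff row, eleven steps across, and one step back up onto T, 13 moves at -1 each, and that is exactly where the scores settle. Second, episodes around -100 keep appearing even late in training. Q-learning converges to the optimal path that hugs the cliff edge, so the epsilon-greedy behavior policy (epsilon fixed at 0.1 here) still walks off the cliff on an occasional exploratory step; this is the well-known behavior of Q-learning on cliff walking.

To see the learned behavior directly, here is a minimal sketch (not part of the original post) that prints the greedy action for every cell. It assumes you change the end of cliff_walk to `return table` so the trained Q_table is accessible:

def print_policy(table):
    """Print the greedy action per cell: 0:right '>', 1:left '<', 2:up '^', 3:down 'v'."""
    arrows = ['>', '<', '^', 'v']
    for x in range(table.height):  # row 0 (the cliff row) prints first, matching render()
        row = []
        for y in range(table.length):
            values = [table.table[table._index(a, x, y)] for a in range(table.actions)]
            row.append(arrows[values.index(max(values))])
        print(''.join(row))

# table = cliff_walk()  # hypothetical: assumes cliff_walk() ends with `return table`
# print_policy(table)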
