Reinforcement Learning, Chapter 5: Blackjack Example

This article presents the source code and experiment results for the blackjack example in the book. Both on-policy and off-policy Monte Carlo methods are implemented. The off-policy implementation includes both ordinary importance sampling and weighted importance sampling. First-visit and every-visit updates are implemented for the on-policy method, although they produce identical results here: in blackjack the same state never recurs within an episode, and every reward is zero except the final one, so the two kinds of returns coincide.
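As a refresher on the two off-policy estimators mentioned above, the short sketch below contrasts them on a list of (importance ratio, return) pairs collected for one state. The function names ordinary_is and weighted_is and the toy numbers are illustrative only, not part of the article's code: ordinary importance sampling divides the sum of rho*G by the episode count, while weighted importance sampling divides by the sum of the ratios.

import numpy as np

def ordinary_is(rhos, returns):
    # V(s) = sum(rho_i * G_i) / N  -- unbiased, but the variance can be large
    return float(np.sum(np.asarray(rhos) * np.asarray(returns)) / len(returns))

def weighted_is(rhos, returns):
    # V(s) = sum(rho_i * G_i) / sum(rho_i)  -- biased, but lower variance
    rhos = np.asarray(rhos)
    denom = rhos.sum()
    return float(np.sum(rhos * np.asarray(returns)) / denom) if denom > 0 else 0.0

# Toy data: importance ratios and returns from three episodes for one state.
rhos, returns = [0.5, 2.0, 1.0], [1.0, -1.0, 0.0]
print(ordinary_is(rhos, returns))   # -0.5
print(weighted_is(rhos, returns))   # about -0.4286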

#==================================================================
# Python3
# Copyright
# 2019 Ye Xiang (xiang_ye@outlook.com)
#==================================================================

import copy
import numpy as np
from enum import Enum
import matplotlib.pyplot as mplt_pyplt
from mpl_toolkits.mplot3d import Axes3D
from progressbar import ProgressBar
import seaborn as sbn

ACTION_STICK = 0
ACTION_HIT   = 1

class PlayerState:
    def __init__(self):
        self.reset()

    def reset(self):
        self.card_sum = 0
        self.used_ace_num = 0
        self.dealers_showing = 0

    def __eq__(self, rhs):
        return self.card_sum == rhs.card_sum \
            and self.dealers_showing == rhs.dealers_showing \
            and self.used_ace_num == rhs.used_ace_num

class TrajectoryElement:
    def __init__(self, player_state=None, action=ACTION_STICK, reward=0.0):
        # Use None as the default: a default of PlayerState() would be a
        # mutable default argument shared across every call of __init__.
        self.state = player_state if player_state is not None else PlayerState()
        self.action = action
        self.reward = reward

class Player:
    def __init__(self):
        self._trajectory = []
        #[sum, dealer's showing, used_ace_num]
        self._state = PlayerState() 

    def _get_card(self):
        # Draw from an infinite deck: 1 is an ace, 2-9 count at face value,
        # and 10, J, Q, K (10-13) all count as 10.
        card = np.random.randint(1, 14)
        return min(card, 10)

    def _get_action(self):
        # The target policy from the book: hit until the sum reaches 20 or 21.
        return ACTION_HIT if self._state.card_sum < 20 else ACTION_STICK
    
    def start_game(self, dealers_showing_card):
        self._state.reset()
        self._state.dealers_showing = dealers_showing_card
        self._trajectory = []
        # Sums below 12 are trivial (hitting can never bust), so keep drawing
        # until the sum reaches at least 12 before the episode proper starts.
        while self._state.card_sum < 12:
            self._update_state(self._get_card())
    
    def _update_state(self, card):
        # An ace first counts as 11 (a usable ace); while the sum exceeds 21,
        # demote usable aces back to 1.
        if card == 1:
            self._state.card_sum += 11
            self._state.used_ace_num += 1
        else:
            self._state.card_sum += card
        while self._state.card_sum > 21 and self._state.used_ace_num > 0:
            self._state.card_sum -= 10
            self._state.used_ace_num -= 1
        # Only the non-trivial states (sum 12..21) are recorded in the trajectory.
        if 12 <= self._state.card_sum <= 21:
            self._trajectory.append(TrajectoryElement(player_state=copy.copy(self._state),
                reward=0, action=self._get_action()))
    
    def play_game(self):
        # Keep hitting while the policy says HIT and the player has not busted.
        while self._get_action() == ACTION_HIT and self._state.card_sum <= 21:
            self._update_state(self._get_card())

    def set_game_result(self, final_reward):
        # Every intermediate reward is zero; only the last step carries the
        # outcome of the game.
        self._trajectory[-1].reward = final_reward

    @property
    def sum(self):
        return self._state.card_sum

    @property
    def trajectory(self):
        return self._trajectory
    
    

class Dealer(Player):
    def _get_action(self):
        # The fixed dealer policy: hit until the sum reaches 17.
        return ACTION_HIT if self._state.card_sum < 17 else ACTION_STICK

    def start_game(self):
        self._state.reset()
        card1 = self._get_card()
        card2 = self._get_card()
        self._update_state(card1)
        self._update_state(card2)
        # One of the two cards is shown to the player, chosen at random.
        return card1 if np.random.choice([0, 1]) == 0 else card2

    def _update_state(self, card):
        # The same ace handling as Player._update_state, except that the
        # dealer does not record a trajectory.
        if card == 1:
            self._state.card_sum += 11
            self._state.used_ace_num += 1
        else:
            self._state.card_sum += card
        while self._state.card_sum > 21 and self._state.used_ace_num > 0:
            self._state.card_sum -= 10
            self._state.used_ace_num -= 1

class Player_MC_ES(Player):

    def __init__(self):
        Player.__init__(self)
        # The listing is truncated at this point in the article. The line below
        # is a plausible completion (an assumption, not the author's original
        # code): a policy table indexed by (card_sum 12-21, dealer's showing
        # card 1-10, usable ace or not), initialized to ACTION_HIT everywhere.
        self._policy = np.ones((10, 10, 2), dtype=int) * ACTION_HIT
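The rest of the listing is cut off by the page. To show how the classes above fit together, here is a minimal sketch of a driver for a single episode under the usual blackjack rules. The function name run_episode and the reward convention (+1 for a win, -1 for a loss, 0 for a draw) are my assumptions, not recovered from the truncated source.

def run_episode(player, dealer):
    # A minimal sketch (not the article's original driver): play one hand and
    # write the final reward into the player's trajectory.
    showing = dealer.start_game()
    player.start_game(showing)
    player.play_game()
    if player.sum > 21:
        reward = -1.0              # player busts: immediate loss
    else:
        dealer.play_game()
        if dealer.sum > 21 or player.sum > dealer.sum:
            reward = 1.0           # dealer busts, or player outscores dealer
        elif player.sum < dealer.sum:
            reward = -1.0
        else:
            reward = 0.0           # equal sums: a draw
    player.set_game_result(reward)
    return player.trajectory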