This article presents the source code and experiment results for the blackjack example in the book. Both on-policy and off-policy methods are implemented in this source code; the off-policy method includes ordinary importance sampling and weighted importance sampling. The first-visit and every-visit methods are implemented in on_policy, although they lead to the same result (in the blackjack example, the return of each state is zero except at the terminal state).
#==================================================================
# Python3
# Copyright
# 2019 Ye Xiang (xiang_ye@outlook.com)
#==================================================================
import copy
import numpy as np
from enum import Enum
import matplotlib.pyplot as mplt_pyplt
from mpl_toolkits.mplot3d import Axes3D
from progressbar import ProgressBar
import seaborn as sbn
# Action encoding shared by the player's and dealer's policies.
ACTION_STICK = 0
ACTION_HIT = 1
class PlayerState:
    """Player-visible blackjack state: hand sum, usable-ace count, and the
    dealer's showing card."""

    def __init__(self):
        self.card_sum = 0         # current hand total (usable aces counted as 11)
        self.used_ace_num = 0     # number of aces currently counted as 11
        self.dealers_showing = 0  # dealer's face-up card (1-10)

    def reset(self):
        """Restore all fields to their initial zero values."""
        self.card_sum = 0
        self.used_ace_num = 0
        self.dealers_showing = 0

    def __eq__(self, rhs):
        """Field-wise equality against another PlayerState.

        Returns NotImplemented for non-PlayerState operands so Python can
        fall back to the other operand's comparison, instead of raising
        AttributeError as the original attribute access would.
        """
        if not isinstance(rhs, PlayerState):
            return NotImplemented
        return (self.card_sum == rhs.card_sum
                and self.dealers_showing == rhs.dealers_showing
                and self.used_ace_num == rhs.used_ace_num)
class TrajectoryElement:
    """One (state, action, reward) step of an episode trajectory."""

    def __init__(self, player_state=None, action=ACTION_STICK, reward=0.0):
        # NOTE: the original signature used `player_state=PlayerState()` — a
        # mutable default evaluated once at class-definition time, so every
        # default-constructed element silently shared ONE state object.
        # Use the None sentinel and build a fresh PlayerState per element.
        self.state = PlayerState() if player_state is None else player_state
        self.action = action
        self.reward = reward
class Player:
    """Blackjack player that follows the fixed evaluation policy: hit on any
    sum below 20, stick on 20 or 21. Records a trajectory of
    (state, action, reward) steps for Monte Carlo evaluation."""

    def __init__(self):
        # Per-episode list of TrajectoryElement records.
        self._trajectory = []
        # Current state: [card sum, dealer's showing card, usable-ace count].
        self._state = PlayerState()

    def _get_card(self):
        # Uniform draw from 1..13; face cards (11-13) all count as 10.
        return min(np.random.randint(1, 14), 10)

    def _get_action(self):
        # Fixed policy under evaluation: hit below 20, otherwise stick.
        if self._state.card_sum < 20:
            return ACTION_HIT
        return ACTION_STICK

    def start_game(self, dealers_showing_card):
        """Reset for a new episode and draw cards until the sum reaches 12
        (sums below 12 are never a real decision point)."""
        self._state.reset()
        self._state.dealers_showing = dealers_showing_card
        self._trajectory = []
        while self._state.card_sum < 12:
            self._update_state(self._get_card())

    def _update_state(self, card):
        """Apply one drawn card, demote usable aces on a would-be bust, and
        record the resulting state if it is a decision state."""
        if card == 1:
            # An ace enters the hand counted as 11 ("usable").
            self._state.card_sum += 11
            self._state.used_ace_num += 1
        else:
            self._state.card_sum += card
        # Convert usable aces from 11 back to 1 while the hand would bust.
        while self._state.card_sum > 21 and self._state.used_ace_num > 0:
            self._state.card_sum -= 10
            self._state.used_ace_num -= 1
        # Only sums in [12, 21] are meaningful states for the trajectory.
        if 12 <= self._state.card_sum <= 21:
            self._trajectory.append(
                TrajectoryElement(player_state=copy.copy(self._state),
                                  reward=0,
                                  action=self._get_action()))

    def play_game(self):
        """Keep drawing while the policy says hit and the hand has not busted."""
        while self._state.card_sum <= 21 and self._get_action() == ACTION_HIT:
            self._update_state(self._get_card())

    def set_game_result(self, final_reward):
        """Attach the episode's terminal reward to the last recorded step."""
        self._trajectory[-1].reward = final_reward

    @property
    def sum(self):
        # Current hand total.
        return self._state.card_sum

    @property
    def trajectory(self):
        # Recorded (state, action, reward) steps for this episode.
        return self._trajectory
class Dealer(Player):
    """Dealer: draws by the fixed house rule (hit below 17, stick at 17+)
    and exposes one of its two initial cards as the showing card."""

    def _get_action(self):
        # House rule: hit on any sum below 17.
        if self._state.card_sum < 17:
            return ACTION_HIT
        return ACTION_STICK

    def start_game(self):
        """Deal the dealer's two initial cards and return one of them,
        chosen uniformly at random, as the showing card."""
        self._state.card_sum = 0
        self._state.used_ace_num = 0
        first = self._get_card()
        second = self._get_card()
        self._update_state(first)
        self._update_state(second)
        # Reveal either dealt card with equal probability.
        return first if np.random.choice([0, 1]) == 0 else second

    def _update_state(self, card):
        """Apply one drawn card; unlike Player, no trajectory is recorded."""
        if card == 1:
            # An ace enters the hand counted as 11 ("usable").
            self._state.card_sum += 11
            self._state.used_ace_num += 1
        else:
            self._state.card_sum += card
        # Convert usable aces from 11 back to 1 while the hand would bust.
        while self._state.card_sum > 21 and self._state.used_ace_num > 0:
            self._state.card_sum -= 10
            self._state.used_ace_num -= 1
class Player_MC_ES(Player):
def __init__(self):
Player.__init__(self)
self._policy = np.ones