本文代码参考蒙特卡洛树搜索(MCTS)算法实现黑白棋 AI。
设计思想
采用蒙特卡洛树搜索算法,通过“选择—扩展—模拟—反向传播”四个步骤迭代搜索,获取下棋的近似最优解。
代码内容
from copy import deepcopy
import math
import random
class AIPlayer:
    """
    MCTS-based AI player for Reversi (Othello).

    Runs a fixed number of Monte Carlo Tree Search iterations
    (select -> expand -> simulate -> back-propagate) from the current
    board state and plays the action of the most promising root child.
    """

    def __init__(self, color):
        """
        Initialize the player.

        :param color: side to play, 'X' - black, 'O' - white
        """
        self.color = color        # this player's side
        self.max_times = 200      # number of MCTS iterations per move
        self.c = 1.2              # exploration constant in the UCB formula

    def get_move(self, board):
        """
        Compute the best move for the current board state.

        :param board: current board (provides get_legal_actions/_move/get_winner)
        :return: best action, e.g. 'A1'
        """
        if self.color == 'X':
            player_name = '黑棋'
        else:
            player_name = '白棋'
        print("请等一会,对方 {}-{} 正在思考中...".format(player_name, self.color))
        # Deep-copy the board: the search mutates its own copies only.
        board_state = deepcopy(board)
        root = Node(state=board_state, color=self.color)
        return self.mcts(root)

    def mcts(self, root):
        """Run MCTS from *root* and return the action of the best root child."""
        for _ in range(self.max_times):
            leaf = self.select_expand_node(root)   # selection + expansion
            reward = self.simulate(leaf)           # random rollout
            self.back_propagate(leaf, reward)      # update statistics on the path
        # c = 0: pure exploitation — pick the child with the best mean reward.
        best_child = self.select(root, 0)
        return best_child.action

    def select_expand_node(self, node):
        """Walk down the tree and return the node to start a rollout from."""
        while not self.game_over(node.state):
            if len(node.children) == 0 or not node.full_expand():
                # Node still has untried actions: expand one of them.
                return self.expand(node)
            # Fully expanded: descend to the UCB-best child.
            node = self.select(node, self.c)
        return node

    def select(self, node, c):
        """
        Return a child of *node* maximizing the UCB1 value.

        :param node: parent node whose children are compared
        :param c: exploration constant (0 means pure exploitation)
        """
        max_ucb = -float('inf')
        select_children = []
        for child in node.children:
            if child.visits == 0:
                # An unvisited child has infinite UCB: pick it immediately.
                select_children = [child]
                break
            ucb = (child.reward / child.visits
                   + c * math.sqrt(2.0 * math.log(node.visits) / child.visits))
            if ucb == max_ucb:
                select_children.append(child)
            if ucb > max_ucb:
                select_children = [child]
                max_ucb = ucb
        if len(select_children) == 0:
            # No children at all (terminal node): back off to the parent.
            return node.parent
        # Break ties uniformly at random.
        return random.choice(select_children)

    def expand(self, node):
        """Attach one new child for an untried legal action and return it."""
        legal_actions = list(node.state.get_legal_actions(node.color))
        if len(legal_actions) == 0:
            # Current player must pass; nothing to expand here.
            return node.parent
        expanded_actions = [child.action for child in node.children]
        # Choose uniformly among the actions not tried yet (same distribution
        # as the original retry loop, but without re-drawing).
        untried = [a for a in legal_actions if a not in expanded_actions]
        action = random.choice(untried)
        # Apply the move on a private copy of the state.
        new_state = deepcopy(node.state)
        new_state._move(action, node.color)
        # The child position is to be played by the opposite color.
        new_color = 'O' if node.color == 'X' else 'X'
        node.add_child(new_state, action=action, color=new_color)
        return node.children[-1]   # newest child

    def simulate(self, node):
        """
        Play a random game from *node* and return a reward from this
        player's point of view (positive when this player wins).
        """
        board = deepcopy(node.state)   # rollouts must not mutate the tree
        color = node.color
        count = 0
        while not self.game_over(board):
            # BUG FIX: query the rollout board, not the frozen node.state,
            # otherwise moves legal only in the original position get played.
            legal_actions = list(board.get_legal_actions(color))
            if len(legal_actions) != 0:
                action = random.choice(legal_actions)
                board._move(action, color)
            count += 1
            # Alternate turns (a side with no moves passes).
            color = 'O' if color == 'X' else 'X'
            if count >= 50:   # cap rollout length
                break
        # winner: 0 - black wins, 1 - white wins, 2 - draw; win_count = piece lead
        winner, win_count = board.get_winner()
        # Winner scores 10 points plus 1 per piece of lead; a draw scores 0.
        reward = 0
        if winner != 2:
            reward = 10 + win_count
            # Negate when the winner is this player's opponent.
            if (self.color == 'X' and winner == 1) or (self.color == 'O' and winner == 0):
                reward = -reward
        return reward

    def back_propagate(self, node, reward):
        """
        Propagate the rollout reward from *node* up to the root, updating
        visit counts; the sign is flipped for opponent-to-move nodes.
        """
        while node is not None:
            node.visits += 1
            if node.color == self.color:
                node.reward += reward
            else:
                node.reward -= reward
            node = node.parent

    def game_over(self, state):
        """Return True when neither side has a legal move."""
        black_moves = list(state.get_legal_actions('X'))
        white_moves = list(state.get_legal_actions('O'))
        return len(black_moves) == 0 and len(white_moves) == 0