读源码学算法之Monte Carlo Tree Search

最近研究新的算法有使用到Monte Carlo Tree Search,查了一些资料,参考几篇博客:
1.知乎:蒙特卡洛树搜索最通俗入门指南
2.知乎:AlphaGo背后的力量:蒙特卡洛树搜索入门指南
3.https://blog.csdn.net/caozixuan98724/article/details/103213795
4.https://blog.csdn.net/kurumi233/article/details/80307788
5.https://vgarciasc.github.io/mcts-viz/
在这里插入图片描述

我认真研究了一份代码,用蒙特卡洛实现井字棋人机对战的代码,非常感谢作者!
代码来自:https://github.com/cryer/monte-carlo-tree-search

实在没空写博客了,只能贴代码,我写了详细的注释,希望以后复习方便!

一、主代码 main.py
import numpy as np
from node import *
from search import MonteCarloTreeSearch
from tictactoe import TicTacToeGameState

def init():
    """Let the machine make the opening move.

    Starting from an empty 3x3 board (all zeros), run 1000 MCTS simulations
    and return the game state after the machine's best first placement.
    """
    empty_board = np.zeros((3, 3))
    start_state = TicTacToeGameState(state=empty_board, next_to_move=1)
    search_root = MonteCarloTreeSearchNode(state=start_state, parent=None)
    searcher = MonteCarloTreeSearch(search_root)
    return searcher.best_action(1000).state

#绘制棋盘格
def draw_chessboard(c_board):
    for i in range(3):
        print("\n{0:3}".format(i).center(8)+"|", end='')
        for j in range(3):
            if c_board[i][j] ==  0: print('_'.center(8), end='')
            if c_board[i][j] ==  1: print('X'.center(8), end='')
            if c_board[i][j] == -1: print('O'.center(8), end='')
    print("\n______________________________")

# Human's turn: place a -1 mark on a free cell of the 3x3 board.
def get_human_action(state):
    """Prompt the human for a move until a legal one is entered.

    Input format: "x,y" (0-based coordinates). The machine plays value +1,
    the human plays -1. Malformed input (non-integers, fewer than two
    coordinates) and illegal placements both re-prompt instead of crashing;
    the original recursive retry also raised ValueError/IndexError on bad
    input. Returns the validated TicTacToeMove.
    """
    while True:
        raw = input("Your move: ")
        try:
            coords = [int(token, 10) for token in raw.split(",")]
            x, y = coords[0], coords[1]
        except (ValueError, IndexError):
            # Non-numeric tokens or fewer than two coordinates: retry.
            print("invalid move")
            continue
        move = TicTacToeMove(x, y, -1)  # human always plays -1
        if state.is_move_legal(move):   # bounds, turn, and empty-cell checks
            return move
        print("invalid move")

# Report whether the game has reached a terminal state (win / loss / tie).
def is_game_over(state):
    """Return 1 and print the outcome if the game ended, else return 0.

    The result is from the machine's perspective: +1.0 machine wins
    ("You lose!"), 0.0 tie, -1.0 human wins ("You Win!").
    """
    if not state.is_game_over():
        return 0
    outcome = state.game_result
    if outcome == 1.0:
        print("You lose!")
    elif outcome == 0.0:
        print("Tie!")
    elif outcome == -1.0:
        print("You Win!")
    return 1

if __name__ == '__main__':
    # Machine opens the game with its best first move, then show the board.
    c_state = init()
    draw_chessboard(c_state.board)

    # Alternate human move -> machine move until a terminal state.
    while is_game_over(c_state) != 1:
        # 1. Human places a mark.
        human_move = get_human_action(c_state)
        c_state = c_state.move(human_move)
        draw_chessboard(c_state.board)

        # BUGFIX: if the human's move just ended the game, we must stop here.
        # The original code ran MCTS on the terminal board; with no legal
        # moves the root gets no children and best_child's np.argmax([])
        # raises ValueError. is_game_over also prints the final outcome.
        if is_game_over(c_state) == 1:
            break

        # 2. Machine searches for its best reply (1000 simulations).
        board_state = TicTacToeGameState(state=c_state.board, next_to_move=1)
        root = MonteCarloTreeSearchNode(state=board_state, parent=None)  # current position as new search root
        mcts = MonteCarloTreeSearch(root)
        best_computer_node = mcts.best_action(1000)
        c_state = best_computer_node.state
        draw_chessboard(c_state.board)

二、蒙特卡洛搜索代码 search.py
from node import MonteCarloTreeSearchNode

class MonteCarloTreeSearch:
    """Driver for Monte Carlo tree search starting from a given root node."""

    def __init__(self, node: MonteCarloTreeSearchNode):
        # Node representing the current game position; all searches start here.
        self.root = node

    def draw_chessboard(self, c_board):
        """Debug helper: print a 3x3 board ('_' empty, 'X' = +1, 'O' = -1)."""
        marks = {0: '_', 1: 'X', -1: 'O'}
        for row in range(3):
            print("\n{0:3}".format(row).center(8) + "|", end='')
            for col in range(3):
                mark = marks.get(c_board[row][col])
                if mark is not None:
                    print(mark.center(8), end='')
        print("\n______________________________")

    def best_action(self, simulations_number):
        """Run `simulations_number` MCTS iterations and return the best child of root.

        Each iteration: select/expand a leaf, play a random rollout from it,
        and backpropagate the outcome up to the root.
        """
        for _ in range(simulations_number):
            leaf = self.expand_node()                    # selection + expansion
            outcome = leaf.simulate_man_machine_game()   # random playout to win/loss/tie
            leaf.backpropagate(outcome)                  # update stats along the path to root
        # c_param=0: pure exploitation — pick the statistically best move.
        return self.root.best_child(c_param=0.)

    def expand_node(self):
        """Tree policy: walk down via best_child until a node with untried moves (expand it) or a terminal node."""
        node = self.root
        while not node.is_terminal_node():
            if node.is_fully_expanded():
                node = node.best_child()    # all children tried: descend to the most promising one
            else:
                return node.expand()        # attach and return one new child
        return node                         # terminal node: nothing to expand

三、蒙特卡洛树节点类 node.py
import numpy as np
from collections import defaultdict
from tictactoe import *

class MonteCarloTreeSearchNode(object):
    """A node in the Monte Carlo search tree.

    Wraps one game state and accumulates rollout statistics (visit count and
    per-result tallies) used by the UCT formula in best_child.
    """

    def __init__(self, state: TicTacToeGameState, parent=None):
        self._number_of_visits = 0.       # times this node appeared on a backpropagation path
        self._results = defaultdict(int)  # rollout result (-1./0./1.) -> count
        self.state = state                # game position this node represents
        self.parent = parent              # parent node; None for the search root
        self.children = []                # children created by expand()

    @property
    def untried_actions(self): #当前状态下,下一步可以走的所有方案存入_untried_actions
        # Lazily cache the legal moves not yet expanded from this state;
        # expand() pops from this list until it is empty.
        if not hasattr(self, '_untried_actions'):
            self._untried_actions = self.state.get_legal_actions()
        return self._untried_actions

    @property
    def q(self):
        # Win-minus-loss balance from the perspective of the player to move at
        # the PARENT node (i.e. the player whose move created this node).
        # NOTE(review): reading q on the root (parent is None) would raise
        # AttributeError; in practice it is only read on children via best_child.
        wins = self._results[self.parent.state.next_to_move]  # rollouts won by the parent's mover
        loses = self._results[-1 * self.parent.state.next_to_move] # rollouts lost by the parent's mover
        return wins - loses

    @property
    def n(self):
        # Visit count (kept as float for the division in best_child).
        return self._number_of_visits

    def expand(self):
        """Create, attach, and return a child node for one untried move."""
        action = self.untried_actions.pop()     # consume one untried move
        next_state = self.state.move(action)    # apply it; move() also flips next_to_move

        child_node = MonteCarloTreeSearchNode(next_state, parent=self) #next_state->child_node
        self.children.append(child_node) # keep the child for future selection
        return child_node

    def is_terminal_node(self): # win, loss, and tie all count as terminal
        return self.state.is_game_over()

    def simulate_man_machine_game(self): # random rollout from this state to a terminal result
        """Play random moves for both sides until the game ends; return game_result."""
        current_rollout_state = self.state
        while not current_rollout_state.is_game_over():
            possible_moves = current_rollout_state.get_legal_actions()
            action = self.rollout_policy(possible_moves)
            current_rollout_state = current_rollout_state.move(action)
        return current_rollout_state.game_result

    def backpropagate(self, result): # recursively push the rollout result up to the root
        """Record one rollout outcome on this node and every ancestor."""
        self._number_of_visits += 1.
        self._results[result] += 1.
        if self.parent:
            self.parent.backpropagate(result)

    def is_fully_expanded(self):
        # True once every legal move from this state has produced a child.
        return len(self.untried_actions) == 0

    def best_child(self, c_param=1.4): # pick the child maximizing the UCT score
        """Return the child maximizing UCB1: q/n + c * sqrt(2 ln N / n).

        c_param balances exploration vs. exploitation; c_param=0 (used for the
        final move choice) ranks children purely by their win statistics.
        """
        choices_weights = [
            (c.q / (c.n)) + c_param * np.sqrt((2 * np.log(self.n) / (c.n)))
            for c in self.children
        ]
        return self.children[np.argmax(choices_weights)]

    def rollout_policy(self, possible_moves): # uniformly random move for simulations
        return possible_moves[np.random.randint(len(possible_moves))]

四、井字棋状态类 tictactoe.py
import numpy as np

class TicTacToeMove(object):
    """One placement on the board: a coordinate pair plus the mark to put there."""

    def __init__(self, x_coordinate, y_coordinate, value):
        # value encodes the mark: +1 = X (machine), -1 = O (human), 0 = empty cell
        self.x_coordinate = x_coordinate  # board row index
        self.y_coordinate = y_coordinate  # board column index
        self.value = value

class TicTacToeGameState(object):
    """Immutable tic-tac-toe position: an NxN numpy board plus whose turn it is.

    Board cells hold +1 (machine, X), -1 (human, O), or 0 (empty). move()
    returns a new state instead of mutating this one.
    """

    x = +1  # machine's mark (X)
    o = -1  # human's mark (O)

    def __init__(self, state, next_to_move=1):
        self.board = state                  # numpy array of shape (size, size)
        self.board_size = state.shape[0]
        self.next_to_move = next_to_move    # +1: machine moves next, -1: human

    @property
    def game_result(self):
        """Result from the machine's perspective: 1. win, -1. loss, 0. tie, None ongoing."""
        # np.sum(axis=0) collapses rows, yielding one total per COLUMN, and
        # axis=1 yields one total per ROW (the original code had these labels
        # swapped; win detection is unaffected since both sets are checked).
        col_sums = np.sum(self.board, 0)
        row_sums = np.sum(self.board, 1)
        main_diag = self.board.trace()        # top-left to bottom-right
        anti_diag = self.board[::-1].trace()  # bottom-left to top-right
        target = self.board_size

        # Any full line of +1s: machine wins.
        if any(col_sums == target) or any(row_sums == target) \
                or main_diag == target or anti_diag == target:
            return 1.
        # Any full line of -1s: machine loses.
        if any(col_sums == -target) or any(row_sums == -target) \
                or main_diag == -target or anti_diag == -target:
            return -1.
        # Board full with no winner: tie.
        if np.all(self.board != 0):
            return 0.
        # Otherwise the game is still in progress.
        return None

    def is_game_over(self):
        """True once game_result is decided (win, loss, or tie)."""
        # `is not None` replaces the original `!= None` (equality comparison
        # with None is a Python anti-idiom).
        return self.game_result is not None

    def is_move_legal(self, move):
        """A move is legal iff it is the mover's turn, in bounds, and the cell is empty."""
        # The mark being placed must match whose turn it is.
        if move.value != self.next_to_move:
            return False
        # Both coordinates must lie on the board.
        if not (0 <= move.x_coordinate < self.board_size):
            return False
        if not (0 <= move.y_coordinate < self.board_size):
            return False
        # The target cell must be empty.
        return self.board[move.x_coordinate, move.y_coordinate] == 0

    def move(self, move):
        """Return the successor state after placing move.value; the turn is flipped."""
        new_board = np.copy(self.board)
        new_board[move.x_coordinate, move.y_coordinate] = move.value
        next_player = TicTacToeGameState.o if self.next_to_move == TicTacToeGameState.x else TicTacToeGameState.x
        return TicTacToeGameState(new_board, next_player)

    def get_legal_actions(self):
        """Every empty cell as a TicTacToeMove for the player to move."""
        rows, cols = np.where(self.board == 0)
        return [TicTacToeMove(r, c, self.next_to_move) for r, c in zip(rows, cols)]

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Researcher-Du

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值