Reinforcement Learning Applied to the Game Tic-Tac-Toe

This article walks through a reinforcement-learning implementation of Tic-Tac-Toe. A State class represents the game state, and a temporal-difference value-learning rule lets the AI players gradually learn a good policy. The code covers computing game states, checking the game rules, and choosing each player's moves. After training, with exploration switched off, the AI always reaches a tie.

Tic-Tac-Toe is played on a 3x3 grid where the two players take turns placing pieces; the first player to line up three pieces in a row, column, or diagonal wins.

The reference code is below; I have only removed a few unused parts:

#######################################################################
# Copyright (C)                                                       #
# 2016 - 2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)           #
# 2016 Jan Hakenberg(jan.hakenberg@gmail.com)                         #
# 2016 Tian Jun(tianjun.cpp@gmail.com)                                #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
# https://www.cnblogs.com/pinard/p/9385570.html  强化学习(一)模型基础

import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS

The State class

Brief description: each state is identified by a custom hash value. The key functions are get_all_states (run once to enumerate every reachable state) and next_state (play one move and return the resulting state). A small usage sketch follows the code below.

class State:
    def __init__(self):
        # the board is represented by an n * n array,
        # 1 represents a chessman of the player who moves first,
        # -1 represents a chessman of another player
        # 0 represents an empty position
        self.data = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.winner = None
        self.hash_val = None
        self.end = None

    # compute the hash value for one state, it's unique
    def hash(self):
        if self.hash_val is None:
            self.hash_val = 0
            for i in self.data.reshape(BOARD_ROWS * BOARD_COLS):
                # cell values are -1, 0, 1; map -1 to 2 so each cell
                # becomes a base-3 digit and the hash is unique
                if i == -1:
                    i = 2
                self.hash_val = self.hash_val * 3 + i
        return int(self.hash_val)

    # check whether a player has won the game, or it's a tie
    def is_end(self):
        if self.end is not None:
            return self.end
        results = []
        # check rows
        for i in range(0, BOARD_ROWS):
            results.append(np.sum(self.data[i, :]))
        # check columns
        for i in range(0, BOARD_COLS):
            results.append(np.sum(self.data[:, i]))
        # check diagonals
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, i]
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, BOARD_ROWS - 1 - i]

        for result in results:
            if result == 3:
                self.winner = 1
                self.end = True
                return self.end
            if result == -3:
                self.winner = -1
                self.end = True
                return self.end

        # whether it's a tie
        sum = np.sum(np.abs(self.data))
        if sum == BOARD_ROWS * BOARD_COLS:
            self.winner = 0
            self.end = True
            return self.end

        # game is still going on
        self.end = False
        return self.end

    # @symbol: 1 or -1
    # put chessman symbol in position (i, j)
    def next_state(self, i, j, symbol):
        new_state = State()
        new_state.data = np.copy(self.data)
        new_state.data[i, j] = symbol
        return new_state

    # print the board
    def print(self):
        for i in range(0, BOARD_ROWS):
            print('-------------')
            out = '|'
            for j in range(0, BOARD_COLS):
                if self.data[i, j] == 1:
                    token = '*'
                if self.data[i, j] == 0:
                    token = '0'
                if self.data[i, j] == -1:
                    token = 'x'
                out += token + '|'
            print(out)
        print('-------------')


def get_all_states_impl(current_state, current_symbol, all_states):
    '''all_states: dict keyed by the board hash; each value is (state, is_end)'''
    for i in range(0, BOARD_ROWS):
        for j in range(0, BOARD_COLS):
            if current_state.data[i][j] == 0:
                newState = current_state.next_state(i, j, current_symbol)
                newHash = newState.hash()
                if newHash not in all_states.keys():
                    isEnd = newState.is_end()
                    all_states[newHash] = (newState, isEnd)
                    # if the game is not over, the other player moves next
                    if not isEnd:
                        get_all_states_impl(newState, -current_symbol, all_states)


def get_all_states():
    current_symbol = 1
    current_state = State()
    all_states = dict()
    all_states[current_state.hash()] = (current_state, current_state.is_end())
    get_all_states_impl(current_state, current_symbol, all_states)
    return all_states


# all possible board configurations
all_states = get_all_states()
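As a quick sanity check (this snippet is my own addition, not part of the original code), you can inspect the enumerated state space and the base-3 hash of a single move:

# quick check of the state enumeration (illustration only)
print(len(all_states))           # number of reachable positions; should come out to 5478,
                                 # the commonly cited count of legal tic-tac-toe positions
s = State()
s = s.next_state(1, 1, 1)        # first player takes the center
print(s.hash())                  # base-3 encoding of the 9 cells
print(all_states[s.hash()][1])   # False: the game is not over yet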

The judge: it supervises the two players taking turns. Its key methods are alternate (switch between the players) and play (run one game to the end; the important call inside play is each player's act method, covered later).

class Judger:
    # @player1: the player who will move first, its chessman will be 1
    # @player2: another player with a chessman -1
    # @feedback: if True, both players will receive rewards when game is end
    def __init__(self, player1, player2):
        self.p1 = player1
        self.p2 = player2
        self.p1_symbol = 1
        self.p2_symbol = -1
        self.p1.set_symbol(self.p1_symbol)
        self.p2.set_symbol(self.p2_symbol)
        self.current_state = State()

    def reset(self):
        self.p1.reset()
        self.p2.reset()

    def alternate(self):
        while True:
            yield self.p1
            yield self.p2

    # @print: if True, print each board during the game
    def play(self, print=False):
        alternator = self.alternate()
        self.reset()
        current_state = self.current_state
        self.p1.set_state(current_state)
        self.p2.set_state(current_state)
        while True:
            player = next(alternator)
            if print:
                current_state.print()
            [i, j, symbol] = player.act()
            next_state_hash = current_state.next_state(i, j, symbol).hash()
            current_state, is_end = all_states[next_state_hash]
            self.p1.set_state(current_state)
            self.p2.set_state(current_state)
            if is_end:
                if print:
                    current_state.print()
                return current_state.winner
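A minimal usage sketch (my own addition): once the Player class defined in the next section is loaded, two fresh AI players can play a single printed game like this:

# minimal usage sketch (not part of the original code)
p1 = Player(epsilon=0.1)
p2 = Player(epsilon=0.1)
judger = Judger(p1, p2)            # assigns symbols 1 / -1 and initializes the value tables
winner = judger.play(print=True)   # prints the board after every move
print('winner symbol:', winner)    # 1, -1, or 0 for a tie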

The AI player: estimations holds a value for every state and is used to choose the next state; the greedy flags mark which moves were exploratory, because random (exploratory) moves do not take part in the value updates.

The key methods are set_symbol (initialize each player's value for every state), backup (update the state values: if the next state is worth more, the current state's value is pulled up as well, so long-term outcomes propagate back to the present), and act (pick the coordinates of the next move).
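The update in backup is the classic temporal-difference rule V(S_t) <- V(S_t) + step_size * [V(S_t+1) - V(S_t)], applied backwards along the recorded trajectory and zeroed out (via the greedy flag) for exploratory moves. A tiny worked example of the arithmetic, with numbers chosen only for illustration:

# illustrative numbers only, not taken from an actual run
step_size = 0.1
v_next = 1.0       # value of the following state (e.g. a winning terminal state)
v_current = 0.5    # current estimate of this state
td_error = 1.0 * (v_next - v_current)   # greedy move, so the flag is 1
v_current += step_size * td_error       # 0.5 + 0.1 * 0.5 = 0.55
print(v_current)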

class Player:
    # @step_size: the step size to update estimations
    # @epsilon: the probability to explore
    def __init__(self, step_size=0.1, epsilon=0.1):
        self.estimations = dict()
        self.step_size = step_size
        self.epsilon = epsilon
        self.states = []
        self.greedy = []

    def reset(self):
        self.states = []
        self.greedy = []

    def set_state(self, state):
        self.states.append(state)
        self.greedy.append(True)

    def set_symbol(self, symbol):
        self.symbol = symbol
        # initialize the state values: a final win is worth 1, a loss 0,
        # a tie 0.5, and every non-terminal state starts at 0.5
        for hash_val in all_states.keys():
            (state, is_end) = all_states[hash_val]
            if is_end:
                if state.winner == self.symbol:
                    self.estimations[hash_val] = 1.0
                elif state.winner == 0:
                    # we need to distinguish between a tie and a lose
                    self.estimations[hash_val] = 0.5
                else:
                    self.estimations[hash_val] = 0
            else:
                self.estimations[hash_val] = 0.5

    # update value estimation
    def backup(self):
        # for debug
        # print('player trajectory')
        # for state in self.states:
        #     state.print()
        self.states = [state.hash() for state in self.states]
        # update backwards along the recorded trajectory
        for i in reversed(range(len(self.states) - 1)):
            state = self.states[i]
            td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state])
            self.estimations[state] += self.step_size * td_error

    # choose an action based on the state
    def act(self):
        # the current (latest) state
        state = self.states[-1]
        # hashes of the possible next states
        next_states = []
        # possible next positions
        next_positions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.data[i, j] == 0:
                    next_positions.append([i, j])
                    next_states.append(state.next_state(i, j, self.symbol).hash())

        # with small probability, explore randomly
        if np.random.rand() < self.epsilon:
            action = next_positions[np.random.randint(len(next_positions))]
            action.append(self.symbol)
            # mark the move as exploratory so it is excluded from the value update
            self.greedy[-1] = False
            return action

        # otherwise act greedily on the highest estimated value
        values = []
        for hash, pos in zip(next_states, next_positions):
            values.append((self.estimations[hash], pos))
        values.sort(key=lambda x: x[0], reverse=True)
        action = values[0][1]
        action.append(self.symbol)
        return action

    def save_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f:
            pickle.dump(self.estimations, f)

    def load_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f:
            self.estimations = pickle.load(f)

The human player: the act method reads your move from the keyboard.

# human interface
# input a key to put a chessman
# | q | w | e |
# | a | s | d |
# | z | x | c |
class HumanPlayer:
    def __init__(self, **kwargs):
        self.symbol = None
        self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
        self.state = None

    def reset(self):
        return

    def set_state(self, state):
        self.state = state

    def set_symbol(self, symbol):
        self.symbol = symbol

    def backup(self, _):
        return

    def act(self):
        self.state.print()
        key = input("Input your position:")
        data = self.keys.index(key)
        i = data // int(BOARD_COLS)
        j = data % BOARD_COLS
        return (i, j, self.symbol)
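As an illustration of the index arithmetic (my own example): pressing 's' selects index 4 in the key list, which maps to row 4 // 3 = 1 and column 4 % 3 = 1, i.e. the center square.

# key-to-coordinate mapping, illustration only
keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
data = keys.index('s')       # 4
print(data // 3, data % 3)   # 1 1 -> the center square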

Training:

def train(epochs):
    player1 = Player(epsilon=0.01)
    player2 = Player(epsilon=0.01)
    judger = Judger(player1, player2)
    player1_win = 0.0
    player2_win = 0.0
    for i in range(1, epochs + 1):
        winner = judger.play(print=False)
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # print both players' win rates; toward the end almost every game is a tie
        if i % 100 == 0:
            print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i))
        player1.backup()
        player2.backup()
    # save the state values; what training really learns is each state's value for each player
    player1.save_policy()
    player2.save_policy()

AI self-play test:

def compete(turns):
    # exploratory (random) moves are disabled
    player1 = Player(epsilon=0)
    player2 = Player(epsilon=0)
    judger = Judger(player1, player2)
    player1.load_policy()
    player2.load_policy()
    player1_win = 0.0
    player2_win = 0.0
    for i in range(0, turns):
        winner = judger.play()
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # judger.reset()
    print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns))

Human vs. AI:

def play():
    while True:
        player1 = HumanPlayer()
        player2 = Player(epsilon=0)
        judger = Judger(player1, player2)
        player2.load_policy()
        winner = judger.play()
        if winner == player2.symbol:
            print("You lose!")
        elif winner == player1.symbol:
            print("You win!")
        else:
            print("It is a tie!")

Let's run it!

if __name__ == '__main__':
    train(int(1e4))
    compete(int(1e3))
    play()

After training finishes, the record is Epoch 10000, player 1 win 0.08, player 2 win 0.03,

because a small amount of exploration (epsilon = 1%) is still active during training.

In the AI self-play test, exploration is removed and the result is 1000 turns, player 1 win 0.00, player 2 win 0.00.

As you can see, every game ends in a tie.

After that comes the human-vs-AI game; you simply cannot beat this AI.
