Tic-Tac-Toe is played on a 3*3 grid: the two players take turns placing pieces, and the first to get three of their own pieces in a line wins.
The reference code is below; I only removed a few parts that are not used:
#######################################################################
# Copyright (C)                                                       #
# 2016 - 2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)           #
# 2016 Jan Hakenberg(jan.hakenberg@gmail.com)                         #
# 2016 Tian Jun(tianjun.cpp@gmail.com)                                #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
# https://www.cnblogs.com/pinard/p/9385570.html
# Reinforcement Learning (1): Model Basics
import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS
The State class
In brief: each state is identified by a custom hash value. The key entry points are get_all_states (run once to enumerate every reachable state) and next_state (place one piece and return the resulting state).
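Before the class itself, here is a tiny worked example (my own, not part of the reference code) of that hash: the nine cells are read as the digits of a base-3 number, with -1 remapped to 2 first.

import numpy as np

board = np.array([[1, -1, 0],
                  [0,  1, 0],
                  [0,  0, -1]])
h = 0
for v in board.reshape(9):
    if v == -1:
        v = 2              # remap -1 to 2 so every cell is a valid base-3 digit
    h = h * 3 + v
print(int(h))              # one unique integer per board configuration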
class State:
    def __init__(self):
        # the board is represented by an n * n array,
        # 1 represents a chessman of the player who moves first,
        # -1 represents a chessman of another player
        # 0 represents an empty position
        self.data = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.winner = None
        self.hash_val = None
        self.end = None

    # compute the hash value for one state, it's unique
    def hash(self):
        if self.hash_val is None:
            self.hash_val = 0
            for i in self.data.reshape(BOARD_ROWS * BOARD_COLS):
                # -1 is remapped to 2 so every cell is a base-3 digit (0, 1 or 2)
                if i == -1:
                    i = 2
                self.hash_val = self.hash_val * 3 + i
        return int(self.hash_val)

    # check whether a player has won the game, or it's a tie
    def is_end(self):
        if self.end is not None:
            return self.end
        results = []
        # check rows
        for i in range(0, BOARD_ROWS):
            results.append(np.sum(self.data[i, :]))
        # check columns
        for i in range(0, BOARD_COLS):
            results.append(np.sum(self.data[:, i]))
        # check diagonals
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, i]
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, BOARD_ROWS - 1 - i]
        for result in results:
            if result == 3:
                self.winner = 1
                self.end = True
                return self.end
            if result == -3:
                self.winner = -1
                self.end = True
                return self.end
        # whether it's a tie
        sum_abs = np.sum(np.abs(self.data))
        if sum_abs == BOARD_ROWS * BOARD_COLS:
            self.winner = 0
            self.end = True
            return self.end
        # game is still going on
        self.end = False
        return self.end

    # @symbol: 1 or -1
    # put chessman symbol in position (i, j)
    def next_state(self, i, j, symbol):
        new_state = State()
        new_state.data = np.copy(self.data)
        new_state.data[i, j] = symbol
        return new_state

    # print the board
    def print(self):
        for i in range(0, BOARD_ROWS):
            print('-------------')
            out = '|'
            for j in range(0, BOARD_COLS):
                if self.data[i, j] == 1:
                    token = '*'
                if self.data[i, j] == 0:
                    token = '0'
                if self.data[i, j] == -1:
                    token = 'x'
                out += token + '|'
            print(out)
        print('-------------')


def get_all_states_impl(current_state, current_symbol, all_states):
    '''all_states: dict keyed by the state hash, value is (state, is_end)'''
    for i in range(0, BOARD_ROWS):
        for j in range(0, BOARD_COLS):
            if current_state.data[i][j] == 0:
                newState = current_state.next_state(i, j, current_symbol)
                newHash = newState.hash()
                if newHash not in all_states.keys():
                    isEnd = newState.is_end()
                    all_states[newHash] = (newState, isEnd)
                    # if the game is not over, the other player moves next
                    if not isEnd:
                        get_all_states_impl(newState, -current_symbol, all_states)


def get_all_states():
    current_symbol = 1
    current_state = State()
    all_states = dict()
    all_states[current_state.hash()] = (current_state, current_state.is_end())
    get_all_states_impl(current_state, current_symbol, all_states)
    return all_states


# all possible board configurations
all_states = get_all_states()
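As a quick sanity check (my addition, not in the original post), you can print how many positions the recursive enumeration found; it should be a few thousand (5478 is the commonly cited count of legal Tic-Tac-Toe positions).

print(len(all_states))     # number of distinct reachable board configurations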
The Judger: it supervises the two players taking turns. Its key methods are alternate (switch to the other player) and play (run one game; the important call inside play is each player's act method, covered below).
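The alternation itself is just an infinite generator. A standalone sketch of the pattern (hypothetical names, not from the post):

def alternate(p1, p2):
    # yield the two players forever, taking turns
    while True:
        yield p1
        yield p2

turns = alternate('first', 'second')
print([next(turns) for _ in range(4)])     # ['first', 'second', 'first', 'second']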
class Judger:
    # @player1: the player who will move first, its chessman will be 1
    # @player2: another player with a chessman -1
    # @feedback: if True, both players will receive rewards when game is end
    def __init__(self, player1, player2):
        self.p1 = player1
        self.p2 = player2
        self.p1_symbol = 1
        self.p2_symbol = -1
        self.p1.set_symbol(self.p1_symbol)
        self.p2.set_symbol(self.p2_symbol)
        self.current_state = State()

    def reset(self):
        self.p1.reset()
        self.p2.reset()

    def alternate(self):
        while True:
            yield self.p1
            yield self.p2

    # @print_state: if True, print each board during the game
    def play(self, print_state=False):
        alternator = self.alternate()
        self.reset()
        current_state = self.current_state
        self.p1.set_state(current_state)
        self.p2.set_state(current_state)
        while True:
            player = next(alternator)
            if print_state:
                current_state.print()
            [i, j, symbol] = player.act()
            next_state_hash = current_state.next_state(i, j, symbol).hash()
            current_state, is_end = all_states[next_state_hash]
            self.p1.set_state(current_state)
            self.p2.set_state(current_state)
            if is_end:
                if print_state:
                    current_state.print()
                return current_state.winner
The AI player: estimations holds a value for every state, and the player uses these values to pick its next state; the greedy flags mark which moves were chosen greedily, so that random exploratory moves are excluded from the value updates.
Key methods: set_symbol (initialize each state's value for this player), backup (update the state values: if the next state has a higher value, the current state's value is raised, so long-term outcomes feed back into earlier positions), and act (pick the coordinates of the next move).
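backup is a plain TD(0) update on state values, V(s_t) <- V(s_t) + alpha * (V(s_{t+1}) - V(s_t)), applied backwards over one game and skipped for exploratory moves. A minimal sketch with made-up numbers (hypothetical values, just to show the direction of the update):

step_size = 0.1
V = {0: 0.5, 1: 0.5, 2: 1.0}    # hypothetical state values; state 2 is a won terminal state
trajectory = [0, 1, 2]          # hypothetical hashes of the states visited in one game
greedy = [True, True, True]     # a random exploratory move would be False and contribute nothing
for t in reversed(range(len(trajectory) - 1)):
    s, s_next = trajectory[t], trajectory[t + 1]
    V[s] += step_size * greedy[t] * (V[s_next] - V[s])
print(V)                        # earlier states are pulled toward the value of the winning final state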
class Player:
    # @step_size: the step size to update estimations
    # @epsilon: the probability to explore
    def __init__(self, step_size=0.1, epsilon=0.1):
        self.estimations = dict()
        self.step_size = step_size
        self.epsilon = epsilon
        self.states = []
        self.greedy = []

    def reset(self):
        self.states = []
        self.greedy = []

    def set_state(self, state):
        self.states.append(state)
        self.greedy.append(True)

    def set_symbol(self, symbol):
        self.symbol = symbol
        # initialize the state values: a won terminal state is worth 1.0,
        # a lost one 0, a tie 0.5, and every non-terminal state starts at 0.5
        for hash_val in all_states.keys():
            (state, is_end) = all_states[hash_val]
            if is_end:
                if state.winner == self.symbol:
                    self.estimations[hash_val] = 1.0
                elif state.winner == 0:
                    # we need to distinguish between a tie and a lose
                    self.estimations[hash_val] = 0.5
                else:
                    self.estimations[hash_val] = 0
            else:
                self.estimations[hash_val] = 0.5

    # update value estimation
    def backup(self):
        # for debug
        # print('player trajectory')
        # for state in self.states:
        #     state.print()
        self.states = [state.hash() for state in self.states]
        # update the trajectory backwards, from the end of the game
        for i in reversed(range(len(self.states) - 1)):
            state = self.states[i]
            td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state])
            self.estimations[state] += self.step_size * td_error

    # choose an action based on the state
    def act(self):
        # the current (latest) state
        state = self.states[-1]
        # hashes of the possible next states
        next_states = []
        # coordinates of the possible next moves
        next_positions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.data[i, j] == 0:
                    next_positions.append([i, j])
                    next_states.append(state.next_state(i, j, self.symbol).hash())
        # with a small probability, explore with a random move
        if np.random.rand() < self.epsilon:
            action = next_positions[np.random.randint(len(next_positions))]
            action.append(self.symbol)
            # exploratory moves do not take part in the value updates
            self.greedy[-1] = False
            return action
        # otherwise act greedily on the highest estimated value
        values = []
        for hash_val, pos in zip(next_states, next_positions):
            values.append((self.estimations[hash_val], pos))
        values.sort(key=lambda x: x[0], reverse=True)
        action = values[0][1]
        action.append(self.symbol)
        return action

    def save_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f:
            pickle.dump(self.estimations, f)

    def load_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f:
            self.estimations = pickle.load(f)
The human player: its act method asks the person at the keyboard for a move.
# human interface
# input a key to put a chessman
# | q | w | e |
# | a | s | d |
# | z | x | c |
class HumanPlayer:
    def __init__(self, **kwargs):
        self.symbol = None
        self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
        self.state = None

    def reset(self):
        pass

    def set_state(self, state):
        self.state = state

    def set_symbol(self, symbol):
        self.symbol = symbol

    def backup(self, _):
        pass

    def act(self):
        self.state.print()
        key = input("Input your position: ")
        data = self.keys.index(key)
        i = data // BOARD_COLS
        j = data % BOARD_COLS
        return (i, j, self.symbol)
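For reference (my own example, not from the post), this is how the key layout maps to board coordinates: the keys q/w/e, a/s/d, z/x/c mirror the 3x3 grid, and the index in the list gives the row and column.

keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
key = 'd'
idx = keys.index(key)       # 5
i, j = idx // 3, idx % 3    # row 1, column 2 (0-based)
print(i, j)                 # the right-most cell of the middle row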
Training:
def train(epochs):
    player1 = Player(epsilon=0.01)
    player2 = Player(epsilon=0.01)
    judger = Judger(player1, player2)
    player1_win = 0.0
    player2_win = 0.0
    for i in range(1, epochs + 1):
        winner = judger.play(print_state=False)
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # print both players' winning rates; towards the end nearly every game is a tie
        if i % 100 == 0:
            print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i))
        player1.backup()
        player2.backup()
    # save the state values; what training really produces is the value of
    # every state from each player's point of view
    player1.save_policy()
    player2.save_policy()
AI self-play test:
def compete(turns):
    # no exploratory (random) moves allowed
    player1 = Player(epsilon=0)
    player2 = Player(epsilon=0)
    judger = Judger(player1, player2)
    player1.load_policy()
    player2.load_policy()
    player1_win = 0.0
    player2_win = 0.0
    for i in range(0, turns):
        winner = judger.play()
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # judger.reset()
    print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns))
Human vs. AI:
def play():
    while True:
        player1 = HumanPlayer()
        player2 = Player(epsilon=0)
        judger = Judger(player1, player2)
        player2.load_policy()
        winner = judger.play()
        if winner == player2.symbol:
            print("You lose!")
        elif winner == player1.symbol:
            print("You win!")
        else:
            print("It is a tie!")
Let's run it!
if __name__ == '__main__':
    train(int(1e4))
    compete(int(1e3))
    play()
After training, the final tally is Epoch 10000, player 1 win 0.08, player 2 win 0.03. The remaining wins only happen because a little exploration (epsilon = 1%) is still switched on during training.
In the AI self-play test, exploration is removed, and the result is 1000 turns, player 1 win 0.00, player 2 win 0.00: every single game is a tie.
After that comes the human-vs-AI mode, and you simply cannot beat this AI; a draw is the best you can hope for.
This post walked through implementing Tic-Tac-Toe with reinforcement learning. A State class represents each board position, and the AI players learn a value for every state with a temporal-difference update, gradually converging towards an optimal strategy. The code covers enumerating and hashing the game states, checking the end-of-game rules, and choosing actions. After training, with exploration switched off, the AI always reaches at least a draw.