I've been learning AI recently and wanted to build something with it, so I wondered: could I make a relatively simple AI that teaches itself to play Gomoku (five-in-a-row)? I didn't want to hand-code any game logic; I only wanted to tell the AI "you just lost this one." In other words, it learns purely from feedback, rewards and penalties, in the spirit of deep reinforcement learning (DQN). The setup is very simple: two identical networks that score board positions, playing against each other. The policy partly follows the trained network's output and partly plays random moves (to explore for better answers). During training the loss kept shrinking, the games got longer, and the stones became more spread out. In the output, "0" marks the first player's stones and "*" marks the second player's.
A friend's AI project got underway, so I never followed up on this afterwards. If you're interested, feel free to polish it, and let's discuss!
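Before the full listing, here is a minimal, self-contained sketch of just the move-selection idea described above: most of the time exploit the network's score for every empty cell, otherwise explore a random empty cell. The names `pick_move`, `score_move`, and the `epsilon=0.2` exploration rate are illustrative assumptions, not part of the program below.

import numpy as np

def pick_move(board, score_move, epsilon=0.2):
    # board: dict {(i, j): -1 empty / 0 / 1}; score_move: a callable (assumed here)
    # returning the value network's score for placing the current stone at (i, j).
    empty = [(i, j) for i in range(10) for j in range(10) if board[i, j] == -1]
    if np.random.uniform() < epsilon:
        return empty[np.random.randint(len(empty))]  # explore: random empty cell
    return max(empty, key=score_move)                # exploit: highest-scoring cell

The full program follows.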
import tensorflow as tf
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
k = 0
M = 10
N = 10
chessState = {}
def getTrainData():
    # Flatten the board dict into a 10x10 array (-1 = empty, 0 and 1 = the two players)
    tmp = []
    for i in range( M ):
        one = []
        for j in range( N ):
            one.append( chessState[i, j] )
        tmp.append( one )
    return np.reshape( tmp, [-1, 10] )
def getOneEpData():
    # Unused helper: collect the board snapshots recorded for the current episode
    tmp = []
    for i in range( len( OneRePlay ) ):
        tmp.append( OneRePlay[i] )
    return np.reshape( tmp, [-1, 100] )
def initState():
    # Reset the board: -1 marks an empty cell
    for i in range( M ):
        for j in range( N ):
            chessState[i, j] = -1
    # chessState[5,5]=1
def outPutChess():
    # Print the board: blank = empty, "0" = one player, "*" = the other
    print( "{}".format( getTrainData() ).replace( "-1", " " ).replace( "1", "*" ) )
def outputState():
    for i in range( M ):
        print( "" )
        print( "{:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d}".format(
            chessState[i, 0], chessState[i, 1], chessState[i, 2], chessState[i, 3], chessState[i, 4],
            chessState[i, 5], chessState[i, 6], chessState[i, 7], chessState[i, 8],
            chessState[i, 9] ).replace( "-1", "-" ).replace( "1", "*" ) )
def PlayOneStep(L=0., who=-1, sess=tf.Session):
    # Greedy move: try every empty cell, score the resulting board with the
    # network output L, and return the highest-scoring position.
    i1, j1 = 0, 0
    p1, p2 = 0.00, 0.000
    for i in range( M ):  # try every empty point on the board
        for j in range( N ):
            if (chessState[i, j] == -1):
                chessState[i, j] = who
                p1 = sess.run( L, feed_dict={x: np.reshape( getTrainData(), [-1, 100] )} )
                if (p1[0][0] > p2):
                    i1, j1 = i, j
                    p2 = p1[0][0]
                chessState[i, j] = -1
    if (True):  # np.random.uniform() < 1 - p2): random exploration of other cells (disabled)
        return i1, j1
    else:
        while True:
            i1 = np.random.randint( 0, M )
            j1 = np.random.randint( 0, N )
            if (chessState[i1, j1] == -1):
                break
        return i1, j1
def Normalize(data):
    m = np.mean(data)
    mx = max(data)
    mn = min(data)
    return [(float(i) - m) / (mx - mn) for i in data]
def whoWin(who=1):
    # Scan from every stone of `who` for five in a row in the four directions
    i, j = 0, 0
    h, v, p, l = 0, 0, 0, 0
    for i in range( M ):
        for j in range( N ):
            if (chessState[i, j] == who):
                h, v, p, l = 1, 1, 1, 1
                for m in range( j + 1, N ):  # horizontal direction —
                    if (chessState[i, m] == who):
                        h += 1
                        if (h >= 5):
                            return True, "—"
                    else:
                        h = 0
                for m in range( i + 1, M ):  # vertical direction |
                    if (chessState[m, j] == who):
                        v += 1
                        if (v >= 5):
                            return True, "|"
                    else:
                        v = 0
                for m in range( 1, M - j ):  # diagonal direction \
                    if (i + m >= M):
                        break
                    if (j + m >= N):
                        break
                    if (chessState[i + m, j + m] == who):
                        p += 1
                        if (p >= 5):
                            return True, "\\"
                    else:
                        p = 0
                for m in range( 1, M - i ):  # anti-diagonal direction /
                    if (i + m >= M):
                        break
                    if (j - m < 0):
                        break
                    if (chessState[i + m, j - m] == who):
                        l += 1
                        if (l >= 5):
                            return True, "/"
                    else:
                        l = 0
    return False, ""
initState()
learning_rate = 0.0001
"""
chessState[1,5]=1
chessState[1,6]=1
chessState[1,7]=1
chessState[1,8]=1
chessState[1,9]=1
print("1111")
print(whoWin(1))
print(getTrainData())
exit()
chessState[1,9]=-1
chessState[2,8]=-1
chessState[3,7]=-1
chessState[4,6]=-1
chessState[5,5]=-1
print(whoWin(-1))
exit(0)
"""
#print( getTrainData() )
outPutChess()
# First-player network (used when who == 1)
x = tf.placeholder( dtype=tf.float32, shape=[None, 100], name="X_In" )
y = tf.placeholder( dtype=tf.float32, shape=[None, 1], name="Y_In" )
w1 = tf.get_variable( "W1", shape=[100, 40], initializer=tf.contrib.layers.xavier_initializer() )
b1 = tf.get_variable( "b1", shape=[40], initializer=tf.contrib.layers.xavier_initializer() )
w2 = tf.get_variable( "W2", shape=[40, 1], initializer=tf.contrib.layers.xavier_initializer() )
# np.random.uniform(0,1,size=[50,1]))
b2 = tf.get_variable( "b2", shape=[1], initializer=tf.contrib.layers.xavier_initializer() )
L1 = tf.matmul( x, w1 ) + b1
L2_R = tf.matmul( L1, w2 ) + b2
L2 = tf.nn.sigmoid( L2_R )
# loglik = tf.log(y * (y - L2) + (1 - y) * (y + L2))
# los = -tf.reduce_mean(loglik)
#los = -tf.reduce_mean( y * tf.log( L2_R ) )
los=tf.reduce_mean(tf.square(L2_R-y))
#los = tf.nn.softmax_cross_entropy_with_logits_v2(logits=L2,labels=y)
# los=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=L2, labels=y))
train = tf.train.AdamOptimizer( learning_rate ).minimize( los )
# train=tf.train.GradientDescentOptimizer(0.0001).minimize(los)
# Second-player network (used when who == 0)
w1_h = tf.get_variable( "w1_h", shape=[100, 40], initializer=tf.contrib.layers.xavier_initializer() )
b1_h = tf.get_variable( "b1_h", shape=[40], initializer=tf.contrib.layers.xavier_initializer() )
w2_h = tf.get_variable( "w2_h", shape=[40, 1], initializer=tf.contrib.layers.xavier_initializer() )
b2_h = tf.get_variable( "b2_h", shape=[1], initializer=tf.contrib.layers.xavier_initializer() )
L1_h = tf.matmul( x, w1_h ) + b1_h
L2_h_R = tf.matmul( L1_h, w2_h ) + b2_h
L2_h = tf.nn.sigmoid( L2_h_R )
# loglik_h = tf.log(y * (y - L2_h) + (1 - y) * (y + L2_h))
# los_h = -tf.reduce_mean(loglik_h)
#los_h = -tf.reduce_mean( y * tf.log( L2_h_R ) )
los_h=tf.reduce_mean(tf.square(L2_h_R-y))
#los_h = tf.nn.softmax_cross_entropy_with_logits_v2(logits=L2_h,labels=y)
# los_h=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=L2_h, labels=y))
# train_h=tf.train.GradientDescentOptimizer(0.001).minimize(los_h)
train_h = tf.train.AdamOptimizer( learning_rate ).minimize( los_h )
epCount = 0
AllReplay = []
AllReward = []
AllRewardH = []
OneRePlay = []
OneReward = []
OneRewardH = []
OneRePlay.append( getTrainData() )
OneReward.append( 1 )
OneRewardH.append( 1 )
step = 0
who = 1  # player 1 (the first-player network) moves first
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run( init )
    while True:
        oneTran = getTrainData()
        if (who == 1):
            i, j = PlayOneStep( L2, who, sess )  # choose a position
        else:
            i, j = PlayOneStep( L2_h, who, sess )  # choose a position
        chessState[i, j] = who
        # Record the board after this move; its reward target is rewritten when the game ends
        OneRePlay.append( getTrainData() )
        OneReward.append( 1 )
        OneRewardH.append( 1 )
        done, posWin = whoWin( who )
        step += 1
        if (done):
            print( "------【{:1d}】----------------{:6d}----------【{:s}】----step{:d}--------".format(
                who, epCount, posWin, step ).replace( "-1", "-" ) )
            step = 0
            #print( getTrainData() )
            #outputState()
            outPutChess()
            done = True
            iMax = len( OneReward )
            if (who == 1):  # the first player won
                OneReward[iMax - 1] = 0.96
                for i in reversed( range( iMax - 1 ) ):
                    OneReward[i] *= OneReward[i + 1] * 0.995
                OneRewardH[iMax - 1] = 0.10
                for i in reversed( range( iMax - 1 ) ):
                    OneRewardH[i] *= OneRewardH[i + 1] * 1.02
            else:  # the second player won
                OneRewardH[iMax - 1] = 0.96
                for i in reversed( range( iMax - 1 ) ):
                    OneRewardH[i] *= OneRewardH[i + 1] * 0.995
                OneReward[iMax - 1] = 0.10
                for i in reversed( range( iMax - 1 ) ):
                    OneReward[i] *= OneReward[i + 1] * 1.02
            AllReplay.append( OneRePlay )
            AllReward.append( OneReward )
            AllRewardH.append( OneRewardH )
            initState()  # start a new game
            # print(getTrainData())
        if (len( AllReplay ) > 0):  # and done):  # update the gradients
            x_feed = np.vstack( AllReplay )
            x_feed = np.array( x_feed )
            x_feed = np.reshape( x_feed, [-1, 100] )
            r = np.hstack( AllReward )
            r = np.array( r )
            rh = np.hstack( AllRewardH )
            rh = np.array( rh )
            _, tlos1, ww1, ww2 = sess.run( [train, los, w1, w2], feed_dict={x: x_feed, y: np.reshape( r, [-1, 1] )} )
            _, tlos2 = sess.run( [train_h, los_h], feed_dict={x: x_feed, y: np.reshape( rh, [-1, 1] )} )
            if (step % 10 == 0):
                print( "los1,los2:", tlos1, tlos2 )
                # print("w1,w2:",ww1,ww2)
        if (done):  # one game finished, reset the per-episode buffers
            OneReward = []
            OneRewardH = []
            OneRePlay = []
            OneRePlay.append( getTrainData() )
            OneReward.append( 1 )
            OneRewardH.append( 1 )
        if (who == 1):  # switch turns
            who = 0
        elif (who == 0):
            who = 1
        epCount += 1
        if (len( AllReplay ) > 50):  # cap the replay buffer at the 50 most recent games
            AllReplay.pop( 0 )
            AllReward.pop( 0 )
            AllRewardH.pop( 0 )
        # if(epCount%10==0):  # inspect the board occasionally
        #     print(getTrainData())
        #     outputState()
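To make the end-of-game reward shaping concrete, the snippet below runs the same back-propagation of rewards in isolation, using the constants from the listing (0.96 and 0.995 for the winner, 0.10 and 1.02 for the loser); the five-step episode is made-up illustration data.

# Every recorded step of an episode starts with reward 1.
winner = [1.0] * 5
loser = [1.0] * 5

# Winner: the final step gets 0.96 and earlier steps decay by 0.995 per step.
winner[-1] = 0.96
for i in reversed(range(len(winner) - 1)):
    winner[i] *= winner[i + 1] * 0.995

# Loser: the final step gets 0.10 and earlier steps grow slightly (x1.02 per step),
# so moves far from the losing position are punished a little less.
loser[-1] = 0.10
for i in reversed(range(len(loser) - 1)):
    loser[i] *= loser[i + 1] * 1.02

print(winner)  # [0.941, 0.946, 0.950, 0.955, 0.96] (approx.)
print(loser)   # [0.108, 0.106, 0.104, 0.102, 0.10] (approx.)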
The training process looked like this:
[Training screenshots]