来自 up 主:南方小鱼儿
https://link.zhihu.com/?target=https%3A//www.bilibili.com/video/BV1S34y1d7sR%3Fp%3D1%26vd_source%3D3365b984607b6d55023066944ecb8f9e
参考知乎文章,https://zhuanlan.zhihu.com/p/535076514,做了部分补充
1.导入库
import os
import pickle
import time

import cv2
import numpy as np
import pylab as plt
from PIL import Image
plt.style.use('ggplot')
2.参数定义
SIZE = 10 #区域大小
EPISODES = 30000 #局数
SHOW_EVERY = 3000 #定义每隔多少局展示一次图像
FOOD_REWARD = 25 #agent获得食物的奖励
ENEMY_PENALITY = 300 #遇上对手的惩罚
MOVE_PENALITY = 1 #遇上敌人的惩罚
epsilon = 0.6 #抽取的概率,epsilon-greedy,强化学习随机是必须的,才能让对手猜不到你在干什么,但这个概率逐步变小
EPS_DECAY = 0.998 #epsilon衰减指标,到最后只选最大值
DISCOUNT = 0.95 #折扣回报
LEARING_RATE = 0.1
#q_table = None #第一次训练就用None,后续可以用下面这句训练好的q_table文件,改成相应的名字
q_table = './qtable/qtable_1704725891.pickle'
d = {1:(255,0,0), #blue player
2:(0,255,0), #green food
3:(0,0,255)} #red enemy
PLAYER_N = 1
FOOD_N = 2
ENEMY_N = 3
# print(d[FOOD_N])
3.创建智能体cube类(定义智能体的初始位置及其动作函数)
# 智能体的类,有位置信息和动作函数
class Cube: #cube方块的意思
def __init__(self) -> None:
self.x = np.random.randint(0,SIZE - 1) #0~9
self.y = np.random.randint(0,SIZE - 1)
def __str__(self) -> str:
return f'{self.x},{self.y}' #打印对象名字时,就输出这个魔法函数的返回值
# player = Cube() #测试__str__方法
# food = Cube()
# print(player) #测试__str__方法
def __sub__(self,other): #两个对象比如玩家和食物的距离坐标
return (self.x - other.x,self.y - other.y)
def move(self,x=False,y=False):
if not x: #如果x还未给值,就随机给一个
self.x += np.random.randint(-1,2)
else:
self.x += x
if not y: #如果y还未给值,就随机给一个
self.y += np.random.randint(-1,2)
else:
self.y += y
#考虑边界情况,小于0或者大于9
if self.x <0:
self.x = 0
if self.x > SIZE -1:
self.x = SIZE - 1
if self.y < 0:
self.y = 0
if self.y >SIZE - 1:
self.y = SIZE - 1
def action(self,choice): #强化学习的经典函数,涉及到动作的选择,跟移动这个动作有关,先写move
if choice == 0: #0,1,2,3分别代表上下左右
self.move(x=0,y=1)
elif choice == 1:
self.move(x=0,y=-1)
elif choice == 2:
self.move(x=1,y=0)
elif choice == 3:
self.move(x=-1,y=0)
# player = Cube() #测试action方法
# print(player)
# player.action(0)
# print(player)
4.初始化q table
if q_table is None: #判断q table是否存在,如果不存在就初始化一下
q_table = {}
#四个循环,x1代表玩家跟食物横坐标的差值,y1代表纵坐标差值
#x2代表玩家跟敌人enemy横坐标差值,y2代表纵坐标差值
#状态空间为4个循环。得分初始化为-5~0
for x1 in range(-SIZE+1,SIZE):
for y1 in range(-SIZE+1,SIZE):
for x2 in range(-SIZE+1,SIZE):
for y2 in range(-SIZE+1,SIZE):
q_table[((x1,y1),(x2,y2))] = [np.random.randint(-5,0) for i in range(4)]
else:
#如果提供了q表,把q_table当作一个对象load进来
with open(q_table,'rb') as f:
q_table = pickle.load(f)
# print(len(q_table))#测试q表个数
# print(19**4)
# print(q_table[((1,3),(2,-4))])
5.训练
# 训练一个智能体
episode_rewards = [] #初始化一个奖励
for episode in range(EPISODES): #玩30000局游戏
#实例化玩家、食物和敌人
player = Cube()
food = Cube()
enemy = Cube()
# 每隔一段时间设定show为True,显示图像
if episode % SHOW_EVERY == 0:
print(f'episode:{episode},epsilon:{epsilon}')
print(f'mean_reward:{np.mean(episode_rewards[-SHOW_EVERY:])}') #每3000次显示一次3000局的平均值
show = True
else:
show = False
episode_reward = 0 #初始化
for i in range(200): #200步以内完成游戏,否则就放弃游戏
obs = (player-food,player-enemy) #拿到游戏的状态值,玩家和食物以及enemy的距离
if np.random.random() > epsilon:
action = np.argmax(q_table[obs]) #找到q值最大值的动作,作为下一步的决策
#强化学习需要有概率来抽,否则容易被敌人发现规律,但是
else:
action = np.random.randint(0,4)
# print(player) #测试
# print(action)
# print(obs)
player.action(action)
#food.move()
#enemy.move()食物 敌人动对训练没有好处,先注释
# print(f'after action:{player}')
if player.x == food.x and player.y == food.y: #玩家抓住猎物,奖励25已经在前面初始化
reward = FOOD_REWARD
elif player.x == enemy.x and player.y == enemy.y:
reward = -ENEMY_PENALITY #玩家遇到敌人,收到惩罚-300
else:
reward = -MOVE_PENALITY #既没抓住猎物,又没遇到敌人,移动的惩罚-1
# print(f'rewared:{reward}')
#更新q表
current_q = q_table[obs][action]
# print(f'current_q:{current_q}')
new_obs = (player-food,player-enemy) #执行动作后新的状态
# print(f'new_obs:{new_obs}')
max_future_q = np.max(q_table[new_obs]) #新的状态下,最大的q值
# print(f'max_future_q:{max_future_q}')
if reward == FOOD_REWARD: #获取到了食物就不需要用下面的公式计算了
new_q = FOOD_REWARD
else:
new_q = (1-LEARING_RATE)*current_q + LEARING_RATE * (reward+DISCOUNT*max_future_q) #著名的公式
# print(f'new_q:{new_q}')
q_table[obs][action] = new_q
if show:
env = np.zeros((SIZE,SIZE,3),dtype=np.uint8) #10x10x3的一个矩阵
env[food.x][food.y] = d[FOOD_N] #FOOD_N=2食物的颜色序号
env[player.x][player.y] = d[PLAYER_N]
env[enemy.x][enemy.y] = d[ENEMY_N]
img = Image.fromarray(env,'RGB')
img = img.resize((500,500)) #图像大小
cv2.imshow('',np.array(img)) #显示
#停500ms
if reward == FOOD_REWARD or reward == ENEMY_PENALITY:
if cv2.waitKey(500) & 0xFF == ord('q'):
break
else:
if cv2.waitKey(1) & 0xFF == ord('q'):
break
episode_reward += reward
if reward == FOOD_REWARD or reward == ENEMY_PENALITY:
break
# break
episode_rewards.append(episode_reward)
# break #两个break方便测试,后续注释掉
epsilon *= EPS_DECAY #探索率减小
# food = Cube()
# player = Cube()
# enemy = Cube()
# env = np.zeros((SIZE,SIZE,3),dtype=np.uint8) #10x10x3的一个矩阵
# env[food.x][food.y] = d[FOOD_N] #FOOD_N=2食物的颜色序号
# env[player.x][player.y] = d[PLAYER_N]
# env[enemy.x][enemy.y] = d[ENEMY_N]
# img = Image.fromarray(env,'RGB')
# img = img.resize((500,500)) #图像大小
# cv2.imshow('',np.array(img)) #显示
# #停500ms
# if cv2.waitKey(500) & 0xFF == ord('q'):
# pass
6.输出奖励曲线
# conv = np.convolve([3,8,10,9,11,23,44,55,57,48],np.ones((3,))/3,mode='valid')#测试卷积函数
# print(f'conv:{conv}')
# print(f'np.mean;{np.mean([3,8,10])}')
moving_avg = np.convolve(episode_rewards,np.ones((SHOW_EVERY,))/SHOW_EVERY,mode='valid')
# print(len(moving_avg))#测试
plt.plot([i for i in range(len(moving_avg))],moving_avg)
plt.xlabel('episode#')
plt.ylabel(f'mean {SHOW_EVERY} reward')
plt.show()
7.保存文件
with open(f'./qtable/qtable_{int(time.time())}.pickle','wb') as f:
pickle.dump(q_table,f)