问题描述:
确定环境中的最佳操作的规则叫做策略,学习这些策略的网络称为策略网络。
代码展示:
import numpy as np
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
# Create the Pong environment and grab the initial observation.
env = gym.make("Pong-v0")
observation = env.reset()

# Step the environment 22 times with action 1. According to the original
# note the ball is served after about 20 frames, so only the frame seen on
# the last iteration (i == 21) is displayed.
for i in range(22):
    if i > 20:
        plt.imshow(observation)
        plt.show()
    # Fetch the next observation; reward, done flag and info are discarded.
    # NOTE(review): assumes the 4-tuple step API of older gym releases.
    observation, _, _, _ = env.step(1)
def preprocess_frame(frame):
    """Convert a raw game frame into a flat binary feature vector.

    The frame is cropped to rows 35..194 and columns 10..149, downsampled
    by a factor of two along both axes, and reduced to its first colour
    channel. Pixels whose value is 0, 144 or 109 (treated as background)
    become 0.0; every other pixel (paddles and ball, per the original
    notes) becomes 1.0.

    The input array is never modified. The slices below are views into
    ``frame``, so assigning into them would overwrite the caller's
    observation; a boolean mask is built instead.

    Args:
        frame: Image array indexed as (row, column, channel). For a
            210x160x3 frame the result has 80 * 70 = 5600 elements.

    Returns:
        A 1-D float64 array of 0.0 / 1.0 values.
    """
    # Remove the top of the image and part of the background.
    cropped = frame[35:195, 10:150]
    # Keep every second row and column of channel 0 (halves the resolution
    # and drops the colour dimension).
    channel = cropped[::2, ::2, 0]
    # Foreground is anything that is neither a background value nor zero.
    foreground = (channel != 0) & (channel != 144) & (channel != 109)
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` maps to
    # float64 and works on every NumPy version.
    return foreground.astype(float).ravel()
# Preprocess the latest observation and fold the flat vector back into an
# 80x70 image so it can be displayed.
obs_preprocessed = np.reshape(preprocess_frame(observation), (80, 70))
plt.imshow(obs_preprocessed, cmap='gray')
plt.show()

# Advance the environment by one more step with action 1, keeping only the
# new observation.
observation_next, _, _, _ = env.step(1)
diff = preprocess_frame(obs