For details, see Morvan Zhou's tutorial page: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/5-1-A-PG/
RL_brain.py
import numpy as np
import tensorflow as tf
# reproducible
np.random.seed(1)
tf.set_random_seed(1)
class PolicyGradient:
    def __init__(  # initialization
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.95,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate  # learning rate used when training the network
        self.gamma = reward_decay  # reward discount factor
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # buffers for the current episode's observations, actions, and rewards
        self._build_net()
        self.sess = tf.Session()
        if output_graph:
            # $ tensorboard --logdir=logs
            # http://0.0.0.0:6006/
            # tf.train.SummaryWriter will soon be deprecated; use the following
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())
    def _build_net(self):  # build the policy gradient network
        with tf.name_scope('inputs'):
            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
            self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
            self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")  # discounted episode rewards
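The listing breaks off here, just after the placeholders are defined. For context, below is a minimal sketch of how `_build_net` typically continues in this style of tutorial: two fully connected layers produce action logits, a softmax turns them into action probabilities, and the loss weights the negative log-probability of each taken action by the discounted reward `tf_vt`. The hidden size (10) and the initializer parameters are illustrative assumptions, not confirmed values from the original post.

        # --- sketch of the rest of the network (assumed continuation) ---
        # fc1: hidden layer
        layer = tf.layers.dense(
            inputs=self.tf_obs,
            units=10,  # assumed hidden size
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1'
        )
        # fc2: logits over the action space
        all_act = tf.layers.dense(
            inputs=layer,
            units=self.n_actions,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2'
        )
        # convert logits to action probabilities
        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')
        with tf.name_scope('loss'):
            # maximizing vt * log(pi(a|s)) is the same as minimizing -log(pi(a|s)) * vt
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=all_act, labels=self.tf_acts)
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

In the full tutorial class, a `choose_action` method samples an action from `self.all_act_prob`, episode data is appended to the `ep_obs`/`ep_as`/`ep_rs` buffers, and `learn` runs `self.train_op` once per episode with the discounted, normalized rewards fed into `tf_vt`.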