#========================================================================================#
# ----------SECTION 4----------
# Placeholders
#
# Need these for batch observations / actions / advantages in policy gradient loss function.
#========================================================================================#
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
if discrete:
    sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
else:
    sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)
# Define a placeholder for advantages
sy_adv_n = tf.placeholder(shape=[None, 1], name="adv", dtype=tf.float32)
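# Example (standalone sketch, not part of the pipeline above): how these placeholders might
# be fed for one policy-gradient update, assuming a finished graph, a tf.Session named
# 'sess', and a scalar 'loss' op -- all hypothetical names used only for illustration.
#     import numpy as np
#     ob_batch  = np.random.randn(64, ob_dim)          # observations, shape [N, ob_dim]
#     ac_batch  = np.random.randint(ac_dim, size=64)   # discrete actions, shape [N]
#     adv_batch = np.random.randn(64, 1)               # advantages, shape [N, 1] (matches sy_adv_n)
#     sess.run(loss, feed_dict={sy_ob_no: ob_batch,
#                               sy_ac_na: ac_batch,
#                               sy_adv_n: adv_batch})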
#========================================================================================#
# ----------SECTION 4----------
# Networks
#
# Make symbolic operations for
#   1. Policy network outputs which describe the policy distribution.
#       a. For the discrete case, just logits for each action.
#
#       b. For the continuous case, the mean / log std of a Gaussian distribution over
#          actions.
#
#      Hint: use the 'build_mlp' function you defined in utilities.
#
#      Note: these ops should be functions of the placeholder 'sy_ob_no'
#
#   2. Producing samples stochastically from the policy distribution.
#       a. For the discrete case, an op that takes in logits and produces actions.
#
#          Should have shape [None]
#
#       b. For the continuous case, use the reparameterization trick:
#          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
#
#               mu + sigma * z,         z ~ N(0, I)
#
#          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
#
#          Should have shape [None, ac_dim]
#
#      Note: these ops should be functions of the policy network output ops.
#
#   3. Computing the log probability of a set of actions that were actually taken,
#      according to the policy.
#
#      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
#      policy network output ops.
#========================================================================================#
if discrete:
    # YOUR_CODE_HERE
    # The policy network directly outputs one (unnormalized) logit per action.
    sy_logits_na = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim, scope="scopeAlex", n_layers=n_layers, size=size, activation=tf.tanh, output_activation=None)
    # tf.multinomial accepts unnormalized logits and returns shape [None, 1]; squeeze the sample dimension to get shape [None].
    sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)  # Hint: Use the tf.multinomial op
    # Log probability of the actions actually taken: select the log-softmax entry of each taken action.
    # tf.nn.log_softmax is used instead of tf.log(tf.nn.softmax(...)) for numerical stability; shape [None, 1] to match sy_adv_n.
    sy_logprob_n = tf.reduce_sum(tf.one_hot(sy_ac_na, ac_dim) * tf.nn.log_softmax(sy_logits_na), axis=1, keep_dims=True)
else:
    # YOUR_CODE_HERE
    sy_mean = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim, scope="scopeAlex", n_layers=n_layers, size=size, activation=tf.tanh, output_activation=None)
    # logstd should just be a trainable variable, not a network output.
    sy_logstd = tf.get_variable("logstd", shape=[ac_dim], dtype=tf.float32)
    # Reparameterization trick: a = mu + sigma * z with z ~ N(0, I); shape [None, ac_dim].
    sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean))
    # Hint: Use the log probability under a multivariate gaussian.
    # Diagonal-Gaussian log probability of the taken actions, shape [None, 1] to match sy_adv_n;
    # the constant -0.5 * ac_dim * log(2*pi) is omitted since it does not affect the gradient.
    sy_logprob_n = -0.5 * tf.reduce_sum(tf.square((sy_ac_na - sy_mean) / tf.exp(sy_logstd)), axis=1, keep_dims=True) - tf.reduce_sum(sy_logstd)
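# Sanity-check sketch (standalone; assumes numpy and scipy are installed, neither is part of
# this graph): the diagonal-Gaussian log probability used for sy_logprob_n above can be
# verified in numpy against scipy.stats.multivariate_normal -- the full density also includes
# the constant term that is omitted above.
#     import numpy as np
#     from scipy.stats import multivariate_normal
#     mu, logstd = np.array([0.5, -1.0]), np.array([0.1, -0.3])
#     ac = np.array([0.7, -0.9])
#     manual = (-0.5 * np.sum(((ac - mu) / np.exp(logstd)) ** 2)
#               - np.sum(logstd) - 0.5 * len(ac) * np.log(2 * np.pi))
#     reference = multivariate_normal.logpdf(ac, mean=mu, cov=np.diag(np.exp(2 * logstd)))
#     assert np.isclose(manual, reference)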
Understanding the parameters:
ob_dim: dimensionality of an observation
ac_dim: dimensionality of an action (in the discrete case, the number of possible actions)
sy_ob_no: matrix holding the batch of observations, shape [None, ob_dim]
sy_ac_na: holds the actions; if discrete, shape [n] (turned into one-hot vectors via tf.one_hot); if continuous, shape [n, ac_dim]
sy_adv_n: holds the advantage estimates along the sampled trajectories, shape [None, 1]
discrete case:
sy_logits_na: the raw output of the policy network, one logit per action; applying softmax gives the action probabilities
sy_sampled_ac: actions sampled stochastically from the distribution defined by sy_logits_na, using tf.multinomial
sy_logprob_n: log probability, under the policy, of the actions actually taken, computed as reduce_sum(one_hot(sy_ac_na, ac_dim) * log_softmax(sy_logits_na), axis=1); a small standalone sketch of these two ops follows these notes
continuous case:
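A small standalone sketch of the two discrete-case ops with toy numbers (the names below are illustrative only): tf.multinomial returns a [batch, num_samples] tensor, so the sample dimension is squeezed away to get shape [batch], and the one-hot mask picks out the log probability of each action actually taken.

import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.0, 1.0, 3.0]])   # [batch=2, ac_dim=3]
taken = tf.constant([0, 2])               # actions actually taken
sampled = tf.squeeze(tf.multinomial(logits, 1), axis=1)                             # shape [2]
logprob = tf.reduce_sum(tf.one_hot(taken, 3) * tf.nn.log_softmax(logits), axis=1)   # shape [2]
with tf.Session() as sess:
    print(sess.run(sampled).shape)  # (2,)
    print(sess.run(logprob))        # log pi(a|s) for the two taken actions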