Continuous control with deep reinforcement learning (DDPG, Deep Deterministic Policy Gradient): practice

5. Deep Deterministic Policy Gradient; the paper is here:
http://xueshu.baidu.com/s?wd=paperuri%3A%283752bdb69e8a3f4849ecba38b2b0168f%29&filter=sc_long_sign&tn=SE_xueshusource_2kduw22v&sc_vurl=http%3A%2F%2Fwww.oalib.com%2Fpaper%2F4051743&ie=utf-8&sc_us=1138439324812222606
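
For context (my own summary of the paper, not part of the original post): DDPG trains a deterministic actor \mu(s\mid\theta^\mu) and a critic Q(s,a\mid\theta^Q), together with slowly-updated target copies \mu' and Q'. For a minibatch of N transitions (s_i, a_i, r_i, s_{i+1}) drawn from the replay buffer:

    y_i = r_i + \gamma\, Q'\big(s_{i+1}, \mu'(s_{i+1}\mid\theta^{\mu'})\mid\theta^{Q'}\big)
    L = \frac{1}{N}\sum_i \big(y_i - Q(s_i, a_i\mid\theta^Q)\big)^2
    \nabla_{\theta^\mu} J \approx \frac{1}{N}\sum_i \nabla_a Q(s,a\mid\theta^Q)\big|_{s=s_i,\,a=\mu(s_i)}\,\nabla_{\theta^\mu}\mu(s\mid\theta^\mu)\big|_{s=s_i}
    \theta' \leftarrow \tau\theta + (1-\tau)\theta'

The code below covers the replay buffer and the critic side of this loop.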

=======================experience_replay.py=============

from collections import deque
import numpy as np
import random


'''
flag = tf.app.flags
FLAG = flag.FLAGS
flag.DEFINE_string('size','5','size')
print flag.FLAGS.size
'''

class Experience_replay:
    def __init__(self, size, action_dim, state_dim):
        # fixed-size FIFO buffer of transitions (s, a, r, s', terminal)
        self.d = deque(maxlen=size)
        self.action_dim = action_dim
        self.state_dim = state_dim

    def experience_in(self, memory):
        # store one transition tuple (s, a, r, s', terminal)
        self.d.append(memory)

    def experience_out(self, sample_size):
        # uniformly sample a minibatch and unpack it into column-wise arrays
        s_list = random.sample(self.d, sample_size)

        rs = np.asarray([i[0] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        ra = np.asarray([i[1] for i in s_list], dtype=np.float32).reshape((sample_size, self.action_dim))
        rr = np.asarray([i[2] for i in s_list], dtype=np.float32).reshape((sample_size, 1))
        rss = np.asarray([i[3] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        rt = np.asarray([i[4] for i in s_list], dtype=bool).reshape((sample_size, 1))

        return rs, ra, rr, rss, rt

    def experience_out_partly(self, sample_size, part_experience_size):
        # sample states only, drawn from the first part_experience_size entries of the buffer
        sample_index = np.random.randint(0, part_experience_size, sample_size).tolist()

        rs = np.asarray([self.d[i][0] for i in sample_index], dtype=np.float32).reshape((sample_size, self.state_dim))

        return rs


#############test###########
if __name__ == "__main__":
    pass
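
The test block above is left empty. As a minimal usage sketch (my own addition; the dimensions, buffer size, and random transitions are made up for illustration), the buffer could be exercised like this, e.g. under the __main__ guard:

import numpy as np

state_dim, action_dim = 3, 1
buf = Experience_replay(size=1000, action_dim=action_dim, state_dim=state_dim)

# fill the buffer with random (s, a, r, s', terminal) tuples
for _ in range(200):
    s = np.random.randn(state_dim)
    a = np.random.randn(action_dim)
    r = float(np.random.randn())
    s_next = np.random.randn(state_dim)
    buf.experience_in((s, a, r, s_next, False))

# sample a minibatch and check the shapes
rs, ra, rr, rss, rt = buf.experience_out(sample_size=32)
print(rs.shape, ra.shape, rr.shape, rss.shape, rt.shape)
# expected: (32, 3) (32, 1) (32, 1) (32, 3) (32, 1)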

============================Critic.py=========================

import tensorflow as tf
from tensorflow.contrib import layers
import math

class Critic:
    def __init__(self, sess, action_dim, state_dim):

        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = 32
        self.GAMMA = 0.9                         # discount factor
        self.num_units_l1 = 50                   # hidden units, layer 1
        self.num_units_l2 = 40                   # hidden units, layer 2
        self.learning_rate = 0.001
        self.update_TDnet_rate = 0.2             # soft-update rate (tau) for the target ("TD") network
        self.reg = layers.l2_regularizer(0.006)  # L2 weight regularizer
        self.init_var = 0.01                     # stddev used for weight initialization

        self.state_input = tf.placeholder(dtype=tf.float32, shape=[None, self.state_dim], name='state_input')
        self.actor_input = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim], name='actor_input')
        self.Q_value_input = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='TD_Q_value_input')  # target-network Q value for the next state, fed in at train time
        self.reward = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='reward')
        self.terminal = tf.placeholder(dtype=tf.bool, shape=[None, 1], name='terminal')

        with tf.variable_scope('critic'):
            self.Q_output, self.Q_net_var_set = self.create_network(trainable=True)
        with tf.variable_scope(
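
The post is cut off in the middle of the statement above, so the rest of Critic.py is missing from the source. Purely as a hypothetical sketch (my own, not the author's code) of what the remainder of a DDPG critic typically looks like, reusing the hyperparameters and placeholders already defined (GAMMA, update_TDnet_rate, learning_rate, reg, Q_value_input); every scope and attribute name below is a guess:

        # --- hypothetical continuation, not from the original post ---
        with tf.variable_scope('critic_TD'):
            # slowly-updated target ("TD") copy of the Q network; not trained directly
            self.TD_Q_output, self.TD_net_var_set = self.create_network(trainable=False)

        # TD target y = r + GAMMA * Q'(s', a'), bootstrapping only on non-terminal steps;
        # Q'(s', a') is assumed to be computed separately and fed in via Q_value_input
        mask = 1.0 - tf.cast(self.terminal, tf.float32)
        y = self.reward + self.GAMMA * mask * self.Q_value_input

        # critic loss: mean-squared TD error plus L2 regularization on the Q-net weights
        self.loss = tf.reduce_mean(tf.square(y - self.Q_output)) \
                    + tf.add_n([self.reg(v) for v in self.Q_net_var_set])
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # soft update: theta_TD <- tau * theta + (1 - tau) * theta_TD
        self.update_TD_op = [
            tf.assign(t, self.update_TDnet_rate * s + (1.0 - self.update_TDnet_rate) * t)
            for s, t in zip(self.Q_net_var_set, self.TD_net_var_set)
        ]

        # gradient of Q with respect to the action input, used by the actor update
        self.action_grads = tf.gradients(self.Q_output, self.actor_input)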