=======================experience_replay.py=============
from collections import deque
import random

import numpy as np
class Experience_replay:
    def __init__(self, size, action_dim, state_dim):
        # Bounded buffer: once `size` memories are stored, the oldest is dropped.
        self.d = deque(maxlen=size)
        self.action_dim = action_dim
        self.state_dim = state_dim

    def experience_in(self, memory):
        # `memory` is a (state, action, reward, next_state, terminal) tuple.
        self.d.append(memory)
    def experience_out(self, sample_size):
        # Uniformly sample a minibatch without replacement and stack each
        # tuple field into a batch-shaped array.
        s_list = random.sample(self.d, sample_size)
        rs = np.asarray([i[0] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        ra = np.asarray([i[1] for i in s_list], dtype=np.float32).reshape((sample_size, self.action_dim))
        rr = np.asarray([i[2] for i in s_list], dtype=np.float32).reshape((sample_size, 1))
        rss = np.asarray([i[3] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        rt = np.asarray([i[4] for i in s_list], dtype=bool).reshape((sample_size, 1))  # np.bool was removed in NumPy >= 1.24
        return rs, ra, rr, rss, rt
    def experience_out_partly(self, sample_size, part_experience_size):
        # Sample states only (with replacement) from the first
        # `part_experience_size` memories in the buffer.
        sample_index = np.random.randint(0, part_experience_size, sample_size).tolist()
        rs = np.asarray([self.d[i][0] for i in sample_index], dtype=np.float32).reshape((sample_size, self.state_dim))
        return rs
#############test###########
if __name__ == "__main__":
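    # Minimal smoke test; the sizes and dimensions below are illustrative
    # assumptions, not values used elsewhere in the project.
    er = Experience_replay(size=100, action_dim=2, state_dim=4)
    for _ in range(10):
        s = np.random.rand(4).astype(np.float32)
        a = np.random.rand(2).astype(np.float32)
        er.experience_in((s, a, 1.0, s, False))
    rs, ra, rr, rss, rt = er.experience_out(sample_size=5)
    print(rs.shape, ra.shape, rr.shape, rss.shape, rt.shape)  # (5, 4) (5, 2) (5, 1) (5, 4) (5, 1)
    print(er.experience_out_partly(sample_size=3, part_experience_size=10).shape)  # (3, 4)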
============================Critic.py=========================
# NOTE: written against TensorFlow 1.x; tf.contrib was removed in TF 2.x.
import math

import tensorflow as tf
from tensorflow.contrib import layers
class Critic:
    def __init__(self, sess, action_dim, state_dim):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Hyperparameters.
        self.batch_size = 32
        self.GAMMA = 0.9                 # discount factor
        self.num_units_l1 = 50           # hidden units, layer 1
        self.num_units_l2 = 40           # hidden units, layer 2
        self.learning_rate = 0.001
        self.update_TDnet_rate = 0.2     # soft-update rate for the target ("TD") network
        self.reg = layers.l2_regularizer(0.006)
        self.init_var = 0.01

        # Minibatch inputs.
        self.state_input = tf.placeholder(dtype=tf.float32, shape=[None, self.state_dim], name='state_input')
        self.actor_input = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim], name='actor_input')
        self.Q_value_input = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='TD_Q_value_input')
        self.reward = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='reward')
        self.terminal = tf.placeholder(dtype=tf.bool, shape=[None, 1], name='terminal')

        # Online Q network.
        with tf.variable_scope('critic'):
            self.Q_output, self.Q_net_var_set = self.create_network(trainable=True)
        # Target ("TD") network, updated toward the online network at rate
        # update_TDnet_rate (scope and attribute names here are assumed).
        with tf.variable_scope('critic_TD'):
            self.TD_Q_output, self.TD_net_var_set = self.create_network(trainable=False)