Reinforcement Learning -- DDPG -- TensorFlow Implementation

Full code: https://github.com/zle1992/Reinforcement_Learning_Game

Paper: "Continuous control with deep reinforcement learning" https://arxiv.org/pdf/1509.02971.pdf

Deep Deterministic Policy Gradient (DDPG)

Differences between DDPG and AC (Actor-Critic):

AC:

  Actor: updates its parameters with td_error, which comes from the Critic

  Critic: updates its gradient from the Bellman equation of the state-value function V(s)

DDPG:

  Actor: maximizes q and outputs the action

  Critic: updates its gradient from the Bellman equation of the action-value function Q(s, a) and outputs the q value

 

DDPG can only produce continuous action outputs.
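To make the contrast above concrete, here is a tiny numpy sketch (not code from the post or the repo) of the two training signals, using made-up numbers: AC scales the policy update by a TD error computed from V(s), while DDPG's actor directly minimizes -mean(Q):

import numpy as np

gamma = 0.9

# AC: the Actor's update is scaled by td_error, which comes from the Critic's V(s)
r, v_s, v_s_next = 0.3, 1.2, 1.5          # hypothetical reward and state values
td_error = r + gamma * v_s_next - v_s      # the Critic regresses this toward 0
print('AC td_error:', td_error)            # 0.45

# DDPG: the Actor maximizes Q(s, mu(s)), i.e. minimizes -mean(q)
q = np.array([0.8, 1.1, 0.5])              # hypothetical Q(s, mu(s)) over a batch
actor_loss = -q.mean()
print('DDPG actor loss:', actor_loss)      # -0.8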

Logic walkthrough:

1. DDPG is an Actor-Critic style model; its inputs include (S, R, S_, A).

 

2. Actor

input: S

output: a

loss: maximize q (implemented as minimizing -mean(q))

q comes from the Critic

 

3. Critic

input: S, A

output: q

loss: R + GAMMA * q_ - q (the TD error; the code minimizes its square)

 

Here comes the question: how do we get q_? ----> The Critic network could take (S_, a_) and output q_, but we cannot use the same network for this, so, exploiting the time offset, we use a second, non-trainable Critic2.

Critic2 needs a_; how do we get it? -----> The Actor network can take S_ as input and produce a_, so likewise we use a non-trainable Actor2 to get a_.
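Actor2 and Critic2 are the target networks. Their weights are never trained by gradient descent; instead, after every learning step they are nudged toward the trainable (eval) networks, the "soft replacement" that appears later in DDPG.py with TAU = 0.01. A minimal numpy sketch of that rule, with made-up weights:

import numpy as np

TAU = 0.01
eval_w   = np.array([0.50, -0.20, 1.00])   # hypothetical trainable (eval) parameters
target_w = np.array([0.40, -0.10, 0.90])   # hypothetical target-network parameters

# soft replacement: target <- (1 - TAU) * target + TAU * eval, run once per learn() call
target_w = (1 - TAU) * target_w + TAU * eval_w
print(target_w)   # moves only 1% of the way toward eval_w each update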

 

Flow

a  = actor(s, train)

a_ = actor(s_, not_train)

q  = critic(s, a, train)

q_ = critic(s_, a_, not_train)

a_loss = -mean(q)                 # maximize q

c_loss = mse(r + GAMMA * q_, q)   # mean squared TD error
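Written out numerically, the two losses are just a negative mean of q and a mean-squared TD error. A small numpy sketch with made-up batch values (the real TensorFlow version follows in DDPG.py):

import numpy as np

GAMMA = 0.9
r  = np.array([0.1, -0.2, 0.3])         # rewards
q  = np.array([1.0,  0.8, 1.2])         # critic(s, a),   trainable network
q_ = np.array([1.1,  0.9, 1.0])         # critic(s_, a_), target (non-trainable) network

a_loss = -q.mean()                       # maximize q  <=>  minimize -mean(q)
q_target = r + GAMMA * q_                # Bellman target; no gradient flows into it
c_loss = np.mean((q_target - q) ** 2)    # mean squared TD error

print(a_loss, c_loss)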

 

 

 

 

Code:

DDPG.py

import os
import numpy as np
import tensorflow as tf
from abc import ABCMeta, abstractmethod
np.random.seed(1)
tf.set_random_seed(1)

import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# with the base level set to DEBUG, all messages below are printed to the console

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
session = tf.Session(config=tfconfig)


class DDPG(object):
    """Abstract DDPG agent: subclasses must implement the actor and critic networks."""
    __metaclass__ = ABCMeta

    def __init__(self,
            n_actions,
            n_features,
            reward_decay,
            lr_a,
            lr_c,
            memory_size,
            output_graph,
            log_dir,
            model_dir,
            TAU,
            a_bound,
            ):
        super(DDPG, self).__init__()

        self.n_actions = n_actions
        self.n_features = n_features
        self.gamma = reward_decay
        self.memory_size = memory_size
        self.output_graph = output_graph
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.log_dir = log_dir

        self.model_dir = model_dir
        # total learning step
        self.learn_step_counter = 0
        self.TAU = TAU     # soft replacement rate
        self.a_bound = a_bound

        self.s = tf.placeholder(tf.float32, [None] + self.n_features, name='s')
        self.s_next = tf.placeholder(tf.float32, [None] + self.n_features, name='s_next')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')

        with tf.variable_scope('Actor'):
            # eval actor outputs a for the current state; target actor outputs a_ for the next state
            self.a = self._build_a_net(self.s, scope='eval', trainable=True)
            a_ = self._build_a_net(self.s_next, scope='target', trainable=False)

        with tf.variable_scope('Critic'):
            # eval critic scores (s, a); target critic scores (s_, a_)
            q = self._build_c_net(self.s, self.a, scope='eval', trainable=True)
            q_ = self._build_c_net(self.s_next, a_, scope='target', trainable=False)

        # network parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        with tf.variable_scope('train_op_actor'):
            # maximize q  <=>  minimize -mean(q); only the eval actor's variables are updated
            self.loss_actor = -tf.reduce_mean(q)
            self.train_op_actor = tf.train.AdamOptimizer(self.lr_a).minimize(
                self.loss_actor, var_list=self.ae_params)

        with tf.variable_scope('train_op_critic'):
            # Bellman target built from the target networks; only the eval critic's variables are updated
            # (r is reshaped to a column so it lines up with q_ of shape [batch, 1])
            q_target = tf.expand_dims(self.r, 1) + self.gamma * q_
            self.loss_critic = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.train_op_critic = tf.train.AdamOptimizer(self.lr_c).minimize(
                self.loss_critic, var_list=self.ce_params)

        # soft target replacement: target <- (1 - TAU) * target + TAU * eval
        self.soft_replace = [tf.assign(t, (1 - self.TAU) * t + self.TAU * e)
                             for t, e in zip(self.at_params + self.ct_params,
                                             self.ae_params + self.ce_params)]

        self.sess = tf.Session()
        if self.output_graph:
            tf.summary.FileWriter(self.log_dir, self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.cost_his = [0]
        self.cost = 0

        self.saver = tf.train.Saver()

        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        checkpoint = tf.train.get_checkpoint_state(self.model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Loading Successfully")
            self.learn_step_counter = int(checkpoint.model_checkpoint_path.split('-')[-1]) + 1

    @abstractmethod
    def _build_a_net(self, s, scope, trainable):
        raise NotImplementedError

    @abstractmethod
    def _build_c_net(self, s, a, scope, trainable):
        raise NotImplementedError

    def learn(self, data):
        # soft target replacement
        self.sess.run(self.soft_replace)

        batch_memory_s = data['s']
        batch_memory_a = data['a']
        batch_memory_r = data['r']
        batch_memory_s_ = data['s_']

        # update the actor: it only needs the states
        _, cost = self.sess.run(
            [self.train_op_actor, self.loss_actor],
            feed_dict={
                self.s: batch_memory_s,
            })

        # update the critic: it needs the full transition
        _, cost = self.sess.run(
            [self.train_op_critic, self.loss_critic],
            feed_dict={
                self.s: batch_memory_s,
                self.a: batch_memory_a,
                self.r: batch_memory_r,
                self.s_next: batch_memory_s_,
            })

        self.cost_his.append(cost)
        self.cost = cost
        self.learn_step_counter += 1
        # save the network every 10000 iterations
        if self.learn_step_counter % 10000 == 0:
            self.saver.save(self.sess, self.model_dir, global_step=self.learn_step_counter)

    def choose_action(self, s):
        return self.sess.run(self.a, {self.s: s[np.newaxis, :]})[0]

 

game.py

 

import sys
import gym
import numpy as np
import tensorflow as tf
sys.path.append('./')
sys.path.append('model')

from util import Memory, StateProcessor
from DDPG import DDPG
from ACNetwork import ACNetwork
np.random.seed(1)
tf.set_random_seed(1)

import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# with the base level set to DEBUG, all messages below are printed to the console
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
session = tf.Session(config=tfconfig)


class DDPG4Pendulum(DDPG):
    """DDPG agent for Pendulum-v0 with concrete actor and critic networks."""
    def __init__(self, **kwargs):
        super(DDPG4Pendulum, self).__init__(**kwargs)

    def _build_a_net(self, s, scope, trainable):
        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
        with tf.variable_scope(scope):
            e1 = tf.layers.dense(inputs=s,
                    units=30,
                    bias_initializer=b_initializer,
                    kernel_initializer=w_initializer,
                    activation=tf.nn.relu,
                    trainable=trainable)
            a = tf.layers.dense(inputs=e1,
                    units=self.n_actions,
                    bias_initializer=b_initializer,
                    kernel_initializer=w_initializer,
                    activation=tf.nn.tanh,
                    trainable=trainable)

        # tanh output in [-1, 1], scaled to the environment's action bound
        return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c_net(self, s, a, scope, trainable):
        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', self.n_features + [n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.n_actions, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            # first layer mixes the state and action inputs
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            q = tf.layers.dense(inputs=net,
                    units=1,
                    bias_initializer=b_initializer,
                    kernel_initializer=w_initializer,
                    activation=None,
                    trainable=trainable)

        return q


batch_size = 32

memory_size = 10000
env = gym.make('Pendulum-v0')  # continuous action space


n_features = [env.observation_space.shape[0]]
n_actions = env.action_space.shape[0]
a_bound = env.action_space.high
env = env.unwrapped
MAX_EP_STEPS = 200


def run():

    RL = DDPG4Pendulum(
        n_actions=n_actions,
        n_features=n_features,
        reward_decay=0.9,
        lr_a=0.001,
        lr_c=0.002,
        memory_size=memory_size,
        TAU=0.01,
        output_graph=False,
        log_dir='Pendulum/log/DDPG4Pendulum/',
        a_bound=a_bound,
        model_dir='Pendulum/model_dir/DDPG4Pendulum/'
        )

    memory = Memory(n_actions, n_features, memory_size=memory_size)

    var = 3  # controls exploration
    step = 0

    for episode in range(2000):
        # initial observation
        observation = env.reset()
        ep_r = 0

        for j in range(MAX_EP_STEPS):

            # RL chooses an action based on the observation
            action = RL.choose_action(observation)
            action = np.clip(np.random.normal(action, var), -2, 2)    # add randomness to action selection for exploration
            # RL takes the action and gets the next observation and reward
            observation_, reward, done, info = env.step(action)

            memory.store_transition(observation, action, reward / 10, observation_)

            if step > memory_size:
                # env.render()
                var *= .9995    # decay the action randomness
                data = memory.sample(batch_size)
                RL.learn(data)

            # swap observations
            observation = observation_
            ep_r += reward
            if episode > 200:
                env.render()  # render on the screen
            # break the inner loop at the end of this episode
            if j == MAX_EP_STEPS - 1:
                print('step: ', step,
                      'episode: ', episode,
                      'ep_r: ', round(ep_r, 2),
                      'var:', var,
                      # 'loss: ', RL.cost
                      )
                break
            step += 1

    # end of game
    print('game over')
    env.close()


def main():

    run()


if __name__ == '__main__':
    main()
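One detail worth noting in game.py: exploration comes from Gaussian noise with standard deviation var added to the deterministic action, and var shrinks by a factor of 0.9995 once per learning step. A quick standalone check (not part of the repo) of how fast that decays:

import math

var0, decay = 3.0, 0.9995
steps_to_halve = math.log(0.5) / math.log(decay)          # ~1386 learn steps to halve the std
steps_to_small = math.log(0.1 / var0) / math.log(decay)   # ~6800 learn steps until the std reaches ~0.1
print(round(steps_to_halve), round(steps_to_small))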

 

Reposted from: https://www.cnblogs.com/zle1992/p/10247326.html
