[Python 3 NumPy tips] Implementing the ReLU activation for a NumPy array in one statement

The idea is to set every element of a NumPy array that is less than zero to zero, just like the ReLU activation function in a neural network. Here is the code:

```python
Idata = 1 * (Idata > 0) * Idata
```
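
For a quick demonstration, here is a minimal runnable sketch (the array values below are made up purely for illustration):

```python
import numpy as np

# Made-up sample data, purely for illustration
Idata = np.array([-2.0, -0.5, 0.0, 1.5, 3.0])

# One-statement ReLU: the comparison produces a 0/1 mask that zeroes out negatives
Idata = 1 * (Idata > 0) * Idata
print(Idata)  # negative entries are replaced by 0.0
```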

It can be broken down as follows:

```python
temp = Idata > 0      # boolean mask: True where the element is positive
temp = 1 * temp       # cast the boolean mask to integers (0/1)
Idata = temp * Idata  # multiplying by the mask zeroes out the negative elements
```
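
To double-check that the decomposed steps match the one-liner, a small sanity check is sketched below (the test array is made up, and NumPy's built-in `np.maximum` is used only as an independent reference for ReLU):

```python
import numpy as np

# Made-up test array, purely for illustration
x = np.array([-3.0, -1.0, 0.0, 2.0, 5.0])

# One-statement version
one_liner = 1 * (x > 0) * x

# Decomposed version, step by step
temp = x > 0           # boolean mask
temp = 1 * temp        # cast the mask to 0/1 integers
decomposed = temp * x  # zero out the negative elements

# Both should agree with the element-wise maximum against 0
assert np.array_equal(one_liner, decomposed)
assert np.array_equal(one_liner, np.maximum(x, 0))
print("all versions agree")
```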

I discovered this while writing my own code. Python is so powerful!
