Algorithm Problem: Topological Sort - 06 - Reward

Dandelion's uncle is the boss of a factory. As the Spring Festival is coming, he wants to distribute rewards to his workers, but he is having trouble deciding how to distribute them.
The workers will compare their rewards, and some of them have demands about the distribution, such as "a's reward should be more than b's". Dandelion's uncle wants to fulfill all the demands while spending the least money possible. Every worker's reward will be at least 888, because it is a lucky number.
Input
The input contains multiple test cases, read until end of file. Each case begins with one line containing two integers n and m, the number of workers and the number of demands (n <= 10000, m <= 20000).
Then m lines follow; each contains two integers a and b, meaning that a's reward should be more than b's.
Output
For every case, print the least amount of money Dandelion's uncle needs to distribute. If it is impossible to fulfill all the workers' demands, print -1.
Sample Input
2 1
1 2
2 2
1 2
2 1
Sample Output
1777
-1

Approach: The boss plans to hand out rewards to his workers, but the workers have demands: an input pair "a b" means worker a's reward must be more than worker b's. Every worker gets at least 888 yuan, and the boss wants to know the least total amount he has to pay. So the lowest level earns 888, the level just above it earns 889, and so on. Reversing the edges (adding an edge b -> a for each demand) makes this easy to compute: a topological sort on the reversed graph starts from the lowest-paid workers and raises the reward by one yuan per level. For the first sample, worker 2 gets 888 and worker 1 gets 889, giving 888 + 889 = 1777; the second sample contains a cycle (1 > 2 and 2 > 1), so the demands cannot all be fulfilled and the answer is -1.

#include<iostream>
#include<cstdio>
#include<cstring>
#include<vector>
#include<queue>
using namespace std;
vector<int> map[10005];   // reversed adjacency list: demand "a b" becomes edge b -> a
int in[10005];            // in-degree of each node in the reversed graph
int money[10005];         // reward finally paid to each worker
int n, m, i, ans;
int toposort() {
	ans = 0;
	queue<int> q;
	int num = 0;                          // how many workers already have a fixed reward
	for (i = 1; i <= n; i++) {
		if (in[i] == 0) {                 // nobody has to earn less than i: lowest level
			money[i] = 888;
			q.push(i);
			num++;
		}
	}
	while (!q.empty()) {
		int u = q.front();
		q.pop();
		for (i = 0; i < map[u].size(); i++) {
			int v = map[u][i];
			in[v]--;
			if (in[v] == 0) {             // every worker paid less than v is settled
				num++;
				q.push(v);
				money[v] = money[u] + 1;  // one yuan more than the last (best-paid) predecessor
			}
		}
		if (num == n) {                   // all rewards fixed: sum them up
			for (i = 1; i <= n; i++) {
				ans += money[i];
			}
			return ans;
		}
	}
	return -1;                            // a cycle remains: the demands cannot all be met
}
int main() {
	while (scanf("%d %d", &n, &m) != EOF) {
		memset(in, 0, sizeof(in));
		memset(money, 0, sizeof(money));
		for (i = 0; i <= n; i++) {
			map[i].clear();
		}
		while (m--) {
			int a, b;
			scanf("%d %d", &a, &b);
			map[b].push_back(a);          // reversed edge: b is paid less, a must be paid more
			in[a]++;
		}
		int z = toposort();
		printf("%d\n", z);
	}
	return 0;
}
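
Note on the relaxation step: the program above assigns money[v] = money[u] + 1 only at the moment in[v] drops to zero, i.e. it copies from the last predecessor processed. This is still correct because a FIFO queue dequeues nodes in non-decreasing order of money, so the last predecessor is also the best-paid one. An equivalent formulation that makes the longest-path idea explicit relaxes with max() on every edge. The following is a minimal self-contained sketch of that variant, not part of the original submission; the function name solve and the vector parameters are illustrative choices only:

#include<vector>
#include<queue>
#include<algorithm>
using namespace std;

// Reverse topological sort with an explicit max() relaxation.
// adj[b] lists every a whose reward must exceed b's (edge b -> a);
// indeg[v] is v's in-degree in this reversed graph.
// Returns the minimum total payout, or -1 if the demands contain a cycle.
long long solve(int n, const vector<vector<int>>& adj, vector<int> indeg) {
	vector<int> money(n + 1, 888);          // everyone starts at the lucky 888
	queue<int> q;
	for (int i = 1; i <= n; i++)
		if (indeg[i] == 0) q.push(i);       // lowest-paid workers enter first
	int processed = 0;
	long long total = 0;
	while (!q.empty()) {
		int u = q.front(); q.pop();
		processed++;
		total += money[u];                  // money[u] is final once u is dequeued
		for (int v : adj[u]) {
			money[v] = max(money[v], money[u] + 1);  // at least one yuan more than u
			if (--indeg[v] == 0) q.push(v);
		}
	}
	return processed == n ? total : -1;     // a cycle leaves some node unprocessed
}

On the two samples this sketch also yields 1777 and -1, matching the expected output.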