4-1 Deque (25 points)

This article describes a double-ended queue (deque) implemented with a doubly linked list, supporting the four operations Push, Pop, Inject, and Eject, each in O(1) time. It covers the design of the data structure and the function interface, and provides a sample program with notes on input and output.

A "deque" is a data structure consisting of a list of items, on which the following operations are possible:

  • Push(X,D): Insert item X on the front end of deque D.
  • Pop(D): Remove the front item from deque D and return it.
  • Inject(X,D): Insert item X on the rear end of deque D.
  • Eject(D): Remove the rear item from deque D and return it.

Write routines to support the deque that take O(1) time per operation.
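For example, starting from an empty deque, Push(1,D), Inject(2,D) and then Push(3,D) leave the items 3, 1, 2 from front to rear; a subsequent Pop(D) returns 3 and Eject(D) returns 2.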

Format of functions:

Deque CreateDeque();
int Push( ElementType X, Deque D );
ElementType Pop( Deque D );
int Inject( ElementType X, Deque D );
ElementType Eject( Deque D );

where Deque is defined as the following:

typedef struct Node *PtrToNode;
struct Node {
    ElementType Element;
    PtrToNode Next, Last;
};
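
The page is truncated at this point: the record type that holds the deque's Front and Rear pointers, along with the sample program and the input/output notes mentioned in the summary, is cut off. The sketch below shows one way to meet the O(1) requirement, assuming a hypothetical DequeRecord built around a dummy header node, an int ElementType, Push/Inject returning 1 on success, and an ERROR value returned by Pop/Eject on an empty deque; these names and conventions are assumptions, not part of the original problem statement.

#include <stdlib.h>

typedef int ElementType;            /* assumed; the real type comes from the judge's test program */
#define ERROR -1                    /* assumed return value for Pop/Eject on an empty deque */

typedef struct Node *PtrToNode;
struct Node {
    ElementType Element;
    PtrToNode Next, Last;
};
typedef struct DequeRecord *Deque;  /* hypothetical name for the truncated record type */
struct DequeRecord {
    PtrToNode Front, Rear;          /* Front always points to a dummy header; Rear to the last item */
};

Deque CreateDeque()
{
    Deque D = (Deque)malloc(sizeof(struct DequeRecord));
    PtrToNode header = (PtrToNode)malloc(sizeof(struct Node));   /* dummy header node */
    header->Next = header->Last = NULL;
    D->Front = D->Rear = header;    /* empty deque: both ends point to the header */
    return D;
}

int Push( ElementType X, Deque D )
{
    PtrToNode p = (PtrToNode)malloc(sizeof(struct Node));
    if ( !p ) return 0;
    p->Element = X;
    p->Next = D->Front->Next;       /* link p right after the dummy header */
    p->Last = D->Front;
    if ( D->Front->Next )
        D->Front->Next->Last = p;   /* old front item points back to p */
    else
        D->Rear = p;                /* deque was empty, so p is also the rear */
    D->Front->Next = p;
    return 1;
}

ElementType Pop( Deque D )
{
    PtrToNode p = D->Front->Next;   /* first real node */
    if ( !p ) return ERROR;         /* empty deque */
    ElementType X = p->Element;
    D->Front->Next = p->Next;
    if ( p->Next )
        p->Next->Last = D->Front;
    else
        D->Rear = D->Front;         /* removed the only item */
    free(p);
    return X;
}

int Inject( ElementType X, Deque D )
{
    PtrToNode p = (PtrToNode)malloc(sizeof(struct Node));
    if ( !p ) return 0;
    p->Element = X;
    p->Next = NULL;                 /* link p after the current rear */
    p->Last = D->Rear;
    D->Rear->Next = p;
    D->Rear = p;
    return 1;
}

ElementType Eject( Deque D )
{
    if ( D->Rear == D->Front ) return ERROR;   /* empty deque */
    PtrToNode p = D->Rear;
    ElementType X = p->Element;
    D->Rear = p->Last;
    D->Rear->Next = NULL;
    free(p);
    return X;
}

The dummy header keeps every case uniform: the deque is empty exactly when Front and Rear both point to the header, and each operation touches only a constant number of pointers, which is what gives the required O(1) bound.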