import itertools
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class BPModel(nn.Module):
    # Simple three-layer MLP shared by the actor (policy) and the critic (value) networks
    def __init__(self, n_x, n_y):
        super(BPModel, self).__init__()
        self.layer1 = nn.Linear(n_x, 10)
        self.layer2 = nn.Linear(10, 10)
        self.layer3 = nn.Linear(10, n_y)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        output = self.layer3(x)
        return output
class PolicyGradient():
    def __init__(self, n_x, n_y, learning_rate=0.01, load_path=None, save_path=None):
        self.n_x = n_x                    # policy network input size (state dimension)
        self.n_y = n_y                    # policy network output size (number of actions)
        self.lr = learning_rate           # learning rate
        self.episode_rewards = []         # stores the TD errors collected along one episode
        self.model = BPModel(n_x, n_y)    # build the policy network
        self.loss_func = nn.MSELoss()     # mean-squared-error loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def store_transition(self, error):
        self.episode_rewards.append(error)  # store the TD error of each step
    def predict(self, state):
        state = torch.FloatTensor(state[np.newaxis, :])        # shape (1, n_x)
        prob_weights = self.model(state)
        prob_weights = torch.softmax(prob_weights, dim=1)       # action probabilities
        probs = prob_weights.detach().numpy().ravel()
        action = np.random.choice(range(len(probs)), p=probs)   # sample an action from the policy
        return action
    def learn(self, state, action, reward):
        # disc_ep_reward = np.zeros_like(self.episode_rewards)  # per-step state values
        # disc_ep_reward -= np.mean(disc_ep_reward)             # subtract the mean
        # disc_ep_reward /= np.std(disc_ep_reward)              # divide by the standard deviation
        action__ = np.zeros(self.n_y)                            # one-hot encoding of the chosen action
        action__[action] = 1
        # X = np.vstack(state)
        # Y = np.vstack(action__)
        X = torch.FloatTensor(state[np.newaxis, :])
        Y = torch.FloatTensor(action__[np.newaxis, :])
        outputs = torch.softmax(self.model(X), dim=1)            # action probabilities
        # The algorithm itself has no real "error" here; the idea of this step is to make
        # the chosen action more likely to be selected the next time this state is seen.
        neg_log_prob = self.loss_func(outputs, Y)
        # Weight by the TD error supplied by the critic (its evaluation of the action):
        # it decides whether the probability of this action should be increased,
        # i.e. the value estimate corrects the action-selection probabilities.
        loss = torch.mean(neg_log_prob * reward)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.episode_rewards = []
class ValueEstimator():
    def __init__(self, n_x, n_y,
                 learning_rate=0.01, load_path=None, save_path=None):
        self.n_x = n_x
        self.n_y = n_y
        self.lr = learning_rate
        self.episode_rewards = []
        self.model = BPModel(n_x, n_y)    # build the value network
        self.loss_func = nn.MSELoss()     # mean-squared-error loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def store_transition(self, target):
        self.episode_rewards.append(target)
    def predict(self, state):
        state = torch.FloatTensor(state[np.newaxis, :])  # shape (1, n_x)
        # estimated state value (ReLU keeps the estimate non-negative)
        value = torch.relu(self.model(state))
        return value[0]
    def learn(self, state, target):
        X = torch.FloatTensor(state[np.newaxis, :])
        Y = target.reshape(1, 1)            # the target value (TD target) serves as the label
        outputs = self.model(X)             # predicted state value
        loss = self.loss_func(outputs, Y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.episode_rewards = []
class Actor_Critic():
    def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95):
        self.nA = env.action_space.n                 # number of actions
        self.nS = env.observation_space.shape[0]     # state dimension
        self.nR = 1                                  # the critic outputs a single state value
        self.env = env
        self.num_episodes = num_episodes
        self.reward_decay = reward_decay
        self.learning_rate = learning_rate
        self.rewards = []
        self.RENDER_REWARD_MIN = 50                  # minimum reward threshold for rendering
        self.RENDER_ENV = False
        self.actor = PolicyGradient(n_x=self.nS, n_y=self.nA,
                                    learning_rate=self.learning_rate)
        self.critic = ValueEstimator(n_x=self.nS, n_y=self.nR,
                                     learning_rate=self.learning_rate)
        record_head = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
        self.record = record_head(
            episode_lengths=np.zeros(num_episodes),
            episode_rewards=np.zeros(num_episodes))
    def mcpg_learn(self):
        for i_episode in range(self.num_episodes):                   # iterate over episodes
            state = self.env.reset()                                  # reset the environment
            reward_ = 0                                               # total reward of this episode
            for t in itertools.count():                               # step through the episode
                action = self.actor.predict(state)                    # actor picks an action from the policy network
                next_state, reward, done, _ = self.env.step(action)   # execute the action
                reward_ += reward                                     # accumulate the total reward
                value_next = self.critic.predict(next_state).detach()        # critic's value of the next state
                td_target = reward + self.reward_decay * value_next          # TD target
                td_error = td_target - self.critic.predict(state).detach()   # TD error
                self.critic.learn(state, td_target)                   # update the value network (critic)
                self.actor.learn(state, action, td_error)             # update the policy network (actor)
                self.record.episode_rewards[i_episode] += reward
                self.record.episode_lengths[i_episode] = t
                if done:
                    self.rewards.append(reward_)
                    max_reward = np.amax(self.rewards)
                    break
                state = next_state
        return self.record
```
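
Below is a minimal usage sketch for these classes. It assumes a Gym classic-control environment such as CartPole-v1 and the pre-0.26 gym API that the code above relies on (`reset()` returning only the state, `step()` returning four values); the environment name and episode count are only illustrative.

```
import gym

env = gym.make("CartPole-v1")
agent = Actor_Critic(env, num_episodes=200, learning_rate=0.01, reward_decay=0.95)
record = agent.mcpg_learn()

# Quick training summary: average episode reward over the last 10 episodes
print("mean reward (last 10 episodes):", record.episode_rewards[-10:].mean())
```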