import itertools
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class Monte_Carlo_Policy_Gradient():
    def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95):
        self.nA = env.action_space.n              # size of the action space
        self.nS = env.observation_space.shape[0]  # size of the state space
        self.env = env                            # environment
        self.num_episodes = num_episodes          # number of episodes (trajectories) to run
        self.reward_decay = reward_decay          # reward discount factor
        self.learning_rate = learning_rate        # learning rate of the policy network
        self.rewards = []                         # cumulative reward of every episode
        self.RENDER_REWARD_WIN = 20               # reward threshold that switches rendering on
        self.RENDER_ENV = False                   # whether to render the environment
        self.PG = PolicyGradient(n_x=self.nS, n_y=self.nA, learning_rate=self.learning_rate, reward_decay=self.reward_decay)
        record_head = namedtuple('states', ['episode_lengths', 'episode_rewards'])
        self.record = record_head(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))
    def mcpg_learn(self):
        for i_episode in range(self.num_episodes):                   # iterate over episodes (trajectories)
            state = self.env.reset()                                 # reset the environment
            reward = 0                                               # initialise the reward to 0
            for t in itertools.count():                              # step through one episode
                if self.RENDER_ENV:                                  # render the environment once the flag is set
                    self.env.render()
                action = self.PG.choose_action(state)                # step 1: the policy network picks an action for the current state
                next_state, reward, done, _ = self.env.step(action)  # step 2: the environment executes the action and returns feedback
                self.PG.store_memory(state, action, reward)          # step 3: store the feedback as training data for the policy network
                self.record.episode_rewards[i_episode] += reward
                self.record.episode_lengths[i_episode] = t
                if done:                                             # episode finished
                    episode_rewards_sum = sum(self.PG.episode_rewards)  # cumulative reward collected in this episode
                    self.rewards.append(episode_rewards_sum)
                    max_reward = np.amax(self.rewards)
                    self.PG.learn()                                  # step 4: train the policy network at the end of the episode
                    if max_reward > self.RENDER_REWARD_WIN:          # once the best episode reward exceeds the threshold, turn rendering on so progress can be watched
                        self.RENDER_ENV = True
                    break                                            # leave this episode
                state = next_state                                   # step 5: the next state becomes the current state
        return self.record
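A minimal usage sketch (an assumption, not part of the original post): it wires the trainer to the classic CartPole-v0 task and assumes the pre-0.26 gym API, in which env.step returns a 4-tuple as the loop above expects.

import gym

env = gym.make('CartPole-v0')                               # illustrative environment choice
agent = Monte_Carlo_Policy_Gradient(env, num_episodes=200)
record = agent.mcpg_learn()                                 # namedtuple of episode lengths / rewards
print('mean episode reward:', np.mean(agent.rewards))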
class BPModel(nn.Module):
    def __init__(self, n_x, n_y):
        super(BPModel, self).__init__()
        self.layer1 = nn.Linear(n_x, 10)   # state -> 10 hidden units
        self.layer2 = nn.Linear(10, 10)    # 10 -> 10 hidden units
        self.layer3 = nn.Linear(10, n_y)   # 10 -> one score per action

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        output = self.layer3(x)            # raw action scores (logits); softmax is applied where probabilities are needed
        return output
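A quick shape check of the policy network (a sketch; the dimensions n_x=4, n_y=2 and the batch size are arbitrary): the forward pass returns one raw score per action, and softmax turns each row into a probability distribution.

net = BPModel(n_x=4, n_y=2)
dummy = torch.randn(5, 4)              # a batch of 5 random states
logits = net(dummy)                    # shape (5, 2): one score per action
probs = torch.softmax(logits, dim=1)   # each row now sums to 1
print(logits.shape, probs.sum(dim=1))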
class PolicyGradient():
    def __init__(self, n_x, n_y, learning_rate=0.01, reward_decay=0.95, load_path=None, save_path=None):
        self.n_x = n_x                    # input size of the policy network (state dimension)
        self.n_y = n_y                    # output size of the policy network (number of actions)
        self.lr = learning_rate           # learning rate
        self.reward_decay = reward_decay  # discount factor
        self.episode_states, self.episode_actions, self.episode_rewards = [], [], []  # samples collected along one episode
        self.model = BPModel(n_x, n_y)    # build the policy network
        self.loss_func = nn.CrossEntropyLoss(reduction='none')  # per-step cross-entropy on the logits
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def choose_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)           # shape (1, n_x)
        prob_action = torch.softmax(self.model(state), dim=1)   # forward pass, then turn the logits into action probabilities
        prob_action = prob_action.detach().numpy().ravel()
        action = np.random.choice(range(len(prob_action)), p=prob_action)  # sample an action according to its probability
        return action

    def store_memory(self, state, action, reward):
        self.episode_states.append(state)
        self.episode_rewards.append(reward)
        action__ = np.zeros(self.n_y)     # one-hot encoding of the chosen action
        action__[action] = 1
        self.episode_actions.append(action__)
    def learn(self):
        disc_ep_reward = np.zeros_like(self.episode_rewards, dtype=np.float64)  # discounted return of every step
        running_add = 0
        for t in reversed(range(0, len(self.episode_rewards))):  # accumulate the return backwards through the episode
            running_add = running_add * self.reward_decay + self.episode_rewards[t]
            disc_ep_reward[t] = running_add
        disc_ep_reward -= np.mean(disc_ep_reward)                 # subtract the mean
        disc_ep_reward /= np.std(disc_ep_reward)                  # divide by the standard deviation
        X = torch.FloatTensor(np.vstack(self.episode_states))     # states of the episode
        Y = torch.LongTensor(np.argmax(np.vstack(self.episode_actions), axis=1))  # chosen actions as class indices
        outputs = self.model(X)                                   # action logits
        neg_log_prob = self.loss_func(outputs, Y)                 # there is no error signal in the supervised sense; the cross-entropy is the negative log-probability of the action that was actually taken, so lowering it makes that action more likely next time
        loss = torch.mean(neg_log_prob * torch.FloatTensor(disc_ep_reward))  # weight by the normalised return: actions followed by high returns get their probability pushed up, i.e. the value corrects the action-selection probabilities
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.episode_states, self.episode_actions, self.episode_rewards = [], [], []
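To make the discounting and normalisation steps in learn() concrete, here is the same backward pass run by hand on a toy three-step reward sequence (the numbers are made up for illustration):

rewards = [1.0, 1.0, 1.0]                        # toy rewards of a 3-step episode
gamma = 0.95
returns = np.zeros_like(rewards, dtype=np.float64)
running_add = 0.0
for t in reversed(range(len(rewards))):
    running_add = running_add * gamma + rewards[t]
    returns[t] = running_add
print(returns)                                   # [2.8525 1.95   1.    ]
returns = (returns - returns.mean()) / returns.std()
print(returns)                                   # roughly [ 1.21  0.02 -1.24]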