# ε-Greedy Policy
```python
import numpy as np
from collections import defaultdict


def epsilon_greedy_policy(q, epsilon, nA):
    def __policy__(state):
        A = np.ones(nA, dtype=float) * epsilon / nA  # every action gets a base probability of epsilon / nA
        best = np.argmax(q[state])                   # greedy action: the one with the largest action value
        A[best] += 1.0 - epsilon                     # the greedy action receives the remaining probability mass
        return A
    return __policy__  # return the policy function itself, to be evaluated state by state
```
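
As a quick sanity check, here is a minimal sketch of how the returned policy spreads probability mass; the toy Q table and the state name `"s0"` are made up purely for illustration.

```python
# Minimal sketch: probe the epsilon-greedy policy on a made-up Q table.
toy_q = defaultdict(lambda: np.zeros(2))
toy_q["s0"] = np.array([1.0, 3.0])  # action 1 has the larger value in state "s0"

probe = epsilon_greedy_policy(toy_q, epsilon=0.1, nA=2)
print(probe("s0"))  # -> [0.05 0.95]: epsilon/nA on each action, plus 1 - epsilon on the greedy one
```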
# First-Visit Monte Carlo Control with a Fixed ε-Greedy Policy (No Exploring Starts)
```python
def mc_firstvisit_control_epsilon_greedy(env, num_episodes=100, epsilon=0.1,
                                         episode_endtime=10, discount=1.0):
    nA = env.action_space.n                # number of actions available in the environment
    Q = defaultdict(lambda: np.zeros(nA))  # action-value function Q(s, a)
    r_sum = defaultdict(float)             # cumulative return per state-action pair
    r_cou = defaultdict(float)             # visit counter per state-action pair
    # the policy closes over Q, so it improves automatically as Q is updated
    policy = epsilon_greedy_policy(Q, epsilon, nA)

    for i in range(num_episodes):
        # generate one episode by following the epsilon-greedy policy
        episode = []
        state = env.reset()
        for j in range(episode_endtime):
            action_prob = policy(state)    # probabilities trade off exploration and exploitation
            action = np.random.choice(np.arange(nA), p=action_prob)  # sample an action
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # policy evaluation: first-visit Monte Carlo update of Q
        for k, (state, action, reward) in enumerate(episode):
            sa_pair = (state, action)
            if any((x[0], x[1]) == sa_pair for x in episode[:k]):
                continue                   # first visit only: skip later occurrences of this pair
            first_visit_idx = k
            # discounted return from the first visit onwards
            G = sum(x[2] * np.power(discount, i)
                    for i, x in enumerate(episode[first_visit_idx:]))
            r_sum[sa_pair] += G            # accumulate the return
            r_cou[sa_pair] += 1.0          # update the visit count
            Q[state][action] = r_sum[sa_pair] / r_cou[sa_pair]  # average return as the Q estimate
    return Q
```
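
A minimal usage sketch follows. It assumes an older gym release whose `reset()`/`step()` signatures match the four-value unpacking used above (e.g. `Blackjack-v0`); the episode count and sample state are illustrative only.

```python
# Minimal usage sketch, assuming an older gym API (reset() returns the state,
# step() returns a 4-tuple) and the Blackjack-v0 environment.
import gym

env = gym.make("Blackjack-v0")
Q = mc_firstvisit_control_epsilon_greedy(env, num_episodes=50000, epsilon=0.1,
                                         episode_endtime=100, discount=1.0)

# Greedy action for one sample state: player sum 18, dealer shows 7, no usable ace.
sample_state = (18, 7, False)
print(np.argmax(Q[sample_state]))  # 0 = stick, 1 = hit in Blackjack-v0
```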