# Bellman equation (policy evaluation solved as a linear system)
def evaluate_bellman(env, policy, gamma=1.):
    """Evaluate a policy by solving the Bellman expectation equation exactly.

    Builds the linear system (I - gamma * P_pi) v = r_pi and solves it for
    the state values, then derives action values via a one-step lookahead.

    Args:
        env: gym-style discrete environment exposing ``nS`` (number of
            states), ``nA`` (number of actions) and ``P[state][action]``,
            a list of ``(prob, next_state, reward, done)`` tuples.
        policy: array of shape (nS, nA); ``policy[s][a]`` is the
            probability of taking action ``a`` in state ``s``.
        gamma: discount factor (default 1.0).

    Returns:
        Tuple ``(v, q)``: state values of shape (nS,) and action values
        of shape (nS, nA).
    """
    # `a` starts as the identity; subtracting gamma * pi * p turns it
    # into (I - gamma * P_pi). `b` accumulates the expected rewards r_pi.
    a, b = np.eye(env.nS), np.zeros((env.nS))
    # The last state is skipped — presumably the terminal state, whose
    # value stays 0 because its row of `a` remains an identity row.
    for state in range(env.nS - 1):
        for action in range(env.nA):
            pi = policy[state][action]
            for p, next_state, reward, done in env.P[state][action]:
                a[state, next_state] -= pi * gamma * p
                b[state] += pi * reward * p
    v = np.linalg.solve(a, b)
    # One-step lookahead: q(s, a) = sum over transitions of p * (r + gamma * v(s')).
    q = np.zeros((env.nS, env.nA))
    for state in range(env.nS - 1):
        for action in range(env.nA):
            for p, next_state, reward, done in env.P[state][action]:
                q[state][action] += (reward + gamma * v[next_state]) * p
    return v, q
# Sample a random stochastic policy: uniform noise per (state, action),
# then normalize each row so the action probabilities sum to 1.
policy = np.random.uniform(size=(env.nS, env.nA))
policy /= policy.sum(axis=1, keepdims=True)
print(policy.shape)
state_values, action_values = evaluate_bellman(env, policy)
# Inspect the environment's transition model (notebook-style bare expression).
env.P
# First index: the state.
# First key inside each entry: the action.
# Each tuple: (probability, next state, reward, done flag).
# For the matrix form of the Bellman implementation, see the notes pad.
# TODO: the Bellman optimality equation is not yet worked out here.