1. Importing Libraries
from typing import List, Optional, Tuple
import dataclasses
import numpy as np
2. Implementing the MDP
# A single environment interaction: (s, a, s', r, done).
@dataclasses.dataclass
class Transition:
    state: Tuple[int, int]
    action: str
    next_state: Tuple[int, int]
    reward: float
    termination: bool
class GridEnv:
    _states: np.ndarray
    _rewards: np.ndarray
    _action_semantics: List[str]
    _actions: np.ndarray
    _init_state: Tuple[int, int]
    _current_state: Tuple[int, int]
    _goal: Tuple[int, int]
    _pits: List[Tuple[int, int]]
    _transition_probabilities: np.ndarray
    def __init__(self,
                 rows: int,
                 cols: int,
                 step_cost: float,
                 goal: Tuple[int, int],
                 pits: Optional[List[Tuple[int, int]]] = None,
                 obstacles: Optional[List[Tuple[int, int]]] = None) -> None:
        # 0 marks a free cell, 1 marks an obstacle.
        self._states = np.zeros((rows, cols))
        obstacles = [] if obstacles is None else obstacles
        for r, c in obstacles:
            self._states[r, c] = 1
        # Every step costs `step_cost`; the goal pays +1 and each pit pays -1.
        self._rewards = -step_cost * np.ones((rows, cols))
        self._rewards[goal[0], goal[1]] = +1
        pits = [] if pits is None else pits
        for r, c in pits:
            self._rewards[r, c] = -1
        self._action_semantics = ['up', 'left', 'down', 'right']
        self._actions = np.array([[-1, 0], [0, -1], [1, 0], [0, 1]])
        # The agent starts in the bottom-left corner.
        self._init_state = (rows - 1, 0)
        self._current_state = self._init_state
        self._goal = goal
        self._pits = pits
        # The intended action succeeds with probability 0.8; with probability
        # 0.1 each, the agent slips to a neighbouring action instead.
        self._transition_probabilities = np.array([0.1, 0.8, 0.1])
    @property
    def actions(self) -> List[str]:
        return self._action_semantics

    @property
    def current_state(self) -> Tuple[int, int]:
        return self._current_state

    @property
    def reward(self) -> float:
        r, c = self._current_state
        return self._rewards[r, c]

    @property
    def termination(self) -> bool:
        return self._current_state == self._goal
    def render(self) -> None:
        # Print the grid with the agent ('x'), goal ('G'), and pits ('P').
        grid = np.array(self._states, dtype=str)
        r, c = self._current_state
        grid[r, c] = 'x'
        r, c = self._goal
        grid[r, c] = 'G'
        for r, c in self._pits:
            grid[r, c] = 'P'
        print(grid)
    def _transition(self, state: Tuple[int, int], a: int) -> Tuple[int, int]:
        # Wrap the (possibly shifted) action index into [0, n_actions).
        n_actions = len(self._actions)
        a = self._actions[a + n_actions if a < 0 else a % n_actions]
        # Clip the move to the grid; bumping into an obstacle leaves the
        # agent where it is.
        new_r = max(0, min(self._states.shape[0] - 1, state[0] + a[0]))
        new_c = max(0, min(self._states.shape[1] - 1, state[1] + a[1]))
        return (new_r, new_c) if self._states[new_r, new_c] == 0. else state
    def step(self, action: str) -> Transition:
        a_idx = self._action_semantics.index(action)
        # Noisy dynamics: the intended action is taken with probability 0.8,
        # and each perpendicular neighbour of it with probability 0.1.
        chosen_action = a_idx + np.random.choice([1, 0, -1], p=self._transition_probabilities)
        prev_state = self._current_state
        self._current_state = self._transition(self._current_state, chosen_action)
        return Transition(state=prev_state,
                          action=action,
                          next_state=self._current_state,
                          reward=self.reward,
                          termination=self.termination)

    def reset(self) -> None:
        self._current_state = self._init_state
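With the environment in place, instantiate a 3x4 grid with the goal at (0, 3), a pit at (1, 3), and an obstacle at (1, 1), then take a few random steps: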
my_env = GridEnv(rows=3, cols=4, step_cost=0.4, goal=(0, 3), pits=[(1, 3)], obstacles=[(1, 1)])
my_env.render()
print(my_env.current_state)
print(my_env.reward)
print(my_env.termination)
print(my_env.actions)

for _ in range(5):
    a = np.random.choice(my_env.actions)
    print(a)
    print(my_env.step(a))
    my_env.render()

my_env.reset()
my_env.render()
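Since step() returns a Transition carrying a termination flag, it is easy to roll out a whole episode. A minimal sketch of a random-walk rollout, assuming the GridEnv above (the run_episode helper and its step cap are illustrative, not part of the class):

def run_episode(env: GridEnv, max_steps: int = 100) -> float:
    # Follow a uniformly random policy until the goal is reached
    # or the step budget runs out; return the undiscounted return.
    env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        t = env.step(np.random.choice(env.actions))
        total_reward += t.reward
        if t.termination:
            break
    return total_reward

print(run_episode(my_env))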
3. Value Iteration
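Each sweep below applies the Bellman optimality backup for this reward-on-state formulation,

    V_{k+1}(s) = R(s) + gamma * max_a sum_{s'} P(s' | s, a) * V_k(s'),

and stops once the largest per-state change delta falls below max_error. Note that the goal is not modelled as absorbing: its value also bootstraps from its neighbours, which still converges since gamma < 1.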
size_r = my_env._states.shape[0]
size_c = my_env._states.shape[1]
V = np.zeros((size_r, size_c))
gamma = 0.8
max_error = 1e-8

while True:
    V_prime = V.copy()
    delta = 0
    for i in range(size_r * size_c):
        r = i // size_c
        c = i % size_c
        reward = my_env._rewards[r, c]
        V_max = None
        for a_idx in range(len(my_env._actions)):
            # The intended action a_idx slips to a_idx + 1 or a_idx - 1 with
            # the probabilities stored in the environment ([0.1, 0.8, 0.1]).
            possible_outcomes = [my_env._transition((r, c), a)
                                 for a in a_idx + np.array([1, 0, -1])]
            value = sum(my_env._transition_probabilities[k] * V[s_prime]
                        for k, s_prime in enumerate(possible_outcomes))
            if V_max is None or value > V_max:
                V_max = value
        V_prime[r, c] = reward + gamma * V_max
        err = abs(V[r, c] - V_prime[r, c])
        delta = max(delta, err)
    V = V_prime
    print(delta)
    if delta < max_error:
        break

print(V)

# 1/(1 - gamma) is the effective horizon implied by the discount factor.
T = 1 / (1 - gamma)
print(T)
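Once V has converged, a greedy policy can be read off with the same expected-value computation: because the reward R(s) is action-independent, the argmax over actions only needs the expected next-state value. A minimal sketch, reusing the environment's private fields the same way the loop above does (the extract_policy helper is illustrative, not part of the tutorial's code):

def extract_policy(env: GridEnv, V: np.ndarray) -> np.ndarray:
    # For each state, pick the action whose expected next-state value
    # under the slip probabilities is largest.
    rows, cols = env._states.shape
    policy = np.empty((rows, cols), dtype=object)
    for r in range(rows):
        for c in range(cols):
            expected_values = []
            for a_idx in range(len(env._actions)):
                outcomes = [env._transition((r, c), a)
                            for a in a_idx + np.array([1, 0, -1])]
                expected_values.append(
                    sum(env._transition_probabilities[k] * V[s_prime]
                        for k, s_prime in enumerate(outcomes)))
            policy[r, c] = env._action_semantics[int(np.argmax(expected_values))]
    return policy

print(extract_policy(my_env, V))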