[RL] MDP and Value Iteration

This post walks through an implementation of a Markov decision process (MDP): the definition of a Transition class, the construction and operations of a GridEnv environment class, and the detailed steps of the value iteration algorithm. A worked example shows how state transitions, reward computation, and value iteration toward the optimal policy play out in the GridEnv environment.


1. Importing Libraries

import dataclasses
from typing import List, Optional, Tuple

import numpy as np

2. MDP Implementation

@dataclasses.dataclass
class Transition:
    state: Tuple[int, int]
    action: str
    next_state: Tuple[int, int]
    reward: float
    termination: bool
        

class GridEnv:
    _states: np.ndarray
    _rewards: np.ndarray
    _action_semantics: List[str]
    _actions: np.ndarray
    _init_state: Tuple[int, int]
    _current_state: Tuple[int, int]
    _goal: Tuple[int, int]
    _pits: List[Tuple[int, int]]
    _transition_probabilities: np.ndarray
        
    def __init__(self, 
                 rows: int, 
                 cols: int,
                 step_cost: float,
                 goal: Tuple[int, int], 
                 pits: Optional[List[Tuple[int, int]]] = None,
                 obstacles: Optional[List[Tuple[int, int]]] = None) -> None:
        self._states = np.zeros((rows, cols))  # 0 = free cell, 1 = obstacle

        obstacles = [] if obstacles is None else obstacles
        pits = [] if pits is None else pits

        for r, c in obstacles:
            self._states[r, c] = 1

        # every step costs step_cost; the goal pays +1 and each pit pays -1
        self._rewards = -step_cost * np.ones((rows, cols))
        self._rewards[goal[0], goal[1]] = +1

        for r, c in pits:
            self._rewards[r, c] = -1
        
        # row i of self._actions is the (row, col) offset for self._action_semantics[i]
        self._action_semantics = ['up', 'left', 'down', 'right']
        self._actions = np.array([[-1, 0], [0, -1], [1, 0], [0, 1]])
        self._init_state = (rows - 1, 0)
        self._current_state = self._init_state
        self._goal = goal
        self._pits = pits
        
        # probabilities of veering right, going straight, veering left wrt the chosen action
        self._transition_probabilities = np.array([0.1, 0.8, 0.1])
        
    @property
    def actions(self) -> List[str]:
        return self._action_semantics
    
    @property
    def current_state(self) -> Tuple[int, int]:
        return self._current_state
    
    @property
    def reward(self) -> float:
        r, c = self._current_state
        return self._rewards[r, c]
    
    @property
    def termination(self) -> bool:
        return self._current_state == self._goal
    
    def render(self) -> None:
        grid = np.array(self._states, dtype=str)
        r, c = self._current_state
        grid[r, c] = 'x'
        r, c = self._goal
        grid[r, c] = 'G'
        
        for r, c in self._pits:
            grid[r, c] = 'P'

        print(grid)
        
    def _transition(self, state: Tuple[int, int], a: int) -> Tuple[int, int]:
        # wrap the action index into range, clamp the move to the grid,
        # and stay in place if the target cell is an obstacle
        n_actions = len(self._actions)
        a = self._actions[a + n_actions if a < 0 else a % n_actions]
        new_r = max(0, min(self._states.shape[0] - 1, state[0] + a[0]))
        new_c = max(0, min(self._states.shape[1] - 1, state[1] + a[1]))
        return (new_r, new_c) if self._states[new_r, new_c] == 0. else state
        
    def step(self, action: str) -> Transition:
        a_idx = self._action_semantics.index(action)

        # with probability 0.8 the chosen action is executed; with probability 0.1 each
        # the agent veers to the neighbouring action index (+1 or -1)
        chosen_action = a_idx + np.random.choice([1, 0, -1], p=self._transition_probabilities)
        prev_state = self._current_state
        self._current_state = self._transition(self._current_state, chosen_action)
        return Transition(state=prev_state,
                          action=action,
                          next_state=self._current_state,
                          reward=self.reward,
                          termination=self.termination)
    
    def reset(self) -> None:
        self._current_state = self._init_state


my_env = GridEnv(rows=3, cols=4, step_cost=0.4, goal=(0, 3), pits=[(1, 3)], obstacles=[(1, 1)])
my_env.render()
print(my_env.current_state)
print(my_env.reward)
print(my_env.termination)
print(my_env.actions)
for _ in range(5):
    a = np.random.choice(my_env.actions)
    print(a)
    print(my_env.step(a))
    my_env.render()
my_env.reset()
my_env.render()
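
As a quick sanity check of the stochastic dynamics, the following sketch rolls out a uniformly random policy until the goal is reached (or a step cap is hit) and reports the accumulated reward. It only uses the step/reset interface defined above; rollout_random and max_steps are names introduced here for illustration, not part of the original code.

def rollout_random(env: GridEnv, max_steps: int = 200) -> float:
    # follow a uniformly random policy; stop at the goal or after max_steps
    env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        t = env.step(np.random.choice(env.actions))
        total_reward += t.reward
        if t.termination:
            break
    env.reset()
    return total_reward

print(rollout_random(my_env))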

3. Value Iteration
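
The loop below repeatedly applies the Bellman optimality backup

V'(s) = R(s) + γ · max_a Σ_k p_k · V(s'_{a,k})

where k ranges over the three possible outcomes of action a (veer right, go straight, veer left) with probabilities p = (0.1, 0.8, 0.1), and stops once the largest per-state change delta falls below max_error.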

size_r = my_env._states.shape[0]
size_c = my_env._states.shape[1]

V = np.zeros((size_r, size_c))
gamma = 0.8
max_error = 1e-8

while True:
    V_prime = V.copy()

    get_v = lambda r, c: V[r, c]
    delta = 0

    # one synchronous sweep: back up every state from the previous V
    for i in range(size_r * size_c):
        r = i // size_c
        c = i % size_c

        reward = my_env._rewards[r, c]
        actions = my_env._actions

        V_max = None
        for a_idx in range(len(actions)):
            # the three states reachable when intending action a_idx (veer right, straight, veer left)
            possible_outcomes = [my_env._transition((r, c), a) for a in a_idx + np.array([1, 0, -1])]
            value = sum(my_env._transition_probabilities[k] * get_v(*s_prime)
                        for k, s_prime in enumerate(possible_outcomes))

            if V_max is None or value > V_max:
                V_max = value

        V_prime[r, c] = reward + gamma * V_max
        err = abs(V[r, c] - V_prime[r, c])
        delta = err if err > delta else delta

    # replace V only after the full sweep, so every backup in a sweep reads the previous V
    V = V_prime

    print(delta)
    if delta < max_error:
        break

print(V)
T = 1 / (1 - gamma)  # with per-step rewards bounded by 1, 1/(1 - gamma) bounds the optimal values
print(T)
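
Once V has converged, a greedy policy can be read off by choosing, in every cell, the action whose expected one-step lookahead value is largest. This is a minimal sketch built on the same environment internals used above; extract_policy is a helper name introduced here for illustration.

def extract_policy(env: GridEnv, V: np.ndarray) -> np.ndarray:
    # pick, per cell, the action whose expected next-state value under the noisy dynamics is highest
    rows, cols = env._states.shape
    policy = np.empty((rows, cols), dtype=object)
    for r in range(rows):
        for c in range(cols):
            best_name, best_val = None, None
            for a_idx, name in enumerate(env._action_semantics):
                outcomes = [env._transition((r, c), a) for a in a_idx + np.array([1, 0, -1])]
                val = sum(env._transition_probabilities[k] * V[s] for k, s in enumerate(outcomes))
                if best_val is None or val > best_val:
                    best_name, best_val = name, val
            policy[r, c] = best_name
    return policy

print(extract_policy(my_env, V))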