Reproducing Classic Inverse Reinforcement Learning Algorithms (Part 1)

**Preface:** This post reproduces the finite-state-space Grid World experiment from the paper "Algorithms for Inverse Reinforcement Learning" [1], with a focus on how to convert the nonlinear programming model into a linear programming model.

Environment Model

First, we construct the Gridworld environment model. The code is shown below:

import numpy as np
import random
import copy


class MyGridWorld(object):
    size = 5
    reward_grid = np.zeros([size, size])
    reward_grid[0][4] = 1
    position_grid = np.zeros([size, size])
    position_grid[4][0] = 1.1
    action_space = 4
    noisy_move_chance = 0.3
    current_i = 4
    current_j = 0
    done_status = False
    enable_noise = True
    observation_spaces = size * size

    def basic_reset(self):
        self.size = 5
        self.reward_grid = np.zeros([self.size, self.size])
        self.reward_grid[0][4] = 1
        self.position_grid = np.zeros([self.size, self.size])
        self.position_grid[4][0] = 1.1
        self.action_space = 4
        self.noisy_move_chance = 0.3
        self.observation_spaces = self.size*self.size
        self.current_i = 4
        self.current_j = 0
        self.done_status = False
        self.enable_noise = True

    def __init__(self, size=5, noisy_move_chance=0.3, enable_noise=True):
        # note: basic_reset() always rebuilds a 5x5 grid, so the size argument
        # is effectively fixed at 5 in this reproduction
        self.basic_reset()
        self.enable_noise = enable_noise

        # accept any noise probability in the closed interval [0, 1]
        if 0 <= noisy_move_chance <= 1:
            self.noisy_move_chance = noisy_move_chance

    def reset(self, size=5, noisy_move_chance=0.3, enable_noise=True):
        self.__init__(size, noisy_move_chance, enable_noise)
        return self.current_i*self.size+self.current_j

    def print_reward_grid(self):
        for i in range(len(self.reward_grid)):
            for j in range(len(self.reward_grid[0])):
                print(self.reward_grid[i][j], end=" ")
            print()

    def print_position_grid(self):
        for i in range(len(self.position_grid)):
            for j in range(len(self.position_grid[0])):
                print(self.position_grid[i][j], end=" ")
            print()

    def get_position_grid(self):
        return self.position_grid

    def render(self):
        self.print_position_grid()

    def get_available_moves(self):
        return self.action_space

    def get_size(self):
        return self.size

    def make_proper_move(self, action):
        """
        direction
        0: left
        1: down
        2: right
        3: up
        :param action:
        :return:
        """
        if action==0:
            if 0<self.current_j:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_j -= 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        elif action==1:
            if self.current_i < self.size-1:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_i += 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        elif action==2:
            if self.current_j < self.size-1:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_j += 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        elif action==3:
            if 0<self.current_i:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_i -= 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        else:
            print("sorry, no such action!")

        if self.current_i==0 and self.current_j==self.size-1:
            self.done_status = True

    def make_noisy_move(self, action):
        # a noisy move ignores the requested action and moves in a uniformly
        # random direction instead
        rand_num = random.randint(0, 3)
        self.make_proper_move(rand_num)

    def move(self, action):
        rand_num = random.random()
        if self.enable_noise and rand_num<self.noisy_move_chance:
            self.make_noisy_move(action)
        else:
            self.make_proper_move(action)
        return self.current_i, self.current_j, self.current_i*self.size+self.current_j, \
               self.reward_grid[self.current_i][self.current_j], self.done_status

    def step(self, action):
        return self.move(action)


class MyGridWorldTrainer(object):
    env = []
    Q = []
    matrix = []
    trajectories =[]
    direction_matrix = []

    def all_in_one(self, model, num_trajectories):
        self.env = model
        Q = self.train_model(model)
        matrix = self.get_directions(Q)
        return self.get_trajectories(matrix, num_trajectories)

    def train_model(self, model):
        env = self.env
        alpha = 0.6   # learning rate
        gamma = 0.9   # discount factor
        Q = np.zeros([env.observation_spaces, env.action_space])

        for episode in range(1, 10001):
            done = False
            total_reward = 0

            state = env.reset()

            while not done:
                # explore with uniformly random actions for the first 500
                # episodes, then act greedily with respect to Q
                if episode < 500:
                    action = random.randint(0, 3)
                else:
                    action = np.argmax(Q[state])

                i, j, state2, reward, done = env.step(action)
                Q[state, action] += alpha * (reward + gamma * np.max(Q[state2])-Q[state, action])
                total_reward += reward
                state = state2
        self.Q = Q
        return Q

    def get_directions(self, Q):
        matrix = []
        for i in range(0, 25):
            matrix.append(np.argmax(Q[i]))
        matrix = np.reshape(matrix, (5, 5))
        direction_matrix = []
        for i in range(5):
            row = []
            for j in range(5):
                if matrix[i][j] == 0:
                    row.append("\u2190")
                elif matrix[i][j] == 1:
                    row.append("\u2193")
                elif matrix[i][j] == 2:
                    row.append('\u2192')
                elif matrix[i][j] == 3:
                    row.append("\u2191")
                else:
                    print("Sorry! No such action")
            direction_matrix.append(row)
        self.direction_matrix = direction_matrix
        self.matrix = matrix
        return matrix

    def get_trajectories(self, matrix, num_trajectories):
        trajectories = []

        for iter in range(num_trajectories):
            path = []
            done = False
            state = self.env.reset()
            total_reward = 0
            path.append(state)

            i = int(state/self.env.size)
            j = state%self.env.size

            while not done:
                action = matrix[i][j]
                i, j, state2, reward, done = self.env.step(action)
                total_reward += reward
                state = state2
                path.append(state)

            trajectories.append(path)

        self.trajectories = trajectories
        return trajectories
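
To make the intended workflow concrete, here is a minimal usage sketch (the trajectory count of 10 is my own choice; the file name gridworld.py matches the import used in the second listing below). It builds the noisy 5x5 environment, runs the Q-learning trainer above, prints the greedy policy as arrows, and collects expert trajectories for the IRL step.

from gridworld import MyGridWorld, MyGridWorldTrainer

# build the noisy 5x5 environment and train an "expert" with Q-learning
env = MyGridWorld(noisy_move_chance=0.3, enable_noise=True)
trainer = MyGridWorldTrainer()
trajectories = trainer.all_in_one(env, num_trajectories=10)

# the greedy policy rendered as arrows, one row per grid row
for row in trainer.direction_matrix:
    print(" ".join(row))

# each trajectory is a list of flattened states i*size + j, ending at the goal state 4
print(trajectories[0])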

Formulating the Optimization Problem

In the paper, the optimization problem is formulated as:

$$
\text{maximize}\quad \sum_{i=1}^{N} \min_{a\in\{a_2,\dots,a_k\}} \big(\mathbf{P}_{a_1}(i)-\mathbf{P}_{a}(i)\big)\big(\mathbf{I}-\gamma \mathbf{P}_{a_1}\big)^{-1}R \;-\; \lambda \lVert R\rVert_1
$$

$$
\text{s.t.}\quad (\mathbf{P}_{a_1}-\mathbf{P}_{a})\big(\mathbf{I}-\gamma\mathbf{P}_{a_1}\big)^{-1}R \succeq 0 \quad \forall a \in A\setminus\{a_1\}
$$

$$
|R_i| \leq R_{\max},\quad i=1,\dots,N
$$

where $\mathbf{P}_a(i)$ denotes the $i$-th row of the matrix $\mathbf{P}_a$. The paper states that this can clearly be formulated as a linear program and solved efficiently, but it does not spell out how. (So how, exactly?)
After reformulating it as a linear program, we obtain:

$$
\text{maximize}\quad \sum_{i=1}^{N} Z_i \;-\; \lambda \sum_{i=1}^{N}(u_i+v_i)
$$

$$
\text{s.t.}\quad Z_i - \big(\mathbf{P}_{a_1}(i)-\mathbf{P}_{a}(i)\big)\big(\mathbf{I}-\gamma \mathbf{P}_{a_1}\big)^{-1}(U-V) \leq 0 \quad \forall a \in \{a_2,\dots,a_k\},\ i=1,\dots,N
$$

$$
(\mathbf{P}_{a_1}-\mathbf{P}_{a})\big(\mathbf{I}-\gamma\mathbf{P}_{a_1}\big)^{-1}R \succeq 0 \quad \forall a \in A\setminus\{a_1\}
$$

$$
U \succeq 0,\qquad V \succeq 0
$$

$$
|R_i| \leq R_{\max},\quad i=1,\dots,N
$$

where, at the optimum, the auxiliary variables satisfy

$$
Z_i = \min_{a\in\{a_2,\dots,a_k\}} \big(\mathbf{P}_{a_1}(i)-\mathbf{P}_{a}(i)\big)\big(\mathbf{I}-\gamma \mathbf{P}_{a_1}\big)^{-1}R,\qquad |R_i| = u_i+v_i,\qquad R_i = u_i-v_i .
$$

For why this transformation is valid, see the article "优化 | 线性规划和整数规划的若干建模技巧" (Optimization: Some Modeling Tricks for Linear and Integer Programming).
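
As a quick recap of the two modeling tricks used above (my own summary, not taken from the paper): a pointwise minimum of linear functions being maximized can be lifted with an auxiliary variable, and an absolute value in the objective can be split into positive and negative parts:

$$
\max_{R}\ \min_{a} f_a(R) \;=\; \max_{R,\,z}\ z \quad \text{s.t.}\ z \leq f_a(R)\ \ \forall a ,
$$

since at the optimum $z$ is pushed up against the smallest $f_a(R)$ (here one such $Z_i$ is introduced for every state $i$). Likewise, writing $R_i = u_i - v_i$ with $u_i, v_i \geq 0$ and penalizing $\lambda(u_i+v_i)$ with $\lambda > 0$ forces at most one of $u_i, v_i$ to be nonzero at the optimum, so $u_i + v_i = |R_i|$ there.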

Finally, the corresponding code is as follows:

import numpy as np
from gridworld import *
from cvxopt import matrix, solvers


def construct_transition_matrix(size=5, states=25, actions=4, noisy_move_chance=0.3):
    """Build P[s, s', a]: with probability 1 - noisy_move_chance the intended
    move is taken, with probability noisy_move_chance a uniformly random
    direction is taken, and any move into a wall leaves the agent in place
    (the leftover mass is accumulated in prob_stay)."""
    transition_matrix = np.zeros([states, states, actions])

    for state in range(states):
        for action in range(actions):
            i = int(state/size)
            j = state % size

            prob_stay = 1
            if action == 0:
                if 0 < j:
                    prob_stay = prob_stay-(1-noisy_move_chance)
                    j2 = j-1
                    transition_matrix[state][int(i*size+j2)][action] = 1-noisy_move_chance
            elif action == 1:
                if i < size-1:
                    prob_stay = prob_stay-(1-noisy_move_chance)
                    i2 = i+1
                    transition_matrix[state][int(i2*size+j)][action] = 1-noisy_move_chance
            elif action == 2:
                if j < size-1:
                    prob_stay = prob_stay-(1-noisy_move_chance)
                    j2 = j+1
                    transition_matrix[state][int(i*size+j2)][action] = 1-noisy_move_chance
            elif action == 3:
                if 0 < i:
                    prob_stay = prob_stay-(1-noisy_move_chance)
                    i2 = i-1
                    transition_matrix[state][int(i2*size+j)][action] = 1-noisy_move_chance

            if 0 < j:
                prob_stay = prob_stay-(noisy_move_chance/4)
                j2 = j-1
                transition_matrix[state][int(i*size+j2)][action] += (noisy_move_chance/4)
            if i < size-1:
                prob_stay = prob_stay-(noisy_move_chance/4)
                i2 = i+1
                transition_matrix[state][int(i2*size+j)][action] += (noisy_move_chance/4)
            if j < size-1:
                prob_stay = prob_stay-(noisy_move_chance/4)
                j2 = j+1
                transition_matrix[state][int(i*size+j2)][action] += (noisy_move_chance/4)
            if 0 < i:
                prob_stay = prob_stay-(noisy_move_chance/4)
                i2 = i-1
                transition_matrix[state][int(i2*size+j)][action] += (noisy_move_chance/4)

            # clamp tiny floating-point residue so that prob_stay is a clean 0
            if prob_stay < 10**(-15):
                prob_stay = 0
            transition_matrix[state][state][action] = prob_stay
    return transition_matrix


def get_optimal_policy():
    """Return the hard-coded expert policy for the 5x5 grid (goal in the
    top-right corner); 0/1/2/3 encode left/down/right/up."""
    optimum_policy = [[2, 2, 2, 2, 2], [3, 2, 2, 3, 3], [3, 3, 3, 3, 3], [3, 3, 2, 3, 3], [3, 2, 2, 2, 3]]
    directional_matrix = []
    for i in range(5):
        row = []
        for j in range(5):
            if optimum_policy[i][j] == 0:
                row.append('\u2190')
            elif optimum_policy[i][j] == 1:
                row.append('\u2193')
            elif optimum_policy[i][j] == 2:
                row.append('\u2192')
            elif optimum_policy[i][j] == 3:
                row.append('\u2191')
        directional_matrix.append(row)
    for i in directional_matrix:
        print(i)
    return optimum_policy


def initialize_solver_matrix(penalty=10):
    # decision vector (75 entries): R (indices 0-24), z (25-49), t (50-74),
    # where t_i plays the role of u_i + v_i, i.e. an upper bound on |R_i|
    A = np.zeros([250, 75])
    b = np.zeros([250])
    c = np.zeros([75])

    size = 25
    num = 150
    for i in range(size):
        # rows 150-174:   R_i <= R_max (= 1)
        A[num + i, i] = 1
        b[num + i] = 1
        # rows 175-199:  -R_i <= 0
        A[num + size + i, i] = -1
        # rows 200-224:   R_i - t_i <= 0
        A[num + 2 * size + i, i] = 1
        A[num + 2 * size + i, 2 * size + i] = -1
        # rows 225-249:  -R_i - t_i <= 0, so together t_i >= |R_i|
        A[num + 3 * size + i, i] = -1
        A[num + 3 * size + i, 2 * size + i] = -1

    # cvxopt minimizes c^T x, so maximizing sum(z) - penalty * sum(t) uses
    # coefficient -1 on the z block and +penalty on the t block
    c[25:50] = -1
    c[50:] = penalty
    return A, b, c


def inverse_reinforcement_learning(policy, gamma=0.5, penalty=10):
    trans_prob = construct_transition_matrix()
    # the order in the decision vector is R, z, t (= u + v)
    A, b, c = initialize_solver_matrix(penalty)

    for i in range(25):
        optimal_action = int(policy[i])
        # (I - gamma * P_a1)^{-1} for the expert action a1 taken in state i
        inverted_matrix = np.linalg.inv(np.identity(25) - gamma * trans_prob[:, :, optimal_action])

        temp = 0
        for j in range(4):
            if j != optimal_action:
                row_diff = trans_prob[i, :, optimal_action] - trans_prob[i, :, j]
                val = -np.dot(row_diff, inverted_matrix)
                pos = 25 * 3
                # rows 0-74:   -(P_a1(i) - P_a(i)) (I - gamma P_a1)^{-1} R <= 0
                A[i * 3 + temp, :25] = val
                # rows 75-149:  z_i - (P_a1(i) - P_a(i)) (I - gamma P_a1)^{-1} R <= 0
                A[pos + i * 3 + temp, :25] = val
                A[pos + i * 3 + temp, 25 + i] = 1
                temp += 1

    # cvxopt's LP interface: minimize c^T x  subject to  A x <= b
    return solvers.lp(matrix(c), matrix(A), matrix(b))

if __name__ == '__main__':
    policy = np.reshape(get_optimal_policy(), [25, 1])
    solution = inverse_reinforcement_learning(policy, 0.1, 3)
    rewards = solution['x']
    print(rewards)

    print("*******************************")

    # the first 25 entries of the solution vector are the recovered rewards R
    rewards = rewards[:5 * 5]
    print(rewards)

    print("*******************************")

    # normalize to [0, 1] and display as a 5x5 grid (row 0 is the top row)
    rewards = rewards / max(rewards)
    rewards = np.reshape(rewards, [5, 5])
    print(rewards)
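
As a quick sanity check (my own addition, not part of the original post), we can verify that every row of the constructed transition matrix is a valid probability distribution and observe how the l1 penalty affects the recovered reward. The module name irl_gridworld below is an assumption about how the listing above is saved.

import numpy as np
from irl_gridworld import construct_transition_matrix, get_optimal_policy, \
    inverse_reinforcement_learning

# every slice P[s, :, a] should sum to 1
P = construct_transition_matrix()
assert np.allclose(P.sum(axis=1), 1.0)

# sweep the sparsity penalty and compare the recovered (normalized) rewards
policy = np.reshape(get_optimal_policy(), [25, 1])
for penalty in [0.0, 1.0, 3.0]:
    sol = inverse_reinforcement_learning(policy, gamma=0.1, penalty=penalty)
    R = np.array(sol['x'][:25]).reshape(5, 5)
    scale = R.max() if R.max() > 0 else 1.0   # guard against an all-zero reward
    print("penalty =", penalty)
    print(np.round(R / scale, 2))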

References

[1] Ng, A. Y., and Russell, S. J.: 'Algorithms for Inverse Reinforcement Learning', Proceedings of the Seventeenth International Conference on Machine Learning (ICML), 2000.
