**Preface:** This post reproduces the finite-state-space Grid World experiments from the paper "Algorithms for Inverse Reinforcement Learning", with a focus on how the nonlinear programming model is converted into a linear programming model.
Environment Model
First, we construct the Gridworld environment model. The listing below defines the environment together with a Q-learning trainer that produces the expert policy and demonstration trajectories:
```python
import numpy as np
import random
import copy


class MyGirdWorld(object):
    """5x5 grid world: the agent starts at (4, 0); the goal cell with reward 1 is at (0, 4).

    With probability `noisy_move_chance` the intended action is replaced by a
    uniformly random one; moves into a wall leave the agent in place.
    """

    def basic_reset(self, size=5):
        self.size = size
        self.reward_grid = np.zeros([size, size])
        self.reward_grid[0][size - 1] = 1            # goal cell (top-right corner)
        self.position_grid = np.zeros([size, size])
        self.position_grid[size - 1][0] = 1.1        # agent marker (bottom-left corner)
        self.action_space = 4
        self.noisy_move_chance = 0.3
        self.observation_spaces = size * size
        self.current_i = size - 1
        self.current_j = 0
        self.done_status = False
        self.enable_noise = True

    def __init__(self, size=5, noisy_move_chance=0.3, enable_noise=True):
        self.basic_reset(size)
        self.enable_noise = enable_noise
        if 0 <= noisy_move_chance <= 1:              # the original `in range(0, 1)` check never matched a float
            self.noisy_move_chance = noisy_move_chance

    def reset(self, size=5, noisy_move_chance=0.3, enable_noise=True):
        # Note: reset re-applies the given arguments (defaults unless specified).
        self.__init__(size, noisy_move_chance, enable_noise)
        return self.current_i * self.size + self.current_j   # flattened state index

    def print_reward_grid(self):
        for i in range(len(self.reward_grid)):
            for j in range(len(self.reward_grid[0])):
                print(self.reward_grid[i][j], end=" ")
            print()

    def print_position_grid(self):
        for i in range(len(self.position_grid)):
            for j in range(len(self.position_grid[0])):
                print(self.position_grid[i][j], end=" ")
            print()

    def get_position_grid(self):
        return self.position_grid

    def render(self):
        self.print_position_grid()

    def get_available_moves(self):
        return self.action_space

    def get_size(self):
        return self.size

    def make_proper_move(self, action):
        """Move deterministically. Actions: 0 left, 1 down, 2 right, 3 up."""
        if action == 0:
            if 0 < self.current_j:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_j -= 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        elif action == 1:
            if self.current_i < self.size - 1:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_i += 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        elif action == 2:
            if self.current_j < self.size - 1:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_j += 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        elif action == 3:
            if 0 < self.current_i:
                self.position_grid[self.current_i][self.current_j] = 0
                self.current_i -= 1
                self.position_grid[self.current_i][self.current_j] = 1.1
        else:
            print("invalid action!")
        if self.current_i == 0 and self.current_j == self.size - 1:
            self.done_status = True

    def make_noisy_move(self, action):
        # Noise model: ignore the intended action and move in a uniformly random direction.
        rand_num = random.randint(0, 3)
        self.make_proper_move(rand_num)

    def move(self, action):
        rand_num = random.random()
        if self.enable_noise and rand_num < self.noisy_move_chance:
            self.make_noisy_move(action)
        else:
            self.make_proper_move(action)
        return self.current_i, self.current_j, self.current_i * self.size + self.current_j, \
            self.reward_grid[self.current_i][self.current_j], self.done_status

    def step(self, action):
        return self.move(action)

class MyGridWorldTrainer(object):
    """Trains a tabular Q-learning agent on the grid world and rolls out
    demonstration trajectories from the resulting greedy policy."""
    env = []
    Q = []
    matrix = []
    trajectories = []
    direction_matrix = []

    def all_in_one(self, model, num_trajectories):
        self.env = model
        Q = self.train_model(model)
        matrix = self.get_directions(Q)
        return self.get_trajectories(matrix, num_trajectories)

    def train_model(self, model):
        env = self.env
        alpha = 0.6    # learning rate
        gamma = 0.9    # discount factor
        Q = np.zeros([env.observation_spaces, env.action_space])
        for episode in range(1, 10001):
            done = False
            total_reward = 0
            state = env.reset()
            while not done:
                if episode < 500:
                    action = random.randint(0, 3)      # pure exploration in the early episodes
                else:
                    action = np.argmax(Q[state])       # greedy afterwards
                i, j, state2, reward, done = env.step(action)
                Q[state, action] += alpha * (reward + gamma * np.max(Q[state2]) - Q[state, action])
                total_reward += reward
                state = state2
        self.Q = Q
        return Q

    def get_directions(self, Q):
        # Greedy action per state, reshaped into a 5x5 policy matrix.
        matrix = []
        for i in range(0, 25):
            matrix.append(np.argmax(Q[i]))
        matrix = np.reshape(matrix, (5, 5))
        direction_matrix = []
        for i in range(5):
            row = []
            for j in range(5):
                if matrix[i][j] == 0:
                    row.append("\u2190")   # left
                elif matrix[i][j] == 1:
                    row.append("\u2193")   # down
                elif matrix[i][j] == 2:
                    row.append("\u2192")   # right
                elif matrix[i][j] == 3:
                    row.append("\u2191")   # up
                else:
                    print("invalid action!")
            direction_matrix.append(row)
        self.direction_matrix = direction_matrix
        self.matrix = matrix
        return matrix

    def get_trajectories(self, matrix, num_trajectories):
        # Roll out the greedy policy and record the visited state indices.
        trajectories = []
        for _ in range(num_trajectories):
            path = []
            done = False
            state = self.env.reset()
            total_reward = 0
            path.append(state)
            i = int(state / self.env.size)
            j = state % self.env.size
            while not done:
                action = matrix[i][j]
                i, j, state2, reward, done = self.env.step(action)
                total_reward += reward
                state = state2
                path.append(state)
            trajectories.append(path)
        self.trajectories = trajectories
        return trajectories
```
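A minimal usage sketch of my own (assuming the two classes above are saved in `gridworld.py`, the module name imported by the IRL script later): train the expert policy and sample a few demonstration trajectories.

```python
# Minimal usage sketch; assumes the classes above live in gridworld.py.
from gridworld import MyGirdWorld, MyGridWorldTrainer

env = MyGirdWorld(size=5, noisy_move_chance=0.3, enable_noise=True)
trainer = MyGridWorldTrainer()
trajectories = trainer.all_in_one(env, num_trajectories=3)

for row in trainer.direction_matrix:   # learned greedy policy, printed as arrows
    print(row)
print(trajectories[0])                 # one path of flattened state indices from (4,0) to (0,4)
```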
Modeling the Optimization Problem

In the paper, the optimization problem is formulated as:
$$
\begin{aligned}
\text{maximize}\quad & \sum_{i=1}^{N} \min_{a\in\{a_2,\dots,a_k\}}\left\{(\mathbf{P}_{a_1}(i)-\mathbf{P}_{a}(i))(\mathbf{I}-\gamma \mathbf{P}_{a_1})^{-1}\mathbf{R}\right\}-\lambda \|\mathbf{R}\|_1 \\
\text{s.t.}\quad & (\mathbf{P}_{a_1}-\mathbf{P}_{a})(\mathbf{I}-\gamma\mathbf{P}_{a_1})^{-1}\mathbf{R} \succeq 0 \quad \forall a \in A\setminus\{a_1\} \\
& |R_i| \leq R_{\max}, \quad i=1,\dots,N
\end{aligned}
$$
where $\mathbf{P}_a(i)$ denotes the $i$-th row of the matrix $\mathbf{P}_a$. The paper states that this can clearly be formulated as a linear program and solved efficiently, but it does not spell out how; the conversion is worked out below.
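To make the notation concrete, here is a short sketch of my own (not from the original post) that evaluates the objective above for a candidate reward vector. It assumes `trans_prob[:, :, a]` plays the role of $\mathbf{P}_a$ (this is the transition tensor built by `construct_transition_matrix` in the final listing), `policy[i]` is the expert action $a_1$ in state $i$, and `lam` stands for $\lambda$.

```python
import numpy as np

def nlp_objective(R, trans_prob, policy, gamma=0.5, lam=1.0):
    """Evaluate the (still nonlinear) IRL objective for a candidate reward vector R.

    trans_prob[s, s2, a] = P(s2 | s, a); policy[s] is the expert action a1 in state s.
    Illustration of the notation only, not part of the solver.
    """
    n_states, _, n_actions = trans_prob.shape
    inv_cache = {}
    total = 0.0
    for i in range(n_states):
        a1 = int(policy[i])
        if a1 not in inv_cache:   # (I - gamma * P_{a1})^{-1}, reused across states sharing a1
            inv_cache[a1] = np.linalg.inv(np.eye(n_states) - gamma * trans_prob[:, :, a1])
        # (P_{a1}(i) - P_a(i)) (I - gamma P_{a1})^{-1} R, minimized over a != a1
        terms = [(trans_prob[i, :, a1] - trans_prob[i, :, a]) @ inv_cache[a1] @ R
                 for a in range(n_actions) if a != a1]
        total += min(terms)
    return total - lam * np.abs(R).sum()
```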
After conversion, the linear programming model is:
$$
\begin{aligned}
\text{maximize}\quad & \sum_{i=1}^{N} Z_i-\lambda \sum_{i=1}^{N}(u_i+v_i) \\
\text{s.t.}\quad & Z_i-(\mathbf{P}_{a_1}(i)-\mathbf{P}_{a}(i))(\mathbf{I}-\gamma \mathbf{P}_{a_1})^{-1}(U-V) \leq 0 \quad \forall a \in A\setminus\{a_1\},\ i=1,\dots,N \\
& (\mathbf{P}_{a_1}-\mathbf{P}_{a})(\mathbf{I}-\gamma\mathbf{P}_{a_1})^{-1}(U-V) \succeq 0 \quad \forall a \in A\setminus\{a_1\} \\
& U \succeq 0, \quad V \succeq 0 \\
& u_i+v_i \leq R_{\max}, \quad i=1,\dots,N
\end{aligned}
$$
where

$$Z_i=\min_{a\in\{a_2,\dots,a_k\}}\left\{(\mathbf{P}_{a_1}(i)-\mathbf{P}_{a}(i))(\mathbf{I}-\gamma \mathbf{P}_{a_1})^{-1}\mathbf{R}\right\},\qquad |R_i|=u_i+v_i,\qquad R_i=u_i-v_i,$$

with $u_i, v_i \geq 0$. Because each $Z_i$ enters the maximized objective with a positive coefficient and is upper-bounded by every per-action term in the first constraint, at the optimum $Z_i$ equals exactly this minimum; likewise, since $u_i+v_i$ is penalized, the optimum satisfies $u_i+v_i=|R_i|$, so the linear program attains the same optimum as the original problem.
For more detail on why such transformations are valid, see the article "优化 | 线性规划和整数规划的若干建模技巧" (a note on LP/IP modeling tricks).
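For completeness, here are the two standard tricks in brief (my own recap of the textbook argument, not taken from the referenced article): maximizing a minimum is handled by an auxiliary variable, and the $\ell_1$ penalty by a split into nonnegative parts:

$$\max_{x}\ \min_{k} f_k(x) \;\Longleftrightarrow\; \max_{x,\,z}\ z \quad \text{s.t. } z \le f_k(x)\ \ \forall k, \qquad\qquad R_i = u_i - v_i,\ u_i, v_i \ge 0 \;\Rightarrow\; |R_i| = u_i + v_i \ \text{at the optimum.}$$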
Finally, the corresponding code is as follows:
```python
import numpy as np
from gridworld import *
from cvxopt import matrix, solvers


def construct_transition_matrix(size=5, states=25, actions=4, noisy_move_chance=0.3):
    """Build the transition tensor P[s, s2, a] for the noisy grid world.

    With probability (1 - noise) the intended move is taken; with probability
    noise/4 each of the four directions is taken instead; blocked moves stay put.
    """
    transition_matrix = np.zeros([states, states, actions])
    for state in range(states):
        for action in range(actions):
            i = int(state / size)
            j = state % size
            prob_stay = 1
            # Intended move (actions: 0 left, 1 down, 2 right, 3 up).
            if action == 0:
                if 0 < j:
                    prob_stay = prob_stay - (1 - noisy_move_chance)
                    j2 = j - 1
                    transition_matrix[state][int(i * size + j2)][action] = 1 - noisy_move_chance
            elif action == 1:
                if i < size - 1:
                    prob_stay = prob_stay - (1 - noisy_move_chance)
                    i2 = i + 1
                    transition_matrix[state][int(i2 * size + j)][action] = 1 - noisy_move_chance
            elif action == 2:
                if j < size - 1:
                    prob_stay = prob_stay - (1 - noisy_move_chance)
                    j2 = j + 1
                    transition_matrix[state][int(i * size + j2)][action] = 1 - noisy_move_chance
            elif action == 3:
                if 0 < i:
                    prob_stay = prob_stay - (1 - noisy_move_chance)
                    i2 = i - 1
                    transition_matrix[state][int(i2 * size + j)][action] = 1 - noisy_move_chance
            # Noise: every valid direction (including the intended one) receives noise/4.
            if 0 < j:
                prob_stay = prob_stay - (noisy_move_chance / 4)
                j2 = j - 1
                transition_matrix[state][int(i * size + j2)][action] += (noisy_move_chance / 4)
            if i < size - 1:
                prob_stay = prob_stay - (noisy_move_chance / 4)
                i2 = i + 1
                transition_matrix[state][int(i2 * size + j)][action] += (noisy_move_chance / 4)
            if j < size - 1:
                prob_stay = prob_stay - (noisy_move_chance / 4)
                j2 = j + 1
                transition_matrix[state][int(i * size + j2)][action] += (noisy_move_chance / 4)
            if 0 < i:
                prob_stay = prob_stay - (noisy_move_chance / 4)
                i2 = i - 1
                transition_matrix[state][int(i2 * size + j)][action] += (noisy_move_chance / 4)
            if prob_stay < 10 ** (-15):   # clip floating-point residue
                prob_stay = 0
            transition_matrix[state][state][action] = prob_stay
    return transition_matrix


def get_optimal_policy():
    # Hand-coded expert (optimal) policy: 0 left, 1 down, 2 right, 3 up.
    optimum_policy = [[2, 2, 2, 2, 2], [3, 2, 2, 3, 3], [3, 3, 3, 3, 3], [3, 3, 2, 3, 3], [3, 2, 2, 2, 3]]
    directional_matrix = []
    for i in range(5):
        row = []
        for j in range(5):
            if optimum_policy[i][j] == 0:
                row.append('\u2190')
            elif optimum_policy[i][j] == 1:
                row.append('\u2193')
            elif optimum_policy[i][j] == 2:
                row.append('\u2192')
            elif optimum_policy[i][j] == 3:
                row.append('\u2191')
        directional_matrix.append(row)
    for i in directional_matrix:
        print(i)
    return optimum_policy


def initialize_solver_matrix(penalty=10):
    # Decision vector layout (75 entries): R (0..24), Z (25..49), t = u + v (50..74).
    A = np.zeros([250, 75])   # inequality matrix G for cvxopt (G z <= h)
    b = np.zeros([250])       # right-hand side h
    x = np.zeros([75])        # cost vector c (cvxopt minimizes c^T z)

    def initialize():
        size = 25
        num = 150
        i = 0
        while i < 25:
            A[num + i, i] = 1                    #  R_i <= 1  (i.e. R_max = 1)
            A[num + size + i, i] = -1            # -R_i <= 0
            j = 2
            while j < 4:
                A[num + j * size + i, i] = 1     #  R_i - t_i <= 0; with R_i >= 0 this gives t_i >= |R_i|
                A[num + j * size + i, 2 * size + i] = -1
                j += 1
            b[num + i] = 1
            # b[num + size + i] = 0
            i += 1

    initialize()
    x[25:] = -1        # maximize sum(Z_i)  ->  cost -1 on each Z_i
    x[-25:] = penalty  # penalize sum(t_i) = sum(u_i + v_i) with weight lambda
    return A, b, x


def inverse_reinforcement_learning(policy, gamma=0.5, penalty=10):
    trans_prob = construct_transition_matrix()
    # The order in the decision vector is R, Z, u+v.
    A, b, x = initialize_solver_matrix(penalty)
    i = 0
    while i < 25:
        optimalAction = int(policy[i])
        tempTransProbMatrix = gamma * trans_prob[:, :, optimalAction]
        partialInvertedMatrix = np.linalg.inv(np.identity(25) - tempTransProbMatrix)
        temp = 0
        j = 0
        while j < 4:
            if j != optimalAction:
                otherPartialMatrix = trans_prob[i, :, optimalAction] - trans_prob[i, :, j]
                val = - np.dot(otherPartialMatrix, partialInvertedMatrix)
                pos = 25 * 3
                A[i * 3 + temp, :25] = val                 # -(P_a1(i) - P_a(i))(I - gamma P_a1)^{-1} R <= 0
                A[pos + i * 3 + temp, :25] = val           # Z_i - (P_a1(i) - P_a(i))(I - gamma P_a1)^{-1} R <= 0
                A[pos + i * 3 + temp, 25 + i] = 1
            else:
                temp = temp - 1
            temp = temp + 1
            j = j + 1
        i = i + 1
    x = matrix(x)
    A = matrix(A)
    b = matrix(b)
    return solvers.lp(x, A, b)   # cvxopt lp(c, G, h): minimize c^T z s.t. G z <= h, with c=x, G=A, h=b


if __name__ == '__main__':
    policy = np.reshape(get_optimal_policy(), [25, 1])
    rewards = inverse_reinforcement_learning(policy, 0.1, 3)
    rewards = rewards['x']
    print(rewards)
    print("*******************************")
    rewards = rewards[:5 * 5]      # the first 25 entries are the recovered reward R
    print(rewards)
    print("*******************************")
    rewards = rewards / max(rewards)
    rewards = np.reshape(rewards, [5, 5])
    print(rewards)
```
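As a quick follow-up check of my own (the module name `irl_gridworld` is hypothetical, standing for whatever file the listing above is saved as), the recovered reward can be visualized as a heatmap and inspected for where it peaks:

```python
# Follow-up sketch: visualize the recovered reward.
# `irl_gridworld` is a hypothetical module name for the LP listing above.
import numpy as np
import matplotlib.pyplot as plt
from irl_gridworld import get_optimal_policy, inverse_reinforcement_learning

policy = np.reshape(get_optimal_policy(), [25, 1])
sol = inverse_reinforcement_learning(policy, 0.1, 3)
recovered = np.array(list(sol['x'][:25])).reshape(5, 5)
recovered = recovered / recovered.max()

# The highest-reward cell should lie at or near the goal corner (0, 4).
print("highest-reward cell:", np.unravel_index(np.argmax(recovered), recovered.shape))
plt.imshow(recovered)
plt.colorbar(label="normalized reward")
plt.title("Reward recovered by the LP")
plt.show()
```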
References

[1] Ng, A. Y., and Russell, S. J.: 'Algorithms for Inverse Reinforcement Learning', Proceedings of the Seventeenth International Conference on Machine Learning (ICML 2000), 2000, pp. 663–670.