Policy Iteration algorithm implementation: the gridworld environment

The complete policy iteration program is given below; it prints the value function after every evaluation sweep and the improved policy after every improvement step, so the intermediate results can be followed directly from the output.
class GridWorld:
    def __init__(self, number):
        self.size = number
        self.states = [i for i in range(number ** 2)]
        self.terminate_states = [0, number ** 2 - 1]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 1
        # Initialize V(s) to zero for every state
        self.v_s = dict()
        for s in self.states:
            self.v_s[s] = 0.0
        # Initialize pi(s) as the equiprobable random policy over all four actions
        self.pi_s = dict()
        for s in self.states:
            self.pi_s[s] = []
            if s in self.terminate_states:
                continue
            self.pi_s[s].append("up")
            self.pi_s[s].append("down")
            self.pi_s[s].append("left")
            self.pi_s[s].append("right")
        # Initialize the improved policy pi_optimal(s); copy each action list
        # so that updating pi_optimal never mutates pi_s through shared lists
        self.pi_optimal = {s: list(a) for s, a in self.pi_s.items()}
    def state_to_grid(self, s):
        # Convert a state index into (row, col) grid coordinates
        row = s // self.size
        col = s % self.size
        return row, col

    def grid_to_state(self, row, col):
        return row * self.size + col

    def action(self, s, a):
        # Take action a in state s and return (next_state, reward).
        # Terminal states are absorbing and yield reward 0; every other move
        # costs -1, and moves off the grid leave the position unchanged.
        if s in self.terminate_states:
            return s, 0
        row, col = self.state_to_grid(s)
        reward = -1
        if a == "up":
            row = max(row - 1, 0)
        if a == "down":
            row = min(row + 1, self.size - 1)
        if a == "left":
            col = max(col - 1, 0)
        if a == "right":
            col = min(col + 1, self.size - 1)
        state_ = self.grid_to_state(row, col)
        return state_, reward
    def policy_evaluation(self, theta):
        print("------- k = 0 --------")
        self.showV(self.v_s)
        for i in range(10000):
            delta = 0
            lastV = self.v_s.copy()  # V(s) from the previous sweep
            for s in self.states:
                if s in self.terminate_states:
                    continue
                v = lastV[s]
                # Evaluate the current (possibly improved) policy, taken to be
                # uniform over its action list, so the divisor is len(actions)
                # rather than a hard-coded 4
                actions = self.pi_optimal[s]
                nextV = 0.0
                for a in actions:
                    s_, r = self.action(s, a)
                    nextV += (r + self.gamma * lastV[s_]) / len(actions)
                self.v_s[s] = nextV
                delta = max(delta, abs(v - self.v_s[s]))
            print("------- k = %d --------" % (i + 1))
            self.showV(self.v_s)
            if delta < theta:
                break
    def showV(self, v_s):
        # Print the value function as a size x size grid
        for row in range(self.size):
            print(', '.join('%.2f' % v_s[self.grid_to_state(row, col)]
                            for col in range(self.size)))
        print("-----------------------")
    def policy_improvement(self):
        policy_stable = True
        for s in self.states:
            if s in self.terminate_states:
                continue
            old_action = self.pi_optimal[s]
            max_v = float('-inf')
            for a in self.actions:
                s_, r = self.action(s, a)
                v = r + self.gamma * self.v_s[s_]  # action value q(s, a)
                if v > max_v:
                    max_v = v
                    self.pi_optimal[s] = [a]
                elif v == max_v and a not in self.pi_optimal[s]:
                    self.pi_optimal[s].append(a)  # keep ties: all greedy actions
            if old_action != self.pi_optimal[s]:
                policy_stable = False
            print(s, self.pi_optimal[s])
        return policy_stable
if __name__ == '__main__':
    env = GridWorld(4)
    theta = 0.00001
    for _ in range(100):
        # Policy evaluation
        env.policy_evaluation(theta)
        # Policy improvement
        policy_stable = env.policy_improvement()
        print(policy_stable)
        if policy_stable:
            break
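Iterative sweeps are not the only way to carry out policy evaluation. Because the Bellman expectation equation is linear in V, a fixed policy can also be evaluated in closed form by solving (I - gamma * P_pi) v = r_pi over the non-terminal states. Below is a minimal NumPy sketch of this for the initial equiprobable random policy; it is an addition for illustration, and the function name evaluate_random_policy_exactly and its inline step helper are made up here, not part of the original code:

import numpy as np

def evaluate_random_policy_exactly(size=4, gamma=1.0):
    # Solve (I - gamma * P) v = r for the equiprobable random policy.
    # Terminal states (0 and size*size - 1) are fixed at V = 0, so the
    # linear system is built over the non-terminal states only; there the
    # transition matrix is substochastic (probability leaks into the
    # terminals), which keeps I - gamma * P invertible even at gamma = 1.
    n = size * size
    terminals = {0, n - 1}
    nonterm = [s for s in range(n) if s not in terminals]
    idx = {s: k for k, s in enumerate(nonterm)}  # state -> row index

    def step(s, a):
        # Same deterministic dynamics as the GridWorld class above
        row, col = divmod(s, size)
        if a == 'up':    row = max(row - 1, 0)
        if a == 'down':  row = min(row + 1, size - 1)
        if a == 'left':  col = max(col - 1, 0)
        if a == 'right': col = min(col + 1, size - 1)
        return row * size + col

    P = np.zeros((len(nonterm), len(nonterm)))
    r = np.zeros(len(nonterm))
    for s in nonterm:
        for a in ['up', 'down', 'left', 'right']:
            s_ = step(s, a)
            r[idx[s]] += -1 / 4  # every move costs -1 under the random policy
            if s_ not in terminals:
                P[idx[s], idx[s_]] += 1 / 4
    v = np.linalg.solve(np.eye(len(nonterm)) - gamma * P, r)
    return {s: (0.0 if s in terminals else v[idx[s]]) for s in range(n)}

print(evaluate_random_policy_exactly())

The solution is exactly the fixed point that policy_evaluation approaches iteratively, so it is a useful cross-check on the first evaluation sweep of the program above.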
Value Iteration algorithm implementation
class GridWorld:
    def __init__(self, number):
        self.size = number
        self.states = [i for i in range(number ** 2)]
        self.terminate_states = [0, number ** 2 - 1]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 1
        # Initialize V(s) to zero for every state
        self.v_s = dict()
        for s in self.states:
            self.v_s[s] = 0.0
        # Initialize pi(s) as the equiprobable random policy over all four actions
        self.pi_s = dict()
        for s in self.states:
            self.pi_s[s] = []
            if s in self.terminate_states:
                continue
            self.pi_s[s].append("up")
            self.pi_s[s].append("down")
            self.pi_s[s].append("left")
            self.pi_s[s].append("right")
        # Initialize the greedy policy pi_optimal(s); copy each action list
        # so that updating pi_optimal never mutates pi_s through shared lists
        self.pi_optimal = {s: list(a) for s, a in self.pi_s.items()}
    def state_to_grid(self, s):
        # Convert a state index into (row, col) grid coordinates
        row = s // self.size
        col = s % self.size
        return row, col

    def grid_to_state(self, row, col):
        return row * self.size + col

    def action(self, s, a):
        # Take action a in state s and return (next_state, reward).
        # Terminal states are absorbing and yield reward 0; every other move
        # costs -1, and moves off the grid leave the position unchanged.
        if s in self.terminate_states:
            return s, 0
        row, col = self.state_to_grid(s)
        reward = -1
        if a == "up":
            row = max(row - 1, 0)
        if a == "down":
            row = min(row + 1, self.size - 1)
        if a == "left":
            col = max(col - 1, 0)
        if a == "right":
            col = min(col + 1, self.size - 1)
        state_ = self.grid_to_state(row, col)
        return state_, reward
    def value_iteration(self, theta):
        print("------- k = 0 --------")
        self.showV(self.v_s)
        for i in range(10000):
            delta = 0
            lastV = self.v_s.copy()  # V(s) from the previous sweep
            for s in self.states:
                if s in self.terminate_states:
                    continue
                v = lastV[s]
                nextMaxV = float('-inf')
                # Bellman optimality backup: V(s) = max_a [r + gamma * V(s')];
                # the maximum is taken over all actions, with no averaging
                for a in self.actions:
                    s_, r = self.action(s, a)
                    nextV = r + self.gamma * lastV[s_]
                    if nextV > nextMaxV:
                        nextMaxV = nextV
                        self.pi_optimal[s] = [a]
                    elif nextV == nextMaxV and a not in self.pi_optimal[s]:
                        self.pi_optimal[s].append(a)  # keep tied greedy actions
                self.v_s[s] = nextMaxV
                delta = max(delta, abs(v - self.v_s[s]))
            print("------- k = %d --------" % (i + 1))
            self.showV(self.v_s)
            if delta < theta:
                print("------- output an optimal policy --------")
                print(self.pi_optimal)
                break
    def showV(self, v_s):
        # Print the value function as a size x size grid
        for row in range(self.size):
            print(', '.join('%.2f' % v_s[self.grid_to_state(row, col)]
                            for col in range(self.size)))
        print("-----------------------")
if __name__ == '__main__':
    env = GridWorld(4)
    theta = 0.00001
    # Value iteration
    env.value_iteration(theta)
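As a quick sanity check (an addition, not part of the original post): with a reward of -1 per move and gamma = 1, the optimal value of each state is simply minus the number of moves to the nearest terminal corner, i.e. minus its Manhattan distance, so the converged value-iteration output can be verified independently. The helper name expected_optimal_values is made up here:

def expected_optimal_values(size=4):
    # Shortest-path distances to the nearest terminal corner, negated.
    # On an open grid with 4-connected moves, the shortest path length
    # equals the Manhattan distance, so this reproduces V*(s) directly.
    n = size * size
    terminals = [0, n - 1]
    v = {}
    for s in range(n):
        row, col = divmod(s, size)
        dists = [abs(row - t // size) + abs(col - t % size) for t in terminals]
        v[s] = -min(dists)
    return v

print(expected_optimal_values())

Because value iteration folds the greedy improvement into every backup, it typically reaches this fixed point in far fewer sweeps than running full policy evaluation to convergence inside each policy iteration step.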