Reinforcement Learning Example 2: MDP

The red block has to move to the yellow circle; the black blocks are obstacles.

The problem is modeled as a Markov chain, or more precisely a Markov decision process (MDP).

We want to predict the best path. The value of a state is the reward r plus the discounted value of the ending state.
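In symbols, assuming a discount factor gamma (the parameter called reward_decay in the code below), this is the usual one-step recursion:

V(s) = r + gamma * V(s')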

SARSA stands for state, action, reward, next state, and next action. It is known as an on-policy reinforcement learning method. On-policy means the agent can learn only from its own experience, i.e. from the actions its current policy actually takes. It accumulates updates over one or more steps and learns from those experiences.

From the current state we choose an action and move to the next state. In the next state we choose the next action, and then update the value of the current state-action pair using the reward together with the value of that next state-action pair.
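With a learning rate alpha (called lr in the agent code below), this gives the standard SARSA update:

Q[s, a] <- Q[s, a] + alpha * (r + gamma * Q[s', a'] - Q[s, a])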

Algorithm (a minimal runnable sketch follows this list):

  1. Initialize Q(s, a)
  2. Initialize s
  3. Choose action a based on s; repeat steps 3 and 4 within each episode
  4. Learn Q from a, r, and s' (Q-learning; SARSA additionally uses the next action a')
  5. Repeat the steps above for every episode
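As a concrete illustration of these steps, here is a minimal SARSA sketch on a toy one-dimensional corridor. The corridor environment, the constants, and the helper functions are illustrative assumptions only; the actual maze environment and agents follow below.

import random
from collections import defaultdict

N_STATES = 5            # cells 0..4, goal at cell 4
ACTIONS = [0, 1]        # 0 = left, 1 = right
ALPHA, GAMMA, EPSILON = 0.1, 0.9, 0.9

Q = defaultdict(float)  # Q[(state, action)], unseen entries default to 0.0

def choose_action(s):
	# epsilon-greedy: exploit with probability EPSILON, otherwise explore
	if random.random() < EPSILON:
		return max(ACTIONS, key=lambda a: Q[(s, a)])
	return random.choice(ACTIONS)

def step(s, a):
	s_ = min(max(s + (1 if a == 1 else -1), 0), N_STATES - 1)
	if s_ == N_STATES - 1:
		return s_, 1.0, True    # reached the goal
	return s_, 0.0, False

for episode in range(50):
	s = 0
	a = choose_action(s)
	done = False
	while not done:
		s_, r, done = step(s, a)
		a_ = choose_action(s_)
		q_target = r if done else r + GAMMA * Q[(s_, a_)]
		Q[(s, a)] += ALPHA * (q_target - Q[(s, a)])   # SARSA update
		s, a = s_, a_

print(max(ACTIONS, key=lambda a: Q[(0, a)]))  # likely prints 1 (move right) after training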

Q Learning

Q is a table indexed by s and a; Q[s, a] is the value of taking action a in state s.

Q[s, a] = Immediate reward + discounted reward

The immediate reward is what we receive for the action that takes us from one state to the next; the discounted reward accounts for future rewards.
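Expanding the discounted part gives the tabular Q-learning update implemented in QLearningTable.learn below (alpha is the learning rate lr, gamma the discount factor reward_decay):

Q[s, a] <- Q[s, a] + alpha * (r + gamma * max_a' Q[s', a'] - Q[s, a])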

The Q-table determines which action a is optimal in the current state s:

π(s) = argmax_a Q[s, a]

π(a | s) = P[A_t = a | S_t = s]
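For example, picking the greedy action from one (hypothetical) row of the Q-table is just an argmax over that row:

import numpy as np

q_row = np.array([0.1, 0.7, 0.3, 0.0])  # hypothetical Q[s, :] for four actions
greedy_action = int(np.argmax(q_row))   # pi(s) = argmax_a Q[s, a]  ->  1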

Both agents below implement the same learning interface:

def learn(self, s, a, r, s_, a_):

The maze environment: maze.py

import numpy as np  
import time
import sys

if sys.version_info.major==2:
	import Tkinter as tk 
else:
	import tkinter as tk

UNIT = 40     # size of one grid cell, in pixels
MAZE_H = 4    # maze height, in cells
MAZE_W = 4    # maze width, in cells

class Maze(tk.Tk, object):
	def __init__(self):
		super(Maze, self).__init__()
		self.action_space = ['u', 'd', 'r', 'l']  # indices 0-3 match up/down/right/left in step()
		self.n_actions = len(self.action_space)
		self.title('maze')
		self.geometry('{0}x{1}'.format(MAZE_W*UNIT, MAZE_H*UNIT))  # width x height
		self._build_maze()

	def _build_maze(self):
		self.canvas = tk.Canvas(self, bg='white',
							height = MAZE_H * UNIT,
							width = MAZE_W * UNIT)
		# create grids
		for c in range(0, MAZE_W * UNIT, UNIT):
			x0, y0, x1, y1 = c, 0, c, MAZE_H*UNIT
			self.canvas.create_line(x0, y0, x1, y1)
		for r in range(0, MAZE_H * UNIT, UNIT):
			x0, y0, x1, y1 = 0, r, MAZE_W*UNIT, r
			self.canvas.create_line(x0, y0, x1, y1)
		# create origin
		origin = np.array([20, 20])
		# hell
		hell1_center = origin + np.array([UNIT*2, UNIT])
		self.hell1 = self.canvas.create_rectangle(
			hell1_center[0] - 15, hell1_center[1] - 15,
			hell1_center[0] + 15, hell1_center[1] + 15,
			fill='black'
		)
		# hell
		hell2_center = origin + np.array([UNIT, UNIT*2])
		self.hell2 = self.canvas.create_rectangle(
			hell2_center[0] - 15, hell2_center[1] - 15,
			hell2_center[0] + 15, hell2_center[1] + 15,
			fill='black'
		)
		# create oval
		oval_center = origin + UNIT * 2
		self.oval = self.canvas.create_oval(
			oval_center[0] - 15, oval_center[1] - 15,
			oval_center[0] + 15, oval_center[1] + 15,
			fill='yellow'
		)
		self.rect = self.canvas.create_rectangle(
			origin[0] - 15, origin[1] - 15,
			origin[0] + 15, origin[1] + 15,
			fill='red'    # the agent is the red block
		)
		# pack all
		self.canvas.pack()

	def reset(self):
		self.update()
		time.sleep(0.5)
		self.canvas.delete(self.rect)
		origin = np.array([20, 20])
		self.rect = self.canvas.create_rectangle(
			origin[0] - 15, origin[1] - 15,
			origin[0] + 15, origin[1] + 15,
			fill='red'
		)
		# return observation
		return self.canvas.coords(self.rect)

	def step(self, action):
		s = self.canvas.coords(self.rect)
		base_action = np.array([0, 0])
		if action == 0: # up
			if s[1] > UNIT:
				base_action[1] -= UNIT
		elif action == 1:  # down
			if s[1] < (MAZE_H - 1)*UNIT:
				base_action[1] += UNIT
		elif action == 2:  # right
			if s[0] < (MAZE_W - 1)*UNIT:
				base_action[0] += UNIT
		elif action == 3:  # left
			if s[0] > UNIT:
				base_action[0] -= UNIT
		# move the agent
		self.canvas.move(self.rect, base_action[0], base_action[1])
		s_ = self.canvas.coords(self.rect)
		# reward function
		if s_ == self.canvas.coords(self.oval):
			reward = 1
			done = True
		elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
			reward = -1
			done = True
		else:
			reward = 0
			done = False
		return s_, reward, done

	def render(self):
		time.sleep(0.1)
		self.update()  # refresh the window
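A quick manual check of the environment (a usage sketch, assuming the code above is saved as maze.py):

from maze import Maze

env = Maze()
obs = env.reset()                  # coordinates of the red block, e.g. [5.0, 5.0, 35.0, 35.0]
obs_, reward, done = env.step(1)   # action 1 = move down one cell
print(obs_, reward, done)
env.mainloop()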

The agents: rl.py

import numpy as np  
import pandas as pd  

class RL(object):
	def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
		self.actions = actions
		self.lr = learning_rate
		self.gamma = reward_decay
		self.epsilon = e_greedy
		self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

	def choose_action(self, observation):
		self.check_state_exist(observation)
		if np.random.uniform() < self.epsilon:
			state_action = self.q_table.loc[observation, :]
			action = np.random.choice(state_action[state_action==np.max(state_action)].index)
		else:
			action = np.random.choice(self.actions)
		return action

	def learn(self, *args):
		pass  # overridden by QLearningTable and SarsaTable below

	def check_state_exist(self, state):
		if state not in self.q_table.index:
			# add a row of zeros for a state seen for the first time
			new_row = pd.Series(
				[0.0]*len(self.actions),
				index=self.q_table.columns,
				name=state,
			)
			self.q_table = pd.concat([self.q_table, new_row.to_frame().T])

# off-policy
class QLearningTable(RL):
	def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
		super(QLearningTable, self).__init__(actions, learning_rate,
			reward_decay, e_greedy)

	def learn(self, s, a, r, s_, a_):
		# a_ is accepted only so run.py can call both agents the same way; Q-learning ignores it
		self.check_state_exist(s_)
		q_predict = self.q_table.loc[s, a]
		if s_ != 'terminal':
			q_target = r + self.gamma * self.q_table.loc[s_, :].max()  # bootstrap from the greedy next action
		else:
			q_target = r
		self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update

# on-policy
class SarsaTable(RL):
	def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
		super(SarsaTable, self).__init__(actions, learning_rate,
			reward_decay, e_greedy)

	def learn(self, s, a, r, s_, a_):
		self.check_state_exist(s_)
		q_predict = self.q_table.loc[s, a]
		if s_ != 'terminal':
			q_target = r + self.gamma * self.q_table.loc[s_, a_]  # bootstrap from the action actually taken next
		else:
			q_target = r
		self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update
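To see the difference between the two targets with concrete (hypothetical) numbers: suppose r = 0, gamma = 0.9, the next-state row is Q[s', :] = [0.5, 0.2], and the action actually chosen next has Q[s', a'] = 0.2. Q-learning's target is 0 + 0.9 * 0.5 = 0.45, while SARSA's target is 0 + 0.9 * 0.2 = 0.18; the off-policy learner bootstraps from the best next action, the on-policy learner from the one its policy really picks.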

Main script: run.py

from maze import Maze 
from rl import QLearningTable, SarsaTable

def update():
	for episode in range(100):
		observation = env.reset()
		action = RL.choose_action(str(observation))
		while True:
			env.render()
			observation_, reward, done = env.step(action)
			action_ = RL.choose_action(str(observation_))
			# (s, a, r, s', a') == SARSA
			RL.learn(str(observation), action, reward, str(observation_),
				action_)
			observation = observation_
			action = action_
			if done:
				break

	print('game over')
	env.destroy()

if __name__ == '__main__':
	env = Maze()
	# RL = QLearningTable(actions=list(range(env.n_actions)))
	RL = SarsaTable(actions=list(range(env.n_actions)))
	env.after(100, update)
	env.mainloop()
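To run the demo (assuming the three files live in the same directory), execute python run.py. Switching from SARSA to Q-learning only requires uncommenting the QLearningTable line and commenting out the SarsaTable line; the same update() loop works for both because QLearningTable.learn simply ignores the extra action_ argument.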

 
