The red block has to move to the yellow target; the black blocks are obstacles.
The problem is modeled as a Markov chain: we want to predict the best path, and the value of a state is the reward r plus the discounted value of the state it leads to.
SARSA stands for state, action, reward, next state, and next action. It is an on-policy reinforcement learning method: the agent learns only from its own experience, accumulating updates over one or more steps and learning from the actions it actually takes.
From the current state we choose an action and move to the next state. In the next state we choose the next action, and then update the value of the current state-action pair using the reward and the value of that next state-action pair.
Algorithm:
- Initialize Q(s, a)
- Initialize s
- Choose an action a based on s (these two steps repeat within every episode)
- Take a, observe r and s', and update Q from them
- Repeat the above steps for every episode (a minimal loop is sketched below)
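A minimal sketch of this loop, assuming the same env.reset() / env.step() interface that the maze environment below exposes; the dictionary-based Q table and the helper names here are illustrative and not part of the project files:

import random
from collections import defaultdict

def run_sarsa(env, actions, episodes=100, lr=0.01, gamma=0.9, epsilon=0.9):
    Q = defaultdict(float)               # Q[(state, action)] -> value, 0.0 when unseen
    def choose(s):
        # epsilon-greedy: exploit with probability epsilon, otherwise explore
        if random.random() < epsilon:
            return max(actions, key=lambda a: Q[(s, a)])
        return random.choice(actions)

    for episode in range(episodes):
        s = str(env.reset())             # states are stringified so they can key the dict
        a = choose(s)
        while True:
            s_, r, done = env.step(a)    # take the action, observe reward and next state
            s_ = str(s_)
            a_ = choose(s_)              # next action comes from the same policy (on-policy)
            q_target = r if done else r + gamma * Q[(s_, a_)]
            Q[(s, a)] += lr * (q_target - Q[(s, a)])   # SARSA update
            s, a = s_, a_
            if done:
                break
    return Q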
Q Learning
Q is a table indexed by state s and action a; Q[s, a] is the value of taking action a in state s.
Q[s, a] = immediate reward + discounted reward
The immediate reward is what we receive for the action that moves us from one state to the next; the discounted reward accounts for future rewards, weighted by the discount factor.
The Q table determines which action a is best in the current state s.
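Written as update rules (this is exactly what the learn() methods in rl.py below compute, with lr the learning rate and gamma the discount factor):
Q-Learning: Q[s, a] ← Q[s, a] + lr * (r + gamma * max_a' Q[s', a'] - Q[s, a])
SARSA:      Q[s, a] ← Q[s, a] + lr * (r + gamma * Q[s', a'] - Q[s, a])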
π(s) = argmax_a Q[s, a]
π(a | s) = P[A_t = a | S_t = s]
Both agent classes in rl.py implement the update through the same method signature: def learn(self, s, a, r, s_, a_)
The maze environment: maze.py
import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk

UNIT = 40    # pixels per grid cell
MAZE_H = 4   # maze height (cells)
MAZE_W = 4   # maze width (cells)
class Maze(tk.Tk, object):
    def __init__(self):
        super(Maze, self).__init__()
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.title('maze')
        self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
        self._build_maze()

    def _build_maze(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)
        # create grid lines
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)
        # create origin (centre of the top-left cell)
        origin = np.array([20, 20])
        # hell (obstacle) 1
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='black'
        )
        # hell (obstacle) 2
        hell2_center = origin + np.array([UNIT, UNIT * 2])
        self.hell2 = self.canvas.create_rectangle(
            hell2_center[0] - 15, hell2_center[1] - 15,
            hell2_center[0] + 15, hell2_center[1] + 15,
            fill='black'
        )
        # create oval (the yellow goal)
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow'
        )
        # create the agent (the red block)
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red'
        )
        # pack all
        self.canvas.pack()
    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red'
        )
        # return the observation (the agent's coordinates)
        return self.canvas.coords(self.rect)
    def step(self, action):
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:    # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        # move the agent
        self.canvas.move(self.rect, base_action[0], base_action[1])
        s_ = self.canvas.coords(self.rect)
        # reward function
        if s_ == self.canvas.coords(self.oval):
            reward = 1
            done = True
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
        else:
            reward = 0
            done = False
        return s_, reward, done

    def render(self):
        time.sleep(0.1)
        self.update()  # refresh the display
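Before adding any learning, the environment can be sanity-checked on its own by driving it with random actions. This is a small sketch that assumes the file above is saved as maze.py; it is not part of the original project.

import numpy as np
from maze import Maze

def random_walk():
    # three throwaway episodes with uniformly random actions
    for episode in range(3):
        observation = env.reset()
        while True:
            env.render()
            action = np.random.randint(0, env.n_actions)
            observation, reward, done = env.step(action)
            if done:
                break
    env.destroy()

if __name__ == '__main__':
    env = Maze()
    env.after(100, random_walk)
    env.mainloop()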
The agent classes: rl.py
import numpy as np
import pandas as pd
class RL(object):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: choose among the actions with the highest Q value
            state_action = self.q_table.loc[observation, :]
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            # explore: choose a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, *args):
        pass

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # add a row of zeros for a state we have not seen before
            self.q_table.loc[state] = [0.0] * len(self.actions)
# off-policy
class QLearningTable(RL):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        super(QLearningTable, self).__init__(actions, learning_rate,
                                             reward_decay, e_greedy)

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # off-policy target: bootstrap from the best action in the next state
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update
# on-policy
class SarsaTable(RL):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        super(SarsaTable, self).__init__(actions, learning_rate,
                                         reward_decay, e_greedy)

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # on-policy target: bootstrap from the action the policy actually chose
            q_target = r + self.gamma * self.q_table.loc[s_, a_]
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update
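The only difference between the two classes is the target: QLearningTable bootstraps from the best action in the next state, while SarsaTable bootstraps from the action the epsilon-greedy policy actually picked. A tiny numeric illustration with made-up Q values (gamma = 0.9, r = 0):

import pandas as pd

gamma, r = 0.9, 0
next_row = pd.Series({0: 0.5, 1: 0.1, 2: 0.0, 3: 0.2})   # hypothetical Q[s_, :]
a_ = 1                                                    # suppose the policy picked action 1

q_target_qlearning = r + gamma * next_row.max()   # 0.45: uses the best action (action 0)
q_target_sarsa = r + gamma * next_row[a_]         # 0.09: uses the action actually taken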
The main program: run.py
from maze import Maze
from rl import QLearningTable, SarsaTable
def update():
    for episode in range(100):
        # reset the environment and choose the first action
        observation = env.reset()
        action = RL.choose_action(str(observation))
        while True:
            env.render()
            observation_, reward, done = env.step(action)
            action_ = RL.choose_action(str(observation_))
            # learn from the (s, a, r, s', a') tuple, hence the name SARSA
            RL.learn(str(observation), action, reward, str(observation_),
                     action_)
            observation = observation_
            action = action_
            if done:
                break
    print('game over')
    env.destroy()
if __name__ == '__main__':
    env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    RL = SarsaTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
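Because QLearningTable.learn simply ignores the trailing a_ argument, the same update() loop can drive either agent: to train with Q-Learning instead of SARSA, comment out the SarsaTable line and uncomment the QLearningTable line in the __main__ block.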