Reinforcement Learning: Q-Learning on the CliffWalking-v0 Environment
1. Defining the Algorithm
Q-Learning is a reinforcement learning algorithm that finds an optimal action policy for a Markov decision process (MDP). Its main ideas are:
- The algorithm maintains a state-action value function Q, where Q(s, a) is defined as the maximum expected return obtainable by taking action a in state s.
- The Q function is improved iteratively, using new experience to refine the value estimates. The update rule is (a small numerical sketch of this update follows the list):
$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$
- s and s' are the current and next state; a and a' are the current and next action
- alpha is the learning rate, which controls the size of each update
- r (reward) is the reward received after taking action a in state s
- gamma is the discount factor, which discounts the value of max_{a'} Q(s', a')
- The action policy is derived from the Q function. An ε-greedy policy is commonly used: with probability ε a random action is taken, otherwise the action with the largest Q value is chosen.
- With enough iterations and a decaying ε, Q-Learning converges to the optimal policy.
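To make the update rule concrete, here is a tiny numerical sketch of a single Q-Learning update with made-up values (alpha = 0.1, gamma = 0.9):

alpha, gamma = 0.1, 0.9
Q_sa = 0.0        # current estimate Q(s, a)
reward = -1.0     # reward observed after taking a in s
max_next = 0.5    # max_a' Q(s', a')
Q_sa = Q_sa + alpha * (reward + gamma * max_next - Q_sa)
print(Q_sa)       # 0.1 * (-1.0 + 0.9 * 0.5 - 0.0) ≈ -0.055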
The advantages of Q-Learning are:
- It is model-free: no model of the environment is needed, only its reward feedback.
- Given sufficient exploration, it is guaranteed to find the optimal policy.
- Compared with SARSA it is simple and easy to understand (a short sketch contrasting the two update targets follows this list).
- It is easy to implement, with very little code.
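As a brief aside on the SARSA comparison above: the key difference is the bootstrap target, not the amount of code. A minimal sketch with toy numbers (Q_next and next_action are made up for illustration):

import numpy as np

gamma, reward = 0.9, -1.0
Q_next = np.array([0.2, -0.4, 0.0, 0.1])  # hypothetical Q(s', .) values
next_action = 1                           # the action SARSA actually takes in s'
q_learning_target = reward + gamma * Q_next.max()     # off-policy, greedy bootstrap: ≈ -0.82
sarsa_target = reward + gamma * Q_next[next_action]   # on-policy, follows behaviour policy: ≈ -1.36
print(q_learning_target, sarsa_target)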
1.1 Defining the Model
import numpy as np
import math
import torch
from collections import defaultdict
import seaborn as sns
import os
import random
import gymnasium as gym
import matplotlib.pyplot as plt
class Qlearning(object):
    def __init__(self, cfg):
        '''Agent class
        Args:
            cfg (class): hyperparameter container
        '''
        self.n_actions = cfg.n_actions
        self.exploration_type = 'e-greedy'  # exploration strategy, e.g. e-greedy, boltzmann, softmax, ucb
        self.lr = cfg.lr
        self.gamma = cfg.gamma
        self.epsilon = cfg.epsilon_start
        self.sample_count = 0
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.Q_table = defaultdict(lambda: np.zeros(self.n_actions))  # Q(s,a) as a dict of arrays; every newly seen state starts with all-zero Q values
    def sample_action(self, state):
        '''Select an action during training with the e-greedy policy
        Args:
            state (array): state
        Returns:
            action (int): action
        '''
        if self.exploration_type == 'e-greedy':
            action = self._epsilon_greedy_sample_action(state)
        else:
            raise NotImplementedError
        return action

    def predict_action(self, state):
        '''Select an action for evaluation
        Args:
            state (array): state
        Returns:
            action (int): action
        '''
        if self.exploration_type == 'e-greedy':
            action = self._epsilon_greedy_predict_action(state)
        else:
            raise NotImplementedError
        return action
    def _epsilon_greedy_sample_action(self, state):
        '''Sample an action with the epsilon-greedy policy
        Args:
            state (array): state
        Returns:
            action (int): action
        '''
        self.sample_count += 1
        # epsilon is decayed (here exponentially) to balance exploration and exploitation
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.sample_count / self.epsilon_decay)
        if np.random.uniform(0, 1) > self.epsilon:
            action = np.argmax(self.Q_table[str(state)])  # exploit: action with the largest Q value
        else:
            action = np.random.choice(self.n_actions)  # explore: random action
        return action

    def _epsilon_greedy_predict_action(self, state):
        '''Greedy action prediction (no exploration)
        Args:
            state (array): state
        Returns:
            action (int): action
        '''
        action = np.argmax(self.Q_table[str(state)])
        return action
    def update(self, state, action, reward, next_state, done):
        '''Update the Q-table with one transition
        Args:
            state (array): current state
            action (int): current action
            reward (float): reward signal
            next_state (array): next state
            done (bool): whether a terminal state was reached
        '''
        Q_predict = self.Q_table[str(state)][action]
        if done:  # terminal state: no bootstrapping
            Q_target = reward
        else:
            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
    def save_model(self, path):
        '''Save the model
        Args:
            path (str): directory to save the model in
        '''
        import dill
        from pathlib import Path
        # make sure the target directory exists
        Path(path).mkdir(parents=True, exist_ok=True)
        torch.save(
            obj=self.Q_table,
            f=path + "Qlearning_model.pkl",
            pickle_module=dill
        )
        print("Model saved!")

    def load_model(self, path):
        '''Load the model from the given path
        Args:
            path (str): directory the model was saved in
        '''
        import dill
        self.Q_table = torch.load(f=path + 'Qlearning_model.pkl', pickle_module=dill)
        print("Model loaded!")
def train(cfg, env, agent):
    '''Training loop
    '''
    print("Start training!")
    rewards = []  # record the return of every episode
    steps = []
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # cumulative reward of this episode
        ep_step = 0
        state, info = env.reset(seed=cfg.seed)  # reset the environment and get the initial state
        for _ in range(cfg.max_steps):
            ep_step += 1
            action = agent.sample_action(state)  # sample an action with the e-greedy policy
            if cfg.new_step_api:
                next_state, reward, terminated, truncated, info = env.step(action)  # new-style step API (gymnasium): termination and truncation flags are separate
            else:
                next_state, reward, terminated, info = env.step(action)  # old-style gym step API
            agent.update(state, action, reward, next_state, terminated)  # update the agent
            state = next_state  # move to the next state
            ep_reward += reward  # accumulate reward
            if terminated:
                break
        steps.append(ep_step)
        rewards.append(ep_reward)
        if (i_ep + 1) % 10 == 0:
            print(f"Episode: {i_ep+1}/{cfg.train_eps}, reward: {ep_reward:.2f}")
    print("Training finished!")
    return {'rewards': rewards}
def test(cfg, env, agent):
    '''Evaluation loop (greedy policy, no exploration)
    '''
    print("Start testing!")
    rewards = []  # record the return of every episode
    steps = []
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # cumulative reward of this episode
        ep_step = 0
        state, info = env.reset(seed=cfg.seed)  # reset the environment and get the initial state
        for _ in range(cfg.max_steps):
            if cfg.render:
                env.render()
            ep_step += 1
            action = agent.predict_action(state)  # greedy action
            if cfg.new_step_api:
                next_state, reward, terminated, truncated, info = env.step(action)  # new-style step API (gymnasium)
            else:
                next_state, reward, terminated, info = env.step(action)  # old-style gym step API
            state = next_state  # move to the next state
            ep_reward += reward  # accumulate reward
            if terminated:
                break
        steps.append(ep_step)
        rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg.test_eps}, reward: {ep_reward:.2f}")
    print("Testing finished!")
    env.close()
    return {'rewards': rewards}
def all_seed(env, seed=1):
    '''Seed everything for reproducibility
    '''
    env.reset(seed=seed)  # seed the environment
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # seed torch on CPU
    torch.cuda.manual_seed(seed)  # seed torch on GPU
    os.environ['PYTHONHASHSEED'] = str(seed)  # seed python hashing
    # cudnn settings for determinism
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
def env_config(cfg):
    '''Create and seed the environment, and record its dimensions in cfg
    '''
    if cfg.render:
        env = gym.make(cfg.env_name, render_mode=cfg.render_mode)
    else:
        env = gym.make(cfg.env_name)  # create the environment
    all_seed(env, seed=cfg.seed)
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    print(f"State space size: {n_states}, action space size: {n_actions}")
    # store n_states and n_actions in cfg
    setattr(cfg, 'n_states', n_states)
    setattr(cfg, 'n_actions', n_actions)
    setattr(cfg, 'action_space', env.action_space)
    return env
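For reference, CliffWalking-v0 is a 4 x 12 grid world, so the printout above should report 48 discrete states and 4 discrete actions (0 = up, 1 = right, 2 = down, 3 = left). A quick standalone check:

import gymnasium as gym

check_env = gym.make("CliffWalking-v0")
print(check_env.observation_space)  # Discrete(48): 4 x 12 grid cells
print(check_env.action_space)       # Discrete(4)
check_env.close()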
class Config:
    def __init__(self) -> None:
        ## general parameters
        self.env_name = "CliffWalking-v0"  # name of environment
        self.new_step_api = True  # whether to use new step api of gym
        self.wrapper = None  # wrapper of environment
        self.render = True  # whether to render environment
        self.render_mode = "human"  # render mode: "human" or "rgb_array"
        self.algo_name = "Qlearning"  # name of algorithm
        self.mode = "train"  # train or test
        self.mp_backend = "mp"  # multiprocessing backend: "ray" or "mp" (multiprocessing), default "mp"
        self.seed = 1  # random seed
        self.device = "cuda"  # device to use
        self.train_eps = 500  # number of episodes for training
        self.test_eps = 10  # number of episodes for testing
        self.eval_eps = 10  # number of episodes for evaluation
        self.eval_per_episode = 5  # evaluation per episode
        self.max_steps = 1000  # max steps for each episode
        self.load_checkpoint = False
        self.load_path = "tasks"  # path to load model
        self.show_fig = False  # show figure or not
        self.save_fig = True  # save figure or not
        ## Q-learning parameters
        self.epsilon_start = 0.95  # initial epsilon
        self.epsilon_end = 0.01  # final epsilon
        self.epsilon_decay = 300  # epsilon decay rate
        self.gamma = 0.90  # reward discount factor
        self.lr = 0.1  # learning rate
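With these defaults, the agent's exponential schedule epsilon_end + (epsilon_start - epsilon_end) * exp(-t / epsilon_decay) drops quickly. A rough sketch of its value after t sampled actions:

import math

for t in (0, 300, 900, 3000):
    eps = 0.01 + (0.95 - 0.01) * math.exp(-t / 300)
    print(t, round(eps, 3))  # 0 -> 0.95, 300 -> 0.356, 900 -> 0.057, 3000 -> 0.01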
def smooth(data, weight=0.9):
    '''Smooth a curve, similar to the smoothing used in TensorBoard
    '''
    last = data[0]
    smoothed = []
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point  # exponential moving average
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed
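A quick check of the smoothing behaviour with the default weight of 0.9:

print(smooth([0, 1, 2, 3]))  # approximately [0.0, 0.1, 0.29, 0.561]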
def plot_rewards(rewards, title="learning curve"):
    sns.set()
    plt.figure()  # create a new figure so several plots can coexist
    plt.title(f"{title}")
    plt.xlim(0, len(rewards))  # set the x-axis range
    plt.xlabel('episodes')
    plt.plot(rewards, label='rewards')
    plt.plot(smooth(rewards), label='smoothed')
    plt.legend()
    plt.show()
# build the config
cfg = Config()
cfg.render = False
# training
env = env_config(cfg)
agent = Qlearning(cfg)
res_dic = train(cfg, env, agent)
agent.save_model('./')
plot_rewards(res_dic['rewards'], title=f"training curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}")
# testing
cfg.render = True
env = env_config(cfg)
res_dic = test(cfg, env, agent)
plot_rewards(res_dic['rewards'], title=f"testing curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}")  # plot the results
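If you want to reload the saved Q-table later and only run evaluation, skipping training, a minimal sketch reusing the pieces above could look like this:

cfg = Config()
cfg.render = False              # set True to watch a human-rendered rollout
env = env_config(cfg)
agent = Qlearning(cfg)
agent.load_model('./')          # reads ./Qlearning_model.pkl written by save_model
res_dic = test(cfg, env, agent)
plot_rewards(res_dic['rewards'], title=f"evaluation curve of {cfg.algo_name} for {cfg.env_name}")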