基于“蘑菇书”的强化学习知识点（十三）：第三章的代码：MonteCarlo.ipynb及其涉及的其他代码的更新以及注解（gym版本＞= 0.26）（一）

本文链接：https://blog.csdn.net/xzs1210652636/article/details/145865620
第三章的代码：MonteCarlo.ipynb及其涉及的其他代码的更新以及注解（gym版本＞= 0.26）（一）

- 摘要
摘要

本系列知识点讲解基于蘑菇书EasyRL中的内容进行详细的疑难点分析！具体内容请阅读蘑菇书EasyRL！
在MonteCarlo.ipynb目录下面创建envs文件夹，然后下载racetrack.py和track.txt放到envs文件夹中。
import numpy as np
'''来自 Python 的 collections 模块，
它可以在访问不存在的键时返回一个默认值（这里默认返回一个固定格式的数组或浮点数）。'''
from collections import defaultdict

class FirstVisitMC:
    ''' On-Policy First-Visit MC Control
    这个类实现的是 on‑policy（即智能体遵循当前策略）首次访问蒙特卡罗控制算法。'''
    def __init__(self,cfg):
        '''
        self.n_actions：从配置对象中获得动作总数。例如，如果环境中有 4 个动作，则 n_actions=4。
        '''
        self.n_actions = cfg.n_actions
        '''
        self.epsilon：用于 ε‑greedy 策略中的探索概率（虽然在 MC 中常用“随机性”策略，
                                                    这里依然保留该参数）。
        '''
        self.epsilon = cfg.epsilon
        '''
        self.gamma：折扣因子，用于计算未来奖励的折扣和。比如 γ=0.9 表示后续奖励权重逐渐衰减。
        '''
        self.gamma = cfg.gamma 
        '''
        self.Q_table：
        使用 defaultdict 构造。
        当访问一个新状态时，默认返回一个形状为 (n_actions,) 的零数组。
        例如，对于状态 "0"（转换为字符串后），如果未见过，
        则 Q_table["0"] 初始化为 np.zeros(4) → [0, 0, 0, 0]。
        '''
        self.Q_table = defaultdict(lambda: np.zeros(cfg.n_actions))
        '''self.returns_sum：用于保存每个 (state, action) 对在所有回合中累积的回报之和。'''
        self.returns_sum = defaultdict(float) # 保存return之和
        '''self.returns_count：用于记录每个 (state, action) 对被访问的次数。'''
        self.returns_count = defaultdict(float)
        
    def sample_action(self,state):
        if isinstance(state, tuple):
            state = state[0]
        else:
            state = state
        '''
        状态键转换
        将传入的 state 转换为字符串（state = str(state)），这样可以作为字典的键。
        例如：若 state 为数字 0，则转换为 "0"。
        '''
        state = str(state)
        '''
        判断状态是否已存在
        如果状态在 Q_table 的键中存在，则说明之前已经访问过该状态，已有 Q 值信息。
        否则，直接随机采样一个动作（均匀随机）。
        '''
        if state in self.Q_table.keys():
            '''
            取 Q_table[state] 中最大的索引。
            例如：若 Q_table["0"] = [0, 2, 1, 0]，则 best_action = 1。
            '''
            best_action = np.argmax(self.Q_table[state])
            '''
            初始化每个动作的基础概率为 ε/n_actions。
            例如，若 ε=0.2, n_actions=4，则每个动作基础概率为 0.05。
            '''
            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
            '''
            对于最佳动作，加上额外的概率 1-ε，使得该动作的总概率为 0.05+(1-0.2)=0.05+0.8=0.85。
            '''
            """
            整体概率分布示例：假设 n_actions=4，ε=0.2，最佳动作为 1，
                             则概率分布为 [0.05, 0.85, 0.05, 0.05]。
            """
            action_probs[best_action] += (1.0 - self.epsilon)
            '''使用 np.random.choice 根据构造的概率分布采样动作，返回采样得到的动作索引。'''
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        else:
            '''
            如果状态不在 Q_table 中，
            则直接用 np.random.randint(0, self.n_actions) 均匀随机选择一个动作。
            '''
            action = np.random.randint(0,self.n_actions)
        return action
    
    def predict_action(self,state):
        '''
        用于测试阶段选择动作，一般采用确定性策略。
        '''
        if isinstance(state, tuple):
            state = state[0]
        else:
            state = state
        state = str(state)
        '''
        虽然代码中计算了 action_probs，但最终直接用 np.argmax(self.Q_table[state]) 返回最佳动作。
        例如：若 Q_table["0"] = [0, 2, 1, 0]，predict_action 返回 1。
        '''
        if state in self.Q_table.keys():
            best_action = np.argmax(self.Q_table[state])
            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
            action_probs[best_action] += (1.0 - self.epsilon)
            action = np.argmax(self.Q_table[state])
        else:
            action = np.random.randint(0,self.n_actions)
        return action
    
    def update(self,one_ep_transition):
        """
        参数 one_ep_transition 是一个列表，
        每个元素通常为一个元组 (state, action, reward, ...)，代表一次完整回合中按时间顺序的转移记录。
        """
        # Find all (state, action) pairs we've visited in this one_ep_transition
        # We convert each state to a tuple so that we can use it as a dict key
        '''
        作用：
        遍历整个回合的转移，提取出所有唯一的 (state, action) 对（将 state 转换为字符串作为字典键）。
        举例：
        假设 one_ep_transition 为 [(0, 1, 0), (1, 2, 1), (2, 0, 2), (1, 2, 3)]
        转换后得到集合： { ("0", 1), ("1", 2), ("2", 0) }
        注意 (1,2) 只出现一次（第一次出现），因为采用的是集合。
        '''
        sa_in_episode = set([(str(x[0]), x[1]) for x in one_ep_transition])
        for state, action in sa_in_episode:
            '''
            作用：
            对每个唯一的 (state, action) 对，准备更新 Q 值。
            sa_pair：
            用作字典键，记录该状态动作对的累计回报和访问次数。
            '''
            sa_pair = (state, action)
            # Find the first occurence of the (state, action) pair in the one_ep_transition
            '''
            作用：
            遍历回合中所有转移，找到第一次出现当前 (state, action) 对的索引。
            举例：
            对于 (state, action)=("1", 2) 在上例中，第一次出现在索引 1。
            '''
            '''
            one_ep_transition = [
                (0, 1, 0),   # 第0步：状态0, 动作1, 奖励0
                (1, 2, 1),   # 第1步：状态1, 动作2, 奖励1
                (2, 0, 2),   # 第2步：状态2, 动作0, 奖励2
                (1, 2, 3)    # 第3步：状态1, 动作2, 奖励3
            ]
            那么 enumerate(one_ep_transition) 会生成：
                第0次迭代返回 (0, (0, 1, 0))
                第1次迭代返回 (1, (1, 2, 1))
                第2次迭代返回 (2, (2, 0, 2))
                第3次迭代返回 (3, (1, 2, 3))
            对于每个 (index, x) 对，如果转移中状态和动作匹配指定的 state 和 action，
            生成器就返回该索引 i。
            next() 函数用于从生成器中取出第一个满足条件的值。
            '''
            first_occurence_idx = next(i for i,x in enumerate(one_ep_transition)
                                       if str(x[0]) == state and x[1] == action)
            # Sum up all rewards since the first occurance
            '''
            对于该 (state, action) 对，计算从第一次出现之后（包括第一次）的所有奖励的折扣和。
            假设 gamma=0.9，且从第一次出现 (state,action)=("1",2) 开始的转移为
            [(1, 2, 1), (2, 0, 2), (1, 2, 3)]
            则 G = 1×(0.9^0) + 2×(0.9^1) + 3×(0.9^2) = 1 + 1.8 + 2.43 = 5.23。
            '''
            G = sum([x[2]*(self.gamma**i) for i,x in enumerate(one_ep_transition[first_occurence_idx:])])
            # Calculate average return for this state over all sampled episodes
            self.returns_sum[sa_pair] += G
            self.returns_count[sa_pair] += 1.0
            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
            
            
def train(cfg,env,agent):
    print('开始训练！')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # 记录奖励
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # 记录每个回合的奖励
        one_ep_transition = []
        state = env.reset(seed=cfg.seed) # 重置环境,即开始新的回合
        for _ in range(cfg.max_steps):
            action = agent.sample_action(state)  # 根据算法采样一个动作
            
            # step_result = env.step(action)
            # if len(step_result) == 3:
            #     next_state, reward, done = step_result
            # elif len(step_result) >= 4:
            #     next_state, reward, done, *_ = step_result  # 忽略多余的返回值
            
            # if env_name != 'FrozenLakeEasy-v0':
            #     next_state, reward, terminated, info = env.step(action)  # 与环境进行一次动作交互
            # else:
            #     next_state, reward, terminated, truncated, info = env.step(action)  # 与环境进行一次动作交互
            
            next_state, reward, terminated, info = env.step(action)   # 与环境进行一次动作交互
            '''
            将当前的转移 (state, action, reward) 保存到 one_ep_transition 列表中。
            这样整个回合结束后，one_ep_transition 包含了该回合中所有步骤的信息。
            '''
            one_ep_transition.append((state, action, reward))  # 保存transitions
            '''
            调用 agent.update(one_ep_transition)，这通常是蒙特卡罗（MC）方法的更新过程。
            在 First-Visit MC 中，
            更新是基于该回合中每个 (state, action) 对首次出现后的累计回报进行平均更新 Q 值。
            注意：在 MC 算法中，
                 通常需要整一回合结束后才能更新 Q 值，因此 update() 函数的输入是整个回合的转移列表。
            '''
            agent.update(one_ep_transition)  # 更新智能体
            state = next_state  # 更新状态
            ep_reward += reward  
            if terminated:
                break
        rewards.append(ep_reward)
        print(f"回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward:.1f}")
    print('完成训练！')
    return {"rewards":rewards}


def test(cfg,env,agent):
    print('开始测试！')
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
    rewards = []  # 记录所有回合的奖励
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # 记录每个episode的reward
        state = env.reset(seed=cfg.seed)  # 重置环境, 重新开一局（即开始新的一个回合）
        for _ in range(cfg.max_steps):
            action = agent.predict_action(state)  # 根据算法选择一个动作
            next_state, reward, terminated, info = env.step(action)  # 与环境进行一个交互
            state = next_state  # 更新状态
            ep_reward += reward
            if terminated:
                break
        rewards.append(ep_reward)
        print(f"回合数：{i_ep+1}/{cfg.test_eps}, 奖励：{ep_reward:.1f}")
    print('完成测试！')
    return {"rewards":rewards}           
            

# 定义环境
import sys,os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))
import torch
import numpy as np
import random
from envs.racetrack import RacetrackEnv

def all_seed(env,seed = 1):
    ''' omnipotent seed for RL, attention the position of seed function, you'd better put it just following the env create function
    '''
    if seed == 0:
        return
    # print(f"seed = {seed}")
    '''
    作用：
    调用环境的 seed 方法固定环境内部的随机性。
    注意：
    这要求环境实现了 seed 方法（通常 Gym 环境都实现了该方法），
    以便在 env.reset() 时能生成确定的初始状态。
    '''
    if not hasattr(env, 'seed'):
        def seed_fn(self, seed=None):
            env.reset(seed=seed)
            return [seed]
        env.seed = seed_fn.__get__(env, type(env))
    env.seed(seed) # env config
    '''
    作用：
    分别固定 NumPy、Python 内置 random 模块和 PyTorch 的随机数种子（包括 CPU 和 GPU 的随机数生成器）。
    目的：
    保证整个程序中涉及到随机性操作（例如采样、初始化权重等）都基于同一个种子，从而使实验结果可复现。
    '''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed) # config for CPU
    torch.cuda.manual_seed(seed) # config for GPU
    '''
    作用：
    设置环境变量 PYTHONHASHSEED，使得 Python 内部的哈希函数具有确定性。
    目的：
    防止由于哈希随机化导致数据结构（如字典）的顺序变化，影响实验结果。
    '''
    os.environ['PYTHONHASHSEED'] = str(seed) # config for python scripts
    # config for cudnn
    '''
    作用：
    配置 PyTorch 的 CuDNN 后端：
    将 CuDNN 设为确定性模式，防止其内部算法的随机性。
    关闭 benchmark 模式，防止自动选择非确定性算法。
    关闭 CuDNN 启用标志，确保使用 CPU 或避免不确定性。
    注意：
    这些设置通常会使计算变慢，但能保证结果一致。
    '''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    
def env_agent_config(cfg):
    '''创建环境和智能体
    '''    
    """
    作用：
    通过调用 RacetrackEnv() 创建一个赛道环境实例。
    举例：
    如果 RacetrackEnv 是一个 Gym 环境，那么创建后的 env 将具有标准 Gym 接口，
    如 reset(), step() 等。
    """
    env = RacetrackEnv()  # 创建环境
    '''
    作用：
    调用 all_seed 函数，用配置中的 seed 固定环境及其他随机数生成器的种子。
    举例：
    如果 cfg.seed=1，则所有随机操作将以种子 1 固定，从而保证每次运行的初始状态和随机采样一致。
    '''
    all_seed(env,seed=cfg.seed) 
    '''
    作用：
    - 从环境的 observation_space 获取状态空间的维度。
        这里假设环境的 observation_space 是一个形状为 (n, ...) 的数组，
        此处取第 0 维度作为状态维度。
    动作维度：
    - 使用 env.action_space.n 获取离散动作空间中动作的总数。
    举例：
    - 如果 RacetrackEnv 的 observation_space 是一个向量，
    且有 100 个状态，那么 n_states=100；如果动作空间为离散，且有 4 个动作，则 n_actions=4。
    '''
    n_states = env.observation_space.shape[0]  # 状态空间维度
    n_actions = env.action_space.n # 动作空间维度
    setattr(cfg, 'n_states', n_states) # 将状态维度添加到配置参数中
    setattr(cfg, 'n_actions', n_actions) # 将动作维度添加到配置参数中
    agent = FirstVisitMC(cfg)
    return env,agent            
            
            
# 设置参数       
import torch
import matplotlib.pyplot as plt
import seaborn as sns
class Config:
    '''配置参数
    '''
    def __init__(self):
        self.env_name = 'Racetrack-v0' # 环境名称
        self.algo_name = "FirstVisitMC" # 算法名称
        self.train_eps = 400 # 训练回合数
        self.test_eps = 20 # 测试回合数
        self.max_steps = 200 # 每个回合最大步数
        self.epsilon = 0.1 # 贪婪度
        self.gamma = 0.9 # 折扣因子
        self.lr = 0.5 # 学习率
        self.seed = 1 # 随机种子
        if torch.cuda.is_available(): # 是否使用GPUs
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        # self.device = torch.device('cpu')
def smooth(data, weight=0.9):  
    '''用于平滑曲线
    '''
    last = data[0]  # First value in the plot (first timestep)
    smoothed = list()
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point  # 计算平滑值
        smoothed.append(smoothed_val)                    
        last = smoothed_val                                
    return smoothed

def plot_rewards(rewards,title="learning curve"):
    sns.set()
    plt.figure()  # 创建一个图形实例，方便同时多画几个图
    plt.title(f"{title}")
    # plt.xlim(0, len(rewards), 10)  # 设置x轴的范围
    # plt.xlim(0, len(rewards))  
    plt.xticks(range(0, len(rewards), 10)) 
    plt.xlabel('epsiodes')
    plt.plot(rewards, label='rewards')
    plt.plot(smooth(rewards), label='smoothed')
    plt.legend()            
            
            

# 获取参数
cfg = Config() 
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
 
plot_rewards(res_dic['rewards'], title=f"training curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}")  
# 测试
res_dic = test(cfg, env, agent)
plot_rewards(res_dic['rewards'], title=f"testing curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}")  # 画出结果