1. Defining the Algorithm
import numpy as np
from collections import defaultdict

class FirstVisitMC:
    ''' On-policy first-visit Monte Carlo control
    '''
    def __init__(self, cfg):
        self.n_actions = cfg.n_actions
        self.epsilon = cfg.epsilon  # exploration rate of the epsilon-greedy policy
        self.gamma = cfg.gamma  # discount factor
        self.Q_table = defaultdict(lambda: np.zeros(cfg.n_actions))
        self.returns_sum = defaultdict(float)  # sum of returns for each (state, action) pair
        self.returns_count = defaultdict(float)  # number of first visits to each (state, action) pair

    def sample_action(self, state):
        ''' Sample an action with the epsilon-greedy policy (used during training) '''
        state = str(state)
        if state in self.Q_table.keys():
            best_action = np.argmax(self.Q_table[state])
            # every action gets epsilon / n_actions probability; the greedy action gets the rest
            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
            action_probs[best_action] += (1.0 - self.epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        else:
            # unseen state: fall back to a uniformly random action
            action = np.random.randint(0, self.n_actions)
        return action

    def predict_action(self, state):
        ''' Act greedily with respect to the Q-table (used during testing) '''
        state = str(state)
        if state in self.Q_table.keys():
            action = np.argmax(self.Q_table[state])
        else:
            # unseen state: fall back to a uniformly random action
            action = np.random.randint(0, self.n_actions)
        return action

    def update(self, one_ep_transition):
        ''' First-visit MC update from one episode of (state, action, reward) transitions '''
        # Find all (state, action) pairs visited in this episode.
        # Each state is converted to a string so that it can be used as a dict key.
        sa_in_episode = set([(str(x[0]), x[1]) for x in one_ep_transition])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurrence_idx = next(i for i, x in enumerate(one_ep_transition)
                                        if str(x[0]) == state and x[1] == action)
            # Sum up all discounted rewards from the first occurrence onwards
            G = sum([x[2] * (self.gamma ** i) for i, x in enumerate(one_ep_transition[first_occurrence_idx:])])
            # Q(s, a) is the average of the first-visit returns over all sampled episodes
            self.returns_sum[sa_pair] += G
            self.returns_count[sa_pair] += 1.0
            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
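
To see the update in action, here is a minimal sanity check; the tiny Config stub and the two-step episode are made up for illustration. With gamma = 0.9, the first-visit return of the first pair is G = -1.0 + 0.9 * 10.0 = 8.0, which becomes its Q-value after one episode.

class Config:  # hypothetical stub, just the fields FirstVisitMC reads
    n_actions = 2
    epsilon = 0.1
    gamma = 0.9

agent = FirstVisitMC(Config())
episode = [((0, 0), 1, -1.0), ((0, 1), 1, 10.0)]  # (state, action, reward) triples
agent.update(episode)
print(agent.Q_table[str((0, 0))])  # [0. 8.]   G = -1.0 + 0.9 * 10.0
print(agent.Q_table[str((0, 1))])  # [0. 10.]  G = 10.0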
2. Defining Training
def train(cfg, env, agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record the reward of every episode
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated over one episode
        one_ep_transition = []  # (state, action, reward) transitions of this episode
        state = env.reset(seed=cfg.seed)  # reset the environment, i.e. start a new episode
        for _ in range(cfg.max_steps):
            action = agent.sample_action(state)  # sample an action with the epsilon-greedy policy
            next_state, reward, terminated, info = env.step(action)  # take one step in the environment
            one_ep_transition.append((state, action, reward))  # store the transition
            state = next_state  # update the state
            ep_reward += reward
            if terminated:
                break
        agent.update(one_ep_transition)  # MC update once the full episode has been collected
        rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.1f}")
    print('Finish training!')
    return {"rewards": rewards}
def test(cfg, env, agent):
    print('Start testing!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record the reward of every episode
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # reward accumulated over one episode
        state = env.reset(seed=cfg.seed)  # reset the environment, i.e. start a new episode
        for _ in range(cfg.max_steps):
            action = agent.predict_action(state)  # choose the greedy action
            next_state, reward, terminated, info = env.step(action)  # take one step in the environment
            state = next_state  # update the state
            ep_reward += reward
            if terminated:
                break
        rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
    print('Finish testing!')
    return {"rewards": rewards}
3. Defining the Environment
import sys,os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))
import torch
import numpy as np
import random
from envs.racetrack import RacetrackEnv
def all_seed(env, seed=1):
    ''' Seed every source of randomness used in RL.
        Note: call this right after the environment is created.
    '''
    if seed == 0:
        return
    env.seed(seed)  # env config
    np.random.seed(seed)  # numpy config
    random.seed(seed)  # python random config
    torch.manual_seed(seed)  # torch config
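
Finally, a short usage sketch wiring the pieces together. The Config fields mirror the attributes referenced by the code above; the concrete values (and the assumption that RacetrackEnv takes no constructor arguments) are illustrative, not tuned settings.

class Config:  # hypothetical config; field names match what the code above reads
    env_name = 'Racetrack'
    algo_name = 'FirstVisitMC'
    device = 'cpu'
    seed = 1
    n_actions = 9  # assumption: 3 x 3 acceleration choices in RacetrackEnv
    epsilon = 0.15
    gamma = 0.9
    train_eps = 400
    test_eps = 20
    max_steps = 200

cfg = Config()
env = RacetrackEnv()
all_seed(env, seed=cfg.seed)  # seed immediately after creating the env, as noted above
agent = FirstVisitMC(cfg)
train_res = train(cfg, env, agent)
test_res = test(cfg, env, agent)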