【蘑菇书】蒙特卡洛算法

目录

1、定义算法

2、定义训练

3、定义环境

4、设置参数

5、开始训练


1、定义算法

import numpy as np
from collections import defaultdict
class FisrtVisitMC:
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    The class name keeps its original spelling so existing callers keep working.

    Args:
        cfg: configuration object; this class reads ``cfg.n_actions``,
            ``cfg.epsilon`` and ``cfg.gamma`` from it.

    Attributes are keyed by ``str(state)`` so that any state object with a
    stable string form can be used as a table key.
    """
    def __init__(self,cfg):
        self.n_actions = cfg.n_actions
        self.epsilon = cfg.epsilon
        self.gamma = cfg.gamma
        # Q-values per state; unseen states lazily get a zero vector on first write.
        self.Q_table = defaultdict(lambda: np.zeros(cfg.n_actions))
        self.returns_sum = defaultdict(float)  # running sum of returns per (state, action)
        self.returns_count = defaultdict(float)  # number of returns recorded per (state, action)

    def sample_action(self,state):
        """Sample an action epsilon-greedily (used while training).

        States that have never been updated get a uniformly random action.
        """
        state = str(state)
        # Membership test on purpose: indexing the defaultdict would insert the key.
        if state not in self.Q_table:
            return np.random.randint(0,self.n_actions)
        best_action = np.argmax(self.Q_table[state])
        # Every action gets epsilon / n; the greedy action gets the remaining mass.
        action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
        action_probs[best_action] += (1.0 - self.epsilon)
        return np.random.choice(np.arange(len(action_probs)), p=action_probs)

    def predict_action(self,state):
        """Return the greedy action (used while evaluating).

        States that have never been updated get a uniformly random action.
        """
        state = str(state)
        if state not in self.Q_table:
            return np.random.randint(0,self.n_actions)
        return np.argmax(self.Q_table[state])

    def update(self,one_ep_transition):
        """Update the Q-table from one episode using first-visit returns.

        Args:
            one_ep_transition: sequence of transitions whose first three items
                are ``(state, action, reward)``, in the order they occurred.
        """
        # Walk the episode backwards so the discounted return of every step is
        # obtained in a single pass: G_t = r_t + gamma * G_{t+1}.
        G = 0.0
        first_visit_return = {}
        for x in reversed(one_ep_transition):
            G = x[2] + self.gamma * G
            # Earlier time steps are processed later and overwrite the entry, so
            # each pair ends up holding the return from its FIRST occurrence.
            first_visit_return[(str(x[0]), x[1])] = G
        for sa_pair, G in first_visit_return.items():
            state, action = sa_pair
            # Q is the average first-visit return over all episodes seen so far.
            self.returns_sum[sa_pair] += G
            self.returns_count[sa_pair] += 1.0
            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]

2、定义训练

def train(cfg,env,agent):
    """Run the training loop and return the per-episode rewards.

    Args:
        cfg: configuration object; reads ``env_name``, ``algo_name``, ``device``,
            ``train_eps``, ``max_steps`` and ``seed``.
        env: environment exposing ``reset(seed=...)`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, terminated, info)``.
        agent: agent exposing ``sample_action(state)`` and ``update(transitions)``.

    Returns:
        dict with key ``"rewards"`` mapping to the list of episode rewards.
    """
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # total reward of every episode
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated in the current episode
        one_ep_transition = []
        state = env.reset(seed=cfg.seed)  # start a new episode
        for _ in range(cfg.max_steps):
            action = agent.sample_action(state)  # exploratory action from the agent
            next_state, reward, terminated, _info = env.step(action)
            one_ep_transition.append((state, action, reward))  # record the transition
            state = next_state
            ep_reward += reward
            if terminated:
                break
        # Monte Carlo returns are only defined over the finished episode, so the
        # agent is updated once here instead of after every step; updating on
        # each partial prefix would record truncated returns as real samples.
        if one_ep_transition:
            agent.update(one_ep_transition)
        rewards.append(ep_reward)
        print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}")
    print('完成训练!')
    return {"rewards":rewards}
def test(cfg,env,agent):
    print('开始测试!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # 记录所有回合的奖励
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # 记录每个episode的reward
        state = env.reset(seed=cfg.seed)  # 重置环境, 重新开一局(即开始新的一个回合)
        for _ in range(cfg.max_steps):
            action = agent.predict_action(state)  # 根据算法选择一个动作
            next_state, reward, terminated, info = env.step(action)  # 与环境进行一个交互
            state = next_state  # 更新状态
            ep_reward += reward
            if terminated:
                break
        rewards.append(ep_reward)
        print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
    print('完成测试!')
    return {"rewards":rewards}

3、定义环境

import sys,os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))
import torch
import numpy as np
import random
from envs.racetrack import RacetrackEnv

def all_seed(env,seed = 1):
    ''' omnipotent seed for RL, attention the position of seed function, you'd better put it just following the env create function
    '''
    if seed == 0:
        return
    # print(f"seed = {seed}")
    env.seed(seed) # env config
    np
  • 5
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
蘑菇分类算法是一种常用的机器学习算法,可以用来对蘑菇进行分类,判断其是否有毒性。以下是用Python实现蘑菇分类算法的基本步骤: 1. 数据预处理:读取数据集,将数据集分为训练集和测试集,并进行标准化处理。 ```python import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler # 读取数据集 data = pd.read_csv('mushrooms.csv') # 分离特征和标签 X = data.drop('class', axis=1) y = data['class'] # 将数据集分为训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # 标准化处理 sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) ``` 2. 模型训练:使用逻辑回归算法进行模型训练。 ```python from sklearn.linear_model import LogisticRegression # 创建逻辑回归模型 classifier = LogisticRegression(random_state=0) # 模型训练 classifier.fit(X_train, y_train) ``` 3. 模型评估:使用测试集对模型进行评估。 ```python from sklearn.metrics import confusion_matrix, accuracy_score # 模型预测 y_pred = classifier.predict(X_test) # 混淆矩阵 cm = confusion_matrix(y_test, y_pred) print(cm) # 准确率 accuracy = accuracy_score(y_test, y_pred) print('Accuracy: {:.2f}%'.format(accuracy * 100)) ``` 完整代码如下: ```python import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.metrics import confusion_matrix, accuracy_score # 读取数据集 data = pd.read_csv('mushrooms.csv') # 分离特征和标签 X = data.drop('class', axis=1) y = data['class'] # 将数据集分为训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # 标准化处理 sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # 创建逻辑回归模型 classifier = LogisticRegression(random_state=0) # 模型训练 classifier.fit(X_train, y_train) # 模型预测 y_pred = classifier.predict(X_test) # 混淆矩阵 cm = confusion_matrix(y_test, y_pred) print(cm) # 准确率 accuracy = accuracy_score(y_test, y_pred) print('Accuracy: {:.2f}%'.format(accuracy * 100)) ``` 注意:这只是一个简单的示例,实际应用中需要根据具体情况进行调整和优化。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

资源存储库

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值