![](https://img-blog.csdnimg.cn/20201014180756754.png?x-oss-process=image/resize,m_fixed,h_64,w_64)
深度强化学习原理与实践代码清单
图灵保佑
这个作者很懒,什么都没留下…
展开
-
2021-11-07
import torch class SharedAdam(torch.optim.Adam): # params--待优化参数的iterable或者是定义了参数组的dict # lr--学习率(默认:1e-3) # betas--用于计算梯度以及梯度平方的运行平均值的系数(默认:0.9,0.999) # eps--为了增加数值计算的稳定性而加到分母里的项(默认:1e-8) # weight_decay--权重衰减(L2惩罚)(默认: 0) def __i原创 2021-11-07 10:28:45 · 434 阅读 · 0 评论 -
2021-11-1 9
import math import random import gym import numpy as np import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.distributions import Normal import matplotlib.pyplot as plt use_cuda = torch.cuda.is_availabl原创 2021-11-01 20:34:17 · 177 阅读 · 0 评论 -
2021-10-31 8
网络模型根据输入可以更改形式 # Hyper Parameters BATCH_SIZE = 32 LR = 0.01 # learning rate EPSILON = 0.9 # greedy policy GAMMA = 0.9 # reward discount TARGET_REPLACE_ITER = 100 # target update frequency MEMORY_CAPACITY =原创 2021-10-31 10:37:45 · 100 阅读 · 0 评论 -
2021-10-25 7.18
class BPModel(nn.Module): def __init__(self, n_x, n_y): super(BPModel, self).__init__() self.layer1 = nn.Linear(n_x, 10) self.layer2 = nn.Linear(10, 10) self.layer3 = nn.Linear(10, n_y) def forward(self, x):原创 2021-10-25 10:49:25 · 96 阅读 · 0 评论 -
2021-10-24 7.7
class Monte_Carlo_Policy_Gradient(): def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95): self.nA = env.action_space.n # 动作空间数量 self.nS = env.observation_space.shape[0] # 状态空间数量 self.env = env # 环原创 2021-10-24 16:07:22 · 78 阅读 · 0 评论 -
2021-10-23 6.9
# 值函数近似器 class Estimator(): def __init__(self): state_examples = np.array([env.observation_space.sample() for x in range(10000)]) # 对环境进行10000次采样,便于后续对状态state抽取特征 # 特征处理1:归一化数据状态为零均值和单位方差 self.scaler = Scaler() self.sca原创 2021-10-23 09:41:18 · 192 阅读 · 0 评论 -
2021-10-17 5.13
class Q_learning(): def __init__(self, env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, n_bins=10): self.nA = env.action_space.n # 动作空间数 self.nS = env.observation_space.shape[0] # 状态空间数 self.env = env # 环境 sel原创 2021-10-19 15:16:10 · 78 阅读 · 0 评论 -
2021-10-17 5.7
class SARSA(): def __init__(self, env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, n_bins=10): self.nA = env.action_space.n # 动作空间数 self.nS = env.observation_space.shape[0] # 状态空间数 self.env = env # 环境 self.num原创 2021-10-19 14:58:34 · 94 阅读 · 0 评论 -
2021-10-17 4.13
def create_random_policy(nA): A = np.ones(nA, dtype=float) / nA # 创建随机策略 def policy_fn(observation): # 策略函数 return A return policy_fn def create_greedy_policy(Q): def policy_fn(state): # 创建贪婪策略 A = np.zeros_like(Q[原创 2021-10-18 21:18:39 · 91 阅读 · 0 评论 -
2021-10-17 4.10
# 贪婪算法 def epsilon_greedy_policy(q, epsilon, nA): def __policy__(state): A_ = np.ones(nA, dtype=float) # 初始化动作概率 A = A_ * epsilon / nA # 以epsilon设定动作概率 best = np.argmax(q[state]) # 选取动作值函数中的最大值作为最优值 A[best] += 1 - ep原创 2021-10-18 17:01:19 · 62 阅读 · 0 评论 -
2021-10-17 3.9
# 对于给定状态计算各个动作a的期望 def clac_action_value(state, V, discount_factor=1.0): A = np.zeros(env.nA) # 初始化动作期望向量 for a in range(env.nA): # 遍历当前状态下所有动作 for prob, next_state, reward, done in env.P[state][a]: A[a] += prob * (reward + di原创 2021-10-18 15:01:34 · 80 阅读 · 0 评论 -
2021-10-17 3.5
def policy_iteration(env, policy, dicount_factor=1.0): while True: V = policy_evaluation(policy, env, dicount_factor) # 策略评估 policy_stable = True # policy标志位,当某个状态策略变化后,变为false for s in range(env.nS): # 遍历状态 old_a原创 2021-10-17 20:50:31 · 62 阅读 · 0 评论 -
2021-10-17 3.2
# discount_factor为折扣因子,theta为变化阈值,如果状态值函数的变化不大于阈值,则迭代停止 def policy_evaluation(policy, environment, discount_factor=1.0, theta=1.0): env = environment V = np.zeors(env.ns) # 初始化全0的值函数向量用于记录状态值 for _ in range(10000): # 迭代开始 delta = 0原创 2021-10-17 20:05:09 · 58 阅读 · 0 评论 -
2021-10-17 2.1
# nA为动作空间数量, T为进行的总时间步 def epsilon_greedy(nA, R, T, epsilon=0.6): r = 0 # 初始化累计奖励r N = [0] * nA # 对动作空间进行全零初始化 for _ in range(T): if np.random.rand() < epsilon: a = np.random.randint(q_value.shape[0]) else:原创 2021-10-17 17:37:14 · 70 阅读 · 0 评论