Reinforcement Learning in Practice 1 || The Cross-Entropy Method

import gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70

class Net(nn.Module):
    # For CartPole, obs_size is 4 (dimensions of the observation) and n_actions is 2 (push left / push right)
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)
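# Note: the final Linear layer returns raw scores (logits) rather than probabilities. No softmax is
# applied inside the network because nn.CrossEntropyLoss (used in the training loop below) combines
# log-softmax with the negative log-likelihood loss and therefore expects raw logits.
# A quick shape check (a sketch, assuming a randomly initialized net and a CartPole-sized observation):
#
#   net = Net(obs_size=4, hidden_size=HIDDEN_SIZE, n_actions=2)
#   dummy_obs = torch.randn(1, 4)          # batch of one fake observation
#   logits = net(dummy_obs)                # shape [1, 2], unnormalized action scores
#   probs = nn.Softmax(dim=1)(logits)      # softmax is only applied when sampling actions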

# Named tuple holding one complete episode: its total (undiscounted) reward and the list of steps taken
Episode = namedtuple('Episode', field_names=['reward', 'steps'])

# Named tuple holding a single step within an episode: the observation seen and the action chosen
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])
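# A small illustration of how these tuples nest (a sketch with made-up numbers):
#
#   step = EpisodeStep(observation=np.zeros(4), action=1)
#   episode = Episode(reward=15.0, steps=[step])
#   episode.reward              # -> 15.0
#   episode.steps[0].action     # -> 1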

# Generator that yields batches of batch_size episodes; each episode is a tuple (total_reward, [(obs1, action1), (obs2, action2), ...])
def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0    # running total reward of the current episode
    episode_steps = []      # list of EpisodeStep(observation, action) for the current episode
    obs = env.reset()       # initial observation (classic Gym API: reset() returns the observation itself)
    sm = nn.Softmax(dim=1)  # softmax over the action dimension, to turn the network's logits into probabilities

    while True:
        obs_v = torch.FloatTensor([obs])    # wrap the observation into a 1 x obs_size tensor
        act_probs_v = sm(net(obs_v))    # forward pass + softmax: a 1 x n_actions matrix of action probabilities
        act_probs = act_probs_v.data.numpy()[0]     # take the first (and only) row as a plain NumPy probability vector
        action = np.random.choice(len(act_probs), p=act_probs)  # sample an action index (0 or 1) according to those probabilities

        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        step = EpisodeStep(observation=obs, action=action)
        episode_steps.append(step)

        if is_done:     # the episode has finished
            e = Episode(reward=episode_reward, steps=episode_steps) # one episode: its total reward plus all (observation, action) pairs
            batch.append(e)     # append the finished episode to the batch

            # reset the per-episode accumulators and start a new episode
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:    # once batch_size episodes have been collected
                yield batch     # yield suspends the generator here; the next request for a batch resumes from this point
                batch = []      # start collecting a fresh batch of Episode(reward, steps) tuples

        obs = next_obs
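# Example of consuming the generator (a sketch, assuming env and net have been created as in the
# __main__ block below): each item it yields is a list of BATCH_SIZE Episode tuples.
#
#   for batch in iterate_batches(env, net, BATCH_SIZE):
#       print([e.reward for e in batch])    # total reward of each of the 16 episodes
#       break                               # the generator never terminates on its own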

def filter_batch(batch, percentile):
    rewards = list(map(lambda s: s.reward, batch))      # total reward of every episode in the batch
    reward_bound = np.percentile(rewards, percentile)   # the given percentile of the rewards: with percentile=70, about 70% of episodes score at or below this boundary
    reward_mean = float(np.mean(rewards))   # mean reward, used only for monitoring

    train_obs = []
    train_act = []
    for reward, steps in batch:     # steps is the list of (observation, action) tuples of one episode
        if reward < reward_bound:   # discard episodes whose total reward falls below the boundary
            continue
        train_obs.extend(map(lambda step: step.observation, steps))     # keep all observations of this elite episode
        train_act.extend(map(lambda step: step.action, steps))          # keep the corresponding actions as training labels

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)

    # the returned observations/actions come only from episodes whose total reward is at or above the percentile boundary
    return train_obs_v, train_act_v, reward_bound, reward_mean
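# How the percentile filter behaves on a toy example (a sketch with made-up rewards):
#
#   rewards = [10.0, 20.0, 30.0, 40.0, 50.0]
#   np.percentile(rewards, 70)    # -> 38.0; episodes with reward < 38.0 are discarded,
#                                 #    so only the 40.0 and 50.0 episodes are used for training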

if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]   # dimensionality of the observation space (4 for CartPole)
    n_actions = env.action_space.n              # number of discrete actions (2 for CartPole)

    net = Net(obs_size, HIDDEN_SIZE, n_actions)

    objective = nn.CrossEntropyLoss()   # cross-entropy between the network's logits and the actions taken in elite episodes
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)

    writer = SummaryWriter(comment="-cartpole") # TensorBoard writer for logging the training curves

    # enumerate() yields the iteration index together with each batch produced by the generator
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)

        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)     # observations are the inputs; the actions of the elite episodes are the target labels
        loss_v.backward()
        optimizer.step()

        print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
            iter_no, loss_v.item(), reward_m, reward_b))
        writer.add_scalar("loss", loss_v.item(), iter_no)   # 参数为(名称, y, x)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)
        if reward_m > 199:  # CartPole-v0 caps an episode at 200 steps, so a mean reward near 200 means the task is effectively solved
            print("Solved!")
            break

    writer.close()
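# For reference, nn.CrossEntropyLoss applied to the network's logits is exactly the negative
# log-probability of the action that was actually taken, averaged over the elite steps
# (a minimal sketch with made-up numbers):
#
#   logits = torch.tensor([[2.0, -1.0]])
#   action = torch.tensor([0])
#   nn.CrossEntropyLoss()(logits, action)       # ~0.0486
#   -torch.log_softmax(logits, dim=1)[0, 0]     # same value
#
# The logged curves can be inspected with TensorBoard, e.g. `tensorboard --logdir runs`
# (tensorboardX writes to the runs/ directory by default).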
