Model Description
MDP model:
- Only states, actions, and rewards are considered.
- The state-transition function and the discount factor are not considered (see the note below).
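Note: because the transition function and discount factor are dropped (gamma = 0 in the code), the standard DQN target

    y = r + gamma * max_a' Q(s', a')

reduces to y = r. The network therefore simply regresses Q(s, a) onto the immediate reward, which is exactly the smooth-L1 loss computed in update() below.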
Model Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/17 14:56
# @Author : Liu Lihao
# @File : DQN.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import collections
from torch import optim
import numpy as np

'''
Hyperparameter settings
'''
learning_rate = 0.001   # 0.0001
N_episode = 100         # number of training iterations per update
train_threshold = 2000
batch_size = 20
buffer_limit = 50000    # maximum capacity of the replay buffer
gamma = 0               # discount applied when folding the predicted maximum action value of the next state into the reward

'''
Build the Q network
A fully connected neural network is used.
The policy is epsilon-greedy: with probability epsilon a random action is chosen, and with probability 1-epsilon the greedy action is chosen.
'''
class Qnet(nn.Module):
    def __init__(self, state_dim, action_num):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, action_num)
        self.action_num = action_num

    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        x = self.fc4(x)
        return x

    # Select an action
    # in: state
    # out: action
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()  # random float in [0, 1)
        if coin < epsilon:
            # return a random action
            return random.randint(0, self.action_num - 1)
        else:
            # return the action with the largest Q value
            return out.argmax().item()
'''
Define the experience replay buffer
The replay buffer is essentially a queue: when it is full, the oldest experiences are discarded and newly sampled ones are appended.
When sampling, batch_size experiences are drawn at random from the buffer and returned as a transition batch for training.
'''
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)  # double-ended queue

    # insert into the replay buffer
    def put(self, transition):
        self.buffer.append(transition)

    # sample from the replay buffer
    '''
    shapes:
        s_lst: [batch_size, input_size]
        a_lst: [batch_size, 1]
        r_lst: [batch_size, 1]
    '''
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)  # draw n samples at random
        s_lst, a_lst, r_lst = [], [], []
        for transition in mini_batch:
            s, a, r = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
        return torch.tensor(s_lst, dtype=torch.float), \
               torch.tensor(a_lst), \
               torch.tensor(r_lst, dtype=torch.float)

    # return the current buffer length
    def size(self):
        return len(self.buffer)
'''
Define the agent
Parameters:
    state_dim: dimension of the state
    action_num: number of available actions
    save_path: path where the model is saved
'''
class DQNModel:
    def __init__(self, state_dim, action_num, save_path):
        self.q = Qnet(state_dim, action_num)
        self.memory = ReplayBuffer()
        self.optimizer = optim.Adam(self.q.parameters(), lr=learning_rate)
        self.save_path = save_path

    '''Forward pass of the network + action selection'''
    def get_policy(self, s, epsilon):
        # load parameters from a previously trained model
        try:
            checkpoint = torch.load(self.save_path)
            self.q.load_state_dict(checkpoint['q_state_dict'])
            # print("Model loaded successfully...")
        except Exception as e:
            print(e)
            # print("Model parameters initialized randomly...")
        # choose an action according to the current state
        action = self.q.sample_action(torch.Tensor(s).float(), epsilon)
        return action

    '''Store an experience sample'''
    def store_memory(self, state, action, reward):
        self.memory.put((state, action, reward))

    '''
    Train the Q network:
    run N_episode update iterations,
    each drawing batch_size experience samples.
    '''
    def update(self):
        LOSS = []
        if self.memory.size() > batch_size:
            for i in range(N_episode):
                s, a, r = self.memory.sample(batch_size)
                q_out = self.q(s)                 # feed the states s through the network
                q_a = q_out.gather(1, a)          # Q values of the actions a taken in states s
                loss = F.smooth_l1_loss(q_a, r)   # compute the loss against the immediate rewards
                LOSS.append(loss.item())
                '''
                shapes:
                    q_out: [batch_size, action_number]
                    q_a:   [batch_size, 1]
                    loss:  []
                '''
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            # save the network and optimizer parameters
            torch.save(
                {
                    'q_state_dict': self.q.state_dict(),
                    'optim_state_dict': self.optimizer.state_dict()
                }, self.save_path)
        # mean loss over the update iterations (0.0 if no update was performed)
        return np.mean(LOSS) if LOSS else 0.0
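As a rough usage sketch (not part of the original code): the loop below drives DQNModel against a toy stand-in environment. The DummyEnv class, the epsilon schedule, the dimensions, and the checkpoint path are all assumptions made for illustration; any environment exposing reset() and step(action) that return a state vector and a reward would fit the (state, action, reward) transitions stored above.
if __name__ == '__main__':
    class DummyEnv:
        '''Toy stand-in environment (illustrative assumption): random 4-dimensional states,
        reward 1 when the action matches the sign of the first state component, else 0.'''
        def reset(self):
            self.state = np.random.randn(4).astype(np.float32)
            return self.state
        def step(self, action):
            reward = 1.0 if (self.state[0] > 0) == (action == 1) else 0.0
            self.state = np.random.randn(4).astype(np.float32)
            return self.state, reward

    env = DummyEnv()
    agent = DQNModel(state_dim=4, action_num=2, save_path='dqn_checkpoint.pth')  # assumed path
    for episode in range(10):
        epsilon = max(0.01, 0.5 - 0.05 * episode)        # assumed exploration schedule
        state = env.reset()
        for _ in range(200):
            action = agent.get_policy(state, epsilon)    # epsilon-greedy action from the Q network
            next_state, reward = env.step(action)
            agent.store_memory(state, action, reward)    # buffer stores (s, a, r) only, matching the MDP above
            state = next_state
        mean_loss = agent.update()                       # N_episode gradient steps on sampled minibatches
        print('episode %d: mean loss %.4f' % (episode, mean_loss))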