参考文章
- <https://www.cnblogs.com/cjnmy36723/p/7018860.html>
- <https://www.pythonheidong.com/blog/article/363261/59ae746d690b1ffb13c0/>
- <https://blog.csdn.net/weixin_40759186/article/details/87524192>
感谢老师们的文章。
很多文章使用 gym 来实现强化学习算法,这里使用的是自己创建的简单环境,如图:
代码比较简单,直接上代码吧:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque
# Reward table, indexed as r[state][action]; moving with action `a` lands the
# agent in room `a` (see Dqn.step). Because one entry is written as 100.0 the
# whole array is float64.
# NOTE(review): r[4][3] is 1 while the other non-negative, non-goal entries
# are 0 -- confirm this is intentional and not a typo.
r = np.array(
    [
        [-1, -1, -1, -1, 0, -1],
        [-1, -1, -1, 0, -1, 100.0],
        [-1, -1, -1, 0, -1, -1],
        [-1, 0, 0, -1, 0, -1],
        [0, -1, -1, 1, -1, 100],
        [-1, 0, -1, -1, 0, 100],
    ]
)

# Number of states.
state_num = 6

# Number of actions.
action_num = 6

# Size of the mini-batch sampled from the replay memory for each update.
BATCH = 20

# Lower bound for epsilon (the random-action probability).
FINAL_EPSILON = 0.0001

# Starting value of epsilon; epsilon is meant to shrink gradually from here.
INITIAL_EPSILON = 0.1

# Total number of steps over which epsilon is decayed.
EXPLORE = 3000000.0

# Exploration counter (translated from the original comment).
epsilon = 0

# Running count of training steps.
learn_step_counter = 0

# Optimizer learning rate.
learning_rate = 0.001

# Discount factor (gamma) applied to future rewards.
gamma = 0.9

# Maximum number of transitions kept in the replay memory.
MEMORY_CAPACITY = 5000

# Replay memory: transitions the agent has already experienced.
replay_memory_store = deque()
class net(nn.Module):
    """Two-layer fully connected network mapping a state vector to action values.

    Layout: ``Linear(s_dim, 30) -> ReLU -> Linear(30, a_dim)``. The weights of
    both linear layers are re-drawn from a normal distribution with mean 0 and
    standard deviation 0.1; biases keep PyTorch's default initialization.
    """

    def __init__(self, s_dim, a_dim):
        """Build the layers.

        Args:
            s_dim: size of the input (state) vector.
            a_dim: number of outputs, one value per action.
        """
        super().__init__()
        hidden_units = 30
        # Layers are created and initialized in this order so the random
        # number stream is consumed exactly as before.
        self.fc1 = nn.Linear(s_dim, hidden_units)
        self.fc1.weight.data.normal_(mean=0, std=0.1)
        self.out = nn.Linear(hidden_units, a_dim)
        self.out.weight.data.normal_(mean=0, std=0.1)

    def forward(self, x):
        """Return one value per action for every row of ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)
class Dqn(object):
    """Small DQN agent for the six-room path-finding toy environment.

    States and actions are both room indices: taking action ``a`` moves the
    agent into room ``a`` and yields the reward ``r[state][a]`` from the
    module-level reward table. Room 5 is the goal.

    The agent keeps an online network (``eval_net``) that is trained and a
    target network (``target_net``) that is refreshed from it every
    ``TARGET_SYNC_INTERVAL`` learning steps.
    """

    # Goal room; an episode ends as soon as action 5 is taken (see ``step``).
    GOAL_STATE = 5
    # Number of ``learn`` calls between two target-network refreshes.
    TARGET_SYNC_INTERVAL = 100
    # ``train`` starts calling ``learn`` once more than this many transitions
    # have been stored.
    LEARN_START = 2000
    # ``train`` stops once ``step_index`` exceeds this value.
    MAX_TRAIN_STEPS = 10000

    def __init__(self, s_dim, a_dim):
        """Create the networks, optimizer and bookkeeping state.

        Args:
            s_dim: number of states (length of the one-hot state vector).
            a_dim: number of actions (number of network outputs).
        """
        self.eval_net = net(s_dim, a_dim)
        self.target_net = net(s_dim, a_dim)
        self.learn_step_counter = learn_step_counter
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
        self.loss_func = nn.MSELoss()
        self.gamma = gamma  # discount factor used in the TD target
        self.step_index = 0
        self.batch_size = BATCH  # mini-batch size
        self.memory_counter = 0  # total number of transitions stored so far
        # Not read anywhere in this class; kept so existing attribute access
        # keeps working.
        self.memory = np.zeros((MEMORY_CAPACITY, 5))
        # NOTE: this is the module-level deque, so it is shared by every Dqn
        # instance created in the same process.
        self.replay_memory_store = replay_memory_store
        self.memory_size = MEMORY_CAPACITY  # replay memory capacity
        # One-hot encodings: row i is the vector for state / action i.
        self.state_list = np.identity(s_dim)
        self.action_list = np.identity(a_dim)
        self.INITIAL_EPSILON = INITIAL_EPSILON
        self.FINAL_EPSILON = FINAL_EPSILON
        self.EXPLORE = EXPLORE
        # Number of steps to observe before epsilon starts to decay.
        self.OBSERVE = 1000.
        # Exploration rate. It is initialized once here so that the decay in
        # ``choose_action`` accumulates instead of being reset on every call.
        self.epsilon = self.INITIAL_EPSILON

    def choose_action(self, state_index):
        """Pick an action for ``state_index`` with an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action is drawn
        from *all* actions; otherwise the action with the highest predicted
        value is returned. After ``OBSERVE`` steps epsilon is decreased
        linearly while it is still above ``FINAL_EPSILON``.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.uniform() < self.epsilon:
            # randint's upper bound is exclusive, so pass the action count to
            # make the goal action reachable by exploration as well.
            action_index = int(np.random.randint(0, len(self.action_list)))
        else:
            state_vector = torch.FloatTensor(self.state_list[state_index:state_index + 1])
            with torch.no_grad():
                action_value = self.eval_net(state_vector).numpy()
            action_index = int(np.argmax(action_value))

        # Once training has started, shrink epsilon until it reaches the floor.
        if self.step_index > self.OBSERVE and self.epsilon > self.FINAL_EPSILON:
            self.epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE
        return action_index

    def store(self, current_state_index, current_action_index, current_reward, next_state_index, done):
        """Append one transition to the replay memory.

        States and the action are stored as ``(1, n)`` one-hot rows. When the
        memory grows past ``memory_size`` the oldest transition is dropped.
        """
        current_state = self.state_list[current_state_index:current_state_index + 1]
        current_action = self.action_list[current_action_index:current_action_index + 1]
        next_state = self.state_list[next_state_index:next_state_index + 1]
        self.replay_memory_store.append(
            (current_state, current_action, current_reward, next_state, done)
        )
        # Evict the oldest memory once capacity is exceeded.
        if len(self.replay_memory_store) > self.memory_size:
            self.replay_memory_store.popleft()
        self.memory_counter += 1

    def train(self):
        """Interact with the environment and learn until the step budget is spent.

        Episodes start in a random non-goal room. ``learn`` is called on every
        step once more than ``LEARN_START`` transitions have been stored, and
        the loop ends when ``step_index`` exceeds ``MAX_TRAIN_STEPS``.
        """
        current_state = np.random.randint(0, self.GOAL_STATE)
        while True:
            action = self.choose_action(current_state)
            next_state, reward, done = self.step(current_state, action)
            self.store(current_state, action, reward, next_state, done)
            # Only start learning after enough transitions were observed.
            if self.memory_counter > self.LEARN_START:
                self.learn()
            if self.step_index > self.MAX_TRAIN_STEPS:
                break
            if done:
                current_state = np.random.randint(0, self.GOAL_STATE)
            else:
                current_state = next_state
            self.step_index += 1

    def learn(self):
        """Run one gradient step on a mini-batch sampled from the replay memory.

        Target rule (unchanged from the original implementation): transitions
        whose reward is ``<= -1`` use the reward itself as target; all others
        use ``reward + gamma * max_a Q_target(next_state, a)``. The stored
        ``done`` flag is not consulted, so moves into the goal room still
        bootstrap from the goal room's values.
        """
        sample_size = min(self.batch_size, len(self.replay_memory_store))
        if sample_size == 0:
            return

        # Refresh the target network periodically; the counter has to be
        # incremented for the interval to have any effect.
        if self.learn_step_counter % self.TARGET_SYNC_INTERVAL == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        transitions = random.sample(self.replay_memory_store, sample_size)
        states, actions, rewards, next_states, _dones = zip(*transitions)

        batch_state = torch.as_tensor(np.vstack(states), dtype=torch.float32)
        batch_next_state = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
        # Actions are stored one-hot; recover the index of the 1 in each row.
        action_index = torch.as_tensor(
            np.vstack(actions).argmax(axis=1), dtype=torch.long
        ).unsqueeze(1)
        # Keep rewards as floats so fractional rewards are not truncated.
        batch_reward = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32)
        ).unsqueeze(1)

        q_eval = self.eval_net(batch_state).gather(1, action_index)  # Q estimate
        with torch.no_grad():
            # Best next-state value according to the target network.
            q_next = self.target_net(batch_next_state).max(dim=1, keepdim=True)[0]
        q_target = torch.where(
            batch_reward <= -1,
            batch_reward,
            batch_reward + self.gamma * q_next,
        )

        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def step(self, state, action):
        """Apply ``action`` in ``state``.

        Returns:
            ``(next_state, reward, done)`` where ``next_state`` is the action
            itself, ``reward`` is ``r[state][action]`` and ``done`` is True
            exactly when the action is the goal room.
        """
        reward = r[state][action]
        next_state = action
        done = bool(action == self.GOAL_STATE)
        return next_state, reward, done

    def pay(self, max_steps=100):
        """Train the agent, then print a greedy walk from every non-goal room.

        Args:
            max_steps: upper bound on the length of each printed walk, so a
                policy that cycles between rooms cannot loop forever.
        """
        self.train()
        target_state = self.GOAL_STATE
        for start_room in range(target_state):
            print("#############################", "Agent 在", start_room, "开始行动", "#############################")
            current_state = start_room
            step = 0
            while current_state != target_state and step < max_steps:
                state_vector = torch.FloatTensor(self.state_list[current_state:current_state + 1])
                with torch.no_grad():
                    out_result = self.eval_net(state_vector).numpy()
                next_state = int(np.argmax(out_result[0]))
                print(out_result[0])
                print("Agent 由", current_state, "号房间移动到了", next_state, "号房间")
                current_state = next_state
                step += 1
            if current_state == target_state:
                print("Agent 在", start_room, "号房间开始移动了", step, "步到达了目标房间 5")
            else:
                # Step limit reached without arriving at the goal.
                print("Agent 在", start_room, "号房间开始移动了", step, "步后仍未到达目标房间 5")
            print("#############################", "Agent 在", current_state, "结束行动", "#############################")
if __name__ == "__main__":
    # Script entry point: build an agent for the 6-state / 6-action room
    # environment, then train it and print the learned routes.
    s_dim, a_dim = 6, 6
    dqn = Dqn(s_dim=s_dim, a_dim=a_dim)
    dqn.pay()
一篇菜鸡写的博客,有错误还请大家帮忙指出,谢谢。