Solving the Knapsack Problem with DQN (PyTorch version)

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import collections
import random
import matplotlib.pyplot as plt
from torch import optim

class DQN(nn.Module):
    def __init__(self, n_actions, n_inputs, lr=0.01):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(n_inputs, 64)  # input size = state dimension (8 for this environment)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, n_actions)  # output: one Q-value per action for the given state

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = 'cpu'
        self.loss = nn.MSELoss()
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)
        return actions

class Agent:
    def __init__(self, env, n_actions, state_n_dim, gamma=0.99, epsilon=1):
        self.env = env
        self.n_actions = n_actions
        self.state_n_dim = state_n_dim
        self.gamma = gamma  # discount factor for future rewards
        self.epsilon = epsilon  # exploration rate of the epsilon-greedy policy
        self.eps_min = 0.05  # epsilon decays over time, but never below eps_min
        self.eps_dec = 1e-4  # epsilon decreases by 1e-4 per step
        self.iter_count = 0  # number of learning updates performed
        self.policy_net = DQN(self.n_actions, self.state_n_dim)  # network being trained
        self.target_net = DQN(self.n_actions, self.state_n_dim)  # target network
        self.target_net.load_state_dict(self.policy_net.state_dict())  # start the target network with the policy weights
        self.replay_memory = collections.deque(maxlen=10000)  # replay buffer
        self.min_replay_memory_size = 100  # intended minimum buffer size before learning (train() below only checks batch_size)
        self.batch_size = 64  # number of transitions sampled from the buffer per update
        self.update_target = 10  # sync the target network every 10 updates
        self.scores = []  # score of each episode

    def update_replay_memory(self, obs):
        self.replay_memory.append(obs)

    def choose_action(self, state, score):
        action_selected = set(np.where(state == 1)[0])  # set of actions (items) already chosen
        if np.random.random() > self.epsilon and score >= 0:
            q = self.policy_net(torch.tensor([state], dtype=torch.float32))
            print('raw q-values: ', q)  # tensor of shape (1, n_actions), with grad_fn attached
            # qq = q.detach().numpy()[0]  # attempt to mask already-selected actions; ran into trouble modifying the tensor
            # for index in action_selected:
            #     qq[index] = -10000.00
            # action = qq.argmax()  # index of the largest masked Q-value
            action = torch.argmax(q).item()
            print('greedy action (argmax of the network output):', action)
        else:
            action_list = list(set(self.env.action_space).difference(action_selected))  # actions still available
            action = np.random.choice(action_list)  # sample one of them uniformly
            print('randomly sampled action:', action)
        return action

    def train(self):
        len_replay_memory = len(self.replay_memory)
        if len_replay_memory < self.batch_size:
            print('not enough transitions in the replay buffer yet, currently:', len_replay_memory)
            return
        self.policy_net.optimizer.zero_grad()  # clear the gradient buffers
        batch = random.sample(self.replay_memory, self.batch_size)  # random sample: [(state, action, reward, next_state, done), ...]
        states, actions, rewards, next_states, dones = [trans[0] for trans in batch], [trans[1] for trans in batch], [
            trans[2] for trans in batch], [trans[3] for trans in batch], [trans[4] for trans in batch]
        state_batch = torch.tensor(states, dtype=torch.float32)  # e.g. tensor([[0,0,...,0],[1,0,...,0],...])
        next_state_batch = torch.tensor(next_states, dtype=torch.float32)
        action_batch = torch.tensor(actions).numpy()  # e.g. [1 4 0 2 4 ...], used as an index below
        reward_batch = torch.tensor(rewards)
        done_batch = torch.tensor(dones)  # e.g. tensor([False, False, ..., True, ...])
        batch_index = np.arange(self.batch_size, dtype=np.int32)  # [0 1 2 3 ...]

        pred_list = self.policy_net(state_batch)[batch_index, action_batch]  # predicted Q(s, a) for the actions actually taken
        next_action_list = self.target_net(next_state_batch)  # Q-values of the next states, one row per transition
        next_action_list[done_batch] = 0.0  # terminal transitions contribute no future value, so y = r there
        new_q = reward_batch + self.gamma * torch.max(next_action_list, dim=1)[0]  # best next-state action value
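        # The target computed above is the standard DQN rule:
        #   y = r                                   if the transition is terminal
        #   y = r + gamma * max_a Q_target(s', a)   otherwise
        # The loss below fits Q_policy(s, a) to this target with MSE.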

        loss = self.policy_net.loss(pred_list, new_q.detach())  # detach the target so gradients flow only through pred_list
        loss.backward()  # backpropagate the gradients
        self.policy_net.optimizer.step()  # apply the gradient step, updating the policy network
        self.iter_count += 1
        if not self.iter_count % self.update_target:  # sync the target network every update_target updates
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def step(self):  # run one full episode
        done = False
        state = self.env.reset()  # e.g. [0 0 0 0 0 0 0 0]
        print('episode started, initial state:', state)
        episode_reward = 0
        reward = 0
        while not done:
            action = self.choose_action(state, reward)  # index of the chosen item, e.g. 0
            next_state, reward, done, repet = self.env.step(action)  # e.g. [1,0,0,0,0,0,0,0] 3 False
            print('environment step -> next_state:', next_state, 'reward:', reward, 'done:', done)
            if reward > 0:
                episode_reward = reward  # keep the latest positive total value as the episode score
            # if repet == 0:  # a transition with repet == 1 is invalid and could be skipped here
            self.update_replay_memory((state, action, reward, next_state, done))  # store the transition in the replay buffer
            self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
            print('step finished, epsilon is now:', self.epsilon)
            state = next_state
        print('episode finished, reward:', episode_reward)
        self.scores.append(episode_reward)

    def save_model(self):
        torch.save(self.policy_net.state_dict(), 'policy_net.pth')  # save the model parameters

if __name__ == '__main__':
    round_count = 3  # run 3 rounds and average the results
    round_all_score = 0
    env = gym.make('MyEnv-v0')
    n_actions = env.n_actions  # number of selectable actions (8 items)
    state_n_dims = env.state_dim  # dimension of the state vector (8)
    for i in range(round_count):
        agent = Agent(env, n_actions, state_n_dims)
        episodes = 2000  # episodes per round
        for episode in range(episodes):
            agent.step()  # run one episode, pushing each step's transition into the replay buffer
            agent.train()  # learn from transitions sampled from the replay buffer
            print('Episode: ', episode, '| reward: ', agent.scores[episode])
        avg_score = np.mean(agent.scores)  # average score over the 2000 episodes of this round
        print('Round: ', i, '| Average score: ', int(avg_score))
        round_all_score += avg_score
        agent.env.close()
    fig = plt.figure(figsize=(7, 7))  # figsize sets the figure size
    plt.plot(range(episodes), agent.scores, 'g-')  # scores of the last round
    plt.xlabel(u'iters')
    plt.ylabel(u'scores')
    plt.show()
    print('ran', round_count, 'rounds, the average score is:', int(round_all_score / round_count))
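In choose_action above, the greedy branch takes the argmax over all actions, including items that are already packed (the masking attempt is left commented out). A minimal sketch of how that mask could be applied on a detached copy of the Q-values, assuming the same q tensor and action_selected set as in the method (an illustration, not the author's final code):

def masked_greedy_action(q, action_selected):
    # work on a detached numpy copy so the original tensor and its graph stay untouched
    q_values = q.detach().numpy()[0].copy()
    for index in action_selected:
        q_values[index] = -1e9  # already-packed items can no longer be chosen
    return int(q_values.argmax())

With such a helper, the greedy branch would simply call action = masked_greedy_action(q, action_selected).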

Environment code:

import gym
import numpy as np
from numpy import random
import time

class MyEnv(gym.Env):
    def __init__(self):
        self.viewer = None
        # state space and rewards
        # self.files = [(1,1),(2,6),(5,18),(6,22),(7,28)]  # alternative (weight, value) list
        self.files = [(2,3),(3,4),(4,5),(5,6),(4,3),(7,12),(3,3),(2,2)]  # (weight, value) of each file
        self.min_weight = 2  # smallest file weight
        self.limit_weight = 8  # maximum gateway capacity (knapsack limit)
        self.state_dim = len(self.files)  # number of files = length of the binary state vector
        # action space
        self.action_space = np.arange(len(self.files))  # [0, 1, ..., 7], one action per file
        self.n_actions = self.state_dim  # number of selectable actions
        self.repet = 0  # flag marking the re-selection of an already-packed file

    def step(self, action):
        if self.state[action] == 1:  # the chosen file has already been packed
            weight_sum = 0  # total weight of the files currently packed
            for i in range(self.state_dim):
                weight_sum += self.state[i] * self.files[i][0]
            if weight_sum > self.limit_weight:  # over capacity
                is_terminal = True
                r = -30
                # self.repet = 1  # would mark this transition as unusable for learning
            else:
                is_terminal = False
                r = -30  # penalty for picking a file that is already packed
            next_state = self.state
        else:
            # current system state
            state = self.state
            state[action] = 1  # pack the chosen file, e.g. [1,0,0,0,0,0,0,0]
            weight_sum = 0
            for i in range(self.state_dim):
                weight_sum += self.state[i] * self.files[i][0]
            if weight_sum > self.limit_weight:  # over capacity
                is_terminal = True
                r = -30
            else:
                is_terminal = False
                r = 0
                for i in range(self.state_dim):
                    r += self.state[i] * self.files[i][1]  # reward = total value of the packed files
            next_state = state
            self.state = next_state
        if weight_sum + self.min_weight > self.limit_weight:  # no remaining file can fit
            is_terminal = True

        return next_state, r, is_terminal, self.repet
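    # Worked example with the file list above (limit_weight = 8, min_weight = 2):
    # packing file 0 (weight 2, value 3) and then file 3 (weight 5, value 6) gives
    # weight_sum = 7 <= 8, so the second step returns r = 3 + 6 = 9; since
    # 7 + min_weight > 8 no further file can fit and is_terminal becomes True.
    # Picking a file that is already packed is penalized with r = -30.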

    def reset(self):
        self.state = np.array([0,0,0,0,0,0,0,0])  # empty gateway: no file selected yet
        return self.state

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None

    def render(self, mode="human"):  # only used by the commented-out test driver below
        from gym.envs.classic_control import rendering
        width = 60
        height = 40
        edge_x = 0
        edge_y = 0
        if self.viewer is None:
            self.viewer = rendering.Viewer(300, 200)

        # bottom right: walls drawn in black
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(
            rendering.Transform((edge_x + width * 2, edge_y + height * 1)))
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(
            rendering.Transform((edge_x + width * 3, edge_y + height * 1)))
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(
            rendering.Transform((edge_x + width * 4, edge_y + height * 1)))
        # left side
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(rendering.Transform((edge_x, edge_y + height * 3)))
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(
            rendering.Transform((edge_x + width * 1, edge_y + height * 3)))
        # top side
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(
            rendering.Transform((edge_x + width * 3, edge_y + height * 4)))
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(0, 0, 0)).add_attr(
            rendering.Transform((edge_x + width * 3, edge_y + height * 5)))
        # exit, drawn in yellow
        self.viewer.draw_polygon([(0, 0), (0, height), (width, height), (width, 0)], filled=True,
                                 color=(1, 0.9, 0)).add_attr(
            rendering.Transform((edge_x + width * 4, edge_y + height * 3)))
        # draw the grid
        for i in range(1, 7):
            self.viewer.draw_line((edge_x, edge_y + height * i), (edge_x + 5 * width, edge_y + height * i))  # horizontal lines
            self.viewer.draw_line((edge_x + width * (i - 1), edge_y + height),
                                  (edge_x + width * (i - 1), edge_y + height * 6))  # vertical lines

        # pixel positions of the agent
        self.x = [edge_x + width * 0.5, edge_x + width * 1.5, edge_x + width * 2.5, 0, edge_x + width * 4.5,
                  edge_x + width * 0.5, edge_x + width * 1.5, edge_x + width * 2.5, 0, edge_x + width * 4.5,
                  0, 0, edge_x + width * 2.5, edge_x + width * 3.5, edge_x + width * 4.5,
                  edge_x + width * 0.5, edge_x + width * 1.5, edge_x + width * 2.5, edge_x + width * 3.5,
                  edge_x + width * 4.5,
                  edge_x + width * 0.5, edge_x + width * 1.5, 0, 0, 0]

        self.y = [edge_y + height * 5.5, edge_y + height * 5.5, edge_y + height * 5.5, 0, edge_y + height * 5.5,
                  edge_y + height * 4.5, edge_y + height * 4.5, edge_y + height * 4.5, 0, edge_y + height * 4.5,
                  0, 0, edge_y + height * 3.5, edge_y + height * 3.5, edge_y + height * 3.5,
                  edge_y + height * 2.5, edge_y + height * 2.5, edge_y + height * 2.5, edge_y + height * 2.5,
                  edge_y + height * 2.5,
                  edge_y + height * 1.5, edge_y + height * 1.5, 0, 0, 0]
        # draw the agent as a circle
        # self.viewer.draw_circle(18,color=(0.8,0.6,0.4)).add_attr(rendering.Transform(translation=(edge_x+width/2,edge_y+height*1.5)))
        self.viewer.draw_circle(18, color=(0.8, 0.6, 0.4)).add_attr(
            rendering.Transform(translation=(self.x[self.state - 1], self.y[self.state - 1])))

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

# env = gym.make('MyEnv-v0')
# state = env.reset()
# print('initial state: ', state)
# n_act = env.action_space
# print('action space:', n_act)
# reward = 0
# while True:
#         action = env.action_space[int(random.random() * len(env.action_space))]
#         next_state, r, is_terminal, info = env.step(action)
#         env.render()
#         reward += r
#         if is_terminal == True:
#             print("episode finished, reward:", reward)
#             time.sleep(18)
#             break
#         time.sleep(1)

For how to build your own gym environment, see my previous article.
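Since the training script calls gym.make('MyEnv-v0'), the environment also has to be registered with gym before it can be created. A minimal sketch of the usual registration call, where the module path in entry_point is an assumption and must match wherever the MyEnv class actually lives:

from gym.envs.registration import register

register(
    id='MyEnv-v0',               # the id passed to gym.make() above
    entry_point='my_env:MyEnv',  # assumed module path: a my_env.py file containing the MyEnv class
)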
Results:
(figure: episode scores over training)
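As a reference point when reading the score curve, the exact optimum for this particular instance can be computed with a standard 0/1 knapsack dynamic program; a small standalone sketch, independent of the training code:

def knapsack_optimum(files, capacity):
    # classic 0/1 knapsack DP over (weight, value) pairs
    best = [0] * (capacity + 1)
    for weight, value in files:
        for c in range(capacity, weight - 1, -1):
            best[c] = max(best[c], best[c - weight] + value)
    return best[capacity]

print(knapsack_optimum([(2,3),(3,4),(4,5),(5,6),(4,3),(7,12),(3,3),(2,2)], 8))  # 12, from packing the (7,12) file alone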
The results still leave room for improvement; suggestions for further improvements are welcome.
