# coding=utf-8

import sys
import os
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import Callback
import matplotlib.pyplot as plt


class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))


class DQNAgent:
    def __init__(self, state_size, action_size,
                 render=False, load_model=False,
                 gamma=0.99, learning_rate=0.001,
                 epsilon=1.0, epsilon_decay=0.999,
                 epsilon_min=0.01, batch_size=64,
                 train_start=100, memory_size=2000,
                 ):
        # Sizes of the environment's state and action spaces
        self.state_size = state_size
        self.action_size = action_size

        # render=True enables gym's animation window, which slows training down considerably
        self.render = render
        # load_model=True loads the model weights from file
        self.load_model = load_model

        # DQN hyperparameters
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay  # decay factor applied to epsilon after every stored step
        self.epsilon_min = epsilon_min      # epsilon stops decaying once it reaches this floor

        self.train_start = train_start
        self.batch_size = batch_size

        # replay memory
        self.memory = deque(maxlen=memory_size)

        # build the Q-network
        self.model = self.build_model()

        # record training losses
        self.history = LossHistory()
        self.losses_list = []

    def build_model(self, units=128):
        model = Sequential()
        model.add(Dense(units, input_dim=self.state_size,
                        activation='sigmoid', kernel_initializer='he_uniform'))
        model.add(Dense(units, activation='sigmoid',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()

        model.compile(loss='mean_squared_error', optimizer=Adam(lr=self.learning_rate))
        return model

    def choose_action(self, state):
        # epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def add_memory(self, state, action, reward, done, next_state):
        self.memory.append((state, action, reward, done, next_state))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            done.append(mini_batch[i][3])
            update_target[i] = mini_batch[i][4]

        target = self.model.predict(update_input, batch_size=batch_size)
        target_val = self.model.predict(update_target, batch_size=batch_size)

        # Bellman update: terminal transitions keep the raw reward,
        # otherwise bootstrap from the max Q-value of the next state
        for i in range(batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.gamma * np.amax(target_val[i])

        self.model.fit(update_input, target, batch_size=batch_size, epochs=1, verbose=0, callbacks=[self.history])
        self.losses_list.append(self.history.losses[0])


def draw_score_plot(scores, filename='graph.png'):
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.set_title('mean score')
    ax1.plot(range(len(scores)), scores, color='blue')
    plt.savefig(filename)


def draw_plot(scores, losses, filename='graph.png'):
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.set_title('mean score')
    ax1.plot(range(len(scores)), scores, color='blue')

    ax2 = fig.add_subplot(1, 2, 2)
    ax2.set_title('mean loss-reward')
    ax2.plot(range(len(losses)), losses, color='blue')
    plt.savefig(filename)
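

# A minimal usage sketch (not part of the original file): it assumes the
# CartPole-v1 environment and the classic gym API (reset() returning the state,
# step() returning a 4-tuple), and shows how the pieces above are typically
# wired together: step the environment, store each transition with add_memory,
# and call train_model every step. The episode budget and output filename below
# are illustrative assumptions, not values taken from the original code.
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    scores = []
    for episode in range(300):  # assumed episode budget
        state = env.reset()
        state = np.reshape(state, (1, state_size))
        score = 0
        done = False
        while not done:
            if agent.render:
                env.render()
            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, (1, state_size))

            # store the transition and train on a sampled mini-batch
            agent.add_memory(state, action, reward, done, next_state)
            agent.train_model()

            state = next_state
            score += reward
        scores.append(score)

    draw_score_plot(scores, filename='cartpole_scores.png')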