[RL from Beginner to Giving Up] [Part 24]

1. Using dropout to reduce overfitting

Dropout randomly drops some of the connections in the network during training, so the model cannot rely too heavily on any single unit.

Pooling can be viewed in a similar spirit: it keeps only part of the activations, so fewer parameters are carried forward.
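As a quick aside (a minimal sketch, not part of the original example): in the TensorFlow 1.x API, tf.nn.dropout zeroes each element with probability 1 - keep_prob and scales the surviving elements by 1 / keep_prob, so the expected magnitude of the activations is unchanged.

# Minimal sketch of what tf.nn.dropout does (TensorFlow 1.x, toy values).
import tensorflow as tf

x = tf.ones([1, 10])                       # a row of ten 1.0 activations
dropped = tf.nn.dropout(x, keep_prob=0.5)  # keep each unit with prob 0.5, scale survivors by 1/0.5

with tf.Session() as sess:
    print(sess.run(dropped))               # roughly half the entries are 0.0, the rest are 2.0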

from __future__ import print_function

import numpy as np
import tensorflow as tf
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# keep probability: the fraction of units that dropout retains
with tf.name_scope("keep_prob"):
    keep_prob = tf.placeholder(tf.float32)

digits = load_digits()

x = digits.data
print(x.shape)


y = digits.target
print(y.shape)

# one-hot encode the labels and hold out 30% of the data for testing
y = LabelBinarizer().fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)


def add_layer(inputs, in_size, out_size, n_layer, activation=None):
    # one fully connected layer; dropout is applied to its pre-activation output
    layer_name = "layer%s" % n_layer
    with tf.name_scope("weights"):
        w = tf.Variable(tf.random_normal([in_size, out_size]))
        tf.summary.histogram(layer_name + "/weights", w)
    with tf.name_scope("bias"):
        b = tf.Variable(tf.zeros([1, out_size]) + 0.1)
        tf.summary.histogram(layer_name + "/bias", b)
    w_b = tf.matmul(inputs, w) + b
    w_b = tf.nn.dropout(w_b, keep_prob)  # drop units according to the keep_prob placeholder
    if activation is None:
        out_put = w_b
    else:
        out_put = activation(w_b)
    tf.summary.histogram(layer_name + "/output", out_put)
    return out_put


xs = tf.placeholder(tf.float32, [None, 64])   # 8x8 digit images flattened into 64 features
ys = tf.placeholder(tf.float32, [None, 10])   # one-hot labels for the 10 digit classes

l1 = add_layer(xs, 64, 20, 0, activation=tf.nn.tanh)
predict = add_layer(l1, 20, 10, 1, activation=tf.nn.softmax)


cross_entropy = tf.reduce_mean(-tf.reduce_sum(ys * tf.log(predict), reduction_indices=[1]))

optimizer = tf.train.GradientDescentOptimizer(0.1)
train = optimizer.minimize(cross_entropy)

sess = tf.Session()
merge = tf.summary.merge_all()

init = tf.global_variables_initializer()
sess.run(init)

for step in range(100):
    # train with dropout (keep only half of the units), but report the loss with dropout turned off
    sess.run(train, feed_dict={xs: x_train, ys: y_train, keep_prob: 0.5})
    print(sess.run(cross_entropy, feed_dict={xs: x_train, ys: y_train, keep_prob: 1}))
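The merged summaries above are never actually written anywhere. A minimal sketch of how to do that (the logs/ directory and the every-10-steps schedule are my own choices, not part of the original code) is to create a tf.summary.FileWriter for the train and the test split and record the merged summary periodically; comparing the two curves in TensorBoard is what shows whether dropout is curbing the overfitting.

# Sketch: log the merged summaries so the train/test curves can be compared in TensorBoard.
# Reuses sess, merge and the placeholders defined above; "logs/" is an arbitrary output path.
train_writer = tf.summary.FileWriter("logs/train", sess.graph)
test_writer = tf.summary.FileWriter("logs/test", sess.graph)

for step in range(100):
    sess.run(train, feed_dict={xs: x_train, ys: y_train, keep_prob: 0.5})
    if step % 10 == 0:
        # evaluate both splits with dropout disabled (keep_prob = 1)
        train_result = sess.run(merge, feed_dict={xs: x_train, ys: y_train, keep_prob: 1})
        test_result = sess.run(merge, feed_dict={xs: x_test, ys: y_test, keep_prob: 1})
        train_writer.add_summary(train_result, step)
        test_writer.add_summary(test_result, step)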

2. DQN

The environment itself is not covered here; only the core DQN algorithm is. The agent can be paired with a maze-style game as its environment (a minimal interaction loop is sketched after the class below).

class DQN:
    def __init__(self):
        # hyperparameters
        self.lr = 0.1              # learning rate
        self.greedy = 0.9          # probability of picking the greedy action
        self.actions = 4           # number of discrete actions
        self.features = 2          # dimensionality of a state observation
        self.memory_size = 2000    # replay-memory capacity
        self.batch_size = 32
        self.gamma = 0.9           # discount factor
        self.replace_inter = 100   # how often (in learn steps) to sync the target net
        # each memory row stores [s, action, reward, s_]
        self.memory = np.zeros((self.memory_size, self.features*2+2))
        print(self.memory.shape)
        
        self.build_net()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
        # the variables were added to named collections when they were created,
        # so they can be collected here to build the op that copies the eval-net
        # weights into the target net
        t_params = tf.get_collection("target_net_paras")
        e_params = tf.get_collection("eval_net_paras")
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        
    def add_layer(self, xs, paras_name):
        # two-layer MLP; every variable is put into the collection `paras_name`
        # so the eval net and the target net can be told apart later
        c_names = [paras_name, tf.GraphKeys.GLOBAL_VARIABLES]
        w1 = tf.Variable(tf.random_uniform([self.features, 10]), collections=c_names)
        b1 = tf.Variable(tf.zeros([10]) + 0.1, collections=c_names)
        y1 = tf.nn.relu(tf.matmul(xs, w1) + b1)

        w2 = tf.Variable(tf.random_uniform([10, self.actions]), collections=c_names)
        b2 = tf.Variable(tf.zeros([self.actions]) + 0.1, collections=c_names)
        y2 = tf.matmul(y1, w2) + b2

        return y2
    
    def build_net(self):
        # eval net: estimates the action values Q(s, a) for the current state
        self.s = tf.placeholder(tf.float32, [None, self.features])
        self.q_eval = self.add_layer(self.s, "eval_net_paras")

        # target net: provides Q(s', a') for the next state when building the TD target
        self.s_ = tf.placeholder(tf.float32, [None, self.features])
        self.q_next = self.add_layer(self.s_, "target_net_paras")

        self.q_target = tf.placeholder(tf.float32, [None, self.actions])
        self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train = optimizer.minimize(self.loss)
    
    def choose_action(self, observation):
        # the network expects a batch dimension, so reshape the state to (1, features)
        observation = observation[np.newaxis, :]
        action_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
        if np.random.uniform() < self.greedy:
            action = np.argmax(action_value)             # exploit: highest predicted Q value
        else:
            action = np.random.randint(0, self.actions)  # explore: random action
        return action

    def store_transition(self, s, action, reward, s_):
        if not hasattr(self, "memory_counter"):
            self.memory_counter = 0

        # each memory row is [s, action, reward, s_]
        transition = np.hstack((s, [action, reward], s_))
        index = self.memory_counter % self.memory_size  # overwrite the oldest row once full
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        if not hasattr(self, "learn_step"):
            self.learn_step = 0
        # periodically copy the eval-net weights into the target net
        if self.learn_step % self.replace_inter == 0:
            self.sess.run(self.replace_target_op)
        self.learn_step += 1

        # sample a minibatch of transitions from the replay memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.features:],  # next states -> target net
                self.s: batch_memory[:, :self.features]})   # current states -> eval net

        # TD target: only the entry of the action actually taken is updated,
        # using r + gamma * max_a' Q_target(s', a')
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size)
        eval_act_index = batch_memory[:, self.features].astype(int)
        reward = batch_memory[:, self.features + 1]
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        _, self.cost = self.sess.run(
            [self.train, self.loss],
            feed_dict={self.s: batch_memory[:, :self.features], self.q_target: q_target})
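A minimal sketch of the interaction loop that drives this agent is shown below. The env object (with reset() and step() returning the next state, the reward, and a done flag) is a hypothetical maze-style environment, not something defined in this post; the episode count and the learn-every-5-steps schedule are likewise arbitrary choices.

# Sketch of the environment loop; `env` is a hypothetical maze game with
# reset() -> state and step(action) -> (next_state, reward, done).
Rl = DQN()
total_steps = 0
for episode in range(300):
    observation = env.reset()
    while True:
        action = Rl.choose_action(observation)
        observation_, reward, done = env.step(action)
        Rl.store_transition(observation, action, reward, observation_)
        # start learning once some transitions have been collected, then learn every 5 steps
        if total_steps > 200 and total_steps % 5 == 0:
            Rl.learn()
        observation = observation_
        total_steps += 1
        if done:
            break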

Saving and restoring parameters:

 

Save the parameters once the whole training run has finished:

saver = tf.train.Saver()
save_path = saver.save(Rl.sess, "my_net/save_net.ckpt")  # Rl is the trained DQN instance

 

Restoring the parameters has to happen after build_net has created the graph (i.e., at construction time):

saver = tf.train.Saver()

saver.restore(self.sess, "my_net/save_net.ckpt")

In that case there is no need to run the learn process again.
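A sketch of how this fits together in a fresh run: construct the agent to rebuild the graph, let the Saver overwrite the freshly initialized variables with the checkpoint (whether that is done inside __init__ with self.sess or outside as below, the effect is the same), and then call choose_action directly. The two-element dummy state at the end is just for illustration.

# Sketch: restore previously saved weights into a new DQN instance (fresh run).
Rl = DQN()                     # builds the graph, creates Rl.sess and initializes variables
saver = tf.train.Saver()
saver.restore(Rl.sess, "my_net/save_net.ckpt")  # overwrite the initialized variables

# the restored agent can act right away; no further calls to learn() are needed
action = Rl.choose_action(np.array([0.0, 0.0]))  # dummy 2-feature state, for illustration only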

 
