Code study: When Learning Joins Edge: Real-time Proportional Computation Offloading via Deep Reinforcement Learning

Study notes on the paper.
Original paper: https://ieeexplore.ieee.org/document/8975787/
The implementation consists of two files: the main script run_this.py and the DQN reinforcement-learning module RL_brain.py.

run_this.py

from RL_brain import DeepQNetwork
import numpy as np
np.random.seed(6)

actions=np.array([[0,0],[0,0.1],[0,0.2],[0,0.3],[0,0.4],[0,0.5],[0,0.6],[0,0.7],[0,0.8],[0,0.9],[0,1],
                  [1, 0], [1, 0.1], [1, 0.2], [1, 0.3], [1, 0.4], [1, 0.5], [1, 0.6], [1, 0.7], [1, 0.8], [1, 0.9],
                  [1, 1],
                  [2, 0], [2, 0.1], [2, 0.2], [2, 0.3], [2, 0.4], [2, 0.5], [2, 0.6], [2, 0.7], [2, 0.8], [2, 0.9],
                  [2, 1],
                  [3, 0], [3, 0.1], [3, 0.2], [3, 0.3], [3, 0.4], [3, 0.5], [3, 0.6], [3, 0.7], [3, 0.8], [3, 0.9],
                  [3, 1]])
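# 44 discrete actions: [target helper index 0-3, fraction of the workload offloaded to that helper]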


n_actions = len(actions)
n_features = 14  # size of the state vector: [workload, local_comp, 4x d2d_cap, 4x upload, 4x download]
lam_local,beta_local,cycle_perbyte,energy_per_l= 0.6,0.4,1,6
lam_re,beta_re,energy_per_r = 0.8,0.2,0.3
local_core_max,local_core_min=200,50
d2d_core_max,d2d_core_min=400,150
upload_max,upload_min = 350,100
download_max,download_min = 600,250
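# (interpretation, inferred from how the constants are used below)
# lam_* / beta_*                : weights on the delay and energy terms of the local / remote cost
# cycle_perbyte                 : CPU cycles required per byte of workload
# energy_per_l / energy_per_r   : energy cost per byte processed locally / offloaded
# *_core_*, upload_*, download_*: bounds applied when the resources drift over time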


def reset():
    workload = np.random.randint(2000,3000)   # task workload
    local_comp = np.random.randint(150,200)   # locally available computing resource
    upload = np.array([np.random.randint(150,200),np.random.randint(150,200),
                            np.random.randint(150,200),np.random.randint(150,200)])
    download = np.array([np.random.randint(150,200),np.random.randint(150,200),
                            np.random.randint(150,200),np.random.randint(150,200)])
    # download = np.array([np.random.randint(300,500),np.random.randint(300,500),
    #                      np.random.randint(300,500),np.random.randint(300,500)])
    # d2d_cap = np.array([np.random.randint(200,300),np.random.randint(200,300),
    #                         np.random.randint(200,300),np.random.randint(200,300)])
    d2d_cap = np.array([np.random.randint(150,200),np.random.randint(150,200),
                            np.random.randint(150,200),np.random.randint(150,200)])  # available computing resources of the 4 D2D helpers
    observation=np.array([workload,local_comp])
    return np.hstack((observation,d2d_cap,upload,download))

def d2d_step(observation,action,time1):
    workload,local_comp,d2d_cap,upload,download= \
        observation[0],observation[1],observation[2:6],observation[6:10],observation[10:14]
    target_d2d,percen = int(action[0]),action[1]

    # greedy reference: the helper with the most available computing resource (note: MAX_c is not used below)
    MAX_c = max(d2d_cap)

    # wait_local  = (local_core_max-local_comp)*0.1
    # wait_d2d = (np.array([d2d_core_max,d2d_core_max,d2d_core_max,d2d_core_max])-d2d_cap)*0.01
    wait_local,wait_d2d = 2,1
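    # cost model as implemented: each cost is a weighted sum of delay and energy, with
    # lam_* weighting the delay terms (compute, transfer, waiting) and beta_* the energy
    # terms; local_only / remote_only are the all-local / all-offloaded baselines, while
    # local_cost and remote_cost split the workload according to the chosen ratio `percen`.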
    local_cost = lam_local*workload*cycle_perbyte*(1-percen)/(local_comp)+beta_local*workload*energy_per_l*(1-percen)+lam_local*wait_local

    local_only = lam_local*workload*cycle_perbyte/(local_comp)+beta_local*workload*energy_per_l+lam_local*wait_local

    remote_only = workload * lam_re * (cycle_perbyte  / (d2d_cap[target_d2d]) +
                                       percen / upload[target_d2d] + 0.01 / download[target_d2d]) + lam_re * wait_d2d + \
                  beta_re * energy_per_r * workload

    remote_cost = workload * lam_re * ((cycle_perbyte * percen) / (d2d_cap[target_d2d])+
                percen / upload[target_d2d] + (percen * 0.01) / download[target_d2d]) + lam_re * wait_d2d + \
                 beta_re * energy_per_r * workload * percen

    # total weighted cost when a fraction `percen` is offloaded (computed but not used for the reward below)
    total_cost = workload * lam_local * ((cycle_perbyte * (1 - percen)) / (local_comp) +
                beta_local * energy_per_l * (1 - percen)) + lam_local * wait_local + \
                 workload * lam_re * ((cycle_perbyte * percen) / (d2d_cap[target_d2d])+
                percen / upload[target_d2d] + (percen * 0.01) / download[target_d2d]) + lam_re * wait_d2d + \
                 beta_re * energy_per_r * workload * percen

    total_cost_ = local_cost+remote_cost
    reward = -total_cost_   # the reward is the negative total cost
    # alternative reward: relative saving over local-only execution
    # reward = (local_only-total_cost_)/local_only
    np.random.seed(np.random.randint(1,1000))

    # simulate the transition to the next state
    a = np.random.uniform()
    b=0.9
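    # simulate time-varying resources over 108 slots split into three 36-slot phases:
    # in phases 1 and 3 the resources grow with probability 1-b (=0.1) and shrink otherwise,
    # in phase 2 they grow with probability b (=0.9); the workload drifts accordingly.
    # caution: d2d_cap/upload/download are views into `observation`, so the in-place updates
    # below also modify the caller's current state before it is stored in the replay memory.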
    if (time1>=0) and (time1<=36):
        if (a>b) :
            local_comp = min(local_comp+np.random.randint(0,6),local_core_max)
            for i in range(4):
                d2d_cap[i] = min(d2d_cap[i] + np.random.randint(0, 15), d2d_core_max)
                download[i] = min(download[i]+np.random.randint(0,8),download_max)
                upload[i] = min(upload[i]+np.random.randint(0,5),upload_max)

        else:
            local_comp = max(local_comp+np.random.randint(-5,0),local_core_min)
            for i in range(4):
                d2d_cap[i] = max(d2d_cap[i] + np.random.randint(-14, 0), d2d_core_min)
                download[i] = max(download[i] - np.random.randint(0, 8), download_min)
                upload[i] = max(upload[i] - np.random.randint(0, 5), upload_min)
        workload += np.random.randint(-100, 200)


    elif (time1>36) and (time1<=72):
        if (a < b):
            local_comp = min(local_comp + np.random.randint(0, 6), local_core_max)
            for i in range(4):
                d2d_cap[i] = min(d2d_cap[i] + np.random.randint(0, 15), d2d_core_max)  # author's note: the simulation simply draws random rate/resource changes instead of using, e.g., the Shannon formula
                download[i] = min(download[i] + np.random.randint(0, 8), download_max)
                upload[i] = min(upload[i] + np.random.randint(0, 5), upload_max)

        else:
            local_comp = max(local_comp + np.random.randint(-5, 0), local_core_min)
            for i in range(4):
                d2d_cap[i] = max(d2d_cap[i] + np.random.randint(-14, 0), d2d_core_min)
                download[i] = max(download[i] - np.random.randint(0, 8), download_min)
                upload[i] = max(upload[i] - np.random.randint(0, 5), upload_min)
        workload += np.random.randint(-200, 100)


    elif (time1>72) and (time1<=108):
        if (a > b):
            local_comp = min(local_comp + np.random.randint(0, 6), local_core_max)
            for i in range(4):
                d2d_cap[i] = min(d2d_cap[i] + np.random.randint(0, 15), d2d_core_max)
                download[i] = min(download[i] + np.random.randint(0, 8), download_max)
                upload[i] = min(upload[i] + np.random.randint(0, 5), upload_max)

        else:
            local_comp = max(local_comp + np.random.randint(-5, 0), local_core_min)
            for i in range(4):
                d2d_cap[i] = max(d2d_cap[i] + np.random.randint(-14, 0), d2d_core_min)
                download[i] = max(download[i] - np.random.randint(0, 8), download_min)
                upload[i] = max(upload[i] - np.random.randint(0, 5), upload_min)
        workload += np.random.randint(-100, 200)
    observation_ = np.array([workload,local_comp])
    observation_1 = np.hstack((observation_,d2d_cap,upload,download))
    return  observation_1,reward,local_only,remote_only

def run_d2d_offloading():
    step = 0
    local_only_cost,remote_only_cost,total_cost=[],[],[]
    for episode in range(100):

        observation = reset()

        for time_1 in range(108):
            print("当前状态值为:",observation)
            action = RL.choose_action(observation)
            print(action)

            observation_, reward ,local_only,remote_only= d2d_step(observation,action,time_1)

            RL.store_transition(observation, action, reward, observation_)

            if (step > 20) and (step % 5 == 0):
                 RL.learn()
            if  step>20 and step % 100 == 0:
                 local_only_cost.append(local_only)
                 remote_only_cost.append(remote_only)
                 total_cost.append(-reward)

            observation = observation_
            step += 1
    import matplotlib.pyplot as plt
    plt.rc('font',family='Times New Roman',size=14)
    plt.rc('axes',unicode_minus=False)
    plt.plot(np.arange(len(local_only_cost)), local_only_cost,'b')
    plt.plot(np.arange(len(remote_only_cost)), remote_only_cost,'g')
    plt.plot(np.arange(len(total_cost)), total_cost,'r')

    plt.legend(("Execute_Local","Execute_d2d","d2d_hybrid"))
    plt.ylabel('Cost')
    plt.xlabel('training steps')
    plt.savefig('compare.png',dpi=600)
    plt.show()
    # end of game
    print('game over')

if __name__ == "__main__":
    # build the DQN agent for the 44-action, 14-feature offloading problem

    RL = DeepQNetwork(n_actions, n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.99,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True
                      )
    run_d2d_offloading()
    RL.plot_cost()
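Side note: the 44-row action table is typed out literally in both files. It could just as well be generated, which also gives a mapping from an action pair back to its flat index (referenced again in the note at the end). A minimal sketch of my own, not part of the original code:

import numpy as np

N_HELPERS, N_RATIOS = 4, 11                     # 4 helpers x 11 offloading ratios
ratios = np.round(np.arange(N_RATIOS) / 10, 1)  # 0.0, 0.1, ..., 1.0

# reproduces the hard-coded 44 x 2 `actions` table above
actions = np.array([[h, r] for h in range(N_HELPERS) for r in ratios])

def action_to_index(action):
    # map an action pair [helper, ratio] back to its row index in `actions`
    helper, ratio = int(action[0]), action[1]
    return helper * N_RATIOS + int(round(ratio * 10))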

RL_brain.py

import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import os
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'
np.random.seed(1)
# tf.set_random_seed(1)
actions=np.array([[0,0],[0,0.1],[0,0.2],[0,0.3],[0,0.4],[0,0.5],[0,0.6],[0,0.7],[0,0.8],[0,0.9],[0,1],
                  [1, 0], [1, 0.1], [1, 0.2], [1, 0.3], [1, 0.4], [1, 0.5], [1, 0.6], [1, 0.7], [1, 0.8], [1, 0.9],
                  [1, 1],
                  [2, 0], [2, 0.1], [2, 0.2], [2, 0.3], [2, 0.4], [2, 0.5], [2, 0.6], [2, 0.7], [2, 0.8], [2, 0.9],
                  [2, 1],
                  [3, 0], [3, 0.1], [3, 0.2], [3, 0.3], [3, 0.4], [3, 0.5], [3, 0.6], [3, 0.7], [3, 0.8], [3, 0.9],
                  [3, 1]])
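# same 44-action table as in run_this.py; choose_action uses it to map a Q-network output
# index back to a [helper, offloading ratio] pair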
# Deep Q Network off-policy
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.05,
            reward_decay=0.9,
            e_greedy=0.99,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=0.001,
            output_graph=True,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        # self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.epsilon = 0
        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]; each state is described by n_features values
        self.memory = np.zeros((self.memory_size, n_features * 2 + 3))  # replay memory rows: 14 + 2 (action pair) + 1 (reward) + 14

        # consist of [target_net, evaluate_net]
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # tf.train.SummaryWriter soon be deprecated, use following
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):#建立目标网络和评估网络
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        with tf.variable_scope('eval_net'):
            # c_names(collections_names) are the collections to store variables
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            # first layer. collections is used later when assign to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            # second layer. collections is used later when assign to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l1, w2) + b2

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')    # input
        with tf.variable_scope('target_net'):
            # c_names(collections_names) are the collections to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. collections is used later when assign to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            # second layer. collections is used later when assign to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1, w2) + b2

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):  # create the counter on the first call
            self.memory_counter = 0

        transition = np.hstack((s, a, r, s_))  # store each transition as one row
        print("stored transition:", transition)
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):  # epsilon-greedy action selection
        # to have batch dimension when feed into tf placeholder
        observation = observation[np.newaxis, :]
        print("当前贪心率:",self.epsilon)
        if np.random.random() < self.epsilon:
            # forward feed the observation and get q value for every actions
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            index = np.argmax(actions_value)
            action = actions[index]  # greedy: pick the action with the largest Q value
        else:
            index = np.random.randint(0, self.n_actions)
            action = actions[index]
        return action

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],  # fixed params
                self.s: batch_memory[:, :self.n_features],  # newest params
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
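        # each memory row is [s (14), helper, ratio, r, s_ (14)]; only the helper index (0-3)
        # is read back below as the action column, so Q-targets only ever touch the first
        # four of the 44 outputs; see the note after the RL_brain.py listing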
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 2]

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # train eval network
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        if self.learn_step_counter >100:
            self.cost_his.append(self.cost)

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max

        print("贪心率为:",self.epsilon)
        self.learn_step_counter += 1

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.rc('font',family='Times New Roman',size=14)
        plt.rc('axes',unicode_minus=False)
        cost_ = self.cost_his
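        # drop the 100 largest loss values so the curve stays readable
        # (note: cost_ is a reference, so this also mutates self.cost_his in place)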
        for i in range (100):
            cost_.remove(max(cost_))
        plt.plot(np.arange(len(cost_)), cost_)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.savefig('result.png',dpi=500)
        plt.show()

The run results are shown below. They do not look particularly good; the parameter initialization is probably not well chosen.
[Result figures: the cost comparison plot (compare.png) and the DQN training-loss curve (result.png)]
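Beyond the parameter settings, one thing worth double-checking is the action encoding: store_transition writes the two-component action [helper, ratio] into the replay memory, but learn() reads back only column n_features (the helper index, 0-3) as the Q-value column to update, so targets are only ever written to four of the 44 outputs. A minimal sketch of one way to reconcile this, assuming the flat action index is stored instead (using the action_to_index helper sketched after run_this.py):

# in run_this.py: store the flat action index instead of the [helper, ratio] pair
a_idx = action_to_index(action)                         # 0..43
RL.store_transition(observation, a_idx, reward, observation_)

# in RL_brain.py: a memory row then holds [s (14), a_idx, r, s_ (14)] = 30 values
self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

# and learn() picks up the intended Q-value column:
eval_act_index = batch_memory[:, self.n_features].astype(int)   # now 0..43
reward = batch_memory[:, self.n_features + 1]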
