python实现bandit算法

用 Python 实现 bandit 算法：分别用 normal/uniform/poisson/gamma/laplace 分布生成各臂的期望 reward（每次拉臂时再叠加标准正态噪声），参数取 epsilon=0.1（epsilon-greedy）、c=1（UCB）和 alpha=0.25（gradient 方法），并绘出图像。

代码如下:

# 设定epsilon=0.1,c=1,alpha=0.25 

import numpy as np
import matplotlib.pyplot as plt

class E_greedy:
    """Stationary multi-armed bandit simulator with three strategies.

    Despite its name the class implements epsilon-greedy (`e_greedy`),
    upper-confidence-bound (`ucb`) and gradient-bandit (`gradient`) action
    selection on the same set of arms.

    Attributes:
        arms: per-arm mean reward; a pull returns that mean plus N(0, 1) noise.
        arm_num: number of arms (expected to equal ``len(arms)``).
        epsilon: exploration probability used by `e_greedy`.
        T: number of steps performed by each strategy run.
        Q: sample-average value estimate of every arm.
        NA: number of times every arm has been pulled.
        R: running average of all rewards received in the current run.
        R_list: ``(step, running average reward)`` pairs, one per step.
        average_reward: one entry per completed window of ``WINDOW`` steps,
            holding the mean of the running-average values in that window.
        HA / HAP: gradient-bandit preferences and their softmax probabilities.
        R_: per-arm sample-average reward tracked by `gradient`.
    """

    # Number of steps summarised by each entry of `average_reward`.
    WINDOW = 100

    def __init__(self,arms,arm_num=10,epsilon=0.1):
        self.arm_num = arm_num
        self.epsilon = epsilon
        self.arms = arms
        self.Q = np.zeros(arm_num)

        self.NA = np.zeros(arm_num)
        self.T = 2500
        self.average_reward = np.zeros(self.T // self.WINDOW)
        self.R = 0
        self.R_list = []
        self.HAP = np.zeros(arm_num)
        self.HA = np.zeros(arm_num)
        self.R_ = np.zeros(arm_num)

    def reset(self):
        """Clear all learned state so that the next strategy starts fresh."""
        self.Q = np.zeros(self.arm_num)
        self.NA = np.zeros(self.arm_num)
        self.R_list = []
        self.R = 0
        # The gradient-method state must be cleared as well, otherwise a
        # second gradient run would start from stale preferences.
        self.HA = np.zeros(self.arm_num)
        self.HAP = np.zeros(self.arm_num)
        self.R_ = np.zeros(self.arm_num)
        self.average_reward = np.zeros(self.T // self.WINDOW)

    def get_reward(self,arm_index):
        """Return a noisy reward for pulling `arm_index`: its mean + N(0, 1)."""
        return self.arms[arm_index] + np.random.normal(0, 1)

    def update_Q_NA(self,arm_index,reward):
        """Count the pull and move Q[arm_index] toward `reward` (sample average)."""
        self.NA[arm_index] += 1
        self.Q[arm_index] += 1/self.NA[arm_index]*(reward-self.Q[arm_index])

    def update_NA(self,arm_index):
        """Count one pull of `arm_index` without touching the value estimate."""
        self.NA[arm_index] += 1

    def _begin_run(self):
        """Size `average_reward` for the current T before a strategy runs."""
        self.average_reward = np.zeros(self.T // self.WINDOW)

    def _track_reward(self,step,reward):
        """Fold `reward` into the running average and report finished windows.

        Every `WINDOW` steps the mean of the last `WINDOW` running-average
        values is printed and stored in `average_reward`.
        """
        self.R += 1/step*(reward-self.R)
        self.R_list.append((step,self.R))
        if step % self.WINDOW == 0:
            # R_list holds (step, value) pairs: average the values only, not
            # the step numbers.
            window_mean = np.mean([value for _, value in self.R_list[-self.WINDOW:]])
            self.average_reward[step // self.WINDOW - 1] = window_mean
            print(window_mean)

    def e_greedy(self,epsilon=None):
        """Run T steps of epsilon-greedy action selection.

        Args:
            epsilon: exploration probability; when None the current
                `self.epsilon` is kept. ``0`` is a valid value (pure greedy).
        """
        # `is not None` so that an explicit epsilon of 0 is honoured.
        if epsilon is not None:
            self.epsilon = epsilon
        self._begin_run()
        print('\n\n以下为e-greedy')
        for step in range(1,self.T+1):
            # The very first step is always random: all estimates are equal,
            # so argmax would deterministically favour arm 0.
            if np.random.random() <= self.epsilon or step == 1:
                arm_index = np.random.randint(0, self.arm_num)
            else:
                arm_index = np.argmax(self.Q)
            reward = self.get_reward(arm_index)
            self._track_reward(step,reward)
            self.update_Q_NA(arm_index,reward)

    def ucb(self,c):
        """Run T steps of UCB action selection with exploration weight `c`."""
        self._begin_run()
        print('\n\n以下为UCB')
        for step in range(1,self.T+1):
            # The 1e-8 keeps the bonus finite (but huge) for untried arms, so
            # every arm gets pulled before any arm is pulled twice.
            A_list = self.Q + c*(np.sqrt(np.log(step)/(self.NA+1e-8)))
            arm_index = np.argmax(A_list)
            reward = self.get_reward(arm_index)
            self._track_reward(step,reward)
            self.update_Q_NA(arm_index,reward)

    def get_HA(self):
        """Refresh `HAP` as the softmax of the preferences `HA`."""
        # Subtracting the maximum leaves the softmax unchanged but prevents
        # exp() from overflowing to inf (which would yield nan probabilities).
        weights = np.exp(self.HA - np.max(self.HA))
        self.HAP = weights/np.sum(weights)

    def gradient(self,a):
        """Run T steps of the gradient-bandit algorithm with step size `a`."""
        self._begin_run()
        print('\n\n以下为gradient')
        for step in range(1,self.T+1):
            self.get_HA()
            arm_index = np.random.choice(self.arm_num,p=self.HAP)
            reward = self.get_reward(arm_index)
            self.update_NA(arm_index)
            self.R_[arm_index] += 1/self.NA[arm_index] * (reward - self.R_[arm_index])
            # The baseline is the average of all rewards seen before this step
            # (the first reward serves as its own baseline). A per-arm average
            # must not be used here: each arm's reward minus its own mean is
            # zero in expectation, so the preferences would never learn.
            baseline = reward if step == 1 else self.R
            self._track_reward(step,reward)
            # H[chosen] += a*(r-b)*(1-pi[chosen]);  H[other] -= a*(r-b)*pi[other]
            chosen = np.zeros(self.arm_num)
            chosen[arm_index] = 1
            self.HA += a * (reward - baseline) * (chosen - self.HAP)

    def plot_R(self,c):
        """Plot the running average reward in colour `c` plus the best arm mean.

        The first 100 recorded steps are left out of the plot; the best arm's
        mean reward is drawn as a red horizontal reference line.
        """
        tail = self.R_list[100:]
        steps = [step for step, _ in tail]
        plt.plot(steps, [value for _, value in tail], c=c)
        plt.plot(steps, [np.max(self.arms) for _ in tail],c='r')

def main(arms):
    """Run UCB, epsilon-greedy and gradient on `arms` and plot all three.

    Args:
        arms: sequence of per-arm mean rewards; its length determines the
            number of arms.

    The running average reward of each strategy is drawn in one figure
    (UCB green, epsilon-greedy blue, gradient black) and shown with
    ``plt.show()``.
    """
    # Size the bandit from the data instead of relying on the default of 10
    # arms, so that any number of arms works.
    e = E_greedy(arms, arm_num=len(arms))
    e.ucb(1)
    e.plot_R(c='g')
    e.reset()
    e.e_greedy(0.1)
    e.plot_R(c='b')
    e.reset()
    e.gradient(0.25)
    e.plot_R(c='k')
    plt.show()

# Number of arms in every experiment.
n = 10

# Each experiment draws the arm means from a different distribution and then
# runs all three strategies on them.
s1 = np.random.normal(loc=5, scale=7.2, size=n)
main(s1)
# No way was found to draw the five figures at the same time: the next figure
# is drawn once the previous figure's window has been closed.
s2 = np.random.uniform(low=3, high=8, size=n)
main(s2)
s3 = np.random.poisson(lam=5.1, size=n)
main(s3)
s4 = np.random.gamma(shape=2.65, scale=2, size=n)
main(s4)
s5 = np.random.laplace(loc=5.25, scale=1, size=n)
main(s5)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

today__present

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值