# 无状态问题
# 中奖概率未知，目标是在有限的尝试次数内获得最大回报。
# 贪婪算法：
# 大概率选择目前中奖率最高的老虎机，小概率随机探索。
# 模拟十台老虎机，
# 即十个可供选择的按钮。
import random

import numpy as np
# Win probability of each of the ten machines, drawn uniformly from [0, 1).
probs = np.random.uniform(low=0.0, high=1.0, size=10)
# One reward history per machine. Each history starts with a single 1 so
# that its mean is defined before the machine has ever been played.
rewards = [[1] for _ in probs]
# Bare expression kept from the original; it has no effect in a script.
probs, rewards
# 动作函数：选择要玩的老虎机
def choose_one(epsilon=0.01):
    """Pick the index of the machine to play next (epsilon-greedy).

    With probability ``epsilon`` a machine is picked uniformly at random
    (exploration). Otherwise the machine whose recorded rewards have the
    highest mean is picked (exploitation); on ties ``np.argmax`` returns
    the lowest such index.

    Args:
        epsilon: Probability of exploring instead of exploiting. Defaults
            to 0.01, the value that was previously hard-coded.

    Returns:
        int: An index into the module-level ``rewards`` list.
    """
    # Explore: ignore the statistics gathered so far and try any machine.
    if random.random() < epsilon:
        # Take the range from ``rewards`` instead of hard-coding 0..9, so
        # the policy stays valid if the number of machines changes.
        return random.randrange(len(rewards))

    # Exploit: play the machine with the best empirical win rate.
    rewards_mean = [np.mean(history) for history in rewards]
    # Convert to a plain int so both branches return the same type.
    return int(np.argmax(rewards_mean))
# Call the policy once; the returned index is not stored.
choose_one()
def try_and_play():
    """Play one round: choose a machine, draw its outcome, record the reward.

    The machine is chosen by ``choose_one()``. The round is won when a
    uniform random draw falls below that machine's entry in ``probs``.
    The outcome is appended to the machine's history in ``rewards``.

    Returns:
        int: The reward obtained this round, 1 for a win and 0 otherwise.
    """
    i = choose_one()
    # Win with probability probs[i].
    reward = 1 if random.random() < probs[i] else 0
    # Record losses as well as wins so the empirical mean stays meaningful.
    rewards[i].append(reward)
    return reward
# Play a single round, then reference the updated reward history.
# NOTE(review): the bare `rewards` expression has no effect when run as a
# script; presumably left over from a notebook cell that displayed it.
try_and_play()
rewards
# Use up the whole budget of plays, then evaluate the outcome.
def get_result(n=5000):
    """Play ``n`` rounds and compare the payoff with the ideal one.

    Args:
        n: Number of rounds to play. Defaults to 5000, the value that was
            previously hard-coded.

    Returns:
        tuple: ``(target, result)``. ``target`` is ``probs.max() * n``, the
        expected payoff of playing the best machine every round.
        ``result`` is the total reward collected during these ``n`` rounds.
    """
    # Snapshot the total already in the histories. They hold the seeded
    # initial 1s and any rounds played earlier, which must not be counted
    # as payoff of this run.
    collected_before = sum(sum(history) for history in rewards)

    # Play n rounds.
    for _ in range(n):
        try_and_play()

    # Best result one could expect.
    target = probs.max() * n
    # Result actually achieved in this run.
    result = sum(sum(history) for history in rewards) - collected_before
    return target, result
# Run the full experiment; the returned (target, result) tuple is not stored.
get_result()