%程序如下:
% Example 4.3: Gambler's Problem (Sutton & Barto, "Reinforcement Learning:
% An Introduction", 2nd ed.), solved with in-place value iteration.
%
% Indexing convention:
%   V(s), s = 1..99 : value estimate when holding capital s.
%   V(100)          : goal terminal (capital reached 100).
%   V(101)          : ruin terminal (capital 0, which is not a valid index).
%   Both terminal entries stay 0; the +1 reward for reaching the goal is
%   added on the transition itself.
%
% Results left in the workspace:
%   V           (1x101) converged value estimates.
%   CurrentMaxA (1x99)  smallest non-zero stake whose action value is within
%                       tieTol of the best non-zero stake, per state.
goalIdx = 100;
ruinIdx = 101;
phead = 0.4;                    % probability that the coin comes up heads
maxSweeps = 20000;              % hard upper bound on the number of sweeps
theta = 1e-12;                  % stop once a whole sweep changes V by less than this
snapshotSweeps = [1 2 3 32];    % sweeps after which the estimate is plotted
tieTol = 1e-7;                  % action values this close to the best count as equal

V = zeros(1, 101);
vtemp = zeros(1, 100);          % scratch: action values of the state being updated

figure(1);
for jj = 1:maxSweeps
    sweepDelta = 0;             % largest change of any V(s) during this sweep
    for s = 1:99
        vOld = V(s);
        actionTotal = min(s, 100 - s);
        for action = 0:actionTotal
            shead = s + action;
            % Reward is recomputed for every stake so it can never leak
            % from one action to the next.
            rhead = double(shead >= 100);
            shead = min(shead, goalIdx);
            stail = s - action;
            if stail == 0
                stail = ruinIdx;
            end
            vtemp(action + 1) = phead * (rhead + V(shead)) + (1 - phead) * V(stail);
        end
        V(s) = max(vtemp(1:actionTotal + 1));   % best over all stakes at state s
        sweepDelta = max(sweepDelta, abs(V(s) - vOld));
    end
    if any(jj == snapshotSweeps)
        plot(V(1:99))
        hold on
    end
    % Converged: stop early, but never before the last snapshot sweep so
    % the figure always contains every requested curve.
    if sweepDelta < theta && jj >= max(snapshotSweeps)
        break;
    end
end

% Extract one greedy policy: for every state take the smallest non-zero
% stake that attains the best action value (up to tieTol).
CurrentMaxA = zeros(1, 99);
for s = 1:99
    actionTotal = min(s, 100 - s);
    vtemp(1) = V(s);            % slot of stake 0; not considered below
    for action = 1:actionTotal
        shead = s + action;
        rhead = double(shead >= 100);
        shead = min(shead, goalIdx);
        stail = s - action;
        if stail == 0
            stail = ruinIdx;
        end
        vtemp(action + 1) = phead * (rhead + V(shead)) + (1 - phead) * V(stail);
    end
    stakeValues = vtemp(2:actionTotal + 1);     % entry k is the value of stake k
    bestValue = max(stakeValues);
    CurrentMaxA(s) = find(abs(stakeValues - bestValue) <= tieTol, 1);
end
plot(V(1:99))
figure
plot(CurrentMaxA)
% disp does not interpret escape sequences, so no "\n" in the message.
disp('finished.....');
=========
运行结果如下:
图1
图2
这个程序花了不少时间,特别是下图最优决策,用到了一点小小的技巧。
补充说明:最优价值函数是唯一的,但最优策略并不唯一,如下图所示:
图3
横坐标是当前的赌资s(1-99),纵坐标是在赌资为s 时的最优下注数。很显然最优策略有很多种,如图3所示,对不同的s,最优下注数最多可有4种选择,都具有同样的价值。图2 是图3中的一种特殊情况(每个s 只取最小的非零最优下注数)。
极端情况是图3中落在横坐标轴上的那些点,即下注数为0(下面的代码把"不下注"也算作与最优值并列的动作)。不赌则不输!!!但要注意:如果每次都下注0,赌资永远不变,也就永远到不了100,所以"始终下注0"本身并不是最优策略。
可生成图3 的代码如下。每次运行会为每个s 在并列最优的下注数中随机选一个,并用hold on 叠加画到figure(5) 上,因此不要运行clear all,重复运行如下程序若干次即可得到图3:
% Example 4.3: Gambler's Problem (Sutton & Barto, "Reinforcement Learning:
% An Introduction", 2nd ed.), solved with in-place value iteration, followed
% by a random choice among all stakes that tie for the best action value.
%
% Indexing convention:
%   V(s), s = 1..99 : value estimate when holding capital s.
%   V(100)          : goal terminal (capital reached 100).
%   V(101)          : ruin terminal (capital 0, which is not a valid index).
%   Both terminal entries stay 0; the +1 reward for reaching the goal is
%   added on the transition itself.
%
% Results left in the workspace:
%   V           (1x101) converged value estimates.
%   MaxNo       (1x99)  number of stakes tied for the best value, per state.
%   MaxPos      (99xK)  the tied stakes of state s in MaxPos(s, 1:MaxNo(s)).
%   CurrentMaxA (1x99)  one tied stake per state, drawn uniformly at random.
%
% figure(5) is drawn with "hold on", so running this script again without
% closing the figure overlays another random selection on the same axes.
goalIdx = 100;
ruinIdx = 101;
phead = 0.4;                    % probability that the coin comes up heads
maxSweeps = 20000;              % hard upper bound on the number of sweeps
theta = 1e-12;                  % stop once a whole sweep changes V by less than this
snapshotSweeps = [1 2 3 32];    % sweeps after which the estimate is plotted
tieTol = 1e-7;                  % action values this close to the best count as equal

V = zeros(1, 101);
vtemp = zeros(1, 100);          % scratch: action values of the state being updated

figure(1);
for jj = 1:maxSweeps
    sweepDelta = 0;             % largest change of any V(s) during this sweep
    for s = 1:99
        vOld = V(s);
        actionTotal = min(s, 100 - s);
        for action = 0:actionTotal
            shead = s + action;
            % Reward is recomputed for every stake so it can never leak
            % from one action to the next.
            rhead = double(shead >= 100);
            shead = min(shead, goalIdx);
            stail = s - action;
            if stail == 0
                stail = ruinIdx;
            end
            vtemp(action + 1) = phead * (rhead + V(shead)) + (1 - phead) * V(stail);
        end
        V(s) = max(vtemp(1:actionTotal + 1));   % best over all stakes at state s
        sweepDelta = max(sweepDelta, abs(V(s) - vOld));
    end
    if any(jj == snapshotSweeps)
        plot(V(1:99))
        hold on
    end
    % Converged: stop early, but never before the last snapshot sweep so
    % the figure always contains every requested curve.
    if sweepDelta < theta && jj >= max(snapshotSweeps)
        break;
    end
end

% Collect, for every state, all stakes whose value ties with the best
% non-zero stake, then pick one of them at random.
CurrentMaxA = zeros(1, 99);
MaxNo = zeros(1, 99);
MaxPos = zeros(99, 4);          % columns are added automatically if more than 4 stakes tie
for s = 1:99
    actionTotal = min(s, 100 - s);
    vtemp(1) = V(s);            % stake 0 is represented by the current V(s)
    for action = 1:actionTotal
        shead = s + action;
        rhead = double(shead >= 100);
        shead = min(shead, goalIdx);
        stail = s - action;
        if stail == 0
            stail = ruinIdx;
        end
        vtemp(action + 1) = phead * (rhead + V(shead)) + (1 - phead) * V(stail);
    end
    bestValue = max(vtemp(2:actionTotal + 1));  % best over the non-zero stakes
    % Stake 0 (slot 1) takes part in the tie test; slot k holds stake k-1.
    tiedStakes = find(abs(vtemp(1:actionTotal + 1) - bestValue) <= tieTol) - 1;
    MaxNo(s) = numel(tiedStakes);
    MaxPos(s, 1:MaxNo(s)) = tiedStakes;
    CurrentMaxA(s) = tiedStakes(randi(MaxNo(s)));
end
plot(V(1:99))
figure(5)
plot(CurrentMaxA, '*')
hold on
% disp does not interpret escape sequences, so no "\n" in the message.
disp('finished.....');