[A MATLAB implementation of Example 4.3 (Gambler's Problem) from Sutton's "Reinforcement Learning: An Introduction"]


The program is as follows:
% Example 4.3: Gambler's Problem. Sutton & Barto, Reinforcement Learning: An Introduction, 2nd ed.
V = zeros(1,101);       % V(s): value of capital s; index 101 stands for capital 0
actionMax = zeros(1,99);

V(100) = 0;             % reached the $100 goal (the +1 reward is paid on the transition)
V(101) = 0;             % gambler ran out of all money
Delta = zeros(1,99);
phead = 0.4;            % probability of heads
vtemp = zeros(1,100);
figure(1);
for jj = 1:20000
  for s  = 1:99
    % s = floor(rand()*99)+1;
    v = V(s);
    
    rhead = 0;
    
    actionTotal = min( s, 100 - s);
    for action = 0 : actionTotal
        shead = s + action;
        if shead >= 100
            rhead = 1;
            shead = 100;
        end
        stail = s - action;
        if stail == 0
            stail = 101;
        end
        vtemp( action+1 ) = phead * ( rhead + V(shead)) + (1 - phead ) * V( stail);  
    end    
    V(s) = max( vtemp(1:actionTotal+1));  % greedy backup: max over all stakes in state s
    Delta(s) = max( Delta(s), abs(V(s) - v ));
%     ll = find( vtemp == V(s));
%     actionMax(s) = ll(1);
 
  end
  if jj == 1 || jj == 2 || jj == 3 || jj == 32   % sweeps snapshotted in Figure 1
    plot(V(1:99))
    hold on
  end
end
% Extract one optimal policy pi*(s)
CurrentMaxA = zeros(1,99);   % optimal stake chosen in state s
for s = 1:99
    rhead = 0;

    actionTotal = min( s, 100 - s);

    vtemp(1) = V(s);   % stake 0 leaves the capital (and value) unchanged
    for action = 1: actionTotal
        shead = s + action;
        if shead >= 100
            rhead = 1;
            shead = 100;
        end
        stail = s - action;
        if stail == 0
            stail = 101;
        end
        vtemp( action+1 ) = phead * ( rhead + V(shead)) + (1 - phead ) * V( stail);  
    end   
    tt = max( vtemp(2:actionTotal+1));  % best value over non-zero stakes
    MaxA = 1;
    for tt1 = 2:actionTotal+1            % first non-zero stake that ties the max
       if abs((vtemp(tt1) - tt)) <= 1e-7
           MaxA = tt1;
           break;
       end
    end
    
    CurrentMaxA(s) = MaxA-1;  
end

plot(V(1:99))
figure
plot(CurrentMaxA)
disp("finished.");
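For readers without MATLAB, the value-iteration sweep above can be sketched in plain Python (a minimal re-implementation, not the original code; names such as `P_HEAD`, `GOAL`, and `best` are mine):

```python
# Value iteration for Example 4.3 (Gambler's Problem), p(heads) = 0.4.
P_HEAD = 0.4
GOAL = 100

# V[s] for capital s = 0..100; both terminals are worth 0, and the +1
# reward is paid on the transition into s = 100.
V = [0.0] * (GOAL + 1)

for sweep in range(1000):                      # in-place (Gauss-Seidel) sweeps
    for s in range(1, GOAL):                   # capital 1..99
        best = 0.0
        for a in range(min(s, GOAL - s) + 1):  # stakes 0..min(s, 100 - s)
            r = 1.0 if s + a == GOAL else 0.0
            q = P_HEAD * (r + V[s + a]) + (1 - P_HEAD) * V[s - a]
            best = max(best, q)
        V[s] = best                            # greedy backup, as in the MATLAB code
```

Starting from V = 0, the sweeps increase monotonically toward the optimal values, so the dyadic states can be checked by hand: V(50) = 0.4 (stake everything, win with probability 0.4), V(25) = 0.4 * V(50) = 0.16, and V(75) = 0.4 + 0.6 * V(50) = 0.64.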

=========
The results are as follows:

Figure 1: the value function
Figure 2: one optimal policy

This program took quite some time to get right; the optimal-policy plot below, in particular, needed a small trick.

A further note: the optimal value function is unique, but there are many optimal policies, as shown below:

Figure 3: possible optimal policies corresponding to the value function of Figure 1

The horizontal axis is the current capital s (1 to 99); the vertical axis is the optimal stake at capital s. Clearly there are many optimal policies: as Figure 3 shows, for a given s there can be up to four stakes that all achieve the same value. Figure 2 is one particular selection from Figure 3.
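The tie-detection trick can be expressed compactly in Python: converge V, then for each state keep every non-zero stake whose backup value lies within 1e-7 of that state's best (a self-contained sketch, not the original code; `optimal_stakes` is my own name):

```python
P_HEAD, GOAL, TOL = 0.4, 100, 1e-7

# Converge the value function first (same sweep as the MATLAB program).
V = [0.0] * (GOAL + 1)
for sweep in range(2000):
    for s in range(1, GOAL):
        V[s] = max(P_HEAD * ((s + a == GOAL) + V[s + a]) + (1 - P_HEAD) * V[s - a]
                   for a in range(min(s, GOAL - s) + 1))

# Collect every non-zero stake that ties the per-state maximum.
optimal_stakes = {}
for s in range(1, GOAL):
    q = {a: P_HEAD * ((s + a == GOAL) + V[s + a]) + (1 - P_HEAD) * V[s - a]
         for a in range(1, min(s, GOAL - s) + 1)}
    best = max(q.values())
    optimal_stakes[s] = [a for a, v in q.items() if abs(v - best) <= TOL]
```

Plotting one element of `optimal_stakes[s]` per state reproduces a policy like Figure 2; plotting all of them gives the multi-valued picture of Figure 3.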

In the extreme case you can take the horizontal axis itself, i.e. stake 0 every time: no bet, no loss! (A stake of 0 leaves the capital unchanged, so its backup phead*V(s) + (1-phead)*V(s) = V(s) always ties the maximum at the fixed point, even though the gambler then never reaches the goal. This is why the policy extraction in the first program scans stakes from 1 upward.)

The code that generates Figure 3 is given below. Do not run `clear all`; re-running this program repeatedly yields a different random selection among the tied optimal stakes each time:

% Example 4.3: Gambler's Problem. Sutton & Barto, Reinforcement Learning: An Introduction, 2nd ed.
V = zeros(1,101);       % V(s): value of capital s; index 101 stands for capital 0
actionMax = zeros(1,99);

V(100) = 0;             % reached the $100 goal (the +1 reward is paid on the transition)
V(101) = 0;             % gambler ran out of all money
Delta = zeros(1,99);
phead = 0.4;            % probability of heads
vtemp = zeros(1,100);
figure(1);
for jj = 1:20000
  for s  = 1:99
    % s = floor(rand()*99)+1;
    v = V(s);
    
    rhead = 0;
    
    actionTotal = min( s, 100 - s);
    for action = 0 : actionTotal
        shead = s + action;
        if shead >= 100
            rhead = 1;
            shead = 100;
        end
        stail = s - action;
        if stail == 0
            stail = 101;
        end
        vtemp( action+1 ) = phead * ( rhead + V(shead)) + (1 - phead ) * V( stail);  
    end    
    V(s) = max( vtemp(1:actionTotal+1));  % greedy backup: max over all stakes in state s
    Delta(s) = max( Delta(s), abs(V(s) - v ));
%     ll = find( vtemp == V(s));
%     actionMax(s) = ll(1);
 
  end
  if jj == 1 || jj == 2 || jj == 3 || jj == 32   % sweeps snapshotted in Figure 1
    plot(V(1:99))
    hold on
  end
end
% Extract an optimal policy pi*(s), breaking ties at random
CurrentMaxA = zeros(1,99);   % optimal stake chosen in state s
MaxNo = zeros(1,99);         % number of tied optimal stakes in state s
MaxPos = zeros(99,4);        % the tied optimal stakes themselves (at most 4 observed)
for s = 1:99
    rhead = 0;

    actionTotal = min( s, 100 - s);

    vtemp(1) = V(s);   % stake 0 leaves the capital (and value) unchanged
    for action = 1: actionTotal
        shead = s + action;
        if shead >= 100
            rhead = 1;
            shead = 100;
        end
        stail = s - action;
        if stail == 0
            stail = 101;
        end
        vtemp( action+1 ) = phead * ( rhead + V(shead)) + (1 - phead ) * V( stail);  
    end   
    tt = max( vtemp(2:actionTotal+1));  % best value over non-zero stakes
    MaxA = 1;
   
    tt0 = 0;
    for tt1 = 1:actionTotal+1            % stake 0 included: it always ties the max
       if abs((vtemp(tt1) - tt)) <= 1e-7
           MaxA = tt1;
           MaxNo(s) = MaxNo(s) + 1;
           MaxPos(s,tt0+1) = MaxA -1;
           tt0 = tt0 + 1;
       end   
    end
    tt2 = floor(rand()*tt0);             % pick one of the tied optimal stakes at random
    CurrentMaxA(s) = MaxPos(s,tt2+1);
end

plot(V(1:99))
figure(5)
plot(CurrentMaxA,'*')
hold on
disp("finished.");
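The random tie-break at the end (`tt2 = floor(rand()*tt0)`) is what makes each re-run draw a different optimal policy. In Python the same idea is a `random.choice` over each state's tied stakes, stake 0 included, matching this second program (again a sketch with assumed names, not the original code):

```python
import random

P_HEAD, GOAL, TOL = 0.4, 100, 1e-7

# Converge V as before.
V = [0.0] * (GOAL + 1)
for sweep in range(2000):
    for s in range(1, GOAL):
        V[s] = max(P_HEAD * ((s + a == GOAL) + V[s + a]) + (1 - P_HEAD) * V[s - a]
                   for a in range(min(s, GOAL - s) + 1))

# One random draw from the tied optimal stakes per state; stake 0 always
# ties (it leaves the capital unchanged), so it is always a candidate.
policy = {}
for s in range(1, GOAL):
    q = {a: P_HEAD * ((s + a == GOAL) + V[s + a]) + (1 - P_HEAD) * V[s - a]
         for a in range(min(s, GOAL - s) + 1)}
    best = max(q.values())
    ties = [a for a, v in q.items() if abs(v - best) <= TOL]
    policy[s] = random.choice(ties)   # a different optimal policy on every run
```

Plotting `policy` with `'*'` markers, as the MATLAB code does, gives one random member of the family of optimal policies in Figure 3 on each run.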