Q学习算法举例（MATLAB实现）

最新推荐文章于 2024-05-30 07:30:00 发布

weixin_45882238

最新推荐文章于 2024-05-30 07:30:00 发布

阅读量1.1k

点赞数 3

文章标签：人工智能

本文链接：https://blog.csdn.net/weixin_45882238/article/details/107304686

版权

Q学习是强化学习的一种方式。下面给出一个例子：机器人只能向左、向右、向上、向下四个方向走，有陷阱的方向不能走；机器人只有0.8的概率按照指定方向走，其余可行的方向均分剩下的0.2。给定初始位置和目标位置，求出从初始位置到目标位置的一条最短路径。
在这里插入图片描述

在这里插入代码片

% Driver script: build the grid world, learn the Q table, then plot the
% greedy route from the start cell to the goal cell.

% clear
% clc

n=18;                       % number of grid rows
m=20;                       % number of grid columns

start=[2,3];                % start cell as [row col]
goal=[18,20];               % goal cell as [row col]

r=zeros(n,m);               % reward for entering each cell
r(goal(1),goal(2))=100;     % only the goal cell carries a reward

gamma=0.9;                  % discount factor

% trap lists the blocked cells (traps and pillars), one [row col] pair per row.
% The literal is continued with "..." because a bare line break inside
% brackets would be read as a row separator and break the 2-column layout.
trap=[2 2;5 6;3 3;4 2;1 4;6 2;6 9;7 8;9 6;10 10;10 4;9 3;4 10;3 9;8 4;8 2;2 6; ...
      3 7;11 8;7 11;8 9;6 4;7 6;1 11;11 1;4 4;13 14;13 15;2 4;4 8];

% trap=[2 2;2 4];
% trap=[];

stop=8000;                  % number of episodes (only used by the check2 variant below)

% q=check2(trap ,goal,r,gamma,stop);   % learn the value table q from random start cells
q=check2_2(trap ,goal,r,gamma);

goplot2(start,goal,q,trap,n,m);        % follow q from start to goal and draw the route

% seek(2,3,n,m,q);

%******************************************************

function [q]=check2(trap,goal,r,gamma,stop)
% CHECK2 Learn a Q table by running greedy episodes from random start cells.
%
%   q = check2(trap,goal,r,gamma,stop)
%
%   Inputs
%     trap  - K-by-2 list of blocked cells, one [row col] pair per row
%             (may be empty)
%     goal  - [row col] of the terminal cell
%     r     - n-by-m matrix; r(x,y) is the reward for entering cell (x,y)
%     gamma - discount factor
%     stop  - number of episodes to run
%
%   Output
%     q - (n*m)-by-4 table. Row (x-1)*m+y belongs to cell (x,y). Columns:
%         1 = row+1 (down), 2 = row-1 (up), 3 = col-1 (left), 4 = col+1 (right).
%         Entries for moves that leave the grid or enter a trap stay 0.
%
%   Moves are deterministic: no action noise is modelled and actions are not
%   mixed. Elapsed time is reported through tic/toc.

[n,m]=size(r);
numStates=n*m;
q=zeros(numStates,4);

% status: 1 marks the goal, -1 marks a cell that cannot be entered, 0 is free.
status=zeros(n,m);
for i=1:size(trap,1)
    status(trap(i,1),trap(i,2))=-1;
end
status(goal(1),goal(2))=1;

% Per-action offsets, indexed by the column convention of q.
dRow=[1 -1 0 0];
dCol=[0 0 -1 1];
dK=[m -m -1 1];

% Safety cap per episode: a start cell in a region cut off from the goal
% would otherwise wander forever because the goal is never reached.
maxSteps=1000*numStates;

tic
for i=1:stop
    % randi draws every state with equal probability (round(rand*siz)+1
    % gives the first and last state only half the weight of the others).
    k=randi(numStates);
    x1=ceil(k/m);
    y1=k-(x1-1)*m;
    steps=0;

    % Walk until the goal (or a blocked start cell) ends the episode.
    while status(x1,y1)~=1&&status(x1,y1)~=-1&&steps<maxSteps
        steps=steps+1;

        % Refresh q for every legal move out of the current cell.
        allowed=false(1,4);
        for a=1:4
            nx=x1+dRow(a);
            ny=y1+dCol(a);
            if nx>=1&&nx<=n&&ny>=1&&ny<=m&&status(nx,ny)~=-1
                q(k,a)=r(nx,ny)+gamma*max(q(k+dK(a),1:4));
                allowed(a)=true;
            end
        end

        % A cell with no legal move cannot continue the episode; the old
        % rejection-sampling loop never terminated in this situation.
        if ~any(allowed)
            break;
        end

        % Pick uniformly among the legal moves that share the best value.
        % The maximum is taken over legal moves only, so negative rewards
        % cannot be outvoted by the zeros stored for illegal moves.
        best=max(q(k,allowed));
        candidates=find(allowed&q(k,1:4)==best);
        pp=candidates(randi(numel(candidates)));

        x1=x1+dRow(pp);
        y1=y1+dCol(pp);
        k=k+dK(pp);
    end
end
toc
end

%********************************************************************

function [path]=goplot2(start,goal,q,trap,n,m)
% GOPLOT2 Follow the best-valued action of q from start to goal and draw it.
%
%   path = goplot2(start,goal,q,trap,n,m)
%
%   Inputs
%     start - [row col] of the first cell
%     goal  - [row col] of the target cell
%     q     - (n*m)-by-4 value table; row (x-1)*m+y belongs to cell (x,y),
%             columns: 1 = row+1, 2 = row-1, 3 = col-1, 4 = col+1
%     trap  - K-by-2 list of blocked cells, one [row col] pair per row
%             (may be empty)
%     n, m  - number of grid rows and columns
%
%   Output
%     path - P-by-2 list of visited cells, from start to goal inclusive
%
%   When several actions share the maximum value one of them is chosen at
%   random, so repeated calls with the same q may return different routes.
%   The route is drawn in figure 1.

s=start;                      % keep the original start for the plot
k=(start(1)-1)*m+start(2);    % row-major state index (also correct for the last column)
path=start;

while start(1)~=goal(1)||start(2)~=goal(2)
    [best,index]=max(q(k,1:4));

    % Break ties uniformly at random among the equally good actions.
    ties=find(q(k,1:4)==best);
    if numel(ties)>1
        index=ties(randi(numel(ties)));
    end

    switch index
        case 1
            start(1)=start(1)+1;
            k=k+m;
        case 2
            start(1)=start(1)-1;
            k=k-m;
        case 3
            start(2)=start(2)-1;
            k=k-1;
        case 4
            start(2)=start(2)+1;
            k=k+1;
    end

    % Leaving the grid would silently wrap k into another row (or index q
    % out of range), so stop with a clear message instead.
    if start(1)<1||start(1)>n||start(2)<1||start(2)>m
        error('goplot2:offGrid', ...
            'The route left the grid at [%d %d]; q holds no usable action there.', ...
            start(1),start(2));
    end

    path=[path;start]; %#ok<AGROW>
end

% Number of visited cells. size(...,1) is used because length() returns 2
% for a 1-by-2 path (start equal to goal) and would index past the end.
leng=size(path,1);

fig11=figure(1);
set(fig11,'Position',[550,50,650,550]);
clf % redraw the figure from scratch
hold on;

% Background: one cyan square per grid cell (x axis = column, y axis = row).
for i=1:m
    for j=1:n
        rectangle('Position',[i-0.5,j-0.5,1,1],'facecolor','c');
    end
end

% Highlight the goal and the start cell.
rectangle('Position',[goal(2)-0.5,goal(1)-0.5,1,1],'facecolor','g');
rectangle('Position',[s(2)-0.5,s(1)-0.5,1,1],'facecolor','m');

% Mark every visited cell except the last one (drawn further below).
for i=1:leng-1
    plot(path(i,2),path(i,1),'*');
end

% Row and column index labels along the axes.
for i=1:n
    a=num2str(i);
    text(-0.2,i,a);
end
for i=1:m
    a=num2str(i);
    text(i,-0.1,a);
end

axis([0.5 ,m+0.5,0.5,n+0.5]);
set(gca,'ytick',[]);
set(gca,'xtick',[]);

% The route itself as a red line.
plot(path(:,2),path(:,1),'r');

% Grid lines.
for i=1:m
    plot([i-0.5,i-0.5],[0,n+0.5],'b');
end
for i=1:n
    plot([0,m+0.5],[i-0.5,i-0.5],'b');
end

plot(path(leng,2),path(leng,1),'*');

text(-1.2,5.5,'行坐标');
text(5.3,-0.7,'列坐标');
text(goal(2)-0.5,goal(1)+0.3,'goal');
text(s(2)-0.3,s(1)-0.2,'start');

% Blocked cells are drawn last so they sit on top of the background.
for i=1:size(trap,1)
    rectangle('Position',[trap(i,2)-0.5,trap(i,1)-0.5,1,1],'facecolor',[0.4,0.45,0.5]);
    text(trap(i,2)-0.2,trap(i,1),'T');
end

end
下面是改进 check2 后得到的 check2_2（不再随机抽取起点，而是按顺序遍历所有格子，并重复 10 轮）：
function [q]=check2_2(trap ,goal,r,gamma)
% CHECK2_2 Learn a Q table by sweeping every cell as an episode start.
%
%   q = check2_2(trap,goal,r,gamma)
%
%   Inputs
%     trap  - K-by-2 list of blocked cells, one [row col] pair per row
%             (may be empty)
%     goal  - [row col] of the terminal cell
%     r     - n-by-m matrix; r(x,y) is the reward for entering cell (x,y)
%     gamma - discount factor
%
%   Output
%     q - (n*m)-by-4 table. Row (x-1)*m+y belongs to cell (x,y). Columns:
%         1 = row+1 (down), 2 = row-1 (up), 3 = col-1 (left), 4 = col+1 (right).
%         Entries for moves that leave the grid or enter a trap stay 0.
%
%   Every cell is used as an episode start, in index order, and the whole
%   sweep is repeated 10 times. Moves are deterministic: no action noise is
%   modelled and actions are not mixed. Elapsed time is reported via tic/toc.

[n,m]=size(r);
numStates=n*m;
q=zeros(numStates,4);

% status: 1 marks the goal, -1 marks a cell that cannot be entered, 0 is free.
status=zeros(n,m);
for i=1:size(trap,1)
    status(trap(i,1),trap(i,2))=-1; % blocked cell
end
status(goal(1),goal(2))=1;          % goal cell

% Per-action offsets, indexed by the column convention of q.
rowStep=[1 -1 0 0];
colStep=[0 0 -1 1];
idxStep=[m -m -1 1];

numSweeps=10;
% Safety cap per episode: a start cell in a region cut off from the goal
% would otherwise wander forever because the goal is never reached.
maxSteps=1000*numStates;

tic
for sweep=1:numSweeps
    for i=1:numStates
        k=i;
        x1=ceil(k/m);
        y1=k-(x1-1)*m;
        steps=0;

        % Walk until the goal (or a blocked start cell) ends the episode.
        while status(x1,y1)~=1&&status(x1,y1)~=-1&&steps<maxSteps
            steps=steps+1;

            % Refresh q for every legal move out of the current cell.
            legal=false(1,4);
            for a=1:4
                nx=x1+rowStep(a);
                ny=y1+colStep(a);
                if nx>=1&&nx<=n&&ny>=1&&ny<=m&&status(nx,ny)~=-1
                    q(k,a)=r(nx,ny)+gamma*max(q(k+idxStep(a),1:4));
                    legal(a)=true;
                end
            end

            % A cell with no legal move cannot continue the episode; the old
            % rejection-sampling loop never terminated in this situation.
            if ~any(legal)
                break;
            end

            % Pick uniformly among the legal moves that share the best value.
            % The maximum is taken over legal moves only, so negative rewards
            % cannot be outvoted by the zeros stored for illegal moves.
            best=max(q(k,legal));
            candidates=find(legal&q(k,1:4)==best);
            pp=candidates(randi(numel(candidates)));

            x1=x1+rowStep(pp);
            y1=y1+colStep(pp);
            k=k+idxStep(pp);
        end
    end
end
toc
end

weixin_45882238

关注

3
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
Q学习算法举例（MATLAB实现）

q学习是强化学习的一种方式下面给出一个例子机器人只能向左向右向上向下四个方向走，有陷阱的方向不能走，机器只有0.8的概率按照指定方向走，其余可行的方向中均分0.2。给定初始位置和目标位置给出初始位置到目标位置的一条最短路径。在这里插入代码片% clear% clcn=18;%行数m=20;%列数
复制链接

扫一扫