Q学习(Q-learning)是强化学习的一种方法。下面给出一个例子:机器人只能向左、向右、向上、向下四个方向走,有陷阱的方向不能走;机器人只有0.8的概率按照指定方向走,其余可行的方向均分剩下的0.2。给定初始位置和目标位置,求出从初始位置到目标位置的一条最短路径。
在这里插入代码片
% Driver script: builds the grid world, learns the Q table and plots the
% greedy route from the start cell to the goal cell.
% clear
% clc
n=18;            % number of rows
m=20;            % number of columns
start=[2,3];     % start cell as [row, column]
goal=[18,20];    % goal cell as [row, column]
r=zeros(n,m);    % reward received when a cell is entered
r(goal(1),goal(2))=100;   % reward for reaching the goal
gamma=0.9;       % discount factor
% trap lists the [row, column] of every trap / pillar cell, one per row.
% A newline inside [] is a row separator in MATLAB, so every physical line
% must end on a complete row and be continued with "...".
trap=[2 2;5 6;3 3;4 2;1 4;6 2;6 9;7 8;9 6;10 10; ...
      10 4;9 3;4 10;3 9;8 4;8 2;2 6;3 7;11 8;7 11; ...
      8 9;6 4;7 6;1 11;11 1;4 4;13 14;13 15;2 4;4 8];
% trap=[2 2;2 4];
% trap=[];
stop=8000;       % number of episodes (only used by check2)
% q=check2(trap ,goal,r,gamma,stop);   % learn q from random start cells
q=check2_2(trap ,goal,r,gamma);        % learn q by sweeping over all cells
goplot2(start,goal,q,trap,n,m);        % plot the greedy route start -> goal
% seek(2,3,n,m,q);
%******************************************************
function [q]=check2(trap,goal,r,gamma,stop)
%CHECK2 Learn a Q table for a grid world from randomly started episodes.
%   q = check2(trap,goal,r,gamma,stop)
%
%   Inputs
%     trap  - K-by-2 list of [row col] cells that may not be entered
%             (may be empty).
%     goal  - [row col] of the terminal cell.
%     r     - n-by-m matrix; r(x,y) is the reward for entering cell (x,y).
%     gamma - discount factor.
%     stop  - number of episodes to run.
%
%   Output
%     q     - (n*m)-by-4 table. Row (x-1)*m+y belongs to cell (x,y).
%             Column 1 = row+1 (down), 2 = row-1 (up),
%             column 3 = col-1 (left), 4 = col+1 (right).
%
%   Moves are deterministic here (no action noise is modelled). Each
%   episode starts in a uniformly random cell and follows the greedy
%   action, ties broken at random, until it reaches the goal, starts on a
%   trap, or is in a cell with no admissible move. The elapsed time is
%   printed through tic/toc.
[n,m]=size(r);
numStates=n*m;
q=zeros(numStates,4);

% status: 1 marks the goal, -1 marks cells that cannot be entered.
status=zeros(n,m);
for i=1:size(trap,1)
    status(trap(i,1),trap(i,2))=-1;
end
status(goal(1),goal(2))=1;

% Row / column / linear-index offsets of the four actions (q column order).
dx=[1 -1 0 0];
dy=[0 0 -1 1];
dk=[m -m -1 1];

tic
for episode=1:stop
    % randi gives every state the same probability; round(rand*siz)+1
    % would pick the first and last state only half as often.
    k=randi(numStates);
    x1=ceil(k/m);
    y1=k-(x1-1)*m;
    while status(x1,y1)~=1&&status(x1,y1)~=-1
        feasible=false(1,4);
        for a=1:4
            nx=x1+dx(a);
            ny=y1+dy(a);
            if nx>=1&&nx<=n&&ny>=1&&ny<=m&&status(nx,ny)~=-1
                % Deterministic backup: reward of the entered cell plus
                % the discounted best value of that cell.
                q(k,a)=r(nx,ny)+gamma*max(q(k+dk(a),1:4));
                feasible(a)=true;
            end
        end
        if ~any(feasible)
            % Walled-in cell: nothing to learn and nowhere to go. Without
            % this guard the tie-break below could never succeed.
            break;
        end
        % Greedy choice among admissible actions only, ties broken
        % uniformly at random. Inadmissible columns keep their initial 0
        % and must not take part in the maximum.
        best=max(q(k,feasible));
        candidates=find(feasible&q(k,1:4)==best);
        a=candidates(randi(numel(candidates)));
        x1=x1+dx(a);
        y1=y1+dy(a);
        k=k+dk(a);
    end
end
toc
end
%********************************************************************
function [path]=goplot2(start,goal,q,trap,n,m)
%GOPLOT2 Follow the greedy policy of a Q table and draw the route.
%   path = goplot2(start,goal,q,trap,n,m)
%
%   Inputs
%     start - [row col] of the first cell.
%     goal  - [row col] of the target cell.
%     q     - (n*m)-by-4 table. Row (x-1)*m+y belongs to cell (x,y).
%             Column 1 = row+1, 2 = row-1, 3 = col-1, 4 = col+1.
%     trap  - K-by-2 list of [row col] cells drawn as obstacles
%             (may be empty).
%     n, m  - number of grid rows and columns.
%
%   Output
%     path  - P-by-2 list of visited [row col] cells, starting with start.
%
%   Ties between equally valued actions are broken at random, so repeated
%   calls with the same q may return different routes. The walk is limited
%   to moves that stay on the grid and to n*m steps; if the goal is not
%   reached within that budget a warning is issued and the partial route
%   is drawn. The result is drawn into figure 1.
s=start;                          % keep the original start for drawing
k=(start(1)-1)*m+start(2);        % linear (row-major) index of the cell
path=start;

% Row / column / linear-index offsets of the four actions (q column order).
dx=[1 -1 0 0];
dy=[0 0 -1 1];
dk=[m -m -1 1];

maxSteps=n*m;   % a route that never revisits a cell cannot be longer
steps=0;
while start(1)~=goal(1)||start(2)~=goal(2)
    if steps>=maxSteps
        warning('goplot2:noPath', ...
            'Goal not reached within %d steps; q may not be converged.',maxSteps);
        break;
    end
    nx=start(1)+dx;
    ny=start(2)+dy;
    inGrid=nx>=1&nx<=n&ny>=1&ny<=m;
    if ~any(inGrid)
        break;                    % 1-by-1 grid: no move exists
    end
    % Best action among moves that stay on the grid, random tie-break.
    best=max(q(k,inGrid));
    candidates=find(inGrid&q(k,1:4)==best);
    index=candidates(randi(numel(candidates)));
    start(1)=nx(index);
    start(2)=ny(index);
    k=k+dk(index);
    path=[path;start]; %#ok<AGROW>
    steps=steps+1;
end

% Number of visited cells. length(path) would return 2 for a single
% 1-by-2 row and index past the end of path below.
leng=size(path,1);

fig11=figure(1);
set(fig11,'Position',[550,50,650,550]);
clf   % redraw from scratch
hold on;

% Background cells; x is the column and y is the row.
for i=1:m
    for j=1:n
        rectangle('Position',[i-0.5,j-0.5,1,1],'facecolor','c');
    end
end
rectangle('Position',[goal(2)-0.5,goal(1)-0.5,1,1],'facecolor','g');
rectangle('Position',[s(2)-0.5,s(1)-0.5,1,1],'facecolor','m');

% Markers on every visited cell except the last one (drawn further down).
for i=1:leng-1
    plot(path(i,2),path(i,1),'*');
end

% Row and column numbers along the axes.
for i=1:n
    a=num2str(i);
    text(-0.2,i,a);
end
for i=1:m
    a=num2str(i);
    text(i,-0.1,a);
end
axis([0.5 ,m+0.5,0.5,n+0.5]);
set(gca,'ytick',[]);
set(gca,'xtick',[]);

% Route as a red polyline, then the blue grid lines.
plot(path(:,2),path(:,1),'r');
for i=1:m
    plot([i-0.5,i-0.5],[0,n+0.5],'b');
end
for i=1:n
    plot([0,m+0.5],[i-0.5,i-0.5],'b');
end
plot(path(leng,2),path(leng,1),'*');
text(-1.2,5.5,'行坐标');
text(5.3,-0.7,'列坐标');
text(goal(2)-0.5,goal(1)+0.3,'goal');
text(s(2)-0.3,s(1)-0.2,'start');

% Obstacles are drawn last so they cover the background cells.
for i=1:size(trap,1)
    rectangle('Position',[trap(i,2)-0.5,trap(i,1)-0.5,1,1],'facecolor',[0.4,0.45,0.5]);
    text(trap(i,2)-0.2,trap(i,1),'T');
end
end
下面是对 check2 改进后的版本 check2_2:
function [q]=check2_2(trap,goal,r,gamma,sweeps)
%CHECK2_2 Learn a Q table for a grid world by sweeping over every cell.
%   q = check2_2(trap,goal,r,gamma)
%   q = check2_2(trap,goal,r,gamma,sweeps)
%
%   Inputs
%     trap   - K-by-2 list of [row col] cells that may not be entered
%              (may be empty).
%     goal   - [row col] of the terminal cell.
%     r      - n-by-m matrix; r(x,y) is the reward for entering cell (x,y).
%     gamma  - discount factor.
%     sweeps - optional number of passes over all cells (default 10).
%
%   Output
%     q      - (n*m)-by-4 table. Row (x-1)*m+y belongs to cell (x,y).
%              Column 1 = row+1 (down), 2 = row-1 (up),
%              column 3 = col-1 (left), 4 = col+1 (right).
%
%   Moves are deterministic here (no action noise is modelled). In every
%   sweep each cell in turn is used as the start of one episode that
%   follows the greedy action, ties broken at random, until it reaches the
%   goal, starts on a trap, or is in a cell with no admissible move. The
%   elapsed time is printed through tic/toc.
if nargin<5||isempty(sweeps)
    sweeps=10;
end
[n,m]=size(r);
numStates=n*m;
q=zeros(numStates,4);

% status: 1 marks the goal, -1 marks cells that cannot be entered.
status=zeros(n,m);
for i=1:size(trap,1)
    status(trap(i,1),trap(i,2))=-1;
end
status(goal(1),goal(2))=1;

% Row / column / linear-index offsets of the four actions (q column order).
dx=[1 -1 0 0];
dy=[0 0 -1 1];
dk=[m -m -1 1];

tic
for sweep=1:sweeps
    for first=1:numStates
        k=first;
        x1=ceil(k/m);
        y1=k-(x1-1)*m;
        while status(x1,y1)~=1&&status(x1,y1)~=-1
            feasible=false(1,4);
            for a=1:4
                nx=x1+dx(a);
                ny=y1+dy(a);
                if nx>=1&&nx<=n&&ny>=1&&ny<=m&&status(nx,ny)~=-1
                    % Deterministic backup: reward of the entered cell
                    % plus the discounted best value of that cell.
                    q(k,a)=r(nx,ny)+gamma*max(q(k+dk(a),1:4));
                    feasible(a)=true;
                end
            end
            if ~any(feasible)
                % Walled-in cell: nothing to learn and nowhere to go.
                % Without this guard the tie-break could never succeed.
                break;
            end
            % Greedy choice among admissible actions only, ties broken
            % uniformly at random. Inadmissible columns keep their
            % initial 0 and must not take part in the maximum.
            best=max(q(k,feasible));
            candidates=find(feasible&q(k,1:4)==best);
            a=candidates(randi(numel(candidates)));
            x1=x1+dx(a);
            y1=y1+dy(a);
            k=k+dk(a);
        end
    end
end
toc
end