Alongside my regular studies, here is a quick note on the reinforcement learning course I took at school. I found it quite interesting; you get to train a small model yourself.
In MATLAB you can use the Help tool to search for whatever algorithm you need. For DQN, just search "DQN" directly.
Get comfortable with it: at my school both the reinforcement learning and machine learning courses were taught in MATLAB.
This post only covers MATLAB, so get familiar with the code in the Help docs, haha. I won't go into anything else.
That said, for reinforcement learning I'd recommend PyTorch, and for machine learning I'd suggest working in PyCharm.
I'll use the web version as the example. Step one: click the question mark in Figure 1; the other versions are much the same.
Step two: search for DQN directly in the Help Center, or from the command line as shown below.
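If you prefer to stay in the Command Window, the same pages are reachable from there. A small sketch (assuming the Reinforcement Learning Toolbox is installed):

docsearch DQN    % open the Help browser with search results for "DQN"
doc rlDQNAgent   % jump straight to the rlDQNAgent reference page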
1.DQN
Just look this one up in the Help docs. If I remember correctly, the minimum MATLAB version that supports this reinforcement learning workflow is R2022b, or maybe R2020b; I can't recall exactly, but I think it's R2022b, and anything older isn't supported.
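The Help example is long, so I won't paste it all here. As a minimal sketch of my own (not the exact Help code): the toolbox can build a default DQN agent straight from the environment specs, which is the quickest way to get something training.

clear
clc
env = rlPredefinedEnv("CartPole-Discrete");
obsInfo = getObservationInfo(env);
actInfo = getActionInfo(env);
% rlDQNAgent can build a default critic network from the specs alone
agent = rlDQNAgent(obsInfo,actInfo);
trainOpts = rlTrainingOptions(...
    'MaxEpisodes',1000, ...
    'MaxStepsPerEpisode',500, ...
    'StopTrainingCriteria','AverageReward', ...
    'StopTrainingValue',480);
trainingStats = train(agent,env,trainOpts);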
2.PG(1) Discrete
clear
clc
% Predefined cart-pole environment with a discrete action space
env = rlPredefinedEnv("CartPole-Discrete");
obsInfo = getObservationInfo(env);
numObservations = obsInfo.Dimension(1);
actInfo = getActionInfo(env);
% Actor network: maps the 4-D observation to probabilities of the 2 discrete actions
actorNetwork = [
    featureInputLayer(numObservations,'Normalization','none','Name','state')
    fullyConnectedLayer(2,'Name','fc')
    softmaxLayer('Name','actionProb')
    ];
actorOpts = rlRepresentationOptions('LearnRate',1e-2,'GradientThreshold',1);
actor = rlStochasticActorRepresentation(actorNetwork,obsInfo,actInfo,'Observation',{'state'},actorOpts);
agent = rlPGAgent(actor);
% Stop once the 100-episode average reward reaches 195
trainOpts = rlTrainingOptions(...
    'MaxEpisodes', 1000, ...
    'MaxStepsPerEpisode', 200, ...
    'Verbose', false, ...
    'Plots','training-progress',...
    'StopTrainingCriteria','AverageReward',...
    'StopTrainingValue',195,...
    'ScoreAveragingWindowLength',100);
trainingStats = train(agent,env,trainOpts);
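Once training stops, it's worth watching the agent actually balance the pole. A short usage sketch of my own (not part of the Help example):

% Simulate the trained agent for one episode and total up the reward
simOpts = rlSimulationOptions('MaxSteps',200);
experience = sim(env,agent,simOpts);
totalReward = sum(experience.Reward.Data)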
3.PG(2) Continuous
clear
clc
% Predefined cart-pole environment with a continuous action space
env = rlPredefinedEnv("CartPole-Continuous");
obsInfo = getObservationInfo(env);
numObservations = obsInfo.Dimension(1);
actInfo = getActionInfo(env);
% Input path layers
inPath = [
    featureInputLayer(prod(obsInfo.Dimension), Name="netOin")
    fullyConnectedLayer(prod(actInfo.Dimension), Name="infc")
    ];
% Path layers for mean value
% Using scalingLayer to scale range from (-1,1) to (-10,10)
meanPath = [
    tanhLayer(Name="tanhMean");
    fullyConnectedLayer(prod(actInfo.Dimension));
    scalingLayer(Name="scale", ...
        Scale=actInfo.UpperLimit)
    ];
% Path layers for standard deviations
% Using softplus layer to make them non-negative
sdevPath = [
    tanhLayer(Name="tanhStdv");
    fullyConnectedLayer(prod(actInfo.Dimension));
    softplusLayer(Name="splus")
    ];
% Add layers to network object
net = layerGraph(inPath);
net = addLayers(net,meanPath);
net = addLayers(net,sdevPath);
% Connect layers
net = connectLayers(net,"infc","tanhMean/in");
net = connectLayers(net,"infc","tanhStdv/in");
net = dlnetwork(net);
% Gaussian actor: one output head for the action mean, one for its standard deviation
actor = rlContinuousGaussianActor(net, obsInfo, actInfo, ...
    ActionMeanOutputNames="scale",...
    ActionStandardDeviationOutputNames="splus",...
    ObservationInputNames="netOin");
agent = rlPGAgent(actor);
trainOpts = rlTrainingOptions(...
    'MaxEpisodes', 100000, ...
    'MaxStepsPerEpisode', 200, ...
    'Verbose', false, ...
    'Plots','training-progress',...
    'StopTrainingCriteria','AverageReward',...
    'StopTrainingValue',195,...
    'ScoreAveragingWindowLength',100);
plot(env)
trainingStats = train(agent,env,trainOpts);
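The continuous version trains much more slowly, which is why MaxEpisodes is set so high. To sanity-check the trained Gaussian actor afterwards, you can feed it an observation by hand; a small sketch of my own (a random vector just to check the shapes, not a meaningful state):

% Sample an action from the trained stochastic policy
obs = rand(obsInfo.Dimension);
action = getAction(agent,{obs});
action{1}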
4.DDPG
clear
clc
% Same continuous cart-pole environment; DDPG uses an actor-critic pair
env = rlPredefinedEnv('CartPole-Continuous');
obsInfo = getObservationInfo(env);
numObservations = obsInfo.Dimension(1);
actInfo = getActionInfo(env);
% Critic: observation path ...
statePath = [
    featureInputLayer(numObservations,'Normalization','none','Name','observation')
    fullyConnectedLayer(128,'Name','CriticStateFC1')
    reluLayer('Name','CriticRelu1')
    fullyConnectedLayer(200,'Name','CriticStateFC2')];
% ... and action path, merged by an addition layer into a single Q-value
actionPath = [
    featureInputLayer(1,'Normalization','none','Name','action')
    fullyConnectedLayer(200,'Name','CriticActionFC1','BiasLearnRateFactor',0)];
commonPath = [
    additionLayer(2,'Name','add')
    reluLayer('Name','CriticCommonRelu')
    fullyConnectedLayer(1,'Name','CriticOutput')];
criticNetwork = layerGraph(statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
criticOptions = rlRepresentationOptions('LearnRate',1e-03,'GradientThreshold',1);
critic = rlQValueRepresentation(criticNetwork,obsInfo,actInfo,...
    'Observation',{'observation'},'Action',{'action'},criticOptions);
% Actor: deterministic policy, tanh output scaled to the action range
actorNetwork = [
    featureInputLayer(numObservations,'Normalization','none','Name','observation')
    fullyConnectedLayer(128,'Name','ActorFC1')
    reluLayer('Name','ActorRelu1')
    fullyConnectedLayer(200,'Name','ActorFC2')
    reluLayer('Name','ActorRelu2')
    fullyConnectedLayer(1,'Name','ActorFC3')
    tanhLayer('Name','ActorTanh1')
    scalingLayer('Name','ActorScaling','Scale',max(actInfo.UpperLimit))];
actorOptions = rlRepresentationOptions('LearnRate',5e-04,'GradientThreshold',1);
actor = rlDeterministicActorRepresentation(actorNetwork,obsInfo,actInfo,...
    'Observation',{'observation'},'Action',{'ActorScaling'},actorOptions);
agentOptions = rlDDPGAgentOptions(...
    'TargetSmoothFactor',1e-3,...
    'ExperienceBufferLength',1e6,...
    'MiniBatchSize',128);
% Exploration noise: start fairly noisy and decay slowly
agentOptions.NoiseOptions.Variance = 0.4;
agentOptions.NoiseOptions.VarianceDecayRate = 1e-5;
agent = rlDDPGAgent(actor,critic,agentOptions);
maxepisodes = 2000;
maxsteps = 1000;
trainingOptions = rlTrainingOptions(...
    'MaxEpisodes',maxepisodes,...
    'MaxStepsPerEpisode',maxsteps,...
    'ScoreAveragingWindowLength',5,...
    'Verbose',false,...
    'Plots','training-progress',...
    'StopTrainingCriteria','AverageReward',...
    'StopTrainingValue',-1,...
    'SaveAgentCriteria','EpisodeReward',...
    'SaveAgentValue',-10);
trainingStats = train(agent,env,trainingOptions);
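DDPG training can take a while, so I'd save the result rather than retrain every session. A small sketch of my own (the file name is just an example):

% Save the trained agent, then reload it later and run one episode
save('trainedDDPGCartPole.mat','agent')
load('trainedDDPGCartPole.mat','agent')
simOpts = rlSimulationOptions('MaxSteps',1000);
experience = sim(env,agent,simOpts);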
A bit of rambling to close: my understanding isn't deep, but I think this is a really fun process. Try changing the numbers in the middle and see for yourself; you'll get different results. I can't find my old lab report at the moment; once I dig it up I'll come back and add screenshots.