智能体路径规划

本文通过C++和Python实现Q-learning算法,探讨智能体如何在马尔可夫决策过程中进行路径规划。通过模拟环境,智能体在探索与利用过程中逐渐学习到最优路径,最终达到目标房间。代码实现包括探索过程的记录和利用过程的可视化。
摘要由CSDN通过智能技术生成







C++实现:

代码如下:


#include <iostream>
#include <time.h>
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <iterator>
#include <fstream>


using namespace std;


ofstream out_explore("d:\\explore.txt");
ofstream out_use("d:\\use.txt");


void access(int state, int q[6][6], int r[6][6])
{
vector<int> q_action;
q_action.push_back(state+1);
out_use<<"Start from : "<<state+1<<endl;  //
while(state != 5)
{
int act = 0;
int max_state = q[state][0];
for(int action = 1; action < 6; ++action)
{
if(q[state][action] > max_state)
{
max_state = q[state][action];
act = action;
}
}
q_action.push_back(act+1);
state = act;
out_use<<"The most reward is from action "<<act+1 <<" ,and it is "<<max_state<<"."<<endl;
}
if(state == 5)
        out_use<<"Reach the goal room!"<<endl;
copy(q_action.begin(), q_action.end(), ostream_iterator<int>(cout,"  ->  "));
cout<<"ok"<<endl;
}




int special(int in_state, float gamma, int q[6][6], int r[6][6])
{
    vector<int> r_action;
    for(int action=0; action<6; ++action)
    {
        if(r[in_state][action] >=0 )
r_action.push_back(action);
    }
    int m = rand() % (r_action.size());
    int next_state = r_action[m];
    int max = q[next_state][0];
    for(int i = 1; i < 6; ++i)
    {
        if(max < q[next_state][i])
            max = q[next_state][i];
    }
    q[in_state][next_state] = r[in_state][next_state] + gamma * max;
    return next_state;
}




void qlearning(float gamma, int r[6][6], int q[6][6])
{
//获得随机进入的门
int in_state = (rand() % 5);  //产生0-5的随机数
out_explore<<"★ "<<in_state+1;
while (in_state != 5)
{
in_state = special(in_state, gamma, q, r);
out_explore<<"->"<<in_state+1;
if(in_state == 5)
        {
            special(in_state, gamma, q, r);
        }
}
out_explore<<endl;
}




int main()
{
int q[6][6]= {0};
int r[6][6]=
{
{-1, 0, 0, 0,-1, -1},
{0, -1, -1, 0, -1, -1},
{0, -1, -1, -1, -1, -1},
{0, 0, -1, -1, 0, 100},
{-1, -1, -1, 0, -1, 100},
{-1, -1, -1, 0, 0, 100}
};


for(int i=0; i<100; i++)           //执行多次qleaning函数  , 更新 q 数组
qlearning(0.8, r, q);


int max=0;

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值