C++实现:
代码如下:
#include <iostream>
#include <time.h>
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <iterator>
#include <fstream>
using namespace std;
ofstream out_explore("d:\\explore.txt");
ofstream out_use("d:\\use.txt");
void access(int state, int q[6][6], int r[6][6])
{
vector<int> q_action;
q_action.push_back(state+1);
out_use<<"Start from : "<<state+1<<endl; //
while(state != 5)
{
int act = 0;
int max_state = q[state][0];
for(int action = 1; action < 6; ++action)
{
if(q[state][action] > max_state)
{
max_state = q[state][action];
act = action;
}
}
q_action.push_back(act+1);
state = act;
out_use<<"The most reward is from action "<<act+1 <<" ,and it is "<<max_state<<"."<<endl;
}
if(state == 5)
out_use<<"Reach the goal room!"<<endl;
copy(q_action.begin(), q_action.end(), ostream_iterator<int>(cout," -> "));
cout<<"ok"<<endl;
}
int special(int in_state, float gamma, int q[6][6], int r[6][6])
{
vector<int> r_action;
for(int action=0; action<6; ++action)
{
if(r[in_state][action] >=0 )
r_action.push_back(action);
}
int m = rand() % (r_action.size());
int next_state = r_action[m];
int max = q[next_state][0];
for(int i = 1; i < 6; ++i)
{
if(max < q[next_state][i])
max = q[next_state][i];
}
q[in_state][next_state] = r[in_state][next_state] + gamma * max;
return next_state;
}
void qlearning(float gamma, int r[6][6], int q[6][6])
{
//获得随机进入的门
int in_state = (rand() % 5); //产生0-5的随机数
out_explore<<"★ "<<in_state+1;
while (in_state != 5)
{
in_state = special(in_state, gamma, q, r);
out_explore<<"->"<<in_state+1;
if(in_state == 5)
{
special(in_state, gamma, q, r);
}
}
out_explore<<endl;
}
int main()
{
int q[6][6]= {0};
int r[6][6]=
{
{-1, 0, 0, 0,-1, -1},
{0, -1, -1, 0, -1, -1},
{0, -1, -1, -1, -1, -1},
{0, 0, -1, -1, 0, 100},
{-1, -1, -1, 0, -1, 100},
{-1, -1, -1, 0, 0, 100}
};
for(int i=0; i<100; i++) //执行多次qleaning函数 , 更新 q 数组
qlearning(0.8, r, q);
int max=0;