Q例子参考自: http://blog.csdn.net/pi9nc/article/details/27649323
Q-learning 选择策略:
1、 E-greedy
// E-greedy action selection: with probability epsilon explore (pick a uniformly
// random action); otherwise exploit (pick the action with the highest Q-value,
// breaking ties uniformly at random). Re-samples until the chosen action is
// valid for the given state.
//
// @param state current state index into st.qValues
// @return a valid action index in [0, st.actionLength)
int getNextActionE_Greedy(int state)
{
int selectAction = -1;
// All actions currently tied for the maximum Q-value, for random tie-breaking.
int[] tiedActions = new int[st.actionLength];
int tieCount = 0; // index of the last occupied slot in tiedActions
double maxQ = Double.NEGATIVE_INFINITY; // safe sentinel instead of a magic -50000
// With probability epsilon leave selectAction == -1 so the fallback below
// draws a uniformly random action (exploration).
if(proRandomNumber.nextDouble() >= st.epsilon)
{
// Exploitation: scan all actions for the greatest Q-value.
for(int action = 0;action < st.actionLength;action ++)
{
if(st.qValues[state][action] > maxQ)
{
// New strict maximum: restart the tie list with this action.
selectAction = action;
maxQ = st.qValues[state][action];
tieCount = 0;
tiedActions[tieCount] = action;
}
else if(st.qValues[state][action] == maxQ)
{
// Ties the current maximum: remember it for random tie-breaking.
tieCount++;
tiedActions[tieCount] = action;
}
}
if( tieCount > 0 )
{
// Several actions share the maximum Q-value: pick one uniformly.
selectAction = tiedActions[proRandomNumber.nextInt(tieCount + 1)];
}
}
// Exploration branch (or no action selected): draw uniformly at random.
if(selectAction == -1)
{
selectAction = proRandomNumber.nextInt(st.actionLength);
}
// Re-sample until the action is actually reachable from this state.
while(!validAction(state,selectAction))
{
selectAction = proRandomNumber.nextInt(st.actionLength);
}
return selectAction;
}
2、softMax
int getNextActionE_SoftMax(int state)
{
<span style="white-space:pre"> </span>int softmaxDiv = 1;
<span style="white-space:pre"> </span>int selectAction = -1;
<span style="white-space:pre"> </span>double[] prob = new double[st.actionLength];
<span style="white-space:pre"> </span>double sumProb = 0;
<span style="white-space:pre"> </span>int action_softmax;
<span style="white-space:pre"> </span>for(action_softmax = 0 ; action_softmax < st.actionLength ; action_softmax ++ )
<span style="white-space:pre"> </span>{
<span style="white-space:pre"> </span>if(validAction(state, action_softmax) == false){
<span style="white-space:pre"> </span>prob[action_softmax] = 0;
<span style="white-space:pre"> </span>continue;
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>else{
<span style="white-space:pre"> </span> <span style="white-space:pre"> </span>double temp =1.0 * (st.qValues[state][action_softmax] / softmaxDiv);
<span style="white-space:pre"> </span> <span style="white-space:pre"> </span>prob[action_softmax] = Math.exp(temp);
<span style="white-space:pre"> </span> <span style="white-space:pre"> </span>sumProb += prob[action_softmax];
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>for(action_softmax = 0 ; action_softmax < st.actionLength ; action_softmax ++ ){
<span style="white-space:pre"> </span>if(validAction(state, action_softmax) == false){
<span style="white-space:pre"> </span>continue;
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span> prob[action_softmax] = prob[action_softmax] / sumProb;
<span style="white-space:pre"> </span>}
boolean valid = false;
<span style="white-space:pre"> </span>while(valid == false)
<span style="white-space:pre"> </span>{
<span style="white-space:pre"> </span>double rndValue;
<span style="white-space:pre"> </span>double offset;
<span style="white-space:pre"> </span>rndValue =proRandomNumber.nextDouble();
<span style="white-space:pre"> </span>offset = 0;
<span style="white-space:pre"> </span>for(action_softmax = 0 ; action_softmax < st.actionLength ; action_softmax ++ )
<span style="white-space:pre"> </span>{
<span style="white-space:pre"> </span>//好好理解下
<span style="white-space:pre"> </span>if(validAction(state, action_softmax) == false){
<span style="white-space:pre"> </span>continue;
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>if( rndValue > offset && rndValue < (offset + prob[action_softmax]))
<span style="white-space:pre"> </span>selectAction = action_softmax;
<span style="white-space:pre"> </span>offset += prob[action_softmax];
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>//监测行为是否能够到达
<span style="white-space:pre"> </span>if(this.validAction(state, selectAction))
<span style="white-space:pre"> </span>{
<span style="white-space:pre"> </span>valid = true;
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>break;
<span style="white-space:pre"> </span>}
<span style="white-space:pre"> </span>return selectAction;
}
验证行为(action)在给定状态下的合法性
// Returns true when the action index is in range and the transition from
// `state` via `action` is allowed (non-negative entry in the reward matrix;
// negative reward marks an impossible move in this example).
//
// Fixes over the original: the reward matrix is only indexed after the bounds
// check passes (the original still evaluated st.reward[state][action] for an
// out-of-range action, risking ArrayIndexOutOfBoundsException), and the
// hard-coded upper bound 5 is generalized to st.actionLength.
boolean validAction(int state,int action)
{
if(action < 0 || action >= st.actionLength)
{
return false;
}
return st.reward[state][action] >= 0;
}
进行训练
// Runs one Q-learning episode: start in a random state, repeatedly pick an
// action by softmax exploration and apply the Q-learning update, until the
// goal state (index 5 in this classic 6-room example) is reached.
//
// Fixes over the original: the update now follows the standard rule
//   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
// (the original added `reward` in place of Q(s,a), skewing every update);
// the unused `count` local and commented-out dead code are removed.
//
// @param time unused; kept so existing callers keep compiling — TODO confirm
//             whether it was meant to bound the training duration
void qLearningTrain(long time)
{
final int goalState = 5; // terminal state of this example
int state = proRandomNumber.nextInt(st.actionLength);
do
{
int nextAction = getNextActionE_SoftMax(state);
double reward = st.getReward(state, nextAction);
double this_Q = st.getqValues(state, nextAction);
// In this example the successor state equals the chosen action.
double max_Q = st.getMaxQValues(nextAction);
double new_Q = this_Q + (st.alpha) * (reward + (st.gamma) * max_Q - this_Q);
st.setqValues(state, nextAction, new_Q);
state = nextAction;
}while(state != goalState);
}
代码参考 :http://pan.baidu.com/s/1c1VTuyG