Q-learning in Java (with action-selection strategies)

The Q-learning example is adapted from: http://blog.csdn.net/pi9nc/article/details/27649323
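For orientation, the training routine at the end of the post performs the standard one-step Q-learning update, where α is the learning rate, γ the discount factor, r the immediate reward, and s′ the next state:

    Q(s, a) ← Q(s, a) + α · ( r + γ · max_a′ Q(s′, a′) − Q(s, a) )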

Q-learning action-selection strategies:
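Both strategies below reference a field `st` (the Q-table plus environment data) and a shared `proRandomNumber`, neither of which is shown in the post. A minimal sketch of what they are assumed to look like, with field and method names inferred from usage and hypothetical default values:

import java.util.Random;

// Sketch of the container the methods below assume; not from the original post.
class St {
    int actionLength = 6;     // classic 6-room example: rooms/actions 0..5
    double epsilon = 0.1;     // exploration rate for ε-greedy (hypothetical value)
    double alpha = 0.7;       // learning rate (hypothetical value)
    double gamma = 0.8;       // discount factor (hypothetical value)
    double[][] qValues = new double[actionLength][actionLength]; // Q(state, action)
    int[][] reward = new int[actionLength][actionLength];        // negative = move unavailable

    double getReward(int state, int action)  { return reward[state][action]; }
    double getqValues(int state, int action) { return qValues[state][action]; }
    void setqValues(int state, int action, double q) { qValues[state][action] = q; }

    // best Q-value reachable from a state over the available moves
    // (0 is a safe floor here because rewards, and hence Q-values, are nonnegative)
    double getMaxQValues(int state) {
        double max = 0;
        for (int a = 0; a < actionLength; a++)
            if (reward[state][a] >= 0 && qValues[state][a] > max)
                max = qValues[state][a];
        return max;
    }
}

St st = new St();
Random proRandomNumber = new Random();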

1. ε-greedy

int getNextActionE_Greedy(int state)
{
    int selectAction = -1;
    double maxQ = -50000;
    // holds every action currently tied for the maximum Q-value
    int[] doubleValues = new int[st.actionLength];
    int maxDV = 0;

    // with probability epsilon, explore: leave selectAction at -1 so a
    // uniformly random action is drawn below
    double selectPossibility = proRandomNumber.nextDouble();
    if (selectPossibility >= st.epsilon)
    {
        // otherwise exploit: scan for the highest Q-value, recording ties
        // so one of them can be picked uniformly at random
        for (int action = 0; action < st.actionLength; action++)
        {
            if (st.qValues[state][action] > maxQ)
            {
                selectAction = action;
                maxQ = st.qValues[state][action];
                maxDV = 0;
                doubleValues[maxDV] = selectAction;
            }
            else if (st.qValues[state][action] == maxQ)
            {
                maxDV++;
                doubleValues[maxDV] = action;
            }
        }
        if (maxDV > 0)
        {
            int randomIndex = proRandomNumber.nextInt(maxDV + 1);
            selectAction = doubleValues[randomIndex];
        }
    }

    // exploration branch (or nothing chosen): draw uniformly
    if (selectAction == -1)
        selectAction = proRandomNumber.nextInt(st.actionLength);

    // re-draw until the action is actually available from this state
    while (!validAction(state, selectAction))
        selectAction = proRandomNumber.nextInt(st.actionLength);

    return selectAction;
}
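Not in the original post, but a common refinement: decay ε between episodes so the agent explores heavily at first and mostly exploits later. A one-line sketch using the assumed `st.epsilon` field:

// hypothetical decay schedule: shrink epsilon by 0.5% per episode,
// but never let it fall below 0.01
st.epsilon = Math.max(0.01, st.epsilon * 0.995);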

2. Softmax

int getNextActionE_SoftMax(int state)
{
    int softmaxDiv = 1;   // temperature: larger values flatten the distribution
    int selectAction = -1;
    double[] prob = new double[st.actionLength];
    double sumProb = 0;
    int action_softmax;

    // Boltzmann weights: exp(Q / temperature) for every valid action
    for (action_softmax = 0; action_softmax < st.actionLength; action_softmax++)
    {
        if (!validAction(state, action_softmax))
        {
            prob[action_softmax] = 0;
            continue;
        }
        double temp = st.qValues[state][action_softmax] / softmaxDiv;
        prob[action_softmax] = Math.exp(temp);
        sumProb += prob[action_softmax];
    }

    // normalize so the valid actions' probabilities sum to 1
    for (action_softmax = 0; action_softmax < st.actionLength; action_softmax++)
    {
        if (!validAction(state, action_softmax))
            continue;
        prob[action_softmax] = prob[action_softmax] / sumProb;
    }

    // roulette-wheel selection: draw a uniform number and walk the
    // cumulative distribution until the interval containing it is found
    boolean valid = false;
    while (!valid)
    {
        double rndValue = proRandomNumber.nextDouble();
        double offset = 0;
        for (action_softmax = 0; action_softmax < st.actionLength; action_softmax++)
        {
            if (!validAction(state, action_softmax))
                continue;
            if (rndValue >= offset && rndValue < offset + prob[action_softmax])
                selectAction = action_softmax;
            offset += prob[action_softmax];
        }
        // retry if rounding left no action selected or the pick is invalid
        if (selectAction != -1 && validAction(state, selectAction))
            valid = true;
    }
    return selectAction;
}
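One caveat the post does not address: `Math.exp` overflows to infinity once Q-values grow large, which breaks the normalization. A standard fix, sketched here as an assumed drop-in for the first loop above, is to subtract the largest valid Q-value before exponentiating; the normalized probabilities are mathematically unchanged:

// find the largest Q-value among the valid actions
double topQ = Double.NEGATIVE_INFINITY;
for (int a = 0; a < st.actionLength; a++)
    if (validAction(state, a))
        topQ = Math.max(topQ, st.qValues[state][a]);

// shifted Boltzmann weights: exp((Q - topQ) / temperature) is at most 1
for (int a = 0; a < st.actionLength; a++) {
    if (!validAction(state, a)) { prob[a] = 0; continue; }
    prob[a] = Math.exp((st.qValues[state][a] - topQ) / softmaxDiv);
    sumProb += prob[a];
}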

Checking that an action is valid

boolean validAction(int state, int action)
{
    // actions outside 0..5 (the six rooms of the example) are invalid;
    // returning early also keeps the array access below in bounds
    if (action < 0 || action > 5)
        return false;
    // a negative reward marks a move that is not available from this state
    if (st.reward[state][action] < 0)
        return false;
    return true;
}
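The hard-coded bound `action > 5` and the goal test `initState != 5` in the training loop both point at the classic six-room example from the referenced post. For concreteness, its reward matrix as usually given (assumed here, not shown in the original; -1 means no door, 0 a door, 100 a door into the goal room 5):

// rows = current room, columns = target room
int[][] reward = {
    { -1, -1, -1, -1,  0,  -1 },   // room 0 connects only to room 4
    { -1, -1, -1,  0, -1, 100 },   // room 1 connects to 3 and to the goal 5
    { -1, -1, -1,  0, -1,  -1 },   // room 2 connects only to room 3
    { -1,  0,  0, -1,  0,  -1 },   // room 3 connects to 1, 2 and 4
    {  0, -1, -1,  0, -1, 100 },   // room 4 connects to 0, 3 and the goal 5
    { -1,  0, -1, -1,  0, 100 },   // goal room 5: staying also pays 100
};
st.reward = reward;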

Training

void qLearningTrain(long time)
{
    // start each episode in a random room; in this example the number of
    // states equals the number of actions
    int initState = proRandomNumber.nextInt(st.actionLength);
    do
    {
        double this_Q;
        double max_Q;
        double new_Q;
        double reward;

        // pick the next action with the softmax policy
        // (getNextActionE_Greedy(initState) works here as well)
        int nextAction = getNextActionE_SoftMax(initState);

        reward = st.getReward(initState, nextAction);
        this_Q = st.getqValues(initState, nextAction);
        // in the room example the action index is also the next state
        max_Q = st.getMaxQValues(nextAction);

        // standard one-step Q-learning update:
        // Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        new_Q = this_Q + st.alpha * (reward + st.gamma * max_Q - this_Q);
        st.setqValues(initState, nextAction, new_Q);

        initState = nextAction;
    } while (initState != 5);   // stop when the goal room is reached
}

Full code: http://pan.baidu.com/s/1c1VTuyG
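Finally, a hypothetical driver (the enclosing class name `QLearningDemo` is assumed; each call to qLearningTrain runs one episode to the goal) that trains for a fixed number of episodes and prints the learned Q-table:

public static void main(String[] args) {
    QLearningDemo demo = new QLearningDemo();   // assumed enclosing class name
    for (int episode = 0; episode < 1000; episode++)
        demo.qLearningTrain(episode);

    // print the learned Q-table, one row per state
    for (int s = 0; s < demo.st.actionLength; s++) {
        for (int a = 0; a < demo.st.actionLength; a++)
            System.out.printf("%8.1f", demo.st.qValues[s][a]);
        System.out.println();
    }
}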
