Q-learning in Java (with action-selection policies)

The Q-learning example is adapted from: http://blog.csdn.net/pi9nc/article/details/27649323

Q-learning action-selection policies:

1. ε-greedy

With probability ε the agent explores by picking a random action; otherwise it exploits by picking the action with the largest Q value for the current state, breaking ties at random. Either way, the chosen action is re-drawn until it is valid for the current state.

     int getNextActionE_Greedy(int state)
     {
         int selectAction = -1;
         double maxQ = -50000;
         // indices of the actions that tie for the current maximum Q value
         int[] doubleValues = new int[st.actionLength];
         for (int i = 0; i < st.actionLength; i++)
         {
             doubleValues[i] = 0;
         }
         int maxDV = 0;
         double selectPossibility = proRandomNumber.nextDouble();

         if (selectPossibility < st.epsilon)
         {
             // explore: leave selectAction at -1 so the random pick below is used
             selectAction = -1;
         }
         else
         {
             // exploit: scan every action and keep the one with the largest Q value
             for (int action = 0; action < st.actionLength; action++)
             {
                 if (st.qValues[state][action] > maxQ)
                 {
                     selectAction = action;
                     maxQ = st.qValues[state][action];
                     maxDV = 0;
                     doubleValues[maxDV] = selectAction;
                 }
                 else if (st.qValues[state][action] == maxQ)
                 {
                     // record every action whose Q value equals the current maximum
                     maxDV++;
                     doubleValues[maxDV] = action;
                 }
             }
             if (maxDV > 0)
             {
                 // several actions share the maximum: break the tie randomly
                 int randomIndex = proRandomNumber.nextInt(maxDV + 1);
                 selectAction = doubleValues[randomIndex];
             }
         }
         if (selectAction == -1)
         {
             selectAction = proRandomNumber.nextInt(st.actionLength);
         }
         // re-draw until the chosen action is actually allowed in this state
         while (!validAction(state, selectAction))
         {
             selectAction = proRandomNumber.nextInt(st.actionLength);
         }
         return selectAction;
     }
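
Note that when several actions share the maximum Q value, the method picks one of them uniformly at random instead of always taking the first; early in training, when most Q values are still 0, this keeps the greedy step from systematically favouring the lowest-numbered action.

If you want exploration to taper off over time, one common variant is to shrink ε at the end of every episode. This is not part of the original code; the decay factor and the lower bound below are illustrative values only:

     // a minimal sketch, assuming st.epsilon is a mutable double field;
     // 0.99 and the 0.01 floor are arbitrary illustrative choices
     void decayEpsilon()
     {
         st.epsilon = Math.max(0.01, st.epsilon * 0.99);
     }
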
2. Softmax

Instead of a hard maximum, each valid action a is chosen with probability exp(Q(s,a)/τ) / Σ_b exp(Q(s,b)/τ), where τ (softmaxDiv in the code below) is the temperature: actions with higher Q values are picked more often, but every valid action keeps a nonzero chance of being explored.

     int getNextActionE_SoftMax(int state)
     {
         double softmaxDiv = 1;                  // temperature τ
         int selectAction = -1;
         double[] prob = new double[st.actionLength];
         double sumProb = 0;
         int action_softmax;
         // compute exp(Q(s,a)/τ) for every valid action; invalid actions get probability 0
         for (action_softmax = 0; action_softmax < st.actionLength; action_softmax++)
         {
             if (validAction(state, action_softmax) == false)
             {
                 prob[action_softmax] = 0;
                 continue;
             }
             double temp = st.qValues[state][action_softmax] / softmaxDiv;
             prob[action_softmax] = Math.exp(temp);
             sumProb += prob[action_softmax];
         }
         // normalize so the probabilities of the valid actions sum to 1
         for (action_softmax = 0; action_softmax < st.actionLength; action_softmax++)
         {
             if (validAction(state, action_softmax) == false)
             {
                 continue;
             }
             prob[action_softmax] = prob[action_softmax] / sumProb;
         }
         boolean valid = false;
         while (valid == false)
         {
             // roulette-wheel selection: draw a number in [0,1) and walk through the
             // cumulative probabilities until the draw falls into an action's slot
             double rndValue = proRandomNumber.nextDouble();
             double offset = 0;
             for (action_softmax = 0; action_softmax < st.actionLength; action_softmax++)
             {
                 if (validAction(state, action_softmax) == false)
                 {
                     continue;
                 }
                 if (rndValue > offset && rndValue < (offset + prob[action_softmax]))
                     selectAction = action_softmax;
                 offset += prob[action_softmax];
             }
             // check that the chosen action is actually reachable; otherwise draw again
             if (this.validAction(state, selectAction))
             {
                 valid = true;
             }
         }
         return selectAction;
     }
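
The temperature softmaxDiv controls how greedy the policy is: as it approaches 0 the policy approaches pure exploitation, while large values approach a uniform choice over the valid actions. As an illustrative example (not from the original post), with Q values {0, 50, 100} for three valid actions and τ = 100, the unnormalized weights are exp(0) ≈ 1.00, exp(0.5) ≈ 1.65 and exp(1) ≈ 2.72, giving selection probabilities of roughly 0.19, 0.31 and 0.51.
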
Checking whether an action is valid

     boolean validAction(int state, int action)
     {
         // out-of-range actions (including the -1 "nothing selected" marker) are invalid;
         // returning early also keeps the reward lookup below inside the array bounds
         if (action < 0 || action >= st.actionLength)
             return false;
         // a negative reward marks a transition that does not exist
         return st.reward[state][action] >= 0;
     }
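
All of the methods above go through a state object st whose class the post does not show. As a rough idea of what it has to provide, here is a minimal sketch assuming the classic six-room layout used in the referenced article; the class name QState, the field initialisation, and the concrete parameter values are assumptions for illustration, not the author's code:

     // a minimal sketch of the assumed state/environment class, not the original code
     class QState
     {
         int actionLength = 6;        // six rooms, and "go to room i" is action i
         double epsilon = 0.1;        // exploration rate for ε-greedy (illustrative value)
         double alpha = 0.8;          // learning rate (illustrative value)
         double gamma = 0.8;          // discount factor (illustrative value)
         double[][] qValues = new double[actionLength][actionLength];
         // reward matrix of the classic room example: -1 = no door, 100 = reaching room 5
         double[][] reward = {
             { -1, -1, -1, -1,  0, -1 },
             { -1, -1, -1,  0, -1, 100 },
             { -1, -1, -1,  0, -1, -1 },
             { -1,  0,  0, -1,  0, -1 },
             {  0, -1, -1,  0, -1, 100 },
             { -1,  0, -1, -1,  0, 100 },
         };

         double getReward(int state, int action)          { return reward[state][action]; }
         double getqValues(int state, int action)         { return qValues[state][action]; }
         void setqValues(int state, int action, double v) { qValues[state][action] = v; }

         // largest Q value over all actions available from the given state
         double getMaxQValues(int state)
         {
             double max = -Double.MAX_VALUE;
             for (int a = 0; a < actionLength; a++)
                 if (qValues[state][a] > max)
                     max = qValues[state][a];
             return max;
         }
     }
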
     
Training

     // one training episode: start in a random room and follow the softmax policy
     // until the goal room (state 5) is reached; the time parameter is kept from the
     // original signature but is not used inside the method
     void qLearningTrain(long time)
     {
         int initState = proRandomNumber.nextInt(st.actionLength);
         do
         {
             // in this example the action index is also the index of the next state
             int nextAction = getNextActionE_SoftMax(initState);

             double reward = st.getReward(initState, nextAction);
             double this_Q = st.getqValues(initState, nextAction);
             double max_Q = st.getMaxQValues(nextAction);

             // standard Q-learning update:
             // Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
             double new_Q = this_Q + st.alpha * (reward + st.gamma * max_Q - this_Q);
             st.setqValues(initState, nextAction, new_Q);

             initState = nextAction;
         } while (initState != 5);
     }
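
As a quick sanity check of the update rule: with alpha = 0.8, gamma = 0.8, Q(s,a) = 0, a reward of 100 for entering the goal room, and the maximum Q over the next state still 0, the new value is 0 + 0.8 * (100 + 0.8 * 0 - 0) = 80, which then propagates back to the neighbouring rooms over later episodes.

For completeness, here is a rough sketch of how the pieces might be wired together and driven. The wrapper class name QLearning, the main method, and the episode count are assumptions for illustration; only st, proRandomNumber and the methods shown above come from the original code:

     import java.util.Random;

     public class QLearning
     {
         QState st = new QState();              // hypothetical state class sketched earlier
         Random proRandomNumber = new Random();

         // ... getNextActionE_Greedy, getNextActionE_SoftMax, validAction and
         //     qLearningTrain exactly as listed above ...

         public static void main(String[] args)
         {
             QLearning q = new QLearning();
             // run a number of training episodes; 1000 is an arbitrary choice
             for (int episode = 0; episode < 1000; episode++)
             {
                 q.qLearningTrain(episode);
             }
             // print the learned Q table, one row per state
             for (int s = 0; s < q.st.actionLength; s++)
             {
                 for (int a = 0; a < q.st.actionLength; a++)
                 {
                     System.out.printf("%8.2f", q.st.qValues[s][a]);
                 }
                 System.out.println();
             }
         }
     }
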
     
Full code: http://pan.baidu.com/s/1c1VTuyG


