Machine Learning - Decision Tree Algorithm - BLE RSSI Positioning

While debugging a Bluetooth key positioning system, I found that the instability of RSSI signals kept positioning accuracy low. I have recently been studying machine learning algorithms, so here I use a decision tree model to test and validate the accuracy of BLE RSSI positioning.

1. Data

Eight base stations, numbered R1 through R8, are placed around the car; y is the label of the position at which each sample was collected.
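Judging from the parsing code in Section 2, each row of the data files holds eight tab-separated RSSI readings followed by the position label y. An illustrative (not actual) row:

-58	-67	-72	-80	-61	-70	-75	-83	P1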

2. Loading the Data

# Load the samples from two data files
def loaddata():
    feature_name = ['r1','r2','r3','r4','r5','r6','r7','r8']
    dataSet = []
    # Each line: eight tab-separated RSSI values followed by the position label
    for path in ('data/ble_rssi1.txt', 'data/ble_rssi_android1.txt'):
        with open(path) as fr:
            for line in fr:
                lineArr = line.strip().split('\t')
                # Convert the RSSI readings to floats; keep the label as a string
                inList = [float(v) for v in lineArr[:-1]]
                inList.append(lineArr[-1])
                dataSet.append(inList)
    return dataSet, feature_name

3. Information Gain Algorithm

3.1 Computing Entropy

from math import log

# Compute the entropy of a data set: H(D) = -sum_k p_k * log2(p_k)
def entropy(dataSet):
    # Number of samples
    m = len(dataSet)
    # Count of each class label
    labelCounts = {}
    for featVec in dataSet:
        # The label is the last column
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    # Accumulate the entropy according to the formula
    e = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / m
        e -= prob * log(prob, 2)
    return e
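As a quick sanity check (toy rows, not real RSSI data), a data set split evenly between two classes should have an entropy of exactly 1 bit:

print(entropy([[0, 'A'], [0, 'A'], [0, 'B'], [0, 'B']]))  # 1.0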

3.2 Splitting the Data

# Binary split of the data set on feature `axis` at threshold `value`
def splitDataSet(dataSet, axis, value):
    # Rows with feature <= value go left, the rest go right
    retDataSet1 = []
    retDataSet2 = []
    for featVec in dataSet:
        # Copy the whole row: with continuous features the split feature
        # is kept, so it can be reused further down the tree
        reducedFeatVec = featVec[:]
        if float(featVec[axis]) <= value:
            retDataSet1.append(reducedFeatVec)
        else:
            retDataSet2.append(reducedFeatVec)
    return retDataSet1, retDataSet2
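A quick check of the binary split on toy rows (threshold chosen between the two feature values):

left, right = splitDataSet([[-70, 'A'], [-60, 'B']], 0, -65.0)
# left  -> [[-70, 'A']]  (feature <= -65.0)
# right -> [[-60, 'B']]  (feature >  -65.0)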

3.3 Sorting and Discretizing Feature Values

# Bubble sort, ascending order
def bubbleSort(arr):
    for i in range(1, len(arr)):
        for j in range(0, len(arr) - i):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
    return arr

       

# Discretize a continuous feature: the candidate split points are the
# midpoints between adjacent sorted values
def discreteFeat(featList):
    featList = list(featList)
    featList = bubbleSort(featList)
    featVals = []
    for i in range(len(featList) - 1):
        value = (featList[i] + featList[i + 1]) / 2
        featVals.append(value)
    return featVals
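For example, the distinct values {-80, -70, -60} sort to [-80, -70, -60] and yield the candidate thresholds [-75.0, -65.0]:

print(discreteFeat({-80, -70, -60}))  # [-75.0, -65.0]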

3.4 Choosing the Best Feature and Split Value

# Choose the best feature and split value by maximum information gain
def chooseBestFeature(dataSet):
    # Number of features (the last column is the label)
    n = len(dataSet[0]) - 1
    # Entropy of the whole data set
    baseEntropy = entropy(dataSet)
    # Best information gain, best feature index, best split value so far
    bestInfoGain = 0.0
    bestFeature = -1
    bestFeatureValue = 0.0
    # Examine every feature
    for i in range(n):
        # All distinct values of feature i (i = 0 is the first column, r1)
        featList = set(example[i] for example in dataSet)
        if len(featList) > 1:
            # Discretize: candidate thresholds are midpoints of adjacent values
            featVals = discreteFeat(featList)
        else:
            featVals = featList
        # Candidate split values for this feature
        uniqueVals = set(featVals)

        # Smallest conditional entropy found for this feature, and its threshold
        minEntropy = None
        bestValue = 0.0
        for value in uniqueVals:
            # Binary split at the candidate threshold
            subDataSet1, subDataSet2 = splitDataSet(dataSet, i, value)
            # Conditional entropy of the two-way split; the smaller it is,
            # the larger the information gain
            prob1 = len(subDataSet1) / float(len(dataSet))
            prob2 = len(subDataSet2) / float(len(dataSet))
            newEntropy = prob1 * entropy(subDataSet1) + prob2 * entropy(subDataSet2)
            if minEntropy is None or newEntropy < minEntropy:
                minEntropy = newEntropy
                bestValue = value

        # Information gain of the best split on this feature
        infoGain = baseEntropy - minEntropy
        # Keep the feature with the largest information gain
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
            bestFeatureValue = bestValue
    # Return the best feature and its split value
    return bestFeature, bestFeatureValue

3.5 Building the Decision Tree
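trainTree below falls back to majority voting through a helper named classVote, which the original post never defines; here is a minimal sketch matching how it is called (the name comes from the call sites, the implementation is assumed):

# Majority vote: return the most frequent class label in classList.
# NOTE: this helper is not in the original post; implementation assumed.
def classVote(classList):
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    return max(classCount, key=classCount.get)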

# Global counter used as a crude cap on the number of recursive calls
i = 0

# Recursively train a tree
def trainTree(dataSet, feature_name):
    global i
    i = i + 1
    # Collect the class label (last column) of every row
    classList = [example[-1] for example in dataSet]
    if len(classList) <= 0:
        return 'Unknown'
    # All samples belong to the same class: return that class as a leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: fall back to majority vote
    if len(dataSet[0]) == 1:
        return classVote(classList)
    # Crude safeguard against unbounded recursion
    if i > 100:
        i = 0
        return classVote(classList)

    bestFeat, bestFeatValue = chooseBestFeature(dataSet)
    bestFeatName = feature_name[bestFeat]

    myTree = {bestFeatName: {}}
    sub_feature_name = feature_name[:]
    splitDataSet1, splitDataSet2 = splitDataSet(dataSet, bestFeat, bestFeatValue)

    # Branch keys encode the threshold: a trailing '-' marks the <= branch,
    # a trailing '+' marks the > branch (predict() relies on this convention)
    key_le = str(bestFeatValue) + '-'
    key_gt = str(bestFeatValue) + '+'
    myTree[bestFeatName][key_le] = trainTree(splitDataSet1, sub_feature_name)
    myTree[bestFeatName][key_gt] = trainTree(splitDataSet2, sub_feature_name)
    return myTree

3.6 Training

myDat, feature_name = loaddata()
myTree = trainTree(myDat, feature_name)
print('myTree =', myTree)
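The trained tree is a nested dict: each internal node maps a feature name to two branches whose keys carry the split threshold, with a trailing '-' for the <= branch and '+' for the > branch. An illustrative (not actual) fragment, with made-up thresholds:

{'r3': {'-75.5-': 'P1',
        '-75.5+': {'r1': {'-60.25-': 'P2', '-60.25+': 'P3'}}}}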

4. Gini Index Algorithm

4.1 Gini Index

# Compute the Gini index: Gini(D) = 1 - sum_k (|C_k| / |D|)^2
def giniIndex(dataSet):
    # Number of samples
    m = len(dataSet)
    # Count of each class label
    labelCounts = {}
    for featVec in dataSet:
        # The label is the last column
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    # Accumulate the sum of squared class probabilities
    e = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / m
        e += pow(prob, 2)
    return 1 - e
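As a quick sanity check (toy rows), an even two-class split should give Gini = 1 - (0.5^2 + 0.5^2) = 0.5:

print(giniIndex([[0, 'A'], [0, 'A'], [0, 'B'], [0, 'B']]))  # 0.5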

4.2 Choosing the Best Feature by Gini Index

# Choose the best feature and split value by minimum Gini index
def chooseBestFeatureGini(dataSet):
    # Number of features (the last column is the label)
    n = len(dataSet[0]) - 1
    # Best (smallest) Gini index, best feature index, best split value so far
    bestGini = None
    bestFeature = -1
    bestFeatureValue = 0.0
    # Examine every feature
    for i in range(n):
        # All distinct values of feature i (i = 0 is the first column, r1)
        featList = set(example[i] for example in dataSet)
        if len(featList) > 1:
            # Discretize: candidate thresholds are midpoints of adjacent values
            featVals = discreteFeat(featList)
        else:
            featVals = featList
        # Candidate split values for this feature
        uniqueVals = set(featVals)

        # Smallest Gini index found for this feature, and its threshold
        minGiniDA = None
        bestValue = 0.0
        for value in uniqueVals:
            # Binary split at the candidate threshold
            subDataSet1, subDataSet2 = splitDataSet(dataSet, i, value)
            # Weights of the two subsets
            prob1 = len(subDataSet1) / float(len(dataSet))
            prob2 = len(subDataSet2) / float(len(dataSet))
            # Gini index of D given a split of feature i at `value`
            giniDA = prob1 * giniIndex(subDataSet1) + prob2 * giniIndex(subDataSet2)
            # Keep the split with the smallest Gini index
            if minGiniDA is None or giniDA < minGiniDA:
                minGiniDA = giniDA
                bestValue = value

        # Keep the feature whose best split has the smallest Gini index overall
        if bestGini is None or minGiniDA < bestGini:
            bestGini = minGiniDA
            bestFeature = i
            bestFeatureValue = bestValue

    # Return the best feature and its split value
    return bestFeature, bestFeatureValue

4.3 Building the Tree

This redefines trainTree from Section 3.5; the only change is that splits are now selected with chooseBestFeatureGini.

# Recursively train a tree (identical to Section 3.5 except for the
# feature-selection criterion)
def trainTree(dataSet, feature_name):
    global i
    i = i + 1
    # Collect the class label (last column) of every row
    classList = [example[-1] for example in dataSet]
    if len(classList) <= 0:
        return 'Unknown'
    # All samples belong to the same class: return that class as a leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: fall back to majority vote
    if len(dataSet[0]) == 1:
        return classVote(classList)
    # Crude safeguard against unbounded recursion
    if i > 100:
        i = 0
        return classVote(classList)

    bestFeat, bestFeatValue = chooseBestFeatureGini(dataSet)
    bestFeatName = feature_name[bestFeat]

    myTree = {bestFeatName: {}}
    sub_feature_name = feature_name[:]
    splitDataSet1, splitDataSet2 = splitDataSet(dataSet, bestFeat, bestFeatValue)

    # Same branch-key convention as before: '-' marks <=, '+' marks >
    key_le = str(bestFeatValue) + '-'
    key_gt = str(bestFeatValue) + '+'
    myTree[bestFeatName][key_le] = trainTree(splitDataSet1, sub_feature_name)
    myTree[bestFeatName][key_gt] = trainTree(splitDataSet2, sub_feature_name)
    return myTree

5. Prediction

# Walk the tree recursively to classify one sample
def predict(inputTree, featLabels, testVec):
    # The node's feature name is the single top-level key
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)

    key = testVec[featIndex]
    # The '-' (<=) branch was inserted first and the '+' (>) branch second;
    # dicts preserve insertion order, so the first key is the <= branch
    key_values = list(secondDict.keys())
    key_le = key_values[0]
    key_gt = key_values[1]

    # Strip the trailing '-'/'+' marker to recover the numeric threshold
    threshold = float(key_le[:-1])
    if key <= threshold:
        key = key_le
    else:
        key = key_gt

    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        # Internal node: recurse
        classLabel = predict(valueOfFeat, featLabels, testVec)
    else:
        # Leaf node: this is the predicted class
        classLabel = valueOfFeat
    return classLabel
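A minimal usage sketch (the RSSI values here are illustrative, not from the real data):

sample = [-58.0, -67.0, -72.0, -80.0, -61.0, -70.0, -75.0, -83.0]
print(predict(myTree, feature_name, sample))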

6. Pre-pruning

A candidate split is kept only if it does not lower accuracy on the training data; otherwise the node becomes a leaf.

# Helper: count how many elements of x are equal to y
def equalNums(x, y):
    classCount = 0
    for vote in x:
        if vote == y:
            classCount += 1
    return classCount
# Global counter used as a crude cap on the number of recursive calls
j = 0

# Build a pre-pruned decision tree
def createTreePrePruning(dataSet, feature_name):
    global j
    j = j + 1

    classList = [example[-1] for example in dataSet]
    if len(classList) <= 0:
        return 'Unknown'
    # All samples belong to the same class: return that class as a leaf
    elif classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: fall back to majority vote
    elif len(dataSet[0]) == 1:
        return classVote(classList)
    # Crude safeguard against unbounded recursion
    elif j > 100:
        j = 0
        return classVote(classList)

    bestFeat, bestFeatValue = chooseBestFeatureGini(dataSet)
    bestFeatName = feature_name[bestFeat]

    # Split on the chosen feature
    sub_feature_name = feature_name[:]
    splitDataSet1, splitDataSet2 = splitDataSet(dataSet, bestFeat, bestFeatValue)

    # Pre-pruning evaluation
    # Training accuracy before the split: majority label over the whole node
    labelTrainLabelPre = classVote(classList)
    labelTrainRatioPre = equalNums(classList, labelTrainLabelPre) / len(classList)

    # Training accuracy after the split: each side predicts its own majority label
    classList1 = [example[-1] for example in splitDataSet1]
    classList2 = [example[-1] for example in splitDataSet2]
    # If either side is empty the split is useless: keep the node as a leaf
    if len(classList1) == 0 or len(classList2) == 0:
        return labelTrainLabelPre

    labelTrainLabelPost1 = classVote(classList1)
    labelTrainLabelPost2 = classVote(classList2)
    # Fraction of all samples at this node that the two leaf predictions get right
    labelTrainRatioPost = (equalNums(classList1, labelTrainLabelPost1)
                           + equalNums(classList2, labelTrainLabelPost2)) / len(classList)

    # If the split lowers training accuracy, prune: return a leaf instead
    if labelTrainRatioPost < labelTrainRatioPre:
        return labelTrainLabelPre
    else:
        # Otherwise keep the split and recurse into both branches,
        # using the same '-'/'+' branch-key convention as trainTree
        decisionTree = {bestFeatName: {}}
        key_le = str(bestFeatValue) + '-'
        key_gt = str(bestFeatValue) + '+'
        decisionTree[bestFeatName][key_le] = createTreePrePruning(splitDataSet1, sub_feature_name)
        decisionTree[bestFeatName][key_gt] = createTreePrePruning(splitDataSet2, sub_feature_name)
    return decisionTree

7. Prediction Test

xgTreePrePruning = createTreePrePruning(myDat, feature_name)
print('xgTreePrePruning =', xgTreePrePruning)
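The original post never shows how X and y are defined before the train/test split below; a minimal sketch, assuming they come straight from the data set loaded in Section 3.6 (features in the first eight columns, label in the last):

X = [row[:-1] for row in myDat]  # the eight RSSI readings
y = [row[-1] for row in myDat]   # the position label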


# Test: compute prediction accuracy on a held-out split
import numpy as np
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

# Convert to arrays before indexing
X_test = np.array(X_test)
y_test = np.array(y_test)

y_pre = []
predict_cnt = 0
n = len(X_test)
for i in range(n):
    # To evaluate the unpruned tree instead:
    # y_predet = predict(myTree, feature_name, X_test[i])
    y_predet = predict(xgTreePrePruning, feature_name, X_test[i])
    y_pre.append(y_predet)
    if str(y_predet) == str(y_test[i]):
        predict_cnt = predict_cnt + 1
rate = predict_cnt / n
print('rate =', rate)

Reference: https://blog.csdn.net/ylhlly/article/details/93213633
