September 2023 - Experiment 5
【Experiment Name】
Classification Algorithms - Decision Tree Implementation and Application
【Experiment Objective】
The sample dataset is given below. Using the ID3 algorithm, build a decision tree for the computer-ownership (pc) class attribute.
No. | sex | student | nationality | pc |
1 | 1 | 1 | 0 | 1 |
2 | 0 | 0 | 0 | 1 |
3 | 1 | 1 | 0 | 1 |
4 | 1 | 1 | 0 | 1 |
5 | 1 | 0 | 0 | 0 |
6 | 1 | 0 | 1 | 0 |
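ID3 picks each split by information gain, i.e. the reduction in the Shannon entropy of the class labels. As a quick hand check against the program's output: 4 of the 6 samples have pc = 1 and 2 have pc = 0, so the entropy of the full set is H(D) = -(4/6)·log2(4/6) - (2/6)·log2(2/6) ≈ 0.918, and the gain printed for each feature is this value minus the weighted entropy remaining after splitting on that feature.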
【Experiment Content】
1、Program listing
# Build a decision tree with the ID3 algorithm
from math import log
import operator

def calcShannonEnt(dataSet):
    # Shannon entropy of the class labels in dataSet
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the label is the last column
        if currentLabel not in labelCounts:  # first occurrence of this label
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # count each label
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # relative frequency of the label
        shannonEnt -= prob * log(prob, 2)  # H = -sum(p * log2(p))
    return shannonEnt

def createDataSet():
    # columns: sex, student, nationality, pc (pc is the class label)
    dataSet = [
        [1, 1, 0, 1],
        [0, 0, 0, 1],
        [1, 1, 0, 1],
        [1, 1, 0, 1],
        [1, 0, 0, 0],
        [1, 0, 1, 0]
    ]
    labels = ["sex", "student", "nationality", "pc"]
    return dataSet, labels

def splitDataSet(dataSet, axis, value):
    # rows whose feature `axis` equals `value`, with that feature column removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):
    # pick the feature whose split yields the highest information gain
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        print("Information gain of feature %d: %.3f" % (i, infoGain))
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    # majority vote, used when no features remain
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels, featLabels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # all samples share one class
        return classList[0]
    if len(dataSet[0]) == 1:  # only the label column is left
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # pass a copy of labels so sibling branches do not mutate each other's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),
                                                  labels[:], featLabels)
    return myTree

if __name__ == '__main__':
    dataSet, labels = createDataSet()
    featLabels = []
    myTree = createTree(dataSet, labels, featLabels)
    print(myTree)
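The listing above only builds the tree. To query it, a small classify helper can be added; the function below is not part of the assigned program, just a minimal sketch that walks the nested dict returned by createTree, where testVec lists the values of the features in featLabels order (i.e. split order).

def classify(inputTree, featLabels, testVec):
    # walk down the nested dict until a leaf (class label) is reached
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                return classify(secondDict[key], featLabels, testVec)
            return secondDict[key]

# e.g. student = 0, sex = 1; for the tree built above this should print 0
# print(classify(myTree, featLabels, [0, 1]))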
2、Result screenshot
September 2023 - Experiment 6
【Experiment Name】
Classification Algorithms - Naive Bayes Implementation and Application
【Experiment Objective】
One morning a hospital saw six outpatients, as in the following table:
Symptom | Occupation | Disease |
sneezing | nurse | cold (感冒) |
sneezing | farmer | allergy (过敏) |
headache | construction worker | concussion (脑震荡) |
headache | construction worker | cold (感冒) |
sneezing | teacher | cold (感冒) |
headache | teacher | concussion (脑震荡) |
Now a seventh patient arrives: a construction worker who is sneezing. What disease is he most likely to have? Use the naive Bayes algorithm to make the classification prediction.
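With the naive independence assumption, the posterior is proportional to P(disease) · P(symptom | disease) · P(occupation | disease). Counting directly from the table: for 感冒 (cold) this gives 0.5 · (2/3) · (1/3) ≈ 0.111, while for 过敏 (allergy) and 脑震荡 (concussion) the product is 0, since P(construction worker | allergy) and P(sneezing | concussion) are both 0. Those zero counts are why the program below applies Laplace (add-one) smoothing; the predicted class is 感冒 either way.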
【Experiment Content】
1、Program listing
from collections import defaultdict

# features are coded integers: symptom (0 = sneezing) and occupation
# (2 = construction worker); see the table above for the full encoding.
# The class label is kept as a Chinese string (感冒/过敏/脑震荡).
dataset = [
    [0, 0, "感冒"],
    [0, 1, "过敏"],
    [1, 2, "脑震荡"],
    [1, 2, "感冒"],
    [0, 3, "感冒"],
    [1, 3, "脑震荡"],
]
new_data = [0, 2]  # the seventh patient: a sneezing construction worker

def naive_bayes(dataset, new_data):
    # frequency of each class value
    class_freq = defaultdict(int)
    for row in dataset:
        class_freq[row[2]] += 1
    # frequency of each (feature index, feature value, class) combination
    feature_freq = defaultdict(int)
    for row in dataset:
        for i, feature in enumerate(row[:2]):
            feature_freq[(i, feature, row[2])] += 1
    # number of distinct values of each feature, used by Laplace smoothing
    n_values = [len(set(row[i] for row in dataset)) for i in range(len(new_data))]
    # prior times the conditional probability of each feature, per class
    probs = defaultdict(float)
    for class_val, class_count in class_freq.items():
        probs[class_val] = class_count / len(dataset)  # prior probability
        for i, feature in enumerate(new_data):
            # Laplace smoothing: add 1 to the count and add the number of
            # possible values of feature i to the denominator
            probs[class_val] *= (feature_freq[(i, feature, class_val)] + 1) / (class_count + n_values[i])
    # return the class with the largest posterior probability
    return max(probs, key=probs.get)

result = naive_bayes(dataset, new_data)
print(result)
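As a cross-check, the same prediction can be reproduced with scikit-learn's CategoricalNB, which applies the same add-one (Laplace) smoothing with its default alpha=1.0. A minimal sketch, assuming scikit-learn is installed (not part of the assigned program):

from sklearn.naive_bayes import CategoricalNB

X = [[0, 0], [0, 1], [1, 2], [1, 2], [0, 3], [1, 3]]  # symptom, occupation
y = ["感冒", "过敏", "脑震荡", "感冒", "感冒", "脑震荡"]
clf = CategoricalNB(alpha=1.0)  # alpha=1.0 gives add-one smoothing
clf.fit(X, y)
print(clf.predict([[0, 2]]))  # the sneezing construction worker; expected ['感冒']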
2、Result screenshot
【Experiment Reflections】
At first I labeled class values such as 感冒 with numeric indexes, and a bug in the algorithm made the result come out exactly equal to the index of 感冒. Then I realized that since a dict is used throughout, the type of the keys makes no difference at all, so once debugging was done I changed the dataset's class values back to Chinese characters.
October 2023 - Experiment 7
【Experiment Name】
Clustering Algorithms - K-means Implementation and Application
【Experiment Objective】
Cluster the following data with k-means, implemented in Python (with K = 2).
No. | Attribute 1 | Attribute 2 |
1 | 1 | 1 |
2 | 2 | 1 |
3 | 1 | 2 |
4 | 2 | 2 |
5 | 4 | 3 |
6 | 5 | 3 |
7 | 4 | 4 |
8 | 5 | 4 |
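K-means alternates between assigning every point to its nearest center and moving each center to the mean of its assigned points. A quick hand check of the first iteration, with points 1 and 2 as the initial centers (as in the program below): points 1 and 3 are closer to (1, 1) and the other six points are closer to (2, 1), so the centers move to (1, 1.5) and (22/6, 17/6) ≈ (3.67, 2.83). The next assignment pulls points 2 and 4 into the first cluster, after which the centers settle at (1.5, 1.5) and (4.5, 3.5) and the algorithm converges with clusters {1, 2, 3, 4} and {5, 6, 7, 8}.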
【Experiment Content】
1、Program listing
data = [
    [1.0, 1.0],
    [2.0, 1.0],
    [1.0, 2.0],
    [2.0, 2.0],
    [4.0, 3.0],
    [5.0, 3.0],
    [4.0, 4.0],
    [5.0, 4.0]
]

def distance(x, y):
    # squared Euclidean distance; the square root is unnecessary for comparisons
    return (x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2

def solve(dataset):
    # use the first two points as the initial centers (K = 2)
    center = [dataset[0], dataset[1]]
    while True:
        result = assign(dataset, center)
        print("result:", result)
        # move each center to the mean of the points assigned to it
        new_center = [[0, 0], [0, 0]]
        count = [0, 0]
        for i in range(len(dataset)):
            new_center[result[i]][0] += dataset[i][0]
            new_center[result[i]][1] += dataset[i][1]
            count[result[i]] += 1
        for i in range(len(new_center)):
            new_center[i][0] /= count[i]
            new_center[i][1] /= count[i]
        if new_center == center:  # centers stopped moving: converged
            break
        center = new_center
    print("Clustering result:")
    print("center:", center)
    print("result:", result)
    for i in range(len(result)):
        print(dataset[i], ":", result[i])

def assign(dataset, center):
    # assign each point to the index of its nearest center
    result = []
    for i in range(len(dataset)):
        distances = []
        for j in range(len(center)):
            distances.append(distance(dataset[i], center[j]))
        min_idx = 0  # renamed to avoid shadowing the built-in min
        for j in range(len(distances)):
            if distances[j] < distances[min_idx]:
                min_idx = j
        result.append(min_idx)
    return result

if __name__ == "__main__":
    solve(data)
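For comparison, the clustering can be checked against scikit-learn's KMeans. A minimal sketch, assuming scikit-learn is available (cluster numbering may be permuted relative to the hand-written version):

import numpy as np
from sklearn.cluster import KMeans

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(np.array(data))
print(km.cluster_centers_)  # expected near (1.5, 1.5) and (4.5, 3.5)
print(km.labels_)           # one cluster label for each of the eight points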
2、Result screenshot
【Experiment Reflections】
I gained a much deeper understanding of how the algorithm proceeds step by step.