数据集为:
代码实现:
import pandas as pd
import numpy as np
import math
# Load the initial data set: one row per sample, three feature columns
# followed by the class label (+1 / -1) in the last column.
data = pd.DataFrame(
    [
        (0, 1, 3, -1),
        (0, 3, 1, -1),
        (1, 2, 2, -1),
        (1, 1, 3, -1),
        (1, 2, 3, -1),
        (0, 1, 2, -1),
        (1, 1, 2, 1),
        (1, 1, 1, 1),
        (1, 3, 1, -1),
        (0, 2, 1, -1),
    ],
    columns=["body", "Business capability", "development potential", "classification"],
)
# Number the samples starting from 1 instead of pandas' default 0.
data.index = list(range(1, len(data) + 1))
print(data)
# Enumerate candidate decision stumps (single-level decision trees).
# For every feature column a threshold is swept from one step below the
# column minimum to one step above its maximum, and both stump orientations
# are scored at each threshold:
#   flag =  1: value >  threshold -> +1, value <= threshold -> -1
#   flag = -1: value >= threshold -> -1, value <  threshold -> +1
# The last column of `data` is the class label; all others are features.
m = len(data)  # number of samples
lis = []  # one (feature index, threshold, correct count, error rate, flag) per candidate
for i in range(len(data.columns) - 1):
    column = data.iloc[:, i]
    low, high = min(column), max(column)
    v = (high - low) / 10  # step size: one tenth of the feature's value range
    if v == 0:
        # A constant feature cannot split the samples, and a zero step
        # would make np.arange fail.
        continue
    a = list(np.arange(low - v, high + v, v))  # candidate thresholds
    for j in a:
        n, z = 0, 0  # correct predictions for flag = 1 / flag = -1
        for k in range(len(data)):
            x, y = data.iloc[k, i], data.iloc[k, -1]
            # n: sample is classified correctly by the flag = 1 rule.
            if (x <= j and y == -1) or (x > j and y == 1):
                n = n + 1
            # z: sample is classified correctly by the flag = -1 rule.
            # This is tested independently of the check above: a negative
            # sample lying exactly on the threshold is correct under BOTH
            # rules, so it has to be counted in both totals.
            if (x >= j and y == -1) or (x < j and y == 1):
                z = z + 1
        # Keep whichever orientation classifies more samples correctly.
        if n > z:
            flag, correct = 1, n
        else:
            flag, correct = -1, z
        lis.append((i, j, correct, 1 - correct / m, flag))
data1 = pd.DataFrame(lis, index=list(range(1, len(lis) + 1)),
                     columns=["Partition attribute", "Dividing point", "Correct number", "error rate",
                              "Classification method"])
print("-" * 100)
# Filter the weak classifiers: every stump whose error rate is 0.4 or higher
# is discarded, the rest are kept.
# NOTE: an earlier comment here mentioned 0.2, but the cutoff the code
# applies is 0.4.
rejected_labels = data1.index[data1["error rate"] >= 0.4]
data1.drop(index=rejected_labels, inplace=True)
# Renumber the surviving stumps 1..N. The index is assigned directly because
# DataFrame.set_axis no longer accepts `inplace` (removed in pandas 2.0).
data1.index = range(1, len(data1) + 1)
# Run the AdaBoost updates over the retained stumps.
# Every row of data1 gets exactly one boosting round, in table order; no
# per-round search for the stump with the lowest weighted error is done.
# The last column of `data` is the class label (+1 / -1).

# Weight of every sample: uniform at the start, summing to 1.
lis2 = [1 / len(data)] * len(data)
alphalist = []  # alpha (voting coefficient) of each weak classifier
alpha = 1
for i in range(len(data1)):
    # a1: feature column index, a2: split threshold, a3: orientation flag
    a1, a2, a3 = data1.iloc[i, 0], data1.iloc[i, 1], data1.iloc[i, 4]
    # lis3[j] is +1 when the stump classifies sample j correctly, -1 otherwise.
    lis3 = []
    for j in range(len(data)):
        x, y = data.iloc[j, a1], data.iloc[j, -1]
        if a3 == 1:
            # flag = 1: value > threshold -> +1, value <= threshold -> -1
            predicted = 1 if x > a2 else -1
        else:
            # flag = -1: value >= threshold -> -1, value < threshold -> +1
            predicted = -1 if x >= a2 else 1
        lis3.append(1 if predicted == y else -1)
    # Weighted error rate: total weight of the misclassified samples.
    e = sum(w for w, c in zip(lis2, lis3) if c == -1)
    # Keep e strictly inside (0, 1) so the logarithm below is defined even
    # for a stump that is perfect (or always wrong) on the weighted sample.
    e = min(max(e, 1e-10), 1 - 1e-10)
    alpha = 1 / 2 * math.log((1 - e) / e)
    # Re-weight: correctly classified samples shrink, misclassified grow.
    scaled = [w * math.exp(-alpha * c) for w, c in zip(lis2, lis3)]
    # Normalisation factor. It must be the sum of the *weighted* exponentials;
    # otherwise the new weights do not sum to 1 and the error rates of all
    # later rounds are distorted.
    Z = sum(scaled)
    lis2 = [w / Z for w in scaled]
    alphalist.append(alpha)
data1["coefficient"] = alphalist
# Show every column
pd.set_option('display.max_columns', None)
# Show every row
pd.set_option('display.max_rows', None)
print(data1)
输出结果为:
其中coefficient为各自弱分类器的系数