import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
warnings.filterwarnings("ignore")
# The indicators carry different weights, so the features are standardized
# with StandardScaler instead of being min-max normalized.
df = pd.read_excel(r'./data.xlsx')
# Pick out the feature columns that feed the clustering step.
feature_columns = ['进线次数', '平均通话时长', '平均静默时长', '平均每通通话知识库点击次数']
df_cluster = df[feature_columns]
# Convert to a plain array, fit the scaler on it, then apply the scaling.
data = np.array(df_cluster)
std = StandardScaler()
std_data = std.fit(data).transform(data)
# Quick visual check of the first ten standardized rows.
print(std_data[:10])
# 3. Train the model
# KMeans clustering. The candidate range is tiny, so a plain loop over the
# cluster counts is enough to pick the best one.
# The Calinski-Harabasz index is only defined for 2 <= k <= n_samples - 1,
# so the candidate list is capped for very small data sets.
group_list = list(range(2, min(15, len(std_data))))
if not group_list:
    raise ValueError("need at least 3 samples to compare cluster counts")
score_list = []
for i in group_list:
    model = KMeans(n_clusters=i)
    y_pred = model.fit_predict(std_data)
    # Higher Calinski-Harabasz score means better-separated clusters.
    score_list.append(metrics.calinski_harabasz_score(std_data, y_pred))
score_list = np.array(score_list)
max_index = np.argmax(score_list)
print("最佳分组数:{0}".format(group_list[max_index]))
# Refit with the best-scoring cluster count and attach the labels to the
# original frame.
model = KMeans(n_clusters=group_list[max_index])
y_pred = model.fit_predict(std_data)
df['y_pred'] = y_pred