先附上代码,学习笔记回头再补充。
完整代码如下:
# -*- coding: utf-8 -*-
# 关闭警告
# import warnings
# warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_validate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.cluster import KMeans
from sklearn import preprocessing as prep
import matplotlib.pyplot as plt
class Cluster:
    """Data-preparation and K-Means clustering pipeline for price-segment data.

    Intended call order (see the ``__main__`` block): data_read ->
    outlier_filtrate -> data_scale -> data_factor -> kmeans_cluster, with
    corr_filtrate / vif_filtrate available as optional feature filters.
    Throughout the class the LAST column of a frame is treated as the
    dependent variable.
    """

    def data_read(self, data_path, file_name, typeid):
        """Load the Excel source and split off the model features.

        :param data_path: directory of the file, including a trailing separator
        :param file_name: Excel file name
        :param typeid: iterable of price-segment ids to keep
        :return: (raw segment rows, segment rows without auxiliary columns)
        """
        # Rows with any missing value are discarded immediately after loading.
        data = pd.read_excel(data_path + file_name, index_col='pack_bar').dropna()
        data1_type = data[data['typeid'].isin(typeid)]  # keep the requested segments
        # data1_type = data1_type[data1_type['ccom_id'].isin([11110001])]  # optionally: one city only
        # Drop the columns that are not model features.
        # (pandas 2.0 removed the positional axis argument of drop().)
        data_type = data1_type.drop(columns=['typeid', 'month_double', 'ccom_id', 'net_month_double'])
        return data1_type, data_type

    def _bound_replace(self, column, up, dw, fill, clamp=False):
        """Return a copy of ``column`` with out-of-range values replaced.

        :param column: numeric Series to clean
        :param up: upper bound; values strictly above it are outliers
        :param dw: lower bound; values strictly below it are outliers
        :param fill: 'nan' writes NaN; anything else writes a replacement value
        :param clamp: True  -> replace with the bound itself (threshold mode);
                      False -> replace with the most extreme in-range value
        :return: cleaned copy of ``column``
        """
        # Masks and replacement values are computed on a snapshot of the
        # original values, then written through a real (non-chained) setitem —
        # the original chained `iloc[...][mask] = ...` wrote into a temporary.
        snap = column.copy()
        out = column.copy()
        over = snap > up
        under = snap < dw
        if over.any():
            print('存在上限异常值')
            out[over] = np.nan if fill == 'nan' else (up if clamp else snap[snap < up].max())
        else:
            print('不存在上限异常值')
        if under.any():
            print('存在下限异常值')
            # BUG FIX: the original filled low outliers with
            # `datai[datai < dw].min()` — the minimum of the outliers
            # themselves, a no-op. Use the smallest in-range value instead,
            # mirroring the upper branch.
            out[under] = np.nan if fill == 'nan' else (dw if clamp else snap[snap > dw].min())
        else:
            print('不存在下限异常值')
        return out

    def outlier_filtrate(self, data_type, method='std', fill='nan', threshold=1):
        """Outlier handling.

        :param data_type: raw feature frame
        :param method: 'std' (mean +/- 2*std), 'quantile' (1.5*IQR box rule) or
            'threshold' (fixed bound, applied to '*_incr' growth columns only)
        :param fill: 'nan' -> mark outliers as NaN (those rows are dropped at
            the end); anything else -> replace with an in-range extremum
            ('std'/'quantile') or with the bound itself ('threshold')
        :param threshold: fixed bound, used only when method == 'threshold'
        :return: cleaned frame (rows containing NaN are removed)
        """
        colnames = data_type.columns
        # Only growth-rate columns ('*_incr') are bounded in 'threshold' mode.
        colnames2 = [name for name in colnames if name.find('_incr') > 0]
        data2_type = data_type.copy()
        for i in range(data_type.shape[1]):
            datai = data2_type.iloc[:, i]
            if method == 'std':
                xmean = datai.mean()
                xstd = datai.std()
                data2_type[colnames[i]] = self._bound_replace(
                    datai, xmean + 2 * xstd, xmean - 2 * xstd, fill)
            elif method == 'quantile':
                q1 = datai.quantile(0.25)
                q3 = datai.quantile(0.75)
                iqr = q3 - q1
                data2_type[colnames[i]] = self._bound_replace(
                    datai, q3 + 1.5 * iqr, q1 - 1.5 * iqr, fill)
            elif method == 'threshold' and colnames[i] in colnames2:
                data2_type[colnames[i]] = self._bound_replace(
                    datai, threshold, (-1.0) * threshold, fill, clamp=True)
        data2_type = data2_type.dropna()  # drop rows whose outliers became NaN
        return data2_type

    def corr_filtrate(self, data_type, thred_corr=0.4):
        """Drop one variable out of every highly correlated pair.

        For each pair with |corr| >= thred_corr the variable whose mean
        correlation against all others is larger is dropped.  The last column
        (the dependent variable) is never considered for removal, matching
        vif_filtrate.

        :param data_type: feature frame
        :param thred_corr: absolute-correlation threshold
        :return: frame restricted to the surviving columns
        """
        corrX = data_type.corr()
        colnames = data_type.columns
        to_drop = []
        for j in range(corrX.shape[1] - 1):
            for i in range(j + 1, corrX.shape[0] - 1):
                if abs(corrX.iloc[i, j]) >= thred_corr:
                    # Keep the variable with the smaller mean correlation.
                    # NOTE(review): the original comment said "mean absolute
                    # correlation" but the code uses the signed mean — confirm
                    # which was intended.
                    if np.mean(corrX.iloc[i, :]) < np.mean(corrX.iloc[:, j]):
                        to_drop.append(colnames[j])
                    else:
                        to_drop.append(colnames[i])
                    break
        kept = colnames.drop(list(set(to_drop)))
        return data_type[kept]

    def vif_filtrate(self, data2_type, thred_vif=4):
        """Iteratively drop features whose variance inflation factor is too high.

        The last column (the dependent variable) is never dropped.

        :param data2_type: feature frame
        :param thred_vif: VIF threshold
        :return: frame with collinear features removed
        """
        def all_vifs(frame):
            # Collinearity check over every column.
            return [round(variance_inflation_factor(frame.values, i), 2)
                    for i in range(frame.shape[1])]

        data3_type = data2_type.copy()
        vif = all_vifs(data3_type)
        # BUG FIX: loop condition now ignores the last column's VIF. The
        # original tested the full list while the body could only remove
        # non-last columns, so a high VIF on the last column alone spun forever.
        while any(v >= thred_vif for v in vif[:-1]):
            colnames = data3_type.columns[:-1]
            for i in range(len(vif) - 1):
                if vif[i] >= thred_vif:
                    data3_type = data3_type.drop(columns=colnames[i])
                    vif = all_vifs(data3_type)
                    break
        return data3_type

    def data_scale(self, data3_type, method='normalize'):
        """Feature scaling.

        :param data3_type: feature frame
        :param method: one of
            'minmax'    - scale each column to [0, 1]
            'z-score'   - standardize each column to zero mean / unit variance
            'normalize' - scale each ROW to unit L2 norm
            'maxabs'    - divide by the column's max absolute value (keeps sign)
            'robust'    - center/scale by median and IQR
        :return: scaled frame with the original columns and index
        :raises ValueError: for an unknown ``method`` (the original fell
            through and raised an accidental NameError)
        """
        if method == 'minmax':
            scaled = prep.minmax_scale(data3_type, feature_range=(0, 1), axis=0, copy=True)
        elif method == 'z-score':
            scaled = prep.scale(data3_type, axis=0, with_mean=True, with_std=True, copy=True)
        elif method == 'normalize':
            scaled = prep.normalize(data3_type, norm='l2', axis=1)
        elif method == 'maxabs':
            # The only scaler usable on scipy.sparse input; output is in [-1, 1].
            scaled = prep.maxabs_scale(data3_type, axis=0, copy=True)
        elif method == 'robust':
            scaled = prep.robust_scale(data3_type, axis=0, with_centering=True, with_scaling=True, copy=True)
        else:
            raise ValueError('unknown scaling method: %r' % method)
        data4_type = pd.DataFrame(data=scaled, columns=data3_type.columns, index=data3_type.index)
        return data4_type

    def data_factor(self, data4_type, replace='dependent', threshold=0.05, colnames=None):
        """Binarize columns: values > threshold become 1, the rest 0.

        :param data4_type: input frame
        :param replace: 'all' -> every column; 'dependent' -> last column only;
            'colnames' -> the columns listed in ``colnames``
        :param threshold: binarization cut-off
        :param colnames: list of column names, only used when replace == 'colnames'
        :return: frame with the selected columns binarized
        """
        data5_type = data4_type.copy()
        if replace == 'all':
            binary = prep.binarize(data4_type, threshold=threshold, copy=True)  # <= threshold -> 0
            data5_type = pd.DataFrame(data=binary, columns=data5_type.columns, index=data5_type.index)
        elif replace == 'dependent':
            # Binarize only the dependent variable (last column).
            data5_type.iloc[:, -1] = (data5_type.iloc[:, -1] > threshold).astype(int)
        elif replace == 'colnames':
            # BUG FIX: the original single-column branch wrote into
            # ``data5_type[colnames].values`` — a temporary copy, so nothing
            # was updated. Assign through the frame instead.
            data5_type[colnames] = (data5_type[colnames] > threshold).astype(float)
        # Show the distribution of the (possibly binarized) dependent variable.
        print(data5_type.iloc[:, -1].value_counts())
        # encoder = prep.OneHotEncoder()
        # X_OH = encoder.fit_transform(data3_type)
        # df = pd.DataFrame(X_OH.toarray())
        return data5_type

    def kmeans_cluster(self, data5_type):
        """Show the elbow plot, fit a 2-cluster KMeans and return its centers.

        :param data5_type: prepared frame; the last column (dependent
            variable) is excluded from clustering
        :return: cluster-center array of the final 2-cluster model
        """
        data_train, data_test = train_test_split(data5_type, test_size=0.2)  # random_state=1234
        col_names = data_train.columns
        X = data_train[col_names[:-1]].copy()  # copy so adding labels below is safe
        # Elbow method: mean distance of every sample to its closest center.
        K = range(1, 10)
        meandistortion = []
        for k in K:
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            meandistortion.append(
                sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        plt.subplot(2, 1, 2)
        plt.plot(K, meandistortion, 'bx-')
        plt.xlabel('k')
        plt.ylabel(u'centers')
        plt.title(u'Choose Best k')
        plt.show()
        # Final model with the manually chosen k = 2.
        kmeans = KMeans(n_clusters=2)
        kmeans.fit(X)
        X['cluster'] = kmeans.labels_
        # Print the cluster sizes (the original computed them and threw the
        # result away), consistent with data_factor's distribution print.
        print(X['cluster'].value_counts())
        centers = kmeans.cluster_centers_
        return centers

    def data_predict(self, data, colnames, estm):
        """Predict the priority cities with a fitted classifier ``estm``.

        :param data: frame containing 'year1', 'year', 'type_id' plus features
        :param colnames: feature columns fed to the model
        :param estm: fitted estimator exposing ``predict``
        :return: index of cities predicted to grow
        """
        data_new = data[data['year1'] == 2016]
        data_new2 = data_new.drop(columns=['year1', 'year', 'type_id'])  # drop non-features
        X = data_new2[colnames]
        # data_g = self.data_group_byType(data_new2)
        predictY = estm.predict(X)
        result = pd.Series(index=X.index.tolist(), data=predictY.tolist())  # per-city growth prediction
        # NOTE(review): compares against the STRING '1' — verify that
        # estm.predict returns string labels; integer labels would never match.
        incr_ccom = result[result == '1'].index
        return incr_ccom
if __name__ == '__main__':
    # Location of the raw data file.
    data_path = 'C:\\Users\\90539\\PycharmProjects\\data\\'
    file_name = 'data.xlsx'
    typeid = ['B']  # price segment to analyse
    obj2 = Cluster()
    data, data1 = obj2.data_read(data_path, file_name, typeid)
    # data11 = obj2.data_group_byType(data1)
    # Pipeline: bound the growth-rate columns, scale to [0, 1], binarize the
    # dependent variable, then cluster.
    data2 = obj2.outlier_filtrate(data1, method='threshold', fill='nan', threshold=1)
    data3 = obj2.data_scale(data2, method='minmax')
    data4 = obj2.data_factor(data3, replace='dependent', threshold=0.1)
    centers = obj2.kmeans_cluster(data4)