import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Load the credit-card dataset (comma-separated, GBK-encoded).
data = pd.read_csv(
    r"G:\大数据实验数据库\3.大数据实验数据\credit_card.csv",
    sep=",",
    encoding="GBK",
)
# NOTE: a bare expression only displays its value in an interactive session
# (notebook / REPL); when run as a script this line has no visible effect.
data.columns

# Select the six columns used as clustering features.
data_action = data[["瑕疵户", "逾期", "呆账", "强制停卡记录", "退票", "拒往记录"]]

# Standardize the features so that each column contributes on a comparable
# scale to the distance computations performed by K-Means.
data_action_std = StandardScaler().fit_transform(data_action)

# Cluster into 5 groups; random_state is fixed so the run is reproducible.
data_kmean1 = KMeans(n_clusters=5, random_state=0).fit(data_action_std)

# Number of samples assigned to each cluster label.
r1 = pd.Series(data_kmean1.labels_).value_counts()
print("行为特征每类的数目\n", r1)
# Cluster centers are expressed in the standardized feature space.
print("数据的聚类中心\n", data_kmean1.cluster_centers_)
# 二、求解Pearson相关系数 (Part 2: compute the Pearson correlation coefficients)
import numpy as np
import pandas as pd
# Load the dataset for the correlation analysis (GBK-encoded CSV).
data = pd.read_csv(r"G:\大数据实验数据库\data.csv", encoding="GBK")
# Pairwise Pearson correlation between the columns, rounded to 2 decimals.
print("相关系数矩阵为:\n", np.round(data.corr(method="pearson"), 2))
# 三、Lasso关键特性截取 (Part 3: key feature selection with Lasso)
from sklearn.linear_model import Lasso
# Lasso regression with regularization strength alpha=1000; the L1 penalty
# can shrink coefficients to exactly zero, which is what the non-zero count
# printed below relies on.
lasso = Lasso(alpha=1000)
# Features: the first 13 columns of ``data``; target: the 'y' column.
lasso.fit(data.iloc[:, 0:13], data['y'])
# Fitted coefficients, rounded to 5 decimals.
print("相关系数为:", np.round(lasso.coef_, 5))
# Number of features whose coefficient is non-zero.
print('相关系数非0个数为:', np.sum(lasso.coef_ != 0))