# Textbook example: cluster airline customers with K-Means on LRFMC-style features.
import numpy as np
import pandas as pd

# Load raw airline data (gb18030 covers the Chinese text columns).
airline_data = pd.read_csv("C:\\data\\air_data.csv", encoding='gb18030')
print(airline_data.shape)

# Keep only rows where both yearly fare columns are present.
exp1 = airline_data['SUM_YR_1'].notnull()
exp2 = airline_data['SUM_YR_2'].notnull()
airline_notnull = airline_data.loc[exp1 & exp2, :]
print(airline_notnull.shape)

# Discard invalid records: both fares zero, non-positive total mileage,
# or a zero average discount.
index1 = airline_notnull['SUM_YR_1'] != 0
index2 = airline_notnull['SUM_YR_2'] != 0
index3 = (airline_notnull['SEG_KM_SUM'] > 0) & (airline_notnull['avg_discount'] != 0)
airline = airline_notnull.loc[(index1 | index2) & index3, :]
print(airline.shape)

# Columns needed to build the clustering features.
airline_selection = airline[['FFP_DATE', 'LOAD_TIME', 'FLIGHT_COUNT',
                             'LAST_TO_END', 'avg_discount', 'SEG_KM_SUM']]

# L = membership length in months (observation date minus enrollment date).
# FIX: use the Timedelta accessor `.dt.days` instead of the fragile
# astype('str').str.split() round-trip the original used.
L = (pd.to_datetime(airline_selection['LOAD_TIME'])
     - pd.to_datetime(airline_selection['FFP_DATE'])).dt.days / 30

airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)

# Standardize the features and persist them for the clustering step.
from sklearn.preprocessing import StandardScaler
data = StandardScaler().fit_transform(airline_features)
np.savez('C:\\data\\airline_scale.npz', data)  # npz can hold multiple arrays

from sklearn.cluster import KMeans
airline_scale = np.load('C:\\data\\airline_scale.npz')['arr_0']
# FIX: the `n_jobs` argument was deprecated in scikit-learn 0.23 and removed
# in 0.25; passing it raises TypeError on current versions, so it is dropped.
kmeans_model = KMeans(n_clusters=5, random_state=123).fit(airline_scale)
# FIX: bare attribute expressions only echo in a REPL; print them in a script.
print(kmeans_model.cluster_centers_)
print(kmeans_model.labels_)
r1 = pd.Series(kmeans_model.labels_).value_counts()
print(r1)
# Exercise: K-Means clustering of basketball player stats, visualized as a scatter plot.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the data and drop the leading id column in one step.
players = pd.read_csv("C:\\data\\data.csv", encoding='gbk').iloc[:, 1:]

# Fit 5 clusters and report how many players fall into each.
model = KMeans(n_clusters=5, random_state=42).fit(players)
clusters = model.labels_
print(pd.Series(model.labels_).value_counts())

# Scatter plot colored by cluster assignment.
plt.scatter(players.iloc[:, 0], players.iloc[:, 1], c=clusters, marker='o')
plt.title("Kmeans-Basketball Data")
plt.xlabel("assists_per_minute")
plt.ylabel("points_per_minute")
plt.legend(["Rank"])
plt.show()
# Lab 1: load the credit-card data and drop invalid/inconsistent records.
import numpy as np
import pandas as pd

credit_card = pd.read_csv("C:\\data\\credit_card.csv", encoding='gbk')
print(credit_card.shape)

# Drop customers flagged bad on every indicator at once
# (逾期/呆账/强制停卡记录/退票/拒往记录 all 1, and 瑕疵户 == 2).
bad_all = ((credit_card['逾期'] == 1)
           & (credit_card['呆账'] == 1)
           & (credit_card['强制停卡记录'] == 1)
           & (credit_card['退票'] == 1)
           & (credit_card['拒往记录'] == 1)
           & (credit_card['瑕疵户'] == 2))
credit = credit_card.drop(labels=credit_card[bad_all].index, axis=0)
print(credit.shape)

# FIX: the original built these masks on `credit_card` (the pre-filter frame)
# and relied on pandas index alignment when indexing `credit`; build them on
# `credit` itself so the masks match the frame actually being filtered.
bad_combo = ((credit['呆账'] == 1)
             & (credit['强制停卡记录'] == 1)
             & (credit['退票'] == 1)
             & (credit['拒往记录'] == 2))
credit = credit.drop(labels=credit[bad_combo].index, axis=0)
print(credit.shape)

# Frequency of 5 with a monthly card-spend level other than 1 is inconsistent.
bad_freq = (credit['频率'] == 5) & (credit['月刷卡额'] != 1)
credit = credit.drop(labels=credit[bad_freq].index, axis=0)
print(credit.shape)
# Lab 2: split the cleaned data into three feature groups and standardize each,
# plus a combined, standardized feature table for overall clustering.
from sklearn.preprocessing import StandardScaler

# Historical credit-record indicators.
history = credit[['瑕疵户', '逾期', '呆账', '强制停卡记录', '退票', '拒往记录']]
# Economic-condition indicators.
economy = credit[['借款余额', '个人月收入', '个人月开销', '家庭月收入', '月刷卡额']]
# Income / background indicators.
income = credit[['职业', '年龄', '住家']]

# Standardize each group independently (each fit_transform refits the scaler).
scaler = StandardScaler()
historyScaler = scaler.fit_transform(history)
economyScaler = scaler.fit_transform(economy)
incomeScaler = scaler.fit_transform(income)

# All raw groups side by side, then standardized as one table.
credict_features = pd.concat([history, economy, income], axis=1)
credict_features_Scaler = StandardScaler().fit_transform(credict_features)
# Lab 3: K-Means on the combined features, then on each feature group separately.
from sklearn.cluster import KMeans

# 5 clusters on all standardized features; attach the labels to the raw table.
kmeans = KMeans(n_clusters=5, random_state=123).fit(credict_features_Scaler)
# FIX: bare `kmeans.cluster_centers_` / `value_counts()` expressions do nothing
# when run as a script (they only echo in a REPL) — print them explicitly.
print(kmeans.cluster_centers_)
print(pd.Series(kmeans.labels_).value_counts())
credict_features['label'] = kmeans.labels_

# Cluster each standardized feature group on its own for interpretation.
kmeans = KMeans(n_clusters=5, random_state=123).fit(historyScaler)
print(kmeans.cluster_centers_)
print(pd.Series(kmeans.labels_).value_counts())

kmeans = KMeans(n_clusters=5, random_state=123).fit(economyScaler)
print(kmeans.cluster_centers_)
print(pd.Series(kmeans.labels_).value_counts())

kmeans = KMeans(n_clusters=5, random_state=123).fit(incomeScaler)
print(kmeans.cluster_centers_)
print(pd.Series(kmeans.labels_).value_counts())

# Cluster interpretation (translated from the original notes):
# 2: no bad records, modest economic level, low spending
# 0: no bad records, modest economic level, high spending
# 1: poor credit history, low economic level, high spending
# 4: no bad records, high economic level, high spending
# 3: no bad records, high economic level, high spending
# ranking: 1 0 2 3 4
# 第七章 (Chapter 7)
# (CSDN page footer, not code) 最新推荐文章于 2022-11-15 19:18:47 发布