#%%
# K-means聚类
# 汽车产品聚类分析
# 对数据进行聚类分析,并找到vokswagen汽车的相应竞品
# car_price.csv, 数据包括了205款车的26个字段
#%%
import pandas as pd
# Load the car price table from the local CSV file.
csv_path = r'E:\天池学习赛\汽车聚类分析\car_price.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)
# Leave the frame as a bare expression so an interactive cell can render it.
data
#%%
# 使用kmeans进行聚类,导入库
from sklearn.cluster import KMeans
# 进行数据预处理的库
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# 第三方矩阵库
import numpy as np
#%%
# Build the feature table used for clustering.
feature_columns = [
    "car_ID", "symboling", "CarName", "fueltype", "aspiration", "doornumber",
    "carbody", "drivewheel", "enginelocation", "wheelbase", "carlength", "carwidth",
    "carheight", "curbweight", "enginetype", "cylindernumber", "enginesize", "fuelsystem",
    "boreratio", "stroke", "compressionratio", "horsepower", "peakrpm", "citympg",
    "highwaympg", "price",
]
# NOTE(review): "car_ID" looks like a row identifier rather than a property of
# the car; confirm it is really meant to take part in the clustering.
# Take an explicit copy: columns of train_x are overwritten in place later on,
# and assigning into a selection of `data` raises SettingWithCopyWarning.
train_x = data[feature_columns].copy()
train_x
#%%
# Convert the non-numeric columns to integer codes.
le = LabelEncoder()
columns = ['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem']
for column in columns:
    # fit_transform refits the encoder on each call, so every column gets
    # its own value-to-code mapping.
    train_x[column] = le.fit_transform(train_x[column])
# Preview the encoded features: the loop writes to train_x, so that is the
# frame that shows the result of this step.
train_x.head()
#%%
# Rescale the features with a min-max scaler (default settings).
min_max_scaler = preprocessing.MinMaxScaler()
# Fit on the feature table, then replace train_x with the rescaled values.
train_x = min_max_scaler.fit(train_x).transform(train_x)
# Export the normalised data to a file.
normalized_frame = pd.DataFrame(train_x)
normalized_frame.to_csv('temp.csv', index=False)
#%%
# Choose the number of clusters by plotting SSE against k and looking for
# the point where the curve bends.
import matplotlib.pyplot as plt

# Candidate cluster counts; shared by the fitting loop and the plot's x axis.
x = range(1, 11)
sse = []
for k in x:
    # Fixed seed and explicit n_init so the curve is identical on every run
    # and does not depend on the installed scikit-learn's n_init default.
    Kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    Kmeans.fit(train_x)
    # inertia_ is the within-cluster sum of squared errors for this k.
    sse.append(Kmeans.inertia_)

# The original cell used the IPython magic `%matplotlib inline` here. It is
# not Python syntax, so the figure is shown with plt.show() instead; run the
# magic manually in an IPython session if figures do not render inline.
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(x, sse, 'o-')
plt.show()
#%%
# Cluster the normalised features into 5 groups with K-means.
# K-means numbers its clusters arbitrarily; the fixed seed (with an explicit
# n_init) keeps the numbering the same from run to run, which the cells
# that inspect specific clusters rely on.
Kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
# Fit the model and get the cluster index of every row in one step.
predict_y = Kmeans.fit_predict(train_x)
predict_y
#%%
# Attach the cluster assignment to the original data as one extra column
# (axis=1 concatenates column-wise).
cluster_column = pd.Series(predict_y, name=u'聚类结果')
result = pd.concat([data, cluster_column], axis=1)
result
data
#%%
# Cluster label(s) of the rows whose CarName contains 'vokswagen'.
# regex=False makes this a plain substring test, and na=False treats a
# missing name as "no match" instead of putting NaN into the boolean mask.
# NOTE(review): 'vokswagen' is kept exactly as written; confirm whether the
# data also uses other spellings (e.g. 'volkswagen') that should be matched.
is_reference_car = result['CarName'].str.contains('vokswagen', regex=False, na=False)
label = result.loc[is_reference_car, '聚类结果']
label
#%%
# Sedans that fall in the same cluster(s) as the reference cars, most
# expensive first.
# The cluster ids are taken from `label` rather than a hard-coded number:
# K-means numbers its clusters arbitrarily, so a fixed id is not guaranteed
# to be the reference cars' cluster.
same_cluster = result['聚类结果'].isin(label.unique())
# Vectorised substring test; na=False keeps a missing carbody from raising.
is_sedan = result['carbody'].str.contains('sedan', regex=False, na=False)
result.loc[
    same_cluster & is_sedan,
    ['CarName', 'wheelbase', 'price', 'horsepower', 'carbody', 'fueltype', '聚类结果'],
].sort_values('price', ascending=False)
#%%
# Competing models: the names of all cars in the same cluster(s) as the
# reference cars (the reference cars themselves are included).
# The cluster ids come from `label` instead of a hard-coded number because
# K-means cluster numbering is arbitrary.
benchmark = result.loc[result['聚类结果'].isin(label.unique()), 'CarName']
# Message typo fixed: '竞平车' -> '竞品车' (competing models).
print('竞品车如下所示')
benchmark
# Source page title: 天池-汽车聚类分析
# Page footer captured with the code: 最新推荐文章于 2023-10-12 13:40:01 发布