# coding=utf-8
"""
author: lei
function: cluster Instacart users by per-aisle purchase counts (PCA + KMeans).
"""
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# Load the raw Instacart tables.
order_product = pd.read_csv("./data/instacart/order_products__prior.csv")
products = pd.read_csv("./data/instacart/products.csv")
orders = pd.read_csv("./data/instacart/orders.csv")
aisles = pd.read_csv("./data/instacart/aisles.csv")

# Basic data preparation: join the tables so that every purchased product row
# carries its order's user_id and its product's aisle name.
# Each join uses a single shared key column (default inner join).
table1 = pd.merge(order_product, products, on="product_id")
table2 = pd.merge(table1, orders, on="order_id")
table = pd.merge(table2, aisles, on="aisle_id")
print(table.shape)
print(table.head())

# Cross-tabulation: one row per user_id, one column per aisle, each cell is
# the number of joined rows for that (user, aisle) pair.
data = pd.crosstab(table["user_id"], table["aisle"])
print(data.head())
print(data.shape)

# Keep only the first 1000 rows (positional slice) to limit the cost of the
# steps below.
new_data = data.iloc[:1000]
print(new_data.shape)

# Feature engineering: a float n_components in (0, 1) asks PCA to keep as many
# components as needed to explain that fraction of the variance (here 90%).
transform = PCA(n_components=0.9)
trans_data = transform.fit_transform(new_data)
print(trans_data.shape)

# Machine learning: partition the reduced samples into 5 clusters.
# NOTE(review): no random_state is set, so labels/scores may differ between
# runs; pass random_state=<int> if reproducibility is required.
estimator = KMeans(n_clusters=5)
y_pre = estimator.fit_predict(trans_data)

# Model evaluation: silhouette coefficient of the clustering, in [-1, 1];
# higher values indicate better-separated clusters.
print(silhouette_score(trans_data, y_pre))