机器学习算法基础案例1:instacart

import pandas as pd
from sklearn.decomposition import PCA
# Load the data of the four tables; this cell reads the prior order/product rows.
prior = pd.read_csv(
    "C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/order_products__prior.csv"
)
prior.head(n=5)  # preview the first five rows
   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0
# Read the product catalogue (product id, name, aisle id, department id).
product = pd.read_csv(
    "C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/products.csv"
)
product.head(n=5)  # preview the first five rows
   product_id                                       product_name  aisle_id  department_id
0           1                         Chocolate Sandwich Cookies        61             19
1           2                                   All-Seasons Salt       104             13
2           3               Robust Golden Unsweetened Oolong Tea        94              7
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...        38              1
4           5                          Green Chile Anytime Sauce         5             13
# Read the orders table (one row per order, keyed by order_id).
orders = pd.read_csv(
    "C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/orders.csv"
)
orders.head(n=5)  # preview the first five rows
   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  days_since_prior_order
0   2539329        1    prior             1          2                  8                     NaN
1   2398795        1    prior             2          3                  7                    15.0
2    473747        1    prior             3          3                 12                    21.0
3   2254736        1    prior             4          4                  7                    29.0
4    431534        1    prior             5          4                 15                    28.0
# Read the aisle lookup table (aisle id -> aisle name).
aisles = pd.read_csv(
    "C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/aisles.csv"
)
aisles.head(n=5)  # preview the first five rows
   aisle_id                       aisle
0         1       prepared soups salads
1         2           specialty cheeses
2         3         energy granola bars
3         4               instant foods
4         5  marinades meat preparation
# Attach product attributes to every prior order line by joining on the
# shared product_id column (pd.merge defaults to an inner join). The key is
# named once; repeating it in the `on` list adds nothing to the join.
_mg = pd.merge(prior, product, on="product_id")
_mg.head()
   order_id  product_id  add_to_cart_order  reordered        product_name  aisle_id  department_id
0         2       33120                  1          1  Organic Egg Whites        86             16
1        26       33120                  5          0  Organic Egg Whites        86             16
2       120       33120                 13          0  Organic Egg Whites        86             16
3       327       33120                  5          1  Organic Egg Whites        86             16
4       390       33120                 28          1  Organic Egg Whites        86             16
# Add order-level columns to each order line by joining on the shared
# order_id column (pd.merge defaults to an inner join). The key is named
# once; repeating it in the `on` list adds nothing to the join.
_mg2 = pd.merge(_mg, orders, on="order_id")
_mg2.head()
   order_id  product_id  add_to_cart_order  reordered        product_name  aisle_id  department_id  user_id eval_set  order_number  order_dow  order_hour_of_day  days_since_prior_order
0         2       33120                  1          1  Organic Egg Whites        86             16   202279    prior             3          5                  9                     8.0
1         2       28985                  2          1  Michigan Organic Kale     83              4   202279    prior             3          5                  9                     8.0
2         2        9327                  3          0       Garlic Powder       104             13   202279    prior             3          5                  9                     8.0
3         2       45918                  4          1      Coconut Butter        19             13   202279    prior             3          5                  9                     8.0
4         2       30035                  5          0   Natural Sweetener        17             13   202279    prior             3          5                  9                     8.0
# Add the aisle name to each order line by joining on the shared aisle_id
# column (pd.merge defaults to an inner join). The key is named once;
# repeating it in the `on` list adds nothing to the join.
mt = pd.merge(_mg2, aisles, on="aisle_id")
mt.head(10)
order_idproduct_idadd_to_cart_orderreorderedproduct_nameaisle_iddepartment_iduser_ideval_setorder_numberorder_doworder_hour_of_daydays_since_prior_orderaisle
023312011Organic Egg Whites8616202279prior3598.0eggs
1263312050Organic Egg Whites8616153404prior20167.0eggs
212033120130Organic Egg Whites861623750prior116810.0eggs
33273312051Organic Egg Whites861658707prior21698.0eggs
439033120281Organic Egg Whites8616166654prior480129.0eggs
55373312021Organic Egg Whites8616180135prior15283.0eggs
65823312071Organic Egg Whites8616193223prior621910.0eggs
76083312051Organic Egg Whites861691030prior1132112.0eggs
86233312011Organic Egg Whites861637804prior633123.0eggs
96893312041Organic Egg Whites8616108932prior161133.0eggs
# Cross-tabulation, a specialised grouping tool: one row per user_id, one
# column per aisle, each cell counting the matching rows of `mt`.
cross = pd.crosstab(index=mt["user_id"], columns=mt["aisle"])
cross.head(n=10)
aisleair fresheners candlesasian foodsbaby accessoriesbaby bath body carebaby food formulabakery dessertsbaking ingredientsbaking supplies decorbeautybeers coolers...spreadsteatofu meat alternativestortillas flat breadtrail mix snack mixtrash bags linersvitamins supplementswater seltzer sparkling waterwhite winesyogurt
user_id
10000000000...1000000001
20300002000...31100002042
30000000000...4100000200
40000000000...0001000100
50200000000...0000000003
60000000000...0000000000
70000002000...0000000005
80100001000...0000000000
90000602000...00000002019
100100000000...0000000002

10 rows × 134 columns

# Run principal component analysis on the user-by-aisle table.
# A float n_components in (0, 1) tells scikit-learn to keep as many
# components as are needed to explain that fraction of the variance.
pca = PCA(n_components=0.9)
data = pca.fit_transform(X=cross)
data
array([[-2.42156587e+01,  2.42942720e+00, -2.46636975e+00, ...,
         6.86800336e-01,  1.69439402e+00, -2.34323022e+00],
       [ 6.46320806e+00,  3.67511165e+01,  8.38255336e+00, ...,
         4.12121252e+00,  2.44689740e+00, -4.28348478e+00],
       [-7.99030162e+00,  2.40438257e+00, -1.10300641e+01, ...,
         1.77534453e+00, -4.44194030e-01,  7.86665571e-01],
       ...,
       [ 8.61143331e+00,  7.70129866e+00,  7.95240226e+00, ...,
        -2.74252456e+00,  1.07112531e+00, -6.31925661e-02],
       [ 8.40862199e+01,  2.04187340e+01,  8.05410372e+00, ...,
         7.27554259e-01,  3.51339470e+00, -1.79079914e+01],
       [-1.39534562e+01,  6.64621821e+00, -5.23030367e+00, ...,
         8.25329076e-01,  1.38230701e+00, -2.41942061e+00]])
# Shape of the array returned by pca.fit_transform.
data.shape
(206209, 27)
# Shape of the crosstab that was fed into PCA, for comparison.
cross.shape
(206209, 134)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

weixin_47049321

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值