import pandas as pd
from sklearn.decomposition import PCA
# Load the "prior" order line items. The preview recorded below shows the
# columns order_id, product_id, add_to_cart_order and reordered.
prior = pd.read_csv(
    filepath_or_buffer="C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/order_products__prior.csv",
)
prior.head()
| order_id | product_id | add_to_cart_order | reordered |
---|
0 | 2 | 33120 | 1 | 1 |
---|
1 | 2 | 28985 | 2 | 1 |
---|
2 | 2 | 9327 | 3 | 0 |
---|
3 | 2 | 45918 | 4 | 1 |
---|
4 | 2 | 30035 | 5 | 0 |
---|
# Load the product catalogue. The preview recorded below shows the columns
# product_id, product_name, aisle_id and department_id.
product = pd.read_csv(
    filepath_or_buffer="C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/products.csv",
)
product.head()
| product_id | product_name | aisle_id | department_id |
---|
0 | 1 | Chocolate Sandwich Cookies | 61 | 19 |
---|
1 | 2 | All-Seasons Salt | 104 | 13 |
---|
2 | 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
---|
3 | 4 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
---|
4 | 5 | Green Chile Anytime Sauce | 5 | 13 |
---|
# Load the order headers. The preview recorded below shows order_id, user_id,
# eval_set, order_number, order_dow, order_hour_of_day and
# days_since_prior_order (NaN in the first previewed row).
orders = pd.read_csv(
    filepath_or_buffer="C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/orders.csv",
)
orders.head()
| order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
---|
0 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN |
---|
1 | 2398795 | 1 | prior | 2 | 3 | 7 | 15.0 |
---|
2 | 473747 | 1 | prior | 3 | 3 | 12 | 21.0 |
---|
3 | 2254736 | 1 | prior | 4 | 4 | 7 | 29.0 |
---|
4 | 431534 | 1 | prior | 5 | 4 | 15 | 28.0 |
---|
# Load the aisle lookup table. The preview recorded below shows the columns
# aisle_id and aisle (the aisle's text label).
aisles = pd.read_csv(
    filepath_or_buffer="C:/01-工作相关/0002-钉钉下载/数据集/instacart-market-basket-analysis/instacart-market-basket-analysis/aisles.csv",
)
aisles.head()
| aisle_id | aisle |
---|
0 | 1 | prepared soups salads |
---|
1 | 2 | specialty cheeses |
---|
2 | 3 | energy granola bars |
---|
3 | 4 | instant foods |
---|
4 | 5 | marinades meat preparation |
---|
# Attach product attributes (product_name, aisle_id, department_id) to every
# order line item. pd.merge defaults to an inner join, so line items whose
# product_id is missing from `product` are dropped.
# The join key is given once: listing "product_id" twice made pandas build a
# redundant two-column composite key over the whole line-item table.
_mg = pd.merge(prior, product, on="product_id")
_mg.head()
| order_id | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id |
---|
0 | 2 | 33120 | 1 | 1 | Organic Egg Whites | 86 | 16 |
---|
1 | 26 | 33120 | 5 | 0 | Organic Egg Whites | 86 | 16 |
---|
2 | 120 | 33120 | 13 | 0 | Organic Egg Whites | 86 | 16 |
---|
3 | 327 | 33120 | 5 | 1 | Organic Egg Whites | 86 | 16 |
---|
4 | 390 | 33120 | 28 | 1 | Organic Egg Whites | 86 | 16 |
---|
# Attach order-level attributes (user_id, eval_set, order_number, ...) to
# every line item so each purchase can be attributed to a user. pd.merge
# defaults to an inner join.
# The join key is given once: listing "order_id" twice made pandas build a
# redundant two-column composite key over the whole line-item table.
_mg2 = pd.merge(_mg, orders, on="order_id")
_mg2.head()
| order_id | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
---|
0 | 2 | 33120 | 1 | 1 | Organic Egg Whites | 86 | 16 | 202279 | prior | 3 | 5 | 9 | 8.0 |
---|
1 | 2 | 28985 | 2 | 1 | Michigan Organic Kale | 83 | 4 | 202279 | prior | 3 | 5 | 9 | 8.0 |
---|
2 | 2 | 9327 | 3 | 0 | Garlic Powder | 104 | 13 | 202279 | prior | 3 | 5 | 9 | 8.0 |
---|
3 | 2 | 45918 | 4 | 1 | Coconut Butter | 19 | 13 | 202279 | prior | 3 | 5 | 9 | 8.0 |
---|
4 | 2 | 30035 | 5 | 0 | Natural Sweetener | 17 | 13 | 202279 | prior | 3 | 5 | 9 | 8.0 |
---|
# Attach the aisle's text label to every line item; `mt` is the fully joined
# table used for the per-user aisle counts. pd.merge defaults to an inner
# join.
# The join key is given once: listing "aisle_id" twice made pandas build a
# redundant two-column composite key over the whole line-item table.
mt = pd.merge(_mg2, aisles, on="aisle_id")
mt.head(10)
| order_id | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | aisle |
---|
0 | 2 | 33120 | 1 | 1 | Organic Egg Whites | 86 | 16 | 202279 | prior | 3 | 5 | 9 | 8.0 | eggs |
---|
1 | 26 | 33120 | 5 | 0 | Organic Egg Whites | 86 | 16 | 153404 | prior | 2 | 0 | 16 | 7.0 | eggs |
---|
2 | 120 | 33120 | 13 | 0 | Organic Egg Whites | 86 | 16 | 23750 | prior | 11 | 6 | 8 | 10.0 | eggs |
---|
3 | 327 | 33120 | 5 | 1 | Organic Egg Whites | 86 | 16 | 58707 | prior | 21 | 6 | 9 | 8.0 | eggs |
---|
4 | 390 | 33120 | 28 | 1 | Organic Egg Whites | 86 | 16 | 166654 | prior | 48 | 0 | 12 | 9.0 | eggs |
---|
5 | 537 | 33120 | 2 | 1 | Organic Egg Whites | 86 | 16 | 180135 | prior | 15 | 2 | 8 | 3.0 | eggs |
---|
6 | 582 | 33120 | 7 | 1 | Organic Egg Whites | 86 | 16 | 193223 | prior | 6 | 2 | 19 | 10.0 | eggs |
---|
7 | 608 | 33120 | 5 | 1 | Organic Egg Whites | 86 | 16 | 91030 | prior | 11 | 3 | 21 | 12.0 | eggs |
---|
8 | 623 | 33120 | 1 | 1 | Organic Egg Whites | 86 | 16 | 37804 | prior | 63 | 3 | 12 | 3.0 | eggs |
---|
9 | 689 | 33120 | 4 | 1 | Organic Egg Whites | 86 | 16 | 108932 | prior | 16 | 1 | 13 | 3.0 | eggs |
---|
# Build the user x aisle frequency table: each cell counts how many rows of
# `mt` pair that user_id with that aisle. The recorded output shows one row
# per user_id and 134 aisle columns.
user_ids = mt["user_id"]
aisle_labels = mt["aisle"]
cross = pd.crosstab(index=user_ids, columns=aisle_labels)
cross.head(10)
aisle | air fresheners candles | asian foods | baby accessories | baby bath body care | baby food formula | bakery desserts | baking ingredients | baking supplies decor | beauty | beers coolers | ... | spreads | tea | tofu meat alternatives | tortillas flat bread | trail mix snack mix | trash bags liners | vitamins supplements | water seltzer sparkling water | white wines | yogurt |
---|
user_id | | | | | | | | | | | | | | | | | | | | | |
---|
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
2 | 0 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | ... | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 42 |
---|
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 |
---|
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
5 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
---|
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
---|
7 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
---|
8 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
---|
9 | 0 | 0 | 0 | 0 | 6 | 0 | 2 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 19 |
---|
10 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
---|
10 rows × 134 columns
# Reduce the aisle-count features with PCA. A float n_components in (0, 1)
# asks scikit-learn to keep as many components as are needed to explain that
# fraction of the variance (here 90%); the recorded output shows 27 components
# retained out of 134 original columns.
pca = PCA(n_components=0.9)
data = pca.fit_transform(X=cross)
data
array([[-2.42156587e+01, 2.42942720e+00, -2.46636975e+00, ...,
6.86800336e-01, 1.69439402e+00, -2.34323022e+00],
[ 6.46320806e+00, 3.67511165e+01, 8.38255336e+00, ...,
4.12121252e+00, 2.44689740e+00, -4.28348478e+00],
[-7.99030162e+00, 2.40438257e+00, -1.10300641e+01, ...,
1.77534453e+00, -4.44194030e-01, 7.86665571e-01],
...,
[ 8.61143331e+00, 7.70129866e+00, 7.95240226e+00, ...,
-2.74252456e+00, 1.07112531e+00, -6.31925661e-02],
[ 8.40862199e+01, 2.04187340e+01, 8.05410372e+00, ...,
7.27554259e-01, 3.51339470e+00, -1.79079914e+01],
[-1.39534562e+01, 6.64621821e+00, -5.23030367e+00, ...,
8.25329076e-01, 1.38230701e+00, -2.41942061e+00]])
# Inspect the dimensions of the PCA-reduced array (rows, retained components).
data.shape
(206209, 27)
# Inspect the dimensions of the original user x aisle table for comparison
# with the reduced array.
cross.shape
(206209, 134)