基于历史各项数据预计未来三个月销售额情况
import os. path
import pandas as pd
import numpy as np
import xgboost as xgb
base_path = "./data/"
数据读取
navigation = pd. read_csv( os. path. join( base_path, 'navigation.csv' ) )
sales = pd. read_csv( os. path. join( base_path, 'sales.csv' ) )
train = pd. read_csv( os. path. join( base_path, 'train.csv' ) )
test = pd. read_csv( os. path. join( base_path, 'test.csv' ) )
vimages = pd. read_csv( os. path. join( base_path, 'vimages.csv' ) )
test. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line macro_material color fr_FR_price en_US_description 0 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_1 1 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c Accessories Women TEXTILES SCARVES WINTER SCARVES ECHARPE REYKJAVIK SANS LIGNE ESTHETIQUE TEXTILE NaN 650.0 In pure, soft cashmere, the Reykjavik scarf is... 1 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_1 1 26b2c4f6281482cccf1e748ef388f1649ecf1c8b Leather Goods Women CITY BAGS DAILY BAGS HANDBAGS VERY ZIPPED TOTE VERY LEATHER KAKI FANGO COQUILLE 2550.0 For Fall-Winter 2018, the Very Zipped Tote han... 2 00aad36ebc04aaa761df219c1b48d1e6162d28bb_1 1 00aad36ebc04aaa761df219c1b48d1e6162d28bb Leather Goods Women CITY BAGS DAILY BAGS CROSS SHOULDER BAGS HINA MAHINA LEATHER NOIR 2460.0 The Hina is two bags in one: gather in the sid... 3 8281636e7d162c92c1a3c1f9e40afb05d39a02f9_1 1 8281636e7d162c92c1a3c1f9e40afb05d39a02f9 Accessories Men TEXTILES STOLES WINTER STOLES ETOLE MONOGRAM NIGHTFALL SANS LIGNE ESTHETIQUE TEXTILE NaN 455.0 With its subtle tone-on-tone Monogram pattern ... 4 b76f8505700409a198fe69fb33e5d6cc01166c9b_1 1 b76f8505700409a198fe69fb33e5d6cc01166c9b Accessories Men TIES TIES OTHER TIES CRAVATE MONOGRAM ILLUSION SANS LIGNE ESTHETIQUE TEXTILE NaN 150.0 Suitable both for business and special occasio...
vimages. head( )
sku_hash dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 dim_8 dim_9 ... dim_2039 dim_2040 dim_2041 dim_2042 dim_2043 dim_2044 dim_2045 dim_2046 dim_2047 dim_2048 0 3f6eba29c2c71cc804edb9a4a88760a273dd4348 0.000000 0.165251 0.017061 0.0 0.097332 0.000000 0.008793 0.064529 0.067048 ... 0.075003 0.000000 0.319221 0.756835 0.286287 0.000000 0.000000 0.000000 1.051608 0.677220 1 d7681062d1826c06aeca716e0aff6d5c096ac6d8 0.126899 0.000000 0.041690 0.0 0.600556 0.556608 0.608440 0.896061 0.011975 ... 0.000000 0.010133 0.325223 0.006643 0.169298 0.317276 0.308293 0.086837 1.122189 0.074612 2 c594b658ce0f17ea594a50426dcf69468b83bdef 0.218891 0.444878 0.023655 0.0 0.933365 0.000000 0.413172 0.020560 0.010038 ... 0.075254 0.000000 0.037003 0.404053 0.301876 0.049935 0.031618 0.138646 0.697306 0.075182 3 5b7dbbfcc0eba1037e6ab3c9e5cf3ea819deba65 0.002935 0.000000 0.504765 0.0 0.391393 0.159187 2.894631 0.708697 0.485973 ... 0.703062 0.000000 0.002509 0.000000 0.441945 0.850916 0.043980 0.000000 0.112908 0.618085 4 3d3487afe62434c742af37cf935a726f5bb16251 0.000000 0.030289 0.084332 0.0 0.416195 0.000000 0.487941 0.154408 0.052589 ... 0.054207 0.000000 0.000000 0.000000 0.118480 0.000000 0.000000 0.073685 0.000000 0.114883
5 rows × 2049 columns
train. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line macro_material color fr_FR_price en_US_description target 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 Leather Goods Women LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR CANVAS AZUR 370.0 Fashioned from summery Damier Azur canvas, thi... 1366.0 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 Accessories Women FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE CANVAS NaN 295.0 This collectible piece features our iconic Pet... 526.0 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA LEATHER GALET 2660.0 The perfect summer companion, this effortlessl... 503.0 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA LEATHER MAGNOLIA 2660.0 The perfect summer companion, this effortlessl... 276.0 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 Accessories Men FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE NON PRECIOUS METAL NaN 325.0 This distinctive charm necklace reinterprets d... 206.0
navigation. head( )
Date sku_hash day_visit month_visit website_version_zone_number website_version_country_number traffic_source page_views addtocart 0 Day 6 b6cb56a971c9ffeeaf1833202ab453b51ab680a9 Wednesday 9 5.0 32.0 Source 1 105 0 1 Day 4 9c5060f9fb648cdfafd66f170af517f4de0a9162 Monday 9 5.0 32.0 Source 1 314 0 2 Day 4 da1549af83c813e5b53a4e4c0d588fa0099e3d3d Monday 10 5.0 32.0 Source 1 57 0 3 Day 6 477ac04baf2a484f441ffe688abc89bb46a1c21d Wednesday 10 5.0 32.0 Source 1 547 0 4 Day 4 799bd8691fe0be0137173eadd5e79d033a389bd8 Monday 10 5.0 32.0 Source 2 43 0
sales. head( )
Date day_transaction_date Month_transaction type zone_number country_number name sku_hash sales_quantity currency_rate_USD ... NetSentiment_6_day_before PositiveSentiment_6_day_before NegativeSentiment_6_day_before Impressions_6_day_before TotalBuzzPost_7_day_before TotalBuzz_7_day_before NetSentiment_7_day_before PositiveSentiment_7_day_before NegativeSentiment_7_day_before Impressions_7_day_before 0 Day_7 Thursday 4 Type_2 4 43 Name_1 84d4e90b67c2b9902cff522be4eb121f637c17b5 5 1.225636 ... 77.525599 33634.0 4258.0 643226023.0 68960.0 107970.0 70.473656 32032.0 5548.0 631477319.0 1 Day_6 Wednesday 4 Type_1 4 20 Name_1 84d4e90b67c2b9902cff522be4eb121f637c17b5 5 1.228951 ... 70.473656 32032.0 5548.0 631477319.0 65312.0 102920.0 71.788703 34277.0 5629.0 730132649.0 2 Day_1 Monday 1 Type_2 2 65 Name_1 70f3dd559074ecfc9d69ff64480bc8acae977e3a 5 1.201496 ... 46.158532 27594.0 10165.0 359205211.0 45746.0 64881.0 73.083627 21349.0 3320.0 268925766.0 3 Day_7 Sunday 1 Type_1 3 49 Name_1 70f3dd559074ecfc9d69ff64480bc8acae977e3a 5 1.204091 ... 72.438611 16292.0 2604.0 307167067.0 42796.0 61653.0 68.744442 20872.0 3866.0 323820589.0 4 Day_5 Tuesday 2 Type_1 4 20 Name_1 1f8b7a2b42d4cdf5eef70f75fb86a34d514d2747 5 1.238391 ... 72.776593 28344.0 4466.0 440849942.0 60653.0 90848.0 75.338753 26527.0 3731.0 393463263.0
5 rows × 97 columns
获取不同颜色产品的平均指标
product_descriptor = [ 'product_type' , 'product_gender' , 'macro_function' ,
'function' , 'sub_function' , 'model' , 'aesthetic_sub_line' , 'macro_material' ,
'month' ]
product_target_sum = train. groupby( product_descriptor) [ 'target' ] . sum ( ) . reset_index( name = 'sum_target' )
product_target_count = train. groupby( product_descriptor) [ 'target' ] . count( ) . reset_index( name = 'count_target' )
product_target_stats = pd. merge( product_target_sum, product_target_count, on= product_descriptor)
train = train. merge( product_target_stats, on= product_descriptor, how= 'left' )
test = test. merge( product_target_stats, on= product_descriptor, how= 'left' )
train[ 'mean_target' ] = ( train[ 'sum_target' ] - train[ 'target' ] ) / ( train[ 'count_target' ] - 1 )
test[ 'mean_target' ] = test[ 'sum_target' ] / test[ 'count_target' ]
train. drop( [ 'count_target' , 'sum_target' ] , axis= 1 )
test. drop( [ 'count_target' , 'sum_target' ] , axis= 1 ) . head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line macro_material color fr_FR_price en_US_description mean_target 0 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_1 1 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c Accessories Women TEXTILES SCARVES WINTER SCARVES ECHARPE REYKJAVIK SANS LIGNE ESTHETIQUE TEXTILE NaN 650.0 In pure, soft cashmere, the Reykjavik scarf is... 782.0 1 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_1 1 26b2c4f6281482cccf1e748ef388f1649ecf1c8b Leather Goods Women CITY BAGS DAILY BAGS HANDBAGS VERY ZIPPED TOTE VERY LEATHER KAKI FANGO COQUILLE 2550.0 For Fall-Winter 2018, the Very Zipped Tote han... 179.0 2 00aad36ebc04aaa761df219c1b48d1e6162d28bb_1 1 00aad36ebc04aaa761df219c1b48d1e6162d28bb Leather Goods Women CITY BAGS DAILY BAGS CROSS SHOULDER BAGS HINA MAHINA LEATHER NOIR 2460.0 The Hina is two bags in one: gather in the sid... NaN 3 8281636e7d162c92c1a3c1f9e40afb05d39a02f9_1 1 8281636e7d162c92c1a3c1f9e40afb05d39a02f9 Accessories Men TEXTILES STOLES WINTER STOLES ETOLE MONOGRAM NIGHTFALL SANS LIGNE ESTHETIQUE TEXTILE NaN 455.0 With its subtle tone-on-tone Monogram pattern ... NaN 4 b76f8505700409a198fe69fb33e5d6cc01166c9b_1 1 b76f8505700409a198fe69fb33e5d6cc01166c9b Accessories Men TIES TIES OTHER TIES CRAVATE MONOGRAM ILLUSION SANS LIGNE ESTHETIQUE TEXTILE NaN 150.0 Suitable both for business and special occasio... NaN
train. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line macro_material color fr_FR_price en_US_description target sum_target count_target mean_target 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 Leather Goods Women LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR CANVAS AZUR 370.0 Fashioned from summery Damier Azur canvas, thi... 1366.0 1837.0 2 471.000000 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 Accessories Women FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE CANVAS NaN 295.0 This collectible piece features our iconic Pet... 526.0 526.0 1 NaN 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA LEATHER GALET 2660.0 The perfect summer companion, this effortlessl... 503.0 1395.0 4 297.333333 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA LEATHER MAGNOLIA 2660.0 The perfect summer companion, this effortlessl... 276.0 1395.0 4 373.000000 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 Accessories Men FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE NON PRECIOUS METAL NaN 325.0 This distinctive charm necklace reinterprets d... 206.0 206.0 1 NaN
train. drop( [ 'count_target' , 'sum_target' ] , axis= 1 , inplace = True )
test. drop( [ 'count_target' , 'sum_target' ] , axis= 1 , inplace = True )
train. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line macro_material color fr_FR_price en_US_description target mean_target 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 Leather Goods Women LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR CANVAS AZUR 370.0 Fashioned from summery Damier Azur canvas, thi... 1366.0 471.000000 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 Accessories Women FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE CANVAS NaN 295.0 This collectible piece features our iconic Pet... 526.0 NaN 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA LEATHER GALET 2660.0 The perfect summer companion, this effortlessl... 503.0 297.333333 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA LEATHER MAGNOLIA 2660.0 The perfect summer companion, this effortlessl... 276.0 373.000000 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 Accessories Men FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE NON PRECIOUS METAL NaN 325.0 This distinctive charm necklace reinterprets d... 206.0 NaN
统计同款的数量
count_vec_cols = [ 'macro_function' , 'function' , 'sub_function' , 'model' ,
'aesthetic_sub_line' , 'macro_material' , 'color' ]
for col in count_vec_cols:
tmp = pd. DataFrame( { 'sku_hash' : pd. concat( [ train[ 'sku_hash' ] , test[ 'sku_hash' ] ] ) , col: pd. concat( [ train[ col] , test[ col] ] ) } )
tmp = pd. DataFrame( tmp. groupby( col) [ 'sku_hash' ] . count( ) ) . reset_index( )
tmp. columns = [ col, col+ '_count' ]
train = train. merge( tmp, on= col, how= 'left' )
test = test. merge( tmp, on= col, how= 'left' )
train. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line ... en_US_description target mean_target macro_function_count function_count sub_function_count model_count aesthetic_sub_line_count macro_material_count color_count 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 Leather Goods Women LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR ... Fashioned from summery Damier Azur canvas, thi... 1366.0 471.000000 2292 2094 447 90 126 3213 120.0 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 Accessories Women FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE ... This collectible piece features our iconic Pet... 526.0 NaN 546 126 102 3 3477 3213 NaN 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... The perfect summer companion, this effortlessl... 503.0 297.333333 3039 2784 699 15 90 4089 39.0 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... The perfect summer companion, this effortlessl... 276.0 373.000000 3039 2784 699 15 90 4089 60.0 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 Accessories Men FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE ... This distinctive charm necklace reinterprets d... 206.0 NaN 843 171 162 3 3477 1023 NaN
5 rows × 23 columns
统计不同访问源的数量
traffic_source_views = navigation. groupby( [ 'sku_hash' , 'traffic_source' ] ) [ 'page_views' ] . sum ( ) . reset_index( )
traffic_source_views[ : 5 ]
sku_hash traffic_source page_views 0 000cb631113e2f54ca5512139a6592e9584957aa Source 1 1748 1 000cb631113e2f54ca5512139a6592e9584957aa Source 2 40 2 000cb631113e2f54ca5512139a6592e9584957aa Source 5 30 3 000cb631113e2f54ca5512139a6592e9584957aa Source 6 48 4 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 Source 1 16300
traffic_source_views = traffic_source_views. pivot( index = 'sku_hash' , columns = 'traffic_source' , values = 'page_views' ) . reset_index( )
traffic_source_views. columns = [ 'sku_hash' ,
'page_views_nav1' , 'page_views_nav2' , 'page_views_nav3' ,
'page_views_nav4' , 'page_views_nav5' , 'page_views_nav6' ]
traffic_source_views. head( )
sku_hash page_views_nav1 page_views_nav2 page_views_nav3 page_views_nav4 page_views_nav5 page_views_nav6 0 000cb631113e2f54ca5512139a6592e9584957aa 1748.0 40.0 NaN NaN 30.0 48.0 1 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 16300.0 286.0 NaN 25.0 93.0 25.0 2 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 147.0 5.0 NaN NaN 5.0 NaN 3 00287bbb94c12066df6491dccd744ee87ff01a90 3473.0 15.0 NaN 25.0 NaN NaN 4 003f8a76cb823eb7c58b6d052c57a8933f9275fd 47468.0 251.0 NaN 55.0 NaN 10.0
统计不同类型的销售数量
type_sales = sales. groupby( [ 'sku_hash' , 'type' ] ) [ 'sales_quantity' ] . sum ( ) . reset_index( )
type_sales = type_sales. pivot( index = 'sku_hash' , columns = 'type' , values = 'sales_quantity' ) . reset_index( )
type_sales. columns = [ 'sku_hash' , 'sales_quantity_type1' , 'sales_quantity_type2' ]
type_sales. head( )
sku_hash sales_quantity_type1 sales_quantity_type2 0 000cb631113e2f54ca5512139a6592e9584957aa 104.0 10.0 1 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 325.0 40.0 2 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 85.0 25.0 3 00287bbb94c12066df6491dccd744ee87ff01a90 35.0 10.0 4 003f8a76cb823eb7c58b6d052c57a8933f9275fd 253.0 145.0
统计不同地区情况
zone_sales = sales. groupby( [ 'sku_hash' , 'zone_number' ] ) [ 'sales_quantity' ] . sum ( ) . reset_index( )
zone_sales = zone_sales. pivot( index = 'sku_hash' , columns = 'zone_number' , values = 'sales_quantity' ) . reset_index( )
zone_sales. columns = [ 'sku_hash' ,
'sales_quantity_zone1' , 'sales_quantity_zone2' , 'sales_quantity_zone3' ,
'sales_quantity_zone4' , 'sales_quantity_zone5' ]
zone_sales. head( )
sku_hash sales_quantity_zone1 sales_quantity_zone2 sales_quantity_zone3 sales_quantity_zone4 sales_quantity_zone5 0 000cb631113e2f54ca5512139a6592e9584957aa 20.0 5.0 15.0 50.0 24.0 1 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 NaN NaN 82.0 20.0 263.0 2 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 55.0 35.0 10.0 NaN 10.0 3 00287bbb94c12066df6491dccd744ee87ff01a90 10.0 NaN 5.0 30.0 NaN 4 003f8a76cb823eb7c58b6d052c57a8933f9275fd 45.0 148.0 50.0 140.0 15.0
统计各种网络情况
navigation_stats = navigation. groupby( [ 'sku_hash' ] ) [ 'page_views' ] . sum ( ) . reset_index( name= 'page_views' )
sales_stats = sales. groupby( [ 'sku_hash' ] ) [ 'sales_quantity' , 'TotalBuzzPost' , 'TotalBuzz' , 'NetSentiment' , 'PositiveSentiment' , 'NegativeSentiment' , 'Impressions' ] . sum ( ) . reset_index( )
navigation_stats. head( )
sku_hash page_views 0 000cb631113e2f54ca5512139a6592e9584957aa 1866 1 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 16729 2 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 157 3 00287bbb94c12066df6491dccd744ee87ff01a90 3513 4 003f8a76cb823eb7c58b6d052c57a8933f9275fd 47784
sales_stats. head( )
sku_hash sales_quantity TotalBuzzPost TotalBuzz NetSentiment PositiveSentiment NegativeSentiment Impressions 0 000cb631113e2f54ca5512139a6592e9584957aa 114 1308458.0 1791611.0 1471.884478 469288.0 67668.0 8.729030e+09 1 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 365 497215.0 803633.0 851.430118 228523.0 38871.0 4.159187e+09 2 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 110 1206926.0 1718843.0 1380.767295 524646.0 100366.0 1.121427e+10 3 00287bbb94c12066df6491dccd744ee87ff01a90 45 475098.0 680543.0 581.175056 211602.0 33527.0 3.253251e+09 4 003f8a76cb823eb7c58b6d052c57a8933f9275fd 398 3282466.0 4795323.0 3742.253384 1401916.0 307473.0 2.389471e+10
划分数据集
train[ 'idx' ] = pd. Categorical( train. sku_hash) . codes
train[ 'idx' ] = train[ 'idx' ] % 5
train. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line ... target mean_target macro_function_count function_count sub_function_count model_count aesthetic_sub_line_count macro_material_count color_count idx 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 Leather Goods Women LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR ... 1366.0 471.000000 2292 2094 447 90 126 3213 120.0 1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 Accessories Women FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE ... 526.0 NaN 546 126 102 3 3477 3213 NaN 1 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... 503.0 297.333333 3039 2784 699 15 90 4089 39.0 4 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... 276.0 373.000000 3039 2784 699 15 90 4089 60.0 2 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 Accessories Men FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE ... 206.0 NaN 843 171 162 3 3477 1023 NaN 0
5 rows × 24 columns
整合数据集
X = train. copy( )
X = X. merge( navigation_stats, on = 'sku_hash' , how = 'left' )
X = X. merge( sales_stats, on = 'sku_hash' , how = 'left' )
X = X. merge( traffic_source_views, on = 'sku_hash' , how = 'left' )
X = X. merge( type_sales, on = 'sku_hash' , how = 'left' )
X = X. merge( zone_sales, on = 'sku_hash' , how = 'left' )
X. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line ... page_views_nav4 page_views_nav5 page_views_nav6 sales_quantity_type1 sales_quantity_type2 sales_quantity_zone1 sales_quantity_zone2 sales_quantity_zone3 sales_quantity_zone4 sales_quantity_zone5 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 Leather Goods Women LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR ... 20.0 25.0 29.0 946.0 554.0 506.0 238.0 75.0 544.0 137.0 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 Accessories Women FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE ... NaN 39.0 NaN 338.0 60.0 84.0 55.0 25.0 155.0 79.0 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... 10.0 10.0 5.0 429.0 145.0 190.0 55.0 35.0 195.0 99.0 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 Leather Goods Women CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... 5.0 10.0 5.0 228.0 55.0 108.0 55.0 20.0 20.0 80.0 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 Accessories Men FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE ... NaN 5.0 NaN 110.0 25.0 35.0 5.0 20.0 30.0 45.0
5 rows × 45 columns
X = train. copy( )
X = X. merge( navigation_stats, on = 'sku_hash' , how = 'left' )
X = X. merge( sales_stats, on = 'sku_hash' , how = 'left' )
X = X. merge( traffic_source_views, on = 'sku_hash' , how = 'left' )
X = X. merge( type_sales, on = 'sku_hash' , how = 'left' )
X = X. merge( zone_sales, on = 'sku_hash' , how = 'left' )
X. loc[ X. product_type== 'Accessories' , 'product_type' ] = '0'
X. loc[ X. product_type== 'Leather Goods' , 'product_type' ] = '1'
X. product_type = X. product_type. astype( int )
X. loc[ X. product_gender== 'Women' , 'product_gender' ] = '-1'
X. loc[ X. product_gender== 'Unisex' , 'product_gender' ] = '0'
X. loc[ X. product_gender== 'Men' , 'product_gender' ] = '1'
X. product_gender = X. product_gender. astype( int )
X[ 'y' ] = np. log( X[ 'target' ] + 1 )
X. head( )
ID month sku_hash product_type product_gender macro_function function sub_function model aesthetic_sub_line ... page_views_nav5 page_views_nav6 sales_quantity_type1 sales_quantity_type2 sales_quantity_zone1 sales_quantity_zone2 sales_quantity_zone3 sales_quantity_zone4 sales_quantity_zone5 y 0 3d8a4ae769b526187c36901f204691a663333fa4_1 1 3d8a4ae769b526187c36901f204691a663333fa4 1 -1 LG ACCESSORIES SMALL LEATHER GOODS COMPACT WALLETS PF.VICTORINE DAMIER AZUR ... 25.0 29.0 946.0 554.0 506.0 238.0 75.0 544.0 137.0 7.220374 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 1 c05a54f7067be054ec4b27d0d6081353ef7d9df6 0 -1 FANCY ACCESSORIES BAG CHARMS BAG CHARMS BAG CHARM PETITE MALLE SANS LIGNE ESTHETIQUE ... 39.0 NaN 338.0 60.0 84.0 55.0 25.0 155.0 79.0 6.267201 2 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 1 7cb4d3626bd48a9b523d8693266219c34aeccde8 1 -1 CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... 10.0 5.0 429.0 145.0 190.0 55.0 35.0 195.0 99.0 6.222576 3 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 1 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 1 -1 CITY BAGS DAILY BAGS SHOULDER BAGS GIROLATA MAHINA ... 10.0 5.0 228.0 55.0 108.0 55.0 20.0 20.0 80.0 5.624018 4 6913a128945e0efeafc52101dcdeaa610eaa4430_1 1 6913a128945e0efeafc52101dcdeaa610eaa4430 0 1 FASHION JEWELRY NECKLACES NECKLACES COLL.CHARMS FOR GENTLEMEN SANS LIGNE ESTHETIQUE ... 5.0 NaN 110.0 25.0 35.0 5.0 20.0 30.0 45.0 5.332719
5 rows × 46 columns
整合测试集
Z = test. copy( )
Z = Z. merge( navigation_stats, on = 'sku_hash' , how = 'left' )
Z = Z. merge( sales_stats, on = 'sku_hash' , how = 'left' )
Z = Z. merge( traffic_source_views, on = 'sku_hash' , how = 'left' )
Z = Z. merge( type_sales, on = 'sku_hash' , how = 'left' )
Z = Z. merge( zone_sales, on = 'sku_hash' , how = 'left' )
Z. loc[ Z. product_type== 'Accessories' , 'product_type' ] = '0'
Z. loc[ Z. product_type== 'Leather Goods' , 'product_type' ] = '1'
Z. product_type = Z. product_type. astype( int )
Z. loc[ Z. product_gender== 'Women' , 'product_gender' ] = '-1'
Z. loc[ Z. product_gender== 'Unisex' , 'product_gender' ] = '0'
Z. loc[ Z. product_gender== 'Men' , 'product_gender' ] = '1'
Z. product_gender = Z. product_gender. astype( int )
features = [ 'product_type' , 'product_gender' ,
'page_views' , 'sales_quantity' ,
'TotalBuzzPost' , 'TotalBuzz' , 'NetSentiment' , 'PositiveSentiment' , 'NegativeSentiment' , 'Impressions' ,
'fr_FR_price' ,
'macro_function_count' , 'function_count' , 'sub_function_count' , 'model_count' , 'aesthetic_sub_line_count' , 'macro_material_count' , 'color_count' ,
'page_views_nav1' , 'page_views_nav2' , 'page_views_nav3' , 'page_views_nav4' , 'page_views_nav5' , 'page_views_nav6' ,
'sales_quantity_type1' , 'sales_quantity_type2' ,
'sales_quantity_zone1' , 'sales_quantity_zone2' , 'sales_quantity_zone3' , 'sales_quantity_zone4' , 'sales_quantity_zone5' ,
'mean_target' , ]
交叉验证
def train_test_split ( tr, te, mo, feats, num_folds) :
Xtrain = [ ]
ytrain = [ ]
dtrain = [ ]
Xval = [ ]
yval = [ ]
dval = [ ]
for i in range ( num_folds) :
Xtrain. append( tr. loc[ ( tr. month== mo) & ( tr. idx != i) , feats] . values)
ytrain. append( tr. loc[ ( tr. month== mo) & ( tr. idx != i) , 'y' ] . values)
dtrain. append( xgb. DMatrix( Xtrain[ i] , ytrain[ i] ) )
Xval. append( tr. loc[ ( tr. month== mo) & ( tr. idx == i) , feats] . values)
yval. append( tr. loc[ ( tr. month== mo) & ( tr. idx == i) , 'y' ] . values)
dval. append( xgb. DMatrix( Xval[ i] , yval[ i] ) )
Xtest = te. loc[ ( te. month == mo) , feats] . values
dtest = xgb. DMatrix( Xtest)
return dtrain, dval, dtest
Xgboost参数
param = { }
param[ 'objective' ] = 'reg:linear'
param[ 'eval_metric' ] = 'rmse'
param[ 'booster' ] = 'gbtree'
param[ 'eta' ] = 0.025
param[ 'subsample' ] = 0.7
param[ 'colsample_bytree' ] = 0.7
param[ 'num_parallel_tree' ] = 3
param[ 'min_child_weight' ] = 25
param[ 'gamma' ] = 5
param[ 'max_depth' ] = 3
param[ 'silent' ] = 1
第一个月的情况
dtrain, dval, dtest = train_test_split( tr = X, te = Z, mo = 1 , feats = features, num_folds = 5 )
model_m1 = [ ]
for i in range ( 5 ) :
model_m1. append(
xgb. train(
param,
dtrain[ i] ,
50000 ,
[ ( dtrain[ i] , 'train' ) , ( dval[ i] , 'eval' ) ] ,
early_stopping_rounds = 200 ,
verbose_eval = False )
)
oof_m1 = [ ]
oof_test_m1 = [ ]
for i in range ( 5 ) :
oof_m1. append( model_m1[ i] . predict( dval[ i] ) )
oof_test_m1. append( model_m1[ i] . predict( dtest) )
test_m1 = np. mean( oof_test_m1, axis= 0 )
m1 = { }
for i in range ( 5 ) :
m1 = { ** m1, ** dict ( zip ( X. loc[ ( X. month== 1 ) & ( X. idx== i) , 'sku_hash' ] , oof_m1[ i] ) ) }
m1 = { ** m1, ** dict ( zip ( Z. loc[ ( Z. month== 1 ) , 'sku_hash' ] , test_m1) ) }
oof_m1 = pd. DataFrame. from_dict( m1, orient= 'index' ) . reset_index( )
oof_m1. columns = [ 'sku_hash' , 'oof_m1' ]
X2 = pd. merge( X. copy( ) , oof_m1, on = 'sku_hash' )
Z2 = pd. merge( Z. copy( ) , oof_m1, on = 'sku_hash' )
features2 = features + [ 'oof_m1' ]
第二个月情况
dtrain2, dval2, dtest2 = train_test_split( tr = X2, te = Z2, mo = 2 , feats = features2, num_folds = 5 )
model_m2 = [ ]
for i in range ( 5 ) :
model_m2. append(
xgb. train(
param,
dtrain2[ i] ,
50000 ,
[ ( dtrain2[ i] , 'train' ) , ( dval2[ i] , 'eval' ) ] ,
early_stopping_rounds = 200 ,
verbose_eval = False )
)
oof_m2 = [ ]
oof_test_m2 = [ ]
for i in range ( 5 ) :
oof_m2. append( model_m2[ i] . predict( dval2[ i] ) )
oof_test_m2. append( model_m2[ i] . predict( dtest2) )
test_m2 = np. mean( oof_test_m2, axis= 0 )
m2 = { }
for i in range ( 5 ) :
m2 = { ** m2, ** dict ( zip ( X. loc[ ( X. month== 2 ) & ( X. idx== i) , 'sku_hash' ] , oof_m2[ i] ) ) }
m2 = { ** m2, ** dict ( zip ( Z. loc[ ( Z. month== 2 ) , 'sku_hash' ] , test_m2) ) }
oof_m2 = pd. DataFrame. from_dict( m2, orient= 'index' ) . reset_index( )
oof_m2. columns = [ 'sku_hash' , 'oof_m2' ]
X3 = pd. merge( X2. copy( ) , oof_m2, on = 'sku_hash' )
Z3 = pd. merge( Z2. copy( ) , oof_m2, on = 'sku_hash' )
features3 = features2 + [ 'oof_m2' ]
第三个月情况
dtrain3, dval3, dtest3 = train_test_split( tr = X3, te = Z3, mo = 3 , feats = features3, num_folds = 5 )
model_m3 = [ ]
for i in range ( 5 ) :
model_m3. append(
xgb. train(
param,
dtrain3[ i] ,
50000 ,
[ ( dtrain3[ i] , 'train' ) , ( dval3[ i] , 'eval' ) ] ,
early_stopping_rounds = 200 ,
verbose_eval = False )
)
oof_m3 = [ ]
oof_test_m3 = [ ]
for i in range ( 5 ) :
oof_m3. append( model_m3[ i] . predict( dval3[ i] ) )
oof_test_m3. append( model_m3[ i] . predict( dtest3) )
test_m3 = np. mean( oof_test_m3, axis= 0 )
m3 = { }
for i in range ( 5 ) :
m3 = { ** m3, ** dict ( zip ( X. loc[ ( X. month== 3 ) & ( X. idx== i) , 'sku_hash' ] , oof_m3[ i] ) ) }
m3 = { ** m3, ** dict ( zip ( Z. loc[ ( Z. month== 3 ) , 'sku_hash' ] , test_m3) ) }
oof_m3 = pd. DataFrame. from_dict( m3, orient= 'index' ) . reset_index( )
oof_m3. columns = [ 'sku_hash' , 'oof_m3' ]
X3 = pd. merge( X3. copy( ) , oof_m3, on = 'sku_hash' )
Z3 = pd. merge( Z3. copy( ) , oof_m3, on = 'sku_hash' )
Z3[ 'target' ] = 0
Z3. loc[ Z3. month == 1 , 'target' ] = Z3. loc[ Z3. month == 1 , 'oof_m1' ]
Z3. loc[ Z3. month == 2 , 'target' ] = Z3. loc[ Z3. month == 2 , 'oof_m2' ]
Z3. loc[ Z3. month == 3 , 'target' ] = Z3. loc[ Z3. month == 3 , 'oof_m3' ]
X3[ 'pred_target' ] = 0
X3. loc[ X3. month == 1 , 'pred_target' ] = X3. loc[ X3. month == 1 , 'oof_m1' ]
X3. loc[ X3. month == 2 , 'pred_target' ] = X3. loc[ X3. month == 2 , 'oof_m2' ]
X3. loc[ X3. month == 3 , 'pred_target' ] = X3. loc[ X3. month == 3 , 'oof_m3' ]
评估结果
print ( f"month1: {np.sqrt(np.mean((X3.loc[X3.month==1,'y'] - X3.loc[X3.month==1,'pred_target'])**2))}" )
print ( f"month2: {np.sqrt(np.mean((X3.loc[X3.month==2,'y'] - X3.loc[X3.month==2,'pred_target'])**2))}" )
print ( f"month3: {np.sqrt(np.mean((X3.loc[X3.month==3,'y'] - X3.loc[X3.month==3,'pred_target'])**2))}" )
print ( f"overall: {np.sqrt(np.mean((X3['y'] - X3['pred_target'])**2))}" )
month1: 0.4401109610656908
month2: 0.5816163893598695
month3: 0.7147597943809006
overall: 0.5895921884618234
生成结果
Z3[ 'target' ] = np. exp( Z3. target) - 1
final_sub = Z3[ [ 'ID' , 'target' ] ]
final_sub. to_csv( os. path. join( base_path, 'silly-raddar-sub4.csv' ) , index= None )
final = pd. read_csv( './data/silly-raddar-sub4.csv' )
final. head( )
ID target 0 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_1 300.303182 1 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_2 301.345169 2 ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_3 212.500844 3 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_1 85.048020 4 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_2 71.009720