天池新手平台赛ODPS入门代码
推荐比赛群:大数据比赛交流群 155167917
思路总结:整理了一下天池入门平台赛ODPS的SQL代码,赛题是2015年的阿里移动推荐算法大赛,本文的代码是用了SQL,并没有涉及算法平台。主要思路是根据用户(12月18日)对商品的操作(查看,收藏,加入购物车,购买)统计进行判断。在18号加入购物车操作的user-item对是最有可能在第二天实现购买,所以将此作为预测子集,同时在预测集中删去了已在18号进行购买的user-item对,以及删除查看次数过多仍然没有购买行为的user-item对等。
根据这些简单规则可以得到baseline成绩:
数据导入
--odps sql
********************************************************************--
--author:zhc
--create time:2016-05-09 17:21:27
--********************************************************************--
--Create table tianchi_fresh_comp_train_user_online as
--select * from tianchi_data.tianchi_fresh_comp_train_user_online;
--Create table tianchi_fresh_comp_train_item_online as
--select * from tianchi_data.tianchi_fresh_comp_train_item_online;
--select count (*) from tianchi_fresh_comp_train_user_online;
---train_item去除重复ui对
--create table train_item as
--select
--t.item_id,t.item_category from tianchi_fresh_comp_train_item_online t group by t.item_id,t.item_category ;
--select count (*) from train_item; --6052459行
数据预处理
--********************************************************************--
--author:zhc
--create time:2016-05-09 21:14:12
--********************************************************************--
--create table trainset18_cart_no_buy1 as
--select cart.*,length(buy.time) as buy_time
--from trainset_18_cart cart
--left outer join trainset_18_buy buy on
--cart.user_id=buy.user_id and cart.item_category=buy.item_category;
---trainset18_p信息:cart_no_buy
--create table trainset18_p as
--select t.* from trainset18_cart_no_buy t where buy_time IS NULL;
--ALTER TABLE trainset18_cart_no_buy1 ADD COLUMNS (label boolean );
--select t.* from trainset18_p t where user_id='100000027'
---trainset18_p_1信息:去除重复ui对,还未与商品子集交
--create table trainset18_p_1 as
--select p.user_id,p.item_id,p.item_category,max(time)as cart_time from trainset18_p p
--group by p.user_id,p.item_id,p.item_category;
--create table trainset18_p_2 as
--select p.*
--from trainset18_p_1 p
-- join train_item ti on
--p.item_id=ti.item_id ;
--**************************2016.05.10*************************--
--select count (*) from tianchi_fresh_comp_train_item_online;
--select count (*) from train_item;
----将18号加入购物车且没有购买该类别任何商品的ui对与商品子集取交集作为结果
--create table tianchi_mobile_recommendation_predict as --57006行
--select p.user_id,p.item_id
--from trainset18_p_1 p
--join train_item ti on
--p.item_id=ti.item_id ;
--select count (*) from trainset18_p_1; --662462行
--select count (*) from tianchi_mobile_recommendation_predict;--57006行
---trainset18_p_2与tianchi_mobile_recommendation_predict(只有ui对)
---基本相同,但保存了完整信息,用于后面的进一步处理
--create table trainset18_p_2 as --57006行
--select p.*
--from trainset18_p_1 p
--join train_item ti on
--p.item_id=ti.item_id ;
--create table trainset18_p_uc as --37331
--select p.user_id,count(p.item_id) as item_count,p.item_category
--from trainset18_p_2 p group by p.user_id,p.item_category;
--select count (*) from trainset18_p_uc;
---对同一类型商品加入购物车超过2次的uc对
--create table trainset18_p_uc_3 as --4083行
--select * from trainset18_p_uc where item_count>2
--create table trainset18_p_uc_4 as --2100行
--select * from trainset18_p_uc where item_count>3
---筛选出预测集p2中对同类商品加购物车不超过3次的ui对
--create table trainset18_p_3 as --18768行 大于3的uc对
--select p2.user_id,p2.item_id,p2.item_category,p2.cart_time,uc3.item_count
--from trainset18_p_2 p2
--left outer join trainset18_p_uc_3 uc3 on
--p2.user_id=uc3.user_id and p2.item_category=uc3.item_category;
--select count(* )from trainset18_p_3 where item_count is NULL; --38808
--select count(* )from trainset18_p_3 where item_count is NOT NULL; --18198
---对同一类型商品加入购物车超过3次的uc对
--create table trainset18_p_4 as --18768行 大于3的uc对
--select p2.user_id,p2.item_id,p2.item_category,p2.cart_time,uc4.item_count
--from trainset18_p_2 p2
--left outer join trainset18_p_uc_4 uc4 on
--p2.user_id=uc4.user_id and p2.item_category=uc4.item_category;
--select count(* )from trainset18_p_4 where item_count is NULL; --44757
--select count(* )from trainset18_p_4 where item_count is NOT NULL; --12249
--18号对商品子集的实际购买量 --35878
--create table trainset_18_buy_son as
--select p.*
--from trainset_18_buy p
--join train_item ti on
--p.item_id=ti.item_id ;
--create table trainset_18_buy_son1 as --32958
--select b.user_id,b.item_id from trainset_18_buy_son b group by b.user_id,b.item_id;
--18号对商品的点击ui对
--create table trainset_18_click as --==32534036
--select
--b.* from trainset_18 b where b.behavior_type=1 ;
--drop table if exists trainset_18_click_count;
--create table trainset_18_click_count as --14792012
--select
--user_id,item_id,max(time) as click_time,count(behavior_type)as ui_click from trainset_18_click group by user_id,item_id;
--18号加入购物车的ui对,同时包含该ui对的点击次数和最后点击时间
--drop table if exists trainset18_p_cart_click;
--create table trainset18_p_cart_click as --57006
--select p4.* ,cc.ui_click,cc.click_time from trainset18_p_4 p4
--left outer join trainset_18_click_count cc on
--p4.user_id=cc.user_id and p4.item_id=cc.item_id;
进一步剔除
-------------------------------------2016.05.14-------------------------------------
--create table trainset18_P_4_null as --44757 假如购物车且同类商品部超过3件的ui集
--select p4.* from trainset18_p_4 p4 where p4.item_count is NULL ;
--create table trainset18_p_cart_click_11143 as
--select * from trainset18_p_cart_click where
--cart_time<'2014-12-18 14' and cart_time>'2014-12-18 11' and ui_click<3 and item_count is NULL ;--1600
--drop table if exists trainset18_p_cart_click_11202;
--create table trainset18_p_cart_click_11202 as
--select * from trainset18_p_cart_click where
--cart_time<'2014-12-18 20' and cart_time>'2014-12-18 11' and ui_click<2 and item_count is NULL;--2355
--create table trainset18_p_cart_click_00125 as
--select * from trainset18_p_cart_click where cart_time<'2014-12-18 12' and ui_click<5 and item_count is NULL;--8004
--drop table if exists trainset18_p_cart_no_buy_day;
--create table trainset18_p_cart_no_buy_day as --白天加入购物车后再次查看但未购买(点击总数不超过8次)
--select * from trainset18_p_cart_click where cart_time<click_time and cart_time<'2014-12-18 18'
--and item_count is NULL and ui_click<8 ;--3928 / 3402
--create table trainset18_p_delete1 as --44757
--select pn.* ,cc.user_id as flag1
--from trainset18_P_4_null pn
--left outer join trainset18_p_cart_click_11143 cc on --1600
--pn.user_id=cc.user_id and pn.item_id=cc.item_id;
--create table trainset18_p_delete1_ok as --43157
--select * from trainset18_p_delete1 where flag1 is null
--create table trainset18_p_delete2 as --43157
--select pn.* ,cc.user_id as flag2
--from trainset18_p_delete1_ok pn
--left outer join trainset18_p_cart_click_13202 cc on --1842
--pn.user_id=cc.user_id and pn.item_id=cc.item_id;
--create table trainset18_p_delete2_ok as --41315
--select * from trainset18_p_delete2 where flag2 is null
--select * from trainset18_p_delete2_ok where cart_time>'2014-12-18 11' --30935
--create table trainset18_p_delete3 as --41315
--select pn.* ,cc.user_id as flag3
--from trainset18_p_delete2_ok pn --41315
--left outer join trainset18_p_cart_click_00125 cc on --8004
--pn.user_id=cc.user_id and pn.item_id=cc.item_id;
--create table trainset18_p_delete3_ok as --33311
--select * from trainset18_p_delete3 where flag3 is null
-------------------------------------2016.05.15-------------------------------------
--create table trainset18_p_delete_11202 as --44757 删去12-19点加入购物车但点击次数只有一次的ui对
--select pn.* ,cc.user_id as flag1
--from trainset18_P_4_null pn
--left outer join trainset18_p_cart_click_11202 cc on --2355
--pn.user_id=cc.user_id and pn.item_id=cc.item_id;
--create table trainset18_p_delete_11202_ok as --42402
--select * from trainset18_p_delete_11202 where flag1 is null
--create table trainset18_p_delete_00125 as --42402 删去00-12点加入购物车但点击次数小于5次的ui对
--select pn.* ,cc.user_id as flag2
--from trainset18_p_delete_11202_ok pn
--left outer join trainset18_p_cart_click_00125 cc on --8004
--pn.user_id=cc.user_id and pn.item_id=cc.item_id;
--create table trainset18_p_delete_00125_ok as --34398
--select * from trainset18_p_delete_00125 where flag2 is null
--drop table if exists trainset18_p_delete_cartnobuy;
--create table trainset18_p_delete_cartnobuy as --34398 --删去白天加入购物车后再次查看但未购买
--select pn.* ,cc.user_id as flag3
--from trainset18_p_delete_00125_ok pn
--left outer join trainset18_p_cart_no_buy_day cc on --3402
--pn.user_id=cc.user_id and pn.item_id=cc.item_id;
--drop table if exists trainset18_p_delete_cartnobuy_ok;
--create table trainset18_p_delete_cartnobuy_ok as --31987 (30996)
--select * from trainset18_p_delete_cartnobuy where flag3 is null
预测
--create table trainset18_p_test as
--select * from tianchi_mobile_recommendation_predict;
--DELETE from trainset18_p_test where --执行报错
--cart_time<'2014-12-18 14' and cart_time>'2014-12-18 11' and ui_click<3 ;
--drop table tianchi_mobile_recommendation_predict
---预测结果表 38808行
--create table tianchi_mobile_recommendation_predict as
--select p3.user_id,p3.item_id from trainset18_p_3 p3 where p3.item_count is NULL;
--结果表名:tianchi_mobile_recommendation_predict--
--drop table if exists tianchi_mobile_recommendation_predict;
--create table tianchi_mobile_recommendation_predict as
--create table tianchi_mobile_recommendation_predict as --29684 -- >11
--create table tianchi_mobile_recommendation_predict as --32838 -- >09
--select p3.user_id,p3.item_id from trainset18_p_3 p3 where p3.item_count is NULL and cart_time>'2014-12-18 08';
--select p4.user_id,p4.item_id from trainset18_p_4 p4 where p4.item_count is NULL --and cart_time>'2014-12-18 11' ;
--9-37936,10-36221,11-34377 2016.05.13
--select user_id,item_id from trainset18_p_delete2_ok where cart_time>'2014-12-18 11' --30935 2016.05.14 hit:1982
--select user_id,item_id from trainset18_p_delete_11202_ok where cart_time>'2014-12-18 11' --32022 2016.05.15
--select user_id,item_id from trainset18_p_delete_cartnobuy_ok --31987 2016.05.15 hit:1950
--select user_id,item_id from trainset18_p_delete_00125_ok --34398 2016.05.16 hit:2142
总结
平台赛的目的就是让新手熟悉阿里大数据平台的使用,方便后期参赛。所以还是很值得像我这样的新手参与练习的。不过本文只使用了ODPS平台进行数据处理和结果导出,算法平台部分都是图形界面操作,非常方便,选手可以把重心放在赛题思路分析和特征工程上,算法实现部分则直接用平台上的算法包即可。此外,平台还支持MR,下次比赛打算学习一下。
附上竞赛群分享的平台MR教程。