零基础入门推荐系统 - 新闻推荐-Task2 (DataWhale学习小组)
数据探索性分析
加载需要的module
% matplotlib inline
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
# Set matplotlib's default font family to 'PingFang HK'.
# NOTE(review): presumably chosen so Chinese text renders in figures; the font
# has to be installed on the host for this to take effect -- confirm.
plt. rcParams[ 'font.family' ] = [ 'PingFang HK' ]
import os, gc, re, warnings, sys
# Silence every warning for the rest of the session to keep cell output short.
warnings. filterwarnings( "ignore" )
导入数据并查看
# Load the article metadata and rename its key column to 'click_article_id',
# the column name used when the click log is later merged with this table.
articles = pd.read_csv('articles.csv').rename(
    columns={'article_id': 'click_article_id'}
)
print(articles.shape)
articles.head()
(364047, 4)
click_article_id
category_id
created_at_ts
words_count
0
0
0
1513144419000
168
1
1
1
1405341936000
189
2
2
1
1408667706000
250
3
3
1
1408468313000
230
4
4
1
1407071171000
162
# Load the article embedding table, report its (rows, columns) shape,
# and preview the first rows.
text_emb = pd.read_csv(filepath_or_buffer='articles_emb.csv')
print(text_emb.shape)
text_emb.head()
(364047, 251)
article_id
emb_0
emb_1
emb_2
emb_3
emb_4
emb_5
emb_6
emb_7
emb_8
...
emb_240
emb_241
emb_242
emb_243
emb_244
emb_245
emb_246
emb_247
emb_248
emb_249
0
0
-0.161183
-0.957233
-0.137944
0.050855
0.830055
0.901365
-0.335148
-0.559561
-0.500603
...
0.321248
0.313999
0.636412
0.169179
0.540524
-0.813182
0.286870
-0.231686
0.597416
0.409623
1
1
-0.523216
-0.974058
0.738608
0.155234
0.626294
0.485297
-0.715657
-0.897996
-0.359747
...
-0.487843
0.823124
0.412688
-0.338654
0.320786
0.588643
-0.594137
0.182828
0.397090
-0.834364
2
2
-0.619619
-0.972960
-0.207360
-0.128861
0.044748
-0.387535
-0.730477
-0.066126
-0.754899
...
0.454756
0.473184
0.377866
-0.863887
-0.383365
0.137721
-0.810877
-0.447580
0.805932
-0.285284
3
3
-0.740843
-0.975749
0.391698
0.641738
-0.268645
0.191745
-0.825593
-0.710591
-0.040099
...
0.271535
0.036040
0.480029
-0.763173
0.022627
0.565165
-0.910286
-0.537838
0.243541
-0.885329
4
4
-0.279052
-0.972315
0.685374
0.113056
0.238315
0.271913
-0.568816
0.341194
-0.600554
...
0.238286
0.809268
0.427521
-0.615932
-0.503697
0.614450
-0.917760
-0.424061
0.185484
-0.580292
5 rows × 251 columns
# Load the training click log, report its (rows, columns) shape,
# and preview the first rows.
train_click = pd.read_csv(filepath_or_buffer='train_click_log.csv')
print(train_click.shape)
train_click.head()
(1112623, 9)
user_id
click_article_id
click_timestamp
click_environment
click_deviceGroup
click_os
click_country
click_region
click_referrer_type
0
199999
160417
1507029570190
4
1
17
1
13
1
1
199999
5408
1507029571478
4
1
17
1
13
1
2
199999
50823
1507029601478
4
1
17
1
13
1
3
199998
157770
1507029532200
4
1
17
1
25
5
4
199998
96613
1507029671831
4
1
17
1
25
5
# Load the test (set A) click log, report its (rows, columns) shape,
# and preview the first rows.
test_click = pd.read_csv(filepath_or_buffer='testA_click_log.csv')
print(test_click.shape)
test_click.head()
(518010, 9)
user_id
click_article_id
click_timestamp
click_environment
click_deviceGroup
click_os
click_country
click_region
click_referrer_type
0
249999
160974
1506959142820
4
1
17
1
13
2
1
249999
160417
1506959172820
4
1
17
1
13
2
2
249998
160974
1506959056066
4
1
12
1
13
2
3
249998
202557
1506959086066
4
1
12
1
13
2
4
249997
183665
1506959088613
4
1
17
1
15
5
click_log.csv文件数据中每个字段的含义
user_id: 用户的唯一标识
click_article_id: 用户点击的文章唯一标识
click_timestamp: 用户点击文章时的时间戳
click_environment: 用户点击文章的环境
click_deviceGroup: 用户点击文章的设备组
click_os: 用户点击文章时的操作系统
click_country: 用户点击文章时所在的国家
click_region: 用户点击文章时所在的区域
click_referrer_type: 用户点击文章时的来源类型(referrer)
数据预处理:增加新变量
def _add_click_features(click_log):
    """Add per-user 'rank' and 'click_cnts' columns to ``click_log`` in place.

    'rank' orders each user's clicks by 'click_timestamp', newest first, so
    rank 1 is the user's most recent click. 'click_cnts' is the number of
    clicks recorded for that user.
    """
    timestamps_by_user = click_log.groupby(['user_id'])['click_timestamp']
    # method='first' gives every click of a user a distinct integer position.
    # The default method ('average') assigns fractional ranks such as 1.5 to
    # tied timestamps, which astype(int) would silently truncate, producing
    # duplicated or missing positions (possibly no rank-1 row at all).
    click_log['rank'] = (
        timestamps_by_user.rank(ascending=False, method='first').astype(int)
    )
    click_log['click_cnts'] = timestamps_by_user.transform('count')


# Same feature engineering for both logs.
_add_click_features(train_click)
_add_click_features(test_click)

# Attach article metadata to every training click; a left join keeps clicks
# whose article has no row in `articles`.
train_click = train_click.merge(articles, how='left', on=['click_article_id'])
train_click.head()
user_id
click_article_id
click_timestamp
click_environment
click_deviceGroup
click_os
click_country
click_region
click_referrer_type
rank
click_cnts
category_id
created_at_ts
words_count
0
199999
160417
1507029570190
4
1
17
1
13
1
11
11
281
1506942089000
173
1
199999
5408
1507029571478
4
1
17
1
13
1
10
11
4
1506994257000
118
2
199999
50823
1507029601478
4
1
17
1
13
1
9
11
99
1507013614000
213
3
199998
157770
1507029532200
4
1
17
1
25
5
40
40
281
1506983935000
201
4
199998
96613
1507029671831
4
1
17
1
25
5
39
40
209
1506938444000
185
test_click = test_click. merge