Install gensim and use gensim's word2vec model to train article embeddings.
# Install gensim
!pip install gensim
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import logging
1. Read the raw data
1.1 Set the data paths
path = './data_raw/'
save_path = './temp_results/'
1.2 Read the training click log
# training click log
trn_click = pd.read_csv(path + 'train_click_log.csv')
trn_click = trn_click.sort_values('user_id')
trn_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type |
---|---|---|---|---|---|---|---|---|---|
1112620 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 |
1112619 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 |
1112602 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 |
1112601 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 |
1112600 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1081895 | 199999 | 218355 | 1508176867088 | 4 | 1 | 17 | 1 | 13 | 1 |
660731 | 199999 | 161191 | 1507665351186 | 4 | 1 | 17 | 1 | 13 | 1 |
660732 | 199999 | 42223 | 1507665381186 | 4 | 1 | 17 | 1 | 13 | 1 |
211041 | 199999 | 123909 | 1507226987864 | 4 | 1 | 17 | 1 | 13 | 1 |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 |
1112623 rows × 9 columns
1.3 Read the test click log
# test click log
tst_click = pd.read_csv(path + 'testA_click_log.csv')
tst_click = tst_click.sort_values('user_id')
tst_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type |
---|---|---|---|---|---|---|---|---|---|
138222 | 200000 | 195839 | 1507030363999 | 4 | 1 | 17 | 1 | 17 | 1 |
138223 | 200000 | 191971 | 1507030393999 | 4 | 1 | 17 | 1 | 17 | 1 |
378656 | 200000 | 194300 | 1507651461280 | 4 | 1 | 17 | 1 | 17 | 1 |
138221 | 200001 | 175040 | 1507029536442 | 4 | 3 | 2 | 1 | 18 | 7 |
138219 | 200002 | 297906 | 1507029946064 | 4 | 1 | 17 | 1 | 8 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
155995 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 |
178071 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 |
178070 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 |
188451 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 |
0 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 |
518010 rows × 9 columns
1.4 Read the article metadata
# article metadata
item_df = pd.read_csv(path + 'articles.csv')
item_df = item_df.sort_values('article_id')
item_df
 | article_id | category_id | created_at_ts | words_count |
---|---|---|---|---|
0 | 0 | 0 | 1513144419000 | 168 |
1 | 1 | 1 | 1405341936000 | 189 |
2 | 2 | 1 | 1408667706000 | 250 |
3 | 3 | 1 | 1408468313000 | 230 |
4 | 4 | 1 | 1407071171000 | 162 |
... | ... | ... | ... | ... |
364042 | 364042 | 460 | 1434034118000 | 144 |
364043 | 364043 | 460 | 1434148472000 | 463 |
364044 | 364044 | 460 | 1457974279000 | 177 |
364045 | 364045 | 460 | 1515964737000 | 126 |
364046 | 364046 | 460 | 1505811330000 | 479 |
364047 rows × 4 columns
2. Data preprocessing
2.1 Rename the article_id column in item_df to match trn_click / tst_click for the later merge
# Rename the column so item_df matches trn_click / tst_click for the later merge
item_df = item_df.rename(columns={'article_id': 'click_article_id'})
item_df
 | click_article_id | category_id | created_at_ts | words_count |
---|---|---|---|---|
0 | 0 | 0 | 1513144419000 | 168 |
1 | 1 | 1 | 1405341936000 | 189 |
2 | 2 | 1 | 1408667706000 | 250 |
3 | 3 | 1 | 1408468313000 | 230 |
4 | 4 | 1 | 1407071171000 | 162 |
... | ... | ... | ... | ... |
364042 | 364042 | 460 | 1434034118000 | 144 |
364043 | 364043 | 460 | 1434148472000 | 463 |
364044 | 364044 | 460 | 1457974279000 | 177 |
364045 | 364045 | 460 | 1515964737000 | 126 |
364046 | 364046 | 460 | 1505811330000 | 479 |
364047 rows × 4 columns
2.2 Count each user's article clicks and add a click_cnts column
# Count each user's clicks and store the result in a new click_cnts column
trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')
trn_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts |
---|---|---|---|---|---|---|---|---|---|---|
1112620 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 |
1112619 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 |
1112602 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 |
1112601 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 |
1112600 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1081895 | 199999 | 218355 | 1508176867088 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
660731 | 199999 | 161191 | 1507665351186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
660732 | 199999 | 42223 | 1507665381186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
211041 | 199999 | 123909 | 1507226987864 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
1112623 rows × 10 columns
tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')
tst_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts |
---|---|---|---|---|---|---|---|---|---|---|
138222 | 200000 | 195839 | 1507030363999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 |
138223 | 200000 | 191971 | 1507030393999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 |
378656 | 200000 | 194300 | 1507651461280 | 4 | 1 | 17 | 1 | 17 | 1 | 3 |
138221 | 200001 | 175040 | 1507029536442 | 4 | 3 | 2 | 1 | 18 | 7 | 1 |
138219 | 200002 | 297906 | 1507029946064 | 4 | 1 | 17 | 1 | 8 | 1 | 7 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
155995 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
178071 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
178070 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
188451 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
0 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
518010 rows × 10 columns
2.3 Merge trn_click and tst_click with item_df
trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])  # join trn_click with item_df on click_article_id
trn_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 1508236945000 | 370 |
1 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 26 | 1508185091000 | 162 |
2 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 133 | 1508142585000 | 162 |
3 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 418 | 1508179909000 | 176 |
4 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 | 2 | 297 | 1507663321000 | 215 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1112618 | 199999 | 218355 | 1508176867088 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 352 | 1508155745000 | 202 |
1112619 | 199999 | 161191 | 1507665351186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 281 | 1507646579000 | 285 |
1112620 | 199999 | 42223 | 1507665381186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 67 | 1507648195000 | 186 |
1112621 | 199999 | 123909 | 1507226987864 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 250 | 1507198955000 | 240 |
1112622 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 281 | 1506942089000 | 173 |
1112623 rows × 13 columns
tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])  # join tst_click with item_df on click_article_id
tst_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 200000 | 195839 | 1507030363999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 | 317 | 1507011388000 | 177 |
1 | 200000 | 191971 | 1507030393999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 | 309 | 1507013094000 | 222 |
2 | 200000 | 194300 | 1507651461280 | 4 | 1 | 17 | 1 | 17 | 1 | 3 | 317 | 1507636150000 | 202 |
3 | 200001 | 175040 | 1507029536442 | 4 | 3 | 2 | 1 | 18 | 7 | 1 | 299 | 1506974928000 | 176 |
4 | 200002 | 297906 | 1507029946064 | 4 | 1 | 17 | 1 | 8 | 1 | 7 | 428 | 1506958329000 | 218 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
518005 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 431 | 1507037015000 | 222 |
518006 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 348 | 1507099489000 | 227 |
518007 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518008 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518009 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 281 | 1506912747000 | 259 |
518010 rows × 13 columns
2.4 Concatenate the training and test sets
# Concatenate the training and test sets (DataFrame.append is deprecated and was removed in pandas 2.0)
all_click = pd.concat([trn_click, tst_click])
all_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 1508236945000 | 370 |
1 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 26 | 1508185091000 | 162 |
2 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 133 | 1508142585000 | 162 |
3 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 418 | 1508179909000 | 176 |
4 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 | 2 | 297 | 1507663321000 | 215 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
518005 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 431 | 1507037015000 | 222 |
518006 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 348 | 1507099489000 | 227 |
518007 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518008 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518009 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 281 | 1506912747000 | 259 |
1630633 rows × 13 columns
2.5 Normalize the timestamps
# Scale the timestamps to [0, 1] with MinMaxScaler for easier visualization
mm = MinMaxScaler()
all_click['click_timestamp'] = mm.fit_transform(all_click[['click_timestamp']])
all_click['created_at_ts'] = mm.fit_transform(all_click[['created_at_ts']])
all_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 157507 | 0.343719 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 0.992941 | 370 |
1 | 0 | 30760 | 0.343711 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 26 | 0.992790 | 162 |
2 | 1 | 63746 | 0.343622 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 133 | 0.992666 | 162 |
3 | 1 | 289197 | 0.343613 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 418 | 0.992775 | 176 |
4 | 2 | 168401 | 0.343655 | 4 | 3 | 20 | 1 | 25 | 2 | 2 | 297 | 0.991274 | 215 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
518005 | 249999 | 313431 | 0.025659 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 431 | 0.989453 | 222 |
518006 | 249999 | 214800 | 0.043419 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 348 | 0.989635 | 227 |
518007 | 249999 | 233717 | 0.043411 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 0.989606 | 184 |
518008 | 249999 | 233717 | 0.047871 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 0.989606 | 184 |
518009 | 249999 | 160974 | 0.000025 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 281 | 0.989092 | 259 |
1630633 rows × 13 columns
2.6 Sort the combined clicks by click_timestamp
all_click = all_click.sort_values('click_timestamp')
all_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
517912 | 249990 | 162300 | 0.000000 | 4 | 3 | 20 | 1 | 25 | 2 | 5 | 281 | 0.989186 | 193 |
517988 | 249998 | 160974 | 0.000002 | 4 | 1 | 12 | 1 | 13 | 2 | 5 | 281 | 0.989092 | 259 |
517867 | 249985 | 160974 | 0.000003 | 4 | 1 | 17 | 1 | 8 | 2 | 8 | 281 | 0.989092 | 259 |
517797 | 249979 | 162300 | 0.000004 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 0.989186 | 193 |
517875 | 249988 | 160974 | 0.000004 | 4 | 1 | 17 | 1 | 21 | 2 | 17 | 281 | 0.989092 | 259 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
496662 | 121158 | 224148 | 0.779105 | 4 | 1 | 17 | 1 | 13 | 2 | 54 | 354 | 0.997195 | 212 |
234746 | 70254 | 207672 | 0.860177 | 4 | 1 | 17 | 1 | 20 | 2 | 14 | 331 | 0.998288 | 242 |
234748 | 70254 | 96333 | 0.860185 | 4 | 1 | 17 | 1 | 20 | 2 | 14 | 209 | 0.998272 | 299 |
5953 | 2465 | 203538 | 0.999992 | 4 | 1 | 17 | 1 | 2 | 2 | 8 | 327 | 0.999741 | 275 |
5954 | 2465 | 145309 | 1.000000 | 4 | 1 | 17 | 1 | 2 | 2 | 8 | 269 | 1.000000 | 216 |
1630633 rows × 13 columns
3. Train an embedding for each article
3.1 Convert the click_article_id column to string
The input tokens for the word2vec model need to be strings.
all_click.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1630633 entries, 0 to 518009
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1630633 non-null int64
1 click_article_id 1630633 non-null int64
2 click_timestamp 1630633 non-null float64
3 click_environment 1630633 non-null int64
4 click_deviceGroup 1630633 non-null int64
5 click_os 1630633 non-null int64
6 click_country 1630633 non-null int64
7 click_region 1630633 non-null int64
8 click_referrer_type 1630633 non-null int64
9 click_cnts 1630633 non-null int64
10 category_id 1630633 non-null int64
11 created_at_ts 1630633 non-null float64
12 words_count 1630633 non-null int64
dtypes: float64(2), int64(11)
memory usage: 174.2 MB
# The ids can only be used for training after being converted to strings
all_click['click_article_id'] = all_click['click_article_id'].astype(str)
all_click.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1630633 entries, 0 to 518009
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1630633 non-null int64
1 click_article_id 1630633 non-null object
2 click_timestamp 1630633 non-null float64
3 click_environment 1630633 non-null int64
4 click_deviceGroup 1630633 non-null int64
5 click_os 1630633 non-null int64
6 click_country 1630633 non-null int64
7 click_region 1630633 non-null int64
8 click_referrer_type 1630633 non-null int64
9 click_cnts 1630633 non-null int64
10 category_id 1630633 non-null int64
11 created_at_ts 1630633 non-null float64
12 words_count 1630633 non-null int64
dtypes: float64(2), int64(10), object(1)
memory usage: 174.2+ MB
3.2 Turn each user's click sequence into a "sentence"
# Group each user's clicked article ids into a list, i.e. one "sentence" per user
docs = all_click.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()
docs
 | user_id | click_article_id |
---|---|---|
0 | 0 | [30760, 157507] |
1 | 1 | [289197, 63746] |
2 | 2 | [36162, 168401] |
3 | 3 | [50644, 36162] |
4 | 4 | [42567, 39894] |
... | ... | ... |
249995 | 249995 | [300470, 16129, 160974, 182394, 198659, 272143... |
249996 | 249996 | [160974] |
249997 | 249997 | [183665, 181686, 123909, 74719, 124667, 124337... |
249998 | 249998 | [160974, 202557, 237524, 236207, 235105] |
249999 | 249999 | [160974, 160417, 162338, 313431, 233717, 21480... |
250000 rows × 2 columns
docs = docs['click_article_id'].values.tolist()
docs
[['30760', '157507'],
['289197', '63746'],
['36162', '168401'],
['50644', '36162'],
['42567', '39894'],
['211442', '234481'],
['62464', '10023'],
['50644', '211442'],
['70986', '50644'],
['70986', '211442', '211455'],
['50644', '159195'],
['50644', '234481'],
['211442', '211455'],
['36162', '277107'],
['70986', '36162'],
['277107', '342473', '206415'],
['50644', '211442'],
['156279', '158331', '363916'],
['70986', '224730'],
['70986', '205824'],
['285433', '285300'],
['205958', '70758'],
['107014', '107190'],
['309535', '309311'],
['211442', '156279'],
...
['234481', '30760'],
['209122', '234308'],
['209122', '70986', '284470', '277712'],
['70986', '122152'],
...]
3.3 Train the Word2Vec model
# Set up logging so the training progress is visible
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
# These parameters strongly affect the learned vectors; negative sampling defaults to 5.
# To save time the model is trained for a single epoch, and vector_size is set to 16 for easier inspection (100 is a common choice).
w2v = Word2Vec(sentences=docs, sg=1, window=5, seed=2020, vector_size=16, workers=1, min_count=1, epochs=1)
# Store the embedding of every clicked article in a dict keyed by article id
item_w2v_emb_dict = {k: w2v.wv[k] for k in all_click['click_article_id']}
2023-01-29 01:59:03,467:INFO:collecting all words and their counts
2023-01-29 01:59:03,468:INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-01-29 01:59:03,475:INFO:PROGRESS: at sentence #10000, processed 25727 words, keeping 3473 word types
2023-01-29 01:59:03,483:INFO:PROGRESS: at sentence #20000, processed 53883 words, keeping 5811 word types
2023-01-29 01:59:03,492:INFO:PROGRESS: at sentence #30000, processed 84881 words, keeping 7676 word types
2023-01-29 01:59:03,501:INFO:PROGRESS: at sentence #40000, processed 118390 words, keeping 9297 word types
2023-01-29 01:59:03,511:INFO:PROGRESS: at sentence #50000, processed 154179 words, keeping 10844 word types
2023-01-29 01:59:03,523:INFO:PROGRESS: at sentence #60000, processed 192350 words, keeping 12357 word types
2023-01-29 01:59:03,536:INFO:PROGRESS: at sentence #70000, processed 233685 words, keeping 13473 word types
2023-01-29 01:59:03,551:INFO:PROGRESS: at sentence #80000, processed 281335 words, keeping 14939 word types
2023-01-29 01:59:03,565:INFO:PROGRESS: at sentence #90000, processed 329973 words, keeping 16420 word types
2023-01-29 01:59:03,578:INFO:PROGRESS: at sentence #100000, processed 379428 words, keeping 17904 word types
2023-01-29 01:59:03,593:INFO:PROGRESS: at sentence #110000, processed 431464 words, keeping 18928 word types
2023-01-29 01:59:03,611:INFO:PROGRESS: at sentence #120000, processed 489655 words, keeping 20157 word types
2023-01-29 01:59:03,629:INFO:PROGRESS: at sentence #130000, processed 550375 words, keeping 21588 word types
2023-01-29 01:59:03,649:INFO:PROGRESS: at sentence #140000, processed 613031 words, keeping 22923 word types
2023-01-29 01:59:03,669:INFO:PROGRESS: at sentence #150000, processed 678645 words, keeping 24209 word types
2023-01-29 01:59:03,691:INFO:PROGRESS: at sentence #160000, processed 749559 words, keeping 25743 word types
2023-01-29 01:59:03,714:INFO:PROGRESS: at sentence #170000, processed 831064 words, keeping 27232 word types
2023-01-29 01:59:03,738:INFO:PROGRESS: at sentence #180000, processed 914233 words, keeping 28612 word types
2023-01-29 01:59:03,766:INFO:PROGRESS: at sentence #190000, processed 1004976 words, keeping 29699 word types
2023-01-29 01:59:03,800:INFO:PROGRESS: at sentence #200000, processed 1112623 words, keeping 31116 word types
2023-01-29 01:59:03,824:INFO:PROGRESS: at sentence #210000, processed 1200577 words, keeping 31798 word types
2023-01-29 01:59:03,850:INFO:PROGRESS: at sentence #220000, processed 1285942 words, keeping 32381 word types
2023-01-29 01:59:03,878:INFO:PROGRESS: at sentence #230000, processed 1380836 words, keeping 33131 word types
...
2023-01-29 01:59:08,532:INFO:EPOCH 0 - PROGRESS: at 94.73% examples, 317616 words/s, in_qsize 1, out_qsize 0
2023-01-29 01:59:09,130:INFO:EPOCH 0: training on 1630633 raw words (1453015 effective words) took 4.7s, 309765 effective words/s
2023-01-29 01:59:09,131:INFO:Word2Vec lifecycle event {'msg': 'training on 1630633 raw words (1453015 effective words) took 4.7s, 309335 effective words/s', 'datetime': '2023-01-29T01:59:09.131480', 'gensim': '4.3.0', 'python': '3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 15:55:03) \n[GCC 10.4.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'train'}
2023-01-29 01:59:09,132:INFO:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=35380, vector_size=16, alpha=0.025>', 'datetime': '2023-01-29T01:59:09.132038', 'gensim': '4.3.0', 'python': '3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 15:55:03) \n[GCC 10.4.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'created'}
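As a quick sanity check (not part of the original notebook), the trained model can also be queried directly for the articles closest to a given one in embedding space; most_similar is part of gensim's KeyedVectors API, and '160974' is simply one of the article ids seen in the logs above.
# Illustrative sanity check: nearest neighbours of one article in the embedding space
print(w2v.wv.most_similar('160974', topn=5))  # returns (article_id, cosine similarity) pairs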
item_w2v_emb_dict
{'162300': array([-0.37774816, 1.3504976 , -0.23309161, 0.26072636, -0.23989849,
0.5004345 , -0.56577134, -0.5481367 , 0.2039964 , 0.8342705 ,
-0.48475933, -0.52761525, 0.20438878, 1.1823852 , -0.4367902 ,
0.5961195 ], dtype=float32),
'160974': array([-0.6204943 , 1.9116834 , -0.46083373, 0.40015092, -0.1149492 ,
0.16426347, -0.79475456, -0.5675412 , -0.11137734, 0.8231001 ,
-0.40696675, -0.41709152, 0.46059853, 1.4314909 , -0.43337965,
0.6168017 ], dtype=float32),
'158082': array([-0.31103644, 1.366509 , -0.15305053, 0.19239815, -0.22246826,
0.5307462 , -0.6013954 , -0.5462901 , 0.15414776, 0.7907959 ,
-0.47677982, -0.5615761 , 0.13641724, 1.0848166 , -0.49600208,
0.577961 ], dtype=float32),
'158536': array([-0.8572138 , 2.3342323 , -0.820582 , 0.59009516, -0.11867882,
-0.00443581, -0.90663487, -0.48073447, -0.21962236, 0.72721934,
-0.5553819 , -0.38393563, 0.44512793, 1.4385353 , -0.60131 ,
0.8454735 ], dtype=float32),
'300470': array([-3.0404758e-01, 1.6421673e+00, -3.3394170e-01, 1.7405626e-01,
-3.1160885e-01, 4.3005905e-01, -6.1486483e-01, -5.4003429e-01,
7.3214585e-04, 8.6553878e-01, -3.9799103e-01, -4.2683634e-01,
4.4759423e-01, 1.5222788e+00, -4.0337685e-01, 5.7117921e-01],
dtype=float32),
'59758': array([-0.525708 , 1.5093073 , -0.56673056, 0.20997894, -0.18883261,
0.15363155, -0.53620964, -0.47269717, -0.02236754, 0.80978376,
-0.3462906 , -0.26018006, 0.27204517, 1.4290353 , -0.47859445,
0.36089382], dtype=float32),
...
'63596': array([ 0.01696501, 0.02612317, 0.00530556, 0.03226124, 0.01429469,
-0.04120407, -0.01667616, -0.03267057, -0.00297171, -0.05766348,
-0.00219459, -0.02344081, 0.00532304, 0.0203535 , -0.03306687,
0.01739437], dtype=float32),
...}
3.4 Convert the embedding dictionary into a DataFrame
# Convert the embedding dictionary into a DataFrame (one row per article)
articles_embedding_list = []
for article_id, embedding in item_w2v_emb_dict.items():
    articles_embedding_list.append([article_id] + embedding.tolist())
articles_embedding_df = pd.DataFrame(articles_embedding_list, columns=['article_id'] + ['emb_' + str(i) for i in range(16)])
articles_embedding_df
 | article_id | emb_0 | emb_1 | emb_2 | emb_3 | emb_4 | emb_5 | emb_6 | emb_7 | emb_8 | emb_9 | emb_10 | emb_11 | emb_12 | emb_13 | emb_14 | emb_15 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 162300 | -0.377748 | 1.350498 | -0.233092 | 0.260726 | -0.239898 | 0.500435 | -0.565771 | -0.548137 | 0.203996 | 0.834270 | -0.484759 | -0.527615 | 0.204389 | 1.182385 | -0.436790 | 0.596120 |
1 | 160974 | -0.620494 | 1.911683 | -0.460834 | 0.400151 | -0.114949 | 0.164263 | -0.794755 | -0.567541 | -0.111377 | 0.823100 | -0.406967 | -0.417092 | 0.460599 | 1.431491 | -0.433380 | 0.616802 |
2 | 158082 | -0.311036 | 1.366509 | -0.153051 | 0.192398 | -0.222468 | 0.530746 | -0.601395 | -0.546290 | 0.154148 | 0.790796 | -0.476780 | -0.561576 | 0.136417 | 1.084817 | -0.496002 | 0.577961 |
3 | 158536 | -0.857214 | 2.334232 | -0.820582 | 0.590095 | -0.118679 | -0.004436 | -0.906635 | -0.480734 | -0.219622 | 0.727219 | -0.555382 | -0.383936 | 0.445128 | 1.438535 | -0.601310 | 0.845474 |
4 | 300470 | -0.304048 | 1.642167 | -0.333942 | 0.174056 | -0.311609 | 0.430059 | -0.614865 | -0.540034 | 0.000732 | 0.865539 | -0.397991 | -0.426836 | 0.447594 | 1.522279 | -0.403377 | 0.571179 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
35375 | 224148 | -0.106398 | 0.048206 | -0.058324 | -0.039949 | 0.049242 | 0.056162 | -0.010760 | 0.023729 | -0.016845 | 0.038499 | 0.012183 | 0.025061 | 0.074791 | -0.014461 | -0.007782 | 0.029464 |
35376 | 207672 | 0.009338 | 0.037096 | -0.042352 | 0.021171 | 0.027002 | -0.027749 | -0.028915 | 0.028978 | 0.000694 | 0.050991 | 0.010830 | -0.005212 | -0.018071 | 0.024465 | -0.003564 | 0.020691 |
35377 | 96333 | -0.096300 | 0.045616 | 0.019998 | 0.053210 | 0.046997 | 0.038450 | -0.027336 | 0.010665 | -0.006431 | 0.008583 | 0.032228 | 0.008812 | -0.002347 | 0.007475 | 0.037719 | -0.024010 |
35378 | 203538 | -0.046094 | 0.016830 | 0.036601 | -0.013213 | 0.057705 | -0.031858 | -0.010430 | 0.005571 | -0.005230 | -0.011425 | -0.007605 | -0.049261 | 0.001949 | -0.032922 | 0.013038 | 0.004562 |
35379 | 145309 | 0.003439 | 0.054341 | 0.054563 | -0.022907 | -0.028525 | -0.045272 | -0.046397 | 0.026955 | 0.015746 | 0.022889 | -0.009937 | -0.042705 | 0.000379 | 0.031564 | 0.058540 | -0.043099 |
35380 rows × 17 columns
3.5 Save the word2vec embedding of each article
articles_embedding_df.to_csv(save_path + 'articles_emb.csv', index=False, header=True)
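A minimal sketch (assuming the file and column names written above) of how the saved embeddings could later be read back into an id-to-vector lookup, e.g. for a downstream recall step:
# Sketch: reload the saved embeddings and rebuild an article_id -> vector lookup
emb_df = pd.read_csv(save_path + 'articles_emb.csv')
emb_cols = ['emb_' + str(i) for i in range(16)]
emb_lookup = dict(zip(emb_df['article_id'].astype(str), emb_df[emb_cols].values))
print(emb_lookup['160974'][:4])  # first few dimensions of one article's embedding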
4. Visualize the embeddings learned by word2vec
# Use the learned embeddings: cosine similarity between each pair of consecutively clicked articles
def get_item_sim_list(df):
    sim_list = []
    item_list = df['click_article_id'].values
    for i in range(0, len(item_list) - 1):
        emb1 = item_w2v_emb_dict[str(item_list[i])]  # note: word2vec was trained on string article ids
        emb2 = item_w2v_emb_dict[str(item_list[i + 1])]
        sim_list.append(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
    sim_list.append(0)  # pad so the list has one entry per click
    return sim_list
# Randomly pick 5 users and look at how similar their consecutively viewed articles are
sub_user_ids = np.random.choice(all_click.user_id.unique(), size=5, replace=False)
sub_user_info = all_click[all_click['user_id'].isin(sub_user_ids)]
for _, user_df in sub_user_info.groupby('user_id'):
    item_sim_list = get_item_sim_list(user_df)
    print("item_sim_list = ", item_sim_list)
    plt.plot(item_sim_list)
item_sim_list = [0.92883706, 0]
item_sim_list = [0.9124602, 0]
item_sim_list = [0.9199342, 0.96343744, 0]
item_sim_list = [0.9781094, 0.96341544, 0.97788256, 0.9595503, 0.8217939, 0.72416514, 0.9395525, 0.7885189, 0.9457342, 0.88077354, 0.97478765, 0.9660947, 0.98741436, 0.741313, 0.87396616, 0.929134, 0]
item_sim_list = [0.96066815, 0.9530401, 0.6379823, 0.9544767, 0.97723705, 0.9679809, 0.9853506, 0.7811122, 0.75035423, 0]
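If the overlapping curves are hard to tell apart, a small variation of the plotting loop above (a sketch, not part of the original run) labels each sampled user's line and adds axis titles:
# Sketch: the same per-user similarity curves, with labels and axis titles
for user_id, user_df in sub_user_info.groupby('user_id'):
    plt.plot(get_item_sim_list(user_df), label='user ' + str(user_id))
plt.xlabel('click position')
plt.ylabel('cosine similarity with the next clicked article')
plt.legend()
plt.show()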