Install gensim and use gensim's word2vec model to train article embeddings.
# Install gensim
!pip install gensim
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import logging
1. Read the raw data
1.1 Set the data paths
path = './data_raw/'
save_path = './temp_results/'
1.2 Read the training click log
# training click log
trn_click = pd.read_csv(path + 'train_click_log.csv')
trn_click = trn_click.sort_values('user_id')
trn_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type |
---|---|---|---|---|---|---|---|---|---|
1112620 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 |
1112619 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 |
1112602 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 |
1112601 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 |
1112600 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1081895 | 199999 | 218355 | 1508176867088 | 4 | 1 | 17 | 1 | 13 | 1 |
660731 | 199999 | 161191 | 1507665351186 | 4 | 1 | 17 | 1 | 13 | 1 |
660732 | 199999 | 42223 | 1507665381186 | 4 | 1 | 17 | 1 | 13 | 1 |
211041 | 199999 | 123909 | 1507226987864 | 4 | 1 | 17 | 1 | 13 | 1 |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 |
1112623 rows × 9 columns
1.3 Read the test click log
# test click log
tst_click = pd.read_csv(path + 'testA_click_log.csv')
tst_click = tst_click.sort_values('user_id')
tst_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type |
---|---|---|---|---|---|---|---|---|---|
138222 | 200000 | 195839 | 1507030363999 | 4 | 1 | 17 | 1 | 17 | 1 |
138223 | 200000 | 191971 | 1507030393999 | 4 | 1 | 17 | 1 | 17 | 1 |
378656 | 200000 | 194300 | 1507651461280 | 4 | 1 | 17 | 1 | 17 | 1 |
138221 | 200001 | 175040 | 1507029536442 | 4 | 3 | 2 | 1 | 18 | 7 |
138219 | 200002 | 297906 | 1507029946064 | 4 | 1 | 17 | 1 | 8 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
155995 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 |
178071 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 |
178070 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 |
188451 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 |
0 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 |
518010 rows × 9 columns
1.4 Read the article metadata
# article metadata
item_df = pd.read_csv(path + 'articles.csv')
item_df = item_df.sort_values('article_id')
item_df
 | article_id | category_id | created_at_ts | words_count |
---|---|---|---|---|
0 | 0 | 0 | 1513144419000 | 168 |
1 | 1 | 1 | 1405341936000 | 189 |
2 | 2 | 1 | 1408667706000 | 250 |
3 | 3 | 1 | 1408468313000 | 230 |
4 | 4 | 1 | 1407071171000 | 162 |
... | ... | ... | ... | ... |
364042 | 364042 | 460 | 1434034118000 | 144 |
364043 | 364043 | 460 | 1434148472000 | 463 |
364044 | 364044 | 460 | 1457974279000 | 177 |
364045 | 364045 | 460 | 1515964737000 | 126 |
364046 | 364046 | 460 | 1505811330000 | 479 |
364047 rows × 4 columns
2. Data preprocessing
2.1 Rename the article_id column in item_df to match trn_click / tst_click for the later merge
# Rename the column so item_df matches trn_click / tst_click for the later merge
item_df = item_df.rename(columns={'article_id': 'click_article_id'})
item_df
 | click_article_id | category_id | created_at_ts | words_count |
---|---|---|---|---|
0 | 0 | 0 | 1513144419000 | 168 |
1 | 1 | 1 | 1405341936000 | 189 |
2 | 2 | 1 | 1408667706000 | 250 |
3 | 3 | 1 | 1408468313000 | 230 |
4 | 4 | 1 | 1407071171000 | 162 |
... | ... | ... | ... | ... |
364042 | 364042 | 460 | 1434034118000 | 144 |
364043 | 364043 | 460 | 1434148472000 | 463 |
364044 | 364044 | 460 | 1457974279000 | 177 |
364045 | 364045 | 460 | 1515964737000 | 126 |
364046 | 364046 | 460 | 1505811330000 | 479 |
364047 rows × 4 columns
2.2 Count each user's article clicks and add a click_cnts column
# Count each user's clicks and store the result in a new click_cnts column
trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')
trn_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts |
---|---|---|---|---|---|---|---|---|---|---|
1112620 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 |
1112619 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 |
1112602 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 |
1112601 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 |
1112600 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1081895 | 199999 | 218355 | 1508176867088 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
660731 | 199999 | 161191 | 1507665351186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
660732 | 199999 | 42223 | 1507665381186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
211041 | 199999 | 123909 | 1507226987864 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 | 11 |
1112623 rows × 10 columns
tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')
tst_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts |
---|---|---|---|---|---|---|---|---|---|---|
138222 | 200000 | 195839 | 1507030363999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 |
138223 | 200000 | 191971 | 1507030393999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 |
378656 | 200000 | 194300 | 1507651461280 | 4 | 1 | 17 | 1 | 17 | 1 | 3 |
138221 | 200001 | 175040 | 1507029536442 | 4 | 3 | 2 | 1 | 18 | 7 | 1 |
138219 | 200002 | 297906 | 1507029946064 | 4 | 1 | 17 | 1 | 8 | 1 | 7 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
155995 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
178071 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
178070 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
188451 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
0 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 |
518010 rows × 10 columns
2.3 Merge trn_click and tst_click with item_df
trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])  # join trn_click with item_df on click_article_id
trn_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 1508236945000 | 370 |
1 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 26 | 1508185091000 | 162 |
2 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 133 | 1508142585000 | 162 |
3 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 418 | 1508179909000 | 176 |
4 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 | 2 | 297 | 1507663321000 | 215 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1112618 | 199999 | 218355 | 1508176867088 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 352 | 1508155745000 | 202 |
1112619 | 199999 | 161191 | 1507665351186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 281 | 1507646579000 | 285 |
1112620 | 199999 | 42223 | 1507665381186 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 67 | 1507648195000 | 186 |
1112621 | 199999 | 123909 | 1507226987864 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 250 | 1507198955000 | 240 |
1112622 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 281 | 1506942089000 | 173 |
1112623 rows × 13 columns
tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])  # join tst_click with item_df on click_article_id
tst_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 200000 | 195839 | 1507030363999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 | 317 | 1507011388000 | 177 |
1 | 200000 | 191971 | 1507030393999 | 4 | 1 | 17 | 1 | 17 | 1 | 3 | 309 | 1507013094000 | 222 |
2 | 200000 | 194300 | 1507651461280 | 4 | 1 | 17 | 1 | 17 | 1 | 3 | 317 | 1507636150000 | 202 |
3 | 200001 | 175040 | 1507029536442 | 4 | 3 | 2 | 1 | 18 | 7 | 1 | 299 | 1506974928000 | 176 |
4 | 200002 | 297906 | 1507029946064 | 4 | 1 | 17 | 1 | 8 | 1 | 7 | 428 | 1506958329000 | 218 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
518005 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 431 | 1507037015000 | 222 |
518006 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 348 | 1507099489000 | 227 |
518007 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518008 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518009 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 281 | 1506912747000 | 259 |
518010 rows × 13 columns
2.4 Concatenate the training and test sets
# Concatenate the training and test sets (DataFrame.append is deprecated and was removed in pandas 2.0)
all_click = pd.concat([trn_click, tst_click])
all_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 157507 | 1508211702520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 1508236945000 | 370 |
1 | 0 | 30760 | 1508211672520 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 26 | 1508185091000 | 162 |
2 | 1 | 63746 | 1508211346889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 133 | 1508142585000 | 162 |
3 | 1 | 289197 | 1508211316889 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 418 | 1508179909000 | 176 |
4 | 2 | 168401 | 1508211468695 | 4 | 3 | 20 | 1 | 25 | 2 | 2 | 297 | 1507663321000 | 215 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
518005 | 249999 | 313431 | 1507052560685 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 431 | 1507037015000 | 222 |
518006 | 249999 | 214800 | 1507117287497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 348 | 1507099489000 | 227 |
518007 | 249999 | 233717 | 1507117257497 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518008 | 249999 | 233717 | 1507133510213 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 1507089657000 | 184 |
518009 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 281 | 1506912747000 | 259 |
1630633 rows × 13 columns
2.5 Normalize the timestamps
# Scale the timestamps to [0, 1] with MinMaxScaler for easier visualization
mm = MinMaxScaler()
all_click['click_timestamp'] = mm.fit_transform(all_click[['click_timestamp']])
all_click['created_at_ts'] = mm.fit_transform(all_click[['created_at_ts']])
all_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 157507 | 0.343719 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 0.992941 | 370 |
1 | 0 | 30760 | 0.343711 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 26 | 0.992790 | 162 |
2 | 1 | 63746 | 0.343622 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 133 | 0.992666 | 162 |
3 | 1 | 289197 | 0.343613 | 4 | 1 | 17 | 1 | 25 | 6 | 2 | 418 | 0.992775 | 176 |
4 | 2 | 168401 | 0.343655 | 4 | 3 | 20 | 1 | 25 | 2 | 2 | 297 | 0.991274 | 215 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
518005 | 249999 | 313431 | 0.025659 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 431 | 0.989453 | 222 |
518006 | 249999 | 214800 | 0.043419 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 348 | 0.989635 | 227 |
518007 | 249999 | 233717 | 0.043411 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 0.989606 | 184 |
518008 | 249999 | 233717 | 0.047871 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 375 | 0.989606 | 184 |
518009 | 249999 | 160974 | 0.000025 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 281 | 0.989092 | 259 |
1630633 rows × 13 columns
2.6 Sort the combined clicks by click_timestamp
all_click = all_click.sort_values('click_timestamp')
all_click
 | user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | click_cnts | category_id | created_at_ts | words_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
517912 | 249990 | 162300 | 0.000000 | 4 | 3 | 20 | 1 | 25 | 2 | 5 | 281 | 0.989186 | 193 |
517988 | 249998 | 160974 | 0.000002 | 4 | 1 | 12 | 1 | 13 | 2 | 5 | 281 | 0.989092 | 259 |
517867 | 249985 | 160974 | 0.000003 | 4 | 1 | 17 | 1 | 8 | 2 | 8 | 281 | 0.989092 | 259 |
517797 | 249979 | 162300 | 0.000004 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 281 | 0.989186 | 193 |
517875 | 249988 | 160974 | 0.000004 | 4 | 1 | 17 | 1 | 21 | 2 | 17 | 281 | 0.989092 | 259 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
496662 | 121158 | 224148 | 0.779105 | 4 | 1 | 17 | 1 | 13 | 2 | 54 | 354 | 0.997195 | 212 |
234746 | 70254 | 207672 | 0.860177 | 4 | 1 | 17 | 1 | 20 | 2 | 14 | 331 | 0.998288 | 242 |
234748 | 70254 | 96333 | 0.860185 | 4 | 1 | 17 | 1 | 20 | 2 | 14 | 209 | 0.998272 | 299 |
5953 | 2465 | 203538 | 0.999992 | 4 | 1 | 17 | 1 | 2 | 2 | 8 | 327 | 0.999741 | 275 |
5954 | 2465 | 145309 | 1.000000 | 4 | 1 | 17 | 1 | 2 | 2 | 8 | 269 | 1.000000 | 216 |
1630633 rows × 13 columns
3. Train an embedding for each article
3.1 Convert the click_article_id column to string
The input tokens for the word2vec model need to be strings.
all_click.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1630633 entries, 0 to 518009
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1630633 non-null int64
1 click_article_id 1630633 non-null int64
2 click_timestamp 1630633 non-null float64
3 click_environment 1630633 non-null int64
4 click_deviceGroup 1630633 non-null int64
5 click_os 1630633 non-null int64
6 click_country 1630633 non-null int64
7 click_region 1630633 non-null int64
8 click_referrer_type 1630633 non-null int64
9 click_cnts 1630633 non-null int64
10 category_id 1630633 non-null int64
11 created_at_ts 1630633 non-null float64
12 words_count 1630633 non-null int64
dtypes: float64(2), int64(11)
memory usage: 174.2 MB
# The ids can only be used for training after being converted to strings
all_click['click_article_id'] = all_click['click_article_id'].astype(str)
all_click.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1630633 entries, 0 to 518009
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1630633 non-null int64
1 click_article_id 1630633 non-null object
2 click_timestamp 1630633 non-null float64
3 click_environment 1630633 non-null int64
4 click_deviceGroup 1630633 non-null int64
5 click_os 1630633 non-null int64
6 click_country 1630633 non-null int64
7 click_region 1630633 non-null int64
8 click_referrer_type 1630633 non-null int64
9 click_cnts 1630633 non-null int64
10 category_id 1630633 non-null int64
11 created_at_ts 1630633 non-null float64
12 words_count 1630633 non-null int64
dtypes: float64(2), int64(10), object(1)
memory usage: 174.2+ MB
3.2 Turn each user's click sequence into a "sentence"
# Group each user's clicked article ids into a list, i.e. one "sentence" per user
docs = all_click.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()
docs
 | user_id | click_article_id |
---|---|---|
0 | 0 | [30760, 157507] |
1 | 1 | [289197, 63746] |
2 | 2 | [36162, 168401] |
3 | 3 | [50644, 36162] |
4 | 4 | [42567, 39894] |
... | ... | ... |
249995 | 249995 | [300470, 16129, 160974, 182394, 198659, 272143... |
249996 | 249996 | [160974] |
249997 | 249997 | [183665, 181686, 123909, 74719, 124667, 124337... |
249998 | 249998 | [160974, 202557, 237524, 236207, 235105] |
249999 | 249999 | [160974, 160417, 162338, 313431, 233717, 21480... |
250000 rows × 2 columns
docs = docs['click_article_id'].values.tolist()
docs
[['30760', '157507'],
['289197', '63746'],
['36162', '168401'],
['50644', '36162'],
['42567', '39894'],
['211442', '234481'],
['62464', '10023'],
['50644', '211442'],
['70986', '50644'],
['70986', '211442', '211455'],
['50644', '159195'],
['50644', '234481'],
['211442', '211455'],
['36162', '277107'],
['70986', '36162'],
['277107', '342473', '206415'],
['50644', '211442'],
['156279', '158331', '363916'],
['70986', '224730'],
['70986', '205824'],
['285433', '285300'],
['205958', '70758'],
['107014', '107190'],
['309535', '309311'],
['211442', '156279'],
...
['234481', '30760'],
['209122', '234308'],
['209122', '70986', '284470', '277712'],
['70986', '122152'],
...]
3.3 Train the Word2Vec model
# Set up logging so the training progress is visible
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
# These parameters strongly affect the learned vectors; negative sampling defaults to 5.
# To save time the model is trained for a single epoch, and vector_size is set to 16 for easier inspection (100 is a common choice).
w2v = Word2Vec(sentences=docs, sg=1, window=5, seed=2020, vector_size=16, workers=1, min_count=1, epochs=1)
# Store the embedding of every clicked article in a dict keyed by article id
item_w2v_emb_dict = {k: w2v.wv[k] for k in all_click['click_article_id']}
2023-01-29 01:59:03,467:INFO:collecting all words and their counts
2023-01-29 01:59:03,468:INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-01-29 01:59:03,475:INFO:PROGRESS: at sentence #10000, processed 25727 words, keeping 3473 word types
2023-01-29 01:59:03,483:INFO:PROGRESS: at sentence #20000, processed 53883 words, keeping 5811 word types
2023-01-29 01:59:03,492:INFO:PROGRESS: at sentence #30000, processed 84881 words, keeping 7676 word types
2023-01-29 01:59:03,501:INFO:PROGRESS: at sentence #40000, processed 118390 words, keeping 9297 word types
2023-01-29 01:59:03,511:INFO:PROGRESS: at sentence #50000, processed 154179 words, keeping 10844 word types
2023-01-29 01:59:03,523:INFO:PROGRESS: at sentence #60000, processed 192350 words, keeping 12357 word types
2023-01-29 01:59:03,536:INFO:PROGRESS: at sentence #70000, processed 233685 words, keeping 13473 word types
2023-01-29 01:59:03,551:INFO:PROGRESS: at sentence #80000, processed 281335 words, keeping 14939 word types
2023-01-29 01:59:03,565:INFO:PROGRESS: at sentence #90000, processed 329973 words, keeping 16420 word types
2023-01-29 01:59:03,578:INFO:PROGRESS: at sentence #100000, processed 379428 words, keeping 17904 word types
2023-01-29 01:59:03,593:INFO:PROGRESS: at sentence #110000, processed 431464 words, keeping 18928 word types
2023-01-29 01:59:03,611:INFO:PROGRESS: at sentence #120000, processed 489655 words, keeping 20157 word types
2023-01-29 01:59:03,629:INFO:PROGRESS: at sentence #130000, processed 550375 words, keeping 21588 word types
2023-01-29 01:59:03,649:INFO:PROGRESS: at sentence #140000, processed 613031 words, keeping 22923 word types
2023-01-29 01:59:03,669:INFO:PROGRESS: at sentence #150000, processed 678645 words, keeping 24209 word types
2023-01-29 01:59:03,691:INFO:PROGRESS: at sentence #160000, processed 749559 words, keeping 25743 word types
2023-01-29 01:59:03,714:INFO:PROGRESS: at sentence #170000, processed 831064 words, keeping 27232 word types
2023-01-29 01:59:03,738:INFO:PROGRESS: at sentence #180000, processed 914233 words, keeping 28612 word types
2023-01-29 01:59:03,766:INFO:PROGRESS: at sentence #190000, processed 1004976 words, keeping 29699 word types
2023-01-29 01:59:03,800:INFO:PROGRESS: at sentence #200000, processed 1112623 words, keeping 31116 word types
2023-01-29 01:59:03,824:INFO:PROGRESS: at sentence #210000, processed 1200577 words, keeping 31798 word types
2023-01-29 01:59:03,850:INFO:PROGRESS: at sentence #220000, processed 1285942 words, keeping 32381 word types
2023-01-29 01:59:03,878:INFO:PROGRESS: at sentence #230000, processed 1380836 words, keeping 33131 word types
...
2023-01-29 01:59:08,532:INFO:EPOCH 0 - PROGRESS: at 94.73% examples, 317616 words/s, in_qsize 1, out_qsize 0
2023-01-29 01:59:09,130:INFO:EPOCH 0: training on 1630633 raw words (1453015 effective words) took 4.7s, 309765 effective words/s
2023-01-29 01:59:09,131:INFO:Word2Vec lifecycle event {'msg': 'training on 1630633 raw words (1453015 effective words) took 4.7s, 309335 effective words/s', 'datetime': '2023-01-29T01:59:09.131480', 'gensim': '4.3.0', 'python': '3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 15:55:03) \n[GCC 10.4.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'train'}
2023-01-29 01:59:09,132:INFO:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=35380, vector_size=16, alpha=0.025>', 'datetime': '2023-01-29T01:59:09.132038', 'gensim': '4.3.0', 'python': '3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 15:55:03) \n[GCC 10.4.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'created'}
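As a quick sanity check (not part of the original notebook), the trained model can also be queried directly for the articles closest to a given one in embedding space; most_similar is part of gensim's KeyedVectors API, and '160974' is simply one of the article ids seen in the logs above.
# Illustrative sanity check: nearest neighbours of one article in the embedding space
print(w2v.wv.most_similar('160974', topn=5))  # returns (article_id, cosine similarity) pairs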
item_w2v_emb_dict
{'162300': array([-0.37774816, 1.3504976 , -0.23309161, 0.26072636, -0.23989849,
0.5004345 , -0.56577134, -0.5481367 , 0.2039964 , 0.8342705 ,
-0.48475933, -0.52761525, 0.20438878, 1.1823852 , -0.4367902 ,
0.5961195 ], dtype=float32),
'160974': array([-0.6204943 , 1.9116834 , -0.46083373, 0.40015092, -0.1149492 ,
0.16426347, -0.79475456, -0.5675412 , -0.11137734, 0.8231001 ,
-0.40696675, -0.41709152, 0.46059853, 1.4314909 , -0.43337965,
0.6168017 ], dtype=float32),
'158082': array([-0.31103644, 1.366509 , -0.15305053, 0.19239815, -0.22246826,
0.5307462 , -0.6013954 , -0.5462901 , 0.15414776, 0.7907959 ,
-0.47677982, -0.5615761 , 0.13641724, 1.0848166 , -0.49600208,
0.577961 ], dtype=float32),
'158536': array([-0.8572138 , 2.3342323 , -0.820582 , 0.59009516, -0.11867882,
-0.00443581, -0.90663487, -0.48073447, -0.21962236, 0.72721934,
-0.5553819 , -0.38393563, 0.44512793, 1.4385353 , -0.60131 ,
0.8454735 ], dtype=float32),
'300470': array([-3.0404758e-01, 1.6421673e+00, -3.3394170e-01, 1.7405626e-01,
-3.1160885e-01, 4.3005905e-01, -6.1486483e-01, -5.4003429e-01,
7.3214585e-04, 8.6553878e-01, -3.9799103e-01, -4.2683634e-01,
4.4759423e-01, 1.5222788e+00, -4.0337685e-01, 5.7117921e-01],
dtype=float32),
'59758': array([-0.525708 , 1.5093073 , -0.56673056, 0.20997894, -0.18883261,
0.15363155, -0.53620964, -0.47269717, -0.02236754, 0.80978376,
-0.3462906 , -0.26018006, 0.27204517, 1.4290353 , -0.47859445,
0.36089382], dtype=float32),
...
'63596': array([ 0.01696501, 0.02612317, 0.00530556, 0.03226124, 0.01429469,
-0.04120407, -0.01667616, -0.03267057, -0.00297171, -0.05766348,
-0.00219459, -0.02344081, 0.00532304, 0.0203535 , -0.03306687,
0.01739437], dtype=float32),
...}
3.4 Convert the embedding dictionary into a DataFrame
# Convert the embedding dictionary into a DataFrame (one row per article)
articles_embedding_list = []
for article_id, embedding in item_w2v_emb_dict.items():
    articles_embedding_list.append([article_id] + embedding.tolist())
articles_embedding_df = pd.DataFrame(articles_embedding_list, columns=['article_id'] + ['emb_' + str(i) for i in range(16)])
articles_embedding_df
 | article_id | emb_0 | emb_1 | emb_2 | emb_3 | emb_4 | emb_5 | emb_6 | emb_7 | emb_8 | emb_9 | emb_10 | emb_11 | emb_12 | emb_13 | emb_14 | emb_15 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 162300 | -0.377748 | 1.350498 | -0.233092 | 0.260726 | -0.239898 | 0.500435 | -0.565771 | -0.548137 | 0.203996 | 0.834270 | -0.484759 | -0.527615 | 0.204389 | 1.182385 | -0.436790 | 0.596120 |
1 | 160974 | -0.620494 | 1.911683 | -0.460834 | 0.400151 | -0.114949 | 0.164263 | -0.794755 | -0.567541 | -0.111377 | 0.823100 | -0.406967 | -0.417092 | 0.460599 | 1.431491 | -0.433380 | 0.616802 |
2 | 158082 | -0.311036 | 1.366509 | -0.153051 | 0.192398 | -0.222468 | 0.530746 | -0.601395 | -0.546290 | 0.154148 | 0.790796 | -0.476780 | -0.561576 | 0.136417 | 1.084817 | -0.496002 | 0.577961 |
3 | 158536 | -0.857214 | 2.334232 | -0.820582 | 0.590095 | -0.118679 | -0.004436 | -0.906635 | -0.480734 | -0.219622 | 0.727219 | -0.555382 | -0.383936 | 0.445128 | 1.438535 | -0.601310 | 0.845474 |
4 | 300470 | -0.304048 | 1.642167 | -0.333942 | 0.174056 | -0.311609 | 0.430059 | -0.614865 | -0.540034 | 0.000732 | 0.865539 | -0.397991 | -0.426836 | 0.447594 | 1.522279 | -0.403377 | 0.571179 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
35375 | 224148 | -0.106398 | 0.048206 | -0.058324 | -0.039949 | 0.049242 | 0.056162 | -0.010760 | 0.023729 | -0.016845 | 0.038499 | 0.012183 | 0.025061 | 0.074791 | -0.014461 | -0.007782 | 0.029464 |
35376 | 207672 | 0.009338 | 0.037096 | -0.042352 | 0.021171 | 0.027002 | -0.027749 | -0.028915 | 0.028978 | 0.000694 | 0.050991 | 0.010830 | -0.005212 | -0.018071 | 0.024465 | -0.003564 | 0.020691 |
35377 | 96333 | -0.096300 | 0.045616 | 0.019998 | 0.053210 | 0.046997 | 0.038450 | -0.027336 | 0.010665 | -0.006431 | 0.008583 | 0.032228 | 0.008812 | -0.002347 | 0.007475 | 0.037719 | -0.024010 |
35378 | 203538 | -0.046094 | 0.016830 | 0.036601 | -0.013213 | 0.057705 | -0.031858 | -0.010430 | 0.005571 | -0.005230 | -0.011425 | -0.007605 | -0.049261 | 0.001949 | -0.032922 | 0.013038 | 0.004562 |
35379 | 145309 | 0.003439 | 0.054341 | 0.054563 | -0.022907 | -0.028525 | -0.045272 | -0.046397 | 0.026955 | 0.015746 | 0.022889 | -0.009937 | -0.042705 | 0.000379 | 0.031564 | 0.058540 | -0.043099 |
35380 rows × 17 columns
3.5 Save the word2vec embedding of each article
articles_embedding_df.to_csv(save_path + 'articles_emb.csv', index=False, header=True)
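A minimal sketch (assuming the file and column names written above) of how the saved embeddings could later be read back into an id-to-vector lookup, e.g. for a downstream recall step:
# Sketch: reload the saved embeddings and rebuild an article_id -> vector lookup
emb_df = pd.read_csv(save_path + 'articles_emb.csv')
emb_cols = ['emb_' + str(i) for i in range(16)]
emb_lookup = dict(zip(emb_df['article_id'].astype(str), emb_df[emb_cols].values))
print(emb_lookup['160974'][:4])  # first few dimensions of one article's embedding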
4. Visualize the embeddings learned by word2vec
# Use the learned embeddings: cosine similarity between each pair of consecutively clicked articles
def get_item_sim_list(df):
    sim_list = []
    item_list = df['click_article_id'].values
    for i in range(0, len(item_list) - 1):
        emb1 = item_w2v_emb_dict[str(item_list[i])]  # note: word2vec was trained on string article ids
        emb2 = item_w2v_emb_dict[str(item_list[i + 1])]
        sim_list.append(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
    sim_list.append(0)  # pad so the list has one entry per click
    return sim_list
# Randomly pick 5 users and look at how similar their consecutively viewed articles are
sub_user_ids = np.random.choice(all_click.user_id.unique(), size=5, replace=False)
sub_user_info = all_click[all_click['user_id'].isin(sub_user_ids)]
for _, user_df in sub_user_info.groupby('user_id'):
    item_sim_list = get_item_sim_list(user_df)
    print("item_sim_list = ", item_sim_list)
    plt.plot(item_sim_list)
item_sim_list = [0.92883706, 0]
item_sim_list = [0.9124602, 0]
item_sim_list = [0.9199342, 0.96343744, 0]
item_sim_list = [0.9781094, 0.96341544, 0.97788256, 0.9595503, 0.8217939, 0.72416514, 0.9395525, 0.7885189, 0.9457342, 0.88077354, 0.97478765, 0.9660947, 0.98741436, 0.741313, 0.87396616, 0.929134, 0]
item_sim_list = [0.96066815, 0.9530401, 0.6379823, 0.9544767, 0.97723705, 0.9679809, 0.9853506, 0.7811122, 0.75035423, 0]
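If the overlapping curves are hard to tell apart, a small variation of the plotting loop above (a sketch, not part of the original run) labels each sampled user's line and adds axis titles:
# Sketch: the same per-user similarity curves, with labels and axis titles
for user_id, user_df in sub_user_info.groupby('user_id'):
    plt.plot(get_item_sim_list(user_df), label='user ' + str(user_id))
plt.xlabel('click position')
plt.ylabel('cosine similarity with the next clicked article')
plt.legend()
plt.show()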