word2vec词向量最后转成统一长度的电影item唯一表示
1、 分词、训练
# coding:utf-8
import gensim
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec
# Alias for gensim's TaggedDocument class (spelling "TaggededDocument" kept as-is;
# used later when building a Doc2Vec training corpus).
TaggededDocument = gensim.models.doc2vec.TaggedDocument
# Absolute, machine-specific path to the stop-word list file.
# NOTE(review): hard-coded local path — consider making it configurable.
path = r'/Users/lonng/Desktop/v+/呆萌的停用词表.txt'
import jieba
import jieba.analyse
# Register the stop-word list with jieba's keyword-extraction module so
# extracted keywords exclude these words.
jieba.analyse.set_stop_words(path)
import pandas as pd
import pymysql
# Load the cleaned movie-metadata CSV; first column is the index, NaNs become ''.
# NOTE(review): this path is garbled/truncated ('。。。。。' placeholder) — restore
# the real file path before running.
df_all = pd.read_csv('/User。。。。。eaning_data10.csv',index_col=0).fillna('')
###################### 创建停用词列表 ######################
def stopwordslist(stopwords_path=None):
    """Load the stop-word list and return it as a list of stripped lines.

    Args:
        stopwords_path: Optional path to a UTF-8 stop-word file, one word per
            line. Defaults to the module-level ``path`` for backward
            compatibility with existing no-argument callers.

    Returns:
        list[str]: each line of the file with surrounding whitespace stripped
        (blank lines yield empty strings, matching the original behavior).
    """
    if stopwords_path is None:
        stopwords_path = path
    # Use a context manager so the file handle is closed even on error
    # (the original left the handle open — a resource leak).
    with open(stopwords_path, encoding='UTF-8') as f:
        return [line.strip() for line in f]
def combine(x):
sentence_depart = jieba.cut(x['title'] + x['language']+ x['area']\
+ x['director']+ x['crew_name']+x["describe"] \
+ x['type']+ x['video_type']\
+ x['