上篇使用 CountVectorizer 对文本做向量化(embedding),但这种方式忽略了词语在上下文中的语义,因此这里改用 word2vec。
word2vec训练词向量。
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from gensim.models.word2vec import Word2vec
def load_dataset(name, nrows=None):
    """Load one of the known tab-separated datasets into a DataFrame.

    Args:
        name: Dataset key; one of "unlabeled_train", "labeled_train" or
            "test".
        nrows: Maximum number of rows to read. ``None`` reads the whole file.

    Returns:
        A pandas DataFrame parsed from the matching file under ``../data``
        (resolved relative to the current working directory).

    Raises:
        ValueError: If ``name`` is not one of the known dataset keys.
    """
    datasets = {
        # NOTE(review): this file name lacks the "ed" present in its key
        # ("unlabeled_train") -- confirm it matches the file on disk.
        "unlabeled_train": "unlabelTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv",
    }
    if name not in datasets:
        raise ValueError(
            f"unknown dataset {name!r}; expected one of {sorted(datasets)}"
        )
    data_file = os.path.join("..", "data", datasets[name])
    # The original passed `data_File` (capital F) here, which is undefined and
    # raised NameError on every call with a valid name.
    return pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
读入无标签数据
用于训练生成word2vec词向量
# Load the unlabeled reviews; per the note above, they are the corpus used to
# train the word2vec word vectors.
df = load_dataset('unlabeled_train')
eng_stopwords = {}.fromkeys([ line.rstrip() for line in open