# Data:
# Link: https://pan.baidu.com/s/1QvFWU1985u0sVDVJkWTr0w?pwd=myxm  extraction code: myxm
# (Copy this link and open it in the Baidu Netdisk mobile app for easier access.)
# 1
import numpy as np
import re
import string
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim import utils
from nltk.corpus import stopwords
def textClean(text):
    """Normalize raw text: keep only allowed characters, lowercase,
    and drop English stopwords.

    Returns the cleaned text as a single space-joined string.
    """
    # Replace every character outside the allowed set with a space.
    cleaned = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    tokens = cleaned.lower().split()
    # NOTE: stopword set is rebuilt on every call; fine for small data.
    stop_set = set(stopwords.words("english"))
    kept = [tok for tok in tokens if tok not in stop_set]
    return " ".join(kept)
def cleanup(text):
    """Fully clean *text*: run textClean, then strip all punctuation."""
    without_stops = textClean(text)
    # str.translate with a deletion table removes every punctuation char
    # in one C-level pass.
    return without_stops.translate(str.maketrans("", "", string.punctuation))
def constructTaggedDocuments(data):
    """Build a list of gensim TaggedDocument objects from a pandas Series
    of text rows, for training a Doc2Vec model.

    Each row is whitespace-tokenized and tagged 'Text_<index>' using the
    Series index.

    Parameters
    ----------
    data : pandas.Series
        Series of (already cleaned) text strings.

    Returns
    -------
    list[TaggedDocument]
    """
    sentences = []
    # FIX: Series.iteritems() was deprecated in pandas 1.5 and removed in
    # pandas 2.0; Series.items() is the equivalent, version-safe API.
    for index, row in data.items():
        sentences.append(
            TaggedDocument(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)])
        )
    return sentences
def getEmbeddings(path,vector_dimension=300):
data = pd.read_csv(path)
missing_rows = []
for i in range(len(data)):
if data.loc[i, 'text'] != data.loc[i, 'text']:
missing_rows.append(i)
data = data.drop(missing_rows).reset_index().drop(['index','id'],axis=1)
for i in range(len(data)):
data.loc[i, 'text'] = cleanup(data.loc[i,'text'])
x = constructTaggedDocuments(data['text'])
y = data['label'].values
text_model = Doc2Vec(min_count=1, window=