import gensim
import numpy as np
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Read and preprocess the data.
# NOTE(review): each open() targets a single review file, so readlines()
# yields a one-element list per corpus — presumably the full aclImdb
# folders are intended; confirm before training for real.
with open('F:\\PythonJob\\ModernSentimentAnalysis\\aclImdb\\train\\pos\\0_9.txt', 'r') as infile:
    pos_reviews = infile.readlines()
with open('F:\\PythonJob\\ModernSentimentAnalysis\\aclImdb\\train\\neg\\0_3.txt', 'r') as infile:
    neg_reviews = infile.readlines()
with open('F:\\PythonJob\\ModernSentimentAnalysis\\aclImdb\\train\\unsup\\0_0.txt', 'r') as infile:
    unsup_reviews = infile.readlines()
# 1 = positive sentiment, 0 = negative sentiment.
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))
# Hold out 20% of the labelled reviews for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2)
# test_size: fraction of the samples held out for testing (here 0.2)
# Light preprocessing.
def cleanText(corpus):
    """Simple cleaning for English text; adapt as needed for Chinese.

    Lowercases each document, strips newlines and HTML line breaks,
    pads punctuation with spaces so each mark becomes its own token,
    and whitespace-tokenizes.

    :param corpus: iterable of raw review strings
    :return: list of token lists, one per input document
    """
    punctuation = """.,?!:;(){}[]"""
    # BUG FIX: the original used replace('n', ''), which deleted every
    # letter 'n' from the text (e.g. "not" -> "ot"); the intent was to
    # remove newline characters.
    corpus = [z.lower().replace('\n', '') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]
    # Treat each punctuation mark as a separate word.
    for c in punctuation:
        corpus = [z.replace(c, ' %s ' % c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus
# Run the same cleaning pass over all three corpora.
x_train, x_test, unsup_reviews = (
    cleanText(corpus) for corpus in (x_train, x_test, unsup_reviews)
)
# Gensim's Doc2Vec requires each document/sentence to carry a unique label.
# We use gensim's LabeledSentence; labels look like "TRAIN_i" / "TEST_i",
# where i is the document's index.
def labelizeReviews(reviews, label_type):
    """Wrap each tokenized review in a LabeledSentence tagged '<label_type>_<index>'."""
    return [LabeledSentence(words, ['%s_%s' % (label_type, index)])
            for index, words in enumerate(reviews)]
# Attach the unique "<SPLIT>_<i>" labels Doc2Vec training requires.
x_train, x_test, unsup_reviews = (
    labelizeReviews(docs, tag)
    for docs, tag in ((x_train, 'TRAIN'),
                      (x_test, 'TEST'),
                      (unsup_reviews, 'UNSUP'))
)
# At this point x_train / x_test / unsup_reviews are lists of LabeledSentence objects.
import random
size = 400  # dimensionality of the document vectors
# Instantiate the DM and DBOW models.
model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3,
                                 negative=5, workers=3)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3,
                                   negative=5, dm=0, workers=3)
# Build the vocabulary over all reviews.
# BUG FIX: np.concatenate coerces the LabeledSentence objects into plain
# numpy arrays, which makes build_vocab fail with
# "AttributeError: 'numpy.ndarray' object has no attribute 'words'".
# Plain list concatenation keeps the LabeledSentence objects intact.
all_reviews = x_train + x_test + unsup_reviews
model_dm.build_vocab(all_reviews)
model_dbow.build_vocab(all_reviews)
# Error previously observed when np.concatenate was used to merge the
# LabeledSentence lists before build_vocab:
#
#   AttributeError                            Traceback (most recent call last)
#   ---->  model_dm.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))
#   ...gensim/models/word2vec.pyc in build_vocab(...)
#       self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)
#   ...gensim/models/doc2vec.pyc in scan_vocab(...)
#       if isinstance(document.words, string_types):
#   AttributeError: 'numpy.ndarray' object has no attribute 'words'
#
# Cause: build_vocab expects each document to expose a .words list, but
# np.concatenate flattened the LabeledSentence objects into bare arrays.