Modern sentiment analysis: beginner here, code throws an error, looking for pointers

import sys
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
import gensim

from sklearn.model_selection import train_test_split  # sklearn.cross_validation is the old name, removed in newer scikit-learn
import numpy as np
## Read and preprocess the data
with open('F:\\PythonJob\\ModernSentimentAnalysis\\aclImdb\\train\\pos\\0_9.txt','r') as infile:
    pos_reviews = infile.readlines()
with open('F:\\PythonJob\\ModernSentimentAnalysis\\aclImdb\\train\\neg\\0_3.txt','r') as infile:
    neg_reviews = infile.readlines()
with open('F:\\PythonJob\\ModernSentimentAnalysis\\aclImdb\\train\\unsup\\0_0.txt','r') as infile:
    unsup_reviews = infile.readlines()
# 1 represents positive sentiment, 0 represents negative sentiment
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))
x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_reviews, neg_reviews)), y,test_size=0.2)

(test_size is the fraction of the data held out for testing; 0.2 gives an 80/20 train/test split.)

# 零星的预处理
def cleanText(corpus): # simple cleanup for English text; adapt as needed for other languages such as Chinese
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n', '') for z in corpus] # lowercase everything and strip newlines
    corpus = [z.replace('<br />', ' ') for z in corpus]


    for c in punctuation:  # treat each punctuation mark as its own token
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus
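
As a sanity check, here is what cleanText should produce on a made-up sample review (the sentence is my own example, not taken from the IMDB data):

sample = ["This movie was great!<br />I loved it.\n"]
print(cleanText(sample))
# expected: [['this', 'movie', 'was', 'great', '!', 'i', 'loved', 'it', '.']]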
 
x_train = cleanText(x_train)
x_test = cleanText(x_test)

unsup_reviews = cleanText(unsup_reviews)

# Gensim's Doc2Vec requires every document/sentence to carry a unique label for training.
# We use Gensim's built-in LabeledSentence. Labels take the form "TRAIN_i" and "TEST_i", where i is a running index.
def labelizeReviews(reviews, label_type):
    labelized = []
    for i,v in enumerate(reviews):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
 
x_train = labelizeReviews(x_train, 'TRAIN')
x_test = labelizeReviews(x_test, 'TEST')
unsup_reviews = labelizeReviews(unsup_reviews, 'UNSUP')

# This turns each review into a LabeledSentence object
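
If I print one entry, each element should pair a token list with a one-element tag list, roughly like this (attribute names as in my gensim version's LabeledSentence; the values are just illustrative):

print(x_train[0].words[:5])  # e.g. ['this', 'movie', 'was', 'great', '!']
print(x_train[0].tags)       # e.g. ['TRAIN_0']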

import random
 
size = 400
 
# Instantiate the DM and DBOW models (dm=0 selects the DBOW variant)
model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3)

model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3)

# Build the vocabulary over all reviews
model_dm.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))

model_dbow.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))

Error message:

AttributeError                            Traceback (most recent call last)
<ipython-input-68-bf1d90ddcc42> in <module>()
      1 # Build the vocabulary over all reviews
----> 2 model_dm.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))
      3 model_dbow.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))

C:\Users\joh\Anaconda2\lib\site-packages\gensim\models\word2vec.pyc in build_vocab(self, sentences, keep_raw_vocab, trim_rule, progress_per, update)
    629         Each sentence must be a list of unicode strings.
    630         """
--> 631         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
    632         # trim by min_count & precalculate downsampling
    633         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)

C:\Users\joh\Anaconda2\lib\site-packages\gensim\models\doc2vec.pyc in scan_vocab(self, documents, progress_per, trim_rule, update)
    696         for document_no, document in enumerate(documents):
    697             if not checked_string_types:
--> 698                 if isinstance(document.words, string_types):
    699                     logger.warning(
    700                         "Each 'words' should be a list of words (usually unicode strings). "

AttributeError: 'numpy.ndarray' object has no attribute 'words'
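
My own guess after some searching, which I have not verified: since LabeledSentence is a (words, tags) namedtuple, np.concatenate treats each one as a sequence and converts everything into plain ndarrays, which no longer carry a .words attribute. Would joining the plain Python lists instead be the right fix? Something like:

# suspected fix (untested): join the lists without going through numpy,
# so the LabeledSentence objects stay intact
all_docs = x_train + x_test + unsup_reviews
model_dm.build_vocab(all_docs)
model_dbow.build_vocab(all_docs)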
