Python Sentiment Prediction (Part 3)

Classify each review's sentiment as positive or negative:


#coding=utf-8

"""
Use positive and negative review set as corpus to train a sentiment classifier.
This module use labeled positive and negative reviews as training set, then use nltk scikit-learn api to do classification task.
Aim to train a classifier automatically identifiy review's positive or negative sentiment, and use the probability as review helpfulness feature.

"""

from Preprocessing_module import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score


# 1. Load positive and negative review data
pos_review = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\pos_review.xlsx", 1, 1)
neg_review = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\neg_review.xlsx", 1, 1)

pos = pos_review
neg = neg_review


"""
# Cut the positive reviews so that they match the number of negative reviews (optional)

shuffle(pos_review)
size = int(len(pos_review)/2 - 18)

pos = pos_review[:size]
neg = neg_review

"""


# 2. Feature extraction function
# 2.1 Use all words as features
def bag_of_words(words):
    return dict([(word, True) for word in words])
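# Example (hypothetical tokens): bag_of_words([u'手机', u'不错'])
# returns {u'手机': True, u'不错': True}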


# 2.2 Use bigrams as features (use chi-square to choose the top 200 bigrams)
def bigrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(bigrams)


# 2.3 Use words and bigrams as features (use chi-square to choose the top 200 bigrams)
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
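# Note: the resulting feature dict mixes single words with bigram tuples,
# e.g. a key may be u'不错' or (u'质量', u'不错') (hypothetical tokens).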


# 2.4 Use chi_sq to find the most informative features of the reviews
# 2.4.1 First, compute an information score for each word or bigram
def create_word_scores():
    posdata = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
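    # BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx) follows NLTK's contingency convention:
    # n_ii = the word's frequency in one class, n_ix = its frequency over both classes,
    # n_xi = the number of words in that class, n_xx = the total number of words.
    # A word that scores high for either class is treated as informative.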
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

# Combine words and bigrams and compute their information scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel(r"D:\tomcat\review_protection\Feature_extraction_module\Sentiment_features\Machine learning features\seniment review set\neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

# Choose a word_scores extraction method
word_scores = create_word_scores()
# word_scores = create_bigram_scores()
# word_scores = create_word_bigram_scores()


# 2.4.2 Second, extract the most informative words or bigrams based on the information score
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return best_words

# 2.4.3 Third, use the most informative words and bigrams as machine learning features
# Use chi_sq to find most informative words of the review
def best_word_features(words):
    return dict([(word, True) for word in words if word in best_words])

# Use chi_sq to find most informative bigrams of the review
def best_word_features_bi(words):
    return dict([(word, True) for word in nltk.bigrams(words) if word in best_words])

# Use chi_sq to find most informative words and bigrams of the review
def best_word_features_com(words):
    d1 = dict([(word, True) for word in words if word in best_words])
    d2 = dict([(word, True) for word in nltk.bigrams(words) if word in best_words])
    d3 = dict(d1, **d2)
    return d3



# 3. Transform each review into a feature set and attach its sentiment label
def pos_features(feature_extraction_method):
    posFeatures = []
    for i in pos:
        posWords = [feature_extraction_method(i), 'pos']
        posFeatures.append(posWords)
    return posFeatures
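# Each element of posFeatures/negFeatures is a [featureset, label] pair,
# e.g. [{u'不错': True}, 'pos'], which unpacks into the (featureset, label)
# format that SklearnClassifier.train expects.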

def neg_features(feature_extraction_method):
    negFeatures = []
    for j in neg:
        negWords = [feature_extraction_method(j), 'neg']
        negFeatures.append(negWords)
    return negFeatures


best_words = find_best_words(word_scores, 1500) # Set the feature dimension and initialize the most informative words

# posFeatures = pos_features(bigrams)
# negFeatures = neg_features(bigrams)

# posFeatures = pos_features(bigram_words)
# negFeatures = neg_features(bigram_words)

posFeatures = pos_features(best_word_features)
negFeatures = neg_features(best_word_features)

# posFeatures = pos_features(best_word_features_com)
# negFeatures = neg_features(best_word_features_com)



# 4. Train classifiers and examine classification accuracy
# Shuffle the feature sets into random order
shuffle(posFeatures)
shuffle(negFeatures)

# 75% of the features are used as the training set (cross-validation would be a better approach)
size_pos = int(len(pos_review) * 0.75)
size_neg = int(len(neg_review) * 0.75)

train_set = posFeatures[:size_pos] + negFeatures[:size_neg]
test_set = posFeatures[size_pos:] + negFeatures[size_neg:]

test, tag_test = zip(*test_set)

def clf_score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)

    predict = classifier.batch_classify(test)
    return accuracy_score(tag_test, predict)
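# Note: batch_classify is the NLTK 2.x API; in NLTK 3 it was renamed to classify_many.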

print 'BernoulliNB`s accuracy is %f' %clf_score(BernoulliNB())
#print 'GaussianNB`s accuracy is %f' %clf_score(GaussianNB())
print 'MultinomialNB`s accuracy is %f' %clf_score(MultinomialNB())
print 'LogisticRegression`s accuracy is %f' %clf_score(LogisticRegression())
print 'SVC`s accuracy is %f' %clf_score(SVC(gamma=0.001, C=100., kernel='linear'))
print 'LinearSVC`s accuracy is %f' %clf_score(LinearSVC())
print 'NuSVC`s accuracy is %f' %clf_score(NuSVC())



# 5. After finding the best classifier, check classification accuracy at different feature dimensions
def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)

    pred = classifier.batch_classify(test)
    return accuracy_score(tag_test, pred)

dimension = ['500', '1000', '1500', '2000', '2500', '3000']

for d in dimension:
    word_scores = create_word_bigram_scores()
    best_words = find_best_words(word_scores, int(d))

    posFeatures = pos_features(best_word_features_com)
    negFeatures = neg_features(best_word_features_com)

    # Shuffle the feature sets into random order
    shuffle(posFeatures)
    shuffle(negFeatures)

    # 75% of the features are used as the training set (cross-validation would be a better approach)
    size_pos = int(len(pos_review) * 0.75)
    size_neg = int(len(neg_review) * 0.75)

    trainset = posFeatures[:size_pos] + negFeatures[:size_neg]
    testset = posFeatures[size_pos:] + negFeatures[size_neg:]

    test, tag_test = zip(*testset)

    print 'BernoulliNB`s accuracy is %f' %score(BernoulliNB())
    print 'MultinomialNB`s accuracy is %f' %score(MultinomialNB())
    print 'LogisticRegression`s accuracy is %f' %score(LogisticRegression())
    print 'SVC`s accuracy is %f' %score(SVC())
    print 'LinearSVC`s accuracy is %f' %score(LinearSVC())
    print 'NuSVC`s accuracy is %f' %score(NuSVC())
    print 



# 6. Store the best classifier trained with the best feature dimension
def store_classifier(clf, trainset, filepath):
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)
    # Use pickle to store the trained classifier (binary mode for portability)
    pickle.dump(classifier, open(filepath, 'wb'))
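# Example call (hypothetical file name):
# store_classifier(LogisticRegression(), train_set, 'sentiment_classifier.pkl')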

The experimental results are:

BernoulliNB`s accuracy is 0.726368
MultinomialNB`s accuracy is 0.746269
LogisticRegression`s accuracy is 0.756219
SVC`s accuracy is 0.679104
LinearSVC`s accuracy is 0.721393
NuSVC`s accuracy is 0.751244
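
Once the classifier has been pickled, the positive-sentiment probability mentioned in the module docstring can be computed for a new review. The lines below are a minimal sketch rather than part of the original script: the file name sentiment_classifier.pkl and the example tokens are assumptions, and probability output requires the wrapped scikit-learn estimator to support predict_proba (e.g. LogisticRegression).

# Minimal usage sketch (assumed file name and hypothetical tokens)
clf = pickle.load(open('sentiment_classifier.pkl', 'rb'))  # load the stored SklearnClassifier
review = [u'手机', u'质量', u'不错']                        # hypothetical segmented review tokens
feats = best_word_features(review)                          # same feature extraction as in training
prob_dist = clf.prob_classify(feats)                        # probability distribution over 'pos'/'neg'
print prob_dist.prob('pos')                                 # positive-sentiment probability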