Text Sentiment Analysis: Bag of Words Meets Bags of Popcorn

Import the required libraries

import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

import nltk
#nltk.download()

from nltk.corpus import stopwords

Read the training data with pandas

datafile = os.path.join('..', 'data', 'labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))

df.head()
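
A quick sanity check on the labels before cleaning; this is an added sketch, assuming the standard Kaggle file where the sentiment column holds 0/1 labels:

# Class balance: the Kaggle training set has 25,000 reviews,
# split evenly between positive (1) and negative (0)
df.sentiment.value_counts()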

Preprocess the review data. The pipeline has roughly these steps:

  1. Strip HTML tags
  2. Remove punctuation
  3. Split into words/tokens
  4. Remove stopwords
  5. Rejoin the words into a new sentence

def display(text, title):
    print(title)
    print("\n---------- separator ----------\n")
    print(text)

raw_example = df['review'][1]

display(raw_example, 'Raw data')

example = BeautifulSoup(raw_example, 'html.parser').get_text()
display(example, 'HTML tags removed')

# Keep letters only; note this also drops digits, not just punctuation
example_letters = re.sub(r'[^a-zA-Z]', ' ', example)

display(example_letters, 'Punctuation removed')

words = example_letters.lower().split()

display(words, 'Plain list of words')

# Uncomment to download stopwords and other corpora used later

#nltk.download()

#words_nostop = [w for w in words if w not in stopwords.words('english')]
# Note: this assignment shadows the nltk.corpus.stopwords module imported above
stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])
words_nostop = [w for w in words if w not in stopwords]

display(words_nostop, 'Stopwords removed')

#eng_stopwords = set(stopwords.words('english'))
eng_stopwords = set(stopwords)
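
If ../stopwords.txt is not available, the commented lines above point to the NLTK alternative; a minimal sketch (the stopwords corpus must be downloaded once first):

# Alternative: NLTK's built-in English stopword list
# (run nltk.download('stopwords') once beforehand)
from nltk.corpus import stopwords as nltk_stopwords
eng_stopwords = set(nltk_stopwords.words('english'))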


def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]

    return ' '.join(words)

clean_text(raw_example)

Add the cleaned reviews to the DataFrame

df['clean_review'] = df.review.apply(clean_text)

df.head()

Extract bag-of-words features (with sklearn's CountVectorizer)

vectorizer = CountVectorizer(max_features=5000)
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()

train_data_features.shape
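
To sanity-check what the vectorizer learned, peek at the vocabulary. A small sketch (get_feature_names_out needs scikit-learn >= 1.0; older versions use get_feature_names):

# Inspect the learned vocabulary, capped at 5000 tokens by max_features
vocab = vectorizer.get_feature_names_out()
print(len(vocab))    # 5000
print(vocab[:10])    # first tokens, in alphabetical order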

Train a classifier

forest = RandomForestClassifier(n_estimators=100)

forest = forest.fit(train_data_features, df.sentiment)
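
As an aside not in the original walkthrough, a random forest can report an out-of-bag accuracy estimate for free, hinting at generalization without a separate validation split:

# Out-of-bag estimate: each tree is scored on the samples it never saw while bagging
forest_oob = RandomForestClassifier(n_estimators=100, oob_score=True)
forest_oob.fit(train_data_features, df.sentiment)
print(forest_oob.oob_score_)   # accuracy estimated from out-of-bag samples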

Run predict on the training set to see how well it fits

confusion_matrix(df.sentiment, forest.predict(train_data_features))
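
A random forest fits its own training set almost perfectly, so this confusion matrix overstates real performance; cross-validation gives a more honest estimate. A minimal sketch with the objects defined above:

# 5-fold cross-validated accuracy (refits fresh copies of the forest internally)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(forest, train_data_features, df.sentiment, cv=5)
print(scores.mean())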

Delete variables we no longer need to free memory

del df

del train_data_features

Read in the test data and predict

datafile = os.path.join('..', 'data', 'testData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df['clean_review'] = df.review.apply(clean_text)

df.head()

test_data_features = vectorizer.transform(df.clean_review).toarray()

test_data_features.shape
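
Calling .toarray() materializes a dense 25,000 × 5,000 matrix; scikit-learn's tree ensembles also accept scipy sparse input directly, which saves a good deal of memory. A sketch under that assumption:

# Sparse variant: keep the CSR matrix from transform() instead of densifying it
X_test_sparse = vectorizer.transform(df.clean_review)
result_sparse = forest.predict(X_test_sparse)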

result = forest.predict(test_data_features)

output = pd.DataFrame({'id':df.id, 'sentiment':result})

output.head()

output.to_csv(os.path.join('..', 'data', 'Bag_of_Words_model.csv'), index=False)

del df
del test_data_features



