文本情感分析——Bag of Words Meets Bags of Popcorn

最新推荐文章于 2020-03-21 18:01:01 发布

chenjiwei064196

最新推荐文章于 2020-03-21 18:01:01 发布

阅读量443

点赞数 1

分类专栏：项目经历

本文链接：https://blog.csdn.net/chenjiwei064196/article/details/79630096

版权

项目经历专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import所需库

import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

import nltk
#nltk.download()

from nltk.corpus import stopwords

用pandas读入训练数据

# Load the labelled training reviews: a tab-separated file in which a
# backslash acts as the escape character.
datafile = os.path.join('..', 'data', 'labeledTrainData.tsv')
df = pd.read_csv(
    datafile,
    escapechar='\\',
    sep='\t',
)
print('Number of reviews: {}'.format(df.shape[0]))

# Notebook-style peek at the first rows.
df.head()

对影评数据做预处理，大概有以下环节：

去掉html标签
移除标点
切分成词/token
去掉停用词
重组为新的句子

def display(text, title):
    """Print a titled section: the title, a separator line, then the content.

    Args:
        text: The content to show. It is passed straight to ``print``, so
            any printable object (a string, a list of tokens, ...) works.
        title: Heading printed before the separator.
    """
    print(title)
    # The separator text is runtime output and is kept verbatim.
    print("\n----------我是分割线-------------\n")

    print(text)

# Walk a single review (label 1 in the 'review' column) through each
# preprocessing stage, printing the intermediate result after every step.
raw_example = df['review'][1]

display(raw_example, '原始数据')

# Stage 1: parse the review as HTML and keep only its text content.
example = BeautifulSoup(raw_example, 'html.parser').get_text()
display(example, '去掉HTML标签的数据')

# Stage 2: replace every character that is not an ASCII letter with a space
# (this removes punctuation and digits alike).
example_letters = re.sub(r'[^a-zA-Z]', ' ', example)

display(example_letters, '去掉标点的数据')

# Stage 3: lowercase, then split on whitespace into a list of tokens.
words = example_letters.lower().split()

display(words, '纯词列表数据')

# Stage 4: remove stopwords.
# The NLTK corpus (stopwords.words('english'), which requires a prior
# nltk.download()) is not used here; the list is read from a local text file
# with one stopword per line.
# NOTE: this rebinds the name `stopwords`, shadowing the object imported from
# nltk.corpus earlier in the script.
# The file is opened in a `with` block so the handle is closed as soon as the
# list has been read.
with open('../stopwords.txt') as stopword_file:
    stopwords = dict.fromkeys([line.rstrip() for line in stopword_file])
words_nostop = [w for w in words if w not in stopwords]

display(words_nostop, '去掉停用词数据')

# Set of stopwords used by clean_text for membership tests.
eng_stopwords = set(stopwords)

def clean_text(text):
    """Normalise one raw review into a space-separated string of kept words.

    The steps mirror the single-review walkthrough in this script:
    strip HTML markup, replace every non-ASCII-letter character with a
    space, lowercase and split on whitespace, drop every token found in the
    module-level ``eng_stopwords`` set, then re-join with single spaces.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The cleaned review as a single string (empty if no token survives).
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]

    return ' '.join(words)

# Notebook-style sanity check: show the cleaned form of the example review.
clean_text(raw_example)

清洗数据添加到dataframe里

# Clean every training review and store the result in a new column.
df['clean_review'] = df['review'].apply(clean_text)

# Notebook-style peek at the first rows, now including 'clean_review'.
df.head()

抽取bag of words特征(用sklearn的CountVectorizer)

# Bag-of-words features: the vectorizer is configured with max_features = 5000,
# which caps the size of the vocabulary it builds.
vectorizer = CountVectorizer(max_features = 5000)
# Learn the vocabulary from the cleaned training reviews, produce the count
# matrix, and convert it to a dense array via .toarray().
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()

# Notebook-style check of the feature matrix shape.
train_data_features.shape

训练分类器

# Random forest classifier configured with n_estimators = 100.
# NOTE(review): no random_state is set, so results may differ between runs.
forest = RandomForestClassifier(n_estimators = 100)

# Fit on the bag-of-words features against the 'sentiment' column.
forest = forest.fit(train_data_features, df.sentiment)

在训练集上做个predict看看效果如何

# Confusion matrix of true vs. predicted sentiment on the TRAINING data itself;
# this measures fit to the training set, not generalisation.
confusion_matrix(df.sentiment, forest.predict(train_data_features))

删除不用的占内存变量

# Drop the training frame and its dense feature matrix before the test data
# is loaded, so both are not held in memory at once.
del df, train_data_features

读取测试数据进行预测

# Load the test reviews (same tab-separated format as the training file)
# and apply the same cleaning function to them.
datafile = os.path.join('..', 'data', 'testData.tsv')
df = pd.read_csv(
    datafile,
    escapechar='\\',
    sep='\t',
)
print('Number of reviews: {}'.format(df.shape[0]))
df['clean_review'] = df['review'].apply(clean_text)

df.head()

# Reuse the vectorizer fitted on the training reviews: transform only, no refit.
test_data_features = vectorizer.transform(df['clean_review']).toarray()

test_data_features.shape

# Predict a sentiment label for every test review.
result = forest.predict(test_data_features)

# Pair each review id with its predicted sentiment.
output = pd.DataFrame({'id': df['id'], 'sentiment': result})

output.head()

# Write the predictions as CSV without the DataFrame index column.
output.to_csv(os.path.join('..', 'data', 'Bag_of_Words_model.csv'), index=False)

# Release the test frame and its dense feature matrix.
del df, test_data_features

chenjiwei064196

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
文本情感分析——Bag of Words Meets Bags of Popcorn

import所需库：import os；import re；import numpy as np；import pandas as pd；from bs4 import BeautifulSoup；from sklearn.feature_extraction.text import CountVectorizer；from sklearn.ensemble import RandomForestClassifier……
复制链接

扫一扫