# -*- coding: UTF-8 -*-
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
def review_to_wordlist(review):
    """Convert one raw review into a list of lowercase words.

    Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613

    Args:
        review: Raw review text, possibly containing HTML markup.

    Returns:
        The review's words as a list of lowercase strings, in their
        original order. Only runs of ASCII letters survive; digits,
        punctuation and any other characters act as separators.
    """
    # Strip HTML tags and keep only the text content.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Replace every character that is not an ASCII letter with a space.
    review_text = re.sub(r"[^a-zA-Z]", " ", review_text)
    # Lowercase everything, then split on whitespace to get the word list.
    words = review_text.lower().split()
    return words
载入数据集
# Load the training and test sets; header=0 takes the column names from
# the first row of each CSV file.
train = pd.read_csv('data/new_train.csv', header=0)
test = pd.read_csv('data/new_test.csv', header=0)
# Preview the first rows of each frame as a quick sanity check.
print(train.head())
print(test.head())
ID sentiment review
0 1 1 Jo bhi ap se tou behtar hoon
1 2 0 ya Allah meri sister Affia ki madad farma
2 3 1 Yeh khud chahta a is umar main shadi krna ha...
3 4 1 Tc Apky mun xe exe alfax achy nae lgty
4 5 0 Good
id review
0 1 Jis ke aiteraf mien inhe behtareen muaawin ac...
1 2 Thank you same to you
2 3 ALLAH ki marzi hai Beshak wohi ata karne wala ...
3 4 Asal masla yehi hei k wo iss umar mein bhi sha...
4 5 Chaudhry Rehmat Ali ne January ko Ab...
预处理数据
# Preprocess the data.
# Sentiment labels of the training reviews.
label = train['sentiment']
# Clean every review and re-join its words into one space-separated string.
# Iterating over the column directly visits the rows in order and does not
# rely on the index labels being exactly 0..n-1, unlike positional-looking
# `column[i]` lookups.
train_data = [' '.join(review_to_wordlist(review)) for review in train['review']]
test_data = [' '.join(review_to_wordlist(review)) for review in test['review']]
TF-IDF处理结束.
D:\anaconda\lib\site-packages\sklearn\feature_extraction\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
朴素贝叶斯训练
from sklearn.naive_bayes import MultinomialNB as MNB
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score
# now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
import numpy as np

# Fit a multinomial naive Bayes classifier on the training features.
# NOTE(review): train_x is not defined anywhere in the visible source —
# presumably it is the TF-IDF matrix built from train_data in a step that is
# missing here; confirm before running.
model_NB = MNB()
model_NB.fit(train_x, label)
# Captured notebook output of the fit call above:
#   MNB(alpha=1.0, class_prior=None, fit_prior=True)

# Report the mean ROC-AUC over 10-fold cross-validation.
print("多项式贝叶斯分类器10折交叉验证得分: ", np.mean(cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc')))