1.数据读取
import pandas as pd
# Read the workbook into a DataFrame, then show its first rows together with
# its (rows, columns) shape.
data = pd.read_excel('产品评价.xlsx')
display(data.head(),data.shape)
(1080, 3)
2.中文分词
import jieba  # word segmenter; this cell used it without importing it

# Segment every review in the '评论' column with jieba and re-join the tokens
# with single spaces, giving one space-delimited string per review that can be
# handed to the bag-of-words vectorizer.
words = [' '.join(jieba.cut(comment)) for comment in data['评论']]
words[:3]
3.文本向量化
from sklearn.feature_extraction.text import CountVectorizer  # was used without an import

# Build the bag-of-words matrix: one row per review, one column per
# vocabulary term, each cell holding that term's count in the review.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character words produced by the
# segmenter are dropped from the vocabulary -- confirm this is intended.
vect = CountVectorizer()
X = vect.fit_transform(words)
# fit_transform returns a sparse matrix; convert it to a dense ndarray.
X = X.toarray()
print(X)
X.shape
[[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
...
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
(1080, 4075)
# vocabulary_ maps each learned term to its column index in the count matrix.
words_bag = vect.vocabulary_
print(words_bag)
{'iphone8': 194, 'xr': 264, '正品': 2660, '按键': 2221,..., '吃一堑长一智': 1279, '国产机': 1405}
# Number of distinct terms in the vocabulary.
len(words_bag)
4075
# Take the '评价' column as the label vector for classification and peek at
# its first entries.
y = data['评价']
y.head()
0 1
1 1
2 1
3 1
4 1
Name: 评价, dtype: int64
4.神经网络分类模型
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(64,64,32))
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
print(y_pred)
[1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0
1 1 0 1 0 0 1 1 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1
1 1 1 0 1 0 1 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 0 0 1 1]
Wall time: 4.38 s
# Side-by-side table of predicted and true labels for the held-out samples
# (columns keep the order: predictions first, ground truth second).
result = pd.DataFrame({
    '预测值': list(y_pred),
    '实际值': list(y_test),
})
result
预测值 实际值
0 1 1
1 0 0
... ... ...
107 1 1
108 rows × 2 columns
5.模型准确率
from sklearn.metrics import accuracy_score

# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground truth first so the call stays correct if the metric is later swapped
# for an asymmetric one (precision, recall, ...).
score = accuracy_score(y_test, y_pred)
print('------------',score)
# Same metric computed through the estimator's own score() method.
mlp.score(X_test, y_test)
------------ 0.9907407407407407
0.9907407407407407
# Manual accuracy: fraction of held-out samples whose prediction equals the
# true label.
# NOTE(review): for the same y_pred / y_test this must equal accuracy_score;
# the value recorded below differs from the one printed earlier, so it was
# presumably produced by a different training run -- confirm.
(y_pred == y_test).mean()
0.9722222222222222
6.模型预测
import jieba  # needed for segmentation; harmless if already imported

# Classify one review typed by the user with the fitted vectorizer and MLP.
# Distinct names are used so that the test-set predictions held in `y_pred`
# and the raw input string are not overwritten by this cell.
comment = input('请输入您对本商品的评价:')
# Segment the text the same way as the training data; it is wrapped in a list
# because transform() expects an iterable of documents.
segmented = [' '.join(jieba.cut(comment))]
print(segmented)
# Reuse the fitted vocabulary (transform, not fit_transform) so the columns
# line up with what the model was trained on.
X_try = vect.transform(segmented)
comment_pred = mlp.predict(X_try.toarray())
print(comment_pred)
请输入您对本商品的评价:商品真棒,我非常喜欢,这是一次满意的购物,特别开心,晚上下单,第二天上午到,立体声游侠效果不错
['商品 真棒 , 我 非常 喜欢 , 这是 一次 满意 的 购物 , 特别 开心 , 晚上 下单 , 第二天 上午 到 , 立体声 游侠 效果 不错']
[1]
7.朴素贝叶斯算法
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score

# Fit a multinomial naive Bayes classifier on the same train/test split used
# for the neural network, so the two accuracies can be compared directly.
nb_clf = MultinomialNB()
nb_clf.fit(X_train,y_train)
y_pred = nb_clf.predict(X_test)
print(y_pred)

# accuracy_score is documented as accuracy_score(y_true, y_pred): ground
# truth goes first.
score = accuracy_score(y_test, y_pred)
print(score)
[1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 0
1 1 0 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1
1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1]
0.8888888888888888