实战！文本情感分析系统-以携程酒店评论情感分析为例（下）

single_ffish

已于 2024-09-27 09:09:57 修改

阅读量447

点赞数 5

文章标签： python 机器学习自然语言处理

于 2024-09-27 09:09:44 首次发布

本文链接：https://blog.csdn.net/single_ffish/article/details/142577946

版权

OK，上文我们介绍了这个项目的背景、相关的技术栈和运行结果。接下来我们对代码进行介绍，帮助读者进一步地了解这个项目。代码和数据集放在文章末尾。

数据集如下所示，共有两列：label 和 review。其中 label 值为 1 代表积极评论，0 代表消极评论。

数据加载部分代码

# Load the hotel reviews, rename the columns to the names the rest of the
# script uses, and turn the numeric label into readable text
# (1 -> 'positive', 0 -> 'negative').
data = (
    pd.read_csv('ChnSentiCorp_htl_all.csv')
    .rename(columns={'review': 'comment_text', 'label': 'sentiment'})
    .assign(
        sentiment=lambda frame: frame['sentiment'].map(
            {1: 'positive', 0: 'negative'}
        )
    )
)

数据预处理部分：

def preprocess_text(text, stop_words=None):
    """Clean and segment one review for a bag-of-words model.

    Characters that are neither word characters nor whitespace are removed,
    the remainder is segmented with jieba, and the surviving tokens are
    joined with single spaces.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            float NaN standing in for a missing cell) is treated as an empty
            review.
        stop_words: Optional collection of tokens to discard. ``None`` (the
            default) filters nothing, which matches the previous behaviour.

    Returns:
        The space-separated tokens, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = frozenset()

    # Strip punctuation and other special characters.
    cleaned = re.sub(r'[^\w\s]', '', text)
    # jieba emits whitespace runs as tokens of their own; skip them so the
    # result has exactly one space between tokens.
    tokens = [
        token
        for token in jieba.cut(cleaned)
        if token.strip() and token not in stop_words
    ]
    return ' '.join(tokens)

训练模型并对其进行评估：

# Segment every review before vectorising: TfidfVectorizer needs the
# space-separated tokens produced by preprocess_text, which also turns
# missing (non-string) reviews into ''.
data['comment_text'] = data['comment_text'].apply(preprocess_text)

# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['comment_text'], data['sentiment'], test_size=0.2, random_state=42
)

# Feature extraction. The default token_pattern keeps only tokens of two or
# more characters, which would silently discard single-character Chinese
# words such as 好 / 差 / 脏, so accept one-character tokens as well.
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
X_train_vec = vectorizer.fit_transform(X_train)
# Only transform the test set so no test vocabulary/IDF leaks into training.
X_test_vec = vectorizer.transform(X_test)

# Model training
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Model evaluation
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy}")
print(classification_report(y_test, y_pred))

对训练数据可视化：


# Visualisation
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Path to a font that contains Chinese glyphs; replace it with one that
# exists on your system.
font_path = 'C:/Windows/Fonts/simhei.ttf'
# Load the font from the file itself so every title uses the same font as the
# word clouds, instead of relying on a family named 'SimHei' being installed.
font_prop = font_manager.FontProperties(fname=font_path)

# Distribution of positive vs. negative reviews
sns.countplot(x='sentiment', data=data)
plt.title('评论情感分布', fontproperties=font_prop)
plt.show()

# Keyword clouds
positive_comments = data[data['sentiment'] == 'positive']['comment_text']
negative_comments = data[data['sentiment'] == 'negative']['comment_text']

# One cloud per sentiment class; both are drawn the same way.
for comments, title in (
    (positive_comments, '正面评论关键词云'),
    (negative_comments, '负面评论关键词云'),
):
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        collocations=False,
        font_path=font_path,
    ).generate(' '.join(comments))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontproperties=font_prop)
    plt.show()

全部代码：

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import  re
import jieba
from matplotlib import font_manager
# Load the hotel reviews, rename the columns to the names the rest of the
# script uses, and turn the numeric label into readable text
# (1 -> 'positive', 0 -> 'negative').
data = (
    pd.read_csv('ChnSentiCorp_htl_all.csv')
    .rename(columns={'review': 'comment_text', 'label': 'sentiment'})
    .assign(
        sentiment=lambda frame: frame['sentiment'].map(
            {1: 'positive', 0: 'negative'}
        )
    )
)
# 数据预处理
# def preprocess_text(text):
#     # 去除特殊字符和标点符号
#     text = re.sub(r'[^\w\s]', '', text)
#     # 分词
#     words = jieba.cut(text)
#     # 去除停用词
#     stop_words=set()
#     # stop_words = set(stopwords.words('chinese'))
#     words = [word for word in words if word not in stop_words]
#     return ' '.join(words)
#

def preprocess_text(text, stop_words=None):
    """Clean and segment one review for a bag-of-words model.

    Characters that are neither word characters nor whitespace are removed,
    the remainder is segmented with jieba, and the surviving tokens are
    joined with single spaces.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            float NaN standing in for a missing cell) is treated as an empty
            review.
        stop_words: Optional collection of tokens to discard. ``None`` (the
            default) filters nothing, which matches the previous behaviour.

    Returns:
        The space-separated tokens, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = frozenset()

    # Strip punctuation and other special characters.
    cleaned = re.sub(r'[^\w\s]', '', text)
    # jieba emits whitespace runs as tokens of their own; skip them so the
    # result has exactly one space between tokens.
    tokens = [
        token
        for token in jieba.cut(cleaned)
        if token.strip() and token not in stop_words
    ]
    return ' '.join(tokens)
# Segment every review; missing (non-string) reviews become ''.
data['comment_text'] = data['comment_text'].apply(preprocess_text)

# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['comment_text'], data['sentiment'], test_size=0.2, random_state=42
)

# Feature extraction. The default token_pattern keeps only tokens of two or
# more characters, which would silently discard single-character Chinese
# words such as 好 / 差 / 脏, so accept one-character tokens as well.
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
X_train_vec = vectorizer.fit_transform(X_train)
# Only transform the test set so no test vocabulary/IDF leaks into training.
X_test_vec = vectorizer.transform(X_test)

# Model training
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Model evaluation
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy}")
print(classification_report(y_test, y_pred))

# Visualisation
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Path to a font that contains Chinese glyphs; replace it with one that
# exists on your system.
font_path = 'C:/Windows/Fonts/simhei.ttf'
# Load the font from the file itself so every title uses the same font as the
# word clouds, instead of relying on a family named 'SimHei' being installed.
font_prop = font_manager.FontProperties(fname=font_path)

# Distribution of positive vs. negative reviews
sns.countplot(x='sentiment', data=data)
plt.title('评论情感分布', fontproperties=font_prop)
plt.show()

# Keyword clouds
positive_comments = data[data['sentiment'] == 'positive']['comment_text']
negative_comments = data[data['sentiment'] == 'negative']['comment_text']

# One cloud per sentiment class; both are drawn the same way.
for comments, title in (
    (positive_comments, '正面评论关键词云'),
    (negative_comments, '负面评论关键词云'),
):
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        collocations=False,
        font_path=font_path,
    ).generate(' '.join(comments))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontproperties=font_prop)
    plt.show()