情感分析(Sentiment Analysis)是指通过自然语言处理技术自动判断文本的情感倾向,如正面、负面或中性。对于中文文本,情感分析通常需要考虑多种因素,包括停用词、程度级别词语、否定词等。本文将详细介绍如何构建一个简单的中文情感分析系统。
1、首先,我们需要准备一些必要的工具和资源:
jieba库:一个流行的中文分词工具。
停用词库:用于去除无关紧要的词汇。
情感词库:包括正面情绪词和负面情绪词。
程度级别词语:用于调整情感得分。
否定词:用于处理否定句。
2、加载停用词库
停用词库通常包含一些常见的词汇,如“的”、“是”等,这些词汇对情感分析没有帮助,需要去除。我们可以通过以下函数加载停用词库:
def load_stopwords(file_path):
    """Load a stop-word list from a text file with one word per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A set with one entry per non-blank line, stripped of surrounding
        whitespace.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as files that start with a BOM;
    # with plain 'utf-8' the BOM would stay glued to the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise add the empty string to the set.
        return {word for word in stripped_lines if word}
3、加载情感词库
情感词库通常包含正面情绪词和负面情绪词。我们可以将它们合并成一个字典,其中键为词汇,值为情感得分。词库文件每行一个词条,格式为“词语<Tab>得分”,其中得分为整数(例如正面词为正数、负面词为负数)。以下函数用于加载情感词库:
def load_sentiment_lexicon(positive_file, negative_file):
    """Load the positive and negative word lists into one lexicon.

    Each file must contain one ``word<TAB>score`` entry per line, where
    ``score`` is an integer literal. Blank lines are skipped.

    Args:
        positive_file: Path to the positive word list (UTF-8).
        negative_file: Path to the negative word list (UTF-8).

    Returns:
        A dict mapping each word to its integer score. If a word appears in
        both files, the entry from ``negative_file`` wins.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields, or the score is not an integer.
    """
    def _read_scores(path):
        # Parse a single word<TAB>score file into a dict.
        scores = {}
        # 'utf-8-sig' also accepts files that start with a BOM.
        with open(path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                entry = line.strip()
                if not entry:
                    # A trailing or stray blank line is not an error.
                    continue
                word, score = entry.split('\t')
                scores[word] = int(score)
        return scores

    # Later keys override earlier ones, so negative entries take precedence.
    return {**_read_scores(positive_file), **_read_scores(negative_file)}
4、加载程度级别词语
程度级别词语(程度副词)用于调整情感得分,例如“非常”会增加情感得分,“稍微”会减少情感得分。词表文件每行一个词条,格式为“词语<Tab>倍数”,倍数为浮点数(大于1表示加强,小于1表示减弱)。以下函数用于加载程度级别词语:
def load_degree_words(file_path):
    """Load degree (intensifier) words and their multipliers.

    The file must contain one ``word<TAB>multiplier`` entry per line, where
    ``multiplier`` parses as a float. Blank lines are skipped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A dict mapping each degree word to its float multiplier.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields, or the multiplier is not a number.
    """
    degree_words = {}
    # 'utf-8-sig' also accepts files that start with a BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # A trailing or stray blank line is not an error.
                continue
            word, multiplier = entry.split('\t')
            degree_words[word] = float(multiplier)
    return degree_words
5、加载否定词
否定词用于处理否定句,例如“不”、“没”等。以下函数用于加载否定词:
def load_negation_words(file_path):
    """Load a negation-word list from a text file with one word per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A set with one entry per non-blank line, stripped of surrounding
        whitespace.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as files that start with a BOM;
    # with plain 'utf-8' the BOM would stay glued to the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise add the empty string to the set.
        return {word for word in stripped_lines if word}
6、文本预处理
文本预处理包括三步:先用正则表达式去除所有非中文字符(标点、数字、英文字母等都会被删除),再使用jieba进行分词,最后去除停用词:
import jieba
import re
def preprocess_text(text, stopwords):
    """Strip non-Chinese characters, segment with jieba, and drop stop words.

    Args:
        text: Raw input string.
        stopwords: Container of words to discard (tested with ``in``).

    Returns:
        A list of segmented words, in order, excluding stop words. Empty if
        ``text`` contains no characters in the U+4E00..U+9FA5 range.
    """
    # Keep only characters in U+4E00..U+9FA5; punctuation, digits, Latin
    # letters and whitespace are all removed before segmentation.
    chinese_only = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    if not chinese_only:
        # Nothing to segment, so skip the segmenter call entirely.
        return []
    return [word for word in jieba.lcut(chinese_only) if word not in stopwords]
7、计算情感得分
最后,我们需要计算文本的情感得分。这涉及到考虑程度级别词语和否定词的影响:
def sentiment_score(text, lexicon, degree_words, negation_words, stopwords=None):
    """Compute a lexicon-based sentiment score for ``text``.

    The text is tokenized with ``preprocess_text``. Degree words multiply
    into a pending modifier, negation words set a pending negation flag, and
    each lexicon word adds ``score * modifier`` (sign-flipped if negated) to
    the total, after which the modifier and the flag are reset.

    Args:
        text: Raw input string.
        lexicon: Mapping of sentiment word -> numeric score.
        degree_words: Mapping of degree word -> multiplier.
        negation_words: Container of negation words.
        stopwords: Optional container of stop words to drop during
            preprocessing. When omitted, a module-level ``stopwords``
            variable is used if one exists, otherwise nothing is filtered.

    Returns:
        The accumulated score (0 if no lexicon word is found).
    """
    if stopwords is None:
        # Earlier versions read an undeclared module-level ``stopwords`` and
        # raised NameError without it; keep honoring that global for
        # existing callers, but no longer require it.
        stopwords = globals().get('stopwords', frozenset())

    words = preprocess_text(text, stopwords)
    score = 0
    modifier = 1
    negation_flag = False
    for word in words:
        if word in degree_words:
            # Consecutive degree words compound.
            modifier *= degree_words[word]
        elif word in negation_words:
            negation_flag = True
        elif word in lexicon:
            sign = -1 if negation_flag else 1
            score += lexicon[word] * modifier * sign
            # Pending modifiers apply to one sentiment word only.
            negation_flag = False
            modifier = 1
    return score
8、示例代码
import jieba
import re
# 加载停用词库
def load_stopwords(file_path):
    """Load a stop-word list from a text file with one word per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A set with one entry per non-blank line, stripped of surrounding
        whitespace.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as files that start with a BOM;
    # with plain 'utf-8' the BOM would stay glued to the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise add the empty string to the set.
        return {word for word in stripped_lines if word}
# 加载情感词库
def load_sentiment_lexicon(positive_file, negative_file):
    """Load the positive and negative word lists into one lexicon.

    Each file must contain one ``word<TAB>score`` entry per line, where
    ``score`` is an integer literal. Blank lines are skipped.

    Args:
        positive_file: Path to the positive word list (UTF-8).
        negative_file: Path to the negative word list (UTF-8).

    Returns:
        A dict mapping each word to its integer score. If a word appears in
        both files, the entry from ``negative_file`` wins.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields, or the score is not an integer.
    """
    def _read_scores(path):
        # Parse a single word<TAB>score file into a dict.
        scores = {}
        # 'utf-8-sig' also accepts files that start with a BOM.
        with open(path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                entry = line.strip()
                if not entry:
                    # A trailing or stray blank line is not an error.
                    continue
                word, score = entry.split('\t')
                scores[word] = int(score)
        return scores

    # Later keys override earlier ones, so negative entries take precedence.
    return {**_read_scores(positive_file), **_read_scores(negative_file)}
# 加载程度级别词语
def load_degree_words(file_path):
    """Load degree (intensifier) words and their multipliers.

    The file must contain one ``word<TAB>multiplier`` entry per line, where
    ``multiplier`` parses as a float. Blank lines are skipped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A dict mapping each degree word to its float multiplier.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields, or the multiplier is not a number.
    """
    degree_words = {}
    # 'utf-8-sig' also accepts files that start with a BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # A trailing or stray blank line is not an error.
                continue
            word, multiplier = entry.split('\t')
            degree_words[word] = float(multiplier)
    return degree_words
# 加载否定词
def load_negation_words(file_path):
    """Load a negation-word list from a text file with one word per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A set with one entry per non-blank line, stripped of surrounding
        whitespace.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as files that start with a BOM;
    # with plain 'utf-8' the BOM would stay glued to the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise add the empty string to the set.
        return {word for word in stripped_lines if word}
# 文本预处理
def preprocess_text(text, stopwords):
    """Strip non-Chinese characters, segment with jieba, and drop stop words.

    Args:
        text: Raw input string.
        stopwords: Container of words to discard (tested with ``in``).

    Returns:
        A list of segmented words, in order, excluding stop words. Empty if
        ``text`` contains no characters in the U+4E00..U+9FA5 range.
    """
    # Keep only characters in U+4E00..U+9FA5; punctuation, digits, Latin
    # letters and whitespace are all removed before segmentation.
    chinese_only = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    if not chinese_only:
        # Nothing to segment, so skip the segmenter call entirely.
        return []
    return [word for word in jieba.lcut(chinese_only) if word not in stopwords]
# 计算情感得分
def sentiment_score(text, lexicon, degree_words, negation_words, stopwords=None):
    """Compute a lexicon-based sentiment score for ``text``.

    The text is tokenized with ``preprocess_text``. Degree words multiply
    into a pending modifier, negation words set a pending negation flag, and
    each lexicon word adds ``score * modifier`` (sign-flipped if negated) to
    the total, after which the modifier and the flag are reset.

    Args:
        text: Raw input string.
        lexicon: Mapping of sentiment word -> numeric score.
        degree_words: Mapping of degree word -> multiplier.
        negation_words: Container of negation words.
        stopwords: Optional container of stop words to drop during
            preprocessing. When omitted, a module-level ``stopwords``
            variable is used if one exists, otherwise nothing is filtered.

    Returns:
        The accumulated score (0 if no lexicon word is found).
    """
    if stopwords is None:
        # Earlier versions read an undeclared module-level ``stopwords`` and
        # raised NameError without it; keep honoring that global for
        # existing callers, but no longer require it.
        stopwords = globals().get('stopwords', frozenset())

    words = preprocess_text(text, stopwords)
    score = 0
    modifier = 1
    negation_flag = False
    for word in words:
        if word in degree_words:
            # Consecutive degree words compound.
            modifier *= degree_words[word]
        elif word in negation_words:
            negation_flag = True
        elif word in lexicon:
            sign = -1 if negation_flag else 1
            score += lexicon[word] * modifier * sign
            # Pending modifiers apply to one sentiment word only.
            negation_flag = False
            modifier = 1
    return score
# Example driver: load every resource, then score one sample sentence.

# Sample text
text = "这部电影真是太棒了,但是结局有点糟糕。"

# Resource file paths
stopwords_file = 'chineseStopWords.txt'
positive_file = 'positiveWords.txt'
negative_file = 'negativeWords.txt'
degree_file = 'degreeWords.txt'
negation_file = 'negationWords.txt'

# Stop words. sentiment_score picks up this module-level name.
stopwords = load_stopwords(stopwords_file)

# Sentiment lexicon. load_sentiment_lexicon already returns the merged
# positive + negative dictionary, so one call is enough; calling it a second
# time with the arguments swapped only re-read both files and inverted which
# list wins for a word present in both.
lexicon = load_sentiment_lexicon(positive_file, negative_file)

# Degree (intensifier) words
degree_words = load_degree_words(degree_file)

# Negation words
negation_words = load_negation_words(negation_file)

# Compute and report the sentiment score
score = sentiment_score(text, lexicon, degree_words, negation_words)
print(f"The sentiment score of the text is: {score}")