自然语言处理编程代码

m0_63076514

已于 2023-10-23 10:02:40 修改

阅读量216

点赞数 1

文章标签： c# 开发语言

于 2023-10-23 09:49:46 首次发布

本文链接：https://blog.csdn.net/m0_63076514/article/details/133982666

版权

# 基于字典的情感分析（上）

# 基于字典的情感分析（下）

# 基于文本类的情感分析（上）

# 基于文本类的情感分析（下）

# LDA（上）

# LDA（下）

# 基于字典的情感分析（上）

import re

import jieba

import codecs

from collections import defaultdict # 导入collections用于创建空白词典

def seg_word(sentence, stopwords_path='../data/stopwords.txt'):
    """Segment *sentence* with jieba and remove stop words.

    Args:
        sentence: Text to segment.
        stopwords_path: UTF-8 text file with one stop word per line.
            Defaults to the location this function previously hard-coded.

    Returns:
        list[str]: The jieba tokens in their original order, without any
        token that appears in the stop-word file.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # ``with`` guarantees the file is closed even if reading fails midway.
    with open(stopwords_path, 'r', encoding='utf-8') as stopword_file:
        stopwords = {line.strip() for line in stopword_file}
    return [word for word in jieba.cut(sentence) if word not in stopwords]

def _load_score_table(path):
    """Read a ``word score`` file into a dict mapping word -> score string."""
    table = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            parts = line.rstrip('\n').split(' ')
            # Blank or malformed lines have no score field; skip them
            # instead of failing with IndexError.
            if len(parts) >= 2:
                table[parts[0]] = parts[1]
    return table


def sort_word(word_dict,
              sen_path='../data/BosonNLP_sentiment_score.txt',
              not_path='../data/否定词.txt',
              degree_path='../data/程度副词（中文）.txt'):
    """Classify tokens into sentiment words, negation words and degree adverbs.

    Args:
        word_dict: Mapping of token -> position, as produced by
            ``list_to_dict``.
        sen_path: Sentiment lexicon, one ``word score`` pair per line.
        not_path: Negation word list, one word per line.
        degree_path: Degree-adverb lexicon, one ``word multiplier`` pair
            per line.

    Returns:
        tuple[dict, dict, dict]: ``(sen_word, not_word, degree_word)``, each
        keyed by token position. ``sen_word`` and ``degree_word`` hold the
        value string read from the lexicon; ``not_word`` holds ``-1``.

    Raises:
        OSError: If any of the three lexicon files cannot be opened.
    """
    # Files are opened read-only; nothing is ever written to them.
    sen_dict = _load_score_table(sen_path)
    degree_dic = _load_score_table(degree_path)
    with open(not_path, 'r', encoding='utf-8') as handle:
        # A set gives O(1) membership tests in the loop below.
        not_words = {line.rstrip('\n') for line in handle}

    sen_word = {}
    not_word = {}
    degree_word = {}
    # Precedence when a token appears in several lexicons:
    # degree adverb > negation word > sentiment word.
    for word, index in word_dict.items():
        if word in degree_dic:
            degree_word[index] = degree_dic[word]
        elif word in not_words:
            not_word[index] = -1
        elif word in sen_dict:
            sen_word[index] = sen_dict[word]
    return sen_word, not_word, degree_word

def list_to_dict(word_list):
    """Map each token in *word_list* to its position.

    Args:
        word_list: Sequence of hashable tokens.

    Returns:
        dict: ``{token: index}``. When a token occurs more than once, only
        its last position is kept, because later assignments overwrite
        earlier ones.
    """
    return {word: index for index, word in enumerate(word_list)}

def socre_sentiment(sen_word, not_word, degree_word, seg_result):
    """Compute a sentence score from classified token positions.

    Tokens are scanned left to right. Negation words and degree adverbs
    accumulate into a weight that is applied to the next sentiment word and
    then reset, so a modifier only affects the sentiment word it precedes.

    NOTE(review): the previous body had lost its indentation, so the nesting
    of its modifier loop could not be recovered. It never updated its weight
    variable and multiplied the whole running total by every modifier in the
    sentence. Confirm the per-word scoping here matches the intended scoring.

    Args:
        sen_word: ``{position: score}`` for sentiment words; each score must
            be convertible with ``float``.
        not_word: ``{position: -1}`` for negation words (only the keys are
            used).
        degree_word: ``{position: multiplier}`` for degree adverbs; each
            multiplier must be convertible with ``float``.
        seg_result: The token list; only its length is used.

    Returns:
        float: Sum of the weighted sentiment scores (``0.0`` when there are
        no sentiment words).
    """
    score = 0.0
    weight = 1.0
    for position in range(len(seg_result)):
        if position in sen_word:
            score += weight * float(sen_word[position])
            weight = 1.0  # modifiers apply to a single sentiment word
        elif position in not_word:
            weight = -weight  # a negation flips the polarity
        elif position in degree_word:
            weight *= float(degree_word[position])
    return score

def setiment(sentence):
    """Run the dictionary-based sentiment pipeline on one sentence.

    Args:
        sentence: Text to analyse.

    Returns:
        tuple: ``(tokens, sen_word, not_word, degree_word, score)`` — the
        stop-word-filtered tokens, the three position-keyed classification
        dicts from ``sort_word``, and the score from ``socre_sentiment``.
    """
    # Segment and drop stop words.
    tokens = seg_word(sentence)
    # Classify tokens into sentiment / negation / degree groups.
    positions = list_to_dict(tokens)
    sen_word, not_word, degree_word = sort_word(positions)
    # Score last, so the tuple is assembled in one place.
    return (
        tokens,
        sen_word,
        not_word,
        degree_word,
        socre_sentiment(sen_word, not_word, degree_word, tokens),
    )

if __name__ == '__main__':
    # Print the pipeline result for three demo sentences.
    for demo_sentence in ('我今天特别开心',
                          '我今天很开心、非常兴奋',
                          '我昨天开心，今天不开心'):
        print(setiment(demo_sentence))

# 基于文本类的情感分析（上）

import nltk.classify as cf
import nltk.classify.util as cu
import jieba
def _text_features(text):
    """Return the bag-of-words feature dict ``{token: True}`` for *text*."""
    return {word: True for word in jieba.cut(text)}


def _load_labeled_reviews(path, label):
    """Read one review per line from *path* and pair its features with *label*."""
    samples = []
    # Read-only access is enough; the corpus is never modified.
    with open(path, 'r', encoding='utf-8') as review_file:
        for line in review_file:
            # Drop the line terminator so it is not segmented into a feature,
            # and skip blank lines so they do not become empty samples.
            text = line.rstrip('\r\n')
            if text:
                samples.append((_text_features(text), label))
    return samples


def setiment(sentences):
    """Train a Naive Bayes sentiment classifier and label *sentences*.

    Reads positive reviews from ``../data/pos.txt`` and negative reviews from
    ``../data/neg.txt`` (one review per line), trains on the first 80% of
    each file, and prints the accuracy on the remaining 20%, the ten most
    informative features, and the predicted label with its probability for
    every sentence.

    NOTE: an earlier function in this file is also named ``setiment``; when
    both live in the same module this definition replaces it.

    Args:
        sentences: Iterable of texts to classify.

    Returns:
        None. All results are printed.

    Raises:
        OSError: If either review file cannot be opened.
    """
    pos_data = _load_labeled_reviews('../data/pos.txt', 'POSITIVE')
    neg_data = _load_labeled_reviews('../data/neg.txt', 'NEGATIVE')

    # 80% of each class for training, the remaining 20% for testing.
    pos_num, neg_num = int(len(pos_data) * 0.8), int(len(neg_data) * 0.8)
    train_data = pos_data[:pos_num] + neg_data[:neg_num]
    test_data = pos_data[pos_num:] + neg_data[neg_num:]

    model = cf.NaiveBayesClassifier.train(train_data)
    ac = cu.accuracy(model, test_data)
    print('准确率为：' + str(ac))

    tops = model.most_informative_features()
    print('\n信息量较大的前10个特征为:')
    for top in tops[:10]:
        print(top[0])

    for sentence in sentences:
        pcls = model.prob_classify(_text_features(sentence))
        sent = pcls.max()  # predicted label (POSITIVE or NEGATIVE)
        prob = pcls.prob(sent)  # probability of that label
        print('\n','‘',sentence,'’', '的情绪面标签为', sent, '概率为','%.2f%%' % round(prob * 100, 2))
if __name__ == '__main__':
    # Smoke test. The last two elements of this list had been fused into a
    # heading comment, which left the list literal unterminated.
    sentences = ['破烂平板', '手感不错，推荐购买', '刚开始吧还不错，但是后面越来越卡，差评',
                 '哈哈哈哈，我很喜欢', '今天很开心']
    setiment(sentences)

# Listing 9-3: positive-sentiment probability with SnowNLP.
from snownlp import SnowNLP

# Build a SnowNLP object for each test sentence and print the value of its
# ``sentiments`` attribute.
s1 = SnowNLP('这东西真的挺不错的')
print('调用sentiments方法获取s1的积极情感概率为:',s1.sentiments)
s2 = SnowNLP('垃圾东西')
print('调用sentiments方法获取s2的积极情感概率为:',s2.sentiments)

# LDA（上）

import pandas as pd
from snownlp import SnowNLP
import jieba

data = pd.read_csv('../data/comment.csv', sep=',', encoding='utf-8', header=0)
comment_data = data[['评论']]  # keep only the comment column
# Drop missing comments first: read_csv yields NaN for empty cells, and
# measuring the length of a NaN would raise TypeError.
comment_data = comment_data.dropna().drop_duplicates()
# Remove short comments (fewer than 4 characters).
comments_data = comment_data.iloc[:, 0]
comments = comments_data[comments_data.str.len() >= 4]
# 语料压缩，句子中常出现重复语句，需要进行压缩
def _collapse_runs(text, unit_len, min_repeats):
    """Collapse runs of >= *min_repeats* identical *unit_len*-char units to one."""
    j = 0
    # len(text) is re-read on every pass because text shrinks on a collapse.
    while j + unit_len * min_repeats <= len(text):
        unit = text[j:j + unit_len]
        end = j + unit_len
        # Advance over every further consecutive copy of the unit. A short
        # slice at the end of the text never equals the unit, so this stops.
        while text[end:end + unit_len] == unit:
            end += unit_len
        if (end - j) // unit_len >= min_repeats:
            # Keep the first copy and everything after the run.
            text = text[:j + unit_len] + text[end:]
        j += 1
    return text


def yasuo(string):
    """Compress mechanically repeated fragments in a comment.

    Steps, in order:
      1. For unit lengths 1 and 2, a run of three or more identical
         consecutive units is reduced to a single unit.
      2. For unit lengths 3, 4 and 5, a run of two or more identical
         consecutive units is reduced to a single unit.
      3. If the result consists of two identical halves, one half is kept.

    Only the repeated copies are removed; text that follows a run is
    preserved.

    Args:
        string: Text to compress.

    Returns:
        str: The compressed text.
    """
    for unit_len in (1, 2):
        string = _collapse_runs(string, unit_len, 3)
    for unit_len in (3, 4, 5):
        string = _collapse_runs(string, unit_len, 2)
    half = len(string) // 2
    if string[:half] == string[half:]:
        string = string[:half]
    return string
# This import had been fused into a heading comment, which left ``corpora``
# and ``models`` undefined below.
from gensim import corpora, models, similarities

comments = comments.astype('str').apply(yasuo)

# Sentiment scoring: SnowNLP returns a value between 0 and 1.
coms = comments.apply(lambda x: SnowNLP(x).sentiments)
# 0.5 is the neutral boundary; 0.6 / 0.4 are used to keep clearer cases only.
pos_data = comments[coms >= 0.6]  # positive comments
neg_data = comments[coms < 0.4]  # negative comments


# Segmentation: tokens joined by single spaces.
def mycut(x):
    """Return *x* segmented by jieba, with tokens joined by spaces."""
    return ' '.join(jieba.cut(x))


pos_data = pos_data.apply(mycut)
neg_data = neg_data.apply(mycut)
# These two expressions only display output in an interactive session.
pos_data.head(5)
neg_data.tail(5)
print(len(pos_data))
print(len(neg_data))

# Stop-word removal. The separator is a string not expected to occur in the
# file, so each line is read as one field; a multi-character separator needs
# the Python parsing engine, requested explicitly here.
stop = pd.read_csv('../data/stopwords.txt', sep='bucunzai', encoding='utf-8',
                   header=None, engine='python')
# Empty strings are added so that empty tokens produced by split(' ') on
# consecutive spaces are dropped as well.
stop = ['', ''] + list(stop[0])
stop_set = set(stop)  # O(1) membership tests in the filters below
pos = pd.DataFrame(pos_data)
neg = pd.DataFrame(neg_data)
pos[1] = pos['评论'].apply(lambda s: s.split(' '))  # split on spaces
pos[2] = pos[1].apply(lambda x: [i for i in x if i not in stop_set])
neg[1] = neg['评论'].apply(lambda s: s.split(' '))
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop_set])

# Topics of the positive comments.
pos_dict = corpora.Dictionary(pos[2])  # build the dictionary
pos_corpus = [pos_dict.doc2bow(i) for i in pos[2]]  # bag-of-words corpus
pos_lda = models.LdaModel(pos_corpus, num_topics=3, id2word=pos_dict)
for i in range(3):
    print('pos_topic' + str(i))
    print(pos_lda.print_topic(i))

# Topics of the negative comments.
neg_dict = corpora.Dictionary(neg[2])  # build the dictionary
neg_corpus = [neg_dict.doc2bow(i) for i in neg[2]]  # bag-of-words corpus
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)
for i in range(3):
    print('neg_topic' + str(i))
    print(neg_lda.print_topic(i))

m0_63076514

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
自然语言处理编程代码

print('\n','‘',sentence,'’', '的情绪面标签为', sent, '概率为','%.2f%%' % round(prob * 100, 2))train_data = pos_data[: pos_num] + neg_data[: neg_num] # 抽取80%数据。test_data = pos_data[pos_num: ] + neg_data[neg_num: ] # 剩余20%数据。encoding='utf-8') # 加载停用词。
复制链接

扫一扫