目录
前言
本程序进行文本分析,其中包括词云的统计与显示、新闻的自动摘要、词频统计以及词频的柱状图可视化,代码比较基础 (代码中的文件不在文章中放了,需要的可以私聊我,也可以在网上下载)。
技术要点
- matplotlib绘制条形图
- wordcloud绘制词云
- jieba分词统计
- collections.Counter用来统计相关元素出现的次数
代码
1.TF-IDF文本比较相似度
import jieba
import jieba.analyse
# Print the 20 highest-weighted TF-IDF keywords of the report, one
# "keyword weight" pair per line.
# The report is only read, so it is opened read-only ('r+' would also demand
# write permission) and the context manager closes it even if reading fails.
with open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8') as file:
    txt = file.read()
# withWeight=True makes every result a (word, weight) pair; allowPOS limits
# the candidates to the listed part-of-speech tags.
keywords = jieba.analyse.extract_tags(txt, topK=20, withWeight=True, allowPOS=('n', 'nr', 'ns', 'v', 'vn'))
for word, weight in keywords:
    print(word, weight)
2.新闻摘要
import re # 文档内容分句
import os # 获取文件路径
import jieba # 分词
import numpy
from sklearn.metrics import pairwise_distances # 计算文本相似度
from sklearn.feature_extraction.text import CountVectorizer # 转化为文本向量
def summary(path, num_summary=2,
            stop_words_path=r'C:\Users\lenovo\Desktop\4文本分析\stop.txt'):
    """Build an extractive summary of a UTF-8 text document.

    The document is split into sentences; every sentence is compared with the
    whole document by cosine distance between bag-of-words count vectors, and
    the sentences closest to the document are returned in document order.

    Args:
        path: Path of the document to summarise.
        num_summary: Maximum number of sentences in the summary.
        stop_words_path: Path of a UTF-8 stop-word file, one word per line.

    Returns:
        The selected sentences joined with '。', or an empty string when no
        segment of the document is long enough to be vectorised.
    """
    with open(path, 'r', encoding='utf-8') as file:
        contents = file.read().strip()

    # Entry 0 is the whole document (the reference every sentence is compared
    # against); the remaining entries are its sentences.  Both the ASCII and
    # the full-width forms of '?' and '!' terminate a sentence.
    subCorpus = [contents] + re.split('[。?!?!\n]', contents)

    with open(stop_words_path, 'r', encoding='utf-8') as sw:
        stop_words = {line.strip() for line in sw if line.strip()}

    # Tokenise with jieba and re-join with spaces so CountVectorizer can split
    # on whitespace; segments shorter than 5 characters are dropped.
    segments = []
    clean_subCorpus = []
    for content in subCorpus:
        segment = ' '.join(jieba.cut(content)).strip()
        if len(segment) >= 5:
            segments.append(segment)
            clean_subCorpus.append(content.strip())
    if not segments:
        return ''

    # Recent scikit-learn releases validate `stop_words` and accept only a
    # list (or 'english' / None), so the set is converted explicitly.
    countVectorizer = CountVectorizer(stop_words=sorted(stop_words))
    textVector = countVectorizer.fit_transform(segments)

    # Cosine distance: the smaller the value, the more similar the texts.
    distance_matrix = pairwise_distances(textVector, metric='cosine')

    # argsort is ascending, i.e. most similar first.  Row 0 is the document
    # itself, so it is excluded explicitly instead of assuming it sorts first.
    ranked = [ix for ix in numpy.argsort(distance_matrix[0], kind='stable') if ix != 0]
    # Restore document order for the chosen sentences.
    chosen = sorted(ranked[:max(num_summary, 0)])
    result = '。'.join(clean_subCorpus[ix] for ix in chosen)
    return result
path = r'C:\Users\lenovo\Desktop\4文本分析\报告.txt'
# summary() only returns the text; print it so the result is visible when the
# code runs as a script rather than in an interactive session.
print(summary(path, num_summary=3))
3.自动摘要
#textrank 自动摘要
import re
import jieba
import numpy as np
import jieba.analyse
from numpy import *
from collections import Counter
def load_stop_words():
    """Load the stop-word file into the module-level ``stopwords`` list.

    Every line of the file becomes one list entry with its newline
    characters removed; nothing is returned.
    """
    global stopwords
    with open(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt', "r", encoding="utf-8") as handle:
        stopwords = [entry.replace("\n", "") for entry in handle.readlines()]
def cosine_similarity(sentence1, sentence2):
    """Return the cosine similarity of two sentences' word-count vectors.

    Both sentences are tokenised with jieba (accurate mode) and turned into
    count vectors over their combined vocabulary.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The similarity as a float; 0.0 when either sentence yields no tokens
        (instead of the NaN a 0/0 division would produce).
    """
    # Count each sentence once instead of rebuilding a Counter for every
    # vocabulary entry.
    sen1_counts = Counter(jieba.lcut(sentence1, cut_all=False))
    sen2_counts = Counter(jieba.lcut(sentence2, cut_all=False))
    vocab_list = list(set(sen1_counts) | set(sen2_counts))
    sen1_vec = np.array([sen1_counts[word] for word in vocab_list], dtype=float)
    sen2_vec = np.array([sen2_counts[word] for word in vocab_list], dtype=float)
    norm_product = np.linalg.norm(sen1_vec) * np.linalg.norm(sen2_vec)
    if norm_product == 0:
        return 0.0
    cos_sim = float(np.sum(sen1_vec * sen2_vec)) / float(norm_product)
    return cos_sim
def log_similarity(sentence1, sentence2):
    """Return the log-normalised word-overlap similarity of two sentences.

    The score is the number of tokens of ``sentence1`` that also occur in
    ``sentence2``, divided by ``log(len1) + log(len2)`` where ``len1`` and
    ``len2`` are the token counts of the two sentences.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The similarity as a float; 0.0 when either sentence has no tokens or
        both consist of a single token (the denominator would be undefined
        or zero).
    """
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    if not sen1_vocab_list or not sen2_vocab_list:
        return 0.0
    if len(sen1_vocab_list) == 1 and len(sen2_vocab_list) == 1:
        return 0.0
    # Set membership keeps the overlap count linear in the sentence length.
    sen2_vocab = set(sen2_vocab_list)
    count = sum(1 for word in sen1_vocab_list if word in sen2_vocab)
    log_sim = count / (np.log(len(sen1_vocab_list)) + np.log(len(sen2_vocab_list)))
    return float(log_sim)
class GenerateAbstract():
    """TextRank-style extractive summariser working on pre-split sentences."""

    @classmethod
    def get_corpus_sentence_list(cls, corpus_list):
        """Split every document of ``corpus_list`` into sentence fragments.

        Args:
            corpus_list: Iterable of document strings.

        Returns:
            One list of non-empty fragments per document, in input order.
        """
        # The last group of alternatives uses the full-width forms of the
        # comma, semicolon and exclamation mark; the ASCII forms are already
        # listed at the start of the pattern.
        punch = r',|/|;|\'|`|<|>|\?|:|\{|\}|\~|!|@|#|\$|%|\^|&|=|\_|\+|,|。|;|【|】|!| |…'
        splitter = re.compile(punch)
        # Adjacent separators produce empty strings; drop all of them, not
        # only the first one.
        return [[piece for piece in splitter.split(document) if piece]
                for document in corpus_list]

    @classmethod
    def get_abstract(cls, corpus_sentence_list, **const):
        """Print a summary for every document in ``corpus_sentence_list``.

        Sentences are graph nodes; two sentences are linked when their
        similarity exceeds ``sim_range``.  Sentence scores are obtained with
        a damped PageRank iteration and the best ``abstract_num`` distinct
        sentences are printed in document order.

        Args:
            corpus_sentence_list: Output of ``get_corpus_sentence_list``.
            **const: Required keys are ``sim_range`` (similarity threshold),
                ``iters`` (number of PageRank iterations), ``abstract_num``
                (maximum summary length in sentences) and ``sim_method``
                (``"cos"`` or ``"log"``).

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``sim_method`` is neither ``"cos"`` nor ``"log"``.
        """
        cossim_range = const["sim_range"]
        iters = const["iters"]
        method = const["sim_method"]
        if method == "log":
            similarity = log_similarity
        elif method == "cos":
            similarity = cosine_similarity
        else:
            raise ValueError("sim_method must be 'cos' or 'log', got {!r}".format(method))

        for page, sentence_list in enumerate(corpus_sentence_list, start=1):
            n_sent = len(sentence_list)
            abstract_num = max(0, min(const["abstract_num"], n_sent))

            # Weighted adjacency matrix: entry [i][j] is the similarity of
            # sentences i and j when it exceeds the threshold, else 0.
            sen_mat = np.zeros((n_sent, n_sent))
            for i in range(n_sent):
                for j in range(n_sent):
                    if i != j:
                        sim = similarity(sentence_list[i], sentence_list[j])
                        if sim > cossim_range:
                            sen_mat[i][j] += sim

            # Normalise every column by its total weight so that a sentence
            # distributes its score over its neighbours; without this the
            # repeated multiplication below would not converge.
            out_weight = sen_mat.sum(axis=0)
            out_weight[out_weight == 0] = 1.0  # isolated sentence: column is all zeros
            transition = sen_mat / out_weight

            # Damped PageRank iteration; the score vector is fed back into
            # the next round so that `iters` actually has an effect.
            scores = np.ones((n_sent, 1))
            for _ in range(iters):
                scores = 0.15 + 0.85 * transition.dot(scores)

            # Rank sentence indices (higher score = more important); sorted()
            # is stable, so ties keep document order.
            ranked = sorted(range(n_sent), key=lambda idx: float(scores[idx][0]), reverse=True)
            chosen = set()
            seen_text = set()
            for idx in ranked:
                if len(chosen) >= abstract_num:
                    break
                # Identical sentences are reported only once.
                if sentence_list[idx] not in seen_text:
                    seen_text.add(sentence_list[idx])
                    chosen.add(idx)

            abstract_str = "".join(sentence_list[idx] + "。"
                                   for idx in range(n_sent) if idx in chosen)
            news_str = ",".join(sentence_list) + "。" if sentence_list else ""
            print("新闻{num}(本身新闻长度{len_sen},摘要长度{abs_num}):\n原文:\n{news}\n摘要:\n{abstract}\n".
                  format(num=page, abstract=abstract_str, abs_num=len(chosen),
                         len_sen=n_sent, news=news_str))
if __name__ == "__main__":
    # Every line of the report is treated as one news item; newline
    # characters are removed before the text is split into sentences.
    with open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", "r", encoding="utf-8") as f:
        news_list = [raw_line.replace("\n", "") for raw_line in f.readlines()]
    corpus_sentence_list = GenerateAbstract.get_corpus_sentence_list(news_list)
    GenerateAbstract.get_abstract(
        corpus_sentence_list,
        sim_range=0.2,
        iters=700,
        abstract_num=8,
        sim_method="cos",
    )
4.情感分析
# -*- coding: utf-8 -*-
"""
Created on Wed May 3 16:25:05 2017
http://www.jianshu.com/p/4cfcf1610a73?nomobile=yes 参考链接
#情感分析
@author: chuc
"""
from collections import defaultdict
import jieba
"""
1. 文本切割
"""
def sent2word(sentence):
    """Segment ``sentence`` with jieba and return the tokens as a list.

    No stop-word filtering is performed; every token jieba produces is
    returned in order.
    """
    # NOTE(review): the report text itself is loaded as jieba's user
    # dictionary on every call; presumably a real dictionary file was
    # intended -- confirm.
    jieba.load_userdict(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt")
    return list(jieba.cut(sentence))
"""
2. 情感定位
"""
def _load_word_scores(path):
    """Read a UTF-8 ``word value`` table (one whitespace-separated pair per line) into a dict."""
    table = {}
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            # Blank or malformed lines are skipped instead of raising IndexError.
            if len(fields) >= 2:
                table[fields[0]] = fields[1]
    return table


def classifyWords(wordDict):
    """Classify tokens as sentiment words, negation words or degree adverbs.

    The three lexicons are read from the ``motion`` directory relative to
    the current working directory.  A token found in the degree lexicon is a
    degree adverb; otherwise a token found in the negation list is a
    negation; otherwise a token found in the sentiment lexicon is a
    sentiment word.  Tokens found in none of them are ignored.

    Args:
        wordDict: Sequence of tokens, e.g. the output of ``sent2word``.

    Returns:
        A tuple ``(senWord, notWord, degreeWord)`` of dicts keyed by the
        token's position: ``senWord`` maps to the sentiment score and
        ``degreeWord`` to the degree weight (both kept as the strings read
        from the lexicon), ``notWord`` maps to ``-1``.
    """
    # (1) sentiment words
    senDict = _load_word_scores(r'motion/BosonNLP_sentiment_score.txt')
    # (2) negation words -- stripped, so that tokens can match whole entries
    with open('motion/notDict.txt', encoding='utf-8') as handle:
        notDic = {line.strip() for line in handle if line.strip()}
    # (3) degree adverbs
    degreeDict = _load_word_scores('motion/degree.txt')

    senWord = {}
    notWord = {}
    degreeWord = {}
    for t, word in enumerate(wordDict):
        if word in degreeDict:
            degreeWord[t] = degreeDict[word]
        elif word in notDic:
            notWord[t] = -1
        elif word in senDict:
            senWord[t] = senDict[word]
    return senWord, notWord, degreeWord
'''
计算句子分数
'''
def score(sen, no, degree, word):
    """Compute the sentiment score of one tokenised sentence.

    Sentiment values are adjusted by the negation words and degree adverbs
    that directly precede them, then summed.

    Args:
        sen: Dict mapping token position to sentiment score (number or
            numeric string).  It is not modified.
        no: Dict mapping token position to the negation factor.
        degree: Dict mapping token position to the degree weight.
        word: The token sequence; only its length is used.

    Returns:
        The sum of the adjusted sentiment values (``0`` when there are none).
    """
    # Work on a float copy so the caller's dict is left untouched.
    weighted = {idx: float(value) for idx, value in sen.items()}
    for i in range(len(word)):
        if i in no and i + 1 in weighted:
            # negation + sentiment
            weighted[i + 1] = float(no[i]) * weighted[i + 1]
        elif i in degree and i + 1 in no and i + 2 in weighted:
            # degree + negation + sentiment: only the degree is applied here,
            # the negation at i + 1 is applied by the branch above on the
            # next iteration (applying it here as well would cancel it out).
            weighted[i + 2] = float(degree[i]) * weighted[i + 2]
        elif i in degree and i + 1 in weighted:
            # degree + sentiment
            weighted[i + 1] = float(degree[i]) * weighted[i + 1]
        elif i in degree and i + 1 in degree:
            # NOTE(review): two consecutive degree adverbs add the product of
            # their weights as an extra score at position i -- kept as in the
            # original; confirm this is the intended treatment.
            weighted[i] = float(degree[i]) * float(degree[i + 1])
    return sum(weighted.values())
def culate(sentences):
    """Return the sentiment score of ``sentences``.

    The text is segmented with ``sent2word``, the tokens are classified with
    ``classifyWords`` and the result is scored with ``score``.
    """
    tokens = sent2word(sentences)
    sentiment_words, negation_words, degree_words = classifyWords(tokens)
    return score(sentiment_words, negation_words, degree_words, tokens)
5.词频词云
在进行这一部分代码操作时,首先要下载wordcloud库,词云库是无法在Pycharm中直接下载的。可以在CSDN上自行查询下载方式。
import jieba
import matplotlib.pyplot as plt
import wordcloud
import numpy
from PIL import Image
import matplotlib
import jieba.posseg as psg
# import matplotlib.colors as colors # 处理图片相关内容
# Make SimHei the sans-serif font matplotlib uses (presumably so that Chinese
# text in the figure is rendered instead of missing-glyph boxes).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# 读取文本
def read_txt(path=r'C:\Users\lenovo\Desktop\4文本分析\报告.txt'):
    """Return the whole content of a UTF-8 text file.

    Args:
        path: File to read; defaults to the report analysed by this script.

    Returns:
        The file content as a single string.
    """
    # Read-only mode is enough ('r+' would also require write permission);
    # the context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()
# 词性统计(写入文档)
def sda():
    """Tag the report with part-of-speech labels and append them to 词性.txt.

    The report is segmented with ``jieba.posseg`` and one ``word/flag`` line
    is appended to the output file for every token.
    """
    import jieba.posseg as psg
    with open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", encoding='utf-8', errors='ignore') as source:
        text = source.read()
    # Both files are closed by their context managers, so the output is
    # flushed as soon as the function returns.  The encoding is explicit so
    # that writing does not depend on the platform default.
    with open(r"C:\Users\lenovo\Desktop\4文本分析\词性.txt", 'a', encoding='utf-8') as file:
        for ele in psg.cut(text):
            # A separator and a newline keep word and tag distinguishable.
            file.write("{}/{}\n".format(ele.word, ele.flag))
# 停词文档
def stopwordslist(filepath):
    """Return the lines of a UTF-8 stop-word file as a list of stripped strings.

    Args:
        filepath: Path of the stop-word file, one word per line.

    Returns:
        One entry per line, in file order, with surrounding whitespace removed.
    """
    # The context manager closes the handle, which the bare open() call leaked.
    with open(filepath, 'r', encoding='utf-8') as handle:
        stopwords = [line.strip() for line in handle]
    return stopwords
# 分词生成词频统计(写入文档)
def write_txt():
    """Count word frequencies of the report and write them to 词频统计.txt.

    The report is segmented with jieba (accurate mode); single-character
    tokens and stop words are ignored.  The output holds one line per word,
    most frequent first, formatted as the word left-aligned in 5 columns
    followed by the count right-aligned in 5 columns.
    """
    words = jieba.lcut(read_txt())
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stopwords = set(stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt'))
    counts = {}
    for word in words:
        if len(word) != 1 and word not in stopwords:
            counts[word] = counts.get(word, 0) + 1
    # Stable sort: words with equal counts keep first-occurrence order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # NOTE: the file is written with the platform default encoding because
    # creat_wordcloud() reads it back the same way.
    with open("词频统计.txt", "w") as f:
        f.writelines("{0:<5}{1:>5}\n".format(word, count) for word, count in items)
# 生成词云
def creat_wordcloud():
    """Draw a word cloud from 词频统计.txt, save it as 词云.jpg and show it.

    Each line of the frequency file is parsed into ``word -> count`` and the
    counts are handed to the word cloud, so the size of a word reflects how
    often it occurs.  (Feeding the raw file text to ``generate`` would see
    every word exactly once.)  Afterwards ``sda()`` is called to write the
    part-of-speech statistics.
    """
    frequencies = {}
    with open("词频统计.txt", 'r') as f_0:
        for line in f_0:
            # The count is the last whitespace-separated field of the line.
            fields = line.rsplit(None, 1)
            if len(fields) == 2 and fields[1].isdigit():
                frequencies[fields[0].strip()] = int(fields[1])
    # The picture defines the shape of the cloud.
    with Image.open(r'C:\Users\lenovo\Desktop\4文本分析\中国地图.png') as picture:
        color_mask = numpy.array(picture)
    wcloud = wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
                                 background_color="white",  # default is black
                                 max_words=500,
                                 mask=color_mask,  # shape of the cloud
                                 width=1000,
                                 height=860,
                                 margin=2,
                                 ).generate_from_frequencies(frequencies)
    wcloud.to_file("词云.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    sda()
# 生成词云(默认样式)
# mywc1 = WordCloud().generate(tokenstr)
def main():
    """Write the word-frequency file, then draw the word cloud from it."""
    for step in (write_txt, creat_wordcloud):
        step()


if __name__ == '__main__':
    main()
首先导入一系列的库文件,Python的好处就在于存在非常多的第三方库,使得程序的编写得以简化。这些库在做其他词云可视化分析时也是要使用到的
import jieba
import matplotlib.pyplot as plt
import wordcloud
import numpy
from PIL import Image
import matplotlib
import jieba.posseg as psg
# import matplotlib.colors as colors # 处理图片相关内容
其中,jieba库用来分词,matplotlib绘制柱状图(柱状图必要的库),wordcloud库是核心用来绘制词云,PIL(Python Image Library)是python平台图像处理标准库
在词云绘制中,首先需要导入我们的txt文件
# 读取文本
def read_txt(path=r'C:\Users\lenovo\Desktop\4文本分析\报告.txt'):
    """Return the whole content of a UTF-8 text file.

    Args:
        path: File to read; defaults to the report analysed by this script.

    Returns:
        The file content as a single string.
    """
    # Read-only mode is enough ('r+' would also require write permission);
    # the context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()
这里文本的路径是文件存在的绝对路径,这里程序可能会出现报错,在CSDN上也都存在这类报错的解决办法
接下来是词性统计和分词生成的词频统计,词性统计的结果写入"词性.txt",词频统计的结果写入"词频统计.txt"(后附部分结果图)
# 词性统计(写入文档)
def sda():
    """Tag the report with part-of-speech labels and append them to 词性.txt.

    The report is segmented with ``jieba.posseg`` and one ``word/flag`` line
    is appended to the output file for every token.
    """
    import jieba.posseg as psg
    with open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", encoding='utf-8', errors='ignore') as source:
        text = source.read()
    # Both files are closed by their context managers, so the output is
    # flushed as soon as the function returns.  The encoding is explicit so
    # that writing does not depend on the platform default.
    with open(r"C:\Users\lenovo\Desktop\4文本分析\词性.txt", 'a', encoding='utf-8') as file:
        for ele in psg.cut(text):
            # A separator and a newline keep word and tag distinguishable.
            file.write("{}/{}\n".format(ele.word, ele.flag))
# 停词文档
def stopwordslist(filepath):
    """Return the lines of a UTF-8 stop-word file as a list of stripped strings.

    Args:
        filepath: Path of the stop-word file, one word per line.

    Returns:
        One entry per line, in file order, with surrounding whitespace removed.
    """
    # The context manager closes the handle, which the bare open() call leaked.
    with open(filepath, 'r', encoding='utf-8') as handle:
        stopwords = [line.strip() for line in handle]
    return stopwords
# 分词生成词频统计(写入文档)
def write_txt():
    """Count word frequencies of the report and write them to 词频统计.txt.

    The report is segmented with jieba (accurate mode); single-character
    tokens and stop words are ignored.  The output holds one line per word,
    most frequent first, formatted as the word left-aligned in 5 columns
    followed by the count right-aligned in 5 columns.
    """
    words = jieba.lcut(read_txt())
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stopwords = set(stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt'))
    counts = {}
    for word in words:
        if len(word) != 1 and word not in stopwords:
            counts[word] = counts.get(word, 0) + 1
    # Stable sort: words with equal counts keep first-occurrence order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # NOTE: the file is written with the platform default encoding because
    # creat_wordcloud() reads it back the same way.
    with open("词频统计.txt", "w") as f:
        f.writelines("{0:<5}{1:>5}\n".format(word, count) for word, count in items)
一系列工作完成后就可以生成词云了,在生成词云时可以选择词云的形状,需要从外部导入图片
一般来说,我们不想要这么方的词云,肯定喜欢一些有形状的,这就需要导入其他包。这里导入的包为numpy:NumPy是Python的一个开源数值计算扩展库,可以用来存储和处理大型矩阵。处理时先把给出形状的图片表示为一个大型矩阵,再在有颜色的地方进行填词(导包:import numpy,与上文代码保持一致)。导包之后需添加一个遮罩层(mask),遮罩层就是用来限制生成图片的形状。
# 生成词云
def creat_wordcloud():
    """Draw a word cloud from 词频统计.txt, save it as 词云.jpg and show it.

    Each line of the frequency file is parsed into ``word -> count`` and the
    counts are handed to the word cloud, so the size of a word reflects how
    often it occurs.  (Feeding the raw file text to ``generate`` would see
    every word exactly once.)  Afterwards ``sda()`` is called to write the
    part-of-speech statistics.
    """
    frequencies = {}
    with open("词频统计.txt", 'r') as f_0:
        for line in f_0:
            # The count is the last whitespace-separated field of the line.
            fields = line.rsplit(None, 1)
            if len(fields) == 2 and fields[1].isdigit():
                frequencies[fields[0].strip()] = int(fields[1])
    # The picture defines the shape of the cloud.
    with Image.open(r'C:\Users\lenovo\Desktop\4文本分析\地图.png') as picture:
        color_mask = numpy.array(picture)
    wcloud = wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
                                 background_color="white",  # default is black
                                 max_words=500,
                                 mask=color_mask,  # shape of the cloud
                                 width=1000,
                                 height=860,
                                 margin=2,
                                 ).generate_from_frequencies(frequencies)
    wcloud.to_file("词云.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    sda()
# 生成词云(默认样式)
# mywc1 = WordCloud().generate(tokenstr)
最后就是基本的主函数调用,不多做描述了
def main():
    """Write the word-frequency file, then draw the word cloud from it."""
    for step in (write_txt, creat_wordcloud):
        step()


if __name__ == '__main__':
    main()
6.词频柱状图
这一部分与上一部分词云也有一定关联,词云中显示文字的大小就表示了词语在报告中出现的频率,这一部分就是使词频更加的直观化、数据化
from matplotlib.font_manager import FontProperties
from collections import Counter
from pylab import *
import jieba.posseg as psg
# Draw a bar chart of the 20 most frequent words of the report, ignoring
# words shorter than two characters and words whose part-of-speech tag is
# listed in `nowords`.
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font used for the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII hyphens
font = FontProperties(size=14)
# The context manager closes the report, which the bare open().read() leaked.
with open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8') as report:
    f3 = report.read()
# Part-of-speech tags excluded from the statistics.
nowords = {'x', 'uj', 'a', 'ul', 'p', 'd', 'v', 'zg', 'm', 'ug', 'i', 'f', 'ad', 'nz', 'r', 'ns', 'q', 't', 'c'}
wods = [pair.word for pair in psg.cut(f3) if len(pair.word) >= 2 and pair.flag not in nowords]
word_count = Counter(wods)
# Rank once and derive both axes from the same list.
top_words = word_count.most_common(20)
x = [word for word, _ in top_words]  # the top-20 keywords
y = [count for _, count in top_words]  # their occurrence counts
fig = plt.figure()
plt.grid(False)
plt.bar(x, y, color='lightskyblue')
plt.xlabel('关键词', fontproperties=font)
plt.ylabel('词频', fontproperties=font)
plt.title('词频分析柱状图', fontproperties=font)
plt.show()
总结
此项目比较基础,我本身也是一个初学者,自认为这个项目还算比较基础比较简单的,大家可以参考这个代码进行实践,本项目仅用于技术交流和学习,欢迎提出改进意见,以期共同进步。
第一篇博客,哪里写的不对,欢迎各位大神批评指正。