Text Analysis and Information Visualization with Python

Table of Contents

Preface

Key Techniques

Code

1. TF-IDF text similarity comparison

2. News summarization

3. Automatic summarization (TextRank)

4. Sentiment analysis

5. Word frequency word cloud

6. Word frequency bar chart

Summary


Preface

This program performs text analysis, including word cloud generation and display, automatic news summarization, word frequency statistics, and bar chart visualization of word frequencies. The code is fairly basic. (The data files used in the code are not attached to this article; message me if you need them, or download them from the web.)

Key Techniques

  1. matplotlib for drawing bar charts
  2. wordcloud for drawing word clouds
  3. jieba for word segmentation
  4. collections.Counter for counting how often elements occur (a small sketch combining these pieces follows this list)
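
To show how these four pieces fit together, here is a minimal sketch (my own example, not from the project files; the sample sentence is made up): jieba segments the text, Counter tallies the words, and matplotlib draws a small bar chart.

import jieba
from collections import Counter
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels display correctly

text = '今天天气很好,我们一起去公园散步,公园里的花开得很好'  # made-up sample text
words = [w for w in jieba.lcut(text) if len(w) > 1]  # keep tokens longer than one character
counts = Counter(words)  # word -> number of occurrences

top = counts.most_common(5)  # the five most frequent words
plt.bar([w for w, _ in top], [c for _, c in top], color='lightskyblue')
plt.show()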

Code

1. TF-IDF text similarity comparison

import jieba
import jieba.analyse

file = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8')
txt = file.read()
file.close()

# Extract the top-20 keywords by TF-IDF weight, keeping only the listed POS tags
keywords = jieba.analyse.extract_tags(txt, topK=20, withWeight=True, allowPOS=('n', 'nr', 'ns', 'v', 'vn'))

for word, weight in keywords:
    print(word, weight)
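
The snippet above uses TF-IDF only to extract keywords. Since the section title also mentions comparing text similarity, here is a hedged sketch (my addition, not part of the original code; the two file names are placeholders) of how two documents could be compared with TF-IDF vectors and cosine similarity, using scikit-learn, which the news-summarization section below also relies on.

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_similarity(text1, text2):
    # Segment with jieba and join with spaces so the vectorizer can split on whitespace
    corpus = [' '.join(jieba.lcut(text1)), ' '.join(jieba.lcut(text2))]
    tfidf = TfidfVectorizer().fit_transform(corpus)
    # cosine_similarity returns a 2x2 matrix; entry [0, 1] is the similarity of the two texts
    return cosine_similarity(tfidf[0], tfidf[1])[0][0]

# Placeholder paths for illustration
with open('doc1.txt', encoding='utf-8') as f1, open('doc2.txt', encoding='utf-8') as f2:
    print(tfidf_similarity(f1.read(), f2.read()))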

2. News summarization

import re  # used to split the document into sentences
import os  # used to work with file paths
import jieba  # word segmentation
import numpy
from sklearn.metrics import pairwise_distances  # compute text similarity
from sklearn.feature_extraction.text import CountVectorizer  # turn text into count vectors


def summary(path, num_summary=2):
    '''
    Purpose: produce a short extractive summary of a document.
    Parameters:
        path: path to the document
        num_summary: number of summary sentences
    Returns:
        result: the summary string
    '''

    # Load the text
    # cwd=os.getcwd()
    contents = ''
    with open(path, 'r', encoding='utf-8') as file:
        contents = file.read().strip()
    # Split into sentences; the first element is the whole document, so that row 0 of the
    # distance matrix compares every sentence against the full text
    subCorpus = [contents] + re.split('[。?!\n]', contents)
    # Load stop words
    stop_words_path = r'C:\Users\lenovo\Desktop\4文本分析\stop.txt'
    with open(stop_words_path, 'r', encoding='utf-8') as sw:
        stop_words = [line.strip() for line in sw.readlines()]
    # Word segmentation
    segments = []
    clean_subCorpus = []
    for content in subCorpus:
        segs = jieba.cut(content)  # generator of tokens
        segment = ' '.join(segs)  # join the tokens into one whitespace-separated string
        if len(segment.strip()) >= 5:  # drop sentences shorter than 5 characters
            segments.append(segment.strip())
            clean_subCorpus.append(content.strip())
    # Text vectors
    countVectorizer = CountVectorizer(stop_words=stop_words)  # filter stop words here
    textVector = countVectorizer.fit_transform(segments)  # e.g. shape=(10, 89) for the sample document
    # Text similarity (cosine distance: smaller means more similar)
    distance_matrix = pairwise_distances(textVector, metric='cosine')
    # Build the summary
    sort_index = numpy.argsort(distance_matrix[0])  # ascending distance: most similar sentences first
    num_summary = min(len(clean_subCorpus), num_summary + 1)

    summarys = []  # summary sentences
    sorts = []  # indices of the selected sentences
    for i in range(1, num_summary):  # skip index 0, which is the whole document
        sorts.append(sort_index[i])
    sorts_ix = numpy.argsort(sorts)  # restore the original sentence order
    for ix in sorts_ix:
        summarys.append(clean_subCorpus[sorts[ix]])
    result = '。'.join(summarys)
    return result


path = r'C:\Users\lenovo\Desktop\4文本分析\报告.txt'
print(summary(path, num_summary=3))

3. Automatic summarization (TextRank)

# TextRank automatic summarization
import re
import jieba
import numpy as np
import jieba.analyse
from numpy import *
from collections import Counter

def load_stop_words():
    # Load the stop word list into a global variable (this helper is not called in the script below)
    global stopwords
    with open(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt', "r", encoding="utf-8") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].replace("\n", "")

def cosine_similarity(sentence1, sentence2):
    # Bag-of-words cosine similarity between two sentences
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    vocab_list = list(set(sen1_vocab_list + sen2_vocab_list))

    sen1_counter = Counter(sen1_vocab_list)
    sen2_counter = Counter(sen2_vocab_list)
    sen1_vec = np.zeros(len(vocab_list))
    sen2_vec = np.zeros(len(vocab_list))
    for i, word in enumerate(vocab_list):
        sen1_vec[i] = sen1_counter[word]
        sen2_vec[i] = sen2_counter[word]

    cos_sim = float(np.sum(sen1_vec * sen2_vec)) / (np.linalg.norm(sen1_vec) * np.linalg.norm(sen2_vec))
    return cos_sim

def log_similarity(sentence1, sentence2):
    # Overlap similarity: number of shared words divided by log(len1) + log(len2)
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    if len(sen1_vocab_list) == 1 and len(sen2_vocab_list) == 1:
        return 0.0
    count = 0
    for word in sen1_vocab_list:
        if word in sen2_vocab_list:
            count += 1
    log_sim = count / (log(len(sen1_vocab_list)) + log(len(sen2_vocab_list)))
    return log_sim


class GenerateAbstract():

    @classmethod
    def get_corpus_sentence_list(cls, corpus_list):
        # Split each news item into sentences on common punctuation marks
        punch = r',|/|;|\'|`|<|>|\?|:|\{|\}|\~|!|@|#|\$|%|\^|&|=|\_|\+|,|。|;|【|】|!| |…'
        sentence_list = []
        for i in range(len(corpus_list)):
            # Drop empty strings produced by consecutive punctuation marks
            sentence_list.append([s for s in re.split(punch, corpus_list[i]) if s.strip() != ""])
        return sentence_list

    @classmethod
    def get_abstract(cls, corpus_sentence_list, **const):
        cossim_range = const["sim_range"]   # similarity threshold for adding an edge
        iters = const["iters"]              # number of score-update iterations
        method = const["sim_method"]        # "cos" or "log"
        page = 1
        for sentence_list in corpus_sentence_list:
            abstract_num = const["abstract_num"]
            l = len(sentence_list)
            if l < abstract_num:
                abstract_num = l
            sen_mat = np.zeros(l * l).reshape(l, l)
            for i in range(len(sentence_list)):
                for j in range(len(sentence_list)):
                    if i != j:
                        if method == "log":
                            cos_sim = log_similarity(sentence_list[i], sentence_list[j])
                        elif method == "cos":
                            cos_sim = cosine_similarity(sentence_list[i], sentence_list[j])
                        if cos_sim > cossim_range:  # connect two sentences only if their similarity exceeds the threshold
                            sen_mat[i][j] += cos_sim
            # Normalise each column so the similarity matrix acts like a transition matrix
            col_sum = sen_mat.sum(axis=0)
            col_sum[col_sum == 0] = 1  # avoid division by zero for isolated sentences
            sen_mat = sen_mat / col_sum
            # Damped, TextRank-style iterative score update
            PR_mat = np.array(ones(l)).reshape(l, 1)
            for i in range(iters):
                PR_mat = 0.15 + 0.85 * sen_mat.dot(PR_mat)
            res_mat = PR_mat
            res_dic = {}
            for i in range(len(res_mat)):
                res_dic.update({sentence_list[i]: float(res_mat[i][0])})
            res_dic = sorted(res_dic.items(), key=lambda x: x[1], reverse=True)  # a higher score means a more central sentence
            abstract_list = []
            abstract_str = ""
            news_str = ""
            for i in range(abstract_num):
                abstract_list.append(res_dic[i][0])
            for sentence in sentence_list:
                # Emit the selected sentences in their original order
                if sentence in abstract_list:
                    abstract_list.remove(sentence)
                    abstract_str += sentence + "。"
            for i in range(l):
                if i < l - 1:
                    news_str += sentence_list[i] + ","
                else:
                    news_str += sentence_list[i] + "。"
            print("新闻{num}(本身新闻长度{len_sen},摘要长度{abs_num}):\n原文:\n{news}\n摘要:\n{abstract}\n".
                  format(num=page, abstract=abstract_str, abs_num=abstract_num, len_sen=len(sentence_list), news=news_str))
            page += 1



if __name__ == "__main__":
    with open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", "r", encoding="utf-8") as f:
        news_list = f.readlines()
    for i in range(len(news_list)):
        news_list[i] = news_list[i].replace("\n", "")
    corpus_sentence_list = GenerateAbstract.get_corpus_sentence_list(news_list)
    GenerateAbstract.get_abstract(corpus_sentence_list,sim_range = 0.2,iters = 700,abstract_num = 8,sim_method="cos")

4. Sentiment analysis

# -*- coding: utf-8 -*-
"""
Created on Wed May  3 16:25:05 2017
Reference: http://www.jianshu.com/p/4cfcf1610a73?nomobile=yes
# Sentiment analysis
@author: chuc
"""

from collections import defaultdict

import jieba

"""
1. Text segmentation
"""


def sent2word(sentence):
    """
    Segment a sentence to words
    Delete stopwords

    """
    jieba.load_userdict(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt")
    segList = jieba.cut(sentence)
    segResult = []
    for w in segList:
        segResult.append(w)
    '''f = open('motion/stopword.txt')
    stopwords = f.readlines()
    f.close()
    newSent = []
    for word in segResult:
        if word in stopwords:
            # print "stopword: %s" % word
            continue
        else:
            newSent.append(word)
'''
    return segResult


"""
2. 情感定位
"""


def classifyWords(wordDict):
    # (1) Sentiment words (BosonNLP lexicon: one "word score" pair per line)
    f = open(r'motion/BosonNLP_sentiment_score.txt', encoding='utf-8')
    senList = f.readline()

    senDict = defaultdict()
    while senList:
        senDict[senList.split(' ')[0]] = senList.split(' ')[1]
        senList = f.readline()
    f.close()
    # (2) Negation words
    g = open('motion/notDict.txt', encoding='utf-8')
    notList = g.readline()
    notDic = []
    while notList:
        notDic.append(notList.strip())
        notList = g.readline()
    g.close()

    # (3) Degree adverbs (one "word weight" pair per line)
    f = open('motion/degree.txt', encoding='utf-8')
    degreeList = f.readline()
    degreeDict = defaultdict()
    while degreeList:
        degreeDict[degreeList.split()[0]] = degreeList.split()[1]
        degreeList = f.readline()
    f.close()
    senWord = defaultdict()
    notWord = defaultdict()
    degreeWord = defaultdict()
    t = 0
    for word in wordDict:
        if word in senDict.keys() and word not in notDic and word not in degreeDict.keys():
            senWord[t] = senDict[word]
        elif word in notDic and word not in degreeDict.keys():
            notWord[t] = -1
        elif word in degreeDict.keys():
            degreeWord[t] = degreeDict[word]
        t = t + 1
    # print(senWord, notWord, degreeWord)
    return senWord, notWord, degreeWord


'''
Compute the sentence score
'''


def score(sen, no, degree, word):
    score = 0
    for i in range(len(word)):
        if i in no.keys() and i + 1 in sen.keys():
            sen[i + 1] = float(no[i]) * float(sen[i + 1])
        elif i in degree.keys() and i + 1 in no.keys() and i + 2 in sen.keys():
            sen[i + 2] = float(no[i + 1]) * float(sen[i + 2]) * float(degree[i])
        elif i in degree.keys() and i + 1 in sen.keys():
            sen[i + 1] = float(degree[i]) * float(sen[i + 1])
        elif i in degree.keys() and i + 1 in degree.keys():
            sen[i] = float(degree[i]) * float(degree[i + 1])
    # Score the different phrase combinations (negation, degree adverb, sentiment word)
    for j in sen.keys():
        score = score + float(sen[j])
    return score


def culate(sentences):
    sp = sent2word(sentences)
    d, dd, ddd = classifyWords(sp)
    score1 = score(d, dd, ddd, sp)
    return score1
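
The original post stops at the culate helper without showing a call, so here is a minimal usage sketch (my addition; the example sentence is made up, and it assumes the BosonNLP lexicon, negation list and degree-adverb list exist under motion/ as the code above expects).

if __name__ == '__main__':
    sentence = '这家餐厅的服务非常好,但是价格不太便宜'  # made-up example sentence
    print('sentiment score:', culate(sentence))  # a positive total roughly indicates positive sentiment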

5. Word frequency word cloud

Before running this part, install the wordcloud library first. It cannot always be installed directly from inside PyCharm; installing it from the command line (for example with pip install wordcloud) usually works, and CSDN has posts covering the details.

import jieba
import matplotlib.pyplot as plt
import wordcloud
import numpy
from PIL import Image
import matplotlib
import jieba.posseg as psg

# import matplotlib.colors as colors  # for customising word colours

matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese characters correctly

# Read the source text
def read_txt():
    file = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8')
    txt = file.read()
    file.close()
    return txt


# Part-of-speech statistics (written to a file)
def sda():
    import jieba.posseg as psg
    text = open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", encoding='utf-8', errors='ignore').read()
    seg = psg.cut(text)
    with open(r"C:\Users\lenovo\Desktop\4文本分析\词性.txt", 'a+', encoding='utf-8') as file:
        for ele in seg:
            file.write('{} {}\n'.format(ele.word, ele.flag))


# Load the stop word list
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords


# Segment the text and write word frequency statistics to a file
def write_txt():
    words = jieba.lcut(read_txt())  # segment the text with jieba's precise mode
    counts = {}  # store each word and its number of occurrences as key-value pairs
    stopwords = stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt')
    for word in words:
        if len(word) == 1:  # skip single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1  # add 1 each time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending

    f = open("词频统计.txt", "w", encoding='utf-8')  # write the results to a file
    for i in range(len(items)):
        word, count = items[i]
        f.write("{0:<5}{1:>5}\n".format(word, count))
    f.close()


# Generate the word cloud
def creat_wordcloud():
    f_0 = open("词频统计.txt", 'r', encoding='utf-8')
    # bg_pic=plt.imread(r'C:\Users\lenovo\Desktop\4文本分析\中国地图.png')
    # Open the background (mask) image
    color_mask = numpy.array(Image.open(r'C:\Users\lenovo\Desktop\4文本分析\中国地图.png'))
    # Custom word colours
    # colormaps = colors.ListedColormap(['#FF0000','#FF7F50','#FFE4C4'])
    text = f_0.read()
    f_0.close()
    wcloud = wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
                                 background_color="white",  # background colour, black by default
                                 max_words=500,
                                 mask=color_mask,  # shape of the cloud
                                 # colormap=colormaps, # custom colours
                                 width=1000,  # width
                                 height=860,  # height
                                 margin=2,
                                 ).generate(text)
    # Save and display the word cloud
    wcloud.to_file("词云.jpg")  # write the word cloud image to disk

    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    sda()

    # Generate a word cloud with the default style
    # mywc1 = WordCloud().generate(tokenstr)


def main():
    write_txt()
    creat_wordcloud()


if __name__ == '__main__':
    main()

First, import a series of libraries. One of Python's strengths is its large ecosystem of third-party libraries, which greatly simplifies the program. The same libraries are needed for other word cloud visualization work.

import jieba
import matplotlib.pyplot as plt
import wordcloud
import numpy
from PIL import Image
import matplotlib
import jieba.posseg as psg

# import matplotlib.colors as colors  # for customising word colours

Here, jieba handles word segmentation; matplotlib draws the bar chart (it is required for that part); wordcloud is the core library for drawing the word cloud; and PIL (the Python Imaging Library) is Python's standard image-processing library.
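
As a quick illustration of the two jieba entry points imported above, jieba.lcut for plain tokens and jieba.posseg for tokens with part-of-speech tags, here is a small sketch (the sample sentence is made up and not taken from the report file).

import jieba
import jieba.posseg as psg

sample = '我们在公园里散步'  # made-up sample sentence
print(jieba.lcut(sample))  # plain tokens as a list of words
for pair in psg.cut(sample):
    print(pair.word, pair.flag)  # each token with its part-of-speech tag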

When drawing the word cloud, the first step is to read in the txt file.

# Read the source text
def read_txt():
    file = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8')
    txt = file.read()
    file.close()
    return txt

The path here is the absolute path to the file. This is a common source of errors, and solutions for such errors are easy to find on CSDN.

Next come the part-of-speech statistics and the word frequency statistics produced from the segmented text; the part-of-speech results are written to a separate file.

# Part-of-speech statistics (written to a file)
def sda():
    import jieba.posseg as psg
    text = open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", encoding='utf-8', errors='ignore').read()
    seg = psg.cut(text)
    with open(r"C:\Users\lenovo\Desktop\4文本分析\词性.txt", 'a+', encoding='utf-8') as file:
        for ele in seg:
            file.write('{} {}\n'.format(ele.word, ele.flag))


# Load the stop word list
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords


# Segment the text and write word frequency statistics to a file
def write_txt():
    words = jieba.lcut(read_txt())  # segment the text with jieba's precise mode
    counts = {}  # store each word and its number of occurrences as key-value pairs
    stopwords = stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt')
    for word in words:
        if len(word) == 1:  # skip single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1  # add 1 each time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending

    f = open("词频统计.txt", "w", encoding='utf-8')  # write the results to a file
    for i in range(len(items)):
        word, count = items[i]
        f.write("{0:<5}{1:>5}\n".format(word, count))
    f.close()

Once this is done, the word cloud can be generated. You can choose the shape of the word cloud, which requires importing an image from outside.

Generally we do not want a plain rectangular word cloud; a shaped one looks much better, and that requires another package, numpy. numpy is an open-source numerical computing extension for Python that can store and process large matrices. Here the shape image is represented as a large matrix, and words are only drawn where the image has colour (import numpy as np). After that a mask layer is added; the mask is what restricts the shape of the generated image.
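
A small sketch of how the mask works (my addition; it reuses the mask image path from the code below): the image is converted to a numpy array, and WordCloud leaves pure white areas (pixel value 255) empty, drawing words only inside the shape.

import numpy as np
from PIL import Image

# Convert the shape image into a matrix of pixel values
color_mask = np.array(Image.open(r'C:\Users\lenovo\Desktop\4文本分析\地图.png'))
print(color_mask.shape)  # e.g. (height, width, channels)
# Pure white pixels (255) are treated as masked-out background when the array is passed as mask=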

# Generate the word cloud
def creat_wordcloud():
    f_0 = open("词频统计.txt", 'r', encoding='utf-8')
    # bg_pic=plt.imread(r'C:\Users\lenovo\Desktop\4文本分析\地图.png')
    # Open the background (mask) image
    color_mask = numpy.array(Image.open(r'C:\Users\lenovo\Desktop\4文本分析\地图.png'))
    # Custom word colours
    # colormaps = colors.ListedColormap(['#FF0000','#FF7F50','#FFE4C4'])
    text = f_0.read()
    f_0.close()
    wcloud = wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
                                 background_color="white",  # background colour, black by default
                                 max_words=500,
                                 mask=color_mask,  # shape of the cloud
                                 # colormap=colormaps, # custom colours
                                 width=1000,  # width
                                 height=860,  # height
                                 margin=2,
                                 ).generate(text)
    # Save and display the word cloud
    wcloud.to_file("词云.jpg")  # write the word cloud image to disk

    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    sda()

    # Generate a word cloud with the default style
    # mywc1 = WordCloud().generate(tokenstr)

Finally, the main function simply calls the two steps above; there is not much more to say about it.

def main():
    write_txt()
    creat_wordcloud()


if __name__ == '__main__':
    main()

6. Word frequency bar chart

This part is closely related to the word cloud above: the size of a word in the cloud reflects how often it appears in the report, and this part presents those frequencies in a more direct, quantitative way.

from matplotlib.font_manager import FontProperties
from collections import Counter
from pylab import *
import jieba.posseg as psg

mpl.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese characters on the axes
mpl.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly

font = FontProperties(size=14)
f3 = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8').read()
# Part-of-speech tags excluded from the count
nowords = ['x', 'uj', 'a', 'ul', 'p', 'd', 'v', 'zg', 'm', 'ug', 'i', 'f', 'ad', 'nz', 'r', 'ns', 'q', 't', 'c']

# Keep words of length >= 2 whose POS tag is not in the exclusion list
wods = [x.word for x in psg.cut(f3) if len(x.word) >= 2 and x.flag not in nowords]
word_count = Counter(wods)
# print(word_count)

x = [x[0] for x in word_count.most_common(20)]  # top-20 keywords
y = [x[1] for x in word_count.most_common(20)]  # number of occurrences of each keyword
fig = plt.figure()
plt.grid(False)
# c = np.random.randint(0,1,len(y))
plt.bar(x, y, color='lightskyblue')
plt.xlabel('关键词', fontproperties=font)
plt.ylabel('词频', fontproperties=font)
plt.title('词频分析柱状图', fontproperties=font)
plt.show()
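
One optional tweak (my addition, not in the original code): if the 20 keyword labels overlap on the x-axis, rotating them before the call to plt.show() in the script above usually helps.

# Optional addition to the script above, placed before plt.show():
plt.xticks(rotation=45, fontproperties=font)  # rotate the keyword labels so they stay readable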

Summary

This project is fairly basic. I am a beginner myself and consider it a simple, entry-level project, so feel free to use the code as a reference for your own practice. It is intended only for technical exchange and learning; suggestions for improvement are welcome so we can make progress together.

This is my first blog post, so if anything is wrong, I welcome corrections from more experienced readers.

