Extracting PDF and txt content with Python, counting word frequencies, and drawing a word cloud

This article walks through the full process: reading the text content of PDF files with Python and the PyMuPDF library, segmenting the Chinese text with jieba and counting word frequencies, removing stopwords and words outside a chosen frequency range, and finally generating a word cloud.

① Read the content of PDF files with Python

import fitz  # PyMuPDF

def read_pdf(path_list):
    text = ""
    for file_path in path_list:
        doc = fitz.open(file_path)
        file_text = ""
        for page in doc:
            file_text += page.get_text()
        # Some PDFs (e.g. scanned images) have no extractable text; raise so the caller notices
        if file_text == "":
            raise Exception(f"{file_path} contains no extractable text!")
        text += file_text
    return text
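
Before moving on, a quick sanity check helps: call read_pdf on one file and print the first couple hundred characters. The path below is a placeholder, not one of the article's files:

# Quick check that text extraction works (the path is a placeholder)
sample_text = read_pdf(["example.pdf"])
print(sample_text[:200])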

②读取完内容后,用jieba分词然后统计频次


import re
import jieba

def preprocess_text(text):
    # Keep only Chinese characters, then segment with jieba
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    words = jieba.cut(text)

    # Count how often each segmented word appears
    fenge = {}
    for i in words:
        if i not in fenge:
            fenge[i] = 1
        else:
            fenge[i] += 1

    # Post-processing:

    # Sort by frequency, descending
    words = sorted(fenge.items(), key=lambda x: x[1], reverse=True)

    out = {}
    for word, count in words:
        out[word] = count

    # Stopwords to remove
    list1 = ["的","和","等","在","到","了","为","是","一是","二是"]
    for i in list1:
        out.pop(i, None)  # ignore stopwords that never appeared

    # Remove single-character words
    for i in list(out.keys()):
        if len(i) == 1:
            del out[i]

    # Remove words whose frequency is at or below xiaoyu
    global xiaoyu
    for i in list(out.keys()):
        if out[i] <= xiaoyu:
            del out[i]

    # Remove words whose frequency is at or above dayu
    global dayu
    for i in list(out.keys()):
        if out[i] >= dayu:
            del out[i]

    return out
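
As an aside, the counting, sorting, and filtering above can be condensed with collections.Counter. This is only a compact sketch of the same logic, using the same stopword list and frequency bounds; the function name count_words is this sketch's own, not part of the article's code:

from collections import Counter
import re
import jieba

def count_words(text, min_count=10, max_count=10000):
    # Keep only Chinese characters, segment, and count in one pass
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    counts = Counter(jieba.cut(text))
    stopwords = {"的","和","等","在","到","了","为","是","一是","二是"}
    # Keep multi-character words whose frequency falls strictly inside (min_count, max_count)
    return {w: c for w, c in counts.most_common()
            if w not in stopwords and len(w) > 1 and min_count < c < max_count}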

③ Finally, generate the word cloud from the processed data

from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate the word cloud
def generate_wordcloud(words):
    word_counts = Counter(words)
    # Add max_words=10 to the WordCloud() call below to cap how many words are drawn
    # font_path points to Microsoft YaHei (msyh.ttc) so Chinese characters render correctly
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          font_path="msyh.ttc", max_font_size=150,
                          min_font_size=10).generate_from_frequencies(word_counts)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
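
If you also want the image written to disk rather than only shown in a window, a WordCloud object can export a PNG via its to_file method. One extra line inside generate_wordcloud, before plt.show(), is enough; the output filename here is just an example:

    wordcloud.to_file("wordcloud.png")  # save the rendered cloud as a PNG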

The complete code is as follows:

import fitz  # PyMuPDF  
from wordcloud import WordCloud  
import matplotlib.pyplot as plt  
from collections import Counter  
import re  
import jieba
# Read text from the PDF files
def read_pdf(path_list):
    text = ""
    for file_path in path_list:
        doc = fitz.open(file_path)
        file_text = ""
        for page in doc:
            file_text += page.get_text()
        # Some PDFs (e.g. scanned images) have no extractable text; raise so the caller notices
        if file_text == "":
            raise Exception(f"{file_path} contains no extractable text!")
        text += file_text
    return text

# Read text from plain-text files
def read_txt(path_list):
    data = ""
    for file_path in path_list:
        # A context manager ensures the file handle is closed
        with open(file_path, mode="r", encoding="utf-8") as f:
            data += f.read()
    return data


def preprocess_text(text):
    # Keep only Chinese characters, then segment with jieba
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    words = jieba.cut(text)

    # Count how often each segmented word appears
    fenge = {}
    for i in words:
        if i not in fenge:
            fenge[i] = 1
        else:
            fenge[i] += 1

    # Sort by frequency, descending
    words = sorted(fenge.items(), key=lambda x: x[1], reverse=True)

    out = {}
    for word, count in words:
        out[word] = count

    # Stopwords to remove
    list1 = ["的","和","等","在","到","了","为","是","一是","二是"]
    for i in list1:
        out.pop(i, None)  # ignore stopwords that never appeared

    # Remove single-character words
    for i in list(out.keys()):
        if len(i) == 1:
            del out[i]

    # Remove words whose frequency is at or below xiaoyu
    global xiaoyu
    for i in list(out.keys()):
        if out[i] <= xiaoyu:
            del out[i]

    # Remove words whose frequency is at or above dayu
    global dayu
    for i in list(out.keys()):
        if out[i] >= dayu:
            del out[i]

    return out

# Generate the word cloud
def generate_wordcloud(words):
    word_counts = Counter(words)
    # Add max_words=10 to the WordCloud() call below to cap how many words are drawn
    # font_path points to Microsoft YaHei (msyh.ttc) so Chinese characters render correctly
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          font_path="msyh.ttc", max_font_size=150,
                          min_font_size=10).generate_from_frequencies(word_counts)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
  


# Keep words in the desired frequency range: drop frequency <= xiaoyu and frequency >= dayu
xiaoyu = 10
dayu = 10000
# Main program
if __name__ == "__main__": 
    
    # Replace these with your own files
    pdf_list=[
        "乡村振兴背景下数字乡村建设对农村相对贫困的影响研究_邓玉梅.pdf",
        "乡村振兴视域下数字乡村建设研究_郭鑫.pdf",
        "乡村振兴战略视域下乡风文明建设研究_王胜礼.pdf",
        "新时代健康乡村建设研究_马景瑞.pdf"
    ]

    words = preprocess_text(read_pdf(pdf_list))

    generate_wordcloud(words)
    print(words)
    # Generate a bar chart as well (see the sketch below)
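
The final comment leaves the bar chart as an exercise. Below is one possible sketch with matplotlib, reusing the same msyh.ttc font file so the Chinese labels render; plot_bar and top_n are names chosen for this sketch, not part of the original code:

import matplotlib.pyplot as plt
from matplotlib import font_manager

def plot_bar(words, top_n=20):
    # words is the dict returned by preprocess_text; take the top_n most frequent
    items = sorted(words.items(), key=lambda x: x[1], reverse=True)[:top_n]
    labels, counts = zip(*items)
    font = font_manager.FontProperties(fname="msyh.ttc")  # same font file as the word cloud
    plt.figure(figsize=(10, 5))
    plt.bar(range(len(labels)), counts)
    plt.xticks(range(len(labels)), labels, fontproperties=font, rotation=45)
    plt.ylabel("frequency")
    plt.tight_layout()
    plt.show()

plot_bar(words)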
