① Read the text content of PDF files with Python
def read_pdf(path_list):
    text = ""
    for file_path in path_list:
        doc = fitz.open(file_path)
        for page in doc:
            text += page.get_text()
        # Some PDFs (e.g. scanned ones) contain no extractable text; raise an error in that case
        if text == "":
            raise Exception(f"{file_path} contains no extractable text!")
    return text
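A quick sanity check of the extraction (a minimal usage sketch relying on the read_pdf function above; example.pdf is a hypothetical filename, replace it with a real PDF):

text = read_pdf(["example.pdf"])   # hypothetical file, for illustration only
print(len(text), text[:100])       # total length and a short preview of the extracted text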
② After reading the content, segment it with jieba and count word frequencies
def preprocess_text(text):
    # Keep only Chinese characters
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    words = jieba.cut(text)
    # Count the frequency of each word produced by jieba
    fenge = {}
    for i in words:
        if i not in fenge:
            fenge[i] = 1
        else:
            fenge[i] += 1
    # Sort by frequency in descending order and rebuild the dict in that order
    words = sorted(fenge.items(), key=lambda x: x[1], reverse=True)
    out = {}
    for word, count in words:
        out[word] = count
    # Stop words
    list1 = ["的", "和", "等", "在", "到", "了", "为", "是", "一是", "二是"]
    # Remove the stop words
    for i in list1:
        try:
            del out[i]
        except KeyError:
            pass
    # Remove single-character words
    for i in list(out.keys()):
        if len(i) == 1:
            del out[i]
    # Remove words whose frequency is no greater than xiaoyu
    global xiaoyu
    for i in list(out.keys()):
        if out[i] <= xiaoyu:
            del out[i]
    # Remove words whose frequency is no less than dayu
    global dayu
    for i in list(out.keys()):
        if out[i] >= dayu:
            del out[i]
    return out
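The counting loop above can also be written with collections.Counter, which tallies the jieba tokens in one call. A minimal sketch of just the equivalent counting step (the sample string is made up, only to show the output shape):

import re
from collections import Counter
import jieba

def count_words(text):
    # Keep only Chinese characters, then tally each jieba token
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    return Counter(jieba.cut(text))

sample = "乡村振兴战略推动乡村建设,乡村建设促进乡村振兴"
print(count_words(sample).most_common(3))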
③ Finally, generate the word cloud from the processed data
# Generate the word cloud
def generate_wordcloud(words):
    word_counts = Counter(words)
    # Optionally pass max_words=10 to WordCloud to limit the number of words shown
    wordcloud = WordCloud(width=800, height=400, background_color='white', font_path="msyh.ttc",
                          max_font_size=150, min_font_size=10).generate_from_frequencies(word_counts)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
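If the image should be written to disk instead of only displayed, the WordCloud object has a to_file method. A minimal sketch, assuming the same parameters as above and a hypothetical output name wordcloud.png:

from collections import Counter
from wordcloud import WordCloud

def save_wordcloud(words, out_path="wordcloud.png"):
    wc = WordCloud(width=800, height=400, background_color='white',
                   font_path="msyh.ttc").generate_from_frequencies(Counter(words))
    wc.to_file(out_path)  # render and write the image to out_path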
The complete code is as follows
import fitz # PyMuPDF
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re
import jieba
# Read text from the PDF files
def read_pdf(path_list):
    text = ""
    for file_path in path_list:
        doc = fitz.open(file_path)
        for page in doc:
            text += page.get_text()
        # Some PDFs (e.g. scanned ones) contain no extractable text; raise an error in that case
        if text == "":
            raise Exception(f"{file_path} contains no extractable text!")
    return text
# Read text from plain-text files (an alternative input to read_pdf)
def read_txt(path_list):
    data = ""
    for file_path in path_list:
        with open(file_path, mode="r", encoding="utf-8") as f:
            data += f.read()
    return data
def preprocess_text(text):
    # Keep only Chinese characters
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    words = jieba.cut(text)
    # Count the frequency of each word produced by jieba
    fenge = {}
    for i in words:
        if i not in fenge:
            fenge[i] = 1
        else:
            fenge[i] += 1
    # Sort by frequency in descending order and rebuild the dict in that order
    words = sorted(fenge.items(), key=lambda x: x[1], reverse=True)
    out = {}
    for word, count in words:
        out[word] = count
    # Stop words
    list1 = ["的", "和", "等", "在", "到", "了", "为", "是", "一是", "二是"]
    # Remove the stop words
    for i in list1:
        try:
            del out[i]
        except KeyError:
            pass
    # Remove single-character words
    for i in list(out.keys()):
        if len(i) == 1:
            del out[i]
    # Remove words whose frequency is no greater than xiaoyu
    global xiaoyu
    for i in list(out.keys()):
        if out[i] <= xiaoyu:
            del out[i]
    # Remove words whose frequency is no less than dayu
    global dayu
    for i in list(out.keys()):
        if out[i] >= dayu:
            del out[i]
    return out
# Generate the word cloud
def generate_wordcloud(words):
    word_counts = Counter(words)
    # Optionally pass max_words=10 to WordCloud to limit the number of words shown
    wordcloud = WordCloud(width=800, height=400, background_color='white', font_path="msyh.ttc",
                          max_font_size=150, min_font_size=10).generate_from_frequencies(word_counts)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
# Frequency range filter: preprocess_text removes words with frequency <= xiaoyu or >= dayu
xiaoyu = 10
dayu = 10000
# Main program
if __name__ == "__main__":
    # Replace these with your own files
    pdf_list = [
        "乡村振兴背景下数字乡村建设对农村相对贫困的影响研究_邓玉梅.pdf",
        "乡村振兴视域下数字乡村建设研究_郭鑫.pdf",
        "乡村振兴战略视域下乡风文明建设研究_王胜礼.pdf",
        "新时代健康乡村建设研究_马景瑞.pdf"
    ]
    words = preprocess_text(read_pdf(pdf_list))
    # words = preprocess_text(text)
    generate_wordcloud(words)
    print(words)
    # Generate a bar chart (see the sketch below)
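The trailing comment hints at a bar chart of the top words. A minimal sketch of one way to draw it with matplotlib, assuming words is the frequency dict returned by preprocess_text and that the Microsoft YaHei font (the msyh.ttc used above) is installed so the Chinese labels render:

import matplotlib.pyplot as plt

def plot_bar_chart(words, top_n=15):
    # Use a Chinese-capable font so the word labels are not shown as boxes
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    # Pick the top_n most frequent words from the dict returned by preprocess_text
    items = sorted(words.items(), key=lambda x: x[1], reverse=True)[:top_n]
    labels = [w for w, _ in items]
    counts = [c for _, c in items]
    plt.figure(figsize=(10, 5))
    plt.bar(labels, counts)
    plt.xticks(rotation=45)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()

# plot_bar_chart(words)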