Python开发练习-对目标文件夹内文本文档的题目、内容分别生成词云图-CSDN博客

本文链接：https://blog.csdn.net/tocreateone/article/details/115371865

搜索目标文件夹内的子文件夹信息；
搜索子文件夹内的txt文档信息；
在每个子文件夹内生成两张词云图：一张基于文档标题，一张基于文档内容；
每隔5分钟执行一次以上操作。

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import jieba
import matplotlib
import os
import wordcloud
import cv2
import numpy as np
import pathlib
import copy
import time
def getusefile(orgxls='.'):
    """Collect the text files under a directory, one per distinct base name.

    The tree rooted at ``orgxls`` is searched recursively for paths whose
    name matches ``*.txt*`` (this also matches names such as ``a.txt.bak``).
    Matches are visited in sorted path order and only the first path seen
    for each base name (the file name with its last extension removed) is
    kept, so same-named files in different subfolders collapse to one entry.

    Args:
        orgxls: Directory to search; defaults to the current directory.

    Returns:
        A new list of path strings, in sorted path order.
    """
    seen_names = set()      # O(1) membership test instead of scanning a list
    unique_paths = []
    for found in sorted(pathlib.Path(orgxls).glob('**/*.txt*')):
        path_str = str(found)
        base_name = os.path.split(os.path.splitext(path_str)[0])[1]
        if base_name in seen_names:
            continue
        seen_names.add(base_name)
        unique_paths.append(path_str)
    # The list is freshly built from immutable strings, so no copy is needed.
    return unique_paths


def ciyun(txtfile):
    """Read one text file and return its base name and compacted content.

    The file is decoded as UTF-8 first; if that raises a decoding error the
    file is read again as GBK. Every line is stripped of surrounding
    whitespace, blank lines are dropped, and the remaining lines are joined
    without a separator. The base name and the length of the joined text are
    printed as progress output.

    Args:
        txtfile: Path of the text file to read.

    Returns:
        A tuple ``(file_name, string)`` where ``file_name`` is the file name
        without directory and last extension, and ``string`` is the joined
        text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is neither valid UTF-8 nor valid GBK.
    """
    file_name = os.path.split(os.path.splitext(txtfile)[0])[1]
    try:
        with open(txtfile, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()
    except UnicodeDecodeError:
        # Only a decoding failure justifies the GBK retry; other errors
        # (missing file, Ctrl-C, ...) must propagate instead of being retried.
        with open(txtfile, 'r', encoding='gbk') as fp:
            lines = fp.readlines()

    stripped = (line.strip() for line in lines)
    string = ''.join(part for part in stripped if part)
    print(file_name, len(string))
    return file_name, string
def mkciyun(file_name,text,img1,font,path0):
    """Segment Chinese text and save a word cloud as ``<path0>/<file_name>.jpg``.

    The text is segmented with jieba, tokens of a single character are
    dropped, and the remaining tokens are joined with spaces so WordCloud can
    split them. If no token remains, a message is printed and no image is
    written.

    Args:
        file_name: Output image name, without extension.
        text: Raw text to segment.
        img1: Path of the image that is loaded with OpenCV and passed to
            WordCloud as ``mask``.
        font: Path of the font file passed to WordCloud as ``font_path``.
        path0: Directory the image is written to.

    Raises:
        FileNotFoundError: If ``img1`` cannot be read by OpenCV.
    """
    # ``import matplotlib`` alone does not guarantee the pyplot submodule is
    # loaded, so import it explicitly where it is used.
    import matplotlib.pyplot as plt

    # Chinese segmentation; keep only tokens longer than one character.
    words = [word for word in jieba.lcut(text) if len(word) > 1]
    if not words:
        # WordCloud refuses to draw from an empty word list; skip instead of
        # letting one empty input abort the caller.
        print('skip word cloud %r: no words to draw' % file_name)
        return

    # cv2.imread signals failure by returning None rather than raising.
    mask_image = cv2.imread(img1)
    if mask_image is None:
        raise FileNotFoundError('cannot read mask image: %s' % img1)

    stopword = ['的']
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied and the mask's shape is used instead — confirm.
    wc = wordcloud.WordCloud(
        background_color='white',
        width=1000,
        height=800,
        mask=np.array(mask_image),
        font_path=font,
        stopwords=stopword
    )
    wc.generate_from_text(' '.join(words))

    # Clear the current figure first so repeated calls do not pile up images
    # on the same axes.
    plt.clf()
    plt.imshow(wc)
    plt.axis('off')
    wc.to_file(os.path.join(path0, file_name + '.jpg'))

def sccy(path0,font,img1,fod='baidunews'):
    """Generate a title and a content word cloud for every subfolder of a folder.

    For each subdirectory of ``<path0>/<fod>`` the text files are collected
    with ``getusefile`` and read with ``ciyun``; their names are concatenated
    into one text and their contents into another, and each text is passed to
    ``mkciyun`` with the subdirectory as output location. Subdirectories that
    contain no text files are skipped.

    Args:
        path0: Directory that contains the target folder.
        font: Font file path forwarded to ``mkciyun``.
        img1: Mask image path forwarded to ``mkciyun``.
        fod: Name of the target folder inside ``path0`` whose subfolders are
            processed.

    Raises:
        OSError: If ``<path0>/<fod>`` cannot be listed.
    """
    # os.path.join instead of a hard-coded backslash prefix: works on every
    # platform and does not resolve to the drive root when path0 is ''.
    target_dir = os.path.join(path0, fod)

    # Test the joined path; a bare entry name would be checked against the
    # current working directory instead of the target folder.
    candidates = (os.path.join(target_dir, entry) for entry in os.listdir(target_dir))
    subdirs = [candidate for candidate in candidates if os.path.isdir(candidate)]

    for subdir in subdirs:
        txtall = getusefile(subdir)
        print(len(txtall))
        if not txtall:
            # Nothing to draw; an empty text would make word-cloud generation fail.
            continue

        subject = []
        datas = []
        for file1 in txtall:
            filename, filedata = ciyun(file1)
            subject.append(filename)
            datas.append(filedata)

        mkciyun('标题词云', ''.join(subject), img1, font, subdir)
        mkciyun('内容词云', ''.join(datas), img1, font, subdir)

# Directory of this script, or the current working directory when __file__ is
# not defined. abspath() guards against dirname() returning '' for a bare
# relative script name (possible on Python versions before 3.9), which would
# otherwise place the paths below at the filesystem root.
path0 = os.path.dirname(os.path.abspath(__file__)) if "__file__" in locals() else os.getcwd()
font = r'C:\Windows\Fonts\simfang.ttf'          # font file with Chinese glyphs
img1 = os.path.join(path0, 'wangqiu.jpg')       # image used as the word-cloud mask
while True:
    tt1 = time.time()
    # Generate the word clouds for every subfolder of the target folder.
    sccy(path0, font, img1)
    tt2 = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    # Aim for one run every 300 seconds; clamp at zero because time.sleep()
    # raises ValueError for a negative value when a run takes longer than that.
    time.sleep(max(0, 300 - int(tt2 - tt1)))