Batch word frequency analysis of text files with Python 3.7

The original code came from GitHub; I modified it and am recording it here.

The script segments the documents shown below into words and counts word frequencies, then saves the resulting frequency tables (CSV files) and the segmented text files into a result folder.
[Figure: the input text files to be segmented]
[Figure: the files generated in the result folder]
Batch file processing function:
It mainly uses the os module to build the names and paths of the newly generated files, so that all documents are processed in one batch.

def word_frequency_analysis(path):
    files = os.listdir(path)  # list of all file names under path
    result_dir = os.path.abspath(os.path.join(path, 'result'))  # absolute path of the result directory
    csv_all = os.path.abspath(os.path.join(result_dir, 'csv_all.csv'))  # defined but not used below
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)  # create the result directory if it does not exist
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'r', encoding='utf-8') as f:  # adjust the encoding to match the input files (e.g. 'gbk')
            txt_content = f.read()
        field_name = filename[:-4] + '年'  # e.g. '2014年', '2015年'
        header_filed.append(field_name)  # header_filed is a module-level list (see the complete code below)
        filename_fulltext = filename[:-4] + '_all.txt'
        filename_counter = filename[:-4] + '_tj.csv'
       # filename_key = filename[:-4] + '_hy_tj.csv'

        txt_to_all = os.path.join(result_dir, filename_fulltext)
        txt_to_counter = os.path.join(result_dir, filename_counter)
      #  txt_to_key = os.path.join(result_dir, filename_key)

        text_cutted = jiebaCutText(txt_content)
        text_cleared = clearText(text_cutted)
        countwords(text_cleared, txt_to_counter)

        with open(txt_to_all, 'w', encoding='utf-8') as newfile:
            newfile.write(text_cleared)
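
A quick illustration of how the output names are built (the directory and file name below are made up for the example):

import os

path = 'E:/data'        # hypothetical input directory
filename = '2014.txt'   # hypothetical input file
result_dir = os.path.join(path, 'result')
print(filename[:-4] + '_all.txt')                            # 2014_all.txt, the segmented full text
print(os.path.join(result_dir, filename[:-4] + '_tj.csv'))   # path of the frequency table inside result/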

Segmentation function:
Segmentation is done with the third-party jieba library.

def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=False)  # precise mode
    liststr = '/'.join(seg_list)
    return liststr  # the returned string still contains punctuation
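
To see what the segmentation produces, here is a minimal example (the sentence is only for illustration; the exact output may vary with the jieba version and dictionary):

import jieba

seg_list = jieba.cut('我来到北京清华大学', cut_all=False)  # precise mode
print('/'.join(seg_list))
# typical output: 我/来到/北京/清华大学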

Removing punctuation and single-character words:
Only the tokens that meet the requirements (longer than one character and containing Chinese characters) are kept in the list.

def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)
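
A small illustrative run (assuming the helper functions in this post are defined): single-character tokens, punctuation, and pure digits are all filtered out.

cutted = '我/来到/北京/清华大学/,/2014'
print(clearText(cutted))
# 来到/北京/清华大学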

Checking for Chinese characters:
The re module is used to test whether a token contains Chinese characters.

def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # CJK Unified Ideographs range starts at \u4e00
    match = zh.search(word)
    return match
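
For example (contain_zh returns a match object or None, so bool() shows the result):

print(bool(contain_zh('词频')))   # True: contains Chinese characters
print(bool(contain_zh('2014')))   # False: digits only
print(bool(contain_zh('abc中')))  # True: at least one Chinese character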

Word frequency counting function:
It uses a plain dict for counting and OrderedDict from the collections module for sorting.

def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1

    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))  # sort by frequency, ascending
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # newline='' prevents blank lines; utf-8-sig adds a BOM so Excel opens the CSV correctly
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())
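
The same counting can be written more compactly with collections.Counter; the sketch below (countwords_counter is just an illustrative name) writes the words sorted by descending frequency instead:

from collections import Counter
import csv

def countwords_counter(text, counter_file):
    counts = Counter(w for w in text.split('/') if w)  # ignore empty tokens
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerows(counts.most_common())  # (word, count) pairs, highest count first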

Complete code:

import csv
import fnmatch
import os
import re
from collections import OrderedDict
import jieba

header_filed = []  # collects one header per file, e.g. '2014年'; must be defined before word_frequency_analysis runs

def word_frequency_analysis(path):
    files = os.listdir(path)  # list of all file names under path
    result_dir = os.path.abspath(os.path.join(path, 'result'))  # absolute path of the result directory
    csv_all = os.path.abspath(os.path.join(result_dir, 'csv_all.csv'))  # defined but not used below
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)  # create the result directory if it does not exist
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'r', encoding='utf-8') as f:  # adjust the encoding to match the input files (e.g. 'gbk')
            txt_content = f.read()
        field_name = filename[:-4] + '年'  # e.g. '2014年', '2015年'
        header_filed.append(field_name)
        filename_fulltext = filename[:-4] + '_all.txt'
        filename_counter = filename[:-4] + '_tj.csv'
       # filename_key = filename[:-4] + '_hy_tj.csv'

        txt_to_all = os.path.join(result_dir, filename_fulltext)
        txt_to_counter = os.path.join(result_dir, filename_counter)
      #  txt_to_key = os.path.join(result_dir, filename_key)

        text_cutted = jiebaCutText(txt_content)
        text_cleared = clearText(text_cutted)
        countwords(text_cleared, txt_to_counter)

        with open(txt_to_all, 'w', encoding='utf-8') as newfile:
            newfile.write(text_cleared)


def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=False)  # precise mode
    liststr = '/'.join(seg_list)
    return liststr  # the returned string still contains punctuation


def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)


def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # CJK Unified Ideographs range starts at \u4e00
    match = zh.search(word)
    return match


def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1

    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))  # sort by frequency, ascending
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # newline='' prevents blank lines; utf-8-sig adds a BOM so Excel opens the CSV correctly
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())

if __name__ == '__main__':
    path = 'E:/Programe/PySeg/jieba-wordcloud-demo-master/基础数据/韶关(分年度)'
    word_frequency_analysis(path)