Batch word frequency analysis of text files with Python 3.7

The original code came from GitHub; I modified it and am recording it here.

The script segments the documents shown below into words and counts word frequencies, then saves the resulting frequency tables (CSV files) and the segmented text files into a result folder.
[Figure: the input text files to be segmented]
[Figure: the files generated in the result folder]
Batch file processing function:
It mainly uses the os module to build the names and paths of the newly generated files, so that all documents are processed in one batch.

def word_frequency_analysis(path):
    files = os.listdir(path)  # list of all file names under path
    result_dir = os.path.abspath(os.path.join(path, 'result'))  # absolute path of the result directory
    csv_all = os.path.abspath(os.path.join(result_dir, 'csv_all.csv'))  # defined but not used below
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)  # create the result directory if it does not exist
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'r', encoding='utf-8') as f:  # adjust the encoding to match the input files (e.g. 'gbk')
            txt_content = f.read()
        field_name = filename[:-4] + '年'  # e.g. '2014年', '2015年'
        header_filed.append(field_name)  # header_filed is a module-level list (see the complete code below)
        filename_fulltext = filename[:-4] + '_all.txt'
        filename_counter = filename[:-4] + '_tj.csv'
       # filename_key = filename[:-4] + '_hy_tj.csv'

        txt_to_all = os.path.join(result_dir, filename_fulltext)
        txt_to_counter = os.path.join(result_dir, filename_counter)
      #  txt_to_key = os.path.join(result_dir, filename_key)

        text_cutted = jiebaCutText(txt_content)
        text_cleared = clearText(text_cutted)
        countwords(text_cleared, txt_to_counter)

        with open(txt_to_all, 'w', encoding='utf-8') as newfile:
            newfile.write(text_cleared)
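
A quick illustration of how the output names are built (the directory and file name below are made up for the example):

import os

path = 'E:/data'        # hypothetical input directory
filename = '2014.txt'   # hypothetical input file
result_dir = os.path.join(path, 'result')
print(filename[:-4] + '_all.txt')                            # 2014_all.txt, the segmented full text
print(os.path.join(result_dir, filename[:-4] + '_tj.csv'))   # path of the frequency table inside result/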

Segmentation function:
Segmentation is done with the third-party jieba library.

def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=False)  # precise mode
    liststr = '/'.join(seg_list)
    return liststr  # the returned string still contains punctuation
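
To see what the segmentation produces, here is a minimal example (the sentence is only for illustration; the exact output may vary with the jieba version and dictionary):

import jieba

seg_list = jieba.cut('我来到北京清华大学', cut_all=False)  # precise mode
print('/'.join(seg_list))
# typical output: 我/来到/北京/清华大学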

Removing punctuation and single-character words:
Only the tokens that meet the requirements (longer than one character and containing Chinese characters) are kept in the list.

def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)
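
A small illustrative run (assuming the helper functions in this post are defined): single-character tokens, punctuation, and pure digits are all filtered out.

cutted = '我/来到/北京/清华大学/,/2014'
print(clearText(cutted))
# 来到/北京/清华大学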

Checking for Chinese characters:
The re module is used to test whether a token contains Chinese characters.

def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # CJK Unified Ideographs range starts at \u4e00
    match = zh.search(word)
    return match
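
For example (contain_zh returns a match object or None, so bool() shows the result):

print(bool(contain_zh('词频')))   # True: contains Chinese characters
print(bool(contain_zh('2014')))   # False: digits only
print(bool(contain_zh('abc中')))  # True: at least one Chinese character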

Word frequency counting function:
It uses a plain dict for counting and OrderedDict from the collections module for sorting.

def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1

    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))  # sort by frequency, ascending
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # newline='' prevents blank lines; utf-8-sig adds a BOM so Excel opens the CSV correctly
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())
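
The same counting can be written more compactly with collections.Counter; the sketch below (countwords_counter is just an illustrative name) writes the words sorted by descending frequency instead:

from collections import Counter
import csv

def countwords_counter(text, counter_file):
    counts = Counter(w for w in text.split('/') if w)  # ignore empty tokens
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerows(counts.most_common())  # (word, count) pairs, highest count first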

Complete code:

import csv
import fnmatch
import os
import re
from collections import OrderedDict
import jieba

header_filed = []  # collects one header per file, e.g. '2014年'; must be defined before word_frequency_analysis runs

def word_frequency_analysis(path):
    files = os.listdir(path)  # list of all file names under path
    result_dir = os.path.abspath(os.path.join(path, 'result'))  # absolute path of the result directory
    csv_all = os.path.abspath(os.path.join(result_dir, 'csv_all.csv'))  # defined but not used below
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)  # create the result directory if it does not exist
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'r', encoding='utf-8') as f:  # adjust the encoding to match the input files (e.g. 'gbk')
            txt_content = f.read()
        field_name = filename[:-4] + '年'  # e.g. '2014年', '2015年'
        header_filed.append(field_name)
        filename_fulltext = filename[:-4] + '_all.txt'
        filename_counter = filename[:-4] + '_tj.csv'
       # filename_key = filename[:-4] + '_hy_tj.csv'

        txt_to_all = os.path.join(result_dir, filename_fulltext)
        txt_to_counter = os.path.join(result_dir, filename_counter)
      #  txt_to_key = os.path.join(result_dir, filename_key)

        text_cutted = jiebaCutText(txt_content)
        text_cleared = clearText(text_cutted)
        countwords(text_cleared, txt_to_counter)

        with open(txt_to_all, 'w', encoding='utf-8') as newfile:
            newfile.write(text_cleared)


def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=False)  # precise mode
    liststr = '/'.join(seg_list)
    return liststr  # the returned string still contains punctuation


def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)


def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # CJK Unified Ideographs range starts at \u4e00
    match = zh.search(word)
    return match


def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1

    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))  # sort by frequency, ascending
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # newline='' prevents blank lines; utf-8-sig adds a BOM so Excel opens the CSV correctly
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())

if __name__ == '__main__':
    path = 'E:/Programe/PySeg/jieba-wordcloud-demo-master/基础数据/韶关(分年度)'
    word_frequency_analysis(path)