python实现指定目录下批量文件的单词计数：串行版本

本文链接：https://blog.csdn.net/lovesqcc/article/details/39702541

直接上代码。

练习目标：

1. 使用 Python 面向对象的方法封装逻辑和表达；

2. 使用异常处理和日志API ；

3. 使用文件目录读写API ；

4. 使用 list、dict（字典，即映射 map）、tuple 三种数据结构；

5. lambda 、正则使用及其它。

下一篇将实现并发版本。

#-------------------------------------------------------------------------------
# Name:        wordstat_serial.py
# Purpose:     count word occurrences in the Java files of a given directory (serial version)
#
# Author:      qin.shuq
#
# Created:     08/10/2014
# Copyright:   (c) qin.shuq 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import re
import os
import time
import logging

# Maps the level names accepted by initlog() to the logging module's constants.
LOG_LEVELS = {
    'DEBUG': logging.DEBUG, 'INFO': logging.INFO,
    'WARN': logging.WARNING, 'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}


def initlog(filename, level='INFO'):
    """Return a logger whose records are written to ``filename`` only.

    Each distinct file gets its own named logger. Calling initlog() again
    with the same file returns the same logger without attaching a second
    handler.

    Args:
        filename: path of the log file; it is opened in append mode.
        level: a key of LOG_LEVELS; records below this level are dropped.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        KeyError: if ``level`` is not a key of LOG_LEVELS.
    """
    # Look the level up first so a bad name fails before any file is created.
    threshold = LOG_LEVELS[level]

    # logging.getLogger() without a name always returns the shared root
    # logger, so handlers for different files would pile up on one object and
    # every record would be written to every file. Use one named logger per
    # target file instead.
    logpath = os.path.abspath(filename)
    logger = logging.getLogger('wordstat.' + logpath)
    logger.setLevel(threshold)
    # Keep records out of ancestor/root handlers so each file only ever
    # receives the records of its own logger.
    logger.propagate = False

    if not logger.handlers:
        hdlr = logging.FileHandler(logpath)
        hdlr.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(hdlr)

    return logger


# Module-level loggers used by WordReading: errlog records files that could
# not be read, infolog records files that were read successfully.
errlog, infolog = initlog("error.log"), initlog("info.log")

class WordReading(object):
    """Read the text lines of every file in a list of file paths.

    Successful reads are reported through the module-level ``infolog``;
    files that cannot be read are reported through ``errlog`` and skipped.
    """

    def __init__(self, fileList):
        """Remember ``fileList``, an iterable of file paths to read."""
        self.fileList = fileList

    def readFileInternal(self, filename):
        """Return the lines of ``filename``, or ``[]`` if it cannot be read.

        Lines are returned as ``readlines()`` produces them, i.e. with their
        line terminators kept. An IOError is logged, not raised.
        """
        try:
            # "with" closes the file even if readlines() fails half-way.
            with open(filename, 'r') as f:
                lines = f.readlines()
        except IOError as err:
            # IOError covers more than a missing file (permissions, path is a
            # directory, ...), so log the actual error instead of guessing.
            errlog.error('failed to read file %s: %s\n', filename, err)
            return []
        infolog.info('[successful read file %s]\n', filename)
        return lines

    def readFile(self):
        """Return the lines of all files in ``fileList``, in list order."""
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines

class WordAnalyzing(object):
    """Count how many times each word occurs in a list of text lines.

    A word is a maximal run of ``\\w`` characters (letters, digits and
    underscore).
    """

    # Raw string: "\w" is a regex escape, not a Python string escape.
    wordRegex = re.compile(r"[\w]+")

    def __init__(self, allLines):
        """Remember ``allLines``, an iterable of strings to analyze."""
        self.allLines = allLines

    def analyze(self):
        """Return a dict mapping each word to its number of occurrences.

        Every line is scanned on its own, so a line without a trailing
        newline (typically the last line of a file) is never glued to the
        start of the following line to form a bogus word.
        """
        result = {}
        for line in self.allLines:
            for word in self.wordRegex.findall(line):
                result[word] = result.get(word, 0) + 1
        return result

class FileObtainer(object):
    """Collect the paths of the files found under a directory tree."""

    def __init__(self, dirpath, fileFilterFunc=None):
        """
        Args:
            dirpath: root directory of the tree to walk.
            fileFilterFunc: optional predicate taking a file path; only paths
                for which it returns a true value are kept. ``None`` keeps
                every file.
        """
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        """Return a list of the paths of all (accepted) files under dirpath.

        The result is always a list, with or without a filter. A directory
        that os.walk() cannot list yields no entries, so a missing
        ``dirpath`` gives an empty list.
        """
        files = []
        for path, dirs, filenames in os.walk(self.dirpath):
            for filename in filenames:
                # os.path.join uses the platform separator and does not
                # double it when ``path`` already ends with one.
                files.append(os.path.join(path, filename))

        if self.fileFilterFunc is None:
            return files
        # A list comprehension rather than filter(): filter() is lazy on
        # Python 3, which would make the return type depend on the filter.
        return [f for f in files if self.fileFilterFunc(f)]

class PostProcessing(object):
    """Rank and print the entries of a word -> count mapping."""

    def __init__(self, resultMap):
        """Remember ``resultMap``, a dict mapping each word to its count."""
        self.resultMap = resultMap

    def sortByValue(self):
        """Return the (word, count) pairs as a list, highest count first."""
        return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

    def obtainTopN(self, topN):
        """Print the ``topN`` most frequent words, one per line.

        Fewer lines are printed when the mapping holds fewer than ``topN``
        words; nothing is printed when ``topN`` is zero or negative.
        """
        # max(..., 0): a negative slice bound would count from the end of the
        # list instead of selecting nothing.
        for word, count in self.sortByValue()[:max(topN, 0)]:
            # A single pre-formatted string prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print('%s  counts:  %s' % (word, count))

def _timed(stage, func):
    """Call ``func()``, print its wall-clock cost in ms, return its result."""
    starttime = time.time()
    result = func()
    endtime = time.time()
    # A single pre-formatted string prints the same text whether print is a
    # statement (Python 2) or a function (Python 3).
    print('%s cost:  %s ms' % (stage, (endtime - starttime) * 1000))
    return result


def main(dirpath, topN=30):
    """Count the words of all ``.java`` files under ``dirpath``.

    Runs the four stages in order (collect files, read lines, count words,
    print the ``topN`` most frequent words) and prints the cost of each.
    """
    fileList = _timed(
        'ObtainFile',
        lambda: FileObtainer(
            dirpath, lambda f: f.endswith('.java')).findAllFilesInDir())
    allLines = _timed('WordReading', lambda: WordReading(fileList).readFile())
    resultMap = _timed(
        'WordAnalyzing', lambda: WordAnalyzing(allLines).analyze())
    _timed('PostProcessing', lambda: PostProcessing(resultMap).obtainTopN(topN))


if __name__ == "__main__":
    import sys

    # The directory may be given as the first command-line argument; without
    # one, fall back to the path that used to be hard-coded here.
    defaultDir = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"
    main(sys.argv[1] if len(sys.argv) > 1 else defaultDir)