python使用codecs模块进行文件操作-读写中英文字符

最新推荐文章于 2024-06-17 09:38:30 发布

wind.liao

最新推荐文章于 2024-06-17 09:38:30 发布

阅读量1.1k

点赞数

分类专栏： android 文章标签： python codecs

本文链接：https://blog.csdn.net/liaowenfeng/article/details/75335110

版权

android 专栏收录该内容

38 篇文章 0 订阅

订阅专栏

摘自:

python使用codecs模块进行文件操作

以下是将源文件转换为带 BOM 的 UTF-8 编码的 Python 脚本：

import sys
import os
import codecs


"""
Usage: ConvertCp.py SrcDir DstDir


e.g.
If your source folder is "D:/Test/NonUnicode" and your destination folder is "D:/Test/utf8", run the following command:


python  ConvertCp.py "D:/Test/NonUnicode" "D:/Test/utf8"

"""


# Candidate source encodings. ConvertFileEncoding tries them in this order and
# keeps the first one that decodes the whole file without error, so the order
# is significant. Each code page is listed once: retrying an encoding that
# already failed can never succeed.
codePageList = (
    "utf-8",
    "cp1251",
    "cp855",
    "cp1252",
    "cp1250",
    "cp1254",
    "cp936",
    "cp950",
    "cp932",
    "cp949",
    "cp874",
    "cp1253",
)


# File extensions (with leading dot) that are eligible for conversion.
# ConvertFileEncoding lower-cases a file's extension before looking it up here.
fileExtFilter = (
    ".cpp", ".c", ".cxx",
    ".h", ".hpp", ".hxx",
    ".cc", ".inl",
)


def FileIsBomUtf8Encoding(filePath):
    """
    Tell whether the file starts with the UTF-8 byte order mark (BOM).

    @param filePath     path of the file to inspect.
    @return             True when the first bytes of the file are the UTF-8
                        BOM; False otherwise, including when the file cannot
                        be opened (a message is printed in that case).
    """
    try:
        # Only the leading bytes matter, so do not read the whole file.
        with open(filePath, "rb") as f:
            header = f.read(len(codecs.BOM_UTF8))
    except IOError:
        print ("open file %s failed." % (os.path.basename(filePath)))
        return False

    # The file is read in binary mode, so compare against the BOM as bytes.
    # Comparing with a text literal never matches on Python 3.
    return header == codecs.BOM_UTF8


#----------------------------------------------------------------------
def ConvertFileEncoding(sourceFilePath, targetFilePath, targetEncoding = "utf_8"):
    """
    Convert a text file from a legacy code page into 'targetEncoding'.

    The source encoding is guessed by trying every entry of codePageList in
    order and keeping the first one that decodes the whole file. When the
    target encoding is UTF-8 the output is written with a leading BOM.

    @param sourceFilePath       source file path.
    @param targetFilePath       target file path; missing parent directories
                                are created.
    @param targetEncoding       target file encoding.
    @return                     True when the target file was written; False
                                when the file was skipped (extension not in
                                fileExtFilter, already UTF-8 with BOM) or when
                                reading, decoding or writing failed.
    """

    #
    # filter file ext.
    #
    filePathExt = os.path.splitext(sourceFilePath)[1]
    if filePathExt.lower() not in fileExtFilter:
        return False

    #
    # If the file is Bom utf_8 just skip.
    #
    if FileIsBomUtf8Encoding(sourceFilePath):
        return False

    #
    # Get the source content: the first code page that decodes the whole
    # file wins. The 'with' block closes the file on every path.
    #
    content = None
    for cp in codePageList:
        try:
            with codecs.open(sourceFilePath, mode = "r", encoding = cp) as sourceFile:
                content = sourceFile.read()
            break
        except UnicodeDecodeError:
            continue
        except IOError:
            print ("open file %s failed." % (os.path.basename(sourceFilePath)))
            return False

    if content is None:
        print ("File \"%s\" is not valid encoding." % (sourceFilePath))
        return False

    #
    # ensure the target directory exist. dirname() is empty for a bare file
    # name, and os.makedirs("") would raise.
    #
    targetPathDir = os.path.dirname(targetFilePath)
    if targetPathDir and not os.path.exists(targetPathDir):
        os.makedirs(targetPathDir)

    #
    # A decoded BOM is the single character U+FEFF. Drop one that survived
    # decoding so the output never carries two of them.
    #
    bom = codecs.BOM_UTF8.decode("utf_8")
    if content.startswith(bom):
        content = content[len(bom):]

    encodingName = targetEncoding.lower()
    writeBom = encodingName.startswith("utf") and encodingName.endswith("8")

    #
    # convert the file content.
    #
    try:
        with codecs.open(targetFilePath, mode = "w", encoding = targetEncoding) as targetFile:
            if writeBom:
                targetFile.write(bom)
            targetFile.write(content)
    except UnicodeError:
        #
        # The content cannot be represented in the target encoding: skip the
        # file and remove the partial output (already closed by 'with').
        #
        print ("convert file: \"%s\" failure" % (sourceFilePath) )
        os.remove(targetFilePath)
        return False
    except IOError:
        print ("open file %s failed." % (targetFilePath))
        return False

    return True

if __name__=='__main__':
    # Usage: ConvertCp.py SrcDir DstDir
    # When fewer than two arguments are given, fall back to the built-in
    # default folders.
    if len(sys.argv) <= 2:
        sSourceDir = r"D:\trunk"
        sTargetDir = r"D:\trunk_new"
    else:
        sSourceDir = sys.argv[1]
        sTargetDir = sys.argv[2]

    for root, dirs, files in os.walk(sSourceDir):
        # Mirror the source tree below the target folder. The path relative to
        # the source root is used because a plain string replace would also
        # rewrite later path components that happen to contain the source
        # folder text.
        relativeRoot = os.path.relpath(root, sSourceDir)
        targetRoot = os.path.normpath(os.path.join(sTargetDir, relativeRoot))
        for fileName in files:
            sourcePath = os.path.join(root, fileName)
            targetPath = os.path.join(targetRoot, fileName)
            ConvertFileEncoding(sourcePath, targetPath)

wind.liao

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python使用codecs模块进行文件操作-读写中英文字符

摘自:python使用codecs模块进行文件操作import sysimport osimport codecs"""Usage: ConvertCp.py SrcDir DstDire.g.if your source folder is "D:/Test/NonUnicode", destination folder is "D:/Test/utf8", just run comm
复制链接

扫一扫

专栏目录