在windows下做开发的中国程序员,很多平时都会遇到一些国外开发者写的C/C++代码,其中经常会包含一些非标准ANSI字符集的字符(不少优秀的类库、开源项目都会遇到),通常这些源代码文件都不是unicode编码的,导致在中文代码页的windows系统下VC编译器会出现大量警告,有些特殊字符出现在常量字符串定义中的,还会影响程序运行的结果。解决办法有两种:一种是将自己操作系统的字符代码页修改为与源代码文件相符合的代码页(一般修改为拉丁字符集的cp1252或者cp1250即可),这种办法的优点就是容易管理,比如代码作者更新了代码,你可以方便地比较修改;另一种办法就是将源码文件转换为一种unicode字符编码(通常是utf8)。本文的内容就是第二种方法。
之前写了个python脚本用来批量转换这些源代码文件为utf8编码保存,支持代码路径的嵌套,转换后的文件依然保持原始文件的目录结构。为了支持VC编译器,需要添加BOM头。您可以按照自己的需要对脚本做些修改,比如支持更多的文件后缀等,记录在此。
#!/usr/bin/env python
#coding:utf-8
# Author: Leo Cheng--<15483930@qq.com>
# Purpose:
# Created: 2011-7-28
import sys
import os
import codecs
"""
Usage: ConvertCp.py SrcDir DstDir
e.g.
if your source folder is "D:/Test/NonUnicode", destination folder is "D:/Test/utf8", just run command as follow :
python ConvertCp.py "D:/Test/NonUnicode" "D:/Test/utf8"
"""
codePageList = (\
"cp1252",
"cp1250",
"cp1251",
"cp1254",
"cp936",
"cp950",
"cp932",
"cp949",
"cp874",
"cp1253"
)
fileExtFilter = (\
".cpp",
".c",
".cxx",
".h",
".hpp",
".hxx"
)
def FileIsBomUtf8Encoding(filePath):
    """
    Return True if the file at *filePath* begins with the UTF-8 BOM.

    Only the first three bytes are read; the original implementation
    loaded the entire file just to inspect its first three bytes.
    Returns False when the file cannot be opened (a message is printed).

    @param filePath path of the file to inspect.
    """
    try:
        f = open(filePath, "rb")
        try:
            head = f.read(3)  # the UTF-8 BOM is exactly three bytes
        finally:
            f.close()
    except IOError:
        print("open file %s failed." % (os.path.basename(filePath)))
        return False
    # codecs.BOM_UTF8 == '\xef\xbb\xbf'; a shorter read never matches.
    return head == codecs.BOM_UTF8
#----------------------------------------------------------------------
def ConvertFileEncoding(sourceFilePath, targetFilePath, targetEncoding = "utf_8"):
    """
    Convert a text file from a legacy ANSI code page to *targetEncoding*.

    The source code page is guessed by trying every entry of
    ``codePageList`` in order and keeping the first one that decodes the
    whole file without error.  When the target encoding is a UTF-8
    variant a BOM is written so the Visual C++ editor recognises the
    file.  Files whose extension is not in ``fileExtFilter``, or that
    already carry a UTF-8 BOM, are skipped.

    @param sourceFilePath source file path.
    @param targetFilePath target file path (missing parent directories
           are created).
    @param targetEncoding codec name for the output file.
    @return True on success, False when the file was skipped or an
            error occurred.
    """
    #
    # Filter by file extension (case-insensitive).
    #
    filePathExt = os.path.splitext(sourceFilePath)[1]
    if filePathExt.lower() not in fileExtFilter:
        return False
    #
    # A file that already has a UTF-8 BOM needs no conversion.
    #
    if FileIsBomUtf8Encoding(sourceFilePath):
        print("File \"%s\" is utf_8 format, not need convert." % (sourceFilePath))
        return False
    #
    # Guess the source code page: first codec that decodes cleanly wins.
    # The source file is closed right after reading (the original kept
    # it open until the very end of the function).
    #
    content = None
    sourceEncoding = None
    for cp in codePageList:
        try:
            sourceFile = codecs.open(sourceFilePath, mode="r", encoding=cp)
            try:
                content = sourceFile.read()
            finally:
                sourceFile.close()
            sourceEncoding = cp
            break
        except UnicodeDecodeError:
            content = None
            continue
        except IOError:
            print("open file %s failed." % (os.path.basename(sourceFilePath)))
            return False
    if content is None:  # identity test instead of the original `== None`
        print("File \"%s\" is not valid encoding." % (sourceFilePath))
        return False
    #
    # Ensure the target directory exists.  dirname() returns "" for a
    # bare file name and makedirs("") raises, so guard against it.
    #
    targetPathDir = os.path.dirname(targetFilePath)
    if targetPathDir and not os.path.exists(targetPathDir):
        os.makedirs(targetPathDir)
    #
    # Write the converted content.
    #
    try:
        targetFile = codecs.open(targetFilePath, mode="w", encoding=targetEncoding)
        try:
            try:
                lowered = targetEncoding.lower()
                # Matches "utf8", "utf-8", "utf_8" (clearer than the
                # original index arithmetic on the last character).
                if lowered.startswith("utf") and lowered.endswith("8"):
                    # u'\ufeff' is the BOM code point; the codec encodes
                    # it to the proper byte sequence for the target.
                    targetFile.write(u'\ufeff')
                # A UTF-8 BOM decoded through a latin-like code page
                # shows up as exactly these three characters -- strip it
                # so we do not emit a second, bogus BOM byte sequence.
                if content[0:3] == u'\xef\xbb\xbf':
                    content = content[3:]
                targetFile.write(content)
            except UnicodeError:
                # Writing raises UnicodeEncodeError (a UnicodeError
                # subclass); the original caught UnicodeDecodeError,
                # which a write can never raise, so failures leaked.
                print("convert file: \"%s\" failure" % (sourceFilePath))
                targetFile.close()
                os.remove(targetFilePath)
                return False
        finally:
            targetFile.close()
    except IOError:
        print("open file %s failed." % (targetFilePath))
        return False
    print("convert file: \"%s\" from %s to %s successfully" % (os.path.basename(sourceFilePath), sourceEncoding, targetEncoding))
    return True
if __name__ == '__main__':
    # Command line: ConvertCp.py SrcDir DstDir.  Without both arguments
    # the usage text is printed and the built-in demo directories are
    # used instead.
    if len(sys.argv) <= 2:
        print(__doc__)
        sSourceDir = r"E:\TestSrc"
        sTargetDir = r"E:\TestDst"
    else:
        sSourceDir = sys.argv[1]
        sTargetDir = sys.argv[2]
    # Walk the source tree and mirror its structure under the target
    # directory.  os.path.relpath keeps only the path relative to the
    # source root; the original str.replace() would also rewrite any
    # LATER occurrence of the source-dir string inside a path, producing
    # a wrong target location.
    for root, dirs, files in os.walk(sSourceDir):
        for fileName in files:
            sourcePath = os.path.join(root, fileName)
            targetPath = os.path.join(sTargetDir, os.path.relpath(sourcePath, sSourceDir))
            ConvertFileEncoding(sourcePath, targetPath)