Python脚本转换文件编码格式

最新推荐文章于 2023-03-22 13:57:44 发布

七夕猛虎

最新推荐文章于 2023-03-22 13:57:44 发布

阅读量1.6k

点赞数 1

分类专栏： python工具　文章标签： python 编码格式转换

本文链接：https://blog.csdn.net/qiximenghu/article/details/102999172

版权

python工具专栏收录该内容

7 篇文章 3 订阅

订阅专栏

背景：平时使用的代码阅读软件（例如VS Code、Source Insight等）在处理ANSI（如GBK）编码的中文时会出现乱码，而手动使用Notepad++转换文件的编码格式又比较费时间。代码文件少的时候可以采用手动转码的方式，但是如果代码文件非常多、目录结构比较复杂，转码的工作量就非常大了。针对这个问题，我用Python写了一个自动转换文件编码格式的脚本，可以有效减少转码的工作量。

原理：以二进制读的方式打开文件，通过chardet模块检测文件的编码方式，然后以检测到的编码方式读取文件内容，最后再以需要转码的格式写入源文件即可。

环境：Windows 平台，Python 3。用到的模块：os、sys、chardet、codecs、win32api（由pywin32提供）

chardet和win32api安装方式：

pip install chardet
pip install pywin32

判断是否为需要转码的文件，采用白名单列表的方式：判断文件后缀名是否在白名单内来确定是否需要转码。防止修改非代码文件。

# A project tree usually also contains non-code files that there is no point
# in converting, so conversion is restricted to a whitelist: only files whose
# extension (without the leading dot) is listed here are converted.
g_valid_file_type = ['txt', 'cpp', 'c', 'h']


def is_valid_file(f_path):
    """Return True if f_path has an extension listed in the whitelist.

    The comparison is case-insensitive, because extension case carries no
    meaning on Windows: ``MAIN.C`` must be treated like ``main.c``.

    Args:
        f_path: Path or bare file name to check. Only the text of the path
            is examined; the file does not have to exist.

    Returns:
        bool: True when the extension is in ``g_valid_file_type``.
    """
    extension = os.path.splitext(f_path)[1][1:].lower()
    return extension in {file_type.lower() for file_type in g_valid_file_type}

获取目录下的代码文件：采用os模块的listdir方法获取目录下的所有文件名和目录名，再接着使用os.path.isfile和os.path.isdir判断是否为文件或目录。将传入的路径名进行一个短路径名转换，防止文件路径太长，导致后面的打开文件失败。最后对收集到的目录做递归处理。

# Recursively collect every whitelisted file below a folder.
def get_file_list_in_folder(f_path):
    """Return the paths of all whitelisted files under f_path, recursively.

    Files of the current directory come first, followed by the results for
    each sub-directory in the order ``os.listdir`` reported them.

    Args:
        f_path: Directory to scan.

    Returns:
        list: Paths (built on the short form of the directory path) of the
        files accepted by ``is_valid_file``.
    """
    name_list = os.listdir(f_path)
    # Convert the directory to its short path form so that the joined paths
    # are less likely to become too long to open. The result is the same for
    # every entry, so it is computed once instead of once per entry.
    short_path = win32api.GetShortPathName(f_path)

    file_list = []
    folder_list = []
    for name in name_list:
        t_path = os.path.join(short_path, name)
        if os.path.isfile(t_path):
            # Only keep files whose type is on the conversion whitelist.
            if is_valid_file(t_path):
                file_list.append(t_path)
        elif os.path.isdir(t_path):
            folder_list.append(t_path)
        else:
            print("Unknown path type: " + t_path)

    # Descend into the sub-directories collected above.
    for folder in folder_list:
        file_list.extend(get_file_list_in_folder(folder))
    return file_list

检测文件的编码格式：以二进制读的方式读取文件内容，然后使用chardet模块的detect方法检测这些字节数据。此方法会返回一个包含编码格式（encoding）、可信度（confidence）和语言（language）三个元素的字典。当文件内容较少时，检测结果往往不太准确，可信度较低；当文件内容为空时，返回的结果为{'encoding': None, 'confidence': 0.0, 'language': None}：

# Detect a file's encoding with the chardet module.
def get_file_enc_type(f_path):
    """Detect the encoding of the file at f_path.

    The whole file is read in binary mode and passed to ``chardet.detect``.
    Reading everything gives the most accurate detection; for large files a
    fixed-size prefix (for example ``fp.read(1000)``) could be used instead
    to speed detection up.

    Args:
        f_path: Path of the file to inspect.

    Returns:
        dict: The result of ``chardet.detect``, with the keys ``encoding``,
        ``confidence`` and ``language``. For empty content ``encoding`` is
        None and ``confidence`` is 0.0.
    """
    # The context manager closes the handle even if reading fails.
    with open(f_path, 'rb') as fp:
        content = fp.read()
    return chardet.detect(content)

转码单个文件：先用检测到的编码格式读取文件内容，读取时对无法解码的字节采用替换的方式处理（errors='replace'），然后将文件内容以目标编码格式写回源文件即可。当文件内容较少的时候，chardet检测结果的可信度往往比较低，此时若继续使用这个低可信度的编码格式，转换完成后中文依然是乱码。我自己测试时发现，以ansi作为读文件的编码格式可以正常转码，故此处设置了一个默认编码格式的全局变量：

# Target encoding that files are converted to.
g_target_enc_type = 'utf-8'
# When a file has very little content, chardet detects inaccurately and
# reports a low confidence value. The detection result is then not
# trustworthy, so this value is used in place of chardet's result.
g_default_enc_type = 'ansi'

def _decodes_as(raw, encoding):
    """Return True if the bytes ``raw`` decode without error as ``encoding``."""
    try:
        raw.decode(encoding)
    except UnicodeError:
        return False
    return True


# Convert a single file to the target encoding.
def change_file_enc_type(f_path):
    """Re-encode the file at f_path in place to ``g_target_enc_type``.

    The encoding is detected with ``get_file_enc_type``. When the detection
    confidence is below 0.9 the result is not trusted and
    ``g_default_enc_type`` is used for decoding instead, unless the bytes
    are already valid in the target encoding, in which case the file is
    left untouched so that it cannot be damaged by a wrong guess.

    Nothing is written when the file type is not whitelisted, when no
    encoding could be detected (for example an empty file), or when the
    converted bytes would be identical to the current content.

    Args:
        f_path: Path of the file to convert.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        LookupError: If the encoding used for decoding or the target
            encoding is not known to this Python installation.
        UnicodeEncodeError: If the text cannot be represented in the target
            encoding; the file is not modified in that case.
    """
    if not is_valid_file(f_path):
        print('Not a valid file type!')
        # str() is required: concatenating the list itself raises TypeError.
        print('Valid file type' + str(g_valid_file_type))
        return

    print(f_path)
    enc_type_dict = get_file_enc_type(f_path)
    enc_type = enc_type_dict['encoding']
    confidence = enc_type_dict['confidence']
    # enc_type may be None, so it has to be converted before concatenation.
    print("encoding type: " + str(enc_type) + ' confidence: ' + str(confidence))

    # For empty content the detector returns
    # {'encoding': None, 'confidence': 0.0, 'language': None};
    # there is nothing to convert in that case.
    if not enc_type:
        print("No need to convert!")
        return

    with open(f_path, 'rb') as fp:
        raw = fp.read()

    if confidence < 0.9:
        # Do not second-guess content that is already valid in the target
        # encoding: decoding it with the default encoding would corrupt it.
        if _decodes_as(raw, g_target_enc_type):
            print("No need to convert!")
            return
        print('Not enough confidence, use default encoding type: ' + g_default_enc_type)
        enc_type = g_default_enc_type

    # Undecodable bytes are replaced rather than aborting the conversion.
    text = raw.decode(enc_type, errors='replace')
    # Encode before opening the file for writing, so that an encoding error
    # cannot leave a truncated file behind.
    converted = text.encode(g_target_enc_type)
    if converted == raw:
        print("No need to convert!")
        return

    # Overwrite the source file with the converted content.
    with open(f_path, 'wb') as fp:
        fp.write(converted)
    print("Convert complete!")

以上就是整个脚本的主要函数的功能和内容了，下面是整体源码：

# -*- coding: utf-8 -*-
import chardet
import os
import sys
import codecs
import win32api

# Target encoding that files are converted to.
g_target_enc_type = 'utf-8'
# When a file has very little content, chardet detects inaccurately and
# reports a low confidence value. The detection result is then not
# trustworthy, so this value is used in place of chardet's result.
g_default_enc_type = 'ansi'
# A project tree usually also contains non-code files that there is no point
# in converting, so conversion is restricted to a whitelist: file types listed
# here are converted, file types not listed here are left alone.
g_valid_file_type = ['txt', 'cpp', 'c', 'h']


# Detect a file's encoding with the chardet module.
def get_file_enc_type(f_path):
    """Detect the encoding of the file at f_path.

    The whole file is read in binary mode and passed to ``chardet.detect``.
    Reading everything gives the most accurate detection; for large files a
    fixed-size prefix (for example ``fp.read(1000)``) could be used instead
    to speed detection up.

    Args:
        f_path: Path of the file to inspect.

    Returns:
        dict: The result of ``chardet.detect``, with the keys ``encoding``,
        ``confidence`` and ``language``. For empty content ``encoding`` is
        None and ``confidence`` is 0.0.
    """
    # The context manager closes the handle even if reading fails.
    with open(f_path, 'rb') as fp:
        content = fp.read()
    return chardet.detect(content)


def is_valid_file(f_path):
    """Return True if f_path has an extension listed in the whitelist.

    The comparison is case-insensitive, because extension case carries no
    meaning on Windows: ``MAIN.C`` must be treated like ``main.c``.

    Args:
        f_path: Path or bare file name to check. Only the text of the path
            is examined; the file does not have to exist.

    Returns:
        bool: True when the extension is in ``g_valid_file_type``.
    """
    extension = os.path.splitext(f_path)[1][1:].lower()
    return extension in {file_type.lower() for file_type in g_valid_file_type}


def _decodes_as(raw, encoding):
    """Return True if the bytes ``raw`` decode without error as ``encoding``."""
    try:
        raw.decode(encoding)
    except UnicodeError:
        return False
    return True


# Convert a single file to the target encoding.
def change_file_enc_type(f_path):
    """Re-encode the file at f_path in place to ``g_target_enc_type``.

    The encoding is detected with ``get_file_enc_type``. When the detection
    confidence is below 0.9 the result is not trusted and
    ``g_default_enc_type`` is used for decoding instead, unless the bytes
    are already valid in the target encoding, in which case the file is
    left untouched so that it cannot be damaged by a wrong guess.

    Nothing is written when the file type is not whitelisted, when no
    encoding could be detected (for example an empty file), or when the
    converted bytes would be identical to the current content.

    Args:
        f_path: Path of the file to convert.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        LookupError: If the encoding used for decoding or the target
            encoding is not known to this Python installation.
        UnicodeEncodeError: If the text cannot be represented in the target
            encoding; the file is not modified in that case.
    """
    if not is_valid_file(f_path):
        print('Not a valid file type!')
        # str() is required: concatenating the list itself raises TypeError.
        print('Valid file type' + str(g_valid_file_type))
        return

    print(f_path)
    enc_type_dict = get_file_enc_type(f_path)
    enc_type = enc_type_dict['encoding']
    confidence = enc_type_dict['confidence']
    # enc_type may be None, so it has to be converted before concatenation.
    print("encoding type: " + str(enc_type) + ' confidence: ' + str(confidence))

    # For empty content the detector returns
    # {'encoding': None, 'confidence': 0.0, 'language': None};
    # there is nothing to convert in that case.
    if not enc_type:
        print("No need to convert!")
        return

    with open(f_path, 'rb') as fp:
        raw = fp.read()

    if confidence < 0.9:
        # Do not second-guess content that is already valid in the target
        # encoding: decoding it with the default encoding would corrupt it.
        if _decodes_as(raw, g_target_enc_type):
            print("No need to convert!")
            return
        print('Not enough confidence, use default encoding type: ' + g_default_enc_type)
        enc_type = g_default_enc_type

    # Undecodable bytes are replaced rather than aborting the conversion.
    text = raw.decode(enc_type, errors='replace')
    # Encode before opening the file for writing, so that an encoding error
    # cannot leave a truncated file behind.
    converted = text.encode(g_target_enc_type)
    if converted == raw:
        print("No need to convert!")
        return

    # Overwrite the source file with the converted content.
    with open(f_path, 'wb') as fp:
        fp.write(converted)
    print("Convert complete!")


# Recursively collect every whitelisted file below a folder.
def get_file_list_in_folder(f_path):
    """Return the paths of all whitelisted files under f_path, recursively.

    Files of the current directory come first, followed by the results for
    each sub-directory in the order ``os.listdir`` reported them.

    Args:
        f_path: Directory to scan.

    Returns:
        list: Paths (built on the short form of the directory path) of the
        files accepted by ``is_valid_file``.
    """
    name_list = os.listdir(f_path)
    # Convert the directory to its short path form so that the joined paths
    # are less likely to become too long to open. The result is the same for
    # every entry, so it is computed once instead of once per entry.
    short_path = win32api.GetShortPathName(f_path)

    file_list = []
    folder_list = []
    for name in name_list:
        t_path = os.path.join(short_path, name)
        if os.path.isfile(t_path):
            # Only keep files whose type is on the conversion whitelist.
            if is_valid_file(t_path):
                file_list.append(t_path)
        elif os.path.isdir(t_path):
            folder_list.append(t_path)
        else:
            print("Unknown path type: " + t_path)

    # Descend into the sub-directories collected above.
    for folder in folder_list:
        file_list.extend(get_file_list_in_folder(folder))
    return file_list


def deal_folder(f_path):
    """Convert every whitelisted file found under the folder f_path.

    The files are gathered with ``get_file_list_in_folder`` and each one is
    handed to ``change_file_enc_type`` in turn.
    """
    for target in get_file_list_in_folder(f_path):
        change_file_enc_type(target)


def main():
    """Convert every file or folder named on the command line.

    Each argument after the script name is converted directly when it is a
    file, walked with ``deal_folder`` when it is a directory, and reported
    as an error otherwise.

    Returns:
        False when no path was given on the command line, otherwise None.
    """
    targets = sys.argv[1:]
    if not targets:
        print("You didn't input a folder path!")
        return False

    for target in targets:
        if os.path.isfile(target):
            change_file_enc_type(target)
        elif os.path.isdir(target):
            deal_folder(target)
        else:
            print("Error path: " + target)

    # Keep the console open after the run so the printed conversion log
    # can still be read.
    os.system("pause")

# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()

使用pyinstaller打包后生成的exe：https://download.csdn.net/download/qiximenghu/11969313

百度云下载地址：https://pan.baidu.com/s/1VyQq1GnEj9J3xTLg9Mc8gg 提取码：e9pg

七夕猛虎

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
Python脚本转换文件编码格式

背景：由于平时使用的阅读代码的软件在处理ASCII编码的中文的时候会出现乱码，例如vscode，source insight等。手动使用notepad++去转换文件的编码格式又比较费时间。代码文件少的时候可以采用手动转码的方式，但是如果代码文件非常多，目录结构比较复杂，那么转码的工作量就非常大了。所以针对这个问题，我用python写了一个自动转换文件编码格式的脚本，可以有效减少转码的工作量...
复制链接

扫一扫