中文csv文本编码转utf8那些事 - python实现

中文csv文本编码转utf8那些事 - python

jcLee95 的个人博客
邮箱 :291148484@163.com
CSDN 主页https://blog.csdn.net/qq_28550263?spm=1001.2101.3001.5343
本文地址https://blog.csdn.net/qq_28550263/article/details/126009118

请参考以下代码:

# -*- coding: utf-8 -*-  
##!/usr/bin/python3
# @Author : Jack Lee
# @Email : 291148484@163.com
import os
import time
import codecs
import chardet

class CodeError(ValueError):
    """Raised when a text file's encoding cannot be handled by this module."""

def get_time() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # strftime already returns str and defaults to the current local time.
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    return stamp

def which_codes(filepath):
    """Detect the text encoding of a file with ``chardet``.

    The whole file is read as bytes and passed to ``chardet.detect``.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        The detected encoding name in lower case, or an empty string when
        ``chardet`` reports no encoding (its ``encoding`` value is ``None``).
    """
    with open(filepath, 'rb') as f:
        content = f.read()
    encoding = chardet.detect(content).get('encoding')
    # chardet yields None when it has no guess; calling .lower() on None
    # would raise AttributeError, so report "unknown" as an empty string.
    return encoding.lower() if encoding else ""

def open_gbk_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as GBK."""
    with open(filepath, mode='r', encoding='gbk') as source:
        return source.read()

def open_big5_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as Big5."""
    with open(filepath, mode='r', encoding='big5') as source:
        text = source.read()
    return text

def open_big5hkscs_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as Big5-HKSCS."""
    with open(filepath, mode='r', encoding='big5hkscs') as source:
        return source.read()

def open_cp950_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as CP950."""
    with open(filepath, mode='r', encoding='cp950') as source:
        text = source.read()
    return text

def open_gb2312_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as GB2312."""
    with open(filepath, mode='r', encoding='gb2312') as source:
        return source.read()

def open_hz_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as HZ."""
    with open(filepath, mode='r', encoding='hz') as source:
        text = source.read()
    return text

def open_ascii_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as ASCII."""
    with open(filepath, mode='r', encoding='ascii') as source:
        return source.read()

def open_utf8_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as UTF-8."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text

def open_utf16_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as UTF-16.

    The file is read as bytes and decoded in one step, so line endings are
    returned exactly as stored.
    """
    with open(filepath, mode='rb') as source:
        raw = source.read()
    return raw.decode('utf-16')

def open_utf32_as_str(filepath)-> str:
    """Return the full text of *filepath*, decoded as UTF-32.

    The file is read as bytes and decoded in one step, so line endings are
    returned exactly as stored.
    """
    with open(filepath, mode='rb') as source:
        raw = source.read()
    return raw.decode('utf-32')

def open_as_str(filepath):
    """Read a text file in its detected encoding and return its content.

    The encoding name comes from ``which_codes`` and is looked up in a table
    that maps the recognised names to the ``open_*_as_str`` readers of this
    module. A short ``[INFO]`` line is printed for the matched encoding.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The decoded file content as ``str``.

    Raises:
        CodeError: If the detected encoding is not one of the handled names.
    """
    codes = which_codes(filepath)

    # BOM-prefixed UTF-8 is reported by chardet as "utf-8-sig"; decode it
    # with the matching codec so the BOM is not left in the text.
    if codes == "utf-8-sig":
        print("[INFO] "+get_time()+" utf-8, codes =",codes)
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return f.read()

    # (log message template, recognised encoding names, reader).
    # Names are lower case because which_codes lower-cases its result.
    readers = (
        ("GBK, codes = {}",
         ("936", "cp936", "ms936", "gbk"),
         open_gbk_as_str),
        ("big5, codes = {}",
         ("big5", "big5-tw", "csbig5"),
         open_big5_as_str),
        ("big5hkscs, codes = {}",
         ("big5hkscs", "big5-hkscs", "hkscs"),
         open_big5hkscs_as_str),
        ("cp950, codes = {}",
         ("cp950", "950", "ms950"),
         open_cp950_as_str),
        ("gb2312, codes = {}",
         ("gb2312", "chinese", "csiso58gb231280", "euc-cn", "euccn",
          "eucgb2312-cn", "gb2312-1980", "gb2312-80", "iso-ir-58"),
         open_gb2312_as_str),
        # HZ must be decoded with the HZ reader (was wrongly read as cp950).
        ("hz, codes = {}",
         ("hz", "hzgb", "hz-gb", "hz-gb-2312"),
         open_hz_as_str),
        ("encoding = ascii",
         ("ascii",),
         open_ascii_as_str),
        ("utf-16, codes = {}",
         ("utf-16", "u16", "utf16", "utf_16"),
         open_utf16_as_str),
        ("utf-32, codes = {}",
         ("utf-32", "u32", "utf32", "utf_32"),
         open_utf32_as_str),
        ("utf-8, codes = {}",
         ("utf-8", "u8", "cp65001", "utf8", "utf"),
         open_utf8_as_str),
    )

    for message, names, reader in readers:
        if codes in names:
            print("[INFO] " + get_time() + " " + message.format(codes))
            # The readers already return str, so no extra encode/decode
            # round trip through UTF-8 is needed.
            return reader(filepath)

    # Other unrecognized codes.
    print('[CRITICAL] '+get_time()+' The current encoding used is:"'+codes+'", the program failed to process this encoding.')
    raise CodeError('Text file:"'+filepath+'" which encoding method cannot be read.')


# Used to replace the specified csv with the corresponding utf8 encoded csv.
def replace_by_utf8_csv(filepath):
    """Rewrite the file at *filepath* in place as UTF-8 text.

    The content is read through ``open_as_str`` first, so the file is only
    opened for writing after it has been decoded successfully.

    Args:
        filepath: Path of the CSV file to convert; it is overwritten.

    Raises:
        CodeError: Propagated from ``open_as_str`` when the file's encoding
            is not handled.
    """
    text = open_as_str(filepath)
    # The UTF-16/UTF-32 readers decode raw bytes and keep "\r\n" as stored,
    # while the text-mode readers already yield "\n". Normalise here so the
    # text-mode write below emits exactly one newline per line instead of
    # turning "\r\n" into "\r\r\n" on platforms that translate "\n".
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(text)
    print('[INFO] '+get_time()+' Translated file '+ filepath +' into utf-8.')

# print(open_as_str(r'C:\Users\a2911\Desktop\script\sources\aaa.csv'))

说明:

  1. 调用 open_as_str 函数用于打开一个文本文件,得到相应的 utf-8 字符串;
  2. 调用 replace_by_utf8_csv 函数,用于将一个非 utf-8 编码的 csv 文件替换为同名的 utf-8 编码 csv 文件。
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

jcLee95

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值