import cchardet
def get_file_encoding(filepath):
    """Guess the text encoding of the file at *filepath* using cchardet.

    The whole file is read as bytes and handed to ``cchardet.detect``.
    Some detector answers are replaced before being returned:

    * ``"ISO-8859-1"`` and ``"ASCII"`` are reported as ``"GB2312"``
    * ``"EUC-TW"`` is reported as ``"GBK"``

    Any other value is returned exactly as the detector produced it.
    """
    # Detector answers that are swapped for a Chinese codec name.
    # NOTE(review): presumably the input files are expected to be Chinese
    # text, so these guesses are treated as misdetections -- confirm.
    remapped = {
        "ISO-8859-1": "GB2312",
        "ASCII": "GB2312",
        "EUC-TW": "GBK",
    }
    with open(filepath, 'rb') as raw:
        detected = cchardet.detect(raw.read())['encoding']
    return remapped.get(detected, detected)
用 C 语言编写的 cchardet,效率比用 Python 编写的 chardet 高 60 倍。
编码关系:
![](https://i-blog.csdnimg.cn/blog_migrate/4be4ceafc2859ac244567914dfbe733c.png)
当使用 GBK 编码写入含有部分 UTF-8 字符的字符串,并提示 \xa0 等字符无法编码时,可以先用 unicodedata 把这类异常空白符规范化为可见的普通空格:
import unicodedata  # part of the Python standard library

# Apply NFKC normalization before encoding, so that characters such as
# "\xa0" (no-break space) become their plain compatibility equivalents.
# NOTE: NFKC is not limited to whitespace; it also rewrites other
# compatibility characters (e.g. full-width forms).
content = unicodedata.normalize('NFKC', content)

# Write the normalized text out encoded as GBK.
with open(target_file, mode="w", encoding="gbk") as gbk_file:
    gbk_file.write(content)
附:使用 chardet 识别文件编码的写法
from chardet.universaldetector import UniversalDetector, LanguageFilter
def get_file_encoding(filepath):
    """Guess the text encoding of the file at *filepath* using chardet.

    The file is opened in binary mode and fed to a ``UniversalDetector``
    (created with ``LanguageFilter.CHINESE``) one line at a time. Feeding
    stops as soon as the detector reports that it is done.

    Returns ``"GB2312"`` when the detector answers ``"ascii"`` or
    ``"ISO-8859-1"``; otherwise returns the detector's answer unchanged.
    """
    sniffer = UniversalDetector(LanguageFilter.CHINESE)
    with open(filepath, 'rb') as raw:
        for chunk in raw:
            sniffer.feed(chunk)
            if sniffer.done:
                # The detector is confident enough; skip the rest of the file.
                break
    # close() finalizes the detection so that .result is populated.
    sniffer.close()

    guess = sniffer.result['encoding']
    # NOTE(review): presumably the input is expected to be Chinese text, so
    # these two answers are treated as misdetections -- confirm.
    return "GB2312" if guess in ("ascii", "ISO-8859-1") else guess
读取文件
# Detect the encoding first, then decode the whole file with it.
encoding = get_file_encoding(filepath)

# Open directly in text mode instead of wrapping a binary handle in
# io.TextIOWrapper: the decoding and newline handling are the same, but this
# needs no `io` import, does not rebind the handle managed by `with`, and
# drops the redundant seek(0) on a freshly opened file.
# errors='ignore' silently drops bytes that cannot be decoded.
with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
    content = f.read()
写文件(用二进制写入,避免\r\n与\n的写入差异)
# Write the text back as raw UTF-8 bytes. Binary mode performs no newline
# translation, so the bytes on disk are exactly what encode() produced.
with open(filepath, "wb") as out_file:
    utf8_bytes = content.encode('utf-8')
    out_file.write(utf8_bytes)