一、使用标准库实现
import codecs
def read_file_with_encoding(filename):
    """Read a file and decode it with the first candidate encoding that works.

    The raw bytes are tried against 'utf-8', 'gbk', 'big5', 'utf-16' and
    'latin1', in that order, and the first successful decode wins.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(content, file_encoding)`` tuple: the decoded text and the name
        of the encoding that produced it.

    Raises:
        UnicodeDecodeError: If no candidate decodes the data and the final
            strict UTF-8 fallback fails as well.
    """
    with open(filename, 'rb') as f:
        raw_data = f.read()

    try_encodings = ['utf-8', 'gbk', 'big5', 'utf-16', 'latin1']
    for encoding in try_encodings:
        try:
            # Return the text from this decode directly instead of decoding
            # the same bytes a second time after the loop.
            return raw_data.decode(encoding), encoding
        except UnicodeDecodeError:
            continue

    # No candidate matched: fall back to a strict UTF-8 decode, which raises
    # UnicodeDecodeError for undecodable data. NOTE: 'latin1' maps every byte
    # value, so in practice the loop above always returns first.
    return raw_data.decode('utf-8'), 'utf-8'
def convert_encoding_to_utf8(source_file, target_file):
    """Re-encode a text file as UTF-8.

    The source bytes are tried against 'utf-8', 'gbk', 'big5', 'utf-16' and
    'latin1', in that order. The text from the first successful decode is
    encoded as UTF-8 and written to ``target_file``.

    Args:
        source_file: Path of the file to convert.
        target_file: Path the UTF-8 bytes are written to (opened in 'wb'
            mode, so existing content is replaced).

    Returns:
        None. If no candidate encoding decodes the data, nothing is written.
    """
    with open(source_file, 'rb') as src:
        raw_data = src.read()

    try_encodings = ['utf-8', 'gbk', 'big5', 'utf-16', 'latin1']
    for encoding in try_encodings:
        # Only the decode is expected to fail; keep the file write outside
        # the try so unrelated errors are never mistaken for a bad guess.
        try:
            decoded_data = raw_data.decode(encoding)
        except UnicodeDecodeError:
            continue

        with open(target_file, 'wb') as dst:
            dst.write(decoded_data.encode('utf-8'))
        break
# Demo of the standard-library approach: decode 'test.txt', show the text and
# the encoding that was picked, then write a UTF-8 copy to 'test_utf8.txt'.
content, encoding = read_file_with_encoding(filename='test.txt')
print(content, encoding)
convert_encoding_to_utf8(source_file='test.txt', target_file='test_utf8.txt')
二、使用第三方库
import chardet
def open_file(file_path):
    """Read a file as raw bytes and report the encoding chardet detects.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(content, encoding)`` tuple: the undecoded bytes of the file and
        the value of the 'encoding' key from ``chardet.detect``.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return raw_bytes, detection['encoding']
def convert_to_utf8(file_path, new_file_path='utf8_file.txt'):
    """Write a UTF-8 copy of a file whose encoding is detected by chardet.

    Args:
        file_path: Path of the file to convert.
        new_file_path: Path the UTF-8 bytes are written to.

    Returns:
        ``new_file_path`` when a converted copy was written, or the message
        "File is already in utf-8 encoding" when chardet reports UTF-8 and
        nothing is written.

    Raises:
        ValueError: If chardet cannot determine an encoding.
        UnicodeDecodeError: If the data cannot be decoded with the detected
            encoding.
    """
    with open(file_path, 'rb') as file:
        content = file.read()

    encoding = chardet.detect(content)['encoding']
    if encoding is None:
        # chardet reports None when it cannot guess; fail with a clear
        # message instead of the TypeError that decode(None) would raise.
        raise ValueError(
            f"Could not detect the encoding of {file_path!r}"
        )

    # Compare case-insensitively so the check does not depend on the casing
    # of the reported encoding name.
    if encoding.lower() == 'utf-8':
        return "File is already in utf-8 encoding"

    # Decode before opening the output so a decode failure does not leave a
    # truncated, empty target file behind.
    new_content = content.decode(encoding).encode('utf-8')
    with open(new_file_path, 'wb') as new_file:
        new_file.write(new_content)
    return new_file_path
# Demo of the chardet approach: show the raw bytes and detected encoding of
# 'sample_file.txt', then convert 'another_file.txt' and print the result.
file_content, file_encoding = open_file(file_path='sample_file.txt')
print(file_content, file_encoding)
new_file_path = convert_to_utf8(file_path='another_file.txt')
print(new_file_path)