Go转码非UTF-8格式文件

代码实例

使用 chardet 检测编码,并通过 golang.org/x/text 将数据转码为 UTF-8

package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/saintfish/chardet"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/japanese"
	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/encoding/traditionalchinese"
	xunicode "golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// main converts a small hard-coded byte sample to UTF-8 with convertToUTF8 and
// prints either the converted text or the conversion error.
func main() {
	// To convert a file on disk instead, load its bytes first:
	//
	//	data, err := os.ReadFile("GB18030.txt")
	//	if err != nil {
	//		fmt.Println("读取文件出错:", err)
	//		return
	//	}

	// Sample input: ten bytes that are not valid UTF-8, so the detection and
	// decoding path of convertToUTF8 is exercised.
	sample := []byte{
		0xD2, 0xBB,
		0xCA, 0xC7,
		0xD6, 0xD0,
		0xCE, 0xC4,
		0xC6, 0xF7,
	}

	converted, err := convertToUTF8(sample)
	if err != nil {
		fmt.Println("转换出错:", err)
		return
	}

	text := string(converted)
	fmt.Println("转换后的UTF-8编码数据:", text)
}

// convertToUTF8 returns the contents of data re-encoded as UTF-8.
//
// If data already passes utf8.Valid it is returned unchanged (the same slice,
// not a copy). Otherwise the charset is guessed with detectEncoding, mapped to
// an encoding.Encoding by getEncoder, and the bytes are run through that
// encoding's decoder.
//
// An error is returned when detection fails, when getEncoder has no mapping
// for the detected charset, or when reading from the decoding reader fails;
// in all of those cases the returned slice is nil.
func convertToUTF8(data []byte) ([]byte, error) {
	// Nothing to do for input that is already valid UTF-8.
	if utf8.Valid(data) {
		return data, nil
	}

	// Guess the source charset from the raw bytes.
	charset, err := detectEncoding(data)
	if err != nil {
		return nil, err
	}

	// Look up the x/text encoding registered for that charset name.
	enc := getEncoder(charset)
	if enc == nil {
		return nil, fmt.Errorf("encoder not found for encoding: %s", charset)
	}

	// Decode by streaming the input through the encoding's decoder.
	src := bytes.NewReader(data)
	decoded, err := io.ReadAll(transform.NewReader(src, enc.NewDecoder()))
	if err != nil {
		return nil, err
	}
	return decoded, nil
}

// detectEncoding asks a chardet text detector for its best guess at the
// charset of data and returns that guess's Charset name. Any error reported
// by DetectBest is returned unchanged together with an empty name.
func detectEncoding(data []byte) (string, error) {
	best, err := chardet.NewTextDetector().DetectBest(data)
	if err != nil {
		return "", err
	}
	return best.Charset, nil
}

// getEncoder maps a charset name to the matching golang.org/x/text encoding.
//
// The lookup ignores letter case and surrounding whitespace: charset names are
// case-insensitive, and a name that differs from the labels below only in case
// (for example "windows-1252" instead of "Windows-1252") must still resolve.
// NOTE(review): chardet appears to report the Windows code pages in lowercase;
// confirm against the library's recognizer tables.
//
// It returns nil when the name has no mapping here; callers must check for nil
// before using the result.
func getEncoder(encodingName string) encoding.Encoding {
	// Normalise once so every case label can be written in upper case.
	switch strings.ToUpper(strings.TrimSpace(encodingName)) {
	case "UTF-8":
		return encoding.Nop

	// Chinese.
	case "GBK":
		return simplifiedchinese.GBK
	case "GB-18030":
		return simplifiedchinese.GB18030
	case "HZ-GB-2312":
		return simplifiedchinese.HZGB2312
	case "BIG5":
		return traditionalchinese.Big5

	// UTF-16 with a fixed byte order; a leading BOM is not interpreted.
	case "UTF-16LE":
		return xunicode.UTF16(xunicode.LittleEndian, xunicode.IgnoreBOM)
	case "UTF-16BE":
		return xunicode.UTF16(xunicode.BigEndian, xunicode.IgnoreBOM)

	// Japanese.
	case "EUC-JP":
		return japanese.EUCJP
	case "SHIFT_JIS":
		return japanese.ShiftJIS
	case "ISO-2022-JP":
		return japanese.ISO2022JP

	// ISO 8859 family.
	case "ISO-8859-1":
		return charmap.ISO8859_1
	case "ISO-8859-2":
		return charmap.ISO8859_2
	case "ISO-8859-3":
		return charmap.ISO8859_3
	case "ISO-8859-4":
		return charmap.ISO8859_4
	case "ISO-8859-5":
		return charmap.ISO8859_5
	case "ISO-8859-6":
		return charmap.ISO8859_6
	case "ISO-8859-7":
		return charmap.ISO8859_7
	case "ISO-8859-8":
		return charmap.ISO8859_8
	case "ISO-8859-9":
		return charmap.ISO8859_9
	case "ISO-8859-10":
		return charmap.ISO8859_10
	case "ISO-8859-13":
		return charmap.ISO8859_13
	case "ISO-8859-14":
		return charmap.ISO8859_14
	case "ISO-8859-15":
		return charmap.ISO8859_15
	case "ISO-8859-16":
		return charmap.ISO8859_16

	// Windows code pages.
	case "WINDOWS-1250":
		return charmap.Windows1250
	case "WINDOWS-1251":
		return charmap.Windows1251
	case "WINDOWS-1252":
		return charmap.Windows1252
	case "WINDOWS-1253":
		return charmap.Windows1253
	case "WINDOWS-1254":
		return charmap.Windows1254
	case "WINDOWS-1255":
		return charmap.Windows1255
	case "WINDOWS-1256":
		return charmap.Windows1256
	case "WINDOWS-1257":
		return charmap.Windows1257
	case "WINDOWS-1258":
		return charmap.Windows1258

	// KOI8 family.
	case "KOI8-R":
		return charmap.KOI8R
	case "KOI8-U":
		return charmap.KOI8U

	default:
		// Unknown or unsupported charset name.
		return nil
	}
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值