搜狗词库scel导入落格输入法扩展词库txt 2025-03-28

最新推荐文章于 2025-05-14 15:07:12 发布

小二帆

最新推荐文章于 2025-05-14 15:07:12 发布

阅读量96

点赞数

文章标签： python 开发语言

原文链接：https://blog.csdn.net/WGH100817/article/details/101721567

版权

搜狗词库 scel 导入落格输入法扩展词库 txt

代码基于 Python 3：可输入多个搜狗 scel 词库，合并输出为一个 txt 文件。（2025-03-28）
已校正，可正常运行；非原创。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import struct
import sys

# A Sogou scel dictionary stores its text as Unicode, two bytes per character
# (a Chinese character or an English letter).
# The file consists of two main parts:
# 1. Global pinyin table: entries of the form (index, len, pinyin)
# 2. Chinese phrase table: entries of the form
#    (same, py_table_len, py_table, {word_len, word, ext_len, ext})
#
# Offsets used to locate these parts:
startPy = 0x1540        # start of the pinyin table
startChinese = 0x2628   # start of the Chinese phrase table

# Global pinyin table (dict: index -> pinyin string)
GPy_Table = {}

# Parse results: a list whose elements are (frequency, pinyin, Chinese phrase)
GTable = []

def byte2str(data):
    """Decode raw scel bytes (little-endian 16-bit code units) into a string.

    Every two bytes form one UTF-16 code unit. Carriage returns are turned
    into newlines and space characters are dropped; every other character
    (including NUL padding) is kept unchanged.

    Surrogate pairs are combined into the real character, so characters
    outside the Basic Multilingual Plane can later be written out as UTF-8.
    An unpaired surrogate is passed through unchanged.

    Args:
        data: Bytes-like object with an even number of bytes.

    Returns:
        The decoded string.

    Raises:
        struct.error: If ``data`` has an odd number of bytes.
    """
    if len(data) % 2:
        # Keep the exception type callers saw from the per-unit struct.unpack.
        raise struct.error('byte2str requires a buffer of a multiple of 2 bytes')
    # Explicit little-endian decoding, independent of the host byte order;
    # 'surrogatepass' lets lone surrogates through instead of raising.
    text = bytes(data).decode('utf-16-le', errors='surrogatepass')
    # One pass: '\r' -> '\n', ' ' -> removed.
    return text.translate({0x0D: '\n', 0x20: None})

def getPyTable(data):
    """Parse the global pinyin table and store it in ``GPy_Table``.

    After a 4-byte magic header the table is a sequence of entries
    ``(index, len, pinyin)``: two little-endian 16-bit integers followed by
    ``len`` bytes of pinyin text, decoded with ``byte2str``.

    Args:
        data: Bytes of the pinyin table section, starting at its header.

    Returns:
        Always ``None``. When the header does not match, nothing is parsed
        and ``GPy_Table`` is left untouched.

    Raises:
        struct.error: If an entry header is truncated.
    """
    # The header must be compared against a bytes literal.
    if data[0:4] != b"\x9D\x01\x00\x00":
        return None
    pos = 4  # skip the magic header
    length = len(data)
    while pos < length:
        # '<' pins little-endian so parsing does not depend on the host CPU.
        index, size = struct.unpack_from('<HH', data, pos)
        pos += 4
        GPy_Table[index] = byte2str(data[pos:pos + size])
        pos += size

def getWordPy(data):
    """Return the pinyin of one phrase by concatenating table entries.

    Args:
        data: Bytes holding consecutive little-endian 16-bit indexes into
            ``GPy_Table``.

    Returns:
        The pinyin strings for all indexes, joined without a separator.

    Raises:
        struct.error: If ``data`` has an odd number of bytes.
        KeyError: If an index is not present in ``GPy_Table``.
    """
    # '<H' pins little-endian; join avoids repeated string concatenation.
    return ''.join(
        GPy_Table[index] for (index,) in struct.iter_unpack('<H', data)
    )

def getWord(data):
    """Return the concatenated ``GPy_Table`` entries for a run of indexes.

    This currently performs the same lookup as ``getWordPy``: each pair of
    bytes is read as a little-endian 16-bit index into ``GPy_Table``.

    Args:
        data: Bytes holding consecutive 16-bit indexes.

    Returns:
        The looked-up strings joined without a separator.

    Raises:
        struct.error: If ``data`` ends with a dangling single byte.
        KeyError: If an index is not present in ``GPy_Table``.
    """
    parts = []
    for pos in range(0, len(data), 2):
        # '<H' pins little-endian regardless of the host byte order.
        (index,) = struct.unpack_from('<H', data, pos)
        parts.append(GPy_Table[index])
    return ''.join(parts)

def getChinese(data):
    """Parse the Chinese phrase table and append the results to ``GTable``.

    The table is a sequence of groups. Each group holds the number of
    homophones (``same``), the byte length of the pinyin index table, the
    index table itself, and then ``same`` records of
    ``(word_len, word, ext_len, ext)``. All integers are little-endian
    16-bit values. For every word a tuple ``(frequency, pinyin, word)`` is
    appended to ``GTable``; the frequency is the first 16-bit value of
    ``ext``.

    Args:
        data: Bytes of the phrase table section.

    Returns:
        None.

    Raises:
        struct.error: If a length field is truncated.
        KeyError: If a pinyin index is missing from ``GPy_Table``.
    """
    pos = 0
    length = len(data)
    while pos < length:
        # Number of homophones and byte length of the pinyin index table.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4
        # Resolve the pinyin index table into the pinyin string.
        py = getWordPy(data[pos:pos + py_table_len])
        pos += py_table_len
        # One record per homophone.
        for _ in range(same):
            # Byte length of the Chinese phrase, then the phrase itself.
            (c_len,) = struct.unpack_from('<H', data, pos)
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len
            # Length of the extension data (usually 10 bytes).
            (ext_len,) = struct.unpack_from('<H', data, pos)
            pos += 2
            # Frequency lives in the first two extension bytes; if the
            # extension is too short to hold it, do not read into the next
            # record and fall back to 0.
            if ext_len >= 2:
                (count,) = struct.unpack_from('<H', data, pos)
            else:
                count = 0
            GTable.append((count, py, word))
            pos += ext_len

def deal(file_name):
    """Process a single scel file.

    Reads the file, checks its 12-byte header, prints the metadata fields
    (name, type, description, examples) and then feeds the pinyin table and
    the phrase table to ``getPyTable`` and ``getChinese``.

    Args:
        file_name: Path of the ``.scel`` file to read.

    Raises:
        SystemExit: With a non-zero status when the header does not match
            the expected Sogou scel header.
        OSError: If the file cannot be opened or read.
    """
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()
    # Check that the file header matches the Sogou dictionary format.
    if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
        print("确认你选择的是搜狗(.scel)词库?")
        # This is a failure, so report it with a non-zero exit status.
        sys.exit(1)
    print("词库名：", byte2str(data[0x130:0x338]))
    print("词库类型：", byte2str(data[0x338:0x540]))
    print("描述信息：", byte2str(data[0x540:0xd40]))
    print("词库示例：", byte2str(data[0xd40:startPy]))
    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])

if __name__ == '__main__':
    # Dictionaries converted when no file is given on the command line.
    default_files = [
        '电机电气.scel',
        '编程语言.scel',
        '自然语言.scel',
        '深度学习.scel',
        '人工智能专业术语【官方推荐】.scel',
        '人工智能词库（待补充）.scel',
        '人工智能常用词库.scel',
        '机器学习.scel',
        '关于电厂相关词汇.scel',
        '汽轮机.scel',
        '热电厂.scel',
        '电力系统及其自动化 (1).scel',
        '电子词汇大全【官方推荐】.scel',
        '电力词汇大全【官方推荐】.scel',
        '电力系统及其自动化.scel',
        '电力系统.scel',
        '电力行业常用词库.scel',
        '原神【官方推荐】.scel',
        '原神词汇.scel',
    ]

    # Paths passed as command-line arguments take precedence; with no
    # arguments the built-in list above is used.
    files = sys.argv[1:] or default_files

    # Parse every file; the results accumulate in GTable.
    for file_name in files:
        deal(file_name)

    # Write only the Chinese phrase (third tuple element), one per line.
    with open('output.txt', 'w', encoding='utf-8') as out:
        out.writelines(word + '\n' for _, _, word in GTable)