用 Python 给词语生成新世纪五笔编码

最新推荐文章于 2022-07-05 22:56:58 发布

das2m

最新推荐文章于 2022-07-05 22:56:58 发布

阅读量417

点赞数

文章标签：五笔码表新世纪五笔 98五笔词库过滤编码

本文链接：https://blog.csdn.net/zhangyingda/article/details/119832080

版权

输入法 RIME

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# d.py

from itertools import islice
import time

# Path of the New Century Wubi dictionary file (wubixinshiji.dict.yaml).
# The file comes from https://github.com/GuoBinyong/wubixinshiji
NCTPATH = './wubixinshiji.dict.yaml'



# Read the New Century Wubi code table into a dict.
def read_code_table(path=None, skip=81):
    """Load the code table as a ``{text: code}`` mapping.

    Every data line is split on tabs; the first field is the text and the
    second field is its code, any further fields are ignored. When the same
    text appears on several lines, the code from the last line wins.

    Args:
        path: Dictionary file to read. Defaults to the module-level NCTPATH.
        skip: Number of leading header lines to ignore.

    Returns:
        A dict mapping text to code.
    """
    if path is None:
        path = NCTPATH
    table = {}
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily; there is no need to build the full
        # readlines() list just to skip the header.
        for line in islice(f, skip, None):
            fields = line.strip().split('\t')
            # Blank or tab-less lines carry no code field; skip them
            # instead of failing with IndexError.
            if len(fields) < 2:
                continue
            table[fields[0]] = fields[1]
    print("已读取新世纪码表, 共有 %d 项 " % len(table))
    return table

# Read the New Century Wubi code table into a list.
def list_code_table(path=None, skip=81):
    """Load the code table as a list of ``(text, code)`` tuples.

    Unlike a dict, the list keeps file order and keeps every line, so a text
    that appears with several codes yields several tuples.

    Args:
        path: Dictionary file to read. Defaults to the module-level NCTPATH.
        skip: Number of leading header lines to ignore.

    Returns:
        A list of ``(text, code)`` tuples built from the first two
        tab-separated fields of each data line.
    """
    if path is None:
        path = NCTPATH
    entries = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in islice(f, skip, None):
            fields = line.strip().split('\t')
            # Blank or tab-less lines carry no code field; skip them
            # instead of failing with IndexError.
            if len(fields) < 2:
                continue
            entries.append((fields[0], fields[1]))
    print("已读取新世纪码表, 共有 %d 项 " % len(entries))
    return entries

# Build the single-character code table of New Century Wubi.
def single_table(path=None, skip=81):
    """Load a ``{character: code}`` mapping for single characters only.

    A data line looks like ``工<TAB>a<TAB>99454797<TAB>aa``; only the first
    two fields are used. An entry is kept when its text is exactly one
    character long and its code is longer than one letter. The first such
    code seen for a character wins; later ones are counted as repeats.

    Args:
        path: Dictionary file to read. Defaults to the module-level NCTPATH.
        skip: Number of leading header lines to ignore.

    Returns:
        A dict mapping each single character to its code.
    """
    if path is None:
        path = NCTPATH
    single_ct = {}
    repeated = 0      # single characters seen again after being stored
    one_letter = 0    # single characters whose code is at most one letter
    words = 0         # entries whose text is not exactly one character
    with open(path, 'r', encoding='utf-8') as f:
        for line in islice(f, skip, None):
            fields = line.strip().split('\t')
            # Blank or tab-less lines are neither words nor characters;
            # skip them instead of miscounting them or hitting IndexError.
            if len(fields) < 2:
                continue
            text, code = fields[0], fields[1]
            if len(text) != 1:
                words += 1
            elif len(code) <= 1:
                one_letter += 1
            elif text in single_ct:
                repeated += 1
            else:
                single_ct[text] = code

    # Report the number of lines actually skipped rather than a fixed figure.
    print("忽略 %d 行，删掉词语 %d 条，独码 %d 条，重复 %d 条，已生成单字码表共有 %d 项" % (skip, words, one_letter, repeated, len(single_ct)))
    return single_ct


# Load every table once at module level so the functions below can reuse
# them instead of re-reading the dictionary file on each call.
# Note: each of the three calls opens and parses NCTPATH separately.
NCT = read_code_table()  # text -> code dict
listNCT=list_code_table()  # list of (text, code) tuples
SCT = single_table()  # single character -> code dict


# Compare a Wubi 98 dictionary with the New Century Wubi table and save the
# words that the New Century table does not contain.
def filter_code_table(table_path, known=None, out_path='./out.txt', skip=24):
    """Append the words of *table_path* that are missing from *known*.

    Each data line is split on tabs into text and code. Texts already in
    *known* are counted and dropped. Unknown texts longer than one character
    are collected (a repeated text keeps its last code) and appended to
    *out_path* as ``text<TAB>code`` lines. Unknown single characters are
    counted and dropped. Blank or tab-less lines are ignored entirely.

    Args:
        table_path: Wubi 98 dictionary file to scan.
        known: Container of texts that already exist; defaults to the
            module-level NCT table.
        out_path: File the missing words are appended to.
        skip: Number of leading header lines to ignore.
    """
    if known is None:
        known = NCT
    missing = {}
    line_count = 0    # well-formed data lines processed
    known_count = 0   # texts already present in `known`
    single_count = 0  # unknown single characters that were dropped
    with open(table_path, 'r', encoding='utf-8') as f:
        for line in islice(f, skip, None):
            fields = line.strip().split('\t')
            # A line without a code field cannot be stored; skip it rather
            # than raising IndexError or counting it as a character.
            if len(fields) < 2:
                continue
            text = fields[0]
            line_count += 1
            if text in known:
                known_count += 1
            elif len(text) > 1:
                missing[text] = fields[1]
            else:
                single_count += 1

    # Append mode: successive calls accumulate their results in one file.
    with open(out_path, 'a', encoding='utf-8') as o:
        o.writelines('%s\t%s\n' % item for item in missing.items())

    print("处理词库文件 %s 完毕, 共处理 %d 行, %d 个项已经存在于新世纪五笔码表中, 保存了 %d 行，舍弃了 %d 个单独字符。" % (table_path, line_count, known_count, len(missing), single_count))


# Look up the code of a single character.
def query_code(s):
    """Return the code stored for *s* in SCT, or None when *s* is absent."""
    return SCT.get(s)


# Build the New Century Wubi code for a character or a word.
def get_code(str, lookup=None):
    """Return the code for the text *str*, or None if it cannot be built.

    The code is assembled from the codes of individual characters:

    * 1 character: its full code;
    * 2 characters: the first two letters of each character's code;
    * 3 characters: the first letter of the first two characters plus the
      first two letters of the third;
    * 4 or more: the first letter of the first, second, third and last
      characters.

    Args:
        str: Text to encode. The name shadows the builtin but is kept so
            existing callers keep working.
        lookup: Callable mapping one character to its code (or None when
            unknown). Defaults to the module-level query_code.

    Returns:
        The code string, or None when *str* is empty or any required
        character has no code (previously this raised TypeError).
    """
    if lookup is None:
        lookup = query_code
    text = str
    length = len(text)
    if length == 0:
        return None
    if length == 1:
        return lookup(text)

    # (character, number of leading code letters to take) per word length.
    if length == 2:
        picks = ((text[0], 2), (text[1], 2))
    elif length == 3:
        picks = ((text[0], 1), (text[1], 1), (text[2], 2))
    else:
        picks = ((text[0], 1), (text[1], 1), (text[2], 1), (text[-1], 1))

    pieces = []
    for char, take in picks:
        code = lookup(char)
        if not code:
            # No code for this character, so the word cannot be encoded.
            return None
        pieces.append(code[:take])
    return ''.join(pieces)

def take_second(e):
    """Sort-key helper: return the item at index 1 of *e*."""
    second = e[1]
    return second

def main():
    """Merge words from the Wubi 98 dictionaries into the New Century table.

    Steps: collect the words missing from the New Century table into
    ./out.txt, encode each of them with get_code(), append them to the
    existing entries, sort everything by code and write the result, preceded
    by the original 81-line file header, to ./sorted.txt.
    """
    start = time.time()

    out_path = './out.txt'
    # filter_code_table() appends to out.txt, so empty it first; otherwise
    # entries left over from an earlier run would be merged again.
    with open(out_path, 'w', encoding='utf-8'):
        pass

    for source in ('./wubi98_ci.dict.yaml',
                   './wubi98_S.dict.yaml',
                   './wubi98_U.dict.yaml'):
        filter_code_table(source)

    new_code_table = {}
    extracted = 0  # non-blank lines read from out.txt (duplicates included)
    skipped = 0    # words for which no code could be built
    with open(out_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip().split('\t')[0]
            if not word:
                continue
            extracted += 1
            # get_code() yields no usable code (None, or a TypeError from
            # slicing a missing lookup) when a character has no
            # single-character code; such words cannot be sorted by code.
            try:
                code = get_code(word)
            except TypeError:
                code = None
            if code is None:
                skipped += 1
                continue
            # Storing in a dict drops duplicate words.
            new_code_table[word] = code

    # Existing entries first, then the newly encoded words.
    final_code_table = list(listNCT)
    final_code_table.extend(new_code_table.items())

    # Sort by code; the sort is stable, so equal codes keep their order.
    final_code_table.sort(key=take_second)

    written = 0
    with open(NCTPATH, 'r', encoding='utf-8') as f, open('./sorted.txt', 'w', encoding='utf-8') as o:
        # Copy the dictionary header unchanged.
        o.writelines(islice(f, 81))
        for text, code in final_code_table:
            o.write('%s\t%s\n' % (text, code))
            written += 1

    if skipped:
        print("有 %d 条词语因缺少单字编码而被跳过。" % skipped)
    print("新世纪五笔原有 %d 条，新添加了 %d 条，整合后为 %d 条，从98五笔词库中提取了 %d 条词语（未查重），现写入 %d 条词语。" % (len(listNCT), len(new_code_table), len(final_code_table), extracted, written))

    end = time.time()
    print("程序用时：%fs" % (end - start))


if __name__ == '__main__':
    main()

程序输出

已读取新世纪码表, 共有 107396 项 
已读取新世纪码表, 共有 112061 项 
忽略 82 行，删掉词语 79459 条，独码 25 条，重复 4491 条，已生成单字码表共有 28086 项
处理词库文件 ./wubi98_ci.dict.yaml 完毕, 共处理 108547 行, 92410 个项已经存在于新世纪五笔码表中, 保存了 16047 行，舍弃了 90 个单独字符。
处理词库文件 ./wubi98_S.dict.yaml 完毕, 共处理 73011 行, 64295 个项已经存在于新世纪五笔码表中, 保存了 8715 行，舍弃了 1 个单独字符。
处理词库文件 ./wubi98_U.dict.yaml 完毕, 共处理 100477 行, 32351 个项已经存在于新世纪五笔码表中, 保存了 0 行，舍弃了 68126 个单独字符。
新世纪五笔原有 112061 条，新添加了 16072 条，整合后为 128133 条，从98五笔词库中提取了 24762 条词语（未查重），现写入 128133 条词语。
程序用时：0.481324s

squirrel.custom.yaml

patch:
  style/color_scheme: das2m
  style/horizontal: true
  style/text_orientation: horizontal  # horizontal | vertical
  style/inline_preedit: true
  style/font_face: '思源黑体 CN ExtraLight'
  style/font_point: 16
  style/label_font_point: 16
  style/comment_font_point: 12
  style/corner_radius: 5
  style/border_height: 4
  style/border_width: 4
  style/candidate_format: "%c %@ "

  preset_color_schemes/das2m:
    name: 少司命 / das2m
    author: Das2m <zhangyingda@gmail.com>
    back_color: '0x4C4957'
    text_color: '0xCAFDDB'
    hilited_text_color: '0xCAFDDB'
    hilited_back_color: '0x4C4957'
    hilited_candidate_text_color: '0xA28AFD'
    hilited_candidate_back_color: '0x4C4957'
    hilited_candidate_label_color: '0xA28AFD'
    hilited_comment_text_color: '0xA28AFD'
    candidate_text_color: '0xFDFCFC'
    label_color: '0xFDFCFC'
    comment_text_color: '0xFDFCFC'