细胞词库的解码C源程序

最新推荐文章于 2022-03-03 20:03:08 发布

jocks

最新推荐文章于 2022-03-03 20:03:08 发布

阅读量1.9k

点赞数

分类专栏： C 文章标签： c

本文链接：https://blog.csdn.net/jocks/article/details/40935431

版权

C 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

//细胞词库解码C源程序,输出为UNICODE的TXT格式

说明：
sogo输入法细胞词库解析源程序dis_sogo_cell.c，将.scel文件解码输出unicode格式的文本文件，可以提取出大量的中文词语,输出拼音及其中文词语，每一条记录一行，
每条记录的拼音和中文词语用分号分隔，同音词之间以逗号分隔，格式如下所示：
bei di ;北地,北堤
bei gou ;北沟,杯勾
cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区
chang cheng shu dian ;长城书店
chang ge zhuang cun ;常各庄村
chang he da sha ;长和大厦,长河大厦

使用方法（Linux下）：
1.编译：gcc dis_sogo_cell.c -o dis_sogo_cell
        或直接make
2.使用：./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
3.用shell脚本批量处理,将当前目录下的.scel文件转换为TXT文件:
#!/bin/bash
for scel_file in `ls *.scel` 
do
./dis_sogo_cell ${scel_file} > ${scel_file}.txt
done
4.使用程序包内提供的cell2txt.sh,在shell下直接执行即可将当前目录下的所有.scel文件提取为文本文件(UNICODE格式的)。如：
# ./cell2txt.sh
5.如需将生成的unicode 的文本文件转为ANSI编码的TXT文件，可以利用Linux的iconv命令，如：
# iconv -futf-16 -tGB18030 sogo_scel_file.scel.txt   -o  sogo_scel_file.scel_ANSI.txt
转换编码后的文件sogo_scel_file.scel_ANSI.txt将比原来的sogo_scel_file.scel.txt文件的字节数减小约50%，可极大节省存储空间，同时便于使用不支持UNICODE的
文本编辑器查看输出的结果。

C代码如下：

/*
 * dis_sogo_cell.c
 *
 * snallieATtomDOTcom
 * Sun Nov  7 06:30:00 CST 2014
 *
 * decoding sogo .scel file, 
 * output Chinese PinYin string and Chinese word in unicode 
 *
 * Example of output data:
 * bei di ;北地,北堤
 * bei gou ;北沟,杯勾
 * cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区
 * chang cheng shu dian ;长城书店
 * chang ge zhuang cun ;常各庄村
 * chang he da sha ;长和大厦,长河大厦
 *
 * to make under Linux: # gcc dis_sogo_cell.c -o dis_sogo_cell
 * usage  : ./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
 *
 * !! sogo_scel_file.scel.txt is a TXT file in unicode !!
 *
 * to invoke in shell script:
 * for scel_file in `ls *.scel` ; do ./dis_sogo_cell ${scel_file} > ${scel_file}.txt ; done
 * 
 */

/*
 * 搜狗的scel词库就是保存的文本的unicode编码，每两个字节一个字符（中文汉字或者英文字母）
 * 找出其每部分的偏移位置即可
 * 主要两部分
 * 1.全局拼音表，貌似是所有的拼音组合，字典序
 *        格式为(index,len,pinyin)的列表
 *        index: 两个字节的整数 代表这个拼音的索引
 *        len: 两个字节的整数 拼音的字节长度
 *        pinyin: 当前的拼音，每个字符两个字节，总长len
 *        
 * 2.汉语词组表
 *        格式为(same_pronounce_num,py_table_len,py_table,{word_len,word,ext_len,ext})的一个列表
 *        same_pronounce_num: 两个字节 整数 同音词数量
 *        py_table_len:  两个字节 整数
 *        py_table: 整数列表，每个整数两个字节,每个整数代表一个拼音的索引
 * 
 *        word_len:两个字节 整数 代表中文词组字节数长度
 *        word: 中文词组,每个中文汉字两个字节，总长度word_len
 *        ext_len: 两个字节 整数 代表扩展信息的长度，好像都是10
 *        ext: 扩展信息 前两个字节是一个整数(不知道是不是词频) 后八个字节全是0
 * 
 *       {word_len,word,ext_len,ext} 一共重复same_pronounce_num次 同音词 相同拼音表
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
extern int errno;

/*
 * bool, true and false come from the standard header. Declaring them by hand
 * as enum constants stops compiling once they are keywords (C23, which is
 * the default dialect of recent GCC releases).
 */
#include <stdbool.h>

#include <inttypes.h>
/*
  int8_t         = 1,  uint8_t        = 1
  int16_t        = 2,  uint16_t       = 2
  int32_t        = 4,  uint32_t       = 4
  int64_t        = 8,  uint64_t       = 8
  int_least8_t   = 1,  uint_least8_t  = 1
  int_least16_t  = 2,  uint_least16_t = 2
  int_least32_t  = 4,  uint_least32_t = 4
  int_least64_t  = 8,  uint_least64_t = 8
  int_fast8_t    = 1,  uint_fast8_t   = 1
  int_fast16_t   = 4,  uint_fast16_t  = 4
  int_fast32_t   = 4,  uint_fast32_t  = 4
  int_fast64_t   = 8,  uint_fast64_t  = 8
*/

/* File offset main() passes to read_py_item() as the base of the pinyin table. */
int start_PY = 0x1540;

/*
 * File offset main() seeks to before decoding the word records. main()
 * reassigns it to 0x2628 or 0x26c4 depending on the type byte of the header.
 */
int startChinese = 0x2628;

/*
 * Result of the first read_py_item() call in main(); used there as the loop
 * bound for loading the table and as the modulus for table lookups.
 */
int count_py = 0;

/* Slot of cel_py_tab that read_py_item() fills next. */
int py_cel_idx = 0;

/*
 * One entry of the pinyin table as stored by read_py_item():
 *   index  - the 16-bit index value read from the file
 *   len    - the 16-bit byte length read from the file
 *   pinyin - the raw pinyin bytes, copied in by put_py_tab()
 */
struct py_t {
    int16_t index;
    int16_t len;
    char pinyin[30];
};

typedef struct py_t py_tab;

/* In-memory pinyin table with room for 0x280 entries. */
py_tab cel_py_tab[0x280];

/* Input .scel stream; opened by main() and read by read_py_item(). */
FILE *in_file;

/*
 * errno is intentionally not redeclared here: <errno.h>, included at the top
 * of this file, provides it, and a hand-written "extern int errno;" is not
 * portable because errno may be a macro.
 */

/* Scratch fields holding the pinyin entry most recently read by read_py_item(). */
int16_t index3;
int16_t len;
char pinyin[30];

/*
 * Write a line break to stdout as two 16-bit little-endian code units:
 * carriage return (0x0d 0x00) followed by line feed (0x0a 0x00).
 */
void print_unicode_crlr()
{
    static const unsigned char line_break[] = { 0x0d, 0x00, 0x0a, 0x00 };
    size_t k;

    for (k = 0; k < sizeof line_break; k++) {
        putchar(line_break[k]);
    }
}

/* Write a space to stdout as one 16-bit little-endian code unit (0x20 0x00). */
void print_unicode_space()
{
    putchar(' ');
    putchar('\0');
}

/*
 * Write one character to stdout as a 16-bit little-endian code unit.
 * Only the low seven bits of `a` are kept; the high byte is always zero.
 */
void print_ascii_in_unicode(unsigned char a)
{
    const int low_seven_bits = a & 0x7f;

    putchar(low_seven_bits);
    putchar('\0');
}

/*
 * Write a NUL-terminated byte string to stdout, one 16-bit code unit per
 * byte, by handing each byte to print_ascii_in_unicode(). The terminating
 * NUL itself is not written.
 */
void print_ascii_str_in_unicode(unsigned char *ascii_str)
{
    const unsigned char *cursor;

    for (cursor = ascii_str; *cursor != '\0'; cursor++) {
        print_ascii_in_unicode(*cursor);
    }
}

/*
 * Write the raw pinyin bytes of table entry `idx` (taken modulo count_py)
 * to stdout.
 *
 * Nothing is printed when the table is empty. The slot is normalised to a
 * non-negative index inside cel_py_tab, and the stored length is capped at
 * the size of the pinyin buffer, so a negative index or an oversized length
 * can no longer read outside the table.
 */
void print_py(int idx)
{
    const int capacity = (int) (sizeof cel_py_tab / sizeof cel_py_tab[0]);
    int slot;
    int n;
    int i;

    if (count_py <= 0) {
        return;                 /* avoids a modulo by zero */
    }

    slot = idx % count_py;
    if (slot < 0) {
        slot += count_py;       /* C's % keeps the sign of the dividend */
    }
    if (slot >= capacity) {
        return;
    }

    n = cel_py_tab[slot].len;
    if (n > (int) sizeof cel_py_tab[slot].pinyin) {
        n = (int) sizeof cel_py_tab[slot].pinyin;
    }

    for (i = 0; i < n; i++) {
        printf("%c", cel_py_tab[slot].pinyin[i]);
    }
}

/*
 * Copy `count` raw bytes starting at `start_pos` to stdout unchanged.
 * A zero or negative `count` writes nothing. `update_py_tab` is accepted
 * but not used.
 */
void print_char2(unsigned char *start_pos, int count, bool update_py_tab)
{
    const unsigned char *cursor = start_pos;
    int remaining;

    (void) update_py_tab;

    for (remaining = count; remaining > 0; remaining--) {
        putchar(*cursor);
        cursor++;
    }
}

/*
 * Copy `count` bytes from `start_pos` into the pinyin buffer of the table
 * slot selected by py_cel_idx. The destination offset wraps modulo the
 * 30-byte buffer size, so bytes past the 30th overwrite the start of the
 * buffer instead of running past its end.
 */
void put_py_tab(unsigned char *start_pos, int count)
{
    int src = 0;

    while (src < count) {
        const int dst = src % (int) sizeof cel_py_tab[py_cel_idx].pinyin;

        cel_py_tab[py_cel_idx].pinyin[dst] = start_pos[src];
        src++;
    }
}

/*
 * Read one (index, len, pinyin) item of the pinyin table at file offset
 * `pos` of in_file.
 *
 * The two 16-bit fields are decoded as little-endian, which is what the
 * previous raw fread() produced on little-endian hosts, and are left in the
 * globals index3 and len.
 *
 * Returns:
 *   - when len > 0: the item is appended to cel_py_tab (py_cel_idx is
 *     advanced) and the number of bytes it occupies, 2 + 2 + len, is
 *     returned;
 *   - otherwise: nothing is stored and index3 is returned. main() uses this
 *     for the first call, whose result becomes the entry count.
 *
 * Exits with status 1, after a message on stderr, when the seek fails, the
 * file ends early, an item is longer than the pinyin buffer, or the table is
 * already full. The last two cases previously overflowed pinyin[] and
 * cel_py_tab[] respectively.
 */
int read_py_item(int pos)
{
    const int capacity = (int) (sizeof cel_py_tab / sizeof cel_py_tab[0]);
    unsigned char raw[4];

    if (fseek(in_file, pos, SEEK_SET) != 0) {
        /* stdout carries the decoded data, so diagnostics go to stderr */
        fprintf(stderr, "Seek error\n");
        exit(1);
    }

    if (fread(raw, 1, sizeof raw, in_file) != sizeof raw) {
        fprintf(stderr, "Error, .scel file truncated in pinyin table\n");
        exit(1);
    }
    index3 = (int16_t) (uint16_t) (raw[0] | (raw[1] << 8));
    len = (int16_t) (uint16_t) (raw[2] | (raw[3] << 8));

    if (len <= 0) {
        return index3;
    }

    if ((size_t) len > sizeof pinyin) {
        fprintf(stderr,
                "Error, .scel file maybe corrupt: pinyin item too long:%d\n",
                (int) len);
        exit(1);
    }
    if (py_cel_idx < 0 || py_cel_idx >= capacity) {
        fprintf(stderr,
                "Error, .scel file maybe corrupt: too many pinyin items\n");
        exit(1);
    }
    if (fread(pinyin, 1, (size_t) len, in_file) != (size_t) len) {
        fprintf(stderr, "Error, .scel file truncated in pinyin table\n");
        exit(1);
    }

    cel_py_tab[py_cel_idx].index = index3;
    cel_py_tab[py_cel_idx].len = len;
    put_py_tab((unsigned char *) pinyin, len);
    py_cel_idx++;

    return 2 + 2 + len;
}

/*
 * Compile-time switches:
 *   OUT_HEADER - also print the Name/Type/Desc/Smpl fields of the file header
 *   DEBUG      - print the pinyin table and file offsets while decoding
 *   ERR_OUTPUT - report an out-of-range homophone count before stopping
 */
/* #define DEBUG */
/* #define ERR_OUTPUT */
#define OUT_HEADER

/*
 * Read one little-endian 16-bit unsigned integer from `fp` into `*out`.
 * Returns 1 on success, 0 when fewer than two bytes could be read.
 */
static int read_u16le(FILE *fp, uint16_t *out)
{
    unsigned char raw[2];

    if (fread(raw, 1, sizeof raw, fp) != sizeof raw) {
        return 0;
    }
    *out = (uint16_t) (raw[0] | (raw[1] << 8));
    return 1;
}

#ifdef OUT_HEADER
/*
 * Print `label` followed by the `size` raw bytes found at file offset
 * `offset` of in_file, then a line break. Exits with status 1 when the
 * field cannot be read in full.
 */
static void dump_header_field(const char *label, long offset, size_t size)
{
    unsigned char field[0x1540 - 0xd40];        /* size of the largest field */

    print_ascii_str_in_unicode((unsigned char *) label);

    if (size > sizeof field
        || fseek(in_file, offset, SEEK_SET) != 0
        || fread(field, 1, size, in_file) != size) {
        fprintf(stderr,
                "Error, can't read .scel header field at 0x%lx\n",
                (unsigned long) offset);
        exit(1);
    }

    print_char2(field, (int) size, false);
    print_unicode_crlr();
}
#endif

/*
 * Decode the word record at the current position of in_file and print it as
 * "<pinyin> <pinyin> ;<word>,<word>" followed by a line break.
 *
 * Record layout: same_pronounce_num, py_table_len, py_table[], then
 * same_pronounce_num times { word_len, word, ext_len, ext }. All integers
 * are 16-bit little-endian. The extension block is skipped using its stored
 * length.
 *
 * The caller must have set count_py to a value between 1 and the number of
 * slots in cel_py_tab.
 *
 * Returns 1 when a complete record was printed, 0 when decoding must stop
 * (end of data, truncated record, or a value the decoder treats as corrupt).
 */
static int decode_word_record(void)
{
    uint16_t same_pronounce_num;
    uint16_t py_table_len;
    uint16_t py_table[150];
    uint16_t word_len;
    uint16_t ext_len;
    uint8_t word[150];
    char msg[200];
    unsigned int i;
    unsigned int j;

#ifdef DEBUG
    {
        long cur_fptr = ftell(in_file);

        snprintf(msg, sizeof msg, "\r\ncur_fptr:%ld(0x%08lx)\r\n",
                 cur_fptr, (unsigned long) cur_fptr);
        print_ascii_str_in_unicode((unsigned char *) msg);
    }
#endif

    if (!read_u16le(in_file, &same_pronounce_num)
        || !read_u16le(in_file, &py_table_len)) {
        return 0;               /* truncated record header */
    }

    if (py_table_len > sizeof py_table) {
        /* "\r\n" rather than "\xd\xa": "\xaE..." would parse as one escape */
        snprintf(msg, sizeof msg,
                 "\r\nError, .scel file maybe corrupt: too big size of py_table:%d(0x%08x), at file:0x%lx\r\n",
                 (int) py_table_len, (unsigned int) py_table_len,
                 (unsigned long) ftell(in_file));
        print_ascii_str_in_unicode((unsigned char *) msg);
        return 0;
    }

    for (i = 0; i < py_table_len / 2u; i++) {
        if (!read_u16le(in_file, &py_table[i])) {
            return 0;
        }
    }
    /* An odd length leaves one stray byte that still belongs to the table. */
    if ((py_table_len & 1u) != 0 && fgetc(in_file) == EOF) {
        return 0;
    }

    if (same_pronounce_num == 0 || same_pronounce_num > 20) {
#ifdef ERR_OUTPUT
        snprintf(msg, sizeof msg,
                 "\r\nError, improper SAME_PRONOUNCE_NUM item size:%d(0x%08x), at file:0x%lx\r\n",
                 (int) same_pronounce_num, (unsigned int) same_pronounce_num,
                 (unsigned long) ftell(in_file));
        print_ascii_str_in_unicode((unsigned char *) msg);
#endif
        return 0;
    }

#ifdef DEBUG
    snprintf(msg, sizeof msg, "same_pronounce_num:%d\r\n",
             (int) same_pronounce_num);
    print_ascii_str_in_unicode((unsigned char *) msg);

    snprintf(msg, sizeof msg, "py_table_len:%d\r\n", (int) py_table_len);
    print_ascii_str_in_unicode((unsigned char *) msg);

    snprintf(msg, sizeof msg, "py_table:%d\r\n", (int) py_table[0]);
    print_ascii_str_in_unicode((unsigned char *) msg);
#endif

    /* Pinyin string, each syllable followed by a space, e.g. "zuo you ". */
    for (i = 0; i < py_table_len / 2u; i++) {
        const int slot = py_table[i] % count_py;
        int n = cel_py_tab[slot].len;

        if (n > (int) sizeof cel_py_tab[slot].pinyin) {
            n = (int) sizeof cel_py_tab[slot].pinyin;
        }
        print_char2((unsigned char *) cel_py_tab[slot].pinyin, n, false);
        print_unicode_space();
    }
    print_ascii_in_unicode(';');

    for (j = 0; j < same_pronounce_num; j++) {
        size_t remaining;

        if (!read_u16le(in_file, &word_len)) {
            print_unicode_crlr();
            return 0;
        }

        /* Copy the word through the fixed buffer so any length is safe. */
        remaining = word_len;
        while (remaining > 0) {
            const size_t chunk =
                remaining < sizeof word ? remaining : sizeof word;

            if (fread(word, 1, chunk, in_file) != chunk) {
                print_unicode_crlr();
                return 0;
            }
            print_char2(word, (int) chunk, false);
            remaining -= chunk;
        }

        /* Skip the extension block using its own length field. */
        if (!read_u16le(in_file, &ext_len)
            || fseek(in_file, (long) ext_len, SEEK_CUR) != 0) {
            print_unicode_crlr();
            return 0;
        }

        if (j + 1 != same_pronounce_num) {
            print_ascii_in_unicode(',');
        }
    }
    print_unicode_crlr();

#ifdef DEBUG
    {
        long cur_fptr2 = ftell(in_file);

        snprintf(msg, sizeof msg, "cur_fptr2:%ld(0x%08lx)\r\n",
                 cur_fptr2, (unsigned long) cur_fptr2);
        print_ascii_str_in_unicode((unsigned char *) msg);
    }
#endif

    return 1;
}

/*
 * Decode the .scel file named by argv[1] and write the result to stdout as
 * UTF-16LE text, starting with a byte-order mark.
 *
 * Steps: check the 12-byte magic (byte 4 selects the word-table offset),
 * load the pinyin table with read_py_item(), optionally print the header
 * fields, then decode word records until the end of the file or the first
 * record that cannot be decoded.
 *
 * Problems detected before any output is produced are reported on stderr,
 * because stdout carries the decoded data.
 *
 * Returns 0 on success and 1 on a usage error, an unreadable file, or a file
 * that is not a recognised .scel file.
 */
int main(int argc, char **argv)
{
    static const unsigned char header_magic[12] = {
        0x40, 0x15, 0x00, 0x00, 0x44, 0x43, 0x53, 0x01,
        0x01, 0x00, 0x00, 0x00
    };
    const int py_capacity = (int) (sizeof cel_py_tab / sizeof cel_py_tab[0]);
    unsigned char header[12];
    char scel_type;
    int next_pos = 4;
    long file_size;
    int i;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s scel_file\n",
                argc > 0 ? argv[0] : "dis_sogo_cell");
        return 1;
    }

    in_file = fopen(argv[1], "rb");
    if (!in_file) {
        fprintf(stderr, "Can't open input file '%s', %s\n",
                argv[1], strerror(errno));
        return 1;
    }

    /* Magic word; byte 4 is the file type and is compared separately. */
    if (fread(header, 1, sizeof header, in_file) != sizeof header) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        fclose(in_file);
        return 1;
    }
    scel_type = (char) header[4];
    header[4] = 0x44;
    if (memcmp(header, header_magic, sizeof header) != 0) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        fclose(in_file);
        return 1;
    }

    switch (scel_type) {
    case 0x44:                  /* 'D' */
        startChinese = 0x2628;
        break;

    case 0x45:                  /* 'E' */
        startChinese = 0x26c4;
        break;

    default:
        fprintf(stderr, ".scel file corrupted, quit!\n");
        fclose(in_file);
        return 1;
    }

    /*
     * Load the pinyin table. The first item yields the entry count, which is
     * later used as a modulus and must fit the in-memory table.
     */
    count_py = read_py_item(start_PY);
    if (count_py <= 0 || count_py > py_capacity) {
        fprintf(stderr, ".scel file corrupted, quit!\n");
        fclose(in_file);
        return 1;
    }
    for (i = 0; i < count_py; i++) {
        next_pos += read_py_item(start_PY + next_pos);
    }

    if (fseek(in_file, 0L, SEEK_END) != 0
        || (file_size = ftell(in_file)) < 0) {
        fprintf(stderr, "Seek error\n");
        fclose(in_file);
        return 1;
    }

    /* UTF-16LE byte-order mark */
    putchar(0xff);
    putchar(0xfe);

#ifdef OUT_HEADER
    dump_header_field("Name: ", 0x130, 0x338 - 0x130);
    dump_header_field("Type: ", 0x338, 0x540 - 0x338);
    dump_header_field("Desc: ", 0x540, 0xd40 - 0x540);
    dump_header_field("Smpl: ", 0xd40, 0x1540 - 0xd40);
    print_unicode_crlr();       /* blank line between header and words */
#endif

#ifdef DEBUG
    {
        char line[200];

        for (i = 0; i < count_py; i++) {
            snprintf(line, sizeof line, "%03d(0x%03x):", i,
                     (unsigned int) i);
            print_ascii_str_in_unicode((unsigned char *) line);
            print_py(i);
            print_unicode_crlr();
        }

        snprintf(line, sizeof line, "\r\nfile_size:%ld\r\n", file_size);
        print_ascii_str_in_unicode((unsigned char *) line);
    }
#endif

    if (fseek(in_file, startChinese, SEEK_SET) != 0) {
        fprintf(stderr, "Seek error\n");
        fclose(in_file);
        return 1;
    }

    while (ftell(in_file) < file_size && decode_word_record()) {
        /* one record per iteration */
    }

    fclose(in_file);
    return 0;
}