2008-05-10 11:41 基于trie树的分词算法（转载）

最新推荐文章于 2022-05-03 21:16:15 发布

cnki_ok

最新推荐文章于 2022-05-03 21:16:15 发布

阅读量705

点赞数

分类专栏：自然语言处理

自然语言处理专栏收录该内容

15 篇文章 0 订阅

订阅专栏

#ifndef TRIE_H
#define TRIE_H

/* FILE is used in the prototypes below, so the header pulls in <stdio.h> */
/* itself instead of relying on the includer to have done so. */
#include <stdio.h>

/* One node of the dictionary trie (child/sibling representation). */
struct trie_node
{
    int value;                 /* 16-bit code of one double-byte character: (lead<<8)+trail */
    int eof;                   /* non-zero when a dictionary word ends at this node */
    struct trie_node *child;   /* first node of the next character level */
    struct trie_node *sibling; /* next alternative character on the same level */
};

/* Reset the root table to the empty state. */
extern void init_index(void);
/* Load a dictionary (one word per line) into the index; returns 1 on success, 0 on failure. */
extern int create_index(FILE *findex);
/* Segment the text read from fin and write one token per line to fout. */
extern void search_index(FILE *fin,FILE *fout);
/* Release every node allocated by create_index. */
extern void free_index(void);
#endif

#include <stdio.h>
#include <stdlib.h>
#include "trie.h"

/* Evaluates to 1 when ch is handled as a single-byte character (below 0x7f), else 0. */
/* The argument is parenthesized so that expressions such as a|b expand correctly. */
/* NOTE(review): 0x7f (DEL) itself is classified as non-single-byte here - confirm this is intended. */
#define ascii_char(ch) ((ch)<0x7f?1:0)
#define HASH_SIZE 65536 /* one root slot per possible 16-bit character code */
#define BUFFER_SIZE 256 /* assume the longest CJK word is 128 double-byte characters */

/*the input file is deliberately not read into memory as a whole before analysis,*/
/*because its size cannot be known in advance;*/
/*an operating system such as WIN32 already reads file blocks into memory for us,*/
/*so we only need a few fseek calls to step back in the file*/

/* Root table of the trie: the first double-byte character of a word selects */
/* its slot directly, indexed by (first_byte<<8)+second_byte. */
static struct trie_node hash[HASH_SIZE];

/*init the hash table*/
extern void init_index()
{
int i;
   struct trie_node *tmp;
for(i=0;i<HASH_SIZE;i++)
{
   tmp=hash+i;
   tmp->eof=0;
   tmp->value=0;
   tmp->child=NULL;
   tmp->sibling=NULL;
}
}

/* Allocate a new node that stores one CJK character code. */
/* The node starts with eof=0 (no word ends here) and with no child or sibling. */
/* Returns the node, or NULL when memory cannot be allocated; the caller owns it. */
static struct trie_node * new_trie_node(int value)
{
    struct trie_node *node=malloc(sizeof *node);

    /* malloc may fail: report it instead of writing through a null pointer */
    if(node==NULL)
        return NULL;

    node->value=value;
    node->eof=0;
    node->child=NULL;
    node->sibling=NULL;

    return node;
}

/* Build the trie index from a dictionary file. */
/* findex: stream holding the word list; every word is a run of double-byte */
/*         characters terminated by '\n'. */
/* Returns 1 when the whole file was inserted. */
/* Returns 0 when findex is NULL, when a single-byte character appears where a */
/* double-byte one is expected, when a character is cut short by end of file, */
/* or when a node cannot be allocated. Words inserted before a failure stay in */
/* the index. */
extern int create_index(FILE *findex)
{
    int lead,trail,value;
    struct trie_node *head,*node,*last;

    if(findex==NULL)
        return 0;

    /* fgetc results are kept in an int so that end of file is told apart */
    /* from a real 0xff byte */
    while((lead=fgetc(findex))!=EOF)
    {
        if(ascii_char(lead))
            return 0;
        trail=fgetc(findex);
        if(trail==EOF)
            return 0;/* second byte of the character is missing */
        value=(lead<<8)+trail;
        head=hash+value;/* the first character selects the root slot */

        /* remaining characters of the word; if the final '\n' is missing, */
        /* end of file simply terminates the last word */
        while((lead=fgetc(findex))!=EOF&&lead!='\n')
        {
            if(ascii_char(lead))
                return 0;
            trail=fgetc(findex);
            if(trail==EOF)
                return 0;
            value=(lead<<8)+trail;

            /* look for this character among the children of head, */
            /* remembering the last child so a new node can be appended */
            last=NULL;
            for(node=head->child;node!=NULL&&node->value!=value;node=node->sibling)
                last=node;

            if(node==NULL)
            {
                node=new_trie_node(value);
                if(node==NULL)
                    return 0;/* out of memory */
                if(last!=NULL)
                    last->sibling=node;
                else
                    head->child=node;
            }
            head=node;/* descend to the matched or newly created node */
        }
        head->eof=1;/* a word ends at this node */
    }
    return 1;/* inserted successfully */
}

/* Free the node head together with every node reachable through its child */
/* pointer. Siblings of head itself are NOT freed; the caller walks them. */
/* A NULL head is accepted and ignored. */
static void free_trie(struct trie_node *head)
{
    struct trie_node *kid,*next;

    if(head==NULL)
        return;

    for(kid=head->child;kid!=NULL;kid=next)
    {
        next=kid->sibling;/* read before kid is released */
        free_trie(kid);
    }
    /* head is released even when it has no children, so leaf nodes are */
    /* not leaked */
    free(head);
}

/* Release every node hanging off the root table and return the table to the */
/* empty state, so that calling free_index again or searching afterwards does */
/* not touch freed memory. The root slots themselves are static and are not */
/* freed. */
extern void free_index()
{
    int i;
    struct trie_node *root,*kid,*next;

    for(i=0;i<HASH_SIZE;i++)
    {
        root=hash+i;
        for(kid=root->child;kid!=NULL;kid=next)
        {
            next=kid->sibling;/* read before kid is released */
            free_trie(kid);
        }
        root->child=NULL;/* no dangling pointer left behind */
        root->eof=0;/* the slot no longer ends any word */
    }
}

/* Write the first last_match_offset bytes of buffer to fout followed by a */
/* newline, then move the read position of fin by */
/* (last_match_offset - cur_file_offset) bytes relative to where it is now, */
/* i.e. step back over the bytes that were read but not written. */
static void output_buffer(FILE *fin,FILE *fout,unsigned char *buffer,
     int last_match_offset,int cur_file_offset)
{
    const unsigned char *cursor=buffer;
    const unsigned char *stop=buffer+last_match_offset;
    long step=(long)last_match_offset-(long)cur_file_offset;

    /* emit the matched bytes one by one */
    while(cursor<stop)
    {
        fputc(*cursor,fout);
        cursor++;
    }
    fputc('\n',fout);

    /* reposition the input file */
    fseek(fin,step,SEEK_CUR);
}
/* Segment the text in fin and write one token per line to fout. */
/* At each position the longest dictionary word starting there is emitted; */
/* when no word of two or more characters matches, the single double-byte */
/* character is emitted. Single-byte characters are emitted one per line. */
/* fin must be seekable: unmatched look-ahead is re-read by seeking backwards */
/* with a relative fseek, which is only reliable byte-for-byte when the stream */
/* was opened in binary mode. */
/* Nothing is done when fin or fout is NULL. */
extern void search_index(FILE *fin,FILE *fout)
{
    unsigned char buffer[BUFFER_SIZE]={0};
    struct trie_node *head,*node;
    int lead,trail,value;
    int last_match_offset;/* length in bytes of the longest word found so far */
    int cur_buffer_offset;/* number of bytes currently stored in buffer */
    int cur_file_offset;/* bytes read from fin since the current token began */

    if(fin==NULL||fout==NULL)
        return;

    /* fgetc results are kept in an int so that end of file is told apart */
    /* from a real 0xff byte */
    while((lead=fgetc(fin))!=EOF)
    {
        if(ascii_char(lead))
        {
            /* NOTE(review): with both offsets 0 this call only writes an empty */
            /* line before the character; kept as in the original - confirm */
            /* whether the blank line is wanted */
            output_buffer(fin,fout,buffer,0,0);
            fputc(lead,fout);
            fputc('\n',fout);
            continue;
        }

        trail=fgetc(fin);
        if(trail==EOF)
        {
            /* input ends in the middle of a double-byte character: */
            /* emit the stray byte so that no input is lost, then stop */
            fputc(lead,fout);
            fputc('\n',fout);
            break;
        }

        /* the first character always counts as a match of its own */
        buffer[0]=(unsigned char)lead;
        buffer[1]=(unsigned char)trail;
        cur_buffer_offset=2;
        last_match_offset=2;
        cur_file_offset=2;
        head=hash+((lead<<8)+trail);

        for(;;)
        {
            lead=fgetc(fin);
            if(lead==EOF)
            {
                /* end of input: flush the pending token instead of dropping */
                /* it; bytes read past the last match are re-read by the */
                /* outer loop */
                output_buffer(fin,fout,buffer,\
                    last_match_offset,cur_file_offset);
                break;
            }

            if(ascii_char(lead))
            {
                if(last_match_offset==cur_file_offset)
                {
                    /* nothing pending: emit the word, then the character */
                    output_buffer(fin,fout,buffer,\
                        last_match_offset,cur_file_offset);
                    fputc(lead,fout);
                    fputc('\n',fout);
                }
                else
                {
                    /* unmatched characters are pending: step back over them */
                    /* AND over this single byte, so the stream stays aligned */
                    /* on a character boundary and order is preserved */
                    output_buffer(fin,fout,buffer,\
                        last_match_offset,cur_file_offset+1);
                }
                break;
            }

            trail=fgetc(fin);
            if(trail==EOF)
            {
                /* truncated character: treat as "no match" and un-read */
                /* the stray lead byte as well */
                output_buffer(fin,fout,buffer,\
                    last_match_offset,cur_file_offset+1);
                break;
            }
            value=(lead<<8)+trail;
            cur_file_offset+=2;

            /* look for the character among the children of head */
            for(node=head->child;node!=NULL&&node->value!=value;node=node->sibling)
                ;

            /* no continuation in the dictionary, or no room left in buffer: */
            /* emit the longest word seen and restart right after it */
            if(node==NULL||cur_buffer_offset+2>BUFFER_SIZE)
            {
                output_buffer(fin,fout,buffer,\
                    last_match_offset,cur_file_offset);
                break;
            }

            buffer[cur_buffer_offset++]=(unsigned char)lead;
            buffer[cur_buffer_offset++]=(unsigned char)trail;
            if(node->eof)/* a longer word ends here */
                last_match_offset=cur_buffer_offset;
            head=node;
        }
    }
    return ;
}

#include <stdio.h>
#include <stdlib.h>
#include "trie.h"

/* Driver: load the dictionary from vocabulary.txt, segment in.txt into */
/* out.txt, then release the index. Exits with status 1 when the dictionary */
/* cannot be loaded or when in.txt / out.txt cannot be opened. */
int main(void)
{
    FILE *findex,*fin,*fout;

    findex=fopen("vocabulary.txt","r");

    /*init the index first*/
    /*the file format should be like this*/
    /* ********* ('\n') */
    /* ********* ('\n') */
    /* ................ */
    /*   ********* ('\n') */
    /*be sure that the file ends with a line break*/

    init_index();
    if((findex==NULL)||(create_index(findex)==0))
    {
        printf("insert index error\n");
        getchar();
        exit(1);
    }
    fclose(findex);
    printf("OK\n");

    /* segment in.txt with the trie-based word segmentation algorithm */
    /* in.txt is opened in binary mode because search_index steps back in */
    /* the file with relative fseek offsets, which are only dependable on */
    /* binary streams */
    fin=fopen("in.txt","rb");
    fout=fopen("out.txt","w+");

    /* both streams are required; the output file was previously unchecked */
    if(fin==NULL||fout==NULL)
    {
        printf("search index error\n");
        getchar();
        exit(1);
    }
    search_index(fin,fout);
    printf("OK\n");
    free_index();

    fclose(fin);
    fclose(fout);

    getchar();
    return 0;
}