26、数据结构笔记之二十六串应用之建立词索引表

26、数据结构笔记之二十六串应用之建立词索引表

           “生命是一条美丽而曲折的幽径,路旁有妍花的丽蝶,累累的美果,但我们很少去停留观赏,或咀嚼它,只一心一意地渴望赶到我们幻想中更加美丽的豁然开朗的大道。然而在前进的程途中,却逐渐树影凄凉,花蝶匿迹,果实无存,最后终于发觉到达一个荒漠。-- 萨拉”

 

1.  信息检索

信息检索是计算机应用的重要领域之一。为了提高图书馆书目检索的效率,可以建立书名关键词索引,实现书目检索的自动化:读者根据关键词索引表,就可以方便地查询到自己感兴趣的书目。

           准备一个BookInfo.txt文件,内容如下。注意最后一行后面不要再有空行,不然程序会出错。

2.  BookInfo.txt

005Computer Data Structure

010Introduction to Data Structure

023Fundamental of Data Structure

034The Design and Analysis of Computer Algorithms

050Introduction to Numerical Analysis

067Numerical Analysis

3.  BookIdx.txt

最后代码运行完毕会生成如下的文本:

关键字              书号        

algorithms          034,

analysis            034,050,067,

computer            005,034,

data                005,010,023,

design              034,

fundamental         023,

introduction        010,050,

numerical           050,067,

structure           005,010,023,

 

 

4.  代码具体实现

4.1      定义

定义如下

/* Capacity limits used by the keyword index. */
#define MaxBookNum 1000   /* assume at most 1000 books are indexed */
#define MaxKeyNum  2500   /* maximum capacity of the index table */
#define MaxLineLen 500    /* maximum length of one catalogue line */
#define MaxWordNum 10     /* maximum capacity of the per-title word list */

/* Word list (sequential list) holding the keywords collected so far. */
typedef struct {
    char *item[MaxKeyNum];   /* array of pointers to keyword strings */
    int last;                /* number of words stored in item[] */
} WordListType;

/* Node of the singly linked list that stores book numbers. */
typedef struct LNode {
    int data[3];             /* one book number, one decimal digit per slot */
    struct LNode *next;
} LNode, *LinkList;

/* String that holds one keyword. */
typedef struct {
    char *ch;                /* keyword text */
    int length;
} HString;

/* One index entry: a keyword and the list of book numbers containing it. */
typedef struct {
    HString *key;
    LinkList bnolist;
} IdxTermType;

/* Index table (ordered list of index entries). */
typedef struct {
    IdxTermType item[MaxKeyNum + 1];
    int last;                /* number of entries currently in use */
} IdxListType;

char *buf;                   /* buffer holding the current catalogue line */
WordListType wdlist;         /* word list */
int i;                       /* number of words stored in wdlist (next free slot) */
int b;                       /* value of i before the current line was processed */
int v;                       /* next free row in GetLine's line storage */

/* Common words that must not become keywords; unused trailing slots are NULL. */
char *com[10] = {"and","a","an","the","a","an","to","of"};

 

 

 

4.2      InitIdxList

初始化操作,置索引表idxlist为空表,且在idxlist.item[0]设一空串。

/*
 * Create an empty index table.
 *
 * The incoming pointer value is ignored; a new table is always allocated.
 * Entry 0 is preallocated with an empty key and an empty head node so that
 * the first insertion can fill it in directly.
 *
 * Returns the new table. Prints an error and exits if any allocation fails.
 */
IdxListType *InitIdxList(IdxListType *idxlist)
{
    idxlist = malloc(sizeof *idxlist);
    if (idxlist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key = malloc(sizeof(HString));
    /* calloc so the head node's digits start out as zero, not garbage */
    idxlist->item[0].bnolist = calloc(1, sizeof(LNode));
    if (idxlist->item[0].key == NULL || idxlist->item[0].bnolist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key->ch = NULL;
    idxlist->item[0].key->length = 0;
    idxlist->item[0].bnolist->next = NULL;
    idxlist->last = 0;
    return idxlist;
}

 

 

 

4.3      GetLine

从文件f读入一个书目信息到书目缓冲区buf

其中v,buf是一个全局变量。

当前设置的v最大是100,就是最多读取100行书的信息。

void GetLine(FILE *f)

{

           int j= 0;

           staticchara[100][100];

           charc;

           c =fgetc(f);

           while(((a[v][j++]= tolower(c) ) != '\n') && (c != EOF))

                     c= fgetc(f);

           //printf("%c",a[v][j- 1]);

           a[v][j-1]= '\0';

           buf= a[v++];

           printf("%s",buf);

}

 

4.4      ExtractKeyWord

从buf中提取书目关键词到词表wdlist,书号存入bno。

buf中存放的是从文本文件中读出的一行字符串。

int ExtractKeyWord(intbno[])

{

           intk,s,t = 0,j = 0;

           b =i;

           char*p = buf,*q,*c[10];

           while(!isalnum(*p))

                     p++;

           while(isdigit(*p) || !isalpha(*p))

           {

                     bno[j]= *p - '0';

                     p++;j++;

           }

           j--;

//处理完书号,开始处理中间空格,j可以用来表示书号的位数。

           while(!isalpha(*p))

                     p++;

//处理完空格,处理书名

           q =p;

           while(*q)

           {

                     while((*q!= ' ') && *q && (*q != EOF))

                                q++;

                     q++;

                     *(q- 1) = '\0';

                     c[t++]= p;

                     p= q;

           }

//将每个书名的词,存放到数组c中。

//对比数组c和数组com中存放的是8个字符串,com数组中存放的是常用的字节,不能用来当做关键词。

           for(k= 0; k < t; ++k)

           {

                     for(s= 0; s < 8; ++s)

                     {

                                if(strcmp(c[k],com[s])== 0)

                                          break;

                     }

                     if(s== 8)

                                wdlist.item[i++]= c[k];

           }

//设定当前关键词的数量

           wdlist.last= i;

           printf("词表的长度%d ",wdlist.last);

           printf("书号:");

           for(k= 0; k < j; ++k)

                     printf("%d",bno[k]);

           printf("\n");

           for(k = b; k < i; ++k )

                     printf("关键字%s\n",wdlist.item[k]);

           return0;

}

4.5      InsIdxList

将书号bno的书名关键词按词典顺序插入索引表idxlist

/*
 * Insert the keywords of the current book into the index table, keeping the
 * table sorted in dictionary order.
 *
 * The keywords are wdlist.item[b .. i-1]; bno[0..2] is the book number.
 * For a keyword already in the table the book number is appended to that
 * entry's list, unless the list already ends with the same number (the word
 * occurred twice in one title). Otherwise a new entry is inserted at its
 * sorted position.
 *
 * Every entry in 0..last-1 owns its own key and head node; entry 0 is the
 * one preallocated by InitIdxList.
 *
 * Prints an error and exits if the table is full or an allocation fails.
 * Always returns 0.
 */
int InsIdxList(IdxListType *idxlist, int bno[])
{
    int c, d, k, s, t;

    /* b is the value of i before this line; i is the current word count. */
    if (i >= MaxKeyNum + 1) {
        printf("超过索引表最大存储,请调整!\n");
        exit(-1);
    }

    t = idxlist->last;
    for (c = b; c < i; c++) {
        char *word = wdlist.item[c];
        int cmp = 1;

        if (t == 0) {
            /* First keyword ever: fill in the preallocated entry 0. */
            IdxTermType *first = &idxlist->item[0];

            if (first->key == NULL || first->bnolist == NULL) {
                printf("ERROR!Index table is not initialised\n");
                exit(-1);
            }
            first->key->ch = word;
            first->key->length = (int)strlen(word);
            for (d = 0; d < 3; ++d)
                first->bnolist->data[d] = bno[d];
            first->bnolist->next = NULL;
            t = ++idxlist->last;
            continue;
        }

        /* Find the first entry that is not smaller than the new word. */
        for (k = 0; k < t; k++) {
            cmp = strcmp(word, idxlist->item[k].key->ch);
            if (cmp <= 0)
                break;
        }

        if (k < t && cmp == 0) {
            /* Keyword exists: append the book number to its list. */
            LinkList head = idxlist->item[k].bnolist;
            LinkList tail = head;
            LinkList node, q;

            while (tail->next != NULL)
                tail = tail->next;
            if (tail->data[0] == bno[0] && tail->data[1] == bno[1] &&
                tail->data[2] == bno[2])
                continue;   /* same word twice in one title: already listed */

            node = malloc(sizeof *node);
            if (node == NULL) {
                printf("ERROR!Out of memory\n");
                exit(-1);
            }
            printf("\n插入的书号:");
            for (d = 0; d < 3; ++d) {
                node->data[d] = bno[d];
                printf("%d", node->data[d]);
            }
            for (d = 0; d < 3; ++d)
                fprintf(stdout, "%d", head->data[d]);
            node->next = NULL;
            tail->next = node;

            /* Trace output: every book number now stored under this keyword. */
            fprintf(stdout, "\n关键字%-20s\t\n", idxlist->item[k].key->ch);
            for (q = head; q != NULL; q = q->next) {
                for (d = 0; d < 3; ++d)
                    fprintf(stdout, "%d", q->data[d]);
                if (q->next != NULL)
                    fprintf(stdout, ",");
            }
            printf("\n");
            continue;
        }

        /* New keyword: it belongs at position k (k == t means "at the end"). */
        if (t > MaxKeyNum) {
            printf("超过索引表最大存储,请调整!\n");
            exit(-1);
        }
        {
            HString *key = malloc(sizeof *key);
            LinkList head = malloc(sizeof *head);

            if (key == NULL || head == NULL) {
                printf("ERROR!Out of memory\n");
                exit(-1);
            }
            /* Move the larger entries (key and list pointers) one slot up. */
            for (s = t; s > k; --s)
                idxlist->item[s] = idxlist->item[s - 1];

            key->ch = word;
            key->length = (int)strlen(word);
            for (d = 0; d < 3; ++d)
                head->data[d] = bno[d];
            head->next = NULL;
            idxlist->item[k].key = key;
            idxlist->item[k].bnolist = head;
            t = ++idxlist->last;
        }
    }
    return 0;
}

 

4.6      PutText

将生成的索引表idxlist输出到文件g

/*
 * Write the index table to file g and echo the same text to stdout.
 *
 * Output is a header line followed by one line per keyword: the keyword
 * padded to 20 columns, a tab, then its book numbers separated by commas.
 * The table is only read; each list is walked with a local cursor so the
 * lists stay intact for later use and for Idxlist_free.
 */
void PutText(FILE *g, IdxListType *idxlist)
{
    int k, t;
    LinkList node;

    fprintf(g, "%-20s%-60s\n", "关键字", "书号");
    fprintf(stdout, "%-20s%-60s\n", "关键字", "书号");

    for (k = 0; k < idxlist->last; k++) {
        fprintf(g, "%-20s\t", idxlist->item[k].key->ch);
        fprintf(stdout, "%-20s\t", idxlist->item[k].key->ch);

        for (node = idxlist->item[k].bnolist; node != NULL; node = node->next) {
            for (t = 0; t < 3; ++t) {
                fprintf(g, "%d", node->data[t]);
                fprintf(stdout, "%d", node->data[t]);
            }
            if (node->next != NULL) {
                fprintf(stdout, ",");
                fprintf(g, ",");
            }
        }

        fprintf(g, "\n");
        fprintf(stdout, "\n");
    }
}

 

 

 

4.7      Idxlist_free

释放索引表空间

 

/*
 * Release an index table created by InitIdxList, including the table itself.
 *
 * Frees the key and the whole book-number list of every entry in use. Entry 0
 * is preallocated by InitIdxList, so it is freed even when the table is
 * empty. The keyword text (key->ch) is not freed because it points into the
 * line storage, not into heap memory. Passing NULL is a no-op. The pointer
 * must not be used after this call.
 */
void Idxlist_free(IdxListType *idxlist)
{
    int k, used;
    LinkList p, q;

    if (idxlist == NULL)
        return;

    used = (idxlist->last > 0) ? idxlist->last : 1;
    for (k = 0; k < used; k++) {
        free(idxlist->item[k].key);
        idxlist->item[k].key = NULL;
        for (p = idxlist->item[k].bnolist; p; p = q) {
            q = p->next;   /* read the link before the node is freed */
            free(p);
        }
        idxlist->item[k].bnolist = NULL;
    }
    idxlist->last = 0;
    free(idxlist);
}

4.8      Main

看下主函数,定义一个索引表类型结构体。定义文件句柄变量 f,g,以及int 数组。其中f为BookInfo.txt句柄,g为BookIdx.txt句柄。

然后调用函数InitIdxList来实现初始化,从句柄f对应的文件中读取一行字符串,提取其中的关键词,然后在索引表中将关键词和书号对应起来。

循环往复,将每行的关键词都读出来,然后将关键词和书号对应起来,如果关键词已经出现在了之前的关键词中,则将书号关联到已出现的关键词后。

最后将关键词和书号输出到一个文件中。

最后释放索引表,关闭文件句柄,退出。

int main()

{

           IdxListType*idxlist = NULL ;

           FILE*f, *g;

           intBookNo[5];

           if((f= fopen("BookInfo.txt", "r"))== NULL)

           {

                     printf("ERROR!Can not open BookInfo.txt");

                     return0;

           }

           else

           {

 

                     if((g= fopen("BookIdx.txt","w"))== NULL)

                     {

                                printf("ERROR!Can not open BookIdx.txt");

                                return0;

                     }

                     else

                     {

                                idxlist= InitIdxList(idxlist);                           //初始化索引表idxlist为空表

                                while(!feof(f))

                                {

                                          GetLine(f);                                          //从文件f读入一个书目信息到buf

                                          ExtractKeyWord(BookNo);                              //buf中提取关键词到词表,书号存入BookNo

                                          InsIdxList(idxlist,BookNo);                         //将书号为BookNo的关键词插入索引表

                                          printf("\ni=%d,b=%d\n",i,b);

                                }

                                PutText(g,idxlist);                                      //将生成的索引表idxlist输出到文件g

                     }

           }

           Idxlist_free(idxlist);

           fclose(f);

           fclose(g);

           return0;

}

 

 

 

5.  源码

#include<stdio.h>

#include<stdlib.h>

#include<string.h>

#include<ctype.h>

/* Capacity limits used by the keyword index. */
#define MaxBookNum 1000   /* assume at most 1000 books are indexed */
#define MaxKeyNum  2500   /* maximum capacity of the index table */
#define MaxLineLen 500    /* maximum length of one catalogue line */
#define MaxWordNum 10     /* maximum capacity of the per-title word list */

/* Word list (sequential list) holding the keywords collected so far. */
typedef struct {
    char *item[MaxKeyNum];   /* array of pointers to keyword strings */
    int last;                /* number of words stored in item[] */
} WordListType;

/* Node of the singly linked list that stores book numbers. */
typedef struct LNode {
    int data[3];             /* one book number, one decimal digit per slot */
    struct LNode *next;
} LNode, *LinkList;

/* String that holds one keyword. */
typedef struct {
    char *ch;                /* keyword text */
    int length;
} HString;

/* One index entry: a keyword and the list of book numbers containing it. */
typedef struct {
    HString *key;
    LinkList bnolist;
} IdxTermType;

/* Index table (ordered list of index entries). */
typedef struct {
    IdxTermType item[MaxKeyNum + 1];
    int last;                /* number of entries currently in use */
} IdxListType;

char *buf;                   /* buffer holding the current catalogue line */
WordListType wdlist;         /* word list */
int i;                       /* number of words stored in wdlist (next free slot) */
int b;                       /* value of i before the current line was processed */
int v;                       /* next free row in GetLine's line storage */

/* Common words that must not become keywords; unused trailing slots are NULL. */
char *com[10] = {"and","a","an","the","a","an","to","of"};

 

/*
 * Create an empty index table.
 *
 * The incoming pointer value is ignored; a new table is always allocated.
 * Entry 0 is preallocated with an empty key and an empty head node so that
 * the first insertion can fill it in directly.
 *
 * Returns the new table. Prints an error and exits if any allocation fails.
 */
IdxListType *InitIdxList(IdxListType *idxlist)
{
    idxlist = malloc(sizeof *idxlist);
    if (idxlist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key = malloc(sizeof(HString));
    /* calloc so the head node's digits start out as zero, not garbage */
    idxlist->item[0].bnolist = calloc(1, sizeof(LNode));
    if (idxlist->item[0].key == NULL || idxlist->item[0].bnolist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key->ch = NULL;
    idxlist->item[0].key->length = 0;
    idxlist->item[0].bnolist->next = NULL;
    idxlist->last = 0;
    return idxlist;
}

void GetLine(FILE *f)

           //从文件f读入一个书目信息到书目缓冲区buf

{

           int j= 0;

           staticchara[100][100];

           charc;

           c =fgetc(f);

           while(((a[v][j++]= tolower(c) ) != '\n') && (c != EOF))

                     c= fgetc(f);

           //printf("%c",a[v][j- 1]);

           a[v][j-1]= '\0';

           buf= a[v++];

           printf("%s",buf);

}

int ExtractKeyWord(intbno[])

           //buf中提取书目关键词到词表wdlist,书号存入bno

{

           intk,s,t = 0,j = 0;

           b =i;

           char*p = buf,*q,*c[10];

           while(!isalnum(*p))

                     p++;

           while(isdigit(*p) || !isalpha(*p))

           {

                     bno[j]= *p - '0';

                     p++;j++;

           }

           j--;

           while(!isalpha(*p))

                     p++;

           q =p;

           while(*q)

           {

                     while((*q!= ' ') && *q && (*q != EOF))

                                q++;

                     q++;

                     *(q- 1) = '\0';

                     c[t++]= p;

                     p= q;

           }

           for(k= 0; k < t; ++k)

           {

                     for(s= 0; s < 8; ++s)

                     {

                                if(strcmp(c[k],com[s])== 0)

                                          break;

                     }

                     if(s== 8)

                                wdlist.item[i++]= c[k];

           }

           wdlist.last= i;

           printf("词表的长度%d ",wdlist.last);

           printf("书号:");

           for(k= 0; k < j; ++k)

                     printf("%d",bno[k]);

           printf("\n");

           for(k = b; k < i; ++k )

                     printf("关键字%s\n",wdlist.item[k]);

           return0;

}

/*
 * Insert the keywords of the current book into the index table, keeping the
 * table sorted in dictionary order.
 *
 * The keywords are wdlist.item[b .. i-1]; bno[0..2] is the book number.
 * For a keyword already in the table the book number is appended to that
 * entry's list, unless the list already ends with the same number (the word
 * occurred twice in one title). Otherwise a new entry is inserted at its
 * sorted position.
 *
 * Every entry in 0..last-1 owns its own key and head node; entry 0 is the
 * one preallocated by InitIdxList.
 *
 * Prints an error and exits if the table is full or an allocation fails.
 * Always returns 0.
 */
int InsIdxList(IdxListType *idxlist, int bno[])
{
    int c, d, k, s, t;

    /* b is the value of i before this line; i is the current word count. */
    if (i >= MaxKeyNum + 1) {
        printf("超过索引表最大存储,请调整!\n");
        exit(-1);
    }

    t = idxlist->last;
    for (c = b; c < i; c++) {
        char *word = wdlist.item[c];
        int cmp = 1;

        if (t == 0) {
            /* First keyword ever: fill in the preallocated entry 0. */
            IdxTermType *first = &idxlist->item[0];

            if (first->key == NULL || first->bnolist == NULL) {
                printf("ERROR!Index table is not initialised\n");
                exit(-1);
            }
            first->key->ch = word;
            first->key->length = (int)strlen(word);
            for (d = 0; d < 3; ++d)
                first->bnolist->data[d] = bno[d];
            first->bnolist->next = NULL;
            t = ++idxlist->last;
            continue;
        }

        /* Find the first entry that is not smaller than the new word. */
        for (k = 0; k < t; k++) {
            cmp = strcmp(word, idxlist->item[k].key->ch);
            if (cmp <= 0)
                break;
        }

        if (k < t && cmp == 0) {
            /* Keyword exists: append the book number to its list. */
            LinkList head = idxlist->item[k].bnolist;
            LinkList tail = head;
            LinkList node, q;

            while (tail->next != NULL)
                tail = tail->next;
            if (tail->data[0] == bno[0] && tail->data[1] == bno[1] &&
                tail->data[2] == bno[2])
                continue;   /* same word twice in one title: already listed */

            node = malloc(sizeof *node);
            if (node == NULL) {
                printf("ERROR!Out of memory\n");
                exit(-1);
            }
            printf("\n插入的书号:");
            for (d = 0; d < 3; ++d) {
                node->data[d] = bno[d];
                printf("%d", node->data[d]);
            }
            for (d = 0; d < 3; ++d)
                fprintf(stdout, "%d", head->data[d]);
            node->next = NULL;
            tail->next = node;

            /* Trace output: every book number now stored under this keyword. */
            fprintf(stdout, "\n关键字%-20s\t\n", idxlist->item[k].key->ch);
            for (q = head; q != NULL; q = q->next) {
                for (d = 0; d < 3; ++d)
                    fprintf(stdout, "%d", q->data[d]);
                if (q->next != NULL)
                    fprintf(stdout, ",");
            }
            printf("\n");
            continue;
        }

        /* New keyword: it belongs at position k (k == t means "at the end"). */
        if (t > MaxKeyNum) {
            printf("超过索引表最大存储,请调整!\n");
            exit(-1);
        }
        {
            HString *key = malloc(sizeof *key);
            LinkList head = malloc(sizeof *head);

            if (key == NULL || head == NULL) {
                printf("ERROR!Out of memory\n");
                exit(-1);
            }
            /* Move the larger entries (key and list pointers) one slot up. */
            for (s = t; s > k; --s)
                idxlist->item[s] = idxlist->item[s - 1];

            key->ch = word;
            key->length = (int)strlen(word);
            for (d = 0; d < 3; ++d)
                head->data[d] = bno[d];
            head->next = NULL;
            idxlist->item[k].key = key;
            idxlist->item[k].bnolist = head;
            t = ++idxlist->last;
        }
    }
    return 0;
}

/*
 * Write the index table to file g and echo the same text to stdout.
 *
 * Output is a header line followed by one line per keyword: the keyword
 * padded to 20 columns, a tab, then its book numbers separated by commas.
 * The table is only read; each list is walked with a local cursor so the
 * lists stay intact for later use and for Idxlist_free.
 */
void PutText(FILE *g, IdxListType *idxlist)
{
    int k, t;
    LinkList node;

    fprintf(g, "%-20s%-60s\n", "关键字", "书号");
    fprintf(stdout, "%-20s%-60s\n", "关键字", "书号");

    for (k = 0; k < idxlist->last; k++) {
        fprintf(g, "%-20s\t", idxlist->item[k].key->ch);
        fprintf(stdout, "%-20s\t", idxlist->item[k].key->ch);

        for (node = idxlist->item[k].bnolist; node != NULL; node = node->next) {
            for (t = 0; t < 3; ++t) {
                fprintf(g, "%d", node->data[t]);
                fprintf(stdout, "%d", node->data[t]);
            }
            if (node->next != NULL) {
                fprintf(stdout, ",");
                fprintf(g, ",");
            }
        }

        fprintf(g, "\n");
        fprintf(stdout, "\n");
    }
}

/*
 * Release an index table created by InitIdxList, including the table itself.
 *
 * Frees the key and the whole book-number list of every entry in use. Entry 0
 * is preallocated by InitIdxList, so it is freed even when the table is
 * empty. The keyword text (key->ch) is not freed because it points into the
 * line storage, not into heap memory. Passing NULL is a no-op. The pointer
 * must not be used after this call.
 */
void Idxlist_free(IdxListType *idxlist)
{
    int k, used;
    LinkList p, q;

    if (idxlist == NULL)
        return;

    used = (idxlist->last > 0) ? idxlist->last : 1;
    for (k = 0; k < used; k++) {
        free(idxlist->item[k].key);
        idxlist->item[k].key = NULL;
        for (p = idxlist->item[k].bnolist; p; p = q) {
            q = p->next;   /* read the link before the node is freed */
            free(p);
        }
        idxlist->item[k].bnolist = NULL;
    }
    idxlist->last = 0;
    free(idxlist);
}

int main()

{

           IdxListType*idxlist = NULL ;

           FILE*f, *g;

           intBookNo[5];

           if((f= fopen("BookInfo.txt", "r"))== NULL)

           {

                     printf("ERROR!Can not open BookInfo.txt");

                     return0;

           }

           else

           {

 

                     if((g= fopen("BookIdx.txt","w"))== NULL)

                     {

                                printf("ERROR!Can not open BookIdx.txt");

                                return0;

                     }

                     else

                     {

                                idxlist= InitIdxList(idxlist);                           //初始化索引表idxlist为空表

                                while(!feof(f))

                                {

                                          GetLine(f);                                          //从文件f读入一个书目信息到buf

                                          ExtractKeyWord(BookNo);                              //buf中提取关键词到词表,书号存入BookNo

                                          InsIdxList(idxlist,BookNo);                         //将书号为BookNo的关键词插入索引表

                                          printf("\ni=%d,b=%d\n",i,b);

                                }

                                PutText(g,idxlist);                                      //将生成的索引表idxlist输出到文件g

                     }

           }

           Idxlist_free(idxlist);

           fclose(f);

           fclose(g);

           return0;

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值