/*
 * msrch.c
 * Purpose:  search text for multiple keywords simultaneously
 * Switches: MAXCHAR - maximum number of symbols recognized
 *           DRIVER  - historically documented as enabling the test
 *                     driver.  NOTE(review): this file contains no
 *                     #ifdef DRIVER; the sample driver at the bottom
 *                     is always compiled.
 * Usage:    The sample driver illustrates all the key points.
 *           There are three routines:
 *           (1) MsrchInit( struct kword* ) is passed list of
 *               words to search for
 *           (2) MsrchGo( int(*MsrchData)(),
 *                        void (*MsrchSignal)(char*) );
 *               does the work. It uses two pointers to functions:
 *               the first retrieves a character, the second is
 *               called when a match is found.
 *           (3) MsrchEnd( void ) cleans up the work areas
 *
 * Error policy: an allocation failure or an overflow of the state
 * tables prints a message on stderr and calls exit( EXIT_FAILURE ).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kword                    /* linked list of keywords */
{
    unsigned char *word;
    struct kword  *next;
};

#define MAXCHAR 256     /* max number of different chars we search for */

static int MaxState;    /* max number of states we have room for */

/*
 * First level of matching, indexed by state.
 * Possible values:
 *      (1) EMPTY_SLOT  -2   state has no outgoing transition
 *      (2) a character      state has exactly one outgoing transition,
 *                           taken on this character
 *      (3) MULTI_WAY   -1   state owns a MAXCHAR-entry branch table
 */
static int *MatchArray;

#define MULTI_WAY   (-1)        /* flags for MatchArray */
#define EMPTY_SLOT  (-2)

/*
 * Values in MatchArray take us here:
 *      GotoState   - used when MatchArray[state] is a character
 *      BranchTable - used when MatchArray[state] is MULTI_WAY
 */
union GoToTable
{
    int  GotoState;
    int *BranchTable;
};
static union GoToTable *GotoArray;

#define FAIL_STATE  (-1)  /* in GotoState or BranchTable, this means failure */

/*
 * OutArray[] is the Output function:
 * list of keywords 'found' by each state
 */
static struct kword **OutArray;

/*
 * FailArray[] is the Fail function:
 * failure transition array
 */
static int *FailArray;

/* variable to track next free state */
static int HighState;

/* functions we use */
static void *SafeAlloc( size_t nbytes );
static void AddStateTrans( int, int, int );
static void ComputeFail( void );
static void Enter( unsigned char * );
static void FindFail( int state, int s, int a );
static void QueueAdd( int *queue, int qbeg, int new_state );

/*
 * malloc() wrapper: never returns NULL.  On failure it reports the
 * problem on stderr and terminates the program, which is the same
 * policy Enter() uses when the state tables overflow.
 */
static void *SafeAlloc( size_t nbytes )
{
    void *p = malloc( nbytes != 0 ? nbytes : 1 );

    if (p == NULL) {
        fputs( "INTERNAL ERROR: out of memory\n", stderr );
        exit( EXIT_FAILURE );
    }
    return p;
}

/*
 * Set up the tables needed by MsrchGo().
 *
 * klist: NULL-terminated linked list of keywords.  The word pointers
 *        are stored (not copied) in the output lists, so the strings
 *        must stay valid until MsrchEnd() has been called.
 */
void MsrchInit( struct kword *klist )
{
    int i;
    struct kword *ktemp;

    /* compute maximum number of possible states:
     * state 0 plus one state per keyword character */
    MaxState = 1;
    for (ktemp = klist; ktemp != NULL; ktemp = ktemp->next)
        MaxState += (int)strlen( (const char *)ktemp->word );

    /* allocate space for arrays */
    MatchArray = SafeAlloc( sizeof *MatchArray * (size_t)MaxState );
    GotoArray  = SafeAlloc( sizeof *GotoArray  * (size_t)MaxState );
    OutArray   = SafeAlloc( sizeof *OutArray   * (size_t)MaxState );
    FailArray  = SafeAlloc( sizeof *FailArray  * (size_t)MaxState );

    /* initialize state arrays */
    for (i = 0; i < MaxState; i++) {
        MatchArray[i] = EMPTY_SLOT;
        OutArray[i]   = NULL;
        FailArray[i]  = 0;      /* defined value even for unused states */
    }

    /* initialize state 0: two dummy transitions on different
     * characters force it to own a multi-way table */
    HighState = 0;
    AddStateTrans( 0, 'a', FAIL_STATE );
    AddStateTrans( 0, 'b', FAIL_STATE );

    /* step through keywords */
    for (; klist != NULL; klist = klist->next)
        Enter( klist->word );

    /* setup return to zero transitions for state 0 */
    for (i = 0; i < MAXCHAR; i++)
        if (GotoArray[0].BranchTable[i] == FAIL_STATE)
            GotoArray[0].BranchTable[i] = 0;

    /* and compute failure array */
    ComputeFail();
}

/* add transition from OldState -> NewState for MatchChar */
static void AddStateTrans( int OldState, int MatchChar, int NewState )
{
    int i, *temp;

    if (MatchArray[OldState] == EMPTY_SLOT) {
        /* empty slot: store as a single-character transition */
        MatchArray[OldState] = MatchChar;
        GotoArray[OldState].GotoState = NewState;
    } else if (MatchArray[OldState] == MULTI_WAY) {
        /* there already is a multi-way table: just fill the entry */
        GotoArray[OldState].BranchTable[MatchChar] = NewState;
    } else {
        /* second transition out of a single-character state:
         * promote it to a multi-way table */
        temp = SafeAlloc( sizeof *temp * MAXCHAR );
        for (i = 0; i < MAXCHAR; i++)
            temp[i] = FAIL_STATE;

        /* copy data from single way branch */
        temp[MatchArray[OldState]] = GotoArray[OldState].GotoState;

        /* and new data */
        temp[MatchChar] = NewState;

        /* and load it all into the state arrays */
        MatchArray[OldState] = MULTI_WAY;
        GotoArray[OldState].BranchTable = temp;
    }
}

/* add kword to list of words our machine recognizes */
static void Enter( unsigned char *kword )
{
    int state, k;
    unsigned char *save;
    struct kword *ktemp;

    state = 0;
    save = kword;               /* keep a copy */

    /*
     * first, see whether we can place this word
     * on top of an existing one:
     * follow the prefix that is already in the goto tables
     */
    for (; *kword != '\0'; kword++) {
        if (MatchArray[state] == *kword) {
            /* single char slot that matches */
            state = GotoArray[state].GotoState;
        } else if (MatchArray[state] == MULTI_WAY) {
            if ((k = GotoArray[state].BranchTable[*kword]) == FAIL_STATE)
                break;
            state = k;          /* we have a transition for this char */
        } else {
            break;
        }
    }

    /* now add new states as needed */
    for (; *kword != '\0'; kword++) {
        HighState += 1;
        if (HighState >= MaxState) {    /* uh-oh... */
            fputs( "INTERNAL ERROR: too many states\n", stderr );
            exit( EXIT_FAILURE );
        }
        AddStateTrans( state, *kword, HighState );
        state = HighState;
    }

    /* now add this keyword to output list for final state */
    ktemp = SafeAlloc( sizeof *ktemp );
    ktemp->word = save;
    ktemp->next = OutArray[state];
    OutArray[state] = ktemp;
}

/* build FailArray and merge the output lists, breadth first */
static void ComputeFail( void )
{
    int *queue, qbeg, r, s;
    int i;

    /* allocate a queue; see QueueAdd() for its representation */
    queue = SafeAlloc( sizeof *queue * (size_t)MaxState );
    qbeg = 0;
    queue[0] = 0;

    /* scan first level and setup initial values for FailArray */
    for (i = 0; i < MAXCHAR; i++) {
        if ((s = GotoArray[0].BranchTable[i]) != 0) {
            FailArray[s] = 0;
            QueueAdd( queue, qbeg, s );
        }
    }

    /* now scan lower levels */
    while (queue[qbeg] != 0) {
        /* pull off state from front of queue and advance qbeg */
        r = queue[qbeg];
        qbeg = r;

        /* now investigate this state */
        if (MatchArray[r] == EMPTY_SLOT) {
            continue;           /* no more to do */
        } else if (MatchArray[r] == MULTI_WAY) {
            /* scan its subsidiary states */
            for (i = 0; i < MAXCHAR; i++) {
                if ((s = GotoArray[r].BranchTable[i]) != FAIL_STATE) {
                    /* add new state to queue */
                    QueueAdd( queue, qbeg, s );
                    FindFail( FailArray[r], s, i );
                }
            }
        } else {                /* single char */
            QueueAdd( queue, qbeg, GotoArray[r].GotoState );
            FindFail( FailArray[r], GotoArray[r].GotoState, MatchArray[r] );
        }
    }

    /* tidy up */
    free( queue );
}

/*
 * Actually compute failure transition.  We know that 'a'
 * would normally cause us to go from state s1 to s2.
 * To compute the failure values, we backtrack in search
 * of other places 'a' might go.
 *
 * The walk always terminates: state 0 owns a multi-way table in
 * which MsrchInit() replaced every FAIL_STATE entry by 0.
 */
static void FindFail( int s1, int s2, int a )
{
    int on_fail;
    struct kword *ktemp, kdummy, *out_copy, *kscan;

    for (;; s1 = FailArray[s1]) {
        if (MatchArray[s1] == a) {
            if ((on_fail = GotoArray[s1].GotoState) != FAIL_STATE)
                break;
        } else if (MatchArray[s1] == MULTI_WAY) {
            if ((on_fail = GotoArray[s1].BranchTable[a]) != FAIL_STATE)
                break;
        }
    }
    FailArray[s2] = on_fail;

    /*
     * merge output lists:
     * first, make a copy of OutArray[on_fail] (non-NULL when on_fail
     * is itself a state that reports keywords)
     */
    if (OutArray[on_fail] == NULL) {
        out_copy = NULL;
    } else {
        kscan = OutArray[on_fail];
        out_copy = SafeAlloc( sizeof *out_copy );
        out_copy->word = kscan->word;
        out_copy->next = NULL;

        /* copy the rest of the chain; each node is inserted
         * directly behind the head of the copy */
        for (kscan = kscan->next; kscan != NULL; kscan = kscan->next) {
            ktemp = SafeAlloc( sizeof *ktemp );
            ktemp->word = kscan->word;
            ktemp->next = out_copy->next;
            out_copy->next = ktemp;
        }
    }

    /* now merge them: append the copy to the tail of OutArray[s2] */
    if ((kdummy.next = OutArray[s2]) != NULL) {
        ktemp = &kdummy;
        for (; ktemp->next->next != NULL; ktemp = ktemp->next)
            ;
        ktemp->next->next = out_copy;
    } else {
        OutArray[s2] = out_copy;
    }
}

/*
 * Add new_state to end of queue.
 *
 * The queue is a linked list threaded through the array:
 * queue[x] holds the state that follows x, and 0 ends the list.
 * queue[qbeg] is therefore the current front element.
 */
static void QueueAdd( int *queue, int qbeg, int new_state )
{
    int q;

    q = queue[qbeg];
    if (q == 0) {               /* is list empty? */
        queue[qbeg] = new_state;        /* yes */
    } else {                    /* no: scan to the last link */
        for (; queue[q] != 0; q = queue[q])
            ;
        queue[q] = new_state;   /* put this state at end of queue */
    }

    /* and terminate list */
    queue[new_state] = 0;
}

/*
 * Do the actual search.
 *
 * MsrchData:   called for each input symbol; must return EOF at the
 *              end of the input, otherwise a value in 0..MAXCHAR-1.
 *              Any other value is treated as a symbol that occurs in
 *              no keyword: the machine restarts from state 0.
 * MsrchSignal: called once per keyword recognized at the current
 *              input position.
 */
void MsrchGo( int (*MsrchData)(), void (*MsrchSignal)(char *) )
{
    int state, c, g, m;
    struct kword *kscan;

    state = 0;
    while ((c = MsrchData()) != EOF) {
        /* never index the MAXCHAR-entry tables out of bounds */
        if (c < 0 || c >= MAXCHAR) {
            state = 0;
            continue;
        }

        /* what is goto (state, c)? */
        for (;;) {
            /*
             * we cheat slightly in the interest of
             * speed/simplicity.  The machine will spend most
             * of its time in state==0, and this state is
             * always a MULTI_WAY table.  Since this is a
             * simple test, we make it first and try to save
             * the calculation of array index
             */
            if (state == 0 || (m = MatchArray[state]) == MULTI_WAY)
                g = GotoArray[state].BranchTable[c];
            else if (m == c)
                g = GotoArray[state].GotoState;
            else
                g = FAIL_STATE;

            if (g != FAIL_STATE)
                break;

            state = FailArray[state];
        }
        state = g;

        /* anything to output? */
        for (kscan = OutArray[state]; kscan != NULL; kscan = kscan->next)
            MsrchSignal( (char *)kscan->word );
    }
}

/*
 * Free all the arrays we created.
 * Does nothing when no tables are allocated, so calling it twice or
 * without a prior MsrchInit() is harmless.
 */
void MsrchEnd( void )
{
    int i;
    struct kword *kscan;

    if (MatchArray == NULL)
        return;

    for (i = 0; i < MaxState; i++)
        if (MatchArray[i] == MULTI_WAY)
            free( GotoArray[i].BranchTable );

    free( MatchArray );
    free( GotoArray );
    free( FailArray );

    for (i = 0; i < MaxState; i++) {
        while (OutArray[i] != NULL) {
            kscan = OutArray[i];
            OutArray[i] = OutArray[i]->next;
            free( kscan );
        }
    }
    free( OutArray );

    MatchArray = NULL;
    GotoArray  = NULL;
    FailArray  = NULL;
    OutArray   = NULL;
}

/*
 * This test driver expects a command line of the form
 *      msrch file word-1 word-2 word-3 .... word-n
 *
 * It will then search file for all words on the command line.
 * The results are written to stdout.  This illustrates all the
 * features of using the multisearch routines.
 *
 * This is an admittedly simple design--the search routine would
 * certainly be faster if the character fetch routine was put
 * directly into the MsrchGo() module.  However, to avoid using
 * application-specific code in the demonstration version of
 * these routines, it is coded as a separate subroutine.
 */
#define BUFSIZE 200

FILE *infile;
char inbuf[BUFSIZE];
char *inbufptr;
int linecount;

/* declare the routines that MsrchGo will use */
int RetrieveChar( void );
void FoundWord( char *word );

int main( int argc, char **argv )
{
    struct kword *khead, *ktemp;
    int i;

    if (argc < 3) {
        fprintf( stderr, "Usage: msrch infile word-1 word-2 ... word-n\n" );
        exit( EXIT_FAILURE );
    }

    /* use argv[1] directly: copying it into a fixed-size buffer
     * would overflow on long path names */
    if ((infile = fopen( argv[1], "r" )) == NULL) {
        fprintf( stderr, "Cannot open %s\n", argv[1] );
        exit( EXIT_FAILURE );
    }
    linecount = 0;
    inbufptr = NULL;

    /* turn command-line parameters into a list of words */
    khead = NULL;
    for (i = 2; i < argc; i++) {
        ktemp = SafeAlloc( sizeof *ktemp );
        ktemp->word = (unsigned char *)argv[i];
        ktemp->next = khead;
        khead = ktemp;
    }

    MsrchInit( khead );         /* setup system; pass list of words */

    /* Now search.  Note call to function by use of pointers */
    MsrchGo( RetrieveChar, FoundWord );

    MsrchEnd();                 /* clean up */

    /* the keyword list itself belongs to us, not to the search module */
    while (khead != NULL) {
        ktemp = khead;
        khead = khead->next;
        free( ktemp );
    }

    return EXIT_SUCCESS;
}

/*
 * get next character from input stream.  Routine returns either
 *      (a) a character (as an int without its sign extended), or
 *      (b) EOF
 * The input file is closed when its end is reached.
 */
int RetrieveChar( void )
{
    if (inbufptr == NULL || *(++inbufptr) == '\0') {
        /* read a new line of data */
        if (infile == NULL || fgets( inbuf, BUFSIZE, infile ) == NULL) {
            if (infile != NULL) {
                fclose( infile );
                infile = NULL;  /* later calls keep returning EOF */
            }
            return EOF;
        }
        inbufptr = inbuf;
        linecount += 1;
    }

    /* make sure it is not sign extended */
    return (unsigned char)*inbufptr;
}

/*
 * FoundWord: called by MsrchGo() when it finds a match.
 * Prints the current line and, below it, the matched word indented
 * so that it lines up with its occurrence in the line.
 */
void FoundWord( char *word )
{
    int pad;

    fprintf( stdout, "Line %d\n%s", linecount, inbuf );

    /* inbufptr points at the last character of the match */
    pad = (int)(inbufptr - inbuf) - (int)strlen( word ) + 1;
    for (; pad > 0; pad--)
        fputc( ' ', stdout );

    fprintf( stdout, "%s\n\n", word );
}

/*
 * Notes written after reading the code
 * (line numbers below refer to the listing as originally typed in,
 * not to the current layout of this file):
 *
 * 1. To save space the goto table distinguishes two cases.  A state
 *    with a single outgoing transition only records the next state.
 *    A state with branches -- several keywords share the same text
 *    up to this position but continue with different characters --
 *    needs a separate table.  Since most states have a single
 *    transition, most of them need only one state number instead of
 *    a new MAXCHAR-entry table.  Is there a better representation?
 *    Perhaps a linked list could be used?
 * 2. MatchArray records, for a single-transition state, the input
 *    character on which that transition is taken.  If the state has
 *    no transition the entry is EMPTY_SLOT; for a multi-way state it
 *    is MULTI_WAY.  MatchArray and GotoArray could in fact be merged
 *    into one struct.
 * 3. HighState records the highest state number allocated so far.
 * 4. Table initialization adds the two transitions [0,'a'] and
 *    [0,'b'] so that state 0, the start state of every search,
 *    always owns a multi-way table.
 * 5. FindFail merges equivalent states.
 *    Example (keywords: tale, tool, ale)
 *      (!t&&!a)----0----(t)1----(a)2----(l)3----(e)4
 *                  |         |
 *                  |         |
 *                  |        (o)5----(o)6----(l)7
 *                  |
 *                  ----(a)8----(l)9----(e)10
 *    Here 2==8, 3==9 and 4==10, so the Fail array is:
 *      1 2 3 4  5 6 7 8 9 10
 *      0 8 9 10 0 0 0 0 0 0
 *    The accepting states are kept in OutArray:
 *      4---> tale, ale
 *      7---> tool
 *      10---> ale
 *    The Fail array behaves much like a union-find structure: to
 *    resolve state 2, take the equivalent state of 1 (which is 0)
 *    and check whether a transition [0,a] exists; if it does, the
 *    equivalent state has been found.
 * 6. QueueAdd implements a queue of integers stored inside an
 *    integer array.  Every queued value is unique and lies exactly
 *    in the index range of the array, so the previous queue value is
 *    used as the index at which the current value is stored, and the
 *    slot indexed by the current value becomes the place for the
 *    next one.  In other words, the queue order is maintained through
 *    the index relation of the stored values, insertion is always at
 *    the tail, and element 0 stands for the queue head:
 *    next = queue[0], next_next = queue[next], ...
 * 7. Line 427 differs from the book; the book is wrong there.
 * 8. Line 276: else if (MatchArray[s1]!=EMPTY_SLOT) should be changed
 *    to else if (MatchArray[s1]==MULTI_WAY).
 * 9. Line 395:
 *      for (kscan=OutArray[i]; kscan!=NULL; kscan=kscan->next)
 *          free( kscan );
 *    is wrong and should be changed to
 *      while (OutArray[i]!=NULL)
 *      {
 *          kscan = OutArray[i];
 *          OutArray[i] = OutArray[i]->next;
 *          free(kscan);
 *      }
 */