/*
 * minigrep - a tiny regular-expression compiler/matcher plus a line filter.
 *
 * A textual expression is compiled by MakePattern() into a byte string of
 * tokens ("pattern").  Meta tokens have bit 7 set (MT_xxx); every other
 * non-zero byte is a literal character.  A character class is the token
 * MT_CCL followed by a MAPSIZE-byte bit map.  Repetition operators are
 * stored in FRONT of the element they apply to, e.g.
 *
 *     a.*[0-9]x+$   ->   'a' MT_CLOSE MT_ANY MT_CCL <bit map> MT_PCLOSE 'x' MT_EOL
 *
 * Define MINIGREP_NO_MAIN to compile the matcher without the command-line
 * driver (useful when including this file from a test harness).
 */
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef max
#define max(a,b) ((a)>(b) ? (a) : (b))
#endif

#ifdef _DEBUG
#define D(x) x
#else
#define D(x)
#endif

/* Metacharacters as they appear in the source expression. */
#define BOL     '^'
#define EOL     '$'
#define ANY     '.'
#define CCL     '['
#define CCLEND  ']'
#define NCCL    '^'
#define CLOSURE '*'
#define PCLOSE  '+'
#define OPT     '?'

/* Tokens as they appear in the compiled pattern (bit 7 set). */
#define MT_BOL    (0x80|'^')
#define MT_EOL    (0x80|'$')
#define MT_ANY    (0x80|'.')
#define MT_CCL    (0x80|'[')
#define MT_OPT    (0x80|'?')
#define MT_CLOSE  (0x80|'*')
#define MT_PCLOSE (0x80|'+')

typedef unsigned char pattern;

#define MAXPAT  128     /* size in bytes of a compiled pattern buffer   */
#define MAPSIZE 16      /* bytes in a character-class bit map (128 bits) */

/* Step `pat` over one pattern element: a class occupies 1 + MAPSIZE bytes. */
#define ADVANCE(pat) ((pat) += (*(pat) == MT_CCL) ? (MAPSIZE + 1) : 1)

/*
 * Bit-map helpers.  Only the low 7 bits of b are used.  There are 8 bits per
 * byte, so shifting right by 3 (b / 8) selects the byte and b & 0x07 (b % 8)
 * selects the bit inside that byte.
 */
#define SETBIT(b,map) ((map)[((b) & 0x7f) >> 3] |= (1 << ((b) & 0x07)))
#define TSTBIT(b,map) ((map)[((b) & 0x7f) >> 3] &  (1 << ((b) & 0x07)))

/* <ctype.h> functions require an unsigned char value (or EOF). */
#define ISHEXDIGIT(x) isxdigit((unsigned char)(x))
#define ISOCTDIGIT(x) ('0'<=(x)&&(x)<='7')

/* Values reported by PatternErr(). */
#define E_NONE    0
#define E_ILLEGAL 1
#define E_NOMEM   2
#define E_PAT     3

static int Error = E_NONE;

static char    *DoCCL( pattern *, char * );
static int      DoEscapeSeq( char ** );
static int      HexToBinary( int );
static int      MatchOne( char **, pattern *, char * );
static pattern *MakePattern( char * );
static char    *MatchString( char *, pattern *, int );
static int      OctToBinary( int );
static char    *PatternCmp( char *, pattern *, char * );
extern int      PatternErr( void );

/* Return the status left behind by the most recent MakePattern() call. */
int PatternErr(void)
{
    return Error;
}

/*
 * Compile the expression `exp` into a freshly malloc'ed pattern.
 *
 * Compilation stops at the end of the string or at the first newline.
 * Returns NULL on failure, with the reason available from PatternErr():
 *   E_ILLEGAL  empty expression, or one that starts with '*', '+' or '?'
 *   E_NOMEM    malloc failed
 *   E_PAT      pattern too long, or a repetition operator follows '^', '$'
 *              or another repetition operator
 * On success the caller owns the returned buffer and releases it with free().
 */
static pattern *MakePattern(char *exp)
{
    pattern *pat;
    pattern *cur;       /* next free slot in the pattern            */
    pattern *prev;      /* start of the most recently added element */

    Error = E_ILLEGAL;
    if (!exp || !*exp)
        return NULL;
    if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT)
        return NULL;    /* a repetition operator needs an operand */

    Error = E_NOMEM;
    pat = malloc(MAXPAT);
    if (pat == NULL)
        return NULL;
    D( memset(pat, 0, MAXPAT) );

    prev = cur = pat;
    Error = E_PAT;

    while (*exp && *exp != '\n') {
        /* Always keep room for one more token plus the terminator. */
        if (cur >= &pat[MAXPAT - 1]) {
            free(pat);
            return NULL;
        }

        switch (*exp) {
        case ANY:
            *cur = MT_ANY;
            prev = cur++;
            ++exp;
            break;

        case BOL:
            /* '^' is an anchor only as the very first character. */
            *cur = (cur == pat) ? MT_BOL : (pattern)*exp;
            prev = cur++;
            ++exp;
            break;

        case EOL:
            /* '$' is an anchor only as the very last character. */
            *cur = (!exp[1] || exp[1] == '\n') ? MT_EOL : (pattern)*exp;
            prev = cur++;
            ++exp;
            break;

        case CCL:
            /* Needs MT_CCL + MAPSIZE map bytes + room for the terminator. */
            if ((cur - pat) + MAPSIZE + 1 >= MAXPAT) {
                free(pat);
                return NULL;
            }
            prev = cur;
            *cur++ = MT_CCL;
            exp = DoCCL(cur, exp);
            cur += MAPSIZE;
            break;

        case OPT:
        case CLOSURE:
        case PCLOSE:
            /* Anchors and repetition operators cannot themselves be repeated. */
            switch (*prev) {
            case MT_BOL:
            case MT_EOL:
            case MT_OPT:
            case MT_PCLOSE:
            case MT_CLOSE:
                free(pat);
                return NULL;
            default:
                break;
            }
            /* Shift the previous element up one byte and put the operator
             * token in front of it. */
            memmove(prev + 1, prev, (size_t)(cur - prev));
            *prev = (*exp == OPT)    ? MT_OPT
                  : (*exp == PCLOSE) ? MT_PCLOSE
                  :                    MT_CLOSE;
            ++cur;
            ++exp;
            break;

        default:
            prev = cur;
            *cur++ = (pattern)DoEscapeSeq(&exp);
            break;
        }
    }

    *cur = '\0';
    Error = E_NONE;
    return pat;
}

/*
 * Build the MAPSIZE-byte bit map for a character class.
 *
 * `src` points at the opening '['.  A leading '^' negates the set.  'x-y'
 * adds every character between the two endpoints inclusive, in either order.
 * A '-' that is first in the class, or that is immediately followed by ']'
 * or the end of the string, is an ordinary character.  Escape sequences are
 * decoded with DoEscapeSeq().
 *
 * Returns a pointer just past the closing ']' (or to the terminating NUL when
 * the ']' is missing).
 */
static char *DoCCL(pattern *map, char *src)
{
    int prev_ch = -1;   /* last character added to the set, -1 if none yet */
    int negative;
    int i;

    ++src;                              /* skip '[' */
    negative = (*src == NCCL);
    if (negative)
        ++src;

    memset(map, 0, MAPSIZE);

    while (*src && *src != CCLEND) {
        if (*src != '-') {
            prev_ch = DoEscapeSeq(&src);
            SETBIT(prev_ch, map);
        } else if (prev_ch < 0 || src[1] == '\0' || src[1] == CCLEND) {
            /* No usable endpoint on one side: '-' stands for itself. */
            SETBIT('-', map);
            prev_ch = '-';
            ++src;
        } else {
            int end_ch, lo, hi, c;

            ++src;                      /* skip '-' */
            end_ch = DoEscapeSeq(&src);
            lo = (prev_ch < end_ch) ? prev_ch : end_ch;
            hi = (prev_ch < end_ch) ? end_ch : prev_ch;
            for (c = lo; c <= hi; ++c)
                SETBIT(c, map);
            prev_ch = end_ch;
        }
    }

    if (*src == CCLEND)
        ++src;                          /* skip ']' */

    if (negative) {
        for (i = 0; i < MAPSIZE; ++i)
            map[i] = (pattern)~map[i];
    }

    return src;
}

/*
 * Search `str` for the first position at which `pat` matches.
 *
 * Returns NULL when there is no match (or `pat` is NULL).  Otherwise returns
 * a pointer to the last character of the match when `ret_endp` is non-zero,
 * or to the first character of the match when it is zero.
 *
 * An empty `str` matches only the patterns "$", "^" and "^$".
 */
static char *MatchString(char *str, pattern *pat, int ret_endp)
{
    char *start;
    char *end = NULL;

    if (!pat)
        return NULL;

    if (!*str) {
        if ((*pat == MT_EOL) ||
            (*pat == MT_BOL && (!pat[1] || pat[1] == MT_EOL)))
            end = str;
        return end;
    }

    /* Brute force: try the pattern at every position in turn. */
    start = str;
    while (*str) {
        end = PatternCmp(str, pat, start);
        if (end) {
            if (!ret_endp)
                end = str;
            break;
        }
        ++str;
    }
    return end;
}

/*
 * Try to match `pat` against the text beginning exactly at `str`.
 * `start` is the beginning of the whole line (needed for '^').
 *
 * Returns NULL on failure.  On success returns a pointer to the last
 * character matched, or `start` when that would lie before `start`.
 */
static char *PatternCmp(char *str, pattern *pat, char *start)
{
    char *bocl;         /* beginning of the text consumed by a closure */
    char *end = NULL;

    if (!pat)
        return NULL;

    while (*pat) {
        if (*pat == MT_OPT) {
            /* Zero or one: the text pointer moves only on a match, the
             * pattern pointer moves on regardless. */
            MatchOne(&str, ++pat, start);
            ADVANCE(pat);
        } else if (!(*pat == MT_CLOSE || *pat == MT_PCLOSE)) {
            if (!MatchOne(&str, pat, start))
                return NULL;
            ADVANCE(pat);
        } else {
            /* '+' demands one match up front; after that both operators
             * mean "zero or more". */
            if (*pat++ == MT_PCLOSE) {
                if (!MatchOne(&str, pat, start))
                    return NULL;
            }

            bocl = str;
            while (*str && MatchOne(&str, pat, start))
                ;

            /*
             * The closure is greedy.  If more pattern follows, give back one
             * character at a time until the remainder matches.  For
             * "[a-z]*ed" on "educated" the class first swallows the whole
             * word, so str has to back up two characters before "ed" fits.
             */
            if (*ADVANCE(pat)) {
                for (;;) {
                    end = PatternCmp(str, pat, start);
                    if (end || str == bocl)
                        break;
                    --str;
                }
                return end;
            }
            break;
        }
    }

    /* str is one past the last matched character; step back without ever
     * forming a pointer in front of `start`. */
    return (str > start) ? str - 1 : start;
}

/*
 * Match the single pattern element at `pat` against the character at *strp.
 * On a match that consumes a character, *strp is advanced by one.
 *
 * Returns 0 on failure, 1 for a zero-width match ('^', '$') and 2 when one
 * character was consumed.  The string terminator is never consumed.
 */
static int MatchOne(char **strp, pattern *pat, char *start)
{
    int advance = -1;                   /* -1 means "no match" */
    unsigned char c = (unsigned char)**strp;

    switch (*pat) {
    case MT_BOL:
        /* Matches only when the scan pointer sits at the line start. */
        if (*strp == start)
            advance = 0;
        break;

    case MT_ANY:
        if (c != '\n' && c != '\0')
            advance = 1;
        break;

    case MT_EOL:
        if (c == '\n' || c == '\0')
            advance = 0;
        break;

    case MT_CCL:
        if (c != '\0' && TSTBIT(c, pat + 1))
            advance = 1;
        break;

    default:                            /* literal character */
        if (c == *pat)
            advance = 1;
        break;
    }

    if (advance > 0)
        *strp += advance;
    return advance + 1;
}

/* Convert one hexadecimal digit character to its value (0-15). */
static int HexToBinary(int c)
{
    c = (unsigned char)c;
    return isdigit(c) ? c - '0' : ((toupper(c) - 'A') + 10) & 0xf;
}

/* Convert one octal digit character to its value (0-7). */
static int OctToBinary(int c)
{
    return (c - '0') & 0x7;
}

/*
 * Decode the character or escape sequence at *s and move *s past it.
 *
 * Letters after the backslash are case-insensitive:
 *   \b \f \n \r \t   the usual C control characters
 *   \s               space
 *   \e               ESC (octal 033)
 *   \^C              control character: toupper(C) - '@'
 *   \xH[H[H]]        one to three hexadecimal digits
 *   \D[D[D]]         one to three octal digits
 *   \c               any other character c stands for itself
 * A backslash at the very end of the string is a literal backslash, and "\^"
 * at the very end is a literal '^'; in both cases *s never moves beyond the
 * terminating NUL.
 */
static int DoEscapeSeq(char **s)
{
    int rval;

    if (**s != '\\')
        return (unsigned char)*(*s)++;  /* ordinary character */

    ++(*s);                             /* skip the backslash */

    switch (toupper((unsigned char)**s)) {
    case '\0':
        return '\\';                    /* leave *s on the terminator */

    case 'B': rval = '\b';   break;
    case 'F': rval = '\f';   break;
    case 'N': rval = '\n';   break;
    case 'R': rval = '\r';   break;
    case 'S': rval = ' ';    break;
    case 'T': rval = '\t';   break;
    case 'E': rval = '\033'; break;     /* ASCII ESC */

    case '^':
        if ((*s)[1] == '\0') {
            rval = '^';                 /* nothing to turn into a control code */
        } else {
            ++(*s);
            rval = toupper((unsigned char)**s) - '@';
        }
        break;

    case 'X':
        rval = 0;
        ++(*s);
        if (ISHEXDIGIT(**s)) {
            rval = HexToBinary(*(*s)++);
        }
        if (ISHEXDIGIT(**s)) {
            rval <<= 4;
            rval |= HexToBinary(*(*s)++);
        }
        if (ISHEXDIGIT(**s)) {
            rval <<= 4;
            rval |= HexToBinary(*(*s)++);
        }
        --(*s);                         /* compensate for the ++ below */
        break;

    default:
        if (!ISOCTDIGIT(**s)) {
            rval = (unsigned char)**s;
        } else {
            /* *s already points at the first octal digit. */
            rval = OctToBinary(*(*s)++);
            if (ISOCTDIGIT(**s)) {
                rval <<= 3;
                rval |= OctToBinary(*(*s)++);
            }
            if (ISOCTDIGIT(**s)) {
                rval <<= 3;
                rval |= OctToBinary(*(*s)++);
            }
            --(*s);                     /* compensate for the ++ below */
        }
        break;
    }

    ++(*s);
    return rval;
}

#ifndef MINIGREP_NO_MAIN
/*
 * Usage: minigrep reg_exp [filename]
 *
 * Reads the named file (or stdin) line by line.  For every line that
 * matches, writes the line, a newline, and then the text from the start of
 * the match to the end of the line.
 */
int main(int argc, char **argv)
{
    static char inp_buf[1024];
    pattern *pat;
    FILE *inp;
    char *end;

    if (CHAR_BIT != 8) {                /* the bit-map macros assume 8-bit bytes */
        fprintf(stderr, "Error: Requires 8-bit bytes\n");
        exit(EXIT_FAILURE);
    }

    if (argc < 2 || argv[1][0] == '-') {
        fprintf(stderr, "Usage: minigrep reg_exp filename\n");
        fprintf(stderr, "Usage: minigrep reg_exp < filename\n");
        exit(EXIT_FAILURE);
    }

    pat = MakePattern(argv[1]);
    if (!pat) {
        fprintf(stderr, "Can't make expression template\n");
        exit(EXIT_FAILURE);
    }

    if (argc == 2) {
        inp = stdin;
    } else {
        inp = fopen(argv[2], "r");
        if (!inp) {
            perror(argv[2]);
            free(pat);
            exit(EXIT_FAILURE);
        }
    }

    while (fgets(inp_buf, sizeof(inp_buf), inp)) {
        end = MatchString(inp_buf, pat, 0);
        if (end != NULL) {
            fputs(inp_buf, stdout);
            fputs("\n", stdout);
            fputs(end, stdout);
        }
    }

    if (inp != stdin)
        fclose(inp);
    free(pat);
    return EXIT_SUCCESS;
}
#endif /* MINIGREP_NO_MAIN */