说明:刚才发现在处理*元字符时弄错了,代码修改重新上传到CSDN了,文章中的示例代码也进行了修改。
前一版本有错误的代码中将*处理成了前一字符至少出现1次,修改后为出现0次或多次。
如果你是通过CSDN下载找到这个页面的,请务必留意,你下载的可能不是最终版的代码。最终版代码下载地址:
http://download.csdn.net/detail/sun2043430/5333836
看了《代码之美》第一章的《正则表达式》之后一直手痒,想自己写一个。趁着周末有空试验了一把。
首先是挑选正则表达式的元字符集,我选择了以下这些元字符:
// c 匹配任意的字母(大小写均可)
// . 任意字符
// * 重复前一字符0次或多次,但尽量多次(贪婪匹配)
// ^ 匹配串的开头
// $ 匹配串的结尾
// \ 转义字符(\c表示一个字母'c',\\表示一个字母'\')
在《代码之美》里面缺少转义字符,但是在一个完备的正则表达式中,转义字符是必不可少的(否则会有不能表示的字符),所以我在代码实现中加入了转义字符。
另外《代码之美》一书中的代码对*元字符采用的是简单的懒惰匹配,而在正规的正则表达式中,*、+、?都是采用的贪婪匹配的。从代码实现上来说贪婪匹配稍难于懒惰匹配。
另外,《代码之美》一书中也提到了如果涉及到转义字符、中括号表示的字符集时,最好用结构体的方式来表示正则表达式中的一个字符。
我的源代码中在进行正则表达式匹配之前,先对正则表达式的正确性做验证,看正则表达式是否正确。罗列的标准如下:
- 元字符*不能在开头,元字符^只能出现在表达式开头,元字符$只能出现在表达式末尾。
- 转义字符\后面只能跟元字符
- 元字符^后面不能是元字符*
- 元字符*后面不能是元字符*
一些正则表达式是否正确的测试样例:
"^^now err" is error regular expression!
"^*abcd" is error regular expression!
"ababcd\" is error regular expression!
"^^*bcd" is error regular expression!
"a^*bcd" is error regular expression!
"^ab$cd" is error regular expression!
"a**abcd" is error regular expression!
"*abcd" is error regular expression!
"\a*bcd" is error regular expression!
"now ok" is correct regular expression!
"\\a*bcd" is correct regular expression!
"\**bcd" is correct regular expression!
".*abcd" is correct regular expression!
"^a*bcd" is correct regular expression!
"^abcd$" is correct regular expression!
".abc d" is correct regular expression!
".*abcd" is correct regular expression!
"\c*abcd" is correct regular expression!
"\*abcd" is correct regular expression!
"c*abcd" is correct regular expression!
"\.c*abcd" is correct regular expression!
"abc\.c*abcd" is correct regular expression!
"abc d" is correct regular expression!
"\^*abcd" is correct regular expression!
检测正则表达式是否正确的代码:
bool CheckRegExpr(const char *pReg)
{
const char *pBegin = pReg;
if ('*' == *pBegin)
return false;
while (*pReg)
{
if ( ('^' == *pReg && pReg != pBegin) ||
('^' == *pReg && pReg[1] == '*') ||
('$' == *pReg && pReg[1] != '\0') )
return false;
if ('*' == *pReg && '*' == pReg[1])
return false;
if ('\\' == *pReg)
{
if (!IsRegMetacharacter(pReg[1]))
return false;
else
pReg++;
}
pReg++;
}
return true;
}
正则匹配的核心代码:
const char* RegExprFind(const char *pText, const char *pReg)
{
const char *pCur = pText;
if (!CheckRegExpr(pReg))
return (char*)-1;
do
{
g_pBeg = pCur;
MATCH_STATE eResult = Match(pCur, pReg);
if (MATCH_OK == eResult)
return pCur;
else if (MATCH_ERROR == eResult)
return (char*)-1;
else if (MATCH_FAIL == eResult)
return NULL;
}
while (*pCur++);
return NULL;
}
MATCH_STATE Match(const char *pCur, const char *pReg)
{
g_pEnd = pCur;
if ('\0' == *pReg)
return (g_pEnd != g_pBeg) ? MATCH_OK : MATCH_NEXT_POSITION;
if ('$' == *pReg)
return ('\0' == *pCur && g_pEnd != g_pBeg) ? MATCH_OK : MATCH_FAIL;
if ('^' == *pReg)
{
// return Match(pCur, pReg+1);
if (MATCH_OK == Match(pCur, pReg+1))
return MATCH_OK;
else
return MATCH_FAIL;
}
st_RE_Element elementCur;// 更复杂的情况要借助结构体
st_RE_Element elementNext;
int nElementLen1 = 0;
int nElementLen2 = 0;
nElementLen1 = GetReElement(&elementCur, pReg);
nElementLen2 = GetReElement(&elementNext, pReg+nElementLen1);
if (!elementNext.isEscape && elementNext.ch == '*')// 贪婪匹配比较麻烦
{
const char *pStart = pCur;
while(*pCur && MatchAt(elementCur, *pCur))
pCur++;
while (pCur >= pStart)
{
if (MATCH_OK == Match(pCur, pReg+nElementLen1+nElementLen2))
return MATCH_OK;
pCur--;
}
}
else
{
if (MatchAt(elementCur, *pCur))
return Match(pCur+1, pReg+nElementLen1);
}
return MATCH_NEXT_POSITION;
}
int GetReElement(pst_RE_Element pElement, const char *pReg)
{
if (*pReg == '\\')
{
pElement->isEscape = true;
pElement->ch = pReg[1];
return 2;
}
else
{
pElement->isEscape = false;
pElement->ch = *pReg;
return 1;
}
}
bool MatchAt(st_RE_Element regChar, char ch)
{
if (regChar.isEscape) // \c \\ etc.
{
return ch == regChar.ch;
}
else // a . c
{
if ('.' == regChar.ch || ('c' == regChar.ch && IsAlpha(ch)) )
{
return true;
}
return ch == regChar.ch;
}
}
正则表达式对应的结构体和枚举常量:
typedef struct _st_RE_Element
{
char ch;
bool isEscape;
}st_RE_Element, *pst_RE_Element;
enum MATCH_STATE
{
MATCH_OK,
MATCH_NEXT_POSITION,
MATCH_FAIL,
MATCH_ERROR, //regular expression syntax error, like "a**b"
};
正则表达式匹配效果演示:
0 1 2 3 4 5
012345678901234567890123456789012345678901234567890
Text is Let's ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 00 RegExpr = "Let" Let
Find at 00 RegExpr = "^Let.*$" Let's ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 00 RegExpr = "c*" Let
Find at 06 RegExpr = "g*o*" ggggo
Find at 06 RegExpr = "g*a*" gggg
Find at 30 RegExpr = "k*2*3*k*" 22333333
Find at 30 RegExpr = "k*2*k*3*" 22333333
Find at 22 RegExpr = "\$" $
Find at 20 RegExpr = "\." .
Find at 24 RegExpr = "\\*" \\\
Find at 20 RegExpr = "\..*23" .^$*\\\fg 2233333321222223
Find at 00 RegExpr = "c*" Let
Find at 14 RegExpr = "\c*" c
Find at 49 RegExpr = "2*$" 22222
Find at 32 RegExpr = "3*" 333333
Find at 30 RegExpr = "2*223" 223
Find at 30 RegExpr = "2*23" 223
Find at 06 RegExpr = "g*o.*3*21" ggggo abcdeccc.^$*\\\fg 2233333321
Find at 06 RegExpr = "g*o.*3*2" ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 06 RegExpr = "g*o.*3*" ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 06 RegExpr = "g*o.*3" ggggo abcdeccc.^$*\\\fg 2233333321222223333
Not Found! RegExpr = "Below are Not Found!------------------------------------------"
Not Found! RegExpr = "k*"
Not Found! RegExpr = "g*o.2*3"
Find at 54 RegExpr = "3*$"
Not Found! RegExpr = "^3*"
Not Found! RegExpr = "Below are Syntax Error!---------------------------------------"
Syntax error! RegExpr = "^*abc"
Syntax error! RegExpr = ".*a$bc"
完整代码可到以下地址下载:
http://download.csdn.net/detail/sun2043430/5333836
注:代码中使用函数式实现,为了便于打印输出、展示结果,使用了全局变量。