Regex预处理
假设只处理连接、或(|)和闭包(*)这三种基本的Regex运算。由于连接在Regex中是隐式的(直接写作ab,中间没有运算符),为了方便计算机识别,需要先显式地添加连接标识符“.”。另外,计算机处理表达式时一般借助栈(stack),因此通常还要把人类习惯的中缀表达式(如1+1)转换为后缀表达式(如11+)。
添加连接标识符
/// <summary>
/// Inserts the explicit concatenation operator '.' into a regular expression.<br/>
/// e.g.<br/>
/// (ab|ba)*ab becomes (a.b|b.a)*.a.b
/// </summary>
/// <param name="regex">The pattern to rewrite; '|', '*', '(' and ')' are treated as operators.</param>
/// <returns>The pattern with a '.' placed between every pair of adjacent characters that are concatenated.</returns>
/// <exception cref="ArgumentNullException"><paramref name="regex"/> is null.</exception>
public static string UseDotForConcatenation(string regex)
{
    if (regex == null)
    {
        throw new ArgumentNullException(nameof(regex));
    }

    // Build the result in a single pass instead of re-creating the whole
    // string for every inserted '.', which made the original quadratic.
    // At most one '.' is added per character, so twice the length is enough.
    var result = new StringBuilder(regex.Length * 2);
    var lastIndex = regex.Length - 1;
    for (var i = 0; i < regex.Length; ++i)
    {
        var cur = regex[i];
        result.Append(cur);
        if (i == lastIndex)
        {
            break; // Nothing follows the last character.
        }

        // Don't add '.' after '|' and '('
        if (cur == '|' || cur == '(')
        {
            continue;
        }

        // Don't add '.' if '|', '*', ')' are the next characters
        var next = regex[i + 1];
        if (next != '|' && next != '*' && next != ')')
        {
            result.Append('.');
        }
    }
    return result.ToString();
}
[TestMethod]
public void UseDotForConcatenationTest()
{
    // Nested groups, alternation and closure in one pattern.
    var regex = "((abcd|dcac)*|abba)*addcb";
    var test = UseDotForConcatenation(regex);
    Debug.WriteLine(test);
    // Without an assertion the test could never fail; pin the expected rewrite.
    Assert.AreEqual("((a.b.c.d|d.c.a.c)*|a.b.b.a)*.a.d.d.c.b", test);
}
输出如下:
((a.b.c.d|d.c.a.c)*|a.b.b.a)*.a.d.d.c.b
转换为后缀表达式
/// <summary>
/// Converts a regular expression that already contains explicit '.' concatenation
/// operators from infix notation to postfix notation.<br/>
/// e.g.<br/>
/// (a.b|b.a)*.a.b becomes ab.ba.|*a.b.
/// </summary>
/// <param name="infixRegex">The infix pattern, with '.' already inserted for concatenation.</param>
/// <returns>The same expression in postfix order, with all parentheses removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="infixRegex"/> is null.</exception>
/// <exception cref="ArgumentException">The parentheses in <paramref name="infixRegex"/> are not balanced.</exception>
public static string InfixToPostfix(string infixRegex)
{
    if (infixRegex == null)
    {
        throw new ArgumentNullException(nameof(infixRegex));
    }

    var operators = new Stack<char>();
    // Postfix output drops the parentheses, so it is never longer than the input.
    var postfixRegex = new StringBuilder(infixRegex.Length);
    foreach (var ch in infixRegex)
    {
        switch (ch)
        {
            case '.':
            case '|':
                // Emit every stacked operator that binds at least as tightly as ch.
                // NOTE(review): this relies on Precedence ranking '(' below '.' and '|'
                // so the loop stops at a group boundary - confirm against Precedence.
                while (operators.Count > 0 && Precedence(ch) <= Precedence(operators.Peek()))
                {
                    postfixRegex.Append(operators.Pop());
                }
                operators.Push(ch);
                break;
            case '(':
                operators.Push(ch);
                break;
            case ')':
                // Flush the operators of the group that is being closed.
                while (operators.Count > 0 && operators.Peek() != '(')
                {
                    postfixRegex.Append(operators.Pop());
                }
                if (operators.Count == 0)
                {
                    // Previously this surfaced as an opaque "Stack empty" error.
                    throw new ArgumentException("Unbalanced ')' in regular expression.", nameof(infixRegex));
                }
                operators.Pop(); // Discard the matching '('.
                break;
            default:
                // Operands and the postfix unary '*' go straight to the output.
                postfixRegex.Append(ch);
                break;
        }
    }

    while (operators.Count > 0)
    {
        var pending = operators.Pop();
        if (pending == '(')
        {
            // Previously an unclosed '(' was silently written into the output.
            throw new ArgumentException("Unbalanced '(' in regular expression.", nameof(infixRegex));
        }
        postfixRegex.Append(pending);
    }
    return postfixRegex.ToString();
}
/// <summary>
/// 字符的优先级
/// </summary>