正则表达式重复和贪婪算法(Pascal)
用 FreePascal 实现的正则表达式字符串查找(约 180 行代码)。支持通配符 . 以及量词 ? * + {n} {n,} {n,m},
并支持这些量词的非贪婪(懒惰)形式 ?? *? +? {n}? {n,}? {n,m}?。
program Project1;
type
// One element of a parsed regular expression. Nodes are chained through
// FNext into a singly linked list; each node holds one pattern character
// together with its repeat bounds.
TNode = class
FNext : TNode; // next node in the chain (nil when this is the last node)
FData : Char; // pattern character of this node; TRegExpr.Match treats '.' as "any character"
FMinRepeats : Integer; // minimum number of repetitions
FMaxRepeats : Integer; // maximum number of repetitions; 0 is used to mean "no upper bound"
FRepeats : Integer; // repetition count reached so far in the current match attempt
FGreedy : Boolean; // whether this node is matched in greedy mode
constructor Create(Data: Char);
destructor Destroy; override;
end;
// Creates a node for the pattern character Data with the default quantifier:
// exactly one occurrence, greedy, and no successor node.
constructor TNode.Create(Data: Char);
begin
  inherited Create;
  FData := Data;
  FNext := nil;
  FRepeats := 0;
  FMinRepeats := 1;
  FMaxRepeats := 1;
  FGreedy := True;
end;
// Destroys this node and, through the successor's own destructor, every node
// that follows it in the chain.
destructor TNode.Destroy;
begin
  if Assigned(FNext) then
    FNext.Free;
  inherited Destroy;
end;
type
// Minimal regular-expression searcher. The constructor parses a pattern into
// a TNode chain; Exec/ExecNext then look for successive matches in an input
// string and record the bounds of the latest match in FGroup.
TRegExpr = class
private
FInput : String; // the input string being searched
FCur : PChar; // current position in the input
FEnd : PChar; // end of the input (one past the last character)
FProgram : TNode; // first node of the parsed pattern
FGreedy : Boolean; // whether greedy mode was requested for the current search
FGroup : array [0..1] of PChar; // start [0] and end [1] position of the latest match
function Match(Node: TNode): PChar;
function Parse(PStr, PEnd: PChar): TNode;
public
constructor Create(Pattern: String);
destructor Destroy; override;
function Exec(Input: String; Greedy: Boolean): Boolean;
function ExecNext: Boolean;
end;
// Parses Pattern and stores the resulting node chain in FProgram.
constructor TRegExpr.Create(Pattern: String);
var
  PatStart: PChar;
begin
  PatStart := PChar(Pattern);
  FProgram := Parse(PatStart, PatStart + Length(Pattern));
end;
// Releases the parsed pattern chain, then the object itself.
destructor TRegExpr.Destroy;
begin
  if Assigned(FProgram) then
    FProgram.Free;
  inherited;
end;
// Reads the run of decimal digits starting at PStr and returns its value.
// PStr is advanced past the digits that were consumed; when PStr does not
// point at a digit the result is 0 and PStr is left unchanged.
// A value too large for Integer saturates at High(Integer) instead of
// silently wrapping around (a wrapped repeat count could become negative or
// tiny and change the meaning of the quantifier).
function StrToInt(var PStr: PChar): Integer;
var
  Digit: Integer;
begin
  Result := 0;
  while PStr^ in ['0'..'9'] do begin
    Digit := Ord(PStr^) - Ord('0');
    // Result * 10 + Digit fits exactly when Result <= (High - Digit) div 10.
    if Result > (High(Integer) - Digit) div 10 then
      Result := High(Integer)
    else
      Result := Result * 10 + Digit;
    Inc(PStr);
  end;
end;
// Parses the pattern text in [PStr, PEnd) and returns the first node of the
// resulting chain. Returns nil when the pattern is empty or when a {...}
// quantifier is malformed (in which case the partially built chain is freed).
// Supported syntax: . ? * + {n} {n,} {n,m} and the lazy forms ?? *? +? {}?
// The first pattern character always becomes a node, even if it is a
// quantifier character. An FMaxRepeats of 0 encodes "no upper bound".
// NOTE: because 0 encodes "unbounded", an explicit maximum of 0 (such as {0})
// is stored as 0 as well and is therefore read back as unbounded.
function TRegExpr.Parse(PStr, PEnd: PChar): TNode;
var
  Node : TNode;
  Quantified : Boolean; // True once the current node has received a quantifier
begin
  if PStr >= PEnd then Exit(nil);
  Result := TNode.Create(PStr^);
  Node := Result;
  Quantified := False;
  Inc(PStr);
  while PStr < PEnd do begin
    case PStr^ of
      '?': begin
        // Directly after a quantifier '?' makes that quantifier lazy;
        // otherwise it is the {0,1} quantifier itself. Tracking this with a
        // flag (rather than testing for min = max = 1) keeps x{1}? lazy.
        if Quantified then
          Node.FGreedy := False
        else begin
          Node.FMinRepeats := 0;
          Node.FMaxRepeats := 1;
          Quantified := True;
        end;
      end;
      '*': begin
        Node.FMinRepeats := 0;
        Node.FMaxRepeats := 0;
        Quantified := True;
      end;
      '+': begin
        Node.FMinRepeats := 1;
        Node.FMaxRepeats := 0;
        Quantified := True;
      end;
      '{': begin
        Inc(PStr);
        Node.FMinRepeats := StrToInt(PStr);
        if (PStr < PEnd) and (PStr^ = ',') then begin
          Inc(PStr);
          if (PStr < PEnd) and (PStr^ = '}') then
            Node.FMaxRepeats := 0 // {n,}
          else
            Node.FMaxRepeats := StrToInt(PStr); // {n,m}
        end else
          Node.FMaxRepeats := Node.FMinRepeats; // {n}
        if (PStr >= PEnd) or (PStr^ <> '}') then begin
          // Malformed quantifier: release the nodes built so far instead of
          // leaking them, then report failure.
          Result.Free;
          Exit(nil);
        end;
        Quantified := True;
      end
      else begin
        // Any other character starts a new node.
        Node.FNext := TNode.Create(PStr^);
        Node := Node.FNext;
        Quantified := False;
      end;
    end;
    Inc(PStr);
  end;
end;
// Checks whether the input starting at FCur matches Node and all nodes after
// it. Returns the position just past the match, or nil when there is no
// match. FCur is moved while matching and is not restored on return.
// A nil Node means the whole pattern has been consumed, so the current
// position is the end of the match.
function TRegExpr.Match(Node: TNode): PChar;
var
  PLast, PMatched: PChar;
begin
  if Node = nil then Exit(FCur);
  Node.FRepeats := 0; // reset state left over from a previous attempt
  // First consume the mandatory minimum number of repetitions.
  while Node.FRepeats < Node.FMinRepeats do begin
    if FCur >= FEnd then Exit(nil);
    if (Node.FData <> '.') and (Node.FData <> FCur^) then Exit(nil);
    Inc(FCur);
    Inc(Node.FRepeats);
  end;
  // The node only matches if the rest of the pattern matches as well.
  // The rest is tried even when FCur = FEnd, so that a match ending exactly
  // at the end of the input is recognised.
  Result := nil;
  while True do begin
    PLast := FCur; // remember where this repetition count ends
    PMatched := Match(Node.FNext);
    if PMatched <> nil then Result := PMatched;
    // After a success, stop unless both the search and this node are greedy.
    // The parentheses matter: "and" binds tighter than "or", and a lazy node
    // must keep extending while the rest of the pattern has not matched yet.
    if (Result <> nil) and ((not FGreedy) or (not Node.FGreedy)) then Exit;
    // Stop once the upper bound is reached (0 means unbounded).
    if (Node.FMaxRepeats > 0) and (Node.FRepeats >= Node.FMaxRepeats) then Exit;
    // Try one more repetition of this node.
    FCur := PLast; // undo whatever the attempt on FNext consumed
    if FCur >= FEnd then Exit;
    if (Node.FData <> '.') and (Node.FData <> FCur^) then Exit;
    Inc(FCur);
    Inc(Node.FRepeats);
  end;
end;
// Begins a new search: remembers Input and the Greedy flag, positions the
// cursor at the start of the input, and returns whether a first match exists.
function TRegExpr.Exec(Input: String; Greedy: Boolean): Boolean;
begin
  FGreedy := Greedy;
  FInput := Input;
  FCur := PChar(FInput);
  FEnd := FCur + Length(FInput);
  Exit(ExecNext);
end;
// Searches for the next match, starting at the current position FCur.
// On return FGroup[0] is the start and FGroup[1] the end of the match
// (FGroup[1] is nil when nothing was found). Returns True when a match was
// found; the cursor is then placed so that the following call continues
// after this match.
function TRegExpr.ExecNext: Boolean;
var
  PStart, PMatched: PChar;
begin
  PStart := FCur;
  PMatched := nil;
  // Try every start position until one matches or the input is exhausted.
  while FCur < FEnd do begin
    PMatched := Match(FProgram);
    if PMatched <> nil then Break;
    Inc(PStart);
    FCur := PStart;
  end;
  FGroup[0] := PStart;
  FGroup[1] := PMatched;
  Result := PMatched <> nil;
  if Result then begin
    // Match leaves FCur wherever its last attempt stopped, which need not be
    // the end of the reported match, so reposition it explicitly.
    if PMatched > PStart then
      FCur := PMatched
    else
      FCur := PStart + 1; // empty match: step forward so repeated calls terminate
  end;
end;
// Test helpers
// Returns a new string holding the characters in the range [PStr, PEnd).
function SubStr(PStr, PEnd: PChar): String;
begin
  SetString(Result, PStr, PEnd - PStr);
end;
// Prints a separator line followed by every match of Pattern found in
// InputString, one match per line.
procedure MatchAll(InputString, Pattern: String; Greedy: Boolean);
var
  Matcher: TRegExpr;
  Found: Boolean;
begin
  WriteLn('------------------------------');
  Matcher := TRegExpr.Create(Pattern);
  Found := Matcher.Exec(InputString, Greedy);
  while Found do begin
    WriteLn(SubStr(Matcher.FGroup[0], Matcher.FGroup[1]));
    Found := Matcher.ExecNext;
  end;
  Matcher.Free;
end;
const
  // Input text shared by all demo runs.
  SampleText = 'Hello, World, Hallo, World!';
begin
  // Same pattern in greedy and in non-greedy search mode.
  MatchAll(SampleText, 'H.*o', True);
  MatchAll(SampleText, 'H.*o', False);
  // Bounded repetition: greedy quantifier versus its lazy form.
  MatchAll(SampleText, '.{2,5}l' , True);
  MatchAll(SampleText, '.{2,5}?l', True);
end.