{
Html解析器.
最近因为用到Html解析功能.在网上找了几款Delphi版本的,结果发现解析复杂的HTML都有一些问题.
没办法自己写了一款,经测试到现在没遇到任何解析不了的Html.
wr960204 武稀松 2013
http://www.raysoftware.cn/?p=370
感谢牛人杨延哲在HTML语法和CSS语法方面的帮助.
Thank Yang Yanzhe.
http://www.pockhero.com/
本版本只支持DelphiXE3之后的版本.如果用早期Delphi请使用HTMLParser.pas文件.
支持Windows,MacOSX,iOS,Android平台,完全去掉了对指针的使用.防止以后易博龙去掉
移动平台对指针的支持.
脱离了对旧版本的支持,甩掉包袱开发起来真的很爽!
}
unit HtmlParser_XE3UP;
interface
{$IF (defined(IOS) and defined(CPUARM)) or defined(ANDROID)}
{$DEFINE MOBILE_DEV}
{$ENDIF}
const
LowStrIndex = Low(string); // 移动平台=0,个人电脑平台=1
type
{$IFNDEF MSWINDOWS}
{ 接口使用WideString是为了可以给例如C++,VB等语言使用.
但是如果离开了Windows平台,其他平台是没有WideString这个COM的数据类型的.
}
WideString = string;
{$ENDIF}
IHtmlElement = interface;
IHtmlElementList = interface;
IHtmlElement = interface
['{8C75239C-8CFA-499F-B115-7CEBEDFB421B}']
// function GetOwner: IHtmlElement; stdcall;
function GetTagName: WideString; safecall;
function GetContent: WideString; safecall;
function GetOrignal: WideString; safecall;
function GetChildrenCount: Integer; stdcall;
function GetChildren(Index: Integer): IHtmlElement; stdcall;
function GetCloseTag: IHtmlElement; stdcall;
function GetInnerHtml(): WideString; safecall;
function GetOuterHtml(): WideString; safecall;
function GetInnerText(): WideString; safecall;
function GetAttributes(Key: WideString): WideString; safecall;
function GetSourceLineNum(): Integer; stdcall;
function GetSourceColNum(): Integer; stdcall;
// 属性是否存在
function HasAttribute(AttributeName: WideString): Boolean; stdcall;
{ 用CSS选择器语法查找Element,不支持"伪类"
CSS Selector Style search,not support Pseudo-classes.
http://www.w3.org/TR/CSS2/selector.html
}
function SimpleCSSSelector(const selector: WideString): IHtmlElementList; stdcall;
// 枚举属性
function EnumAttributeNames(Index: Integer): WideString; safecall;
property TagName: WideString read GetTagName;
property ChildrenCount: Integer read GetChildrenCount;
property Children[index: Integer]: IHtmlElement read GetChildren; default;
property CloseTag: IHtmlElement read GetCloseTag;
property Content: WideString read GetContent;
property Orignal: WideString read GetOrignal;
// property Owner: IHtmlElement read GetOwner;
// 获取元素在源代码中的位置
property SourceLineNum: Integer read GetSourceLineNum;
property SourceColNum: Integer read GetSourceColNum;
//
property InnerHtml: WideString read GetInnerHtml;
property OuterHtml: WideString read GetOuterHtml;
property InnerText: WideString read GetInnerText;
property Attributes[Key: WideString]: WideString read GetAttributes;
end;
IHtmlElementList = interface
['{8E1380C6-4263-4BF6-8D10-091A86D8E7D9}']
function GetCount: Integer; stdcall;
function GetItems(Index: Integer): IHtmlElement; stdcall;
property Count: Integer read GetCount;
property Items[Index: Integer]: IHtmlElement read GetItems; default;
end;
function ParserHTML(const Source: WideString): IHtmlElement; stdcall;
implementation
uses
SysUtils, generics.Collections;
type
TStringDictionary = TDictionary<string, string>;
TPropDictionary = TDictionary<string, WORD>;
TStringDynArray = TArray<string>;
const
WhiteSpace =[' ', #13, #10, #9];
// CSS Attribute Compare Operator
OperatorChar =['=', '!', '*', '~', '|', '^', '$'];
MaxListSize = Maxint div 16;
// TagProperty
tpBlock = $01;
tpInline = $02;
tpEmpty = $04;
tpFormatAsInline = $08;
tpPreserveWhitespace = $10;
tpInlineOrEmpty = tpInline or tpEmpty;
type
TAttrOperator = (aoExist, aoEqual, aoNotEqual, aoIncludeWord, aoBeginWord, aoBegin, aoEnd, aoContain);
PAttrSelectorItem = ^TAttrSelectorItem;
TAttrSelectorItem = record
Key: string;
AttrOperator: TAttrOperator;
Value: string;
end;
TSelectorItemRelation = (sirNONE, sirDescendant, sirChildren, sirYoungerBrother, sirAllYoungerBrother);
PCSSSelectorItem = ^TCSSSelectorItem;
TCSSSelectorItem = record
Relation: TSelectorItemRelation;
szTag: string;
Attributes: array of TAttrSelectorItem;
end;
TCSSSelectorItems = array of TCSSSelectorItem;
PCSSSelectorItems = ^TCSSSelectorItems;
TCSSSelectorItemGroup = array of TCSSSelectorItems;
//
TSourceContext = record
private
function GetCharOfCurrent(Index: Integer): Char; inline;
public
Code: string;
CodeIndex: Integer;
LineNum: Integer;
ColNum: Integer;
CurrentChar: Char;
{$IFDEF DEBUG}
currentCode: PChar;
{$ENDIF}
procedure IncSrc(); overload; inline;
procedure IncSrc(Step: Integer); overload; inline;
procedure setCode(const ACode: string); inline;
function ReadStr(UntilChars: TSysCharSet): string; inline;
function PeekStr(Index: Integer): string; overload; inline;
function PeekStr(): string; overload; inline;
function subStr(Index, Count: Integer): string; overload; inline;
function subStr(Count: Integer): string; overload; inline;
procedure SkipBlank(); inline;
property charOfCurrent[Index: Integer]: Char read GetCharOfCurrent;
end;
TAttributeItem = record
Key, Value: string;
end;
TAttributeDynArray = TArray<TAttributeItem>;
TIHtmlElementList = class;
THtmlElement = class;
THtmlElementList = TList<THtmlElement>;
THtmlElement = class(TInterfacedObject, IHtmlElement)
protected
// IHtmlElement
// function GetOwner: IHtmlElement; stdcall;
function GetTagName: WideString; safecall;
function GetContent: WideString; safecall;
function GetOrignal: WideString; safecall;
function GetChildrenCount: Integer; stdcall;
function GetChildren(Index: Integer): IHtmlElement; stdcall;
function GetCloseTag: IHtmlElement; stdcall;
function GetInnerHtml(): WideString; safecall;
function GetOuterHtml(): WideString; safecall;
function GetInnerText(): WideString; safecall;
function GetAttributes(Key: WideString): WideString; safecall;
function GetSourceLineNum(): Integer; stdcall;
function GetSourceColNum(): Integer; stdcall;
// 属性是否存在
function HasAttribute(AttributeName: WideString): Boolean; stdcall;
{ 用CSS选择器语法查找Element,不支持"伪类"
CSS Selector Style search,not support Pseudo-classes.
http://www.w3.org/TR/CSS2/selector.html
}
function SimpleCSSSelector(const selector: WideString): IHtmlElementList; stdcall;
// 枚举属性
function EnumAttributeNames(Index: Integer): WideString; safecall;
property TagName: WideString read GetTagName;
property ChildrenCount: Integer read GetChildrenCount;
property Children[index: Integer]: IHtmlElement read GetChildren; default;
property CloseTag: IHtmlElement read GetCloseTag;
property Content: WideString read GetContent;
property Orignal: WideString read GetOrignal;
// property Owner: IHtmlElement read GetOwner;
// 获取元素在源代码中的位置
property SourceLineNum: Integer read GetSourceLineNum;
property SourceColNum: Integer read GetSourceColNum;
//
property InnerHtml: WideString read GetInnerHtml;
property OuterHtml: WideString read GetOuterHtml;
property InnerText: WideString read GetInnerText;
property Attributes[Key: WideString]: WideString read GetAttributes;
private
FClosed: Boolean;
//
FOwner: THtmlElement;
FCloseTag: IHtmlElement;
FTagName: string;
FIsCloseTag: Boolean;
FContent: string;
FOrignal: string;
FSourceLine: Integer;
FSourceCol: Integer;
//
FAttributes: TStringDictionary;
FChildren: TIHtmlElementList;
procedure _GetHtml(IncludeSelf: Boolean; Sb: TStringBuilder);
procedure _GetText(IncludeSelf: Boolean; Sb: TStringBuilder);
procedure _SimpleCSSSelector(const ItemGroup: TCSSSelectorItemGroup; r: TIHtmlElementList);
procedure _Select(Item: PCSSSelectorItem; Count: Integer; r: TIHtmlElementList; OnlyTopLevel: Boolean = false);
public
constructor Create(AOwner: THtmlElement; AText: string; ALine, ACol: Integer);
destructor Destroy; override;
end;
TIHtmlElementList = class(TInterfacedObject, IHtmlElementList)
private
// IHtmlElementList
function GetItems(Index: Integer): IHtmlElement; stdcall;
function GetCount: Integer; stdcall;
protected
FList: TList<IHtmlElement>;
procedure SetItems(Index: Integer; const Value: IHtmlElement); inline;
function Add(Value: IHtmlElement): Integer; inline;
procedure Delete(Index: Integer); inline;
procedure Clear(); inline;
public
constructor Create;
destructor Destroy; override;
function IndexOf(Item: IHtmlElement): Integer;
// IHtmlElementList
property Items[index: Integer]: IHtmlElement read GetItems write SetItems; default;
property Count: Integer read GetCount;
end;
function SplitStr(ACharSet: TSysCharSet; AStr: string): TStringDynArray;
var
L, I: Integer;
S: string;
StrChar: Char;
begin
Result := nil;
if Length(AStr) <= 0 then
Exit;
I := Low(AStr);
L := Low(AStr);
StrChar := #0;
while I <= High(AStr) do
begin
if CharInSet(AStr[I], ['''', '"']) then
if StrChar = #0 then
StrChar := AStr[I]
else if StrChar = AStr[I] then
StrChar := #0;
// 不在字符串中,分隔符才生效
if StrChar = #0 then
if CharInSet(AStr[I], ACharSet) then
begin
if I > L then
begin
S := Copy(AStr, L{$IF (LowStrIndex = 0)} + 1{$ENDIF}, I - L);
SetLength(Result, Length(Result) + 1);
Result[Length(Result) - 1] := S;
end;
L := I + 1;
end;
Inc(I);
end;
if (I > L) then
begin
S := Copy(AStr, L{$IF (LowStrIndex = 0)} + 1{$ENDIF}, I - L);
SetLength(Result, Length(Result) + 1);
Result[Length(Result) - 1] := S;
end;
end;
function StrRight(const Value: string; Count: Integer): string;
var
start: Integer;
begin
start := Length(Value) - Count + 1;
if start <= 0 then
Result := Value
else
Result := Copy(Value, start, Count);
end;
function StrLeft(const Value: string; Count: Integer): string;
begin
Result := Copy(Value, LowStrIndex, Count);
end;
// ComapreAttr
function _aoExist(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
begin
Result := E.FAttributes.ContainsKey(Item.Key);
end;
function _aoEqual(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
begin
Result := E.FAttributes.ContainsKey(Item.Key) and (E.FAttributes[Item.Key] = Item.Value);
end;
function _aoNotEqual(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
begin
Result := E.FAttributes.ContainsKey(Item.Key) and (E.FAttributes[Item.Key] <> Item.Value);
end;
function _aoIncludeWord(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
var
S: TStringDynArray;
I: Integer;
begin
Result := false;
if not E.FAttributes.ContainsKey(Item.Key) then
Exit;
Result := True;
S := SplitStr(WhiteSpace, E.FAttributes[Item.Key]);
for I := Low(S) to High(S) do
if S[I] = Item.Value then
Exit;
Result := false;
end;
function _aoBeginWord(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
var
S: TStringDynArray;
I: Integer;
begin
Result := false;
if not E.FAttributes.ContainsKey(Item.Key) then
Exit;
S := SplitStr((WhiteSpace + ['_', '-']), E.FAttributes[Item.Key]);
Result := (Length(S) > 0) and (S[0] = Item.Value);
end;
function _aoBegin(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
var
attr, Value: string;
begin
Result := false;
if not E.FAttributes.ContainsKey(Item.Key) then
Exit;
attr := E.FAttributes[Item.Key];
Value := Item.Value;
Result := (Length(attr) > Length(Value)) and (StrLeft(attr, Length(Value)) = Value);
end;
function _aoEnd(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
var
attr, Value: string;
begin
Result := false;
if not E.FAttributes.ContainsKey(Item.Key) then
Exit;
attr := E.FAttributes[Item.Key];
Value := Item.Value;
Result := (Length(attr) > Length(Value)) and (StrRight(attr, Length(Value)) = Value);
end;
function _aoContain(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
begin
Result := false;
if not E.FAttributes.ContainsKey(Item.Key) then
Exit;
Result := Pos(Item.Value, E.FAttributes[Item.Key]) > 0;
end;
type
TFNCompareAttr = function(const Item: TAttrSelectorItem; E: THtmlElement): Boolean;
const
AttrCompareFuns: array[TAttrOperator] of TFNCompareAttr = (_aoExist, _aoEqual, _aoNotEqual, _aoIncludeWord, _aoBeginWord, _aoBegin, _aoEnd, _aoContain);
function ConvertEntities(S: string): string; forward;
function GetTagProperty(const TagName: string): WORD; forward;
procedure DoError(const Msg: string);
begin
raise Exception.Create(Msg);
end;
procedure _ParserAttrs(var sc: TSourceContext; var Attrs: TAttributeDynArray);
var
Item: TAttributeItem;
begin
SetLength(Attrs, 0);
while True do
begin
sc.SkipBlank();
if sc.CurrentChar = #0 then
Break;
Item.Key := sc.ReadStr((WhiteSpace + [#0, '=']));
Item.Value := '';
sc.SkipBlank;
if sc.CurrentChar = '=' then
begin
sc.IncSrc;
sc.SkipBlank;
Item.Value := sc.ReadStr((WhiteSpace + [#0]));
end;
SetLength(Attrs, Length(Attrs) + 1);
Attrs[Length(Attrs) - 1] := Item;
end;
end;
procedure _ParserNodeItem(S: string; var ATagName: string; var Attrs: TAttributeDynArray);
var
sc: TSourceContext;
begin
sc.setCode(S);
sc.SkipBlank;
ATagName := UpperCase(sc.ReadStr((WhiteSpace + [#0, '/', '>'])));
_ParserAttrs(sc, Attrs);
end;
function CreateTextElement(AOwner: THtmlElement; AText: string; ALine, ACol: Integer): THtmlElement;
begin
Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
with Result do
begin
FContent := ConvertEntities(AText);
FTagName := '#TEXT';
FClosed := True;
end;
end;
function CreateScriptElement(AOwner: THtmlElement; AText: string; ALine, ACol: Integer): THtmlElement;
begin
Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
with Result do
begin
FContent := ConvertEntities(AText);
FTagName := '#SCRIPT';
FClosed := True;
end;
end;
function CreateStyleElement(AOwner: THtmlElement; AText: string; ALine, ACol: Integer): THtmlElement;
begin
Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
with Result do
begin
FContent := ConvertEntities(AText);
FTagName := '#STYLE';
FClosed := True;
end;
end;
function CreateCommentElement(AOwner: THtmlElement; AText: string; ALine, ACol: Integer): THtmlElement;
begin
Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
with Result do
begin
FContent := ConvertEntities(AText);
FTagName := '#COMMENT';
FClosed := True;
end;
end;
function CreateTagElement(AOwner: THtmlElement; AText: string; ALine, ACol: Integer): THtmlElement;
var
Strs: TStringDynArray;
I: Integer;
Attrs: TAttributeDynArray;
begin
Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
with Result do
begin
// TODO 解析TagName和属性
if AText = '' then
Exit;
// 去掉两头的<
if AText[Low(AText)] = '<' then
AText := StrRight(AText, Length(AText) - 1);
if AText = '' then
Exit;
if AText[High(AText)] = '>' then
AText := StrLeft(AText, Length(AText) - 1);
// 检查是关闭节点,还是单个已经关闭的节点
if AText = '' then
Exit;
FClosed := AText[High(AText)] = '/';
FIsCloseTag := AText[LowStrIndex] = '/';
if FIsCloseTag then
AText := StrRight(AText, Length(AText) - 1);
if FClosed then
AText := StrLeft(AText, Length(AText) - 1);
//
_ParserNodeItem(AText, FTagName, Attrs);
for I := Low(Attrs) to High(Attrs) do
FAttributes.AddOrSetValue(LowerCase(Attrs[I].Key), ConvertEntities(Attrs[I].Value));
end;
end;
function CreateDocTypeElement(AOwner: THtmlElement; AText: string; ALine, ACol: Integer): THtmlElement;
begin
Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
with Result do
begin
FContent := ConvertEntities(AText);
FTagName := '#DOCTYPE';
FClosed := True;
if FContent = '' then
Exit;
if FContent[1] = '<' then
Delete(FContent, 1, 1);
if FContent = '' then
Exit;
if FContent[Length(FContent)] = '>' then
Delete(FContent, Length(FContent), 1);
FContent := Trim(Copy(Trim(FContent), 9, Length(FContent)));
end;
end;
procedure _ParserHTML(const Source: string; AElementList: THtmlElementList);
var
BeginLineNum, BeginColNum: Integer;
sc: TSourceContext;
function IsEndOfTag(TagName: string): Boolean;
begin
Result := false;
if sc.charOfCurrent[1] = '/' then
begin
Result := UpperCase(sc.subStr(sc.CodeIndex + 2, Length(TagName))) = UpperCase(TagName);
end;
end;
function PosCharInTag(AChar: Char): Boolean;
var
StrChar: Char;
begin
Result := false;
StrChar := #0;
while True do
begin
if sc.CurrentChar = #0 then
Break;
if sc.CurrentChar = '"' then
begin
if StrChar = #0 then
StrChar := sc.CurrentChar
else
StrChar := #0;
end;
if (sc.CurrentChar = AChar) and (StrChar = #0) then
begin
Result := True;
Break;
end;
sc.IncSrc;
end;
end;
function ParserStyleData(): string;
var
oldIndex: Integer;
begin
oldIndex := sc.CodeIndex;
if sc.subStr(4) = '<!--' then
begin
sc.IncSrc(5);
while True do
begin
if sc.CurrentChar = #0 then
DoError(Format('未完结的Style行:%d;列:%d;', [sc.LineNum, sc.ColNum]))
else if sc.CurrentChar = '>' then
begin
if (sc.charOfCurrent[-1] = '-') and (sc.charOfCurrent[-2] = '-') then
begin
sc.IncSrc;
sc.SkipBlank();
Break;
end;
end;
sc.IncSrc;
end;
end
else
while True do
begin
case sc.CurrentChar of
#0:
begin
Break;
end;
'<':
begin
if IsEndOfTag('style') then
begin
Break;
end;
end;
end;
sc.IncSrc;
end;
Result := sc.subStr(oldIndex, sc.CodeIndex - oldIndex);
end;
function ParserScriptData(): string;
var
oldIndex: Integer;
stringChar: Char;
PreIsblique: Boolean;
begin
oldIndex := sc.CodeIndex;
stringChar := #0;
sc.SkipBlank();
if sc.subStr(4) = '<!--' then
begin
sc.IncSrc(5);
while True do
begin
if sc.CurrentChar = #0 then
DoError(Format('未完结的Script行:%d;列:%d;', [sc.LineNum, sc.ColNum]))
else if sc.CurrentChar = '>' then
begin
if (sc.charOfCurrent[-1] = '-') and (sc.charOfCurrent[-2] = '-') then
begin
sc.IncSrc;
sc.SkipBlank();
Break;
end;
end;
sc.IncSrc;
end;
end
else
begin
while True do
begin
case sc.CurrentChar of
#0:
Break;
'"', '''': // 字符串
begin
stringChar := sc.CurrentChar;
PreIsblique := false;
sc.IncSrc();
while True do
begin
if sc.CurrentChar = #0 then
Break;
if (sc.CurrentChar = stringChar) and (not PreIsblique) then
Break;
if sc.CurrentChar = '\' then
PreIsblique := not PreIsblique
else
PreIsblique := false;
sc.IncSrc;
end;
end;
'/': // 注释
begin
sc.IncSrc();
case sc.CurrentChar of
'/': // 行注释
begin
while True do
begin
if CharInSet(sc.CurrentChar, [#0, #$0A]) then
begin
Break;
end;
sc.IncSrc();
end;
end;
'*': // 块注释
begin
sc.IncSrc();
sc.IncSrc();
while True do
begin
if sc.CurrentChar = #0 then
Break;
if (sc.CurrentChar = '/') and (sc.charOfCurrent[-1] = '*') then
begin
Break;
end;
sc.IncSrc();
end;
end;
end;
end;
'<':
begin
if IsEndOfTag('script') then
begin
Break;
end;
end;
end;
sc.IncSrc();
end;
end;
Result := sc.subStr(oldIndex, sc.CodeIndex - oldIndex)
end;
var
ElementType: (EtUnknow, EtTag, EtDocType, EtText, EtComment);
OldCodeIndex: Integer;
tmp: string;
Tag: THtmlElement;
begin
sc.setCode(Source);
while sc.CodeIndex <= high(sc.Code) do
begin
ElementType := EtUnknow;
OldCodeIndex := sc.CodeIndex;
BeginLineNum := sc.LineNum;
BeginColNum := sc.ColNum;
if sc.CurrentChar = #0 then
Break;
// "<"开头的就是Tag之类的
if sc.CurrentChar = '<' then
begin
sc.IncSrc;
if sc.CurrentChar = '!' then // 注释
begin
ElementType := EtComment;
sc.IncSrc;
case sc.CurrentChar of
'-': // <!-- -->
begin
sc.IncSrc; // -
while True do
begin
if not PosCharInTag('>') then
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法找到Tag结束点:' + sc.subStr(100))
else if (sc.charOfCurrent[-1] = '-') and (sc.charOfCurrent[-2] = '-') then
begin
sc.IncSrc;
Break;
end;
sc.IncSrc;
end;
end;
'[': // <![CDATA[.....]]>
begin
sc.IncSrc; //
while True do
begin
if not PosCharInTag('>') then
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法找到Tag结束点:' + sc.subStr(100))
else if (sc.charOfCurrent[-1] = ']') then
begin
sc.IncSrc;
Break;
end;
sc.IncSrc;
end;
end;
else // <!.....>
begin
if UpperCase(sc.PeekStr()) = 'DOCTYPE' then
begin
ElementType := EtDocType;
sc.IncSrc; //
if PosCharInTag('>') then
sc.IncSrc
else
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法找到Tag结束点:' + sc.subStr(100));
end
else
begin
sc.IncSrc; //
if PosCharInTag('>') then
sc.IncSrc
else
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法找到Tag结束点:' + sc.subStr(100));
end;
end;
end;
end
else if sc.CurrentChar = '?' then // <?...?> XML
begin
ElementType := EtComment;
sc.IncSrc; //
while True do
begin
if not PosCharInTag('>') then
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法找到Tag结束点:' + sc.subStr(100))
else if (sc.charOfCurrent[-1] = '?') then
begin
sc.IncSrc;
Break;
end;
sc.IncSrc;
end;
end
else // 正常节点
begin
ElementType := EtTag;
sc.IncSrc;
if PosCharInTag('>') then
sc.IncSrc
else
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法找到Tag结束点:' + sc.subStr(100));
end;
tmp := sc.subStr(OldCodeIndex, sc.CodeIndex - OldCodeIndex);
end
else // 不是"<"开头的 那就是纯文本节点
begin
ElementType := EtText;
while True do
begin
if CharInSet(sc.CurrentChar, [#0, '<']) then
Break;
sc.IncSrc;
end;
tmp := sc.subStr(OldCodeIndex, sc.CodeIndex - OldCodeIndex);
end;
//
// ShowMessage(sc.subStr(30));
case ElementType of
EtUnknow:
begin
DoError('LineNum:' + IntToStr(BeginLineNum) + '无法解析的内容:' + sc.subStr(100));
end;
EtDocType:
begin
Tag := CreateDocTypeElement(nil, tmp, BeginLineNum, BeginColNum);
AElementList.Add(Tag);
end;
EtTag:
begin
Tag := CreateTagElement(nil, tmp, BeginLineNum, BeginColNum);
AElementList.Add(Tag);
//
if (UpperCase(Tag.FTagName) = 'SCRIPT') and (not Tag.FIsCloseTag) and (not Tag.FClosed) then
begin
// 读取Script
BeginLineNum := sc.LineNum;
BeginColNum := sc.ColNum;
tmp := ParserScriptData();
Tag := CreateScriptElement(nil, tmp, BeginLineNum, BeginColNum);
AElementList.Add(Tag);
end
else if (UpperCase(Tag.FTagName) = 'STYLE') and (not Tag.FIsCloseTag) and (not Tag.FClosed) then
begin
// 读取Style
BeginLineNum := sc.LineNum;
BeginColNum := sc.ColNum;
tmp := ParserStyleData();
Tag := CreateStyleElement(nil, tmp, BeginLineNum, BeginColNum);
AElementList.Add(Tag);
end;
end;
EtText:
begin
Tag := CreateTextElement(nil, tmp, BeginLineNum, BeginColNum);
Tag.FSourceLine := BeginLineNum;
Tag.FSourceCol := BeginColNum;
AElementList.Add(Tag);
end;
EtComment:
begin
Tag := CreateCommentElement(nil, tmp, BeginLineNum, BeginColNum);
Tag.FSourceLine := BeginLineNum;
Tag.FSourceCol := BeginColNum;
AElementList.Add(Tag);
end;
end;
end;
//
//
end;
function BuildTree(ElementList: THtmlElementList): THtmlElement;
var
I, J: Integer;
E: THtmlElement;
T: THtmlElement;
FoundIndex: Integer;
TagProperty: WORD;
begin
Result := THtmlElement.Create(nil, '', 0, 0);
Result.FTagName := '#DOCUMENT';
Result.FClosed := false;
ElementList.Insert(0, Result);
I := 1;
while I < ElementList.Count do
begin
E := ElementList[I] as THtmlElement;
TagProperty := GetTagProperty(E.FTagName);
// 空节点,往下找,如果下一个带Tag的节点不是它的关闭节点,那么自动关闭
FoundIndex := -1;
if E.FIsCloseTag then
begin
for J := (I - 1) downto 0 do
begin
T := ElementList[J] as THtmlElement;
if (not T.FClosed) and (T.FTagName = E.FTagName) and (not T.FIsCloseTag) then
begin
FoundIndex := J;
Break;
end;
end;
// 如果往上找,找不到的话这个关闭Tag肯定是无意义的.
if FoundIndex > 0 then
begin
for J := (I - 1) downto FoundIndex do
begin
T := ElementList[J] as THtmlElement;
T.FClosed := True;
end;
(ElementList[FoundIndex] as THtmlElement).FCloseTag := E;
end
else
begin
E.Free;
end;
ElementList.Delete(I);
Continue;
end
else
begin
for J := (I - 1) downto 0 do
begin
T := ElementList[J] as THtmlElement;
if not T.FClosed then
begin
if ((GetTagProperty(T.FTagName) and tpEmpty) <> 0) then
T.FClosed := True
else
begin
T.FChildren.Add(E);
E.FOwner := T;
Break;
end;
end;
end;
end;
Inc(I);
end;
Result.FClosed := True;
end;
function ParserHTML(const Source: WideString): IHtmlElement; stdcall;
var
ElementList: THtmlElementList;
begin
ElementList := THtmlElementList.Create;
_ParserHTML(Source, ElementList);
Result := BuildTree(ElementList);
ElementList.Free;
end;
{$REGION '转换表之类的'}
var
gEntities: TStringDictionary;
type
TEntityItem = record
Key: string;
Value: WideChar;
end;
const
EntityTable: array[0..252 - 1] of TEntityItem = ((
Key: ' ';
Value: WideChar(160)
), (
Key: '¡';
Value: WideChar(161)
), (
Key: '¢';
Value: WideChar(162)
), (
Key: '£';
Value: WideChar(163)
), (
Key: '¤';
Value: WideChar(164)
), (
Key: '¥';
Value: WideChar(165)
), (
Key: '¦';
Value: WideChar(166)
), (
Key: '§';
Value: WideChar(167)
), (
Key: '¨';
Value: WideChar(168)
), (
Key: '©';
Value: WideChar(169)
), (
Key: 'ª';
Value: WideChar(170)
), (
Key: '«';
Value: WideChar(171)
), (
Key: '¬';
Value: WideChar(172)
), (
Key: '­';
Value: WideChar(173)
), (
Key: '®';
Value: WideChar(174)
), (
Key: '¯';
Value: WideChar(175)
), (
Key: '°';
Value: WideChar(176)
), (
Key: '±';
Value: WideChar(177)
), (
Key: '²';
Value: WideChar(178)
), (
Key: '³';
Value: WideChar(179)
), (
Key: '´';
Value: WideChar(180)
), (
Key: 'µ';
Value: WideChar(181)
), (
Key: '¶';
Value: WideChar(182)
), (
Key: '·';
Value: WideChar(183)
), (
Key: '¸';
Value: WideChar(184)
), (
Key: '¹';
Value: WideChar(185)
), (
Key: 'º';
Value: WideChar(186)
), (
Key: '»';
Value: WideChar(187)
), (
Key: '¼';
Value: WideChar(188)
), (
Key: '½';
Value: WideChar(189)
), (
Key: '¾';
Value: WideChar(190)
), (
Key: '¿';
Value: WideChar(191)
), (
Key: 'À';
Value: WideChar(192)
), (
Key: 'Á';
Value: WideChar(193)
), (
Key: 'Â';
Value: WideChar(194)
), (
Key: 'Ã';
Value: WideChar(195)
), (
Key: 'Ä';
Value: WideChar(196)
), (
Key: 'Å';
Value: WideChar(197)
), (
Key: 'Æ';
Value: WideChar(198)
), (
Key: 'Ç';
Value: WideChar(199)
), (
Key: 'È';
Value: WideChar(200)
), (
Key: 'É';
Value: WideChar(201)
), (
Key: 'Ê';
Value: WideChar(202)
), (
Key: 'Ë';
Value: WideChar(203)
), (
Key: 'Ì';
Value: WideChar(204)
), (
Key: 'Í';
Value: WideChar(205)
), (
Key: 'Î';
Value: WideChar(206)
), (
Key: 'Ï';
Value: WideChar(207)
), (
Key: 'Ð';
Value: WideChar(208)
), (
Key: 'Ñ';
Value: WideChar(209)
), (
Key: 'Ò';
Value: WideChar(210)
), (
Key: 'Ó';
Value: WideChar(211)
), (
Key: 'Ô';
Value: WideChar(212)
), (
Key: 'Õ';
Value: WideChar(213)
), (
Key: 'Ö';
Value: WideChar(214)
), (
Key: '×';
Value: WideChar(215)
), (
Key: 'Ø';
Value: WideChar(216)
), (
Key: 'Ù';
Value: WideChar(217)
), (
Key: 'Ú';
Value: WideChar(218)
), (
Key: 'Û';
Value: WideChar(219)
), (
Key: 'Ü';
Value: WideChar(220)
), (
Key: 'Ý';
Value: WideChar(221)
), (
Key: 'Þ';
Value: WideChar(222)
), (
Key: 'ß';
Value: WideChar(223)
), (
Key: 'à';
Value: WideChar(224)
), (
Key: 'á';
Value: WideChar(225)
), (
Key: 'â';
Value: WideChar(226)
), (
Key: 'ã';
Value: WideChar(227)
), (
Key: 'ä';
Value: WideChar(228)
), (
Key: 'å';
Value: WideChar(229)
), (
Key: 'æ';
Value: WideChar(230)
), (
Key: 'ç';
Value: WideChar(231)
), (
Key: 'è';
Value: WideChar(232)
), (
Key: 'é';
Value: WideChar(233)
), (
Key: 'ê';
Value: WideChar(234)
), (
Key: 'ë';
Value: WideChar(235)
), (
Key: 'ì';
Value: WideChar(236)
), (
Key: 'í';
Value: WideChar(237)
), (
Key: 'î';
Value: WideChar(238)
), (
Key: 'ï';
Value: WideChar(239)
), (
Key: 'ð';
Value: WideChar(240)
), (
Key: 'ñ';
Value: WideChar(241)
), (
Key: 'ò';
Value: WideChar(242)
), (
Key: 'ó';
Value: WideChar(243)
), (
Key: 'ô';
Value: WideChar(244)
), (
Key: 'õ';
Value: WideChar(245)
), (
Key: 'ö';
Value: WideChar(246)
), (
Key: '÷';
Value: WideChar(247)
), (
Key: 'ø';
Value: WideChar(248)
), (
Key: 'ù';
Value: WideChar(249)
), (
Key: 'ú';
Value: WideChar(250)
), (
Key: 'û';
Value: WideChar(251)
), (
Key: 'ü';
Value: WideChar(252)
), (
Key: 'ý';
Value: WideChar(253)
), (
Key: 'þ';
Value: WideChar(254)
), (
Key: 'ÿ';
Value: WideChar(255)
), (
Key: 'ƒ';
Value: WideChar(402)
), (
Key: 'Α';
Value: WideChar(913)
), (
Key: 'Β';
Value: WideChar(914)
), (
Key: 'Γ';
Value: WideChar(915)
), (
Key: 'Δ';
Value: WideChar(916)
), (
Key: 'Ε';
Value: WideChar(917)
), (
Key: 'Ζ';
Value: WideChar(918)
), (
Key: 'Η';
Value: WideChar(919)
), (
Key: 'Θ';
Value: WideChar(920)
), (
Key: 'Ι';
Value: WideChar(921)
), (
Key: 'Κ';
Value: WideChar(922)
), (
Key: 'Λ';
Value: WideChar(923)
), (
Key: 'Μ';
Value: WideChar(924)
), (
Key: 'Ν';
Value: WideChar(925)
), (
Key: 'Ξ';
Value: WideChar(926)
), (
Key: 'Ο';
Value: WideChar(927)
), (
Key: 'Π';
Value: WideChar(928)
), (
Key: 'Ρ';
Value: WideChar(929)
), (
Key: 'Σ';
Value: WideChar(931)
), (
Key: 'Τ';
Value: WideChar(932)
), (
Key: 'Υ';
Value: WideChar(933)
), (
Key: 'Φ';
Value: WideChar(934)
), (
Key: 'Χ';
Value: WideChar(935)
), (
Key: 'Ψ';
Value: WideChar(936)
), (
Key: 'Ω';
Value: WideChar(937)
), (
Key: 'α';
Value: WideChar(945)
), (
Key: 'β';
Value: WideChar(946)
), (
Key: 'γ';
Value: WideChar(947)
), (
Key: 'δ';
Value: WideChar(948)
), (
Key: 'ε';
Value: WideChar(949)
), (
Key: 'ζ';
Value: WideChar(950)
), (
Key: 'η';
Value: WideChar(951)
), (
Key: 'θ';
Value: WideChar(952)
), (
Key: 'ι';
Value: WideChar(953)
), (
Key: 'κ';
Value: WideChar(954)
), (
Key: 'λ';
Value: WideChar(955)
), (
Key: 'μ';
Value: WideChar(956)
), (
Key: 'ν';
Value: WideChar(957)
), (
Key: 'ξ';
Value: WideChar(958)
), (
Key: 'ο';
Value: WideChar(959)
), (
Key: 'π';
Value: WideChar(960)
), (
Key: 'ρ';
Value: WideChar(961)
), (
Key: 'ς';
Value: WideChar(962)
), (
Key: 'σ';
Value: WideChar(963)
), (
Key: 'τ';
Value: WideChar(964)
), (
Key: 'υ';
Value: WideChar(965)
), (
Key: 'φ';
Value: WideChar(966)
), (
Key: 'χ';
Value: WideChar(967)
), (
Key: 'ψ';
Value: WideChar(968)
), (
Key: 'ω';
Value: WideChar(969)
), (
Key: 'ϑ';
Value: WideChar(977)
), (
Key: 'ϒ';
Value: WideChar(978)
), (
Key: 'ϖ';
Value: WideChar(982)
), (
Key: '•';
Value: WideChar(8226)
), (
Key: '…';
Value: WideChar(8230)
), (
Key: '′';
Value: WideChar(8242)
), (
Key: '″';
Value: WideChar(8243)
), (
Key: '‾';
Value: WideChar(8254)
), (
Key: '⁄';
Value: WideChar(8260)
), (
Key: '℘';
Value: WideChar(8472)
), (
Key: 'ℑ';
Value: WideChar(8465)
), (
Key: 'ℜ';
Value: WideChar(8476)
), (
Key: '™';
Value: WideChar(8482)
), (
Key: 'ℵ';
Value: WideChar(8501)
), (
Key: '←';
Value: WideChar(8592)
), (
Key: '↑';
Value: WideChar(8593)
), (
Key: '→';
Value: WideChar(8594)
), (
Key: '↓';
Value: WideChar(8595)
), (
Key: '↔';
Value: WideChar(8596)
), (
Key: '↵';
Value: WideChar(8629)
), (
Key: '⇐';
Value: WideChar(8656)
), (
Key: '⇑';
Value: WideChar(8657)
), (
Key: '⇒';
Value: WideChar(8658)
), (
Key: '⇓';
Value: WideChar(8659)
), (
Key: '⇔';
Value: WideChar(8660)
), (
Key: '∀';
Value: WideChar(8704)
), (
Key: '∂';
Value: WideChar(8706)
), (
Key: '∃';
Value: WideChar(8707)
), (
Key: '∅';
Value: WideChar(8709)
), (
Key: '∇';
Value: WideChar(8711)
), (
Key: '∈';
Value: WideChar(8712)
), (
Key: '∉';
Value: WideChar(8713)
), (
Key: '∋';
Value: WideChar(8715)
), (
Key: '∏';
Value: WideChar(8719)
), (
Key: '∑';
Value: WideChar(8721)
), (
Key: '−';
Value: WideChar(8722)
), (
Key: '∗';
Value: WideChar(8727)
), (
Key: '√';
Value: WideChar(8730)
), (
Key: '∝';
Value: WideChar(8733)
), (
Key: '∞';
Value: WideChar(8734)
), (
Key: '∠';
Value: WideChar(8736)
), (
Key: '∧';
Value: WideChar(8743)
), (
Key: '∨';
Value: WideChar(8744)
), (
Key: '∩';
Value: WideChar(8745)
), (
Key: '∪';
Value: WideChar(8746)
), (
Key: '∫';
Value: WideChar(8747)
), (
Key: '∴';
Value: WideChar(8756)
), (
Key: '∼';
Value: WideChar(8764)
), (
Key: '≅';
Value: WideChar(8773)
), (
Key: '≈';
Value: WideChar(8776)
), (
Key: '≠';
Value: WideChar(8800)
), (
Key: '≡';
Value: WideChar(8801)
), (
Key: '≤';
Value: WideChar(8804)
), (
Key: '≥';
Value: WideChar(8805)
), (
Key: '⊂';
Value: WideChar(8834)
), (
Key: '⊃';
Value: WideChar(8835)
), (
Key: '⊄';
Value: WideChar(8836)
), (
Key: '⊆';
Value: WideChar(8838)
), (
Key: '⊇';
Value: WideChar(8839)
), (
Key: '⊕';
Value: WideChar(8853)
), (
Key: '⊗';
Value: WideChar(8855)
), (
Key: '⊥';
Value: WideChar(8869)
), (
Key: '⋅';
Value: WideChar(8901)
), (
Key: '⌈';
Value: WideChar(8968)
), (
Key: '⌉';
Value: WideChar(8969)
), (
Key: '⌊';
Value: WideChar(8970)
), (
Key: '⌋';
Value: WideChar(8971)
), (
Key: '⟨';
Value: WideChar(9001)
), (
Key: '⟩';
Value: WideChar(9002)
), (
Key: '◊';
Value: WideChar(9674)
), (
Key: '♠';
Value: WideChar(9824)
), (
Key: '♣';
Value: WideChar(9827)
), (
Key: '♥';
Value: WideChar(9829)
), (
Key: '♦';
Value: WideChar(9830)
), (
Key: '"';
Value: WideChar(34)
), (
Key: '&';
Value: WideChar(38)
), (
Key: '<';
Value: WideChar(60)
), (
Key: '>';
Value: WideChar(62)
), (
Key: 'Œ';
Value: WideChar(338)
), (
Key: 'œ';
Value: WideChar(339)
), (
Key: 'Š';
Value: WideChar(352)
), (
Key: 'š';
Value: WideChar(353)
), (
Key: 'Ÿ';
Value: WideChar(376)
), (
Key: 'ˆ';
Value: WideChar(710)
), (
Key: '˜';
Value: WideChar(732)
), (
Key: ' ';
Value: WideChar(8194)
), (
Key: ' ';
Value: WideChar(8195)
), (
Key: ' ';
Value: WideChar(8201)
), (
Key: '‌';
Value: WideChar(8204)
), (
Key: '‍';
Value: WideChar(8205)
), (
Key: '‎';
Value: WideChar(8206)
), (
Key: '‏';
Value: WideChar(8207)
), (
Key: '–';
Value: WideChar(8211)
), (
Key: '—';
Value: WideChar(8212)
), (
Key: '‘';
Value: WideChar(8216)
), (
Key: '’';
Value: WideChar(8217)
), (
Key: '‚';
Value: WideChar(8218)
), (
Key: '“';
Value: WideChar(8220)
), (
Key: '”';
Value: WideChar(8221)
), (
Key: '„';
Value: WideChar(8222)
), (
Key: '†';
Value: WideChar(8224)
), (
Key: '‡';
Value: WideChar(8225)
), (
Key: '‰';
Value: WideChar(8240)
), (
Key: '‹';
Value: WideChar(8249)
), (
Key: '›';
Value: WideChar(8250)
), (
Key: '€';
Value: WideChar(8364)
));
function HexToChar(Value: string): Char;
var
I: Integer;
W: WORD;
begin
W := 0;
for I := Low(Value) to High(Value) do
begin
case Value[I] of
'0'..'9':
W := (W shl 4) or (ord(Value[I]) - ord('0'));
'a'..'f':
W := (W shl 4) or (ord(Value[I]) - ord('a') + 10);
'A'..'F':
W := (W shl 4) or (ord(Value[I]) - ord('A') + 10);
else
W := 0;
end;
end;
Result := Char(W);
end;
function DecToChar(Value: string): Char;
var
I: Integer;
W: WORD;
begin
W := 0;
for I := Low(Value) to High(Value) do
begin
case Value[I] of
'0'..'9':
W := 10 * W + (ord(Value[I]) - ord('0'));
else
W := 0;
end;
end;
Result := Char(W);
end;
function ConvertEntities(S: string): string;
var
tmp: string;
I, p: Integer;
Sb: TStringBuilder;
begin
if Length(S) <= 3 then
Exit(S);
if Pos('&#', S) > 0 then
begin
S[low(S)] := S[low(S)];
end;
Sb := TStringBuilder.Create;
I := 0;
while I < Length(S) do
begin
if S.Chars[I] = '&' then
begin
p := S.IndexOf(';', I);
if p >= 0 then
begin
tmp := LowerCase(S.Substring(I, p - I + 1));
if (Length(tmp) > 2) and (tmp.Chars[1] = '#') then
begin
if (Length(tmp) > 3) and (tmp.Chars[2] = '$') then
Sb.Append(HexToChar(tmp.Substring(3, Length(tmp) - 4)))
else
Sb.Append(DecToChar(tmp.Substring(2, Length(tmp) - 3)));
end
else if gEntities.ContainsKey(tmp) then
Sb.Append(gEntities[tmp])
else
Sb.Append(tmp);
Inc(I, Length(tmp));
end
else
begin
Sb.Append(S.Chars[I]);
Inc(I);
end;
end
else
begin
Sb.Append(S.Chars[I]);
Inc(I);
end;
end;
Result := Sb.ToString;
FreeAndNil(Sb);
end;
function ConvertWhiteSpace(S: string): string;
var
Sb: TStringBuilder;
I: Integer;
PreIssWhite, ThisIsWhite: Boolean;
begin
Sb := TStringBuilder.Create;
PreIssWhite := false;
for I := Low(S) to High(S) do
begin
ThisIsWhite := CharInSet(S[I], WhiteSpace);
if ThisIsWhite then
begin
if not PreIssWhite then
Sb.Append(S[I]);
PreIssWhite := True;
end
else
begin
Sb.Append(S[I]);
PreIssWhite := false;
end;
end;
Result := Sb.ToString;
Sb.Free;
end;
const
BlockTags: array[0..59 - 1] of string = ('HTML', 'HEAD', 'BODY', 'FRAMESET', 'SCRIPT', 'NOSCRIPT', 'STYLE', 'META', 'LINK', 'TITLE', 'FRAME', 'NOFRAMES', 'SECTION', 'NAV', 'ASIDE', 'HGROUP', 'HEADER', 'FOOTER', 'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'UL', 'OL', 'PRE', 'DIV', 'BLOCKQUOTE', 'HR', 'ADDRESS', 'FIGURE', 'FIGCAPTION', 'FORM', 'FIELDSET', 'INS', 'DEL', 'S', 'DL', 'DT', 'DD', 'LI', 'TABLE', 'CAPTION', 'THEAD', 'TFOOT', 'TBODY', 'COLGROUP', 'COL', 'TR', 'TH', 'TD', 'VIDEO', 'AUDIO', 'CANVAS', 'DETAILS', 'MENU', 'PLAINTEXT');
InlineTags: array[0..56 - 1] of string = ('OBJECT', 'BASE', 'FONT', 'TT', 'I', 'B', 'U', 'BIG', 'SMALL', 'EM', 'STRONG', 'DFN', 'CODE', 'SAMP', 'KBD', 'VAR', 'CITE', 'ABBR', 'TIME', 'ACRONYM', 'MARK', 'RUBY', 'RT', 'RP', 'A', 'IMG', 'BR', 'WBR', 'MAP', 'Q', 'SUB', 'SUP', 'BDO', 'IFRAME', 'EMBED', 'SPAN', 'INPUT', 'SELECT', 'TEXTAREA', 'LABEL', 'BUTTON', 'OPTGROUP', 'OPTION', 'LEGEND', 'DATALIST', 'KEYGEN', 'OUTPUT', 'PROGRESS', 'METER', 'AREA', 'PARAM', 'SOURCE', 'TRACK', 'SUMMARY', 'COMMAND', 'DEVICE');
EmptyTags: array[0..14 - 1] of string = ('META', 'LINK', 'BASE', 'FRAME', 'IMG', 'BR', 'WBR', 'EMBED', 'HR', 'INPUT', 'KEYGEN', 'COL', 'COMMAND', 'DEVICE');
FormatAsInlineTags: array[0..19 - 1] of string = ('TITLE', 'A', 'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'PRE', 'ADDRESS', 'LI', 'TH', 'TD', 'SCRIPT', 'STYLE', 'INS', 'DEL', 'S');
PreserveWhitespaceTags: array[0..4 - 1] of string = ('PRE', 'PLAINTEXT', 'TITLE', 'TEXTAREA');
var
gTagProperty: TPropDictionary;
function GetTagProperty(const TagName: string): WORD;
var
Key, S: string;
begin
Result := 0;
Key := UpperCase(TagName);
if gTagProperty.ContainsKey(Key) then
Result := gTagProperty[UpperCase(TagName)]
else
Exit;
end;
function ParserCSSSelector(const Value: string): TCSSSelectorItemGroup;
var
sc: TSourceContext;
function AddAttr(var Item: TCSSSelectorItem): PAttrSelectorItem;
begin
SetLength(Item.Attributes, Length(Item.Attributes) + 1);
Result := @Item.Attributes[Length(Item.Attributes) - 1];
end;
function ParserAttr(): TAttrSelectorItem;
var
oldIndex: Integer;
tmp: string;
stringChar: Char;
begin
sc.IncSrc(); // [
Result.Key := '';
Result.AttrOperator := aoEqual;
Result.Value := '';
// Key
sc.SkipBlank();
oldIndex := sc.CodeIndex;
while not CharInSet(sc.CurrentChar, (WhiteSpace + OperatorChar + [']', #0])) do
sc.IncSrc();
Result.Key := sc.subStr(oldIndex, sc.CodeIndex - oldIndex);
Result.Key := LowerCase(Result.Key);
// Operator
sc.SkipBlank();
oldIndex := sc.CodeIndex;
case sc.CurrentChar of
'=', '!', '*', '~', '|', '^', '$':
begin
sc.IncSrc;
if sc.CurrentChar = '=' then
sc.IncSrc;
end;
']':
begin
Result.AttrOperator := aoExist;
sc.IncSrc;
Exit;
end;
else
begin
DoError(Format('无法解析CSS Attribute操作符[%d,%d]', [sc.LineNum, sc.ColNum]));
end;
end;
tmp := sc.subStr(oldIndex, sc.CodeIndex - oldIndex);
if Length(tmp) >= 1 then
begin
case tmp[LowStrIndex] of
'=':
Result.AttrOperator := aoEqual;
'!':
Result.AttrOperator := aoNotEqual;
'*':
Result.AttrOperator := aoContain;
'~':
Result.AttrOperator := aoIncludeWord;
'|':
Result.AttrOperator := aoBeginWord;
'^':
Result.AttrOperator := aoBegin;
'$':
Result.AttrOperator := aoEnd;
end;
end;
// Value
sc.SkipBlank();
oldIndex := sc.CodeIndex;
if CharInSet(sc.CurrentChar, ['"', '''']) then
stringChar := sc.CurrentChar
else
stringChar := #0;
sc.IncSrc();
while True do
begin
if stringChar = #0 then
begin
if CharInSet(sc.CurrentChar, (WhiteSpace + [#0, ']'])) then
Break;
end
else if (sc.CurrentChar = stringChar) then
begin
sc.IncSrc();
Break;
end;
sc.IncSrc();
end;
Result.Value := sc.subStr(oldIndex, sc.CodeIndex - oldIndex);
// SetString(Result.Value, oldP, P - oldP);
if (stringChar <> #0) and (Length(Result.Value) >= 2) then
Result.Value := Copy(Result.Value, 2, Length(Result.Value) - 2);
Result.Value := ConvertEntities(Result.Value);
//
sc.SkipBlank();
if sc.CurrentChar = ']' then
sc.IncSrc
else
DoError(Format('无法解析Attribute值[%d,%d]', [sc.LineNum, sc.ColNum]));
end;
procedure ParserItem(var Item: TCSSSelectorItem);
var
tmp: string;
pAttr: PAttrSelectorItem;
begin
sc.SkipBlank();
while True do
begin
case sc.CurrentChar of
#0, ',', ' ':
Break;
'.': // class
begin
sc.IncSrc();
pAttr := AddAttr(Item);
pAttr^.Key := 'class';
pAttr^.AttrOperator := aoIncludeWord;
pAttr^.Value := sc.ReadStr((WhiteSpace + OperatorChar + ['[', ']', '"', '''', ',', '.', '#', #0]));
end;
'#': // id
begin
sc.IncSrc();
pAttr := AddAttr(Item);
pAttr^.Key := 'id';
pAttr^.AttrOperator := aoEqual;
pAttr^.Value := sc.ReadStr((WhiteSpace + OperatorChar + ['[', ']', '"', '''', ',', '.', '#', #0]));
end;
'[': // attribute
begin
pAttr := AddAttr(Item);
pAttr^ := ParserAttr();
end;
'/':
begin
sc.IncSrc();
if sc.CurrentChar = '*' then // /**/
begin
sc.IncSrc();
sc.IncSrc();
while True do
begin
if (sc.CurrentChar = '/') and (sc.charOfCurrent[-1] = '*') then
begin
sc.IncSrc;
Break;
end;
sc.IncSrc;
end;
end;
end;
else
begin
Item.szTag := UpperCase(sc.ReadStr((WhiteSpace + ['[', ']', '"', '''', ',', '.', '#', #0])));
end;
end;
end;
end;
function AddItems(var Group: TCSSSelectorItemGroup): PCSSSelectorItems;
begin
SetLength(Group, Length(Group) + 1);
Result := @Group[Length(Group) - 1];
end;
function AddItem(var Items: TCSSSelectorItems): PCSSSelectorItem;
begin
SetLength(Items, Length(Items) + 1);
Result := @Items[Length(Items) - 1];
Result^.Relation := sirNONE;
end;
var
Tag: string;
pitems: PCSSSelectorItems;
pItem: PCSSSelectorItem;
begin
sc.setCode(Value);
//
pitems := AddItems(Result);
pItem := AddItem(pitems^);
while True do
begin
sc.SkipBlank;
ParserItem(pItem^);
sc.SkipBlank;
case sc.CurrentChar of
',':
begin
sc.IncSrc();
pitems := AddItems(Result);
pItem := AddItem(pitems^);
end;
'>':
begin
sc.IncSrc();
pItem := AddItem(pitems^);
pItem^.Relation := sirChildren;
end;
'+':
begin
sc.IncSrc();
pItem := AddItem(pitems^);
pItem^.Relation := sirYoungerBrother;
end;
'~':
begin
sc.IncSrc();
pItem := AddItem(pitems^);
pItem^.Relation := sirAllYoungerBrother;
end;
#0:
Break;
else
begin
pItem := AddItem(pitems^);
pItem^.Relation := sirDescendant;
end;
end;
end;
end;
procedure Init();
var
I: Integer;
Key: string;
S: WORD;
begin
gEntities := TStringDictionary.Create();
gTagProperty := TPropDictionary.Create;
for I := low(EntityTable) to high(EntityTable) do
begin
gEntities.Add(EntityTable[I].Key, EntityTable[I].Value);
end;
//
for I := low(BlockTags) to high(BlockTags) do
gTagProperty.AddOrSetValue(BlockTags[I], tpBlock);
for I := low(InlineTags) to high(InlineTags) do
gTagProperty.AddOrSetValue(InlineTags[I], tpInline);
for I := low(EmptyTags) to high(EmptyTags) do
begin
Key := EmptyTags[I];
if gTagProperty.ContainsKey(Key) then
S := gTagProperty[Key]
else
S := 0;
S := S or tpEmpty;
gTagProperty.AddOrSetValue(Key, S);
end;
for I := low(FormatAsInlineTags) to high(FormatAsInlineTags) do
begin
Key := FormatAsInlineTags[I];
if gTagProperty.ContainsKey(Key) then
S := gTagProperty[Key]
else
S := 0;
S := S or tpFormatAsInline;
gTagProperty.AddOrSetValue(Key[I], S);
end;
for I := low(PreserveWhitespaceTags) to high(PreserveWhitespaceTags) do
begin
Key := PreserveWhitespaceTags[I];
if gTagProperty.ContainsKey(Key) then
S := gTagProperty[Key]
else
S := 0;
S := S or tpPreserveWhitespace;
gTagProperty.AddOrSetValue(PreserveWhitespaceTags[I], S);
end;
end;
procedure UnInit();
begin
gTagProperty.Free;
gEntities.Free;
end;
{$ENDREGION '转换表之类的'}
{ TIHtmlElementList }
function TIHtmlElementList.Add(Value: IHtmlElement): Integer;
begin
Result := FList.Add(Value);
end;
procedure TIHtmlElementList.Clear;
begin
FList.Clear;
end;
constructor TIHtmlElementList.Create;
begin
inherited Create;
FList := TList<IHtmlElement>.Create;
end;
procedure TIHtmlElementList.Delete(Index: Integer);
begin
FList.Delete(Index);
end;
destructor TIHtmlElementList.Destroy;
begin
FList.Free;
inherited Destroy;
end;
function TIHtmlElementList.GetCount: Integer;
begin
Result := FList.Count;
end;
function TIHtmlElementList.GetItems(Index: Integer): IHtmlElement;
begin
Result := FList[Index];
end;
function TIHtmlElementList.IndexOf(Item: IHtmlElement): Integer;
begin
Result := FList.IndexOf(Item)
end;
procedure TIHtmlElementList.SetItems(Index: Integer; const Value: IHtmlElement);
begin
FList[Index] := Value;
end;
{ THtmlElement }
constructor THtmlElement.Create(AOwner: THtmlElement; AText: string; ALine, ACol: Integer);
begin
inherited Create;
FAttributes := TStringDictionary.Create();
FChildren := TIHtmlElementList.Create;
FOwner := AOwner;
FOrignal := AText;
FSourceLine := ALine;
FSourceCol := ACol;
end;
destructor THtmlElement.Destroy;
begin
FChildren.Free;
FAttributes.Free;
inherited Destroy;
end;
function THtmlElement.EnumAttributeNames(Index: Integer): WideString;
var
Attrs: TStringDynArray;
begin
Result := '';
Attrs := FAttributes.Keys.ToArray;
if Index < Length(Attrs) then
Result := Attrs[Index];
end;
function THtmlElement.GetAttributes(Key: WideString): WideString;
begin
Result := '';
Key := LowerCase(Key);
if FAttributes.ContainsKey(Key) then
Result := FAttributes[Key];
end;
function THtmlElement.GetChildren(Index: Integer): IHtmlElement;
begin
Result := FChildren[Index];
end;
function THtmlElement.GetChildrenCount: Integer;
begin
Result := FChildren.Count;
end;
function THtmlElement.GetCloseTag: IHtmlElement;
begin
Result := FCloseTag;
end;
function THtmlElement.GetContent: WideString;
begin
Result := FContent;
end;
procedure THtmlElement._GetHtml(IncludeSelf: Boolean; Sb: TStringBuilder);
var
I: Integer;
E: THtmlElement;
begin
if IncludeSelf then
Sb.Append(FOrignal);
for I := 0 to FChildren.Count - 1 do
begin
E := FChildren[I] as THtmlElement;
E._GetHtml(True, Sb);
end;
if IncludeSelf and (FCloseTag <> nil) then
(FCloseTag as THtmlElement)._GetHtml(True, Sb);
end;
procedure THtmlElement._GetText(IncludeSelf: Boolean; Sb: TStringBuilder);
var
I: Integer;
E: THtmlElement;
begin
if IncludeSelf and (FTagName = '#TEXT') then
begin
Sb.Append(FContent);
end;
for I := 0 to FChildren.Count - 1 do
begin
E := FChildren[I] as THtmlElement;
E._GetText(True, Sb);
end;
end;
procedure THtmlElement._Select(Item: PCSSSelectorItem; Count: Integer; r: TIHtmlElementList; OnlyTopLevel: Boolean);
function _Filtered(): Boolean;
var
I: Integer;
begin
Result := false;
if (Item^.szTag = '') or (Item^.szTag = '*') or (Item^.szTag = FTagName) then
begin
for I := Low(Item^.Attributes) to High(Item^.Attributes) do
if not AttrCompareFuns[Item^.Attributes[I].AttrOperator](Item^.Attributes[I], Self) then
Exit;
Result := True;
end;
end;
var
f: Boolean;
I, SelfIndex: Integer;
PE, E: THtmlElement;
Next: PCSSSelectorItem;
begin
// ShowMessage(item^.szTag);
// ShowMessage(item^.Attributes[0].Key + ' ' + item^.Attributes[0].Value);
f := _Filtered();
if f then
begin
if (Count = 1) then
begin
if (r.IndexOf(Self as IHtmlElement) < 0) then
r.Add(Self as IHtmlElement);
end
else if Count > 1 then
begin
Next := Item;
Inc(Next);
PE := Self.FOwner;
if PE = nil then
SelfIndex := -1
else
SelfIndex := PE.FChildren.IndexOf(Self as IHtmlElement);
case Next^.Relation of
sirDescendant, sirChildren:
begin
for I := 0 to FChildren.Count - 1 do
begin
E := FChildren[I] as THtmlElement;
E._Select(Next, Count - 1, r, Next^.Relation = sirChildren);
end;
end;
sirAllYoungerBrother, sirYoungerBrother:
begin
if (PE <> nil) and (SelfIndex >= 0) then
for I := (SelfIndex + 1) to PE.FChildren.Count - 1 do
begin
E := PE.FChildren[I] as THtmlElement;
if (Length(E.FTagName) = 0) or (E.FTagName[LowStrIndex] <> '#') then
begin
E._Select(Next, Count - 1, r, True);
if (Next^.Relation = sirYoungerBrother) then
Break;
end;
end;
end;
end;
end;
end;
if not OnlyTopLevel then
for I := 0 to FChildren.Count - 1 do
begin
E := FChildren[I] as THtmlElement;
E._Select(Item, Count, r);
end;
end;
procedure THtmlElement._SimpleCSSSelector(const ItemGroup: TCSSSelectorItemGroup; r: TIHtmlElementList);
var
I: Integer;
begin
for I := Low(ItemGroup) to High(ItemGroup) do
begin
_Select(@ItemGroup[I][0], Length(ItemGroup[I]), r);
end;
end;
function THtmlElement.GetInnerHtml: WideString;
var
Sb: TStringBuilder;
begin
Sb := TStringBuilder.Create;
_GetHtml(false, Sb);
Result := Sb.ToString;
Sb.Free;
end;
function THtmlElement.GetInnerText: WideString;
var
Sb: TStringBuilder;
begin
Sb := TStringBuilder.Create;
_GetText(True, Sb);
Result := Sb.ToString;
Sb.Free;
end;
function THtmlElement.GetOrignal: WideString;
begin
Result := FOrignal;
end;
function THtmlElement.GetOuterHtml: WideString;
var
Sb: TStringBuilder;
begin
Sb := TStringBuilder.Create;
_GetHtml(True, Sb);
Result := Sb.ToString;
Sb.Free;
end;
function THtmlElement.GetSourceColNum: Integer;
begin
Result := FSourceCol;
end;
function THtmlElement.GetSourceLineNum: Integer;
begin
Result := FSourceLine;
end;
function THtmlElement.GetTagName: WideString;
begin
Result := FTagName;
end;
function THtmlElement.HasAttribute(AttributeName: WideString): Boolean;
begin
Result := FAttributes.ContainsKey(LowerCase(AttributeName));
end;
function THtmlElement.SimpleCSSSelector(const selector: WideString): IHtmlElementList;
var
r: TIHtmlElementList;
begin
r := TIHtmlElementList.Create;
_SimpleCSSSelector(ParserCSSSelector(selector), r);
Result := r as IHtmlElementList;
end;
{ TSourceContext }
function TSourceContext.subStr(Index, Count: Integer): string;
begin
Result := System.Copy(Code, Index{$IF (LowStrIndex = 0)} + 1{$ENDIF}, Count);
end;
function TSourceContext.subStr(Count: Integer): string;
begin
Result := subStr(CodeIndex, Count);
end;
function TSourceContext.ReadStr(UntilChars: TSysCharSet): string;
var
oldIndex: Integer;
stringChar: Char;
begin
SkipBlank;
oldIndex := CodeIndex;
if CharInSet(CurrentChar, ['"', '''']) then
stringChar := CurrentChar
else
stringChar := #0;
IncSrc;
while True do
begin
if stringChar = #0 then
begin
if CharInSet(CurrentChar, UntilChars) then
Break;
end
else if (CurrentChar = stringChar) then
begin
IncSrc;
Break;
end;
IncSrc;
end;
Result := subStr(oldIndex, CodeIndex - oldIndex);
if (stringChar <> #0) and (Length(Result) >= 2) then
Result := System.Copy(Result, 2, Length(Result) - 2);
end;
function TSourceContext.GetCharOfCurrent(Index: Integer): Char;
begin
Result := Code[CodeIndex + Index];
end;
procedure TSourceContext.IncSrc;
begin
if CurrentChar = #10 then
begin
Inc(LineNum);
ColNum := 1;
end
else
Inc(ColNum);
Inc(CodeIndex);
CurrentChar := Code[CodeIndex];
{$IFDEF DEBUG}
currentCode := PChar(@Code[CodeIndex]);
{$ENDIF}
end;
procedure TSourceContext.IncSrc(Step: Integer);
var
I: Integer;
begin
for I := 0 to Step - 1 do
IncSrc();
end;
function TSourceContext.PeekStr: string;
begin
Result := PeekStr(CodeIndex);
end;
procedure TSourceContext.setCode(const ACode: string);
begin
CurrentChar := #0;
Code := ACode;
LineNum := 1;
ColNum := 1;
CodeIndex := Low(Code);
if Length(ACode) > 0 then
begin
CurrentChar := Code[CodeIndex];
{$IFDEF DEBUG}
currentCode := PChar(@Code[CodeIndex]);
{$ENDIF}
end;
end;
function TSourceContext.PeekStr(Index: Integer): string;
var
oldIndex: Integer;
begin
Result := '';
oldIndex := Index;
while not CharInSet(Code[Index], (WhiteSpace + ['/', '>'])) do
Inc(Index);
Result := subStr(oldIndex, Index - oldIndex);
end;
procedure TSourceContext.SkipBlank();
begin
while CharInSet(CurrentChar, WhiteSpace) do
IncSrc();
end;
initialization
Init();
finalization
UnInit();
end.
FListNode := parserHtml(sResponseHtml);
tmpIHtmlElementList := FListNode.SimpleCSSSelector('.item-main');
for k := 0 to tmpIHtmlElementList.Count - 1 do
begin
try
FDivItemsRoot := tmpIHtmlElementList.Items[k];
//Year
tmpIHtmlElementList2 := FDivItemsRoot.SimpleCSSSelector('.s-gold-supplier-year-icon');
if tmpIHtmlElementList2.Count > 0 then
begin
tmpShopInfo.years := tmpIHtmlElementList2.Items[0].InnerText;
end;
...
...
...
end;