清除无用的html标签,将HTML转成XHTML并清除一些无用的标签和属性_html/css_WEB-ITnose...

介绍

这是一个能帮你从HTML生成有效XHTML的经典库。它还提供对标签以及属性过滤的支持。你可以指定允许哪些标签和属性出现在输出中,而其他的标签和属性会被过滤掉。你也可以使用这个库清理Microsoft Word文档转化成HTML时生成的臃肿的HTML。你还可以在将HTML发布到博客网站前用它清理一下,否则像WordPress、b2evolution等博客引擎会拒绝这些HTML。

它是如何工作的

里面有两个类:HtmlReader和HtmlWriter

HtmlReader拓展了著名的由Chris Lovett开发的SgmlReader。当它读取HTML时,它跳过所有带前缀的节点。这样一来,像 <o:p>、<o:Document>、<st1:personname> 等上百种无用标签都被滤除了,你读取到的HTML就只剩下核心的HTML标签了。

HtmlWriter拓展了常规的XmlWriter,XmlWriter生成XML。XHTML本质上是XML格式的HTML。所有你熟悉的标签——比如 <img>、<br> 和 <hr>,都是没有闭合标签的——在XHTML中必须写成空元素形式,像 <img .. />、<br/> 和 <hr/>。由于XHTML是常见的XML格式,你可以方便地使用XML解析器读取XHTML文档,这也使得应用XPath搜索成为可能。

HtmlReader

HtmlReader很简单,下面是完整的类:

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

/// <summary>
/// An SgmlReader that hides every element whose name carries a prefix
/// (for example "o:p"). This trick does the job of cleaning up
/// MS Word/Outlook HTML markup, and it leaves only prefix-less nodes so
/// the resulting XML requires no namespace declarations.
/// </summary>
public class HtmlReader : Sgml.SgmlReader
{
    /// <summary>
    /// Creates a reader that parses HTML from the given text reader.
    /// </summary>
    /// <param name="reader">Source of the HTML markup.</param>
    public HtmlReader(TextReader reader) : base()
    {
        base.InputStream = reader;
        base.DocType = "HTML";
    }

    /// <summary>
    /// Creates a reader that parses HTML held in a string.
    /// </summary>
    /// <param name="content">The HTML markup to parse.</param>
    public HtmlReader(string content) : base()
    {
        base.InputStream = new StringReader(content);
        base.DocType = "HTML";
    }

    /// <summary>
    /// Advances to the next node, skipping prefixed elements entirely.
    /// </summary>
    /// <returns>
    /// true if the reader is positioned on a node; false at end of input.
    /// </returns>
    public override bool Read()
    {
        bool status = base.Read();

        // Got a node with prefix. This must be one of those "<o:p>" or
        // something else. Skip this node entirely.
        //
        // Skip() leaves the reader positioned on the node that follows the
        // skipped element, so that node has to be checked as well: without
        // the loop, the second of two adjacent prefixed elements would be
        // handed to the caller.
        while (status
            && base.NodeType == XmlNodeType.Element
            && base.Name.IndexOf(':') > 0)
        {
            base.Skip();

            // Skipping may have consumed the rest of the document.
            status = !base.EOF;
        }

        return status;
    }
}

HtmlWriter

这个类是有点麻烦。下面是使用技巧:

重写WriteString方法并避免使用常规的XML编码。对HTML文件手动更改编码。

重写WriteStartElement以避免不被允许的标签写到输出中。

重写WriteAttributes以避免写出不需要的属性。

让我们分部分来看下整个类:

可配置性

你可以通过修改下面的部分配置HtmlWriter:

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

/// <summary>
/// Configuration section of the XHTML writer. The fields below control
/// which tags and attributes reach the output and how text is compacted.
/// </summary>
public class HtmlWriter : XmlTextWriter
{
    /// <summary>
    /// If set to true, it will filter the output by using tag and
    /// attribute filtering, space reduce etc.
    /// </summary>
    public bool FilterOutput = false;

    /// <summary>
    /// If true, it will reduce consecutive non-breaking-space entities
    /// to one instance.
    /// </summary>
    public bool ReduceConsecutiveSpace = true;

    /// <summary>
    /// Set the tag names in lower case which are allowed to go to output.
    /// </summary>
    public string[] AllowedTags = new string[]
    {
        "p", "b", "i", "u", "em", "big", "small",
        "div", "img", "span", "blockquote", "code", "pre", "br", "hr",
        "ul", "ol", "li", "del", "ins", "strong", "a", "font", "dd", "dt"
    };

    /// <summary>
    /// If any tag found which is not allowed, it is replaced by this tag.
    /// Specify a tag which has least impact on output.
    /// </summary>
    public string ReplacementTag = "dd";

    /// <summary>
    /// New lines \r\n are replaced with space, which saves space and
    /// makes the output compact.
    /// </summary>
    public bool RemoveNewlines = true;

    /// <summary>
    /// Specify which attributes are allowed. Any other attribute will be
    /// discarded.
    /// </summary>
    public string[] AllowedAttributes = new string[]
    {
        "class", "href", "target", "border", "src",
        "align", "width", "height", "color", "size"
    };
}

WriteString方法

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

/// <summary>
/// Writes text without the writer's standard XML encoding. The reason for
/// overriding this method is that we do not want the output to be encoded
/// for text inside attributes and inside elements: the standard encoding
/// would escape the ampersand of a non-breaking-space entity, whereas HTML
/// needs the entity exactly as it is. The text is therefore encoded by
/// hand here and emitted through WriteRaw.
/// </summary>
/// <param name="text">The text to write; null writes nothing.</param>
public override void WriteString(string text)
{
    if (text == null)
    {
        return;
    }

    // Emit every non-breaking space character as its HTML entity.
    text = text.Replace("\u00A0", "&nbsp;");

    // When you are reading an RSS feed and writing HTML,
    // these lines remove the CDATA markers.
    text = text.Replace("<![CDATA[", "");
    text = text.Replace("]]>", "");

    // Do some encoding of our own because we are going to use WriteRaw,
    // which won't do any of the necessary encoding.
    text = text.Replace("<", "&lt;");
    text = text.Replace(">", "&gt;");
    text = text.Replace("'", "&apos;");
    text = text.Replace("\"", "&quot;");

    if (this.FilterOutput)
    {
        text = text.Trim();

        // We want to replace consecutive spaces with one space
        // in order to save horizontal width.
        if (this.ReduceConsecutiveSpace)
        {
            text = text.Replace("&nbsp;&nbsp;", "&nbsp;");
        }

        if (this.RemoveNewlines)
        {
            text = text.Replace(Environment.NewLine, " ");
        }
    }

    base.WriteRaw(text);
}

WriteStartElement: 应用标签过滤

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

/// <summary>
/// Writes a start tag, applying tag filtering when FilterOutput is on:
/// a tag whose lower-cased local name is not listed in AllowedTags is
/// written as ReplacementTag instead.
/// </summary>
/// <param name="prefix">Namespace prefix of the element.</param>
/// <param name="localName">Local name of the element.</param>
/// <param name="ns">Namespace URI of the element.</param>
public override void WriteStartElement(string prefix,
    string localName, string ns)
{
    if (this.FilterOutput)
    {
        // Tag names are identifiers, not linguistic text: lower-case them
        // culture-invariantly so that e.g. "IMG" still matches "img" under
        // a Turkish current culture.
        string tagLocalName = localName.ToLowerInvariant();

        bool canWrite = System.Array.IndexOf(this.AllowedTags, tagLocalName) >= 0;

        if (!canWrite)
        {
            // Honour the configurable replacement rather than a literal.
            localName = this.ReplacementTag;
        }
    }

    base.WriteStartElement(prefix, localName, ns);
}

WriteAttributes方法: 应用属性过滤

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

// Attribute filtering for the attribute the reader is currently on:
// the attribute is copied to the output only when its lower-cased local
// name appears in AllowedAttributes.
bool canWrite = false;

// Attribute names are identifiers, so lower-case them culture-invariantly
// (a Turkish current culture would otherwise turn "WIDTH" into "wıdth").
string attributeLocalName = reader.LocalName.ToLowerInvariant();

foreach (string name in this.AllowedAttributes)
{
    if (name == attributeLocalName)
    {
        canWrite = true;
        break;
    }
}

// If allowed, write the attribute
if (canWrite)
{
    this.WriteStartAttribute(reader.Prefix,
        attributeLocalName, reader.NamespaceURI);
}

// The value nodes are read even for a rejected attribute, so the reader
// advances the same way in both cases; only the writes are conditional.
while (reader.ReadAttributeValue())
{
    if (reader.NodeType == XmlNodeType.EntityReference)
    {
        if (canWrite)
        {
            this.WriteEntityRef(reader.Name);
        }

        continue;
    }

    if (canWrite)
    {
        this.WriteString(reader.Value);
    }
}

if (canWrite)
{
    this.WriteEndAttribute();
}

结论

示例应用是一个你可以立即用来清理HTML文件的实用工具。你可以将这个类应用在像博客等需要发布一些HTML到Web服务的工具中。

声明:本文原创发布php中文网,转载请注明出处,感谢您的尊重!如有疑问,请联系admin@php.cn处理

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值