php 保留html,如何提取html的正文以及保留某些<>内容?

正文提取就是去除 HTML 代码中尖括号(<>)标签的内容。下面这段 C# 代码还支持有选择地保留某些标签。

1 using System;

2 using System.Text;

3 namespace HtmlStrip

4 {

/// <summary>
/// Console demo for <c>HtmlParser</c>: strips the markup from a sample HTML
/// string while keeping the <c>br</c> tags, and prints the result.
/// </summary>
class MainClass
{
    /// <summary>
    /// Program entry point.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main (string[] args)
    {
        // Illustrative sample input. The markup of the sample in the original
        // listing was lost, so only its visible text ("abc", "efg", "oo") is
        // reproduced here.
        // To process a file instead, load its content into str first, e.g.
        // with System.IO.File.ReadAllText.
        string str = "<div>abc<br/>efg</div>oo";

        HtmlParser t = new HtmlParser (str);
        t.KeepTag (new string[] { "br" }); // do not filter out the br tag
        Console.Write (t.Text ());
    }
}

/// <summary>
/// Extracts the body text of an HTML string by dropping everything inside
/// angle brackets, optionally keeping selected tags verbatim.
/// The text enclosed by <c>script</c> and <c>style</c> elements is dropped,
/// and comments are dropped unless "!--" is listed as a kept tag.
/// </summary>
class HtmlParser
{
    // Tags whose enclosed text is not body content and is therefore dropped.
    private static readonly string[] specialTag = new string[] { "script", "style" };

    private const string CommentOpen = "!--";  // text right after '<' that opens a comment
    private const string CommentClose = "-->"; // text that closes a comment

    private readonly string html;                                  // HTML being analysed
    private readonly StringBuilder result = new StringBuilder ();  // accumulated output
    private int seek;                  // index of the next character to read
    private string[] keepTag;          // names of the tags that are copied to the output
    private bool _inTag;               // true while the cursor is inside <...> after the tag name
    private bool tagOpen;              // true from a '<' until its matching '>' is consumed
    private int tagStart;              // index of the '<' of the tag currently open
    private bool needContent = true;   // whether plain text is currently copied to the output
    private string tagName = "";       // name of the most recently opened tag

    /// <summary>
    /// Whether the cursor is inside a pair of angle brackets.
    /// Setting it to <c>true</c> has a side effect: the tag name is read from the
    /// current position, stopping at the first whitespace after the name, at
    /// '&gt;' (which is left unread and resets the value to <c>false</c>), or at
    /// the end of the input.
    /// </summary>
    public bool inTag {
        get { return _inTag; }
        set {
            _inTag = value;
            if (!value) {
                tagOpen = false;
                return;
            }

            tagOpen = true;
            // Text () has just consumed the '<', so it sits right before the cursor.
            tagStart = Math.Max (0, seek - 1);

            StringBuilder name = new StringBuilder ();
            while (seek < html.Length) {
                char c = html[seek];
                if (c == '>') {
                    // Leave '>' unread so that Text () handles the end of the tag.
                    _inTag = false;
                    break;
                }
                seek++;
                if (!char.IsWhiteSpace (c)) {
                    name.Append (c);
                } else if (name.Length > 0) {
                    break; // whitespace after the name: attributes follow
                }
                // Whitespace before the name is skipped.
            }
            tagName = name.ToString ();
        }
    }

    /// <summary>
    /// Creates a parser for the given HTML with no kept tags.
    /// </summary>
    /// <param name="html">The HTML code to analyse.</param>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
    public HtmlParser (string html)
    {
        if (html == null) {
            throw new ArgumentNullException ("html");
        }
        this.html = html;
        KeepTag (new string[] { });
    }

    /// <summary>
    /// Sets which tags are copied to the output instead of being filtered out.
    /// </summary>
    /// <param name="tags">
    /// Tag names without angle brackets or slashes, compared case-insensitively.
    /// <c>null</c> is treated as an empty list.
    /// </param>
    public void KeepTag (string[] tags)
    {
        keepTag = tags ?? new string[] { };
    }

    /// <summary>
    /// Processes the remaining input and returns the extracted text.
    /// </summary>
    /// <returns>
    /// The text outside of tags, plus the kept tags (opening, closing and
    /// self-closing forms) exactly as they appear in the input.
    /// </returns>
    public string Text ()
    {
        while (seek < html.Length) {
            char c = html[seek++];
            if (c == '<') {
                if (string.CompareOrdinal (html, seek, CommentOpen, 0, CommentOpen.Length) == 0) {
                    // Comments are skipped as a whole, so markup inside them is never parsed.
                    skipComment (seek - 1);
                } else {
                    inTag = true; // reads the tag name
                }
            } else if (c == '>' && tagOpen) {
                tagOpen = false;
                _inTag = false;
                if (iskeepTag (tagName.Replace ("/", ""))) {
                    result.Append (html, tagStart, seek - tagStart);
                } else {
                    // A closing tag such as "/script" is not special, so it
                    // switches text output back on.
                    needContent = !isSpecialTag (tagName);
                }
            } else if (!_inTag && needContent) {
                // Plain text, including a '>' that does not close any tag.
                result.Append (c);
            }
        }
        return result.ToString ();
    }

    /// <summary>
    /// Moves the cursor past the comment whose '&lt;' is at <paramref name="start"/>,
    /// or to the end of the input when the comment is never closed. The comment
    /// is copied to the output only when "!--" is a kept tag.
    /// </summary>
    /// <param name="start">Index of the '&lt;' that opens the comment.</param>
    private void skipComment (int start)
    {
        tagOpen = false;
        _inTag = false;

        // Searching from the first dash also accepts the short forms "<!-->" and "<!--->".
        int close = html.IndexOf (CommentClose, start + 2, StringComparison.Ordinal);
        seek = close < 0 ? html.Length : close + CommentClose.Length;

        if (iskeepTag (CommentOpen)) {
            result.Append (html, start, seek - start);
        }
    }

    /// <summary>
    /// Tells whether the text following the given tag has to be dropped.
    /// </summary>
    /// <param name="tag">Tag name as read from the input.</param>
    /// <returns><c>true</c> when the name is one of the special tags, ignoring case.</returns>
    private static bool isSpecialTag (string tag)
    {
        foreach (string special in specialTag) {
            if (string.Equals (tag, special, StringComparison.OrdinalIgnoreCase)) {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Tells whether the given tag has to be kept in the output.
    /// </summary>
    /// <param name="tag">Tag name with any '/' already removed.</param>
    /// <returns><c>true</c> when the name is one of the kept tags, ignoring case.</returns>
    private bool iskeepTag (string tag)
    {
        foreach (string kept in keepTag) {
            if (string.Equals (tag, kept, StringComparison.OrdinalIgnoreCase)) {
                return true;
            }
        }
        return false;
    }
}

156 }

157

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值