1.import java.net.URL;
2.
3.import junit.framework.TestCase;
4.
5.import org.apache.log4j.Logger;
6.import org.htmlparser.Node;
7.import org.htmlparser.NodeFilter;
8.import org.htmlparser.Parser;
9.import org.htmlparser.Tag;
10.import org.htmlparser.beans.LinkBean;
11.import org.htmlparser.filters.NodeClassFilter;
12.import org.htmlparser.filters.OrFilter;
13.import org.htmlparser.filters.TagNameFilter;
14.import org.htmlparser.tags.HeadTag;
15.import org.htmlparser.tags.ImageTag;
16.import org.htmlparser.tags.InputTag;
17.import org.htmlparser.tags.LinkTag;
18.import org.htmlparser.tags.OptionTag;
19.import org.htmlparser.tags.SelectTag;
20.import org.htmlparser.tags.TableColumn;
21.import org.htmlparser.tags.TableRow;
22.import org.htmlparser.tags.TableTag;
23.import org.htmlparser.tags.TitleTag;
24.import org.htmlparser.util.NodeIterator;
25.import org.htmlparser.util.NodeList;
26.import org.htmlparser.util.ParserException;
27.import org.htmlparser.visitors.HtmlPage;
28.import org.htmlparser.visitors.NodeVisitor;
29.import org.htmlparser.visitors.ObjectFindingVisitor;
30.
31.public class T extends TestCase {
32.
33. private static final Logger logger = Logger.getLogger(T.class);
34.
35. public T(String name) {
36. super(name);
37. }
38.
39.
42. public void testImageVisitor() {
43. try {
44. ImageTag imgLink;
45. ObjectFindingVisitor visitor = new ObjectFindingVisitor(ImageTag.class);
46. Parser parser = new Parser();
47. parser.setURL("http://www.google.com");
48. parser.setEncoding(parser.getEncoding());
49. parser.visitAllNodesWith(visitor);
50. Node[] nodes = visitor.getTags();
51. for (int i = 0; i < nodes.length; i++) {
52. imgLink = (ImageTag) nodes[i];
53. logger.fatal("testImageVisitor() ImageURL = " + imgLink.getImageURL());
54. logger.fatal("testImageVisitor() ImageLocation = " + imgLink.extractImageLocn());
55. logger.fatal("testImageVisitor() SRC = " + imgLink.getAttribute("SRC"));
56. }
57. } catch (Exception e) {
58. e.printStackTrace();
59. }
60. }
61.
62.
65. public void testNodeFilter() {
66. try {
67. NodeFilter filter = new TagNameFilter("IMG");
68. Parser parser = new Parser();
69. parser.setURL("http://www.google.com");
70. parser.setEncoding(parser.getEncoding());
71. NodeList list = parser.extractAllNodesThatMatch(filter);
72. for (int i = 0; i < list.size(); i++) {
73. logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml());
74. }
75. } catch (Exception e) {
76. e.printStackTrace();
77. }
78.
79. }
80.
81.
84. public void testLinkTag() {
85. try {
86.
87. NodeFilter filter = new NodeClassFilter(LinkTag.class);
88. Parser parser = new Parser();
89. parser.setURL("http://www.google.com");
90. parser.setEncoding(parser.getEncoding());
91. NodeList list = parser.extractAllNodesThatMatch(filter);
92. for (int i = 0; i < list.size(); i++) {
93. LinkTag node = (LinkTag) list.elementAt(i);
94. logger.fatal("testLinkTag() Link is :" + node.extractLink());
95. }
96. } catch (Exception e) {
97. e.printStackTrace();
98. }
99.
100. }
101.
102.
105. public void testLinkCSS() {
106. try {
107.
108. Parser parser = new Parser();
109. parser.setInputHTML("<head><title>Link Test</title>"
110. + "<link href="’/test01/css.css" mce_href="’/test01/css.css"' text='text/css' rel='stylesheet' />"
111. + "<link href="/test02/css.css" mce_href="test02/css.css" text='text/css' rel='stylesheet' />" + "</head>"
112. + "<body>");
113. parser.setEncoding(parser.getEncoding());
114.
115. for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
116. Node node = e.nextNode();
117. logger.fatal("testLinkCSS()" + node.getText() + node.getClass());
118.
119. }
120. } catch (Exception e) {
121. e.printStackTrace();
122. }
123. }
124.
125.
128. public void testOrFilter() {
129. NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
130. NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);
131.
132. NodeList nodeList = null;
133.
134. try {
135. Parser parser = new Parser();
136. parser
137. .setInputHTML("<head><title>OrFilter Test</title>"
138. + "<link href="/test01/css.css" mce_href="test01/css.css" text='text/css' rel='stylesheet' />"
139. + "<link href="/test02/css.css" mce_href="test02/css.css" text='text/css' rel='stylesheet' />"
140. + "</head>"
141. + "<body>"
142. + "<input type='text' value='text1′ name='text1′/>"
143. + "<input type='text' value='text2′ name='text2′/>"
144. + "<select><option id='1′>1</option><option id='2′>2</option><option id='3′></option></select>"
145. + "<a href="http://www.yeeach.com" mce_href="yeeach.comhttp://www.yeeach.com">yeeach.com</a>" + "</body>");
146.
147. parser.setEncoding(parser.getEncoding());
148. OrFilter lastFilter = new OrFilter();
149. lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter });
150. nodeList = parser.parse(lastFilter);
151. for (int i = 0; i <= nodeList.size(); i++) {
152. if (nodeList.elementAt(i) instanceof InputTag) {
153. InputTag tag = (InputTag) nodeList.elementAt(i);
154. logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tag value is:"
155. + tag.getAttribute("value"));
156. }
157. if (nodeList.elementAt(i) instanceof SelectTag) {
158. SelectTag tag = (SelectTag) nodeList.elementAt(i);
159. NodeList list = tag.getChildren();
160.
161. for (int j = 0; j < list.size(); j++) {
162. OptionTag option = (OptionTag) list.elementAt(j);
163. logger.fatal("OrFilter Option" + option.getOptionText());
164. }
165.
166. }
167. }
168.
169. } catch (ParserException e) {
170. e.printStackTrace();
171. }
172. }
173.
174.
177. public void testTable() {
178. Parser myParser;
179. NodeList nodeList = null;
180. myParser = Parser.createParser("<body> " + "<table id='table1′ >"
181. + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
182. + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
183. + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >"
184. + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
185. + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
186. + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>", "GBK");
187. NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
188. OrFilter lastFilter = new OrFilter();
189. lastFilter.setPredicates(new NodeFilter[] { tableFilter });
190. try {
191. nodeList = myParser.parse(lastFilter);
192. for (int i = 0; i <= nodeList.size(); i++) {
193. if (nodeList.elementAt(i) instanceof TableTag) {
194. TableTag tag = (TableTag) nodeList.elementAt(i);
195. TableRow[] rows = tag.getRows();
196.
197. for (int j = 0; j < rows.length; j++) {
198. TableRow tr = (TableRow) rows[j];
199. TableColumn[] td = tr.getColumns();
200. for (int k = 0; k < td.length; k++) {
201. logger.fatal("<td>" + td[k].toPlainTextString());
202. }
203.
204. }
205.
206. }
207. }
208.
209. } catch (ParserException e) {
210. e.printStackTrace();
211. }
212. }
213.
214.
217. public void testVisitorAll() {
218. try {
219. Parser parser = new Parser();
220. parser.setURL("http://www.google.com");
221. parser.setEncoding(parser.getEncoding());
222. NodeVisitor visitor = new NodeVisitor() {
223. public void visitTag(Tag tag) {
224. logger.fatal("testVisitorAll() Tag name is :" + tag.getTagName() + " \n Class is :"
225. + tag.getClass());
226. }
227.
228. };
229.
230. parser.visitAllNodesWith(visitor);
231. } catch (ParserException e) {
232. e.printStackTrace();
233. }
234. }
235.
236.
239. public void testTagVisitor() {
240. try {
241.
242. Parser parser = new Parser("<head><title>dddd</title>"
243. + "<link href="/test01/css.css" mce_href="test01/css.css" text='text/css' rel='stylesheet' />"
244. + "<link href="/test02/css.css" mce_href="test02/css.css" text='text/css' rel='stylesheet' />" + "</head>"
245. + "<body>" + "<a href="http://www.yeeach.com" mce_href="yeeach.comhttp://www.yeeach.com">yeeach.com</a>" + "</body>");
246. NodeVisitor visitor = new NodeVisitor() {
247. public void visitTag(Tag tag) {
248. if (tag instanceof HeadTag) {
249. logger.fatal("visitTag() HeadTag : Tag name is :" + tag.getTagName()
250. + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText());
251. } else if (tag instanceof TitleTag) {
252. logger.fatal("visitTag() TitleTag : Tag name is :" + tag.getTagName()
253. + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText());
254.
255. } else if (tag instanceof LinkTag) {
256. logger.fatal("visitTag() LinkTag : Tag name is :" + tag.getTagName()
257. + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText()
258. + " \n getAttribute is :" + tag.getAttribute("href"));
259. } else {
260. logger.fatal("visitTag() : Tag name is :" + tag.getTagName() + " \n Class is :"
261. + tag.getClass() + "\n Text is :" + tag.getText());
262. }
263.
264. }
265.
266. };
267.
268. parser.visitAllNodesWith(visitor);
269. } catch (Exception e) {
270. e.printStackTrace();
271. }
272. }
273.
274.
277. public void testHtmlPage() {
278. String inputHTML = "<html>" + "<head>"
279. + "<title>Welcome to the HTMLParser website</title>" + "</head>" + "<body>"
280. + "Welcome to HTMLParser" + "<table id='table1′ >"
281. + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
282. + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
283. + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >"
284. + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
285. + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
286. + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>" + "</html>";
287. Parser parser = new Parser();
288. try {
289. parser.setInputHTML(inputHTML);
290. parser.setEncoding(parser.getURL());
291. HtmlPage page = new HtmlPage(parser);
292. parser.visitAllNodesWith(page);
293. logger.fatal("testHtmlPage -title is :" + page.getTitle());
294. NodeList list = page.getBody();
295.
296. for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) {
297. Node node = iterator.nextNode();
298. logger.fatal("testHtmlPage -node is :" + node.toHtml());
299. }
300.
301. } catch (ParserException e) {
302. // TODO Auto-generated catch block
303. e.printStackTrace();
304. }
305. }
306.
307.
310. public void testLinkBean() {
311. Parser parser = new Parser();
312.
313. LinkBean linkBean = new LinkBean();
314. linkBean.setURL("http://www.google.com");
315. URL[] urls = linkBean.getLinks();
316.
317. for (int i = 0; i < urls.length; i++) {
318. URL url = urls[i];
319. logger.fatal("testLinkBean() -url is :" + url);
320. }
321.
322. }
323.
324.}
// Source: CSDN blog. Please credit the original when reposting:
// http://blog.csdn.net/tianhewulei/archive/2009/10/14/4670460.aspx