htmlparser实例

import java.net.URL;

import junit.framework.TestCase;

import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
import org.htmlparser.visitors.ObjectFindingVisitor;
30.    
31.public class T extends TestCase {     
32.    
33.  private static final Logger logger = Logger.getLogger(T.class);     
34.    
35.  public T(String name) {     
36.    super(name);     
37.     
38.    
39.      
42.  public void testImageVisitor() {     
43.    try {     
44.      ImageTag imgLink;     
45.      ObjectFindingVisitor visitor = new ObjectFindingVisitor(ImageTag.class);     
46.      Parser parser = new Parser();     
47.      parser.setURL("http://www.google.com");     
48.      parser.setEncoding(parser.getEncoding());     
49.      parser.visitAllNodesWith(visitor);     
50.      Node[] nodes = visitor.getTags();     
51.      for (int i = 0; i < nodes.length; i++) {     
52.        imgLink = (ImageTag) nodes[i];     
53.        logger.fatal("testImageVisitor() ImageURL = " + imgLink.getImageURL());     
54.        logger.fatal("testImageVisitor() ImageLocation = " + imgLink.extractImageLocn());     
55.        logger.fatal("testImageVisitor() SRC = " + imgLink.getAttribute("SRC"));     
56.         
57.    } catch (Exception e) {     
58.      e.printStackTrace();     
59.       
60.     
61.    
62.      
65.  public void testNodeFilter() {     
66.    try {     
67.      NodeFilter filter = new TagNameFilter("IMG");     
68.      Parser parser = new Parser();     
69.      parser.setURL("http://www.google.com");     
70.      parser.setEncoding(parser.getEncoding());     
71.      NodeList list = parser.extractAllNodesThatMatch(filter);     
72.      for (int i = 0; i < list.size(); i++) {     
73.        logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml());     
74.         
75.    } catch (Exception e) {     
76.      e.printStackTrace();     
77.       
78.    
79.     
80.    
81.      
84.  public void testLinkTag() {     
85.    try {     
86.    
87.      NodeFilter filter = new NodeClassFilter(LinkTag.class);     
88.      Parser parser = new Parser();     
89.      parser.setURL("http://www.google.com");     
90.      parser.setEncoding(parser.getEncoding());     
91.      NodeList list = parser.extractAllNodesThatMatch(filter);     
92.      for (int i = 0; i < list.size(); i++) {     
93.        LinkTag node = (LinkTag) list.elementAt(i);     
94.        logger.fatal("testLinkTag() Link is :" + node.extractLink());     
95.         
96.    } catch (Exception e) {     
97.      e.printStackTrace();     
98.       
99.    
100.     
101.    
102.      
105.  public void testLinkCSS() {     
106.    try {     
107.    
108.      Parser parser = new Parser();     
109.      parser.setInputHTML("<head><title>Link Test</title>"    
110.          + "<link href="’/test01/css.css" mce_href="’/test01/css.css"' text='text/css' rel='stylesheet' />"    
111.          + "<link href="/test02/css.css" mce_href="test02/css.css" text='text/css' rel='stylesheet' />" + "</head>"    
112.          + "<body>");     
113.      parser.setEncoding(parser.getEncoding());     
114.    
115.      for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {     
116.        Node node = e.nextNode();     
117.        logger.fatal("testLinkCSS()" + node.getText() + node.getClass());     
118.    
119.         
120.    } catch (Exception e) {     
121.      e.printStackTrace();     
122.       
123.     
124.    
125.      
128.  public void testOrFilter() {     
129.    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);     
130.    NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);     
131.    
132.    NodeList nodeList = null;     
133.    
134.    try {     
135.      Parser parser = new Parser();     
136.      parser     
137.          .setInputHTML("<head><title>OrFilter Test</title>"    
138.              + "<link href="/test01/css.css" mce_href="test01/css.css" text='text/css' rel='stylesheet' />"    
139.              + "<link href="/test02/css.css" mce_href="test02/css.css" text='text/css' rel='stylesheet' />"    
140.              + "</head>"    
141.              + "<body>"    
142.              + "<input type='text' value='text1&prime; name='text1&prime;/>"    
143.              + "<input type='text' value='text2&prime; name='text2&prime;/>"    
144.              + "<select><option id='1&prime;>1</option><option id='2&prime;>2</option><option id='3&prime;></option></select>"    
145.              + "<a href="http://www.yeeach.com" mce_href="yeeach.comhttp://www.yeeach.com">yeeach.com</a>" + "</body>");     
146.    
147.      parser.setEncoding(parser.getEncoding());     
148.      OrFilter lastFilter = new OrFilter();     
149.      lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter });     
150.      nodeList = parser.parse(lastFilter);     
151.      for (int i = 0; i <= nodeList.size(); i++) {     
152.        if (nodeList.elementAt(i) instanceof InputTag) {     
153.          InputTag tag = (InputTag) nodeList.elementAt(i);     
154.          logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tag value is:"    
155.              + tag.getAttribute("value"));     
156.           
157.        if (nodeList.elementAt(i) instanceof SelectTag) {     
158.          SelectTag tag = (SelectTag) nodeList.elementAt(i);     
159.          NodeList list = tag.getChildren();     
160.    
161.          for (int j = 0; j < list.size(); j++) {     
162.            OptionTag option = (OptionTag) list.elementAt(j);     
163.            logger.fatal("OrFilter Option" + option.getOptionText());     
164.             
165.    
166.           
167.         
168.    
169.    } catch (ParserException e) {     
170.      e.printStackTrace();     
171.       
172.     
173.    
174.      
177.  public void testTable() {     
178.    Parser myParser;     
179.    NodeList nodeList = null;     
180.    myParser = Parser.createParser("<body> " + "<table id='table1&prime; >"    
181.        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"    
182.        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"    
183.        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2&prime; >"    
184.        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"    
185.        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"    
186.        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>", "GBK");     
187.    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);     
188.    OrFilter lastFilter = new OrFilter();     
189.    lastFilter.setPredicates(new NodeFilter[] { tableFilter });     
190.    try {     
191.      nodeList = myParser.parse(lastFilter);     
192.      for (int i = 0; i <= nodeList.size(); i++) {     
193.        if (nodeList.elementAt(i) instanceof TableTag) {     
194.          TableTag tag = (TableTag) nodeList.elementAt(i);     
195.          TableRow[] rows = tag.getRows();     
196.    
197.          for (int j = 0; j < rows.length; j++) {     
198.            TableRow tr = (TableRow) rows[j];     
199.            TableColumn[] td = tr.getColumns();     
200.            for (int k = 0; k < td.length; k++) {     
201.              logger.fatal("<td>" + td[k].toPlainTextString());     
202.               
203.    
204.             
205.    
206.           
207.         
208.    
209.    } catch (ParserException e) {     
210.      e.printStackTrace();     
211.       
212.     
213.    
214.      
217.  public void testVisitorAll() {     
218.    try {     
219.      Parser parser = new Parser();     
220.      parser.setURL("http://www.google.com");     
221.      parser.setEncoding(parser.getEncoding());     
222.      NodeVisitor visitor = new NodeVisitor() {     
223.        public void visitTag(Tag tag) {     
224.          logger.fatal("testVisitorAll()  Tag name is :" + tag.getTagName() + " \n Class is :"    
225.              + tag.getClass());     
226.           
227.    
228.      };     
229.    
230.      parser.visitAllNodesWith(visitor);     
231.    } catch (ParserException e) {     
232.      e.printStackTrace();     
233.       
234.     
235.    
236.      
239.  public void testTagVisitor() {     
240.    try {     
241.    
242.      Parser parser = new Parser("<head><title>dddd</title>"    
243.          + "<link href="/test01/css.css" mce_href="test01/css.css" text='text/css' rel='stylesheet' />"    
244.          + "<link href="/test02/css.css" mce_href="test02/css.css" text='text/css' rel='stylesheet' />" + "</head>"    
245.          + "<body>" + "<a href="http://www.yeeach.com" mce_href="yeeach.comhttp://www.yeeach.com">yeeach.com</a>" + "</body>");     
246.      NodeVisitor visitor = new NodeVisitor() {     
247.        public void visitTag(Tag tag) {     
248.          if (tag instanceof HeadTag) {     
249.            logger.fatal("visitTag() HeadTag : Tag name is :" + tag.getTagName()     
250.                + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText());     
251.          } else if (tag instanceof TitleTag) {     
252.            logger.fatal("visitTag() TitleTag : Tag name is :" + tag.getTagName()     
253.                + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText());     
254.    
255.          } else if (tag instanceof LinkTag) {     
256.            logger.fatal("visitTag() LinkTag : Tag name is :" + tag.getTagName()     
257.                + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText()     
258.                + " \n getAttribute is :" + tag.getAttribute("href"));     
259.          } else {     
260.            logger.fatal("visitTag() : Tag name is :" + tag.getTagName() + " \n Class is :"    
261.                + tag.getClass() + "\n Text is :" + tag.getText());     
262.             
263.    
264.           
265.    
266.      };     
267.    
268.      parser.visitAllNodesWith(visitor);     
269.    } catch (Exception e) {     
270.      e.printStackTrace();     
271.       
272.     
273.    
274.      
277.  public void testHtmlPage() {     
278.    String inputHTML = "<html>" + "<head>"    
279.        + "<title>Welcome to the HTMLParser website</title>" + "</head>" + "<body>"    
280.        + "Welcome to HTMLParser" + "<table id='table1&prime; >"    
281.        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"    
282.        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"    
283.        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2&prime; >"    
284.        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"    
285.        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"    
286.        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>" + "</html>";     
287.    Parser parser = new Parser();     
288.    try {     
289.      parser.setInputHTML(inputHTML);     
290.      parser.setEncoding(parser.getURL());     
291.      HtmlPage page = new HtmlPage(parser);     
292.      parser.visitAllNodesWith(page);     
293.      logger.fatal("testHtmlPage -title is :" + page.getTitle());     
294.      NodeList list = page.getBody();     
295.    
296.      for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) {     
297.        Node node = iterator.nextNode();     
298.        logger.fatal("testHtmlPage -node  is :" + node.toHtml());     
299.         
300.    
301.    } catch (ParserException e) {     
302.      // TODO Auto-generated catch block     
303.      e.printStackTrace();     
304.       
305.     
306.    
307.      
310.  public void testLinkBean() {     
311.    Parser parser = new Parser();     
312.    
313.    LinkBean linkBean = new LinkBean();     
314.    linkBean.setURL("http://www.google.com");     
315.    URL[] urls = linkBean.getLinks();     
316.    
317.    for (int i = 0; i < urls.length; i++) {     
318.      URL url = urls[i];     
319.      logger.fatal("testLinkBean() -url  is :" + url);     
320.       
321.    
322.     
323.    
324.}   


本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/tianhewulei/archive/2009/10/14/4670460.aspx

<script type="text/javascript" id="wumiiRelatedItems"> </script>
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值