htmlparser网页解析实例

 今天又看了一下htmlparser对网页的解析,发现越看越不会。例如,有以下两个网页片段:

<font color="#808080">2005-2-25 11:06:27</font>和

<h2 class="ContentAuthor">作者:蜘蛛日期:2007-09-20</h2>

看上去格式相同啊!为什么总是不能用同一种方法把它们解析呢?

看一下代码吧:

  1. import java.io.BufferedWriter;
  2. import java.io.File;
  3. import java.io.FileWriter;
  4. import java.io.IOException;
  5. import org.htmlparser.Node;
  6. import org.htmlparser.NodeFilter;
  7. import org.htmlparser.Parser;
  8. import org.htmlparser.filters.AndFilter;
  9. import org.htmlparser.filters.HasAttributeFilter;
  10. import org.htmlparser.filters.HasChildFilter;
  11. import org.htmlparser.filters.OrFilter;
  12. import org.htmlparser.filters.TagNameFilter;
  13. import org.htmlparser.tags.HeadingTag;
  14. import org.htmlparser.util.NodeList;
  15. import org.htmlparser.visitors.TextExtractingVisitor;
  16. import com.extractor.Extractor;
  17. public class ExtractorHang extends Extractor{
  18.     public void extract()
  19.     {
  20.         BufferedWriter bw=null;
  21.         String indextime="";
  22.         String title="";
  23.         StringBuffer body=new StringBuffer();
  24.         NodeFilter time_filter=new OrFilter(new AndFilter(new TagNameFilter("h2"),new HasAttributeFilter("class","ContentAuthor")),
  25.                 new AndFilter(new TagNameFilter("font"),new HasAttributeFilter("color","#808080")));
  26.         
  27.         NodeFilter title_filter1=new OrFilter(new AndFilter(new TagNameFilter("td"),new HasChildFilter(new TagNameFilter("b"))),
  28.                 new AndFilter(new TagNameFilter("h1"),new HasChildFilter(new TagNameFilter("strong"))));//new AndFilter(new HasAttributeFilter("color","#000080")
  29.         
  30.         NodeFilter  body_filter=new OrFilter(new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("id","logPanel")),
  31.                 new AndFilter(new TagNameFilter("td"),new HasChildFilter(new TagNameFilter("p"))));
  32.         
  33.         try
  34.         {
  35.             NodeList title_nodes=this.getParser().parse(title_filter1);
  36.             //遍历所有节点
  37.             
  38.                /*
  39.                 * 取得符合过滤条件的第一个结果
  40.                 */
  41.                 Node node=title_nodes.elementAt(0);
  42.                 /*
  43.                  * 把符合条件的tag之间的所有孩子节点进行列表。
  44.                  */
  45.                 NodeList node2=node.getChildren(); 
  46.                 
  47.                 //title=node2.elementAt(0).toHtml();
  48.                 /*
  49.                  * 博客解析,取值
  50.                  */
  51.                 title=node2.elementAt(1).toHtml();   
  52.                // title=node2.elementAt(2).toHtml(); 
  53.                 /*
  54.                  * 主页解析
  55.                  */
  56.                 if(title.indexOf("<")!=-1)
  57.                 title=node2.elementAt(3).toHtml();   
  58.             /*
  59.              * 写入文件。
  60.              */
  61.             bw=new BufferedWriter(new FileWriter(new File(this.getOutputPath()+title+".txt")));
  62.             
  63.             int end1=getInputFilePath().lastIndexOf("i");
  64.             int end2=getInputFilePath().lastIndexOf(".");
  65.             String url_seg1=getInputFilePath().substring(3,end1);
  66.             String url_seg2=getInputFilePath().substring(end1, end2);
  67.             String url_seg=url_seg1+".asp?"+url_seg2;
  68.             url_seg=url_seg.replaceAll("","/");
  69.             String url="http://"+url_seg;
  70.             
  71.             bw.write(url+NEWLINE);
  72.             bw.write(title+NEWLINE);
  73.             
  74.         
  75.         }
  76.         catch(Exception e)
  77.         {
  78.             e.printStackTrace();
  79.         }
  80.         
  81.         this.getParser().reset();
  82.         try
  83.         {
  84.             NodeList time_nodes=this.getParser().parse(time_filter);
  85.             for(int i=0;i<time_nodes.size();i++)
  86.             {
  87.                /*
  88.                 * 不太好,有待改进。
  89.                 */
  90.                 Node time_node=time_nodes.elementAt(i);
  91.                 indextime=time_node.getNextSibling().toHtml();
  92.                 
  93.                 /*
  94.                 Node node=time_nodes.elementAt(i); 
  95.                 NodeList node1=node.getChildren(); 
  96.                 indextime=node1.elementAt(0).toHtml();
  97.                 */
  98.                 /*
  99.                  * 用Node类,其对象不能用getChildren。
  100.                  * 而用tag类,则可以。
  101.                  */
  102.                 if(indextime.indexOf("日期")!=-1)
  103.                 {
  104.                     indextime=indextime.substring(indextime.lastIndexOf(":")+1);
  105.                 }
  106.             }
  107.             
  108.             bw.write(indextime+NEWLINE);
  109.         }
  110.         catch(Exception e)
  111.         {
  112.             e.printStackTrace();
  113.         }
  114.         
  115.         this.getParser().reset();
  116.         try
  117.         {
  118.             NodeList body_nodes=this.getParser().parse(body_filter);
  119.             for(int i=0;i<body_nodes.size();i++)
  120.             {
  121.                 Node node=body_nodes.elementAt(i);
  122.                 
  123.                 Parser body_parser=new Parser(node.toHtml());
  124.                 TextExtractingVisitor visitor=new TextExtractingVisitor();
  125.                 body_parser.visitAllNodesWith(visitor);
  126.                 body.append(visitor.getExtractedText());
  127.             }
  128.             String text=body.toString();
  129.             String newtext="";
  130.             /*while(text.indexOf("<")!=-1)
  131.             {
  132.                 text=text.substring(0, body.indexOf("<"));
  133.                 text+=text.substring(75);
  134.             }
  135.             */
  136.             /*
  137.              * 循环,有点耗时了。
  138.              * 主要是为了除去文本中
  139.              * <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" />
  140.              * 字段。顺便把一些没必要的空格去掉。
  141.              */
  142.             for(int i=0;i<text.length();i++)
  143.             {
  144.                 char ch=text.charAt(i);
  145.                 if(ch=='<')
  146.                 {
  147.                     i++;
  148.                     while(text.charAt(i)!='>')
  149.                         i++;
  150.                 }
  151.                 
  152.                 if(ch!='<'&&ch!=' ')
  153.                 newtext+=ch;
  154.             }
  155.             
  156.             bw.write(newtext+NEWLINE);
  157.         
  158.         }
  159.         catch(Exception e)
  160.         {
  161.             e.printStackTrace();
  162.         }
  163.         
  164.         try
  165.         {
  166.             if(bw!=null)
  167.                 bw.close();
  168.         }catch(IOException e)
  169.         {
  170.             e.printStackTrace();
  171.         }
  172.     }
  173. }

上述代码的time_filter部分还有待研究……

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值