使用 javax.swing.text.html 包解析 HTML 文档:javax / w3c 网页解析(一)

package test;

import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.AttributeSet;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Prints the {@code href} of every anchor ({@code <a>}) found in an HTML page,
 * parsing the page with the Swing HTML parser ({@link HTMLEditorKit}).
 *
 * <p>The page is read either over HTTP(S) or from a local file; see
 * {@link #getReader(String)}.
 */
public class javahtml {

    /** Page that is parsed when no URI is passed on the command line. */
    private static final String DEFAULT_URI = "http://hexun.com/kangojian/default.html";

    /**
     * Program entry point: prints one link target per line to standard output.
     *
     * @param args optional; {@code args[0]} is the URL or local file name to
     *             parse. When absent, {@link #DEFAULT_URI} is used.
     * @throws Exception if the page cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String uri = (args != null && args.length > 0) ? args[0] : DEFAULT_URI;

        Reader rd = getReader(uri);
        try {
            for (String href : extractLinks(rd)) {
                System.out.println(href);
            }
        } finally {
            // Always release the underlying connection / file handle.
            rd.close();
        }
    }

    /**
     * Parses the HTML delivered by {@code html} and collects the {@code href}
     * attribute of every anchor, in document order.
     *
     * <p>Anchors that carry no {@code href} (for example {@code <a name="x">})
     * are skipped. The reader is not closed by this method.
     *
     * @param html source of the HTML text
     * @return the link targets found; empty if there are none, never {@code null}
     * @throws IOException          if reading from {@code html} fails
     * @throws BadLocationException if the parser cannot insert into the document
     */
    static List<String> extractLinks(Reader html) throws IOException, BadLocationException {
        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Reader has already decoded the bytes, so ask the parser to ignore
        // any charset directive embedded in the page.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        // Parse the HTML; the text is stored in the document afterwards.
        kit.read(html, doc, 0);

        List<String> links = new ArrayList<String>();
        HTMLDocument.Iterator it = ((HTMLDocument) doc).getIterator(HTML.Tag.A);
        for (; it.isValid(); it.next()) {
            // Use the AttributeSet interface: the iterator does not promise
            // any particular implementation class.
            AttributeSet attrs = it.getAttributes();
            if (attrs == null) {
                continue;
            }
            Object href = attrs.getAttribute(HTML.Attribute.HREF);
            if (href != null) {
                links.add(href.toString());
            }
        }
        return links;
    }

    /**
     * Returns a reader on the HTML data. If {@code uri} begins with
     * {@code "http:"} or {@code "https:"} (case-insensitive) it is treated as a
     * URL; otherwise it is assumed to be a local file name.
     *
     * <p>For URLs the text is decoded with the charset named in the response's
     * {@code Content-Type} header, falling back to the platform default charset
     * when the header names none (or an unsupported one). Local files are read
     * with the platform default charset. The caller must close the reader.
     *
     * @param uri URL or local file name
     * @return a reader positioned at the start of the content
     * @throws IOException if the connection or the file cannot be opened
     */
    static Reader getReader(String uri) throws IOException {
        if (isHttpUri(uri)) {
            // Retrieve from Internet.
            HttpURLConnection conn = (HttpURLConnection) new URL(uri).openConnection();
            // Open the body first so that connection failures surface as IOException.
            InputStream body = conn.getInputStream();
            Charset charset =
                charsetFromContentType(conn.getContentType(), Charset.defaultCharset());
            return new InputStreamReader(body, charset);
        }
        // Retrieve from file.
        return new FileReader(uri);
    }

    /**
     * Tells whether {@code uri} starts with the {@code http:} or {@code https:}
     * scheme, ignoring case.
     *
     * @param uri candidate string; may be {@code null}
     * @return {@code true} for HTTP(S) URIs, {@code false} otherwise
     */
    static boolean isHttpUri(String uri) {
        return uri != null
            && (uri.regionMatches(true, 0, "http:", 0, 5)
                || uri.regionMatches(true, 0, "https:", 0, 6));
    }

    /**
     * Extracts the {@code charset} parameter from a {@code Content-Type} header
     * value such as {@code "text/html; charset=UTF-8"}.
     *
     * @param contentType header value; may be {@code null}
     * @param fallback    charset returned when no usable charset is named
     * @return the named charset, or {@code fallback}
     */
    static Charset charsetFromContentType(String contentType, Charset fallback) {
        if (contentType == null) {
            return fallback;
        }
        String[] parts = contentType.split(";");
        // parts[0] is the media type itself; parameters start at index 1.
        for (int i = 1; i < parts.length; i++) {
            String param = parts[i].trim();
            if (!param.regionMatches(true, 0, "charset=", 0, 8)) {
                continue;
            }
            String name = param.substring(8).trim();
            // The value may be written as a quoted string.
            if (name.length() >= 2
                    && name.charAt(0) == '"'
                    && name.charAt(name.length() - 1) == '"') {
                name = name.substring(1, name.length() - 1);
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // Illegal or unsupported charset name: decode with the fallback
                // rather than failing the whole download.
                return fallback;
            }
        }
        return fallback;
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值