HttpClient的下载地址:http://hc.apache.org/downloads.cgi
Jsoup的下载地址:http://jsoup.org/download
在Eclipse中导入所下载的包即可。
首先利用HttpClient获取目标网站的html文件,然后通过jsoup来解析。
代码如下:
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class Select_Element_Demo {
    /**
     * Demo: fetches a web page with Apache HttpClient, then uses Jsoup to parse
     * the HTML and print the text of the element with id {@code ibm-content}.
     *
     * @param args unused command-line arguments
     * @throws IOException declared for API compatibility; runtime failures are
     *                     actually caught and printed below
     */
    public static void main(String[] args) throws IOException {
        // NOTE(review): DefaultHttpClient is deprecated since HttpClient 4.3;
        // consider HttpClients.createDefault() when upgrading the library.
        HttpClient httpclient = new DefaultHttpClient();
        try {
            // Target page to crawl.
            HttpGet httpget = new HttpGet("http://www.ibm.com/developerworks/cn/java/j-lo-jsouphtml/");
            System.out.println("executing request " + httpget.getURI());
            HttpResponse response = httpclient.execute(httpget); // perform the GET
            HttpEntity entity = response.getEntity();            // response body
            System.out.println("getStatusLine" + response.getStatusLine());
            if (entity != null) {
                System.out.println("Response content length: " + entity.getContentLength());
                // Fix: pass the charset explicitly. The single-argument overload
                // falls back to ISO-8859-1 when the response declares no charset,
                // which garbles non-Latin (e.g. Chinese) pages like this one.
                String html = EntityUtils.toString(entity, "UTF-8");
                Document doc = Jsoup.parse(html); // wrap the HTML in a Document
                Element element = doc.getElementById("ibm-content"); // target section
                // Fix: getElementById returns null when the id is absent; the
                // original would throw a NullPointerException here (masked by the
                // broad catch below).
                if (element != null) {
                    System.out.println("body:" + element.text());
                } else {
                    System.out.println("Element with id 'ibm-content' not found");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the connection and its resources.
            httpclient.getConnectionManager().shutdown();
        }
    }
}
从上面可以看到,利用HttpClient和Jsoup来从网站爬取信息很容易。