Java网页数据抓取实例(httpclient4.2.1+jsoup1.7.2)

最新推荐文章于 2020-10-07 16:07:27 发布

置顶 Aaron莫言

最新推荐文章于 2020-10-07 16:07:27 发布

阅读量1.2k

点赞数 3

分类专栏： Java 文章标签： Java数据采集 Java网页抓取网站内容

本文链接：https://blog.csdn.net/aoxiangzhe/article/details/54970495

版权

Java 专栏收录该内容

75 篇文章 5 订阅

订阅专栏

之前我写过一篇Java网页数据抓取实例,采用的方式是httpclient+正则表达式(http://blog.csdn.net/aoxiangzhe/article/details/40679127)。现在,我们改用httpclient4.2.1+jsoup1.7.2进行数据抓取。声明:本程序仅供学习交流,请勿用于非法用途。

首先,确定我们要采集的目标数据:

其次,我们查看网页源码,分析其结构:

分析结构完成,我们现在进行基本编码工作:

package com.xxx.moyan.dms.demo;

import java.io.IOException;

import org.apache.http.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xxx.moyan.utils.FetchUtils;

/**
 * Demo entry point: downloads a news list page and prints, for every
 * {@code <li>} entry under the element with id {@code news-list}, its title,
 * link, time, merchant, category and summary.
 */
public class EtmeList {
	/** Address of the list page that is fetched by {@link #main(String[])}. */
	private static final String LIST_URL = "http://www.iautos.cn/news/esczx/";

	/** id of the element whose {@code <li>} descendants are the article entries. */
	private static final String LIST_ELEMENT_ID = "news-list";

	/**
	 * Fetches the list page and hands its HTML to {@link #parserHtml(String)}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws ParseException, IOException {
		String content = FetchUtils.getHtmlByUrl(LIST_URL);
		parserHtml(content);
	}

	/**
	 * Parses the given HTML and prints the fields of every list entry to
	 * standard output.
	 *
	 * <p>If {@code content} is {@code null}, or the page has no element with
	 * id {@code news-list}, a message is written to standard error and the
	 * method returns without printing any entry.
	 *
	 * @param content HTML source of the list page; may be {@code null} when
	 *                the download failed
	 */
	public static void parserHtml(String content) throws ParseException, IOException {
		// FetchUtils.getHtmlByUrl returns null on failure, and Jsoup.parse
		// rejects a null input, so stop here instead of crashing.
		if (content == null) {
			System.err.println("未获取到网页内容,无法解析!");
			return;
		}

		Document doc = Jsoup.parse(content);
		// getElementById yields null when the id is absent (e.g. the page
		// layout changed); guard before chaining select().
		Element newsList = doc.getElementById(LIST_ELEMENT_ID);
		if (newsList == null) {
			System.err.println("页面中未找到id为" + LIST_ELEMENT_ID + "的元素!");
			return;
		}

		for (Element item : newsList.select("li")) {
			// Article title
			System.out.println("获取标题:" + item.select("h4").text());
			// Article link: attr() reads the href of the first matched <a>
			System.out.println("获取文章链接地址:" + item.select("a").attr("href"));
			// Publication time text
			System.out.println("获取文章时间:" + item.select("div[class=time]").text());
			// Merchant / source name
			System.out.println("获取中间商:" + item.select("span[class=name-txt]").text());
			// News category label
			System.out.println("获取资讯类别:" + item.select("i[class=label-txt]").text());
			// Short summary paragraph
			System.out.println("获取文章简略信息:" + item.select("p[class=txt]").text());

			System.out.println("=============================================================");
		}
	}
}
package com.xxx.moyan.utils;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Downloads the HTML source of a web page given its URL.
 *
 * @author Administrator
 */
public class FetchUtils {
	/**
	 * Charset used to decode the response body when the response itself does
	 * not declare one. Without an explicit fallback, EntityUtils decodes as
	 * ISO-8859-1, which garbles Chinese text.
	 * NOTE(review): UTF-8 is assumed for the target site; confirm, since some
	 * Chinese sites serve GBK without declaring it.
	 */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Performs an HTTP GET on {@code url} and returns the response body as text.
	 *
	 * @param url address of the page to download
	 * @return the page source, or {@code null} if the request failed, the
	 *         status code was not 200, or the response had no body
	 */
	public static String getHtmlByUrl(String url){
		String html = null;
		HttpClient httpClient = new DefaultHttpClient();
		try {
			// Built inside the try block so that a null or malformed URL is
			// reported and yields null like every other failure.
			HttpGet httpget = new HttpGet(url);
			HttpResponse response = httpClient.execute(httpget);
			int statusCode = response.getStatusLine().getStatusCode();
			if (statusCode == HttpStatus.SC_OK) {
				HttpEntity entity = response.getEntity();
				if (entity != null) {
					html = EntityUtils.toString(entity, DEFAULT_CHARSET);
				}
			} else {
				// Report non-200 responses instead of silently returning null.
				System.out.println("访问【"+url+"】返回状态码:" + statusCode);
			}
		} catch (Exception e) {
			System.out.println("访问【"+url+"】出现异常!");
			e.printStackTrace();
		} finally {
			// Shut down the connection manager to release the connection.
			httpClient.getConnectionManager().shutdown();
		}
		return html;
	}
}

看看我们的成果,大功告成!