之前自己写的一篇Java网页数据抓取实例,采用的方式是httpclient+正则表达式(http://blog.csdn.net/aoxiangzhe/article/details/40679127)。现在,我们采用httpclient4.2.1+jsoup1.7.2进行数据抓取。声明:本程序仅供学习交流,请勿用于非法用途。
首先,确定我们的目标采集数据:
其次,我们请看网页源码,分析网页源码结构:
分析结构完成,我们现在进行基本编码工作:
package com.xxx.moyan.dms.demo;
import java.io.IOException;
import org.apache.http.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.xxx.moyan.utils.FetchUtils;
/**
 * Demo crawler: fetches the used-car news list page from iautos.cn and prints
 * the title, link, time, dealer, category and summary of each list item.
 */
public class EtmeList {

    public static void main(String[] args) throws ParseException, IOException {
        // Download the raw HTML of the news list page.
        String content = FetchUtils.getHtmlByUrl(
                "http://www.iautos.cn/news/esczx/");
        parserHtml(content);
    }

    /**
     * Parses the list-page HTML and prints one record per {@code <li>} item.
     *
     * @param content page HTML source; may be {@code null} when the fetch failed
     * @throws ParseException declared for interface compatibility
     * @throws IOException declared for interface compatibility
     */
    public static void parserHtml(String content) throws ParseException, IOException {
        // Fix: FetchUtils.getHtmlByUrl returns null on failure; the original
        // code would throw an NPE inside Jsoup on a failed fetch.
        if (content == null || content.isEmpty()) {
            System.out.println("页面内容为空,无法解析!");
            return;
        }
        Document doc = Jsoup.parse(content);
        // Fix: getElementById returns null when the page structure changes;
        // guard before calling select() on it.
        Element newsList = doc.getElementById("news-list");
        if (newsList == null) {
            System.out.println("未找到id=news-list节点,页面结构可能已变化!");
            return;
        }
        Elements links = newsList.select("li");
        for (Element e : links) {
            System.out.println("获取标题:" + e.select("h4").text());
            // Article link
            Elements linkHref = e.select("a");
            // Publication time string
            Elements timeStr = e.select("div[class=time]");
            // Dealer / middleman name
            Elements merchant = e.select("span[class=name-txt]");
            // News category label
            Elements info = e.select("i[class=label-txt]");
            // Short summary text
            Elements comment = e.select("p[class=txt]");
            System.out.println("获取文章链接地址:" + linkHref.attr("href"));
            System.out.println("获取文章时间:" + timeStr.text());
            System.out.println("获取中间商:" + merchant.text());
            System.out.println("获取资讯类别:" + info.text());
            System.out.println("获取文章简略信息:" + comment.text());
            System.out.println("=============================================================");
        }
    }
}
package com.xxx.moyan.utils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Fetches the raw HTML source of a web page over HTTP GET using
 * Apache HttpClient 4.x.
 *
 * @author Administrator
 */
public final class FetchUtils {

    // Utility class: prevent instantiation.
    private FetchUtils() {
    }

    /**
     * Downloads the page at the given URL and returns its HTML source.
     *
     * @param url the page URL to fetch
     * @return the HTML source, or {@code null} if the request threw or the
     *         server did not answer with HTTP 200
     */
    public static String getHtmlByUrl(String url) {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient(); // create the client
        HttpGet httpget = new HttpGet(url);              // GET request for the URL
        try {
            HttpResponse response = httpClient.execute(httpget);
            int status = response.getStatusLine().getStatusCode();
            if (status == HttpStatus.SC_OK) { // only accept HTTP 200
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Fix: pass an explicit charset. The no-arg overload falls
                    // back to ISO-8859-1 when the server omits a charset in
                    // Content-Type, which garbles Chinese pages.
                    // NOTE(review): assumes the target site serves UTF-8 —
                    // confirm against the actual response headers.
                    html = EntityUtils.toString(entity, "UTF-8");
                }
            }
        } catch (Exception e) {
            System.out.println("访问【"+url+"】出现异常!");
            e.printStackTrace();
        } finally {
            // Release the connection manager and its resources.
            httpClient.getConnectionManager().shutdown();
        }
        return html;
    }
}
看看我们的成果,大功告成!