Goal
1. Use HttpClient to simulate sending HTTP requests and fetch page data.
2. Use Jsoup to parse the data (HTML) fetched in step 1.
Preparation:
Build the project with Maven and add the logback, httpclient, and jsoup dependencies.
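A sketch of what the dependency section of pom.xml might look like; the version numbers are assumptions typical of the time of writing, not taken from the original post:

<dependencies>
    <!-- HTTP client for fetching pages (version is an assumption) -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- HTML parser (version is an assumption) -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- SLF4J binding for logging (version is an assumption) -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>
</dependencies>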
Project structure
Code:
-------- Abstract parent class --------
/**
 * Created by man on 2017/11/22.
 */
public abstract class AbstractCrawler<T> {
    protected static final int HTTP_RESPONSE_CODE_SUCCESS = 200;

    public abstract T parserForNews();
}
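The News DTO (com.kimt.newsdrawler.dto.News) is not shown in the original post. Below is a minimal sketch inferred from the setters the crawler calls; the exact field set is an assumption:

package com.kimt.newsdrawler.dto;

import java.util.Date;

// Minimal sketch of the DTO; fields inferred from the setters used in IFengCrawler
public class News {
    private String title;       // page title
    private String origin;      // news source
    private Date publishTime;   // publication time
    private String content;     // article body, kept as HTML

    public void setTitle(String title) { this.title = title; }
    public void setOrigin(String origin) { this.origin = origin; }
    public void setPublishTime(Date publishTime) { this.publishTime = publishTime; }
    public void setContent(String content) { this.content = content; }

    @Override
    public String toString() {
        return "News{title='" + title + "', origin='" + origin
                + "', publishTime=" + publishTime + "}";
    }
}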
-------- Test main method --------
/**
 * Created by man on 2017/11/21.
 */
public class UserMain {
    public static void main(String[] args) {
        new IFengCrawler("http://news.ifeng.com/a/20171121/53459907_0.shtml").parserForNews();
    }
}
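With logback-classic on the classpath, a minimal logback.xml under src/main/resources enables console logging. A typical sketch (this exact configuration is an assumption, not from the original project):

<configuration>
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>
    <root level="INFO">
        <appender-ref ref="STDOUT"/>
    </root>
</configuration>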
---- Concrete crawler implementation ---- (The site uses several page layouts, so two common ones were chosen for parsing: parseOne and parseTwo.)
package com.kimt.newsdrawler.crawler;
import com.kimt.newsdrawler.dto.News;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * @author kimt
 * Created by man on 2017/11/22.
 */
public class IFengCrawler extends AbstractCrawler<News> {
    private static final Logger logger = LoggerFactory.getLogger(IFengCrawler.class);
    private final String url;

    public IFengCrawler(String url) {
        this.url = url;
    }
    @Override
    public News parserForNews() {
        // try-with-resources closes the client and the response automatically
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(url);
            // Set request headers
            httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
            httpGet.setHeader("Accept-Charset", "utf-8,*;q=0.7");
            httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            httpGet.setHeader("Accept-Encoding", "gzip, deflate");
            // Execute the GET request
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                // Get the HTTP status code
                int statusCode = response.getStatusLine().getStatusCode();
                if (statusCode == AbstractCrawler.HTTP_RESPONSE_CODE_SUCCESS) {
                    News news = null;
                    // Get the entity from the response
                    HttpEntity entity = response.getEntity();
                    // Convert the entity into an HTML string
                    String html = EntityUtils.toString(entity, "utf-8");
                    // Parse the HTML with Jsoup
                    Document doc = Jsoup.parse(html);
                    String title = doc.title();
                    // First kind of news page layout
                    Element articleDiv = doc.getElementById("artical");
                    if (articleDiv != null) {
                        news = parseOne(articleDiv, title);
                    } else {
                        // Second kind of news page layout; getElementsByClass expects a
                        // single class name, so select the element carrying both classes
                        Element article2Div = doc.select(".yc_main.wrap").first();
                        if (article2Div != null) {
                            news = parseTwo(article2Div, title);
                        }
                    }
                    if (news != null) {
                        logger.info(news.toString());
                    }
                    // Return the crawled News object
                    return news;
                }
            }
        } catch (IOException e) {
            logger.error("IOException: {}", e.getMessage(), e);
        } catch (ParseException e) {
            logger.error("ParseException: {}", e.getMessage(), e);
        }
        return null;
    }
    /**
     * Use the browser's developer tools to inspect the page source, locate the
     * relevant DOM nodes, and extract the wanted data with Jsoup.
     *
     * @param articleDiv the div closest to the news content
     * @param title the article title
     * @return a News object
     */
    private News parseOne(Element articleDiv, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (articleDiv != null) {
            // Node holding the news source and publish time
            Element headDiv = articleDiv.getElementById("artical_sth");
            // Node holding the news content
            Element contentDiv = articleDiv.getElementById("main_content");
            if (headDiv != null) {
                // Extract the publish time
                String publishTime = headDiv.getElementsByClass("ss01").text();
                // Extract the news source
                String origin = headDiv.getElementsByClass("ss03").text();
                // Convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // Populate the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Drop img tags
                contentDiv.select("img").remove();
                // Keep the content as HTML (rather than text()) so it can be
                // split into paragraphs later
                String content = contentDiv.html();
                news.setContent(content);
            }
        }
        return news;
    }
    /**
     * Use the browser's developer tools to inspect the page source, locate the
     * relevant DOM nodes, and extract the wanted data with Jsoup.
     *
     * @param article2Div the div closest to the news content
     * @param title the article title
     * @return a News object
     */
    private News parseTwo(Element article2Div, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (article2Div != null) {
            // Node holding the news source and publish time
            Element headDiv = article2Div.getElementsByClass("yc_tit").first();
            // Node holding the news content
            Element contentDiv = article2Div.getElementById("yc_con_txt");
            if (headDiv != null) {
                // Extract the publish time
                String publishTime = headDiv.getElementsByTag("span").text();
                // Extract the news source
                String origin = headDiv.getElementsByTag("a").first().text();
                // Convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // Populate the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Drop divs and scripts that are not part of the article body
                contentDiv.select("div").remove();
                contentDiv.select("script").remove();
                // Keep the content as HTML (rather than text()) so it can be
                // split into paragraphs later
                String content = contentDiv.html();
                news.setContent(content);
            }
        }
        return news;
    }
}
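Both parsers deliberately store the content as HTML so it can be split into paragraphs later. A sketch of how that follow-up step might look with Jsoup (the ParagraphSplitter helper is hypothetical, not part of the original project):

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

public class ParagraphSplitter {
    // Hypothetical helper: split the stored content HTML into plain-text paragraphs
    public static List<String> splitParagraphs(String contentHtml) {
        List<String> paragraphs = new ArrayList<>();
        // parseBodyFragment wraps the fragment in a synthetic <body> for parsing
        for (Element p : Jsoup.parseBodyFragment(contentHtml).select("p")) {
            String text = p.text().trim();
            if (!text.isEmpty()) {
                paragraphs.add(text);
            }
        }
        return paragraphs;
    }
}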