Building on version 1, this version adds a simple breadth-first URL crawling algorithm.
Drawback: everything is single-threaded; the URL crawling algorithm and the news-content crawling run in the same thread, so throughput is poor. Optimization will continue in later versions.
(I have kept the crawl rate modest and never crawled aggressively (this is a beginner project), so there is no IP proxy rotation.)
To do: use multiple threads to separate the two concerns (content-crawling algorithm vs. URL-crawling algorithm), raise crawl throughput, refine the crawl algorithm, and choose a suitable data structure for the to-crawl queue. A rough sketch of that threaded split follows below.
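Purely as an illustration of the planned split, here is a minimal producer/consumer sketch, assuming one URL-catcher thread feeding the queue and a small pool of content workers draining it. The ThreadedCrawlSketch class, the POISON sentinel, and the pool size of 4 are hypothetical choices, not part of the current code:

import java.util.Set;
import java.util.concurrent.*;

public class ThreadedCrawlSketch {
    // Hypothetical shutdown sentinel: consumers stop when they take it from the queue
    private static final String POISON = "__STOP__";

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        // ConcurrentHashMap-backed set so producer and consumers can share it safely
        Set<String> catchedUrl = ConcurrentHashMap.newKeySet();

        ExecutorService consumers = Executors.newFixedThreadPool(4);
        for (int i = 0; i < 4; i++) {
            consumers.submit(() -> {
                try {
                    while (true) {
                        String url = toCatcheUrl.take();   // blocks until a URL is available
                        if (POISON.equals(url)) break;     // stop signal
                        if (catchedUrl.add(url)) {
                            // content-crawling concern only: parse the news page here
                        }
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
        }

        // URL-catching concern: a producer thread would call urlCatch(...) and feed the queue;
        // here we just seed it once so the sketch runs
        toCatcheUrl.put("http://news.ifeng.com/");
        for (int i = 0; i < 4; i++) { toCatcheUrl.put(POISON); }
        consumers.shutdown();
        consumers.awaitTermination(1, TimeUnit.MINUTES);
    }
}

ConcurrentHashMap.newKeySet() stands in for the current HashSet so the visited set stays safe once several threads share it; LinkedBlockingQueue is already thread-safe, which makes it a natural fit for the to-crawl queue.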
----------------main test method-------------
package com.kimt.newsdrawler;

import com.kimt.newsdrawler.crawler.IFengCrawler;
import com.kimt.newsdrawler.dto.News;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Created by man on 2017/11/21.
 */
public class UserMain {

    private static Logger logger = LoggerFactory.getLogger(UserMain.class);

    public static void main(String[] args) {
        /* version_1.0
        new IFengCrawler("http://news.ifeng.com/a/20171121/53459907_0.shtml").parserForNews();*/

        /* version_2.0: test the breadth-first traversal algorithm
        // Initialize the to-crawl URL queue and the crawled URL set
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();
        // Pass in the seed URL and crawl URLs into the queue
        new IFengUrlCatcher(toCatcheUrl, catchedUrl).urlCatch("http://news.ifeng.com/");
        logger.info("info: {}", toCatcheUrl);*/

        /* version_2.1: test the breadth-first traversal algorithm and crawl the data */
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();
        List<News> list = new IFengCrawler(toCatcheUrl, catchedUrl).parserForNews("http://news.ifeng.com/");
        logger.info("Crawled {} news articles in total", list.size());
    }
}
----------------Seed-URL-driven breadth-first URL crawling algorithm-------------
package com.kimt.newsdrawler.urlcatcher;

import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * @Date create on 2017/11/22
 * @author man
 * @Description
 */
public class IFengUrlCatcher extends AbstractUrlCatcher {

    private static Logger logger = LoggerFactory.getLogger(IFengUrlCatcher.class);

    /** Queue of URLs waiting to be crawled */
    private LinkedBlockingQueue<String> toCatcheUrl;
    /** Set of URLs already crawled */
    private HashSet<String> catchedUrl;

    /**
     * @param toCatcheUrl queue of URLs waiting to be crawled
     * @param catchedUrl  set of URLs already crawled
     */
    public IFengUrlCatcher(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
    }

    @Override
    public void urlCatch(String seedUrl) {
        try {
            CloseableHttpResponse httpResponse = HttpClientUtil.getHttpResponse(seedUrl);
            HttpEntity entity = httpResponse.getEntity();
            // Convert the entity into an HTML string
            String html = EntityUtils.toString(entity, "utf-8");
            httpResponse.close();
            // Traverse the page and enqueue every URL that can yield news content
            traversalUrlForIFengNews(html);
        } catch (IOException e) {
            logger.error("IOException while fetching " + seedUrl, e);
        }
    }

    /**
     * Traverse the HTML page, collect every URL that points to a news-content
     * page, and enqueue it; the extraction rules differ per news site.
     * @param html the page source
     */
    private void traversalUrlForIFengNews(String html) {
        String baseUrl = "news.ifeng.com";
        String url;
        Document doc = Jsoup.parse(html);
        // Collect all <a> tags on the page
        Elements elements = doc.getElementsByTag("a");
        for (Element e : elements) {
            // Read the href attribute (the URL) of each <a> tag
            url = e.attr("href");
            // Enqueue the URL if it belongs to the news site and has not been crawled yet
            if (url.contains(baseUrl) && !catchedUrl.contains(url)) {
                try {
                    toCatcheUrl.put(url.trim());
                } catch (InterruptedException e1) {
                    logger.error("InterruptedException", e1);
                }
            }
        }
    }
}
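One subtlety in traversalUrlForIFengNews: the dedup check catchedUrl.contains(url) runs on the raw href, while the queue stores the trimmed value, so the same link with trailing whitespace or a #fragment can slip past the check and be enqueued twice. A minimal normalization sketch; the UrlNormalizer class and normalize method are hypothetical names, not part of the project:

public final class UrlNormalizer {
    /** Trim whitespace and drop any #fragment so equal pages compare equal. */
    public static String normalize(String href) {
        String u = href.trim();
        int hash = u.indexOf('#');
        return hash >= 0 ? u.substring(0, hash) : u;
    }

    public static void main(String[] args) {
        System.out.println(normalize(" http://news.ifeng.com/a/1.shtml#comments "));
        // -> http://news.ifeng.com/a/1.shtml
    }
}

Calling normalize(url) once, before both the contains check and the put, would keep the queue and the visited set consistent.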
----------------HttpClient wrapper utility to reduce duplicated code (to be optimized later)-------------
package com.kimt.newsdrawler.httpclientutils;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * @author man
 * @Date create on 2017/11/22
 * @Description
 */
public class HttpClientUtil {

    private static Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);

    public static CloseableHttpResponse getHttpResponse(String url) throws IOException {
        // Note: a new client is created per request and never closed; see the pooling sketch below
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        // Set request headers
        httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
        httpGet.setHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        // Execute the GET request
        return client.execute(httpGet);
    }
}
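The divider above flags this utility as still to be optimized; the main cost right now is that HttpClients.createDefault() builds a fresh client (and connection pool) for every request and never closes it. A minimal sketch of one common fix, sharing a single pooled client across all requests; it assumes Apache HttpClient 4.3+, and the PooledHttpClientUtil name and pool limits are illustrative choices:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

import java.io.IOException;

public class PooledHttpClientUtil {
    // One shared client backed by a connection pool, reused for every request
    private static final CloseableHttpClient CLIENT;

    static {
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(50);            // total connections across all hosts (arbitrary value)
        cm.setDefaultMaxPerRoute(10);  // connections per host (arbitrary value)
        CLIENT = HttpClients.custom().setConnectionManager(cm).build();
    }

    public static CloseableHttpResponse getHttpResponse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        return CLIENT.execute(httpGet);
    }
}

With this design, callers close only the response (or consume its entity) so the connection returns to the pool; the shared client itself stays open for the life of the crawler.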
------------News crawler class-----------
package com.kimt.newsdrawler.crawler;

import com.kimt.newsdrawler.dto.News;
import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import com.kimt.newsdrawler.urlcatcher.IFengUrlCatcher;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * @author kimt
 * Created by man on 2017/11/23.
 */
public class IFengCrawler extends AbstractCrawler {

    private Logger logger = LoggerFactory.getLogger(IFengCrawler.class);

    /** Queue of URLs waiting to be crawled */
    private LinkedBlockingQueue<String> toCatcheUrl;
    /** Set of URLs already crawled */
    private HashSet<String> catchedUrl;
    private IFengUrlCatcher urlCatcher;

    public IFengCrawler(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
        this.urlCatcher = new IFengUrlCatcher(toCatcheUrl, catchedUrl);
    }

    @Override
    public List<News> parserForNews(String seedUrl) {
        // Crawl the seed URL first to initialize the to-crawl queue
        urlCatcher.urlCatch(seedUrl);
        List<News> list = new ArrayList<News>();
        try {
            String url;
            int stateCode;
            HttpEntity entity;
            CloseableHttpResponse response;
            // Crawl up to 200 URLs from the to-crawl queue
            for (int i = 0; i < 200; i++) {
                // Take one URL from the to-crawl queue
                url = toCatcheUrl.take();
                // Breadth-first: enqueue every crawlable URL found on this page
                urlCatcher.urlCatch(url);
                // Send a GET request to the URL
                response = HttpClientUtil.getHttpResponse(url);
                // Read the HTTP status code
                stateCode = response.getStatusLine().getStatusCode();
                if (stateCode == AbstractCrawler.HTTP_RESPONSE_CODE_SUCCESS) {
                    News news = null;
                    // Read the entity from the response
                    entity = response.getEntity();
                    // Convert the entity into an HTML string
                    String html = EntityUtils.toString(entity, "utf-8");
                    // Parse the HTML with Jsoup
                    Document doc = Jsoup.parse(html);
                    String title = doc.title();
                    // First kind of ifeng news page layout
                    Element articleDiv = doc.getElementById("artical");
                    if (articleDiv != null) {
                        news = parseOne(articleDiv, title);
                    } else {
                        // Second kind of ifeng news page layout, parsed the second way
                        Element article2Div = doc.getElementsByClass("yc_main wrap").first();
                        if (article2Div != null) {
                            news = parseTwo(article2Div, title);
                        }
                    }
                    // Keep the crawled News object
                    if (news != null) {
                        list.add(news);
                    }
                    // Mark the URL as crawled
                    catchedUrl.add(url);
                    // Release resources
                    EntityUtils.consume(entity);
                }
            }
        } catch (IOException e) {
            logger.error("IOException", e);
        } catch (ParseException e) {
            logger.error("ParseException", e);
        } catch (InterruptedException e) {
            logger.error("InterruptedException", e);
        }
        return list;
    }

    /**
     * @param articleDiv the div closest to the news content
     * @param title the article title
     * @return a News object
     * Use the browser dev tools to inspect the page source, locate the relevant
     * DOM nodes, and extract the wanted data with Jsoup.
     */
    private News parseOne(Element articleDiv, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (articleDiv != null) {
            // Node holding the news source and publish time
            Element headDiv = articleDiv.getElementById("artical_sth");
            // Node holding the news content
            Element contentDiv = articleDiv.getElementById("main_content");
            if (headDiv != null) {
                // Read the publish time
                String publishTime = headDiv.getElementsByClass("ss01").text();
                // Read the news source
                String origin = headDiv.getElementsByClass("ss03").text();
                // Convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // Fill the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Remove <img> tags
                contentDiv.select("img").remove();
                // Keep the content as HTML (not plain text()) so it can be split into paragraphs later
                String content = contentDiv.html();
                // Fill the News object
                news.setContent(content);
            }
        }
        return news;
    }

    /**
     * @param article2Div the div closest to the news content
     * @param title the article title
     * @return a News object
     * Use the browser dev tools to inspect the page source, locate the relevant
     * DOM nodes, and extract the wanted data with Jsoup.
     */
    private News parseTwo(Element article2Div, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (article2Div != null) {
            // Node holding the news source and publish time
            Element headDiv = article2Div.getElementsByClass("yc_tit").first();
            // Node holding the news content
            Element contentDiv = article2Div.getElementById("yc_con_txt");
            if (headDiv != null) {
                // Read the publish time
                String publishTime = headDiv.getElementsByTag("span").text();
                // Read the news source
                String origin = headDiv.getElementsByTag("a").first().text();
                // Convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // Fill the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Remove useless divs and scripts
                contentDiv.select("div").remove();
                contentDiv.select("script").remove();
                // Keep the content as HTML (not plain text()) so it can be split into paragraphs later
                String content = contentDiv.html();
                // Fill the News object
                news.setContent(content);
            }
        }
        return news;
    }
}
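One more edge worth guarding in parserForNews: toCatcheUrl.take() blocks forever if the queue empties before the 200th iteration (for example, when the seed page yields few links). A poll with a timeout is a drop-in alternative; a minimal sketch, where the queue name matches the project but the 5-second timeout is an arbitrary choice:

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class PollSketch {
    public static void main(String[] args) throws InterruptedException {
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        toCatcheUrl.put("http://news.ifeng.com/");
        for (int i = 0; i < 200; i++) {
            // Wait at most 5 seconds instead of blocking indefinitely like take()
            String url = toCatcheUrl.poll(5, TimeUnit.SECONDS);
            if (url == null) {
                break; // queue stayed empty: stop crawling instead of hanging
            }
            System.out.println("would crawl: " + url);
        }
    }
}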