HttpClient和Jsoup爬虫实例

最新推荐文章于 2024-05-25 11:24:08 发布

我不想再熬夜了

最新推荐文章于 2024-05-25 11:24:08 发布

阅读量1.8k

点赞数 2

分类专栏： Java 文章标签： HttpClient、Jsoup

本文链接：https://blog.csdn.net/qq_42969074/article/details/85702541

版权

Java 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

最近学习了一个用HttpClient+Jsoup实现的爬虫项目，为此我先学习了HttpClient和Jsoup的相关内容。项目完整代码的地址在文章最下面，两篇学习笔记的链接如下：

HttpClient学习：https://blog.csdn.net/qq_42969074/article/details/85618628

Jsoup学习：https://blog.csdn.net/qq_42969074/article/details/85562751

我的项目目录如下：

1.首先创建一个数据库：

/*
Navicat MySQL Data Transfer

Source Server         : localhost_3306
Source Server Version : 50720
Source Host           : localhost:3306
Source Database       : db_blogcrawler

Target Server Type    : MYSQL
Target Server Version : 50720
File Encoding         : 65001

Date: 2019-01-03 17:21:21
*/

-- Schema script for the crawler database: drops and recreates the single
-- table `t_blog` that the crawler inserts crawled posts into.
-- WARNING: running this script discards any rows already stored in `t_blog`.

-- Disable foreign key checks while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `t_blog`
-- ----------------------------
-- Column order matters: CnBlogCrawler inserts positionally with
-- "insert into t_blog values(null,?,?,now(),?)", i.e.
--   id          auto-generated primary key (inserted as null)
--   title       blog title
--   content     blog body HTML
--   crawlerDate time of the insert (now())
--   oldUrl      original URL of the crawled post
-- NOTE(review): title/oldUrl are limited to 100 characters and content is TEXT;
-- longer titles, URLs or posts may be rejected or truncated - confirm the limits.
-- NOTE(review): CHARSET=utf8 is MySQL's 3-byte UTF-8; consider utf8mb4 if crawled
-- posts may contain 4-byte characters such as emoji - confirm with the JDBC settings.
DROP TABLE IF EXISTS `t_blog`;
CREATE TABLE `t_blog` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(100) DEFAULT NULL,
  `content` text,
  `crawlerDate` datetime DEFAULT NULL,
  `oldUrl` varchar(100) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of t_blog
-- ----------------------------
-- (the dump contains no rows)

2. 然后在C盘创建一个ehcache.xml，内容如下：

<?xml version="1.0" encoding="UTF-8"?>

<ehcache>
   <!-- 
         Disk store: objects in the cache that are not currently in use are moved
         to the hard disk, similar to virtual memory on Windows.
          path: the directory on disk where the objects are stored
   -->
   <diskStore path="C:\blogehcache" />
   
   <!-- 
        defaultCache: the default cache settings; every cache without its own
                      settings is handled according to this entry
        maxElementsInMemory: upper bound of the cache, i.e. the maximum number of
                             elements kept in memory
        eternal: whether the elements never expire
        overflowToDisk: when the number of elements in memory reaches
                        maxElementsInMemory, Ehcache writes elements to disk
   -->
   <defaultCache
      maxElementsInMemory="1"
      eternal="true"
      overflowToDisk="true"/>

    <!--
        cnblog: the cache that CnBlogCrawler looks up by name; it stores the URLs of
                posts that were already inserted so they are not crawled again.
        maxElementsInMemory is 1 and overflowToDisk is true, so nearly all entries
        live in the disk store configured above.
        diskPersistent: presumably keeps the disk store across JVM restarts,
                        TODO confirm against the Ehcache documentation
    -->
    <cache 
      name="cnblog"
      maxElementsInMemory="1"
      diskPersistent="true"
      eternal="true"
      overflowToDisk="true"/>
      
  

</ehcache>

3.主要代码如下：

package com.thr.crawler;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.thr.util.DateUtil;
import com.thr.util.DbUtil;
import com.thr.util.PropertiesUtil;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;

/**
 * Crawler for the cnblogs.com home page.
 *
 * <p>In an endless loop it downloads the home page, extracts the blog links, downloads
 * every blog that is not yet recorded in the "cnblog" Ehcache cache, stores the images of
 * the blog locally, rewrites the image URLs in the blog HTML and inserts the result into
 * the {@code t_blog} table. Successfully stored links are added to the cache so they are
 * skipped on later passes.
 *
 * <p>All state is kept in static fields and the class is driven from a single thread;
 * interrupting that thread stops the crawler.
 *
 * @author Tang Haorong
 */
public class CnBlogCrawler {

    private static final Logger logger = Logger.getLogger(CnBlogCrawler.class);

    private static final String URL = "http://www.cnblogs.com/";

    /** Pause between two passes over the home page: one minute. */
    private static final long CRAWL_INTERVAL_MILLIS = 1 * 60 * 1000L;

    /** Pause before every image download. */
    private static final long IMAGE_DELAY_MILLIS = 1000L;

    /** Timeouts for HTML pages: 100 s read timeout, 5 s connect timeout. */
    private static final RequestConfig PAGE_REQUEST_CONFIG = RequestConfig.custom()
            .setSocketTimeout(100000)
            .setConnectTimeout(5000)
            .build();

    /** Timeouts for images: 10 s read timeout, 5 s connect timeout. */
    private static final RequestConfig IMAGE_REQUEST_CONFIG = RequestConfig.custom()
            .setSocketTimeout(10000)
            .setConnectTimeout(5000)
            .build();

    private static Connection con = null;

    private static CacheManager manager = null; // cache manager

    private static Cache cache = null; // cache of links that were already stored

    /**
     * Crawls the home page over and over, pausing {@link #CRAWL_INTERVAL_MILLIS}
     * between passes. Returns only when the current thread is interrupted.
     */
    private static void parseHomePage() {
        while (!Thread.currentThread().isInterrupted()) {
            logger.info("开始爬取" + URL + "网页");
            crawlHomePageOnce();
            logger.info("结束爬取" + URL + "网页");
            try {
                Thread.sleep(CRAWL_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                logger.error("InterruptedException", e);
                // Restore the flag and stop; swallowing it would make the crawler unstoppable.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Performs one pass: opens the cache, downloads and processes the home page, and
     * always flushes the cache and shuts the cache manager down afterwards.
     */
    private static void crawlHomePageOnce() {
        manager = CacheManager.create(PropertiesUtil.getValue("cacheFilePath"));
        try {
            cache = manager.getCache("cnblog");
            if (cache == null) {
                logger.error("ehcache配置中未找到名为cnblog的缓存");
                return;
            }
            // The home page connection is released before the (slow) per-blog work starts.
            String webPageContent = fetchPage(URL);
            if (webPageContent != null) {
                parseHomeWebPage(webPageContent);
            }
        } finally {
            if (cache != null && cache.getStatus() == Status.STATUS_ALIVE) {
                cache.flush(); // write the cache to disk
            }
            manager.shutdown();
        }
    }

    /**
     * Downloads a page as UTF-8 text.
     *
     * @param url address of the page
     * @return the page body, or {@code null} when the URL is malformed, the request fails,
     *         the status is not 200 or the response has no body; the reason is logged
     */
    private static String fetchPage(String url) {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        try {
            HttpGet httpget = new HttpGet(url); // throws IllegalArgumentException on a malformed URL
            httpget.setConfig(PAGE_REQUEST_CONFIG);
            response = httpClient.execute(httpget);
            if (response.getStatusLine().getStatusCode() != 200) {
                logger.error(url + "-返回状态非200");
                return null;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                logger.error(url + "-响应内容为空");
                return null;
            }
            return EntityUtils.toString(entity, "utf-8");
        } catch (IllegalArgumentException e) {
            logger.error(url + "-IllegalArgumentException", e);
        } catch (ClientProtocolException e) {
            logger.error(url + "-ClientProtocolException", e);
        } catch (ParseException e) {
            logger.error(url + "-ParseException", e);
        } catch (IOException e) {
            logger.error(url + "-IOException", e);
        } finally {
            // Closed independently so a failure on the response cannot leak the client.
            closeAndLog(response, url);
            closeAndLog(httpClient, url);
        }
        return null;
    }

    /**
     * Closes a resource and logs a failure instead of propagating it.
     *
     * @param resource resource to close, may be {@code null}
     * @param context  prefix for the log message, typically the URL being processed
     */
    private static void closeAndLog(java.io.Closeable resource, String context) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.error(context + "Exception", e);
        }
    }

    /**
     * Parses the home page and crawls every blog link that is not in the cache yet.
     *
     * @param webPageContent HTML of the home page; {@code null} or empty is ignored
     */
    private static void parseHomeWebPage(String webPageContent) {
        if (webPageContent == null || "".equals(webPageContent)) {
            return;
        }
        Document doc = Jsoup.parse(webPageContent);
        Elements links = doc.select("#post_list .post_item .post_item_body h3 a");
        for (Element link : links) {
            if (Thread.currentThread().isInterrupted()) {
                return; // stop requested: do not start another blog
            }
            String url = link.attr("href");
            System.out.println(url);
            if (url.isEmpty()) {
                continue; // anchor without href, nothing to crawl
            }
            if (cache.get(url) != null) { // already stored, do not insert again
                logger.info(url + "-缓存中存在");
                continue;
            }
            parseBlogLink(url);
        }
    }

    /**
     * Downloads one blog page and hands its HTML to {@link #parseBlogPage}.
     *
     * @param link address of the blog
     */
    private static void parseBlogLink(String link) {
        logger.info("开始爬取" + link + "网页");
        String blogContent = fetchPage(link);
        if (blogContent != null) {
            parseBlogPage(blogContent, link);
        }
        logger.info("结束爬取" + link + "网页");
    }

    /**
     * Extracts title and body from a blog page, localizes its images and inserts the blog
     * into the database. On a successful insert the link is added to the cache.
     *
     * @param blogContent HTML of the blog page; {@code null} or empty is ignored
     * @param link        address of the blog, stored as the original URL
     */
    private static void parseBlogPage(String blogContent, String link) {
        if (blogContent == null || "".equals(blogContent)) {
            return;
        }
        Document doc = Jsoup.parse(blogContent);
        Elements titleElements = doc.select("#cb_post_title_url"); // blog title
        if (titleElements.size() == 0) {
            logger.error(link + "-未获取到博客标题");
            return;
        }
        String title = titleElements.get(0).text();
        System.out.println("博客标题：" + title);

        Elements contentElements = doc.select("#cnblogs_post_body"); // blog body
        if (contentElements.size() == 0) {
            logger.error(link + "-未获取到博客内容");
            return;
        }
        String content = contentElements.get(0).html();
        System.out.println("博客内容：" + content);

        List<String> imgUrlList = new LinkedList<String>();
        for (Element imgEle : contentElements.select("img")) { // all images of the body
            String url = imgEle.attr("src");
            imgUrlList.add(url);
            System.out.println(url);
        }

        if (imgUrlList.size() > 0) {
            Map<String, String> replaceImgMap = downLoadImages(imgUrlList);
            content = replaceWebPageImages(content, replaceImgMap);
        }

        // Insert into the database; the statement relies on the column order of t_blog.
        String sql = "insert into t_blog values(null,?,?,now(),?)";
        PreparedStatement pstmt = null;
        try {
            pstmt = con.prepareStatement(sql);
            pstmt.setString(1, title);
            pstmt.setString(2, content);
            pstmt.setString(3, link);
            if (pstmt.executeUpdate() == 1) {
                logger.info(link + "-成功插入数据库");
                cache.put(new net.sf.ehcache.Element(link, link));
                logger.info(link + "-已加入缓存");
            } else {
                logger.info(link + "-插入数据库失败");
            }
        } catch (SQLException e) {
            logger.error(link + "-SQLException", e);
        } finally {
            if (pstmt != null) {
                try {
                    pstmt.close();
                } catch (SQLException e) {
                    logger.error(link + "-SQLException", e);
                }
            }
        }
    }

    /**
     * Replaces the original image addresses in the page content with the local ones.
     *
     * @param content       HTML in which the addresses are replaced
     * @param replaceImgMap original image URL mapped to its new local path
     * @return the content with every mapped URL replaced
     */
    private static String replaceWebPageImages(String content, Map<String, String> replaceImgMap) {
        for (Map.Entry<String, String> entry : replaceImgMap.entrySet()) {
            String url = entry.getKey();
            if (url == null || url.isEmpty()) {
                continue; // replacing "" would insert the path between all characters
            }
            content = content.replace(url, entry.getValue());
        }
        return content;
    }

    /**
     * Downloads the given images to the local image directory.
     *
     * <p>Empty and repeated URLs are skipped. If the thread is interrupted while waiting
     * between downloads, the interrupt flag is restored and the images downloaded so far
     * are returned.
     *
     * @param imgUrlList image URLs taken from the blog body
     * @return original URL mapped to the new path, for every image that was stored
     */
    private static Map<String, String> downLoadImages(List<String> imgUrlList) {
        Map<String, String> replaceImgMap = new HashMap<String, String>();
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            for (String url : imgUrlList) {
                if (url == null || url.isEmpty() || replaceImgMap.containsKey(url)) {
                    continue;
                }
                try {
                    Thread.sleep(IMAGE_DELAY_MILLIS);
                } catch (InterruptedException e) {
                    logger.error("InterruptedException", e);
                    Thread.currentThread().interrupt();
                    break;
                }
                logger.info("开始爬取" + url + "图片");
                String newPath = downloadImage(httpClient, url);
                if (newPath != null) {
                    replaceImgMap.put(url, newPath);
                }
                logger.info("结束爬取" + url + "图片");
            }
        } finally {
            closeAndLog(httpClient, "downLoadImages-");
        }
        return replaceImgMap;
    }

    /**
     * Downloads a single image and stores it under a random file name in the directory
     * for the current date.
     *
     * @param httpClient client used for the request; it is not closed here
     * @param url        address of the image
     * @return the new path to use in the blog HTML, or {@code null} when the image could
     *         not be stored; the reason is logged
     */
    private static String downloadImage(CloseableHttpClient httpClient, String url) {
        CloseableHttpResponse response = null;
        try {
            HttpGet httpget = new HttpGet(url); // throws IllegalArgumentException on a malformed URL
            httpget.setConfig(IMAGE_REQUEST_CONFIG);
            response = httpClient.execute(httpget);
            if (response.getStatusLine().getStatusCode() != 200) {
                logger.error(url + "-返回状态非200");
                return null;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                logger.error(url + "-响应内容为空");
                return null;
            }
            String contentType = entity.getContentType() == null ? null : entity.getContentType().getValue();
            String extension = extensionFromContentType(contentType);
            if (extension == null) {
                logger.error(url + "-无法识别图片类型:" + contentType);
                return null;
            }
            String fileName = DateUtil.getCurrentDatePath() + "/" + UUID.randomUUID().toString() + "." + extension;
            InputStream inputStream = entity.getContent();
            FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath") + fileName));
            return PropertiesUtil.getValue("imagePath") + fileName;
        } catch (ClientProtocolException e) {
            logger.error(url + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(url + "-IOException", e);
        } catch (Exception e) {
            // Also covers a malformed URL and anything the date helper may throw.
            logger.error(url + "-Exception", e);
        } finally {
            closeAndLog(response, url);
        }
        return null;
    }

    /**
     * Derives a file extension from a Content-Type header value, e.g. {@code "jpeg"} from
     * {@code "image/jpeg;charset=UTF-8"}.
     *
     * @param contentType header value, may be {@code null}
     * @return the subtype, or {@code null} when it is missing or contains characters other
     *         than letters, digits, {@code +}, {@code .} and {@code -}; the restriction
     *         keeps a value sent by the remote server from altering the target path
     */
    private static String extensionFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        String mediaType = contentType;
        int parameterStart = mediaType.indexOf(';');
        if (parameterStart >= 0) {
            mediaType = mediaType.substring(0, parameterStart);
        }
        mediaType = mediaType.trim();
        int slash = mediaType.indexOf('/');
        if (slash < 0) {
            return null;
        }
        String subtype = mediaType.substring(slash + 1);
        return subtype.matches("[A-Za-z0-9+.-]+") ? subtype : null;
    }

    /**
     * Opens the database connection and runs the crawler until the thread is interrupted.
     * Nothing is crawled when the connection cannot be created.
     */
    public static void start() {
        DbUtil dbUtil = new DbUtil();
        try {
            con = dbUtil.getCon();
        } catch (Exception e) {
            logger.error("创建数据库连接失败", e);
            return;
        }
        if (con == null) {
            logger.error("创建数据库连接失败");
            return;
        }
        try {
            parseHomePage();
        } finally {
            try {
                con.close();
            } catch (SQLException e) {
                logger.error("SQLException", e);
            }
        }
    }

    public static void main(String[] args) {
        start();
    }

}

还有一些工具类（如 DateUtil、DbUtil、PropertiesUtil）没有在文中贴出来，可以从下面的GitHub地址下载。

4.运行后的效果：

数据库里面数据：

然后图片也有：

GitHub地址：https://github.com/tanghaorong/Crawler

我不想再熬夜了

关注

2
点赞
踩
15

收藏

觉得还不错? 一键收藏
0
评论
HttpClient和Jsoup爬虫实例

最近学习了一个爬虫项目，用到的是HttpClient+Jsoup实现，然后我就学习了一下HttpClient和Jsoup的内容，代码在最下面有地址： HttpClient学习：https://blog.csdn.net/qq_42969074/article/details/85618628 Jsoup学习：https://blog.csdn.net/qq_42969074/article/d...
复制链接

扫一扫