Getting Started with Java Web Crawlers

Part 1: What is a web crawler?

  1. A web crawler is a program or script that automatically fetches information from the World Wide Web according to a set of rules.

Part 2: What basic knowledge do you need to write a Java crawler?

  1. JDBC: for writing results to the database.
  2. Ehcache (or Redis): for duplicate-URL detection.
  3. Log4j: for logging.
  4. HttpClient: for sending HTTP requests.
  5. Jsoup: for parsing the returned HTML (a minimal fetch-and-parse sketch follows this list).
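
To see how two of these pieces fit together, here is a minimal sketch that fetches the cnblogs.com home page with HttpClient and parses it with Jsoup. FetchDemo is a made-up class name, and the snippet assumes the httpclient and jsoup dependencies from the pom.xml shown later:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class FetchDemo {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault(); // HttpClient instance
        HttpGet httpget = new HttpGet("http://www.cnblogs.com/");
        try (CloseableHttpResponse response = httpClient.execute(httpget)) {
            // read the response body and hand the HTML to Jsoup
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(html);
            System.out.println(doc.title()); // prints the page <title>
        } finally {
            httpClient.close();
        }
    }
}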

Part 3: A worked example

  1. Crawl the cnblogs.com (博客园 - 代码改变世界) home page: http://www.cnblogs.com/.
  2. Project structure

(screenshot: project structure)


pom.xml: the project's Maven dependencies

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.inossem</groupId>
  <artifactId>BlogSpider</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  <dependencies>
      <!-- MySQL JDBC driver -->
      <dependency>
         <groupId>mysql</groupId>
         <artifactId>mysql-connector-java</artifactId>
         <version>5.1.37</version>
      </dependency>
      
      <!-- HttpClient support -->
       <dependency>
          <groupId>org.apache.httpcomponents</groupId>
          <artifactId>httpclient</artifactId>
          <version>4.5.2</version>
      </dependency>
      
      <!-- Jsoup support -->
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.10.1</version>
      </dependency>
      
      
      <!-- Log4j logging support -->
      <dependency>
          <groupId>log4j</groupId>
          <artifactId>log4j</artifactId>
          <version>1.2.16</version>
      </dependency>
      
      <!-- Ehcache support -->
      <dependency>
          <groupId>net.sf.ehcache</groupId>
          <artifactId>ehcache</artifactId>
          <version>2.10.3</version>
      </dependency>
      
      <!-- Commons IO support -->
      <dependency>
          <groupId>commons-io</groupId>
          <artifactId>commons-io</artifactId>
          <version>2.5</version>
      </dependency>
  </dependencies>
</project>

log4j.properties: logging configuration

log4j.rootLogger=INFO, stdout,D  
  
#Console  
log4j.appender.stdout=org.apache.log4j.ConsoleAppender  
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout  
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n

#D
log4j.appender.D = org.apache.log4j.RollingFileAppender
log4j.appender.D.File = D://bloglogs/log.log
log4j.appender.D.MaxFileSize=100KB
log4j.appender.D.MaxBackupIndex=100
log4j.appender.D.Append = true
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n 
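
With this file on the classpath, Log4j needs no further setup. A minimal usage sketch (LogDemo is a made-up class name) might look like this:

import org.apache.log4j.Logger;

public class LogDemo {

    private static final Logger logger = Logger.getLogger(LogDemo.class);

    public static void main(String[] args) {
        logger.info("crawler started");  // written to the console and appended to D://bloglogs/log.log
        logger.error("request failed");  // ERROR also passes the INFO root threshold
    }
}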

ehcache.xml: cache configuration

<?xml version="1.0" encoding="UTF-8"?>

<ehcache>
   <!-- 
         diskStore: moves objects that are not currently needed in memory out to disk,
         similar to virtual memory on Windows.
         path: the directory on disk where overflowed objects are stored.
   -->
   <diskStore path="D:/blog/ehcache" />
   
   <!-- 
        defaultCache: the default cache configuration; any cache without an explicit
        configuration is handled with these settings.
        maxElementsInMemory: upper limit on the number of objects held in memory.
        eternal: whether cached objects never expire.
        overflowToDisk: when the number of in-memory elements reaches maxElementsInMemory,
        Ehcache writes further elements to disk.
   -->
   <defaultCache
      maxElementsInMemory="1"
      eternal="true"
      overflowToDisk="true"/>

    <cache 
      name="cnblog"
      maxElementsInMemory="1"
      diskPersistent="true"
      eternal="true"
      overflowToDisk="true"/>
      
  

</ehcache>
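
A minimal sketch of how the cnblog cache can be used for duplicate-URL detection. CacheDemo and the sample URL are made up; the configuration path matches the cacheFilePath entry in spider.properties below:

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;

public class CacheDemo {
    public static void main(String[] args) {
        // create the manager from the ehcache.xml above (same path as cacheFilePath in spider.properties)
        CacheManager manager = CacheManager.create("D://ehcache//ehcache.xml");
        Cache cache = manager.getCache("cnblog");

        String url = "http://www.cnblogs.com/some-post.html"; // made-up URL
        if (cache.get(url) == null) {          // not crawled before
            cache.put(new Element(url, url));  // remember it so later runs skip it
        }
        cache.flush();      // persist to disk (diskPersistent="true")
        manager.shutdown();
    }
}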

spider.properties: configuration file

dbUrl=jdbc:mysql://localhost:3306/db_blogs?autoReconnect=true
dbUserName=root
dbPassword=root
jdbcName=com.mysql.jdbc.Driver
cacheFilePath=D://ehcache//ehcache.xml
imageFilePath=D://blogImages/
imagePath=http://localhost:8080/BlogCms/static/blogImages/

DateUtil.java: date utility class

package com.inossem.blog.util;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Date utility class
 * @author user
 */
public class DateUtil {

   /**
    * Returns the current date as a "yyyy/MM/dd" path segment
    * @return
    * @throws Exception
    */
   public static String getCurrentDatePath()throws Exception{
      Date date=new Date();
      SimpleDateFormat sdf=new SimpleDateFormat("yyyy/MM/dd");
      return sdf.format(date);
   }
   
   public static void main(String[] args) {
      try {
         System.out.println(getCurrentDatePath());
      } catch (Exception e) {
         // TODO Auto-generated catch block
         e.printStackTrace();
      }
   }
}

DbUtil.java: database utility class

package com.inossem.blog.util;

import java.sql.Connection;
import java.sql.DriverManager;

/**
 * Database utility class
 * @author user
 */
public class DbUtil {

   /**
    * Obtains a database connection
    * @return
    * @throws Exception
    */
   public Connection getCon()throws Exception{
      Class.forName(PropertiesUtil.getValue("jdbcName"));
      Connection con=DriverManager.getConnection(PropertiesUtil.getValue("dbUrl"), PropertiesUtil.getValue("dbUserName"), PropertiesUtil.getValue("dbPassword"));
      return con;
   }
   
   /**
    * Closes the connection
    * @param con
    * @throws Exception
    */
   public void closeCon(Connection con)throws Exception{
      if(con!=null){
         con.close();
      }
   }
   
   public static void main(String[] args) {
      DbUtil dbUtil=new DbUtil();
      try {
         dbUtil.getCon();
         System.out.println("Database connection succeeded");
      } catch (Exception e) {
         e.printStackTrace();
         System.out.println("Database connection failed");
      }
   }
}

PropertiesUtil.java: utility class for reading the configuration file

package com.inossem.blog.util;

import java.io.*;
import java.util.Properties;

/**
 * Properties utility class
 * @author user
 */
public class PropertiesUtil {

   /**
    * Returns the value for the given key
    * @param key
    * @return
    */
   public static String getValue(String key){
      Properties prop=new Properties();
      // try-with-resources so the stream is always closed
      try (InputStream in = new FileInputStream("src/main/resources/spider.properties")) {
         prop.load(in);
      } catch (IOException e) {
         e.printStackTrace();
      }
      return prop.getProperty(key);
   }

   public static void main(String[] args) {
      System.out.println(getValue("imageFilePath"));
   }
}

CnBlogSpider.java: the crawler

package com.inossem.blog.spider;

import com.inossem.blog.util.DateUtil;
import com.inossem.blog.util.DbUtil;
import com.inossem.blog.util.PropertiesUtil;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

public class CnBlogSpider {

    private static Logger logger = Logger.getLogger(CnBlogSpider.class);

    private static final String URL = "http://www.cnblogs.com/";

    private static Connection con = null;

    private static CacheManager manager = null; // cache manager
    private static Cache cache = null; // cache instance

    /**
     * Parses the home page
     */
    private static void parseHomePage() {
        logger.info("Start crawling " + URL);
        manager = CacheManager.create(PropertiesUtil.getValue("cacheFilePath"));
        cache = manager.getCache("cnblog");
        CloseableHttpClient httpClient = HttpClients.createDefault(); // obtain an HttpClient instance
        HttpGet httpget = new HttpGet(URL); // create the HttpGet instance
        RequestConfig config = RequestConfig.custom().setSocketTimeout(100000) // read timeout
                .setConnectTimeout(5000)  // connect timeout
                .build();
        httpget.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpget);
        } catch (ClientProtocolException e) {
            logger.error(URL + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(URL + "-IOException", e);
        }
        if (response != null) {
            HttpEntity entity = response.getEntity(); // response entity
            // proceed only if the status code is 200
            if (response.getStatusLine().getStatusCode() == 200) {
                String webPageContent = null;
                try {
                    webPageContent = EntityUtils.toString(entity, "utf-8");
                    parseHomeWebPage(webPageContent);
                } catch (ParseException e) {
                    logger.error(URL + "-ParseException", e);
                } catch (IOException e) {
                    logger.error(URL + "-IOException", e);
                }
            } else {
                logger.error(URL + " - response status was not 200");
            }
        } else {
            logger.error(URL + " - connection timed out");
        }
        try {
            if (response != null) {
                response.close();
            }
            if (httpClient != null) {
                httpClient.close();
            }
        } catch (Exception e) {
            logger.error(URL + "Exception", e);
        }
        if (cache.getStatus() == Status.STATUS_ALIVE) {
            cache.flush(); // persist the cache to disk
        }
        manager.shutdown();
        logger.info("Finished crawling " + URL);
    }

    /**
     * Parses the home page content and extracts the blog links
     *
     * @param webPageContent
     */
    private static void parseHomeWebPage(String webPageContent) {
        if ("".equals(webPageContent)) {
            return;
        }
        Document doc = Jsoup.parse(webPageContent);
        Elements links = doc.select("#post_list .post_item .post_item_body h3 a");
        for (int i = 0; i < links.size(); i++) {
            Element link = links.get(i);
            String url = link.attr("href");
            System.out.println(url);
            if (cache.get(url) != null) { // skip URLs that are already in the cache
                logger.info(url + " - already in cache");
                continue;
            }
            parseBlogLink(url);
        }

    }

    /**
     * Fetches a blog link and retrieves the post content
     *
     * @param link
     */
    private static void parseBlogLink(String link) {
        logger.info("Start crawling " + link);
        CloseableHttpClient httpClient = HttpClients.createDefault(); // obtain an HttpClient instance
        HttpGet httpget = new HttpGet(link); // create the HttpGet instance
        RequestConfig config = RequestConfig.custom().setSocketTimeout(100000) // read timeout
                .setConnectTimeout(5000)  // connect timeout
                .build();
        httpget.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpget);
        } catch (ClientProtocolException e) {
            logger.error(link + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(link + "-IOException", e);
        }
        if (response != null) {
            HttpEntity entity = response.getEntity(); // response entity
            // proceed only if the status code is 200
            if (response.getStatusLine().getStatusCode() == 200) {
                String blogContent = null;
                try {
                    blogContent = EntityUtils.toString(entity, "utf-8");
                    parseBlogPage(blogContent, link);
                } catch (ParseException e) {
                    logger.error(link + "-ParseException", e);
                } catch (IOException e) {
                    logger.error(link + "-IOException", e);
                }
            } else {
                logger.error(link + " - response status was not 200");
            }
        } else {
            logger.error(link + " - connection timed out");
        }
        try {
            if (response != null) {
                response.close();
            }
            if (httpClient != null) {
                httpClient.close();
            }
        } catch (Exception e) {
            logger.error(link + "-Exception", e);
        }
        logger.info("Finished crawling " + link);
    }

    /**
     * Parses the blog page content and extracts the useful fields
     *
     * @param blogContent
     * @param link
     */
    private static void parseBlogPage(String blogContent, String link) {
        if ("".equals(blogContent)) {
            return;
        }
        Document doc = Jsoup.parse(blogContent);
        Elements titleElements = doc.select("#cb_post_title_url"); // blog title
        if (titleElements.size() == 0) {
            logger.error(link + " - no blog title found");
            return;
        }
        String title = titleElements.get(0).text();
        System.out.println("Blog title: " + title);

        Elements contentElements = doc.select("#cnblogs_post_body"); // blog body
        if (contentElements.size() == 0) {
            logger.error(link + " - no blog content found");
            return;
        }
        String content = contentElements.get(0).html();
        System.out.println("Blog content: " + content);

        /**
         * Image handling (disabled by default)
         */
//        Elements imgElements = contentElements.select("img"); // all image elements
//        List<String> imgUrlList = new LinkedList<String>();
//        for (int i = 0; i < imgElements.size(); i++) {
//            Element imgEle = imgElements.get(i);
//            String url = imgEle.attr("src");
//            imgUrlList.add(url);
//            System.out.println(url);
//        }
//
//        if (imgUrlList.size() > 0) {
//            Map<String, String> replaceImgMap = downLoadImages(imgUrlList);
//            String newContent = replaceWebPageImages(content, replaceImgMap);
//            content = newContent;
//        }

        // insert into the database
        String sql = "insert into t_article values(null,?,?,?,now())";
        try (PreparedStatement pstmt = con.prepareStatement(sql)) {
            pstmt.setString(1, title);
            pstmt.setString(2, content);
            pstmt.setString(3, link);
            if (pstmt.executeUpdate() == 1) {
                logger.info(link + " - inserted into the database");
                cache.put(new net.sf.ehcache.Element(link, link));
                logger.info(link + " - added to the cache");
            } else {
                logger.info(link + " - insert into the database failed");
            }
        } catch (SQLException e) {
            logger.error("SQLException", e);
        }
    }

    /**
     * Replaces the original image URLs in the page with the new local ones
     *
     * @param content
     * @param replaceImgMap
     * @return
     */
    private static String replaceWebPageImages(String content, Map<String, String> replaceImgMap) {
        for (String url : replaceImgMap.keySet()) {
            String newPath = replaceImgMap.get(url);
            content = content.replace(url, newPath);
        }
        return content;
    }

    /**
     * Downloads images to the local disk
     *
     * @param imgUrlList
     * @return
     */
    private static Map<String, String> downLoadImages(List<String> imgUrlList) {
        Map<String, String> replaceImgMap = new HashMap<String, String>();

        RequestConfig config = RequestConfig.custom().setSocketTimeout(10000) // read timeout
                .setConnectTimeout(5000)  // connect timeout
                .build();
        CloseableHttpClient httpClient = HttpClients.createDefault(); // obtain an HttpClient instance
        for (int i = 0; i < imgUrlList.size(); i++) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            String url = imgUrlList.get(i);
            logger.info("开始爬取" + url + "图片");

            CloseableHttpResponse response = null;

            try {
                HttpGet httpget = new HttpGet(url); // create the HttpGet instance
                httpget.setConfig(config);
                response = httpClient.execute(httpget);
            } catch (ClientProtocolException e) {
                logger.error(url + "-ClientProtocolException");
            } catch (IOException e) {
                logger.error(url + "-IOException");
            }
            if (response != null) {
                HttpEntity entity = response.getEntity(); // response entity
                // proceed only if the status code is 200
                if (response.getStatusLine().getStatusCode() == 200) {
                    try {
                        InputStream inputStream = entity.getContent();
                        String imageType = entity.getContentType().getValue();
                        String urlB = imageType.split("/")[1];
                        String uuid = UUID.randomUUID().toString();
                        String currentDatePath = DateUtil.getCurrentDatePath();
                        String newPath = PropertiesUtil.getValue("imagePath") + currentDatePath + "/" + uuid + "." + urlB;
                        FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath") + currentDatePath + "/" + uuid + "." + urlB));
                        replaceImgMap.put(url, newPath);
                    } catch (UnsupportedOperationException e) {
                        e.printStackTrace();
                    } catch (IOException e) {
                        e.printStackTrace();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                } else {
                    logger.error("Response status was not 200");
                }
            } else {
                logger.error("Connection timed out");
            }
            try {
                if (response != null) {
                    response.close();
                }
            } catch (Exception e) {
                logger.error("Exception", e);
            }
            logger.info("结束爬取" + url + "图片");
        }

        return replaceImgMap;
    }

    public static void start() {
        DbUtil dbUtil = new DbUtil();
        try {
            con = dbUtil.getCon();
        } catch (Exception e) {
            logger.error("Failed to create the database connection", e);
            return; // without a connection, inserting crawled posts would fail
        }
        parseHomePage();
    }

    public static void main(String[] args) {
        start();
    }
}

DownLoadImageTest.java: image download test class

package com.inossem.blog.spider;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.log4j.Logger;

import com.inossem.blog.util.DateUtil;
import com.inossem.blog.util.PropertiesUtil;

public class DownLoadImageTest {
   
   private static Logger logger=Logger.getLogger(DownLoadImageTest.class);
   
   private static final String link="http://images2015.cnblogs.com/blog/952033/201705/952033-20170511210141910-342481715.png";
   
   public static void main(String[] args) {
      logger.info("开始爬取"+link+"图片");
      CloseableHttpClient httpClient=HttpClients.createDefault(); // 获取HttpClient实例
      HttpGet httpget=new HttpGet(link); // 创建httpget实例
      RequestConfig config=RequestConfig.custom().setSocketTimeout(10000) // 设置读取超时时间
                                               .setConnectTimeout(5000)  // 设置连接超时时间
                                               .build();
      httpget.setConfig(config);
      CloseableHttpResponse response=null;
      try {
         response=httpClient.execute(httpget);
      } catch (ClientProtocolException e) {
         logger.error("ClientProtocolException",e);
      } catch (IOException e) {
         logger.error("IOException",e);
      }
      if(response!=null){
         HttpEntity entity=response.getEntity(); // response entity
         // proceed only if the status code is 200
         if(response.getStatusLine().getStatusCode()==200){
            try {
               InputStream inputStream=entity.getContent();
               String imageType=entity.getContentType().getValue();
               String urlB=imageType.split("/")[1];
               String uuid=UUID.randomUUID().toString();
               FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath")+DateUtil.getCurrentDatePath()+"/"+uuid+"."+urlB));
            } catch (UnsupportedOperationException e) {
               // TODO Auto-generated catch block
               e.printStackTrace();
            } catch (IOException e) {
               // TODO Auto-generated catch block
               e.printStackTrace();
            } catch (Exception e) {
               // TODO Auto-generated catch block
               e.printStackTrace();
            }
         }else{
            logger.error("Response status was not 200");
         }
      }else{
         logger.error("Connection timed out");
      }
      try{
         if(response!=null){
            response.close();
         }
         if(httpClient!=null){
            httpClient.close();
         }
      }catch(Exception e){
         logger.error("Exception", e);
      }
      logger.info("结束爬取"+link+"图片");
   }
}

Database creation script:

CREATE DATABASE db_blogs;
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for t_article
-- ----------------------------
DROP TABLE IF EXISTS `t_article`;
CREATE TABLE `t_article` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary key',
  `title` varchar(200) DEFAULT NULL COMMENT 'blog title',
  `content` longtext COMMENT 'blog body content',
  `orUrl` varchar(1000) DEFAULT NULL COMMENT 'original blog URL',
  `crawlerDate` datetime DEFAULT NULL COMMENT 'crawl date',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

Create ehcache.xml under D:/ehcache (the cacheFilePath configured in spider.properties).

Run CnBlogSpider.java.

Open the database, inspect the crawled data, and pick one record (a query sketch follows below).
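
If you prefer checking from code rather than a GUI client, a small query sketch using the DbUtil class above might look like this (ArticleCheck is a made-up helper):

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

import com.inossem.blog.util.DbUtil;

public class ArticleCheck {
    public static void main(String[] args) throws Exception {
        DbUtil dbUtil = new DbUtil();
        Connection con = dbUtil.getCon();
        // list the most recently crawled articles
        PreparedStatement pstmt = con.prepareStatement(
                "select id, title, orUrl, crawlerDate from t_article order by id desc limit 10");
        ResultSet rs = pstmt.executeQuery();
        while (rs.next()) {
            System.out.println(rs.getInt("id") + "  " + rs.getString("title") + "  " + rs.getString("orUrl"));
        }
        rs.close();
        pstmt.close();
        dbUtil.closeCon(con);
    }
}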

We pick the post titled 【机器学习】算法原理详细推导与实现(一):线性回归 and use its content.

Create a text file named 博客内容.txt ("blog content").

Copy the content column into 博客内容.txt, then rename the file to 博客内容.html.

A partial screenshot of the opened HTML file is shown below:

(screenshot: the crawled post rendered as HTML)

That completes a successful crawl!

Part 4: Recommendations

The Java crawler shared today is entry level. To go deeper, I recommend the WebMagic framework (webmagic).

I also recommend the desktop web-scraping tool 火车头 (火车采集器).

Finally, thank you for joining this share. I will keep learning from our discussions; let's keep working hard and improving together.
