Is a Java Crawler Hard to Learn? An Introduction to Java Crawlers

1. What is a crawler? A crawler is a program or script that automatically fetches information from the World Wide Web according to a defined set of rules.

2. What background knowledge does writing a Java crawler require? The pieces used in this post are listed below; a minimal fetch-and-parse sketch follows the list.

JDBC: working with the database.

Ehcache (or Redis): duplicate-URL detection.

log4j: logging.

HttpClient: sending HTTP requests.

Jsoup: parsing the returned HTML.
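To make the HttpClient and Jsoup roles concrete before walking through the project, here is a minimal, self-contained sketch (the class name is illustrative and not part of the project) that downloads the cnblogs home page and prints its title and links:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class FetchAndParseDemo { // illustrative class name

    public static void main(String[] args) throws Exception {
        String url = "http://www.cnblogs.com/";
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // read the page body and hand it to Jsoup
                String html = EntityUtils.toString(response.getEntity(), "utf-8");
                Document doc = Jsoup.parse(html, url);
                System.out.println("Page title: " + doc.title());
                // print every absolute link on the page
                for (Element a : doc.select("a[href]")) {
                    System.out.println(a.attr("abs:href"));
                }
            }
        } finally {
            httpClient.close();
        }
    }
}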

3. An example: crawling the cnblogs home page (博客园 - 代码改变世界, http://www.cnblogs.com/).

Project structure

pom.xml: the project's Maven dependencies

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.inossem</groupId>
    <artifactId>BlogSpider</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.37</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.16</version>
        </dependency>
        <dependency>
            <groupId>net.sf.ehcache</groupId>
            <artifactId>ehcache</artifactId>
            <version>2.10.3</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
    </dependencies>
</project>

log4j.properties: logging configuration

log4j.rootLogger=INFO, stdout,D

#Console

log4j.appender.stdout=org.apache.log4j.ConsoleAppender

log4j.appender.stdout.Target = System.out

log4j.appender.stdout.layout=org.apache.log4j.PatternLayout

log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n

#D

log4j.appender.D = org.apache.log4j.RollingFileAppender

log4j.appender.D.File = D://bloglogs/log.log

log4j.appender.D.MaxFileSize=100KB

log4j.appender.D.MaxBackupIndex=100

log4j.appender.D.Append = true

log4j.appender.D.layout = org.apache.log4j.PatternLayout

log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
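With this configuration on the classpath, the logger used throughout the crawler can be verified with a couple of lines; this throwaway class is illustrative and not part of the original project:

import org.apache.log4j.Logger;

public class LogSmokeTest { // illustrative helper, only to verify the log4j setup

    private static final Logger logger = Logger.getLogger(LogSmokeTest.class);

    public static void main(String[] args) {
        // should appear on the console and in D://bloglogs/log.log
        logger.info("log4j console and rolling-file appenders are active");
        logger.error("errors are written with the same pattern");
    }
}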

ehcache.xml: cache configuration

<ehcache>

    <defaultCache
        maxElementsInMemory="1"
        eternal="true"
        overflowToDisk="true"/>

    <cache
        name="cnblog"
        maxElementsInMemory="1"
        diskPersistent="true"
        eternal="true"
        overflowToDisk="true"/>

</ehcache>
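The "cnblog" cache above is what gives the crawler duplicate-URL detection: a link is looked up before it is crawled and stored after it is saved. A minimal standalone sketch of that pattern (the URL is illustrative):

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;

public class UrlDedupSketch { // illustrative class name

    public static void main(String[] args) {
        // same path as the cacheFilePath entry in spider.properties below
        CacheManager manager = CacheManager.create("D://ehcache//ehcache.xml");
        Cache cache = manager.getCache("cnblog");

        String url = "http://www.cnblogs.com/some-post/"; // illustrative URL
        if (cache.get(url) != null) {
            System.out.println(url + " was already crawled, skipping");
        } else {
            // not seen before: crawl it here, then remember it
            cache.put(new Element(url, url));
        }

        cache.flush();      // persist the cache to disk
        manager.shutdown();
    }
}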

spider.properties: crawler configuration (database connection, cache file path, image paths)

dbUrl=jdbc:mysql://localhost:3306/db_blogs?autoReconnect=true

dbUserName=root

dbPassword=root

jdbcName=com.mysql.jdbc.Driver

cacheFilePath=D://ehcache//ehcache.xml

imageFilePath=D://blogImages/

imagePath=http://localhost:8080/BlogCms/static/blogmIages/

DateUtil.java: date utility (current date as a yyyy/MM/dd path segment)

package com.inossem.blog.util;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Date utility.
 * @author user
 */
public class DateUtil {

    /**
     * Returns the current date as a "yyyy/MM/dd" path segment.
     * @return
     * @throws Exception
     */
    public static String getCurrentDatePath() throws Exception {
        Date date = new Date();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
        return sdf.format(date);
    }

    public static void main(String[] args) {
        try {
            System.out.println(getCurrentDatePath());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

DbUtil.java: database access utility

package com.inossem.blog.util;

import java.sql.Connection;
import java.sql.DriverManager;

/**
 * Database utility.
 * @author user
 */
public class DbUtil {

    /**
     * Opens a connection using the settings in spider.properties.
     * @return
     * @throws Exception
     */
    public Connection getCon() throws Exception {
        Class.forName(PropertiesUtil.getValue("jdbcName"));
        Connection con = DriverManager.getConnection(PropertiesUtil.getValue("dbUrl"),
                PropertiesUtil.getValue("dbUserName"), PropertiesUtil.getValue("dbPassword"));
        return con;
    }

    /**
     * Closes the given connection.
     * @param con
     * @throws Exception
     */
    public void closeCon(Connection con) throws Exception {
        if (con != null) {
            con.close();
        }
    }

    public static void main(String[] args) {
        DbUtil dbUtil = new DbUtil();
        try {
            dbUtil.getCon();
            System.out.println("Database connection succeeded");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Database connection failed");
        }
    }
}

PropertiesUtil.java: reads values from the spider.properties configuration file

package com.inossem.blog.util;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Properties utility.
 * @author user
 */
public class PropertiesUtil {

    /**
     * Returns the value for the given key from spider.properties.
     * @param key
     * @return
     */
    public static String getValue(String key) {
        Properties prop = new Properties();
        // try-with-resources so the stream is always closed
        try (InputStream in = new FileInputStream("src/main/resources/spider.properties")) {
            prop.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return prop.getProperty(key);
    }

    public static void main(String[] args) {
        System.out.println(getValue("imageFilePath"));
    }
}

CnBlogSpider.java: the crawler itself

package com.inossem.blog.spider;

import com.inossem.blog.util.DateUtil;

import com.inossem.blog.util.DbUtil;

import com.inossem.blog.util.PropertiesUtil;

import net.sf.ehcache.Cache;

import net.sf.ehcache.CacheManager;

import net.sf.ehcache.Status;

import org.apache.commons.io.FileUtils;

import org.apache.http.HttpEntity;

import org.apache.http.ParseException;

import org.apache.http.client.ClientProtocolException;

import org.apache.http.client.config.RequestConfig;

import org.apache.http.client.methods.CloseableHttpResponse;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClients;

import org.apache.http.util.EntityUtils;

import org.apache.log4j.Logger;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import java.io.File;

import java.io.IOException;

import java.io.InputStream;

import java.sql.Connection;

import java.sql.PreparedStatement;

import java.sql.SQLException;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.UUID;

public class CnBlogSpider {

private static Logger logger = Logger.getLogger(CnBlogSpider.class);

private static final String URL = "http://www.cnblogs.com/";

private static Connection con = null;

private static CacheManager manager = null; // cache manager

private static Cache cache = null; // cache instance

/**

* Parse the home page.

*/

private static void parseHomePage() {

logger.info("开始爬取" + URL + "网页");

manager = CacheManager.create(PropertiesUtil.getValue("cacheFilePath"));

cache = manager.getCache("cnblog");

CloseableHttpClient httpClient = HttpClients.createDefault(); // create an HttpClient instance

HttpGet httpget = new HttpGet(URL); // create the HttpGet instance

RequestConfig config = RequestConfig.custom().setSocketTimeout(100000) // read (socket) timeout

.setConnectTimeout(5000) // connect timeout

.build();

httpget.setConfig(config);

CloseableHttpResponse response = null;

try {

response = httpClient.execute(httpget);

} catch (ClientProtocolException e) {

logger.error(URL + "-ClientProtocolException", e);

} catch (IOException e) {

logger.error(URL + "-IOException", e);

}

if (response != null) {

HttpEntity entity = response.getEntity(); // get the response entity

// only parse the page when the status code is 200

if (response.getStatusLine().getStatusCode() == 200) {

String webPageContent = null;

try {

webPageContent = EntityUtils.toString(entity, "utf-8");

parseHomeWebPage(webPageContent);

} catch (ParseException e) {

logger.error(URL + "-ParseException", e);

} catch (IOException e) {

logger.error(URL + "-IOException", e);

}

} else {

logger.error(URL + "-返回状态非200");

}

} else {

logger.error(URL + "-连接超时");

}

try {

if (response != null) {

response.close();

}

if (httpClient != null) {

httpClient.close();

}

} catch (Exception e) {

logger.error(URL + "Exception", e);

}

if (cache.getStatus() == Status.STATUS_ALIVE) {

cache.flush(); // write the cache to disk

}

manager.shutdown();

logger.info("结束爬取" + URL + "网页");

}

/**

* Parse the home page content and extract the blog post links.

*

* @param webPageContent

*/

private static void parseHomeWebPage(String webPageContent) {

if ("".equals(webPageContent)) {

return;

}

Document doc = Jsoup.parse(webPageContent);

Elements links = doc.select("#post_list .post_item .post_item_body h3 a");

for (int i = 0; i < links.size(); i++) {

Element link = links.get(i);

String url = link.attr("href");

System.out.println(url);

if (cache.get(url) != null) { // already in the cache, skip it

logger.info(url + "-缓存中存在");

continue;

}

parseBlogLink(url);

}

}

/**

* Fetch a blog link and retrieve the post content.

*

* @param link

*/

private static void parseBlogLink(String link) {

logger.info("开始爬取" + link + "网页");

CloseableHttpClient httpClient = HttpClients.createDefault(); // create an HttpClient instance

HttpGet httpget = new HttpGet(link); // create the HttpGet instance

RequestConfig config = RequestConfig.custom().setSocketTimeout(100000) // read (socket) timeout

.setConnectTimeout(5000) // connect timeout

.build();

httpget.setConfig(config);

CloseableHttpResponse response = null;

try {

response = httpClient.execute(httpget);

} catch (ClientProtocolException e) {

logger.error(URL + "-ClientProtocolException", e);

} catch (IOException e) {

logger.error(URL + "-IOException", e);

}

if (response != null) {

HttpEntity entity = response.getEntity(); // get the response entity

// only parse the page when the status code is 200

if (response.getStatusLine().getStatusCode() == 200) {

String blogContent = null;

try {

blogContent = EntityUtils.toString(entity, "utf-8");

parseBlogPage(blogContent, link);

} catch (ParseException e) {

logger.error(URL + "-ParseException", e);

} catch (IOException e) {

logger.error(URL + "-IOException", e);

}

} else {

logger.error(URL + "-返回状态非200");

}

} else {

logger.error(URL + "-连接超时");

}

try {

if (response != null) {

response.close();

}

if (httpClient != null) {

httpClient.close();

}

} catch (Exception e) {

logger.error(URL + "Exception", e);

}

logger.info("结束爬取" + link + "网页");

}

/**

* Parse the blog post content and extract the useful fields.

*

* @param blogContent

* @param link

*/

private static void parseBlogPage(String blogContent, String link) {

if ("".equals(blogContent)) {

return;

}

Document doc = Jsoup.parse(blogContent);

Elements titleElements = doc.select("#cb_post_title_url"); // get the post title

if (titleElements.size() == 0) {

logger.error(link + "-未获取到博客标题");

return;

}

String title = titleElements.get(0).text();

System.out.println("博客标题:" + title);

Elements contentElements = doc.select("#cnblogs_post_body"); // get the post body

if (contentElements.size() == 0) {

logger.error(link + "-未获取到博客内容");

return;

}

String content = contentElements.get(0).html();

System.out.println("博客内容:" + content);

/**

* Image handling (the block below is commented out by default; see downLoadImages)

*/

// Elements imgElements = contentElements.select("img"); // get all image elements

// List imgUrlList = new LinkedList();

// for (int i = 0; i < imgElements.size(); i++) {

// Element imgEle = imgElements.get(i);

// String url = imgEle.attr("src");

// imgUrlList.add(url);

// System.out.println(url);

// }

//

// if (imgUrlList.size() > 0) {

// Map replaceImgMap = downLoadImages(imgUrlList);

// String newContent = replaceWebPageImages(content, replaceImgMap);

// content = newContent;

// }

// insert the post into the database

String sql = "insert into t_article values(null,?,?,?,now())";

try {

PreparedStatement pstmt = con.prepareStatement(sql);

pstmt.setString(1, title);

pstmt.setString(2, content);

pstmt.setString(3, link);

if (pstmt.executeUpdate() == 1) {

logger.info(link + "-成功插入数据库");

cache.put(new net.sf.ehcache.Element(link, link));

logger.info(link + "-已加入缓存");

} else {

logger.info(link + "-插入数据库失败");

}

} catch (SQLException e) {

logger.error("SQLException", e);

}

}

/**

* Replace the original image URLs in the page with the new local ones.

*

* @param content

* @param replaceImgMap

* @return

*/

private static String replaceWebPageImages(String content, Map<String, String> replaceImgMap) {

for (String url : replaceImgMap.keySet()) {

String newPath = replaceImgMap.get(url);

content = content.replace(url, newPath);

}

return content;

}

/**

* Download images to the local disk.

*

* @param imgUrlList

* @return

*/

private static Map<String, String> downLoadImages(List<String> imgUrlList) {

Map<String, String> replaceImgMap = new HashMap<String, String>();

RequestConfig config = RequestConfig.custom().setSocketTimeout(10000) // read (socket) timeout

.setConnectTimeout(5000) // connect timeout

.build();

CloseableHttpClient httpClient = HttpClients.createDefault(); // create an HttpClient instance

for (int i = 0; i < imgUrlList.size(); i++) {

try {

Thread.sleep(1000);

} catch (InterruptedException e) {

e.printStackTrace();

}

String url = imgUrlList.get(i);

logger.info("开始爬取" + url + "图片");

CloseableHttpResponse response = null;

try {

HttpGet httpget = new HttpGet(url); // create the HttpGet instance

httpget.setConfig(config);

response = httpClient.execute(httpget);

} catch (ClientProtocolException e) {

logger.error(url + "-ClientProtocolException");

} catch (IOException e) {

logger.error(url + "-IOException");

}

if (response != null) {

HttpEntity entity = response.getEntity(); // get the response entity

// only save the image when the status code is 200

if (response.getStatusLine().getStatusCode() == 200) {

try {

InputStream inputStream = entity.getContent();

String imageType = entity.getContentType().getValue();

String urlB = imageType.split("/")[1];

String uuid = UUID.randomUUID().toString();

String currentDatePath = DateUtil.getCurrentDatePath();

String newPath = PropertiesUtil.getValue("imagePath") + currentDatePath + "/" + uuid + "." + urlB;

FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath") + currentDatePath + "/" + uuid + "." + urlB));

replaceImgMap.put(url, newPath);

} catch (UnsupportedOperationException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

} catch (Exception e) {

e.printStackTrace();

}

} else {

logger.error("返回状态非200");

}

} else {

logger.error("连接超时");

}

try {

if (response != null) {

response.close();

}

} catch (Exception e) {

logger.error("Exception", e);

}

logger.info("结束爬取" + url + "图片");

}

return replaceImgMap;

}

public static void start() {

DbUtil dbUtil = new DbUtil();

try {

con = dbUtil.getCon();

} catch (Exception e) {

logger.error("创建数据库连接失败", e);

}

parseHomePage();

}

public static void main(String[] args) {

start();

}

}

DownLoadImageTest.java: test class for downloading a single image

package com.inossem.blog.spider;

import java.io.File;

import java.io.IOException;

import java.io.InputStream;

import java.util.UUID;

import org.apache.commons.io.FileUtils;

import org.apache.http.HttpEntity;

import org.apache.http.client.ClientProtocolException;

import org.apache.http.client.config.RequestConfig;

import org.apache.http.client.methods.CloseableHttpResponse;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClients;

import org.apache.log4j.Logger;

import com.inossem.blog.util.DateUtil;

import com.inossem.blog.util.PropertiesUtil;

public class DownLoadImageTest {

private static Logger logger=Logger.getLogger(DownLoadImageTest.class);

private static final String link="http://images2015.cnblogs.com/blog/952033/201705/952033-20170511210141910-342481715.png";

public static void main(String[] args) {

logger.info("开始爬取"+link+"图片");

CloseableHttpClient httpClient=HttpClients.createDefault(); // create an HttpClient instance

HttpGet httpget=new HttpGet(link); // create the HttpGet instance

RequestConfig config=RequestConfig.custom().setSocketTimeout(10000) // read (socket) timeout

.setConnectTimeout(5000) // connect timeout

.build();

httpget.setConfig(config);

CloseableHttpResponse response=null;

try {

response=httpClient.execute(httpget);

} catch (ClientProtocolException e) {

logger.error("ClientProtocolException",e);

} catch (IOException e) {

logger.error("IOException",e);

}

if(response!=null){

HttpEntity entity=response.getEntity(); // get the response entity

// only save the image when the status code is 200

if(response.getStatusLine().getStatusCode()==200){

try {

InputStream inputStream=entity.getContent();

String imageType=entity.getContentType().getValue();

String urlB=imageType.split("/")[1];

String uuid=UUID.randomUUID().toString();

FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath")+DateUtil.getCurrentDatePath()+"/"+uuid+"."+urlB));

} catch (UnsupportedOperationException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (Exception e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}else{

logger.error("返回状态非200");

}

}else{

logger.error("连接超时");

}

try{

if(response!=null){

response.close();

}

if(httpClient!=null){

httpClient.close();

}

}catch(Exception e){

logger.error("Exception", e);

}

logger.info("结束爬取"+link+"图片");

}

}

Database creation script:

CREATE DATABASE db_blogs;

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------

-- Table structure for t_article

-- ----------------------------

DROP TABLE IF EXISTS `t_article`;

CREATE TABLE `t_article` (

`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary key',

`title` varchar(200) DEFAULT NULL COMMENT 'blog title',

`content` longtext COMMENT 'blog body content',

`orUrl` varchar(1000) DEFAULT NULL COMMENT 'source blog URL',

`crawlerDate` datetime DEFAULT NULL COMMENT 'date the post was crawled',

PRIMARY KEY (`id`)

) ENGINE=InnoDB AUTO_INCREMENT=23 DEFAULT CHARSET=utf8;
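Once the table exists and the crawler has run, the inserted rows can be checked programmatically as well as in a MySQL client. Here is a small read-back sketch reusing the DbUtil class above (the class name ArticleReadBack is illustrative, not part of the original project):

package com.inossem.blog.spider;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

import com.inossem.blog.util.DbUtil;

public class ArticleReadBack { // illustrative helper class

    public static void main(String[] args) throws Exception {
        DbUtil dbUtil = new DbUtil();
        Connection con = dbUtil.getCon();
        PreparedStatement pstmt = con.prepareStatement(
                "select id, title, orUrl, crawlerDate from t_article order by id desc");
        ResultSet rs = pstmt.executeQuery();
        while (rs.next()) {
            // print one line per crawled post
            System.out.println(rs.getInt("id") + " | " + rs.getString("title") + " | " + rs.getString("orUrl"));
        }
        rs.close();
        pstmt.close();
        dbUtil.closeCon(con);
    }
}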

Create ehcache.xml under D:\ehcache.

Run CnBlogSpider.java.

Open the database, check the crawled rows, and pick one record.

Here we take the post titled 【机器学习】算法原理详细推导与实现(一):线性回归 and use its content field.

Create a text file named 博客内容.txt.

Copy the content into 博客内容.txt, then rename the file to 博客内容.html.

Opening the HTML file shows the rendered post (the original article included a partial screenshot here).

That's it: the crawl succeeded.

4. Recommendation

The Java crawler shared here is entry level. To go deeper, I recommend the WebMagic framework (webmagic); a minimal sketch of its usage follows below.
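For reference, this is roughly what the same home-page fetch looks like in WebMagic. The sketch below follows WebMagic's standard PageProcessor API; the class name, selector, and settings are illustrative rather than taken from the original post:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CnblogsPageProcessor implements PageProcessor { // illustrative class name

    // retry failed requests up to 3 times and wait 1 second between requests
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        // extract just the page title; a real processor would pull post links and bodies
        page.putField("title", page.getHtml().xpath("//title/text()").toString());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new CnblogsPageProcessor()).addUrl("http://www.cnblogs.com/").run();
    }
}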

Thanks for reading and for joining the discussion. I will keep incorporating what I learn from your feedback; let's keep improving together.
