WebMagic爬取58同城租房数据
1.WebMagic
webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
webmagic的主要特色:
完全模块化的设计,强大的可扩展性。
核心简单但是涵盖爬虫的全部流程,灵活而强大,也是学习爬虫入门的好材料。
提供丰富的抽取页面API。
无配置,但是可通过POJO+注解形式实现一个爬虫。
支持多线程。
支持分布式。
支持爬取js动态渲染的页面。
无框架依赖,可以灵活地嵌入到项目中去。
2.Maven依赖
<!-- Core Spring Boot starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<!-- Spring Boot test starter; test scope keeps the test libraries out of the packaged application -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- Spring MVC. No explicit version: like the other Spring Boot starters in this POM, it is resolved by the project's Spring Boot dependency management, which avoids mixing 2.4.1 and 2.4.2 artifacts. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA. Version is managed the same way as the web starter above. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MariaDB JDBC driver -->
<!-- https://mvnrepository.com/artifact/org.mariadb.jdbc/mariadb-java-client -->
<dependency>
<groupId>org.mariadb.jdbc</groupId>
<artifactId>mariadb-java-client</artifactId>
<version>2.6.0</version>
</dependency>
<!-- WebMagic core module -->
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
<!-- Excludes the SLF4J binding for Log4j 1.x; presumably to avoid a clash with Spring Boot's own logging binding (TODO confirm) -->
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension module -->
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
<!-- Guava, needed for WebMagic's Bloom filter support -->
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.0</version>
</dependency>
<!-- Apache Commons Lang utilities (StringUtils) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.11</version>
</dependency>
<!-- Spring Boot configuration processor: generates metadata for @ConfigurationProperties classes at build time -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<!-- Spring Boot DevTools: hot reload during development, so the application does not have to be restarted by hand -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<optional>true</optional>
</dependency>
<!-- Hutool: general-purpose Java utility library -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>4.1.19</version>
</dependency>
<!-- jsoup: HTML parser for Java -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Apache HttpClient -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
3.核心代码
(1) 数据库表格
-- Schema for the rental crawler: creates the `rent_crawler` database and its
-- `house_info` table, which stores the fields scraped from a listing detail page.
-- The versioned /*! ... */ statements save the session settings up front and
-- restore them after the DDL has run.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET NAMES utf8 */;
/*!50503 SET NAMES utf8mb4 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;

-- utf8mb4 matches the session character set selected above; the 3-byte `utf8`
-- charset cannot store 4-byte characters (e.g. emoji) that may appear in scraped text.
CREATE DATABASE IF NOT EXISTS `rent_crawler` /*!40100 DEFAULT CHARACTER SET utf8mb4 */;
USE `rent_crawler`;

-- Every scraped attribute is stored as text. Only `id` and `url` are NOT NULL.
-- No AUTO_INCREMENT table option is given, so a freshly created table starts its ids at 1.
CREATE TABLE IF NOT EXISTS `house_info` (
    `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键id',
    `house_title` varchar(100) DEFAULT NULL COMMENT '房屋标题',
    `house_pay` varchar(30) DEFAULT NULL COMMENT '房屋租金',
    `house_pay_way` varchar(30) DEFAULT NULL COMMENT '租金支付方式',
    `rent_way` varchar(30) DEFAULT NULL COMMENT '租赁方式',
    `house_type` varchar(30) DEFAULT NULL COMMENT '房屋类型',
    `house_area` varchar(30) DEFAULT NULL COMMENT '房屋大小',
    `house_decora` varchar(30) DEFAULT NULL COMMENT '房屋装修',
    `toward` varchar(30) DEFAULT NULL COMMENT '房屋朝向',
    `floor` varchar(30) DEFAULT NULL COMMENT '房屋楼层',
    `floor_height` varchar(30) DEFAULT NULL COMMENT '房屋高度',
    `house_estate` varchar(30) DEFAULT NULL COMMENT '所在小区',
    `area` varchar(30) DEFAULT NULL COMMENT '所属区域',
    `address` varchar(50) DEFAULT NULL COMMENT '详细地址',
    `pic` varchar(4000) DEFAULT NULL COMMENT '房屋图片',
    `time` varchar(20) DEFAULT NULL COMMENT '房屋最近发布时间',
    `agent_name` varchar(30) DEFAULT NULL COMMENT '房屋所属',
    `house_disposal` varchar(200) DEFAULT NULL COMMENT '房屋配置',
    `house_spot` varchar(200) DEFAULT NULL COMMENT '房屋亮点',
    `house_desc` text DEFAULT NULL COMMENT '房屋描述',
    `url` varchar(1000) NOT NULL COMMENT '详情页地址',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Restore the session settings saved at the top of the script.
/*!40101 SET SQL_MODE=IFNULL(@OLD_SQL_MODE, '') */;
/*!40014 SET FOREIGN_KEY_CHECKS=IF(@OLD_FOREIGN_KEY_CHECKS IS NULL, 1, @OLD_FOREIGN_KEY_CHECKS) */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
(2) application.properties
# MariaDB datasource connection settings
spring.datasource.url=jdbc:mariadb://localhost:3307/rent_crawler
spring.datasource.username=root
spring.datasource.password=root
spring.datasource.driver-class-name=org.mariadb.jdbc.Driver
# Enable underscore-to-camelCase name mapping (MyBatis)
mybatis.configuration.map-underscore-to-camel-case=true
# JPA configuration
spring.jpa.database=mysql
spring.jpa.show-sql=true
logging.level.root=INFO
# NOTE(review): this property was removed in Spring Boot 2.2; with a 2.4.x starter it presumably has no effect - confirm
spring.mvc.favicon.enabled=false
server.address=0.0.0.0
server.port=8888
server.tomcat.uri-encoding=utf-8
spring.web.resources.static-locations=classpath:/templates
# Allow a bean definition to override an existing one registered under the same name
spring.main.allow-bean-definition-overriding=true
# MyBatis
# Package containing the entity classes
# NOTE(review): the value is a fully-qualified class name rather than a package; confirm whether com.zhq.crawler.pojo was intended
mybatis.typeAliasesPackage=com.zhq.crawler.pojo.HouseInfo
# Location of the mapper XML file (kept under resources)
mybatis.mapperLocations=classpath:/mapper/HouseMapper.xml
# NOTE(review): the package here is com.zhq.analysis while the rest of this file uses com.zhq.crawler - confirm the logger name
logging.level.com.zhq.analysis.mapper : debug
spring.thymeleaf.mode=HTML
# Disable the template engine cache ('#' is required here: a line starting with '//' is parsed as a property key, not a comment)
spring.thymeleaf.cache=false
(3) log4j.properties
# Root logger: level INFO, output sent to the appender named A1
log4j.rootLogger=INFO,A1
# A1 writes log events to the console
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 formats each event with a pattern layout
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp [thread] [logger]-[level] message, followed by a line separator
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
(4) 使用Spring Data Jpa
package com.zhq.crawler.dao;
import com.zhq.crawler.pojo.HouseInfo;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link HouseInfo} entities, keyed by a {@code Long} id.
 * <p>
 * Declares no methods of its own; all operations are inherited from {@link JpaRepository}.
 */
public interface HouseInfoDao extends JpaRepository<HouseInfo,Long> {
}
(5)代理IP,突破IP封锁
package com.zhq.crawler.ip;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.IOException;
@Component
public class KuaiDaiLiIP {
public String getIPList(){
//创建HttpClient对象
CloseableHttpClient httpClient = HttpClients.createDefault();
//创建HttpGet对象,发起响应的地址
HttpGet httpGet = new HttpGet("http://dps.kdlapi.com/api/getdps/?orderid=932046070754723&num=1&pt=1&sep=1");
//使用HttpClient对象,发起响应,获得response
CloseableHttpResponse response = null;