一、项目需求
- 对某类网站的数据进行数据采集
- 采用每天定时开启任务
- 将数据写入数据库
- 对已经采集的信息进行更新处理
二、采用技术
- 项目框架Spring Boot
- 爬虫工具:webmagic
- 数据库:Mysql
- 定时器:@Scheduled 开启
- cron在线解析地址:http://cron.qqe2.com/
- 网页分析:Google Chrome
- 语句解析采用:XPath
三、图解
四、技术说明
1.添加依赖
<!-- Dependencies used by the crawler project. -->
<dependencies>
    <!-- Spring Boot web starter -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- MySQL JDBC driver, needed at runtime only.
         No version is given here; presumably it is managed by the parent POM (confirm). -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <scope>runtime</scope>
    </dependency>
    <!-- MyBatis integration for Spring Boot -->
    <dependency>
        <groupId>org.mybatis.spring.boot</groupId>
        <artifactId>mybatis-spring-boot-starter</artifactId>
        <version>2.1.0</version>
    </dependency>
    <!-- Druid connection pool -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>druid-spring-boot-starter</artifactId>
        <version>1.1.10</version>
    </dependency>
    <!-- WebMagic crawler: core and extension modules -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
</dependencies>
2.mybatis映射文件检查配置
<!-- Build section: package mapper XML files kept next to the Java sources. -->
<build>
    <resources>
        <!-- Copy *.xml files found under src/main/java into the build output. -->
        <resource>
            <directory>src/main/java</directory>
            <includes>
                <include>**/*.xml</include>
            </includes>
        </resource>
        <!-- Keep the regular resources directory as well. -->
        <resource>
            <directory>src/main/resources</directory>
        </resource>
    </resources>
    <!-- No plugins are configured in this snippet. -->
    <plugins/>
</build>
3.项目配置文件
application.properties 文件
# Selects the active profile; "dev" loads application-dev.properties.
spring.profiles.active=dev
application-dev.properties 开发环境配置文件
# Development profile settings.
server.port=8810
# Druid is used as the DataSource implementation.
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.username=root
# Sample value only; do not commit real credentials.
spring.datasource.password=kxxxxg198811
spring.datasource.url=jdbc:mysql://localhost:3306/zhaizhu?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai
# Crawler start URLs. The key must be the "zhzmagic" prefix plus the field name
# of MagicUrlConfig (qcchome / gjhome); otherwise the value is never bound.
zhzmagic.qcchome=https://www.qqqqqccccccc.com/
# Empty value. Quotes are not stripped in a properties file, so '' would be
# read as a literal two-character string.
zhzmagic.gjhome=
application-pro.properties 生产环境配置文件
# Production profile settings (host, schema and user are masked in this sample).
server.port=9010
# Druid is used as the DataSource implementation.
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.username=****
# Sample value only; do not commit real credentials.
spring.datasource.password=kxxxxg198811
spring.datasource.url=jdbc:mysql://************:3306/*****?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai
# Crawler start URLs. The key must be the "zhzmagic" prefix plus the field name
# of MagicUrlConfig (qcchome / gjhome); otherwise the value is never bound.
zhzmagic.qcchome=https://www.qqqqqccccccc.com/
# Empty value. Quotes are not stripped in a properties file, so '' would be
# read as a literal two-character string.
zhzmagic.gjhome=
自定义属性配置类
/**
 * Holds the crawler start URLs read from configuration properties under the
 * {@code zhzmagic} prefix.
 */
@Component
@ConfigurationProperties(prefix = "zhzmagic")
public class MagicUrlConfig {

    /** Bound from {@code zhzmagic.qcchome}. */
    private String qcchome;

    /** Bound from {@code zhzmagic.gjhome}. */
    private String gjhome;

    /** @return the configured {@code qcchome} URL */
    public String getQcchome() {
        return this.qcchome;
    }

    /** @param qcchome the {@code qcchome} URL to store */
    public void setQcchome(String qcchome) {
        this.qcchome = qcchome;
    }

    /** @return the configured {@code gjhome} URL */
    public String getGjhome() {
        return this.gjhome;
    }

    /** @param gjhome the {@code gjhome} URL to store */
    public void setGjhome(String gjhome) {
        this.gjhome = gjhome;
    }
}
4.定时任务类
/**
 * Scheduled job that launches the WebMagic spider from the configured start URL.
 */
@Component
public class MagicTask {

    @Autowired
    MagicUrlConfig magicUrlConfig;

    @Autowired
    CompQccHomeProcessor compQccHomeProcessor;

    /**
     * Starts one crawl using {@link CompQccHomeProcessor}.
     *
     * <p>The cron expression fires at 14:42:01 every day; adjust it as needed.
     */
    @Scheduled(cron = "1 42 14 * * ? ")
    public void wagicCompany() {
        final String startUrl = magicUrlConfig.getQcchome();

        Spider spider = Spider.create(compQccHomeProcessor);
        spider.addUrl(startUrl);
        spider.thread(1); // number of crawler threads
        spider.run();
    }
}
/**
 * Application entry point: boots Spring, registers the MyBatis mapper package
 * and enables {@code @Scheduled} tasks.
 */
@EnableScheduling // enable scheduled tasks
@MapperScan(basePackages = "com.kxg.datasrc.mapper") // package scanned for mapper interfaces
@SpringBootApplication
public class DatasrcApplication {

    /**
     * Launches the Spring Boot application.
     *
     * @param args command-line arguments passed through to Spring
     */
    public static void main(String[] args) {
        SpringApplication.run(DatasrcApplication.class, args);
    }
}
5.网络爬虫相关配置
@Component
public class CompQccHomeProcessor implements PageProcessor {
@Autowired
QccCompsService qccCompsService;
private Site site = Site.me()
.setRetryTimes(3)//重试次数
.setSleepTime(8000)//睡眠时间
.addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36");
@Override
public Site getSite()
{
return site;
}
@Override
public void process(Page page)
{
//判断是否首页
if (page.getUrl().regex("https://www.***********.com/").match()){
List<String> compCategories = new ArrayList<>();
//去往解析类,将分页连接解析出来
Parse.getCompCategorysUrl(page, compCategories);
//添加到任务队列
page.addTargetRequests(compCategories);
}
//检查是否是二级分页
if (page.getUrl().regex("https://www.***********.com/" + "industry_*").match()){
List<String> urlList = new ArrayList<>();
List<CompDetail> compDetailList = new ArrayList<>();
//解析详情页链接列表和分页数据
Parse.getCompList(page,compDetailList,urlList);
//将分页数据写入数据库(因为详情页面的数据不全,如果不需要这里的数据采集,可
在下级解析中解析数据并插入到数据库中,这里插入后,下级解析变成更新操作 )
qccCompsService.insertCompany(compDetailList);
if (urlList.size() >0)
{
//将链接添加到任务队列
page.addTargetRequests(urlList);
}
}
//检查是否是详情页
if (page.getUrl().regex("https://www.********.com/" + "firm_*").match()){
CompDetail detail = new CompDetail();
//解析详情数据
Parse.getCompCategoryDetail(page, detail);
//更新到数据库
qccCompsService.updateCompanyDetail(detail);
}
}
}
6.数据解析文件
/**
 * Static XPath-based parsers that extract links and company data from the
 * crawled pages.
 */
public class Parse {

    /**
     * Collects category links from the element with id {@code hangye}.
     * A link is added only when its anchor has no {@code span} text and the
     * link itself is present.
     *
     * @param page             the downloaded home page
     * @param compCategoryList output list that receives the links
     */
    public static void getCompCategorysUrl(Page page, List<String> compCategoryList) {
        Html html = page.getHtml();
        Selectable anchors = html.xpath("//*[@id=\"hangye\"]").xpath("//a");
        for (Selectable node : anchors.nodes()) {
            String href = node.links().get();
            String spanText = node.xpath("//span/text()").get();
            if (spanText == null && href != null) {
                compCategoryList.add(href);
            }
        }
    }

    /**
     * Collects every link found in the {@code searchlist} table body.
     *
     * @param page             the downloaded list page
     * @param compCategoryList output list that receives the links
     */
    public static void getCompCategorysListUrl(Page page, List<String> compCategoryList) {
        Html html = page.getHtml();
        Selectable anchors = html.xpath("//*[@id=\"searchlist\"]/table/tbody").xpath("//a");
        for (Selectable node : anchors.nodes()) {
            String href = node.links().get();
            if (href != null) {
                compCategoryList.add(href);
            }
        }
    }

    /**
     * Parses each row of the {@code searchlist} table into a
     * {@code CompDetail} (logo URL, name, phone, e-mail) and collects the
     * detail-page link of the row.
     *
     * @param page            the downloaded list page
     * @param compDetailslist output list that receives one entry per row
     * @param urlList         output list that receives the detail-page links
     */
    public static void getCompList(Page page, List<CompDetail> compDetailslist, List<String> urlList) {
        Html html = page.getHtml();
        Selectable rows = html.xpath("//*[@id=\"searchlist\"]/table/tbody/tr");
        for (Selectable row : rows.nodes()) {
            CompDetail compDetail = new CompDetail();
            compDetail.setLogoUrl(row.xpath("/tr/td[1]/img/@src").get());

            String complink = row.xpath("/tr/td[2]/a").links().get();
            if (complink != null) {
                urlList.add(complink);
            }

            String cname = row.xpath("/tr/td[2]/a/text()").get();
            // Phone and e-mail text carry a "label:value" prefix; keep the value.
            // A missing node leaves the field null instead of throwing.
            String phone = lastSegment(row.xpath("/tr/td[2]/p[2]/span/text()").get());
            String email = lastSegment(row.xpath("/tr/td[2]/p[2]/text()").get());

            compDetail.setPhone(phone);
            compDetail.setEmail(email);
            compDetail.setCompName(cname);
            compDetailslist.add(compDetail);
            System.out.println(phone);
        }
    }

    /**
     * Fills {@code compDetail} with the fields read from a company detail page.
     *
     * @param page       the downloaded detail page
     * @param compDetail object to populate; if {@code null}, a local instance
     *                   is populated and printed, which the caller cannot see
     */
    public static void getCompCategoryDetail(Page page, CompDetail compDetail) {
        if (compDetail == null) {
            compDetail = new CompDetail();
        }
        Html html = page.getHtml();
        String compname = html.xpath("//*[@id=\"company-top\"]/div[2]/div[1]/div[1]/img/@alt").get();
        compDetail.setCompName(compname);

        // All remaining fields are read from cells of the "Cominfo" table.
        Selectable detailNode = html.xpath("//*[@id=\"Cominfo\"]/table/tbody");
        String legalpersonname = detailNode.xpath("/tbody/tr[1]/td[2]/div/a/text()").get();
        String statustd = detailNode.xpath("/tbody/tr[1]/td[4]/text()").get();
        String regdate = detailNode.xpath("/tbody/tr[1]/td[6]/text()").get();
        String regCatpital = detailNode.xpath("/tbody/tr[2]/td[2]/text()").get();
        String creditcode = detailNode.xpath("/tbody/tr[3]/td[2]/text()").get();
        String orgcode = detailNode.xpath("/tbody/tr[3]/td[4]/text()").get();
        String industry = detailNode.xpath("/tbody/tr[4]/td[6]/text()").get();
        String compType = detailNode.xpath("/tbody/tr[5]/td[2]/text()").get();
        String regauth = detailNode.xpath("/tbody/tr[5]/td[6]/text()").get();
        String perscale = detailNode.xpath("/tbody/tr[6]/td[2]/text()").get();
        String region = detailNode.xpath("/tbody/tr[6]/td[6]/text()").get();
        String usedname = detailNode.xpath("/tbody/tr[7]/td[2]/text()").get();
        String address = detailNode.xpath("/tbody/tr[8]/td[2]/text()").get();
        // Leading "/" added so this path has the same form as its siblings.
        String busscope = detailNode.xpath("/tbody/tr[9]/td[2]/text()").get();

        compDetail.setDetailUrl(page.getUrl().get());
        compDetail.setLegalPerson(legalpersonname);
        compDetail.setStatusetd(statustd);
        compDetail.setRegCapital(regCatpital);
        compDetail.setAddress(address);
        compDetail.setCompType(compType);
        compDetail.setIndustry(industry);
        compDetail.setOrgCode(orgcode);
        compDetail.setRegDate(regdate);
        compDetail.setCreditCode(creditcode);
        compDetail.setRegAuth(regauth);
        compDetail.setRegion(region);
        compDetail.setBusScope(busscope);
        compDetail.setUsedName(usedname);
        compDetail.setPerScale(perscale);
        System.out.println(compDetail);
    }

    /**
     * Returns the text after the last {@code ":"} of {@code text}.
     *
     * @param text raw cell text, may be {@code null}
     * @return {@code null} for {@code null} input, an empty string when the
     *         text consists only of separators, otherwise the last segment
     */
    private static String lastSegment(String text) {
        if (text == null) {
            return null;
        }
        String[] parts = text.split(":");
        // split() drops trailing empty strings, so ":" alone yields no parts.
        return parts.length == 0 ? "" : parts[parts.length - 1];
    }
}
7.service
@Service
public class QccCompsService {
@Resource
CompDetailMapper compDetailMapper;
public int insertCompany(List<CompDetail> compDetails)
{
return compDetailMapper.insertCompany(compDetails);
}
@Transactional
public int updateCompanyDetail(CompDetail compDetail)
{
return compDetailMapper.updateCompanyDetail(compDetail);
}
8.数据mapper文件
/**
 * MyBatis mapper for the {@code compdetail} table.
 */
public interface CompDetailMapper {

    /**
     * Batch-inserts the given companies.
     *
     * @param compDetails companies to insert
     * @return affected-row count
     */
    int insertCompany(List<CompDetail> compDetails);

    /**
     * Updates the non-null fields of the row whose name equals the company name.
     *
     * @param compDetail detail data, including the company name used as key
     * @return affected-row count
     */
    int updateCompanyDetail(CompDetail compDetail);

    /**
     * Sets the {@code isUse} flag of the row with the given company name.
     *
     * <p>NOTE(review): two parameters without {@code @Param}; whether the XML
     * can refer to them by name depends on the compiler settings - confirm.
     *
     * @param compName company name used as key
     * @param isUse    new flag value
     * @return affected-row count
     */
    int updateCompanyUseStatus(String compName, int isUse);

    /**
     * Batch update of companies.
     *
     * <p>NOTE(review): the mapper XML shown with this interface contains no
     * statement with this id - confirm it is defined before calling.
     *
     * @param compDetails companies to update
     * @return affected-row count
     */
    int updateCompanys(List<CompDetail> compDetails);
}
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- SQL statements for com.kxg.datasrc.mapper.CompDetailMapper (table: compdetail). -->
<mapper namespace="com.kxg.datasrc.mapper.CompDetailMapper">
<!-- Batch insert: one value row per list element (compName, email, phone, logoUrl).
     Uses "insert ignore". -->
<insert id="insertCompany" parameterType="java.util.List">
insert ignore compdetail (compName,email,phone,logoUrl)
values
<foreach collection="list" item="item" index="index" separator=",">
(#{item.compName,jdbcType=VARCHAR},#{item.email,jdbcType=VARCHAR},#{item.phone,jdbcType=CHAR},#{item.logoUrl,jdbcType=VARCHAR})
</foreach>
</insert>
<!-- Updates only the properties that are non-null; the row is selected by compName.
     NOTE(review): if every tested property is null the set clause is empty - confirm
     callers always supply at least one value. -->
<update id="updateCompanyDetail" parameterType="com.kxg.datasrc.model.CompDetail">
update compdetail
<set>
<if test="compName != null">
compName = #{compName,jdbcType=VARCHAR},
</if>
<if test="address != null">
address = #{address,jdbcType=VARCHAR},
</if>
<if test="statusetd != null">
statusetd = #{statusetd,jdbcType=VARCHAR},
</if>
<if test="detailUrl != null">
detailUrl = #{detailUrl,jdbcType=VARCHAR},
</if>
<if test="legalPerson != null">
legalPerson = #{legalPerson,jdbcType=VARCHAR},
</if>
<if test="regCapital != null">
regCapital = #{regCapital,jdbcType=VARCHAR},
</if>
<if test="regDate != null">
regDate = #{regDate,jdbcType=VARCHAR},
</if>
<if test="compType != null">
compType = #{compType,jdbcType=VARCHAR},
</if>
<if test="orgCode != null">
orgCode = #{orgCode,jdbcType=VARCHAR},
</if>
<if test="industry != null">
industry = #{industry,jdbcType=VARCHAR},
</if>
<if test="creditCode != null">
creditCode = #{creditCode,jdbcType=VARCHAR},
</if>
<if test="regAuth != null">
regAuth = #{regAuth,jdbcType=VARCHAR},
</if>
<if test="region != null">
region = #{region,jdbcType=VARCHAR},
</if>
<if test="busScope != null">
busScope = #{busScope,jdbcType=VARCHAR},
</if>
<if test="usedName != null">
usedName = #{usedName,jdbcType=VARCHAR},
</if>
<if test="perScale != null">
perScale = #{perScale,jdbcType=VARCHAR},
</if>
<if test="isUse != null">
isUse = #{isUse,jdbcType=BIT},
</if>
</set>
where compName = #{compName,jdbcType=VARCHAR}
</update>
<!-- Sets the isUse flag of the row selected by compName. -->
<update id="updateCompanyUseStatus" >
update compdetail
<set>
<if test="isUse != null">
isUse = #{isUse,jdbcType=BIT},
</if>
</set>
where compName = #{compName,jdbcType=VARCHAR}
</update>
<!-- NOTE(review): the mapper interface also declares updateCompanys, but no statement
     with that id is defined in this file. -->
</mapper>
五、测试
运行项目,稍等片刻,待定时任务触发后即可看到数据采集成功。