Java 爬虫实战

16 篇文章 0 订阅
2 篇文章 0 订阅

一、项目需要

  • 对某类网站的数据进行数据采集
  • 采用每天定时开启任务
  • 将数据写入数据库
  • 对已经采集的信息进行更新处理

二、采用技术

  •  项目框架Spring Boot
  •  爬虫工具:webmagic
  •  数据库:Mysql
  •  定时器:@Scheduled 开启
  •  cron在线解析地址:http://cron.qqe2.com/
  •  网页分析:Google Chrome
  •  语句解析采用:XPath

三、图解

四、技术说明

1.添加依赖

	<!-- Dependencies of the crawler project: web starter, MySQL driver, MyBatis, Druid pool, WebMagic. -->
	<dependencies>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>
		<!-- JDBC driver; only needed at runtime. -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>org.mybatis.spring.boot</groupId>
			<artifactId>mybatis-spring-boot-starter</artifactId>
			<version>2.1.0</version>
		</dependency>
		<!-- Connection pool referenced by spring.datasource.type in the properties files. -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>druid-spring-boot-starter</artifactId>
			<version>1.1.10</version>
		</dependency>
		<!-- WebMagic crawler: core and extension modules are kept on the same version. -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.3</version>
		</dependency>
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.3</version>
		</dependency>
	</dependencies>

 

2.mybatis映射文件检查配置

   <!-- Resource configuration so that MyBatis mapper XML files are packaged with the application. -->
   <build>
		<resources>
			<!-- Include *.xml files that live next to the Java sources (e.g. mapper XML beside mapper interfaces). -->
			<resource>
				<directory>src/main/java</directory>
				<includes>
					<include>**/*.xml</include>
				</includes>
			</resource>
			<!-- Keep the regular resources directory listed as well (application*.properties live there). -->
			<resource>
				<directory>src/main/resources</directory>
			</resource>
		</resources>
		<!-- No plugins are declared in this snippet. -->
		<plugins>
		</plugins>
	</build>

3.项目配置文件

application.properties 文件
spring.profiles.active= dev

   

application-dev.properties 开发环境配置文件

# Development profile settings.
server.port=8810

spring.datasource.type=com.alibaba.druid.pool.DruidDataSource

spring.datasource.username=root
spring.datasource.password=kxxxxg198811
spring.datasource.url=jdbc:mysql://localhost:3306/zhaizhu?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai

# Crawler start URLs. The key after the "zhzmagic." prefix must equal the bean
# property name in MagicUrlConfig (qcchome / gjhome), otherwise nothing is bound.
zhzmagic.qcchome=https://www.qqqqqccccccc.com/
# No value configured. Quotes are not string delimiters in .properties files,
# so '' would be bound as two literal quote characters.
zhzmagic.gjhome=

 

application-pro.properties 生产环境配置文件

# Production profile settings.
server.port=9010

spring.datasource.type=com.alibaba.druid.pool.DruidDataSource

spring.datasource.username=****
spring.datasource.password=kxxxxg198811
spring.datasource.url=jdbc:mysql://************:3306/*****?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai

# Crawler start URLs. The key after the "zhzmagic." prefix must equal the bean
# property name in MagicUrlConfig (qcchome / gjhome), otherwise nothing is bound.
zhzmagic.qcchome=https://www.qqqqqccccccc.com/
# No value configured. Quotes are not string delimiters in .properties files,
# so '' would be bound as two literal quote characters.
zhzmagic.gjhome=

  自定义属性配置类

/**
 * Configuration holder for the crawler URLs.
 *
 * <p>Values are bound from properties under the {@code zhzmagic} prefix, i.e. the keys
 * {@code zhzmagic.qcchome} and {@code zhzmagic.gjhome} map onto the two bean properties below.
 */
@Component
@ConfigurationProperties(prefix = "zhzmagic")
public class MagicUrlConfig {

    /** URL bound from {@code zhzmagic.qcchome}; the scheduled task uses it as the crawl start URL. */
    private String qcchome;

    /** URL bound from {@code zhzmagic.gjhome}. */
    private String gjhome;

    /** @return the value bound to {@code zhzmagic.qcchome}, or {@code null} when nothing was bound */
    public String getQcchome() {
        return this.qcchome;
    }

    /** @param qcchome new value for the {@code qcchome} property */
    public void setQcchome(String qcchome) {
        this.qcchome = qcchome;
    }

    /** @return the value bound to {@code zhzmagic.gjhome}, or {@code null} when nothing was bound */
    public String getGjhome() {
        return this.gjhome;
    }

    /** @param gjhome new value for the {@code gjhome} property */
    public void setGjhome(String gjhome) {
        this.gjhome = gjhome;
    }
}

4.定时任务类

/**
 * Scheduled job that launches the WebMagic crawl from the configured start URL.
 */
@Component
public class MagicTask {
    @Autowired
    MagicUrlConfig magicUrlConfig;

    @Autowired
    CompQccHomeProcessor compQccHomeProcessor;

    /**
     * Starts a single-threaded spider on the URL returned by {@link MagicUrlConfig#getQcchome()}.
     *
     * <p>The cron fields are second, minute, hour, day-of-month, month, day-of-week, so the
     * expression below fires once a day at 14:42:01. Adjust it to the schedule you need.
     *
     * @throws IllegalStateException if no start URL is configured; without this check the
     *         missing value would be handed to the spider and fail there with a far less
     *         helpful error
     */
    @Scheduled(cron = "1 42 14 * * ? ")
    public void wagicCompany()
    {
        String url = magicUrlConfig.getQcchome();
        if (url == null || url.trim().isEmpty())
        {
            throw new IllegalStateException(
                    "Crawler start URL is not configured: property 'zhzmagic.qcchome' is missing or blank");
        }
        Spider.create(compQccHomeProcessor)
                .addUrl(url)
                .thread(1) // number of crawler threads
                .run();
    }
}
/**
 * Application entry point.
 *
 * <p>Besides the standard Spring Boot setup it registers the MyBatis mapper interfaces found in
 * {@code com.kxg.datasrc.mapper} and enables processing of {@code @Scheduled} methods.
 */
@EnableScheduling // enable scheduled tasks
@MapperScan(basePackages = "com.kxg.datasrc.mapper") // package scanned for mapper interfaces
@SpringBootApplication
public class DatasrcApplication {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(DatasrcApplication.class, args);
    }
}

5.网络爬虫相关配置

@Component
public class CompQccHomeProcessor implements PageProcessor {
    @Autowired
    QccCompsService qccCompsService;

    private Site site = Site.me()
            .setRetryTimes(3)//重试次数
            .setSleepTime(8000)//睡眠时间
            .addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36");
    @Override
    public  Site getSite()
    {
        return  site;
    }

    @Override
    public  void  process(Page page)
    {
        //判断是否首页
        if (page.getUrl().regex("https://www.***********.com/").match()){
            List<String> compCategories = new ArrayList<>();
            //去往解析类,将分页连接解析出来
            Parse.getCompCategorysUrl(page, compCategories);
            //添加到任务队列
            page.addTargetRequests(compCategories);
        }
        //检查是否是二级分页
        if (page.getUrl().regex("https://www.***********.com/"  + "industry_*").match()){
            List<String> urlList = new ArrayList<>();
            List<CompDetail> compDetailList = new ArrayList<>();
            //解析详情页链接列表和分页数据
            Parse.getCompList(page,compDetailList,urlList);
            //将分页数据写入数据库(因为详情页面的数据不全,如果不需要这里的数据采集,可
              在下级解析中解析数据并插入到数据库中,这里插入后,下级解析变成更新操作 )
            qccCompsService.insertCompany(compDetailList);
            if (urlList.size() >0)
            {
                //将链接添加到任务队列
                page.addTargetRequests(urlList);
            }

        }
        //检查是否是详情页
        if (page.getUrl().regex("https://www.********.com/"  + "firm_*").match()){
            CompDetail detail = new CompDetail();
            //解析详情数据
            Parse.getCompCategoryDetail(page, detail);
            //更新到数据库
            qccCompsService.updateCompanyDetail(detail);
        }

    }
}

6.数据解析文件

/**
 * Static XPath helpers that extract links and company fields from downloaded pages.
 */
public class Parse {

    /**
     * Collects category links from the element with id {@code hangye}.
     *
     * <p>An anchor contributes its link only when a link could be resolved and the anchor
     * contains no {@code span} text.
     *
     * @param page             the downloaded page
     * @param compCategoryList output list the links are appended to
     */
    public static void getCompCategorysUrl(Page page, List<String> compCategoryList)
    {
        Html html = page.getHtml();
        Selectable container = html.xpath("//*[@id=\"hangye\"]");
        Selectable anchors = container.xpath("//a");
        for (Selectable node : anchors.nodes())
        {
            String link = node.links().get();
            String spanText = node.xpath("//span/text()").get();
            if (spanText == null && link != null)
            {
                compCategoryList.add(link);
            }
        }
    }

    /**
     * Collects every resolvable anchor link inside the {@code searchlist} table body.
     *
     * @param page             the downloaded page
     * @param compCategoryList output list the links are appended to
     */
    public static void getCompCategorysListUrl(Page page, List<String> compCategoryList)
    {
        Html html = page.getHtml();
        Selectable tableBody = html.xpath("//*[@id=\"searchlist\"]/table/tbody");
        Selectable anchors = tableBody.xpath("//a");
        for (Selectable node : anchors.nodes())
        {
            String link = node.links().get();
            if (link != null)
            {
                compCategoryList.add(link);
            }
        }
    }

    /**
     * Parses the rows of the {@code searchlist} table into company records and detail links.
     *
     * <p>Phone and e-mail are reduced to the text after the last {@code ":"}. A row whose
     * phone or e-mail node is missing is still recorded, with that field left {@code null}.
     *
     * @param page            the downloaded list page
     * @param compDetailslist output list receiving one {@code CompDetail} per table row
     * @param urlList         output list receiving the detail-page link of each row, when present
     */
    public static void getCompList(Page page, List<CompDetail> compDetailslist, List<String> urlList)
    {
        Html html = page.getHtml();
        Selectable rows = html.xpath("//*[@id=\"searchlist\"]/table/tbody/tr");
        for (Selectable node : rows.nodes())
        {
            CompDetail compDetail = new CompDetail();
            compDetail.setLogoUrl(node.xpath("/tr/td[1]/img/@src").get());

            String complink = node.xpath("/tr/td[2]/a").links().get();
            if (complink != null)
            {
                urlList.add(complink);
            }

            String cname = node.xpath("/tr/td[2]/a/text()").get();
            String phone = lastSegment(node.xpath("/tr/td[2]/p[2]/span/text()").get());
            String email = lastSegment(node.xpath("/tr/td[2]/p[2]/text()").get());

            compDetail.setPhone(phone);
            compDetail.setEmail(email);
            compDetail.setCompName(cname);
            compDetailslist.add(compDetail);
            System.out.println(phone);
        }
    }

    /**
     * Fills {@code compDetail} with the fields found on a company detail page.
     *
     * <p>The company name is read from the logo image's {@code alt} attribute, the remaining
     * fields from fixed row/column positions of the {@code Cominfo} table. Fields whose node
     * is missing are set to {@code null}.
     *
     * @param page       the downloaded detail page
     * @param compDetail record to populate; if {@code null}, a local instance is populated
     *                   and printed, but the caller never sees it
     */
    public static void getCompCategoryDetail(Page page, CompDetail compDetail)
    {
        if (compDetail == null)
        {
            compDetail = new CompDetail();
        }
        Html html = page.getHtml();
        String compname = html.xpath("//*[@id=\"company-top\"]/div[2]/div[1]/div[1]/img/@alt").get();
        compDetail.setCompName(compname);

        Selectable detailNode = html.xpath("//*[@id=\"Cominfo\"]/table/tbody");
        compDetail.setDetailUrl(page.getUrl().get());
        compDetail.setLegalPerson(detailNode.xpath("/tbody/tr[1]/td[2]/div/a/text()").get());
        compDetail.setStatusetd(detailNode.xpath("/tbody/tr[1]/td[4]/text()").get());
        compDetail.setRegCapital(detailNode.xpath("/tbody/tr[2]/td[2]/text()").get());
        compDetail.setAddress(detailNode.xpath("/tbody/tr[8]/td[2]/text()").get());
        compDetail.setCompType(detailNode.xpath("/tbody/tr[5]/td[2]/text()").get());
        compDetail.setIndustry(detailNode.xpath("/tbody/tr[4]/td[6]/text()").get());
        compDetail.setOrgCode(detailNode.xpath("/tbody/tr[3]/td[4]/text()").get());
        compDetail.setRegDate(detailNode.xpath("/tbody/tr[1]/td[6]/text()").get());
        compDetail.setCreditCode(detailNode.xpath("/tbody/tr[3]/td[2]/text()").get());
        compDetail.setRegAuth(detailNode.xpath("/tbody/tr[5]/td[6]/text()").get());
        compDetail.setRegion(detailNode.xpath("/tbody/tr[6]/td[6]/text()").get());
        // This expression has no leading slash, unlike its siblings; it is kept exactly as written.
        compDetail.setBusScope(detailNode.xpath("tbody/tr[9]/td[2]/text()").get());
        compDetail.setUsedName(detailNode.xpath("/tbody/tr[7]/td[2]/text()").get());
        compDetail.setPerScale(detailNode.xpath("/tbody/tr[6]/td[2]/text()").get());
        System.out.println(compDetail);
    }

    /**
     * Returns the part of {@code text} after the last {@code ":"} separator.
     *
     * @param text raw node text, may be {@code null}
     * @return {@code null} for {@code null} input, an empty string when the text consists
     *         only of separators, otherwise the last segment produced by splitting on {@code ":"}
     */
    private static String lastSegment(String text)
    {
        if (text == null)
        {
            return null;
        }
        String[] parts = text.split(":");
        // split() drops trailing empty strings, so an all-separator input yields no parts.
        return parts.length == 0 ? "" : parts[parts.length - 1];
    }
}

7.service

@Service
public class QccCompsService {
    @Resource
    CompDetailMapper compDetailMapper;

    public int insertCompany(List<CompDetail> compDetails)
    {
      return  compDetailMapper.insertCompany(compDetails);
    }

    @Transactional
    public int updateCompanyDetail(CompDetail  compDetail)
    {
       return compDetailMapper.updateCompanyDetail(compDetail);
    }

8.数据mapper文件

/**
 * MyBatis mapper for the {@code compdetail} table; the SQL lives in the mapper XML whose
 * namespace is this interface's fully qualified name.
 */
public interface CompDetailMapper {

    /**
     * Batch-inserts the given companies.
     *
     * @param compDetails rows to insert
     * @return the affected-row count reported by MyBatis
     */
    int insertCompany(List<CompDetail> compDetails);

    /**
     * Batch update of several companies.
     * NOTE(review): no statement with this id is visible in the mapper XML - confirm it is
     * defined before calling this method.
     *
     * @param compDetails rows to update
     * @return the affected-row count reported by MyBatis
     */
    int updateCompanys(List<CompDetail> compDetails);

    /**
     * Updates the non-null fields of one company, located by its name.
     *
     * @param compDetail record carrying the company name and the new field values
     * @return the affected-row count reported by MyBatis
     */
    int updateCompanyDetail(CompDetail compDetail);

    /**
     * Sets the {@code isUse} flag of the company with the given name.
     *
     * @param compName name of the company to update
     * @param isUse    new flag value
     * @return the affected-row count reported by MyBatis
     */
    int updateCompanyUseStatus(String compName, int isUse);
}
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- SQL statements for com.kxg.datasrc.mapper.CompDetailMapper; all of them target the compdetail table. -->
<!-- NOTE(review): the mapper interface also declares updateCompanys, but no statement with that id is defined in this file. -->
<mapper namespace="com.kxg.datasrc.mapper.CompDetailMapper">
    <!-- Batch insert of the list-page fields: one value tuple per list element, comma separated.
         "insert ignore" presumably relies on a unique key (likely compName) to skip existing rows; confirm against the table definition.
         An empty list renders no tuples after "values", so callers must not pass one. -->
    <insert id="insertCompany" parameterType="java.util.List">
        insert ignore compdetail (compName,email,phone,logoUrl)
        values
        <foreach collection="list" item="item" index="index" separator=",">
            (#{item.compName,jdbcType=VARCHAR},#{item.email,jdbcType=VARCHAR},#{item.phone,jdbcType=CHAR},#{item.logoUrl,jdbcType=VARCHAR})
        </foreach>
    </insert>
    <!-- Updates only the columns whose property on the CompDetail argument is non-null; the row is located by compName. -->
    <update id="updateCompanyDetail" parameterType="com.kxg.datasrc.model.CompDetail">
        update compdetail
        <set>
            <if test="compName != null">
                compName = #{compName,jdbcType=VARCHAR},
            </if>
            <if test="address != null">
                address = #{address,jdbcType=VARCHAR},
            </if>
            <if test="statusetd != null">
                statusetd = #{statusetd,jdbcType=VARCHAR},
            </if>
            <if test="detailUrl != null">
                detailUrl = #{detailUrl,jdbcType=VARCHAR},
            </if>
            <if test="legalPerson != null">
                legalPerson = #{legalPerson,jdbcType=VARCHAR},
            </if>
            <if test="regCapital != null">
                regCapital = #{regCapital,jdbcType=VARCHAR},
            </if>
            <if test="regDate != null">
                regDate = #{regDate,jdbcType=VARCHAR},
            </if>
            <if test="compType != null">
                compType = #{compType,jdbcType=VARCHAR},
            </if>
            <if test="orgCode != null">
                orgCode = #{orgCode,jdbcType=VARCHAR},
            </if>
            <if test="industry != null">
                industry = #{industry,jdbcType=VARCHAR},
            </if>
            <if test="creditCode != null">
                creditCode = #{creditCode,jdbcType=VARCHAR},
            </if>
            <if test="regAuth != null">
                regAuth = #{regAuth,jdbcType=VARCHAR},
            </if>
            <if test="region != null">
                region = #{region,jdbcType=VARCHAR},
            </if>
            <if test="busScope != null">
                busScope = #{busScope,jdbcType=VARCHAR},
            </if>
            <if test="usedName != null">
                usedName = #{usedName,jdbcType=VARCHAR},
            </if>
            <if test="perScale != null">
                perScale = #{perScale,jdbcType=VARCHAR},
            </if>
            <if test="isUse != null">
                isUse = #{isUse,jdbcType=BIT},
            </if>
        </set>
        where compName = #{compName,jdbcType=VARCHAR}
    </update>

    <!-- Sets the isUse flag of the row matching compName.
         NOTE(review): the mapper interface declares isUse as a primitive int, so the null test below is presumably always true. -->
    <update id="updateCompanyUseStatus" >
        update compdetail
        <set>
            <if test="isUse != null">
                isUse = #{isUse,jdbcType=BIT},
            </if>
        </set>
        where compName = #{compName,jdbcType=VARCHAR}
    </update>
</mapper>

五、测试

运行项目稍等片刻成功

 

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值