Java 爬虫实战二

16 篇文章 0 订阅
2 篇文章 0 订阅

一、项目需要

  • 对某类网站的数据进行数据采集
  • 对主要数据采集点每天定时开启任务
  • 对部分数据采用周六日开启任务采集
  • 将数据写入数据库
  • 对已经采集的信息进行更新处理

二、采用技术

  •  项目框架Spring Boot
  •  爬虫工具:webmagic
  •  数据库:Mysql
  •  定时器:@Scheduled 开启
  •  cron在线解析地址:http://cron.qqe2.com/
  •  网页分析:Google Chrome
  •  语句解析采用:$CSS节点定位、 XPath解析
     

三、任务时序图

 

四、代码

1.定时任务类

    // Weekly crawl of the city index page.
    // Cron fields: sec min hour day-of-month month day-of-week → fires at 21:35:01
    // on day-of-week 5 (Friday under Spring's 0=Sunday convention — verify against
    // the scheduler actually in use; Quartz counts 1=Sunday, making 5 Thursday).
    // NOTE(review): "get**City" is redacted in this listing ('*' masks the real
    // site name) — this snippet will not compile as-is.
    @Scheduled(cron = "1 35 21 ? * 5")
    public void wagicCity()
    {
        String url = magicUrlConfig.get**City();
        // Start a 3-thread webmagic spider on the city-list URL; run() blocks
        // until the crawl finishes, so the scheduled thread is held for its duration.
        Spider.create(jobPosProcessor)
                .addUrl(url)
                .thread(3)
                .run();
    }

    // Weekly full-job crawl, fired at 02:35:01 on day-of-week 6 (Saturday under
    // Spring's 0=Sunday convention) — matches the article's "weekend task" intent.
    // NOTE(review): "get*****All" is redacted ('*' masks the site name); the
    // snippet will not compile as published.
    @Scheduled(cron = "1 35 2 ? * 6")
    public void wagicJobAll()
    {
        String url = magicUrlConfig.get*****All();
        // Same spider setup as the other tasks: shared page processor, 3 threads,
        // blocking run().
        Spider.create(jobPosProcessor)
                .addUrl(url)
                .thread(3)
                .run();
    }

    // Daily crawl of the home/host page at 23:35:01 ("* * ?" = every day, any month).
    // NOTE(review): method and getter names are redacted with '*' in this listing;
    // the code will not compile as-is.
    @Scheduled(cron = "1 35 23 * * ?")
    public void wagicHost*******()
    {
        String url = magicUrlConfig.get**home();
        // Blocking 3-thread crawl, same pattern as the weekly tasks.
        Spider.create(jobPosProcessor)
                .addUrl(url)
                .thread(3)
                .run();
    }

2、爬取配置类

@Component
public class JobPosProcessor implements PageProcessor {
    @Autowired
    CityService cityService;
    @Autowired
    JobService jobService;

    // Crawl-wide settings: 3 retries per failed download, 8s politeness delay
    // between requests, a browser-like User-Agent plus session cookies so the
    // target site serves normal (non-bot) pages.
    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(8000)
            .addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 Firefox/73.0")
            .addCookie("ganji_uuid","7134539899767277506921")
            .addCookie("ganji_xuuid","73e05d39-b634-4ee9-fc5b-e8d81257f0e5.1570941235445");

    @Override
    public  Site getSite()
    {
        return  site;
    }

    /**
     * Routes each fetched page by URL pattern: seeds city URLs from the DB for
     * the "all" entry page, persists the city list from the index page, walks
     * category/label pages to enqueue job links, and finally parses and stores
     * job-detail pages.
     *
     * NOTE(review): host names are redacted with '*' in this listing, which also
     * corrupts the regex literals — restore real hosts (escaping dots) before use.
     *
     * @param page the downloaded page handed over by webmagic
     */
    @Override
    public  void  process(Page page)
    {
        // Entry page: re-seed the crawl with city URLs already stored in the DB.
        if (page.getUrl().regex("http://www.****.com/all").match()){
            List<String> jobCities = cityService.selectCityUrl();
            if (!jobCities.isEmpty()){
               page.addTargetRequests(jobCities);
            }

        }
        // Index page: scrape the city list and persist it.
        if (page.getUrl().regex("http://www.****.com/index.htm").match()){
            List<JobCity> jobCities = new ArrayList<>();
            Parse.getJobCities(page, jobCities);
            cityService.insertCities(jobCities);
        }

        // Part-time landing page: collect category links from the filter block.
        if(page.getUrl().regex("http://www.*****.com/jzhi/").match()){
            Html html = page.getHtml();
            Selectable selectable = html.xpath("//dl[@class=\"choose clearfix\"]");
            List<String> urls  =selectable.links().all();
            // Keep only part-time ("jzhi") links on city subdomains, dropping the
            // main "www." host.
            urls.removeIf(s -> !s.contains("jzhi"));
            urls.removeIf(s -> s.contains("www."));
            page.addTargetRequests(urls);
        }

        // City-subdomain listing page (the (?![wW]{3}) lookahead excludes "www").
        // Fixed: original used [a-zA-z], whose A-z range also matches [ \ ] ^ _ `.
        if (page.getUrl().regex("[a-zA-Z]+://(\\b((?![wW]{3})\\w)+\\b)\\.****.com/jianzhi/").match()){
            Html html = page.getHtml();
            Selectable selectable = html.xpath("//*[@class=\"f-label-new clearfix\"]");
            List<String> cls  =selectable.links().all();
            page.addTargetRequests(cls);
        }

        // Label (tag) pages: extract individual job URLs and enqueue them.
        // Same [a-zA-z] → [a-zA-Z] fix as above.
        if (page.getUrl().regex("[a-zA-Z]+://(\\b((?![wW]{3})\\w)+\\b)\\.*****.com/jzbiaoqian/*").match()){
            List<String> joburls = new ArrayList<>();
            Parse.getPartJobs(page,joburls);
            page.addTargetRequests(joburls);
        }
        // Job-detail page (".../<category>/<id>x.htm", excluding jzbiaoqian):
        // parse the posting and store it.
        if(page.getUrl().regex("htt(p|ps)://(\\b((?![wW]{3})\\w)+\\b)\\.*****.com/(\\b((?!jzbiaoqian)\\w)+\\b)/\\d+x.htm").match()){
            List<JobDetail> jobDetailList = new ArrayList<>();
            Parse.getPartJobDetail(page,jobDetailList);
            jobService.insertJobs(jobDetailList);
            System.out.println(jobDetailList);
        }
    }
}

3.解析类

public static void getJobCities(Page page, List<JobCity> jobCities){
        if(jobCities == null){
            jobCities = new ArrayList<>();
        }
        Html html = page.getHtml();
        Selectable selectable = html.xpath("/html/body/div[1]/div[3]/dl");
        Selectable dds   = selectable.xpath("/dl/dd");
        List<Selectable> nodes = dds.nodes();
        for (Selectable node : nodes) {
            int index = nodes.indexOf(node) + 1;
            String group = selectable.xpath("/dl/dt["+index+"]/text()").get();
            Selectable anodes = node.xpath("/dd/a");
            for (Selectable an: anodes.nodes()){
                JobCity jobCity = new JobCity();
                String url   = an.xpath("/a/@href").get()+"jzhi/";
                String name   = an.xpath("/a/text()").get();
                jobCity.setClsification(group);
                jobCity.setUrl(url);
                jobCity.setName(name);
                jobCities.add(jobCity);
            }

        }
    }

    /**
     * Collects job-posting URLs from a listing page's {@code post_url}
     * attributes and appends them to {@code cls}.
     *
     * <p>Fixes: the original replaced a null {@code cls} with a local list
     * (silently discarding all results) — now fails fast; a leftover
     * {@code System.out.println} debug line is removed.
     *
     * @param page the downloaded listing page
     * @param cls  output list of job URLs to append to; must not be null
     */
    public static void getPartJobs(Page page, List<String> cls){
        java.util.Objects.requireNonNull(cls, "cls");
        Html html = page.getHtml();
        // id-anchored XPath with a positional div index — the div[5] part is
        // layout-sensitive; verify if the site markup changes.
        Selectable selectable = html.xpath("//*[@id=\"list-job-id\"]/div[5]/dl");
        for (Selectable node : selectable.nodes())
        {
            cls.add(node.xpath("//dt/a/@post_url").get());
        }
    }

    /**
     * Parses a single job-detail page into a {@code JobDetail} and appends it
     * to {@code jobDetailList}.
     *
     * <p>Fix: the original replaced a null {@code jobDetailList} with a local
     * list, silently discarding the parsed job; we now fail fast.
     *
     * @param page          the downloaded job-detail page
     * @param jobDetailList output list to append to; must not be null
     */
    public static void getPartJobDetail(Page page, List<JobDetail> jobDetailList){
        java.util.Objects.requireNonNull(jobDetailList, "jobDetailList");
        JobDetail job = new JobDetail();
        Html html = page.getHtml();
        // CSS selectors are used deliberately here — they survive the site's
        // dynamic layout reshuffles better than positional XPath.
        Selectable selectable = html.$("ul[class='clearfix pos-relat']");

        // Fixed li positions — assumes the detail list always has the same
        // field order (li[7]/li[8] are intentionally skipped). TODO confirm.
        job.setJobName(selectable.xpath("/ul/li[1]/em/a/text()").get());
        job.setJobType(selectable.xpath("/ul/li[1]/em/span/text()").get());
        job.setJobPay(selectable.xpath("/ul/li[2]/em/text()").get());
        job.setEdu(selectable.xpath("/ul/li[3]/em/text()").get());
        job.setWorkbg(selectable.xpath("/ul/li[4]/em/text()").get());
        job.setAge(selectable.xpath("/ul/li[5]/em/text()").get());
        job.setJobCount(selectable.xpath("/ul/li[6]/em/text()").get());
        job.setJobLabel(selectable.xpath("/ul/li[9]/em/span/em/text()").get());
        job.setWorkTime(selectable.xpath("/ul/li[10]/em/text()").get());
        Selectable selheader = html.$("div[class='d-c-left-hear']");
        // NOTE(review): toString() is used here where get() is used elsewhere;
        // kept as-is since their null-handling may differ — confirm intent.
        job.setJobTitle(selheader.xpath("/div/h1/text()").toString());
        // Keep only the text after the last ':' ("更新时间: ..."). NOTE(review):
        // this splits on the ASCII colon — if the page uses the full-width '：'
        // the whole string passes through unchanged; verify against real pages.
        String update = selheader.xpath("/div/p/span[1]/text()").toString();
        String[]  strs=update.split(":");
        job.setUpdateDate(strs[strs.length-1]);

        Selectable describenode = html.$("div[class='deta-Corp']");
        job.setJobDesc(describenode.xpath("/div/text()").get());

        // The phone number is served as an image; store its absolute URL
        // (page domain + relative img src).
        job.setPhone(DomainFormat.getDomainForUrl(page.getUrl().toString()) +html.xpath("//*[@id=\"isShowPhoneBotton\"]/img/@src").get());

        Selectable contactnode = html.$("dl[class='detail-contact']");
        job.setContacts(contactnode.xpath("/dl/dd[2]/text()").get());
        job.setAddress(contactnode.xpath("/dl/dd[3]/text()").get());
        jobDetailList.add(job);
    }

五、总结

在节点定位上使用$CSS定位比较好,否则因为页面的动态布局变动会出现爬取不到数据的情况

解析采用XPath提取

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值