Java 爬虫框架之 WebMagic

前言

文章仅供安全领域的朋友学习使用!!
严禁做违法违纪的事情,责任自负

介绍

WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。

webmagic官网 http://webmagic.io/
文档地址 http://webmagic.io/docs/zh/

案例

官方文档写得很详细,可以很简单地搭建出一套 demo。
下面是个案例,爬取房天下北京小区的价格走势图

控制类:

package com.mengkeng.spider_demo.controller;

import com.mengkeng.spider_demo.spider.price.XpaperZgtcbPopeline;
import com.mengkeng.spider_demo.spider.price.XpaperZgtcbProcessor;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;


/**
 * REST endpoint that starts the fang.com community price crawl on demand.
 */
@RestController
@RequestMapping("/spider")
public class SpiderController {

    /** Seed URL handed to the spider as its first request. */
    public static final String BASE_URL = "https://esf.fang.com/housing/";

    /** Price-chart endpoint; declared here but not referenced inside this class. */
    public static final String PRICE_URL = "https://pinggun.fang.com/RunChartNew/MakeChartData/";

    /**
     * Builds a single-threaded spider from a fresh processor and pipeline and
     * runs it. {@code run()} is invoked directly, so the crawl executes on the
     * request thread and the response is only written once it has finished.
     *
     * @return the literal string {@code "ok"}
     */
    @RequestMapping("1")
    public String spiderDemo() {
        XpaperZgtcbProcessor processor = new XpaperZgtcbProcessor();
        XpaperZgtcbPopeline pipeline = new XpaperZgtcbPopeline();
        Spider.create(processor)
                .addUrl(BASE_URL)
                .addPipeline(pipeline)
                .thread(1)
                .run();
        return "ok";
    }
}

解析类:

package com.mengkeng.spider_demo.spider.price;

import com.alibaba.fastjson.JSON;
import com.mengkeng.spider_demo.config.RestTemplateConfig;
import com.mengkeng.spider_demo.entity.TkBuildingsPriceAjk;
import com.mengkeng.spider_demo.utils.IdGeneratorSnowflake;
import com.mengkeng.spider_demo.utils.RandomStringUtil;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.*;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import org.springframework.web.client.RestTemplate;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * WebMagic {@link PageProcessor} for the fang.com community list pages.
 *
 * <p>For every list page it (1) queues the remaining list pages when the
 * current page is page 1, and (2) for each community entry posts to
 * {@link #PRICE_URL}, extracts the monthly average prices from the response
 * text and publishes one {@link TkBuildingsPriceAjk} per price through
 * {@link Page#putField(String, Object)}.
 */
@Component
public class XpaperZgtcbProcessor implements PageProcessor {
    public static final String BASE_URL = "https://esf.fang.com/housing/";
    public static final String DETAIL_URL = "https://esf.fang.com/loupan/";
    public static final String PRICE_URL = "https://pinggun.fang.com/RunChartNew/MakeChartData/";
    // Example of the chart request as captured from the site:
    // https://pinggun.fang.com/RunChartNew/MakeChartData
    // ?newcode=1010749951&city=%u5317%u4EAC&district=%u6D77%u6DC0&commerce=&titleshow=&year=

    /** Removes everything that is not an ASCII digit from the pager text. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");
    /** One price value: the token between a comma and a closing bracket. */
    private static final Pattern PRICE_PATTERN = Pattern.compile(",(\\w+)]");
    /** A month number: the token between "年" and "月". */
    private static final Pattern MONTH_PATTERN = Pattern.compile("年(\\w+)月");
    /** A year number: the token between "&" and "年". */
    private static final Pattern YEAR_PATTERN = Pattern.compile("&(\\w+)年");
    /** Year used when the response contains no parsable year. */
    private static final int DEFAULT_YEAR = 2020;
    /** Page size assumed by {@link #parseList(Page, Html)}. */
    private static final int LEGACY_PAGE_SIZE = 30;

    /** Lazily created client for {@link #PRICE_URL}; reused for all requests of this instance. */
    private RestTemplate priceClient;

    /**
     * Processes one downloaded page. Pages without the pager text
     * ({@code span.txt}) are treated as detail pages and ignored.
     *
     * @param page the downloaded page; follow-up requests and result fields are added to it
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();

        String pageNum = html.xpath("//span[@class='txt']/text()").toString();
        if (StringUtils.isBlank(pageNum)) {
            // Detail pages are not handled yet.
            return;
        }

        enqueueRemainingListPages(page, html, pageNum);

        List<Selectable> nodes = html.xpath("//div[@class='list rel mousediv']/dl/dd").nodes();
        for (Selectable node : nodes) {
            String code = node.xpath("//p[1]/a[2]/@projcode").toString();
            String community = node.xpath("//p[1]/a[1]/text()").toString();
            String area = node.xpath("//p[2]/a[1]/text()").toString();
            if (StringUtils.isBlank(code)) {
                continue;
            }
            String chartData = fetchPriceChart(code, area);
            if (chartData != null) {
                publishPrices(page, code, community, chartData);
            }
        }
    }

    /** Queues list pages 2..N, but only while processing page 1 so they are added once. */
    private void enqueueRemainingListPages(Page page, Html html, String pageNum) {
        String pageNow = html.xpath("//a[@class='pageNow']/text()").get();
        if (!"1".equals(pageNow)) {
            return;
        }
        String digits = NON_DIGITS.matcher(pageNum).replaceAll("");
        if (digits.isEmpty()) {
            return;
        }
        // Parsed once; the previous character class also kept '(' and ')', which broke parsing.
        int lastPage = Integer.parseInt(digits);
        for (int i = 2; i <= lastPage; i++) {
            String random1 = RandomStringUtil.nextString18();
            page.addTargetRequest(BASE_URL + "__0_3_0_0_" + i + "_0_0_0/?rfss=1-" + random1 + "-2b");
        }
    }

    /**
     * Posts the chart query for one community and returns the raw response text.
     *
     * @return the response body, or {@code null} when the server sent none
     */
    private String fetchPriceChart(String code, String area) {
        HashMap<String, Object> map = new HashMap<>();
        map.put("newcode", code);
        map.put("city", cnToUnicode("北京"));
        map.put("district", cnToUnicode(area));

        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON_UTF8);
        HttpEntity<String> entity = new HttpEntity<>(JSON.toJSONString(map), headers);
        ResponseEntity<String> response =
                priceClient().exchange(PRICE_URL, HttpMethod.POST, entity, String.class);
        return response.getBody();
    }

    /**
     * Returns the shared client, creating it on first use.
     *
     * @throws IllegalStateException if the HTTPS request factory cannot be built
     */
    private synchronized RestTemplate priceClient() {
        if (priceClient == null) {
            try {
                priceClient = new RestTemplate(RestTemplateConfig.generateHttpRequestFactory());
            } catch (Exception e) {
                // Fail with the real cause instead of continuing with a null client.
                throw new IllegalStateException("Unable to build the HTTPS client for " + PRICE_URL, e);
            }
        }
        return priceClient;
    }

    /** Extracts year, month range and prices from the chart text and publishes one record per price. */
    private void publishPrices(Page page, String code, String community, String chartData) {
        List<String> monthTokens = new ArrayList<>();
        Matcher monthMatcher = MONTH_PATTERN.matcher(chartData);
        while (monthMatcher.find()) {
            monthTokens.add(monthMatcher.group(1));
        }

        // The last parsable year in the text wins, as before.
        int year = DEFAULT_YEAR;
        Matcher yearMatcher = YEAR_PATTERN.matcher(chartData);
        while (yearMatcher.find()) {
            String token = yearMatcher.group(1);
            if (StringUtils.isNumeric(token) && token.length() <= 9) {
                year = Integer.parseInt(token);
            }
        }

        // The first two month tokens are the start and end month of the chart.
        List<String> months = null;
        if (monthTokens.size() >= 2) {
            int start = parseMonth(monthTokens.get(0));
            int end = parseMonth(monthTokens.get(1));
            if (start > 0 && end > 0) {
                months = getMonths(year, start, end);
            }
        }

        int count = 0;
        Matcher priceMatcher = PRICE_PATTERN.matcher(chartData);
        while (priceMatcher.find()) {
            String priceToken = priceMatcher.group(1);
            String yearMonth = (months != null && months.size() > count) ? months.get(count) : null;
            // Advance even for skipped tokens so later prices stay aligned with their month.
            count++;
            if (!StringUtils.isNumeric(priceToken)) {
                continue;
            }
            TkBuildingsPriceAjk ajk = new TkBuildingsPriceAjk();
            ajk.setDataOrigin("anjuke");
            ajk.setId(IdGeneratorSnowflake.snowflakeId());
            ajk.setCommunityCode(code);
            ajk.setCommunity(community);
            if (yearMonth != null) {
                ajk.setYearmonth(yearMonth);
            }
            ajk.setAvgPrice(new BigDecimal(priceToken));
            page.putField(page.getUrl() + RandomStringUtil.nextString18() + count, ajk);
        }
    }

    /** Parses a month token; returns -1 unless it is a number in 1..12. */
    private static int parseMonth(String token) {
        if (!StringUtils.isNumeric(token) || token.length() > 2) {
            return -1;
        }
        int month = Integer.parseInt(token);
        return (month >= 1 && month <= 12) ? month : -1;
    }

    /**
     * Queues follow-up list pages for the "xiaoqu" URL layout. Not called from
     * any code visible in this file.
     *
     * <p>TODO: per-item extraction (address, link, listing count) was only
     * stubbed out here and its results were never used.
     */
    private void parseList(Page page, Html html) {
        // Only the first page adds the follow-up pages.
        String[] xiaoqus = page.getUrl().get().split("xiaoqu/");
        if (!ArrayUtils.contains(xiaoqus, "n1") && xiaoqus.length > 1) {
            return;
        }
        String totalPage = StringUtils.trimToEmpty(
                html.xpath("//div[@class=total-box]/span/text()").toString());
        if (!StringUtils.isNumeric(totalPage) || totalPage.length() > 9) {
            return;
        }
        int total = Integer.parseInt(totalPage);
        // Ceiling division: any remainder needs one more page.
        int pageCount = total / LEGACY_PAGE_SIZE + (total % LEGACY_PAGE_SIZE > 0 ? 1 : 0);
        for (int i = 2; i <= pageCount; i++) {
            page.addTargetRequest(BASE_URL + "n" + i + "/");
        }
    }

    /**
     * Reports whether the page contains a {@code div} with class {@code pageBox}.
     *
     * @param html the parsed page
     * @return {@code true} when the pager container is present
     */
    public boolean checkList(Html html) {
        return html.xpath("//div[@class='pageBox']").match();
    }

    /**
     * Crawl settings: 3 retries, 1000 ms sleep, 5000 ms timeout, UTF-8 and a
     * desktop Firefox user agent.
     */
    @Override
    public Site getSite() {
        // NOTE(review): setDomain receives a full URL rather than a host name — confirm this is intended.
        // NOTE(review): "Refresh" is normally a response header; "Referer" may have been meant — confirm.
        return Site.me().setDomain(BASE_URL).setRetryTimes(3).setSleepTime(1000).setCharset("utf-8").setTimeOut(5000)
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0")
                .addHeader("Refresh", BASE_URL);
    }

    /**
     * Encodes every UTF-16 code unit of {@code cn} as a backslash, the letter
     * {@code u} and four lower-case hex digits.
     *
     * @param cn text to encode; {@code null} yields an empty string
     * @return the escaped text
     */
    private String cnToUnicode(String cn) {
        if (cn == null) {
            return "";
        }
        StringBuilder returnStr = new StringBuilder(cn.length() * 6);
        for (int i = 0; i < cn.length(); i++) {
            // Zero-pad so code units below U+1000 still produce four hex digits.
            returnStr.append("\\u").append(String.format("%04x", (int) cn.charAt(i)));
        }
        return returnStr.toString();
    }

    /**
     * Builds consecutive {@code yyyyMM} labels starting at {@code start} of
     * {@code year}. When {@code end} is 12 the range stops at December of
     * {@code year}; otherwise it runs through month {@code end} of the
     * following year.
     *
     * @param year  year of the first label
     * @param start first month (1-12)
     * @param end   last month (1-12)
     * @return the labels in chronological order
     */
    public static ArrayList<String> getMonths(int year, int start, int end) {
        ArrayList<String> res = new ArrayList<>();
        int last = (end == 12) ? 12 : end + 12;
        for (int i = start; i <= last; i++) {
            if (i > 12) {
                res.add((year + 1) + String.format("%02d", i - 12));
            } else {
                res.add(year + String.format("%02d", i));
            }
        }
        return res;
    }
}

持久化类:

package com.mengkeng.spider_demo.spider.price;

import com.alibaba.fastjson.JSON;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.mengkeng.spider_demo.entity.TkBuildingsPriceAjk;
import com.mengkeng.spider_demo.mapper.TkBuildingsPriceAjkMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import com.mengkeng.spider_demo.utils.ContextUtil;

import java.util.Map;
import java.util.Objects;

@Component
@Slf4j
public class XpaperZgtcbPopeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        TkBuildingsPriceAjkMapper tkBuildingsPriceAjkMapper = ContextUtil.getBean(TkBuildingsPriceAjkMapper.class);
        log.info("持久化开始---------------------------------");
        Map<String, Object> all = resultItems.getAll();
        for (Map.Entry<String, Object> map : all.entrySet()) {
            TkBuildingsPriceAjk ajk = (TkBuildingsPriceAjk) map.getValue();
            QueryWrapper<TkBuildingsPriceAjk> queryWrapper = new QueryWrapper<>();
            queryWrapper.eq("community_code", ajk.getCommunityCode());
            queryWrapper.eq("yearmonth", ajk.getYearmonth());
            try {
                TkBuildingsPriceAjk oldTK = tkBuildingsPriceAjkMapper.selectOne(queryWrapper);
                if (Objects.nonNull(oldTK)) {
                    oldTK.setAvgPrice(ajk.getAvgPrice());
                    tkBuildingsPriceAjkMapper.updateById(oldTK);
                } else {
                    tkBuildingsPriceAjkMapper.insert(ajk);
                }
            } catch (Exception e) {
                log.error("持久化失败" + e.getMessage());
                log.error("失败数据为" + JSON.toJSONString(ajk));
            }
        }
        log.info("持久化结束---------------------------------");
    }
}

用到的工具类以及配置类:

package com.mengkeng.spider_demo.config;

import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.ssl.TrustStrategy;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.http.client.ClientHttpRequestFactory;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.http.client.SimpleClientHttpRequestFactory;
import org.springframework.web.client.RestTemplate;

import javax.net.ssl.SSLContext;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;

/**
 * Spring configuration for {@link RestTemplate}, plus a static helper that
 * builds an HTTPS request factory for code that is not managed by Spring.
 */
@Configuration
public class RestTemplateConfig {

    /** Connect timeout in milliseconds, shared by both factories. */
    private static final int CONNECT_TIMEOUT_MS = 3000;
    /** Read timeout in milliseconds, shared by both factories. */
    private static final int READ_TIMEOUT_MS = 5000;

    /**
     * @param factory the request factory bean
     * @return a {@link RestTemplate} backed by {@code factory}
     */
    @Bean
    public RestTemplate restTemplate(ClientHttpRequestFactory factory) {
        return new RestTemplate(factory);
    }

    /**
     * @return a JDK-based request factory with the shared connect and read timeouts
     */
    @Bean
    public ClientHttpRequestFactory simpleClientHttpRequestFactory() {
        SimpleClientHttpRequestFactory factory = new SimpleClientHttpRequestFactory();
        factory.setConnectTimeout(CONNECT_TIMEOUT_MS);
        factory.setReadTimeout(READ_TIMEOUT_MS);
        return factory;
    }

    /**
     * Builds a request factory for HTTPS calls made through RestTemplate.
     *
     * <p><strong>Security:</strong> the trust strategy accepts every
     * certificate chain and host name verification is disabled, so the peer is
     * not authenticated. Only use it for hosts where that is acceptable.
     *
     * <p>Every call creates a new HTTP client; callers should keep and reuse
     * the returned factory instead of calling this per request.
     *
     * @return a factory with the shared connect and read timeouts applied
     * @throws NoSuchAlgorithmException if the SSL context cannot be created
     * @throws KeyManagementException   if the SSL context cannot be initialised
     * @throws KeyStoreException        if the trust material cannot be loaded
     */
    public static HttpComponentsClientHttpRequestFactory generateHttpRequestFactory()
            throws NoSuchAlgorithmException, KeyManagementException, KeyStoreException {
        TrustStrategy acceptingTrustStrategy = (x509Certificates, authType) -> true;
        SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, acceptingTrustStrategy).build();
        SSLConnectionSocketFactory connectionSocketFactory = new SSLConnectionSocketFactory(sslContext,
                new NoopHostnameVerifier());
        HttpClientBuilder httpClientBuilder = HttpClients.custom();
        httpClientBuilder.setSSLSocketFactory(connectionSocketFactory);
        CloseableHttpClient httpClient = httpClientBuilder.build();
        HttpComponentsClientHttpRequestFactory factory = new HttpComponentsClientHttpRequestFactory();
        factory.setHttpClient(httpClient);
        // Without timeouts a stalled server would block the calling thread indefinitely.
        factory.setConnectTimeout(CONNECT_TIMEOUT_MS);
        factory.setReadTimeout(READ_TIMEOUT_MS);
        return factory;
    }

}
package com.mengkeng.spider_demo.utils;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;

/**
 * Static access to the Spring {@link ApplicationContext} for classes that are
 * not created by Spring. Spring hands the context to this component through
 * {@link ApplicationContextAware}; the static {@code getBean} methods then
 * look beans up in it.
 *
 * <p>Author: Mr sheng.z. Created 2020/6/30 13:19.
 */
@Component
public final class ContextUtil implements ApplicationContextAware {
    protected static ApplicationContext applicationContext ;

    /**
     * Stores the context the first time it is supplied; later calls are ignored.
     */
    @Override
    public void setApplicationContext(ApplicationContext arg0) throws BeansException {
        if (applicationContext == null) {
            applicationContext = arg0;
        }
    }

    /**
     * Looks a bean up by its bean name.
     *
     * @param name the bean name
     * @return the bean instance
     * @throws IllegalStateException if the context has not been injected yet
     */
    public static Object getBean(String name) {
        return requireContext().getBean(name);
    }

    /**
     * Looks a bean up by its type.
     *
     * @param clazz the required bean type
     * @return the bean instance
     * @throws IllegalStateException if the context has not been injected yet
     */
    public static <T> T getBean(Class<T> clazz) {
        return requireContext().getBean(clazz);
    }

    /** Returns the stored context or fails with a descriptive error instead of a bare NPE. */
    private static ApplicationContext requireContext() {
        ApplicationContext context = applicationContext;
        if (context == null) {
            throw new IllegalStateException(
                    "ApplicationContext is not available yet; ContextUtil must be initialised by Spring first");
        }
        return context;
    }
}

package com.mengkeng.spider_demo.utils;

import cn.hutool.core.lang.Snowflake;
import cn.hutool.core.util.IdUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;

/**
 * Snowflake ID helper built on Hutool.
 *
 * <p>All generators are obtained through {@code IdUtil.getSnowflake}, which
 * returns one shared instance per worker/datacenter pair. Independent
 * generators with the same pair (as produced by {@code createSnowflake}) keep
 * separate sequence counters and can return the same ID.
 */
@Slf4j
@Component
public class IdGeneratorSnowflake {

    private static final long WORKER_ID = 16;
    private static final long DATACENTER_ID = 11;
    /** Shared generator for the default worker/datacenter pair. */
    private static final Snowflake SNOWFLAKE = IdUtil.getSnowflake(WORKER_ID, DATACENTER_ID);

    /**
     * @return the next ID from the default generator
     */
    public static synchronized long snowflakeId() {
        return SNOWFLAKE.nextId();
    }

    /**
     * @return the next ID from the default generator, as a decimal string
     */
    public static synchronized String snowflakeIdStr() {
        return String.valueOf(SNOWFLAKE.nextId());
    }

    /**
     * Returns the next ID for an explicit worker/datacenter pair. The shared
     * generator for that pair is reused, so consecutive calls continue the
     * same sequence instead of restarting it with a new generator.
     *
     * @param workerId     worker ID of the generator
     * @param datacenterId datacenter ID of the generator
     * @return the next ID of that generator
     */
    public static synchronized long snowflakeId(long workerId, long datacenterId) {
        return IdUtil.getSnowflake(workerId, datacenterId).nextId();
    }

    /**
     * @return the next ID from the default generator (same generator as {@link #snowflakeId()})
     */
    public static synchronized long getSnowflake() {
        return SNOWFLAKE.nextId();
    }


}

```java
package com.mengkeng.spider_demo.utils;

import org.apache.commons.lang3.RandomStringUtils;

/**
 * Random string helper.
 *
 * Date: 2022-07-08 11:08
 */
public class RandomStringUtil {

    /** Lower-case letters a-z followed by the digits 1-9 and 0. */
    private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz1234567890".toCharArray();

    /**
     * @return an 18-character string drawn from lower-case letters and digits
     */
    public static  String nextString18() {
        // The earlier literal array listed 'g' twice and had no 'j'.
        return RandomStringUtils.random(18, ALPHABET);
    }

}

注意事项

其中需要关注的重点就两处
1. 解析类:添加下次爬取链接路径
(在这里插入图片描述)
2. 解析类:添加持久化字段
(在这里插入图片描述)
因房天下小区详情页有些价格走势图不予展示,故改为抓取接口请求(public static final String PRICE_URL = "https://pinggun.fang.com/RunChartNew/MakeChartData/")

后语

该框架原理其实就是将页面下载下来 , 然后用Xpath语法解析页面数据
众所周知 , 有爬虫就有反爬虫 , 频繁发送http请求会被警告限制 , 例如 滑动验证码 或者 跳转登录提示页 那么这种该怎么办呢?
下一期开始另一套框架 selenium 自动化测试框架 模拟人工点击的操作

代码链接

https://download.csdn.net/download/DoAsOnePleases/86752915

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值