前言
文章仅供安全领域的朋友学习使用!!
严禁做违法违纪的事情,责任自负
介绍
WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。
webmagic官网 http://webmagic.io/
文档地址 http://webmagic.io/docs/zh/
案例
官方文档写得很详细,可以很简单地搭建出一套 demo。
下面是个案例,爬取房天下北京小区的价格走势图
控制类:
package com.mengkeng.spider_demo.controller;
import com.mengkeng.spider_demo.spider.price.XpaperZgtcbPopeline;
import com.mengkeng.spider_demo.spider.price.XpaperZgtcbProcessor;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
/**
 * REST controller exposing an endpoint that starts the housing-price crawl.
 */
@RestController
@RequestMapping("/spider")
public class SpiderController {

    public static final String BASE_URL = "https://esf.fang.com/housing/";
    public static final String PRICE_URL = "https://pinggun.fang.com/RunChartNew/MakeChartData/";

    /**
     * Builds a single-threaded spider seeded with {@link #BASE_URL}, attaches the
     * persistence pipeline and runs it. {@code run()} is invoked directly on the
     * calling thread, so the response is produced only after it returns.
     *
     * @return the literal string {@code "ok"}
     */
    @RequestMapping("1")
    public String spiderDemo() {
        Spider.create(new XpaperZgtcbProcessor())
                .addUrl(BASE_URL)
                .addPipeline(new XpaperZgtcbPopeline())
                .thread(1)
                .run();
        return "ok";
    }
}
解析类:
package com.mengkeng.spider_demo.spider.price;
import com.alibaba.fastjson.JSON;
import com.mengkeng.spider_demo.config.RestTemplateConfig;
import com.mengkeng.spider_demo.entity.TkBuildingsPriceAjk;
import com.mengkeng.spider_demo.utils.IdGeneratorSnowflake;
import com.mengkeng.spider_demo.utils.RandomStringUtil;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.*;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import org.springframework.web.client.RestTemplate;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * WebMagic page processor for the community list pages under {@link #BASE_URL}.
 *
 * <p>For every list page it queues the remaining list pages (only when the current
 * page is page 1), then, for each community entry that carries a project code,
 * posts a request to {@link #PRICE_URL} and turns every price found in the response
 * into a {@link TkBuildingsPriceAjk} result field.
 */
@Component
public class XpaperZgtcbProcessor implements PageProcessor {

    public static final String BASE_URL = "https://esf.fang.com/housing/";
    public static final String DETAIL_URL = "https://esf.fang.com/loupan/";
    public static final String PRICE_URL = "https://pinggun.fang.com/RunChartNew/MakeChartData/";
    // Sample chart request:
    // https://pinggun.fang.com/RunChartNew/MakeChartData
    // ?newcode=1010749951&city=%u5317%u4EAC&district=%u6D77%u6DC0&commerce=&titleshow=&year=

    /** Strips everything that is not a decimal digit from the pager text. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");
    /** Captures each price value, i.e. the token between a comma and a closing bracket. */
    private static final Pattern PRICE_PATTERN = Pattern.compile(",(\\w+)]");
    /** Captures the month token that sits between the year and month markers. */
    private static final Pattern MONTH_PATTERN = Pattern.compile("年(\\w+)月");
    /** Captures the year token that follows an ampersand. */
    private static final Pattern YEAR_PATTERN = Pattern.compile("&(\\w+)年");

    /** Lazily created HTTPS client, shared by all requests issued by this processor. */
    private RestTemplate restTemplate;

    /**
     * Processes one downloaded page. Pages without the pager text are ignored
     * (the original code reserved that branch for detail pages and left it empty).
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        String pageNum = html.xpath("//span[@class='txt']/text()").toString();
        if (StringUtils.isBlank(pageNum)) {
            // Detail pages: nothing is extracted for them yet.
            return;
        }
        addRemainingListPages(page, html, pageNum);

        List<Selectable> nodes = html.xpath("//div[@class='list rel mousediv']/dl/dd").nodes();
        for (Selectable node : nodes) {
            String code = node.xpath("//p[1]/a[2]/@projcode").toString();
            String community = node.xpath("//p[1]/a[1]/text()").toString();
            String area = node.xpath("//p[2]/a[1]/text()").toString();
            if (StringUtils.isBlank(code)) {
                continue;
            }
            String body = requestPriceChart(code, area);
            if (body == null) {
                // Empty response: there is nothing to record for this community.
                continue;
            }
            publishPrices(page, code, community, body);
        }
    }

    /**
     * Queues list pages 2..N when the current page is page 1, where N is the number
     * formed by the digits of the pager text.
     */
    private void addRemainingListPages(Page page, Html html, String pageNum) {
        String pageNow = html.xpath("//a[@class=pageNow]/text()").get();
        if (!"1".equals(pageNow)) {
            return;
        }
        String digits = NON_DIGIT.matcher(pageNum).replaceAll("");
        if (digits.isEmpty()) {
            return;
        }
        int lastPage = Integer.parseInt(digits);
        for (int i = 2; i <= lastPage; i++) {
            String random1 = RandomStringUtil.nextString18();
            page.addTargetRequest(BASE_URL + "__0_3_0_0_" + i + "_0_0_0/?rfss=1-" + random1 + "-2b");
        }
    }

    /**
     * Posts the chart request for one community and returns the raw response body.
     *
     * @param code the community project code
     * @param area the district name taken from the list entry; may be {@code null}
     * @return the response body, or {@code null} when the response carries none
     */
    private String requestPriceChart(String code, String area) {
        HashMap<String, Object> params = new HashMap<>();
        params.put("newcode", code);
        params.put("city", cnToUnicode("北京"));
        params.put("district", cnToUnicode(area));
        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON_UTF8);
        HttpEntity<String> entity = new HttpEntity<>(JSON.toJSONString(params), headers);
        ResponseEntity<String> response =
                priceClient().exchange(PRICE_URL, HttpMethod.POST, entity, String.class);
        return response.getBody();
    }

    /**
     * Returns the shared HTTPS client, building it on first use. A failure to build it
     * is reported with its cause instead of surfacing later as a NullPointerException.
     */
    private synchronized RestTemplate priceClient() {
        if (restTemplate == null) {
            try {
                restTemplate = new RestTemplate(RestTemplateConfig.generateHttpRequestFactory());
            } catch (java.security.GeneralSecurityException e) {
                throw new IllegalStateException("Unable to build the HTTPS client for " + PRICE_URL, e);
            }
        }
        return restTemplate;
    }

    /**
     * Parses the chart response and stores one {@link TkBuildingsPriceAjk} per price
     * value as a result field of the page.
     */
    private void publishPrices(Page page, String code, String community, String body) {
        List<String> monthTokens = new ArrayList<>();
        Matcher monthMatcher = MONTH_PATTERN.matcher(body);
        while (monthMatcher.find()) {
            monthTokens.add(monthMatcher.group(1));
        }

        // NOTE(review): 2020 is the fallback used when the response contains no year — confirm it is still wanted.
        int year = 2020;
        Matcher yearMatcher = YEAR_PATTERN.matcher(body);
        while (yearMatcher.find()) {
            // The last year found in the body wins.
            year = Integer.parseInt(yearMatcher.group(1));
        }

        // Both a start and an end month are needed to label the prices.
        List<String> months = new ArrayList<>();
        if (monthTokens.size() >= 2) {
            months = getMonths(year, Integer.parseInt(monthTokens.get(0)), Integer.parseInt(monthTokens.get(1)));
        }

        int count = 0;
        Matcher priceMatcher = PRICE_PATTERN.matcher(body);
        while (priceMatcher.find()) {
            TkBuildingsPriceAjk ajk = new TkBuildingsPriceAjk();
            ajk.setDataOrigin("anjuke");
            ajk.setId(IdGeneratorSnowflake.snowflakeId());
            ajk.setCommunityCode(code);
            ajk.setCommunity(community);
            if (months.size() > count) {
                ajk.setYearmonth(months.get(count));
            }
            count++;
            ajk.setAvgPrice(new BigDecimal(priceMatcher.group(1)));
            // The random suffix keeps the field keys of one page distinct.
            page.putField(page.getUrl() + RandomStringUtil.nextString18() + count, ajk);
        }
    }

    /**
     * Queues list pages 2 and 3 when the current URL looks like a first page.
     * NOTE(review): nothing in this class calls this method, and its URL scheme
     * ("xiaoqu/", "n1") differs from the one used in process() — confirm whether
     * it is still needed.
     */
    private void parseList(Page page, Html html) {
        // First-page logic
        String[] xiaoqus = page.getUrl().get().split("xiaoqu/");
        if (ArrayUtils.contains(xiaoqus, "n1") || xiaoqus.length <= 1) {
            for (int i = 2; i <= 3; i++) {
                page.addTargetRequest(BASE_URL + "n" + i + "/");
            }
        }
    }

    /**
     * Reports whether the page contains a {@code div} whose class is {@code pageBox}.
     *
     * @param html the parsed page
     * @return {@code true} when the selector matches
     */
    public boolean checkList(Html html) {
        return html.xpath("//div[@class=pageBox]").match();
    }

    /**
     * Crawl settings: 3 retries, 1000 ms sleep between requests, UTF-8, 5000 ms timeout
     * and a desktop Firefox user agent.
     * NOTE(review): the header name "Refresh" may have been meant as "Referer" — confirm.
     */
    @Override
    public Site getSite() {
        return Site.me().setDomain(BASE_URL).setRetryTimes(3).setSleepTime(1000).setCharset("utf-8").setTimeOut(5000)
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0")
                .addHeader("Refresh", BASE_URL);
    }

    /**
     * Encodes every character of the text as a backslash, the letter u and four
     * lowercase hex digits. Code units below 0x1000 are zero-padded so that each
     * escape always has the same width.
     *
     * @param cn the text to encode; may be {@code null}
     * @return the escaped text, or an empty string for {@code null} input
     */
    private String cnToUnicode(String cn) {
        if (cn == null) {
            return "";
        }
        StringBuilder returnStr = new StringBuilder(cn.length() * 6);
        for (char c : cn.toCharArray()) {
            returnStr.append(String.format("\\u%04x", (int) c));
        }
        return returnStr.toString();
    }

    /**
     * Lists consecutive months as {@code yyyyMM} strings, beginning at {@code start}
     * in {@code year}. An {@code end} of 12 stops in December of the same year; any
     * other {@code end} is treated as a month of the following year.
     *
     * @param year  the year of the first month
     * @param start the first month (1-12)
     * @param end   the last month (1-12)
     * @return the month labels in chronological order
     */
    public static ArrayList<String> getMonths(int year, int start, int end) {
        int last = (end == 12) ? 12 : end + 12;
        ArrayList<String> res = new ArrayList<>();
        for (int i = start; i <= last; i++) {
            if (i > 12) {
                res.add((year + 1) + String.format("%02d", i - 12));
            } else {
                res.add(year + String.format("%02d", i));
            }
        }
        return res;
    }
}
持久化类:
package com.mengkeng.spider_demo.spider.price;
import com.alibaba.fastjson.JSON;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.mengkeng.spider_demo.entity.TkBuildingsPriceAjk;
import com.mengkeng.spider_demo.mapper.TkBuildingsPriceAjkMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import com.mengkeng.spider_demo.utils.ContextUtil;
import java.util.Map;
import java.util.Objects;
@Component
@Slf4j
public class XpaperZgtcbPopeline implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
TkBuildingsPriceAjkMapper tkBuildingsPriceAjkMapper = ContextUtil.getBean(TkBuildingsPriceAjkMapper.class);
log.info("持久化开始---------------------------------");
Map<String, Object> all = resultItems.getAll();
for (Map.Entry<String, Object> map : all.entrySet()) {
TkBuildingsPriceAjk ajk = (TkBuildingsPriceAjk) map.getValue();
QueryWrapper<TkBuildingsPriceAjk> queryWrapper = new QueryWrapper<>();
queryWrapper.eq("community_code", ajk.getCommunityCode());
queryWrapper.eq("yearmonth", ajk.getYearmonth());
try {
TkBuildingsPriceAjk oldTK = tkBuildingsPriceAjkMapper.selectOne(queryWrapper);
if (Objects.nonNull(oldTK)) {
oldTK.setAvgPrice(ajk.getAvgPrice());
tkBuildingsPriceAjkMapper.updateById(oldTK);
} else {
tkBuildingsPriceAjkMapper.insert(ajk);
}
} catch (Exception e) {
log.error("持久化失败" + e.getMessage());
log.error("失败数据为" + JSON.toJSONString(ajk));
}
}
log.info("持久化结束---------------------------------");
}
}
用到的工具类以及配置类:
package com.mengkeng.spider_demo.config;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.ssl.TrustStrategy;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.http.client.ClientHttpRequestFactory;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.http.client.SimpleClientHttpRequestFactory;
import org.springframework.web.client.RestTemplate;
import javax.net.ssl.SSLContext;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
/**
 * Spring configuration for the HTTP clients used by the crawler.
 */
@Configuration
public class RestTemplateConfig {

    /** Connect timeout, in milliseconds, applied to both request factories. */
    private static final int CONNECT_TIMEOUT_MS = 3000;
    /** Read timeout, in milliseconds, applied to both request factories. */
    private static final int READ_TIMEOUT_MS = 5000;

    /**
     * Exposes a {@link RestTemplate} built on the given request factory.
     *
     * @param factory the request factory to use
     * @return a new {@link RestTemplate}
     */
    @Bean
    public RestTemplate restTemplate(ClientHttpRequestFactory factory) {
        return new RestTemplate(factory);
    }

    /**
     * Exposes a simple request factory with connect and read timeouts set.
     *
     * @return the configured factory
     */
    @Bean
    public ClientHttpRequestFactory simpleClientHttpRequestFactory() {
        SimpleClientHttpRequestFactory factory = new SimpleClientHttpRequestFactory();
        factory.setConnectTimeout(CONNECT_TIMEOUT_MS);
        factory.setReadTimeout(READ_TIMEOUT_MS);
        return factory;
    }

    /**
     * Builds a request factory for HTTPS calls made through RestTemplate.
     *
     * <p>SECURITY NOTE(review): the trust strategy accepts every certificate and host
     * name verification is disabled, so TLS offers no protection against a
     * man-in-the-middle. Confirm this is acceptable for the target endpoints.
     *
     * <p>The same connect/read timeouts as {@link #simpleClientHttpRequestFactory()}
     * are applied so that a stalled server cannot block the caller indefinitely.
     *
     * @return the configured factory
     * @throws NoSuchAlgorithmException if the SSL context cannot be created
     * @throws KeyManagementException if the SSL context cannot be initialised
     * @throws KeyStoreException if the trust material cannot be loaded
     */
    public static HttpComponentsClientHttpRequestFactory generateHttpRequestFactory()
            throws NoSuchAlgorithmException, KeyManagementException, KeyStoreException {
        TrustStrategy acceptingTrustStrategy = (x509Certificates, authType) -> true;
        SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, acceptingTrustStrategy).build();
        SSLConnectionSocketFactory connectionSocketFactory =
                new SSLConnectionSocketFactory(sslContext, new NoopHostnameVerifier());
        HttpClientBuilder httpClientBuilder = HttpClients.custom();
        httpClientBuilder.setSSLSocketFactory(connectionSocketFactory);
        CloseableHttpClient httpClient = httpClientBuilder.build();
        HttpComponentsClientHttpRequestFactory factory = new HttpComponentsClientHttpRequestFactory();
        factory.setHttpClient(httpClient);
        factory.setConnectTimeout(CONNECT_TIMEOUT_MS);
        factory.setReadTimeout(READ_TIMEOUT_MS);
        return factory;
    }
}
package com.mengkeng.spider_demo.utils;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
/**
 * Static access point to the Spring {@link ApplicationContext}. Because it implements
 * {@link ApplicationContextAware}, it receives the context and can hand out beans to
 * code that is not itself managed by Spring.
 *
 * Author: Mr sheng.z
 * Date: created 13:19 2020/6/30
 */
@Component
public final class ContextUtil implements ApplicationContextAware {

    protected static ApplicationContext applicationContext;

    /**
     * Stores the context the first time it is supplied; later calls leave the stored
     * value untouched.
     */
    @Override
    public void setApplicationContext(ApplicationContext arg0) throws BeansException {
        if (applicationContext != null) {
            return;
        }
        applicationContext = arg0;
    }

    /**
     * Looks up a bean by name.
     *
     * @param name the bean name
     * @return the bean instance
     */
    public static Object getBean(String name) {
        Object bean = applicationContext.getBean(name);
        return bean;
    }

    /**
     * Looks up a bean by type, using the stored ApplicationContext.
     *
     * @param clazz the bean type
     * @return the bean instance
     */
    public static <T> T getBean(Class<T> clazz) {
        T bean = applicationContext.getBean(clazz);
        return bean;
    }
}
package com.mengkeng.spider_demo.utils;
import cn.hutool.core.lang.Snowflake;
import cn.hutool.core.util.IdUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
/**
 * Wrapper around the Hutool snowflake algorithm for generating unique ids.
 *
 * <p>Every method obtains its generator through {@code IdUtil.getSnowflake}, which
 * Hutool documents as returning a shared instance per worker/datacenter pair.
 * Creating a fresh generator for each call (as {@code IdUtil.createSnowflake} does)
 * restarts the sequence and can hand out the same id twice within one millisecond.
 */
@Slf4j
@Component
public class IdGeneratorSnowflake {

    private static final long WORKER_ID = 16;
    private static final long DATACENTER_ID = 11;

    /** Generator for the default worker/datacenter pair, shared by all default-id methods. */
    private static final Snowflake SNOWFLAKE = IdUtil.getSnowflake(WORKER_ID, DATACENTER_ID);

    /**
     * @return the next id from the default generator
     */
    public synchronized static long snowflakeId() {
        return SNOWFLAKE.nextId();
    }

    /**
     * @return the next id from the default generator, as a decimal string
     */
    public synchronized static String snowflakeIdStr() {
        return String.valueOf(SNOWFLAKE.nextId());
    }

    /**
     * @param workerId     the worker id of the generator to use
     * @param datacenterId the datacenter id of the generator to use
     * @return the next id from the generator for the given pair
     */
    public synchronized static long snowflakeId(long workerId, long datacenterId) {
        // Reuse the generator for this pair instead of building a new one per call.
        return IdUtil.getSnowflake(workerId, datacenterId).nextId();
    }

    /**
     * @return the next id from the default generator
     */
    public synchronized static long getSnowflake() {
        return SNOWFLAKE.nextId();
    }
}
```java
package com.mengkeng.spider_demo.utils;
import org.apache.commons.lang3.RandomStringUtils;
/**
 * Helper for generating short random tokens.
 *
 * Date: 2022-07-08 11:08
 */
public class RandomStringUtil {

    /**
     * Characters a token may contain: the 26 lowercase letters followed by the digits.
     * The previous hand-written list had 'g' twice and no 'j'.
     */
    private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz1234567890".toCharArray();

    /**
     * @return a random string of 18 characters drawn from lowercase letters and digits
     */
    public static String nextString18() {
        return RandomStringUtils.random(18, ALPHABET);
    }
}
```
注意事项
其中需要关注的重点有两处:
1.解析类 添加下次爬取链接路径
2.解析类 添加持久化字段
因房天下小区详情页中有些价格走势图不予展示,故改为直接抓取其接口请求(`public static final String PRICE_URL = "https://pinggun.fang.com/RunChartNew/MakeChartData/"`)。
后语
该框架的原理其实就是将页面下载下来,然后用 XPath 语法解析页面数据。
众所周知,有爬虫就有反爬虫,频繁发送 HTTP 请求会被警告或限制,例如出现滑动验证码,或者被跳转到登录提示页。那么这种情况该怎么办呢?
下一期将介绍另一套框架:Selenium 自动化测试框架,用它来模拟人工点击等操作。
代码链接
https://download.csdn.net/download/DoAsOnePleases/86752915