一、Gecco是什么
Gecco是一款用Java语言开发的轻量、易用的网络爬虫。不同于Nutch这类面向搜索引擎的通用爬虫,Gecco是面向主题的爬虫。
- 通用爬虫一般关注三个主要的问题:下载、排序、索引。
- 主题爬虫一般关注的是:下载、内容抽取、灵活的业务逻辑处理。
Gecco的目标是提供一个完善的主题爬虫框架,简化下载和内容抽取的开发,利用管道过滤器模式,提供灵活的内容清洗和持久化处理模式,让开发人员把更多的精力投入到与业务主题相关的内容处理上。
主要特征
- 简单易用,使用jQuery风格的选择器(selector)抽取元素
- 支持页面中的异步ajax请求
- 支持页面中的JavaScript变量抽取
- 利用Redis实现分布式抓取,参考gecco-redis
- 支持下载时UserAgent随机选取
- 支持下载代理服务器随机选取
- 支持结合Spring开发业务逻辑,参考gecco-spring
- 支持htmlunit扩展,参考gecco-htmlunit
- 支持插件扩展机制
二、使用步骤
1. Maven依赖
<!-- Dependencies used by the sample crawler project. -->
<dependencies>
<!-- Lombok: supplies the @Data annotation used on the crawler beans.
     Declared with "provided" scope, i.e. required for compilation only. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.10</version>
<scope>provided</scope>
</dependency>
<!-- gecco-spring: Gecco's Spring integration module
     (the com.geccocrawler.gecco.spring package, e.g. SpringPipelineFactory). -->
<dependency>
<groupId>com.geccocrawler</groupId>
<artifactId>gecco-spring</artifactId>
<version>1.3.0</version>
</dependency>
<!-- Hutool: utility library; the sample code uses cn.hutool.core.util.StrUtil. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.5.7</version>
</dependency>
</dependencies>
2.项目目录结构
3.入口
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring Boot entry point of the crawler sample.
 *
 * <p>{@code @EnableScheduling} switches on processing of {@code @Scheduled} methods.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        // Instance form of the static SpringApplication.run(Class, String...) shortcut.
        SpringApplication bootstrap = new SpringApplication(Application.class);
        bootstrap.run(args);
    }
}
4.定时任务
import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.spring.SpringPipelineFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
/**
 * Scheduled job that builds and starts a Gecco engine for the imooc home page.
 */
@Component
public class Task {

    /** Cron expression of the job: second 0 of every minute. */
    private static final String EVERY_MINUTE = "0 0/1 * * * ?";

    /** Package name handed to the engine's {@code classpath(...)} call. */
    private static final String SPIDER_PACKAGE = "com.mt.imooc.spider";

    /** URL handed to the engine's {@code start(String)} call. */
    private static final String SEED_URL = "https://www.imooc.com";

    /** Value handed to the engine's {@code interval(...)} call (presumably milliseconds; confirm in Gecco docs). */
    private static final int INTERVAL = 3000;

    @Autowired
    private SpringPipelineFactory springPipelineFactory;

    /**
     * Creates a fresh engine on every trigger, configures it with the
     * Spring-backed pipeline factory and starts it.
     */
    @Scheduled(cron = EVERY_MINUTE)
    public void pull() {
        GeccoEngine.create()
                .pipelineFactory(springPipelineFactory)
                .classpath(SPIDER_PACKAGE)
                .start(SEED_URL)
                .interval(INTERVAL)
                .start();
    }
}
5.Gecco配置
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration whose only job is to component-scan the
 * {@code com.geccocrawler.gecco.spring} package, so that the beans declared
 * there (e.g. the {@code SpringPipelineFactory} injected elsewhere in this
 * sample) get registered in the application context.
 *
 * @author jay
 * @date 2021/1/21 22:11
 */
@Configuration
@ComponentScan("com.geccocrawler.gecco.spring")
public class GeccoConfig {
    // Intentionally empty: the annotations carry all of the configuration.
}
6. 慕课网主页标题与子标题抓取
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;
import lombok.Data;
import java.util.List;
/**
 * Crawler bean for the imooc home page.
 *
 * <p>Declared for the URL {@code https://www.imooc.com} and associated with the
 * pipeline named {@code indexPipeline} through {@code @Gecco}. Accessors are
 * generated by Lombok's {@code @Data}.
 *
 * @author jay
 * @date 2021/1/21 22:11
 */
@Data
@Gecco(matchUrl="https://www.imooc.com", pipelines="indexPipeline")
public class Index implements HtmlBean {
// Request associated with this bean; presumably injected by Gecco because of @Request
// (confirm in Gecco docs). The pipeline uses it to derive follow-up requests.
@Request
private HttpRequest request;
/**
 * Titles of the left-hand navigation menu (text of the elements matched by the CSS path).
 */
@Text
@HtmlField(cssPath = "#main > div.bgfff.banner-box > div > div.menuContent > div > span.title")
private List<String> title;
/**
 * Sub-titles of the left-hand navigation menu (text of the elements matched by the CSS path).
 */
@Text
@HtmlField(cssPath = "#main > div.bgfff.banner-box > div > div.menuContent > div > span.sub-title")
private List<String> subTitle;
}
7.慕课网主页数据的后续处理
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.DeriveSchedulerContext;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Pipeline for {@link Index}: inspects the navigation titles scraped from the
 * home page and, for each title equal to the back-end development entry,
 * schedules the Java course listing as a sub-request of the current request.
 *
 * @author jay
 * @date 2021/1/21 22:11
 */
@Service
public class IndexPipeline implements Pipeline<Index> {

    /**
     * Derives the follow-up request(s) from a rendered home page bean.
     *
     * @param index the populated home page bean
     */
    @Override
    public void process(Index index) {
        HttpRequest parent = index.getRequest();
        for (String heading : index.getTitle()) {
            // Only the back-end development category is followed, and only its Java listing.
            if (!"后端开发:".equals(heading)) {
                continue;
            }
            DeriveSchedulerContext.into(parent.subRequest("https://coding.imooc.com/?c=java"));
        }
    }
}
8.代码主页的抓取
import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.SpiderBean;
import lombok.Data;
import java.util.List;
/**
 * Crawler bean for a course listing page on coding.imooc.com.
 *
 * <p>Declared for URLs of the form {@code https://coding.imooc.com/?c={type}} and
 * associated with the pipeline named {@code codingIndexPipeline} through
 * {@code @Gecco}. Accessors are generated by Lombok's {@code @Data}.
 *
 * @author jay
 * @date 2021/1/21 22:11
 */
@Data
@Gecco(matchUrl="https://coding.imooc.com/?c={type}", pipelines="codingIndexPipeline")
public class CodingIndex implements SpiderBean {
// Presumably bound to the {type} placeholder of matchUrl (confirm in Gecco docs).
@RequestParameter
private String type;
// Request associated with this bean; the pipeline reads its URL to build the next-page request.
@Request
private HttpRequest request;
/**
 * Course cover: the value of the {@code style} attribute of each matched element
 * (the cover image URL is embedded in that style text).
 */
@Attr("style")
@HtmlField(cssPath = "body > div.main > div.w1430 > ul > li > a > div")
private List<String> style;
/**
 * Course titles.
 */
@Text
@HtmlField(cssPath = "body > div.main > div.w1430 > ul > li > a > p.title.ellipsis2")
private List<String> title;
/**
 * Current page number, taken from the active pager link.
 */
@Text
@HtmlField(cssPath = "body > div.main > div.w1430 > div.page > a.active")
private int currPage;
/**
 * Total number of pages; filled in by the custom field renderer registered
 * under the name {@code totalPageFieldRender}.
 */
@FieldRenderName("totalPageFieldRender")
private int totalPage;
}
9.代码主页数据的后续处理
import cn.hutool.core.util.StrUtil;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.DeriveSchedulerContext;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Pipeline for {@link CodingIndex}: prints the cover URL of every course on
 * the page and, while more pages remain, schedules the next listing page as a
 * sub-request of the current one.
 *
 * @author jay
 * @date 2021/1/21 22:11
 */
@Service
public class CodingIndexPipeline implements Pipeline<CodingIndex> {

    /**
     * Handles one rendered listing page.
     *
     * @param codingIndex the populated listing page bean
     */
    @Override
    public void process(CodingIndex codingIndex) {
        // The cover URL is the part of the style text between "//" and ")".
        for (String css : codingIndex.getStyle()) {
            System.out.println("课程封面:" + StrUtil.subBetween(css, "//", ")"));
        }

        HttpRequest request = codingIndex.getRequest();
        int currPage = codingIndex.getCurrPage();
        int nextPage = currPage + 1;
        if (nextPage > codingIndex.getTotalPage()) {
            // Past the last page: nothing further to schedule.
            return;
        }

        String currUrl = request.getUrl();
        // Either bump an existing page parameter or append one to the query string.
        String nextUrl = currUrl.contains("page=")
                ? StringUtils.replaceOnce(currUrl, "page=" + currPage, "page=" + nextPage)
                : currUrl + "&" + "page=" + nextPage;
        DeriveSchedulerContext.into(request.subRequest(nextUrl));
    }
}
10.自定义渲染字段
import cn.hutool.core.util.StrUtil;
import com.geccocrawler.gecco.annotation.FieldRenderName;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.response.HttpResponse;
import com.geccocrawler.gecco.spider.SpiderBean;
import com.geccocrawler.gecco.spider.render.CustomFieldRender;
import net.sf.cglib.beans.BeanMap;
import java.lang.reflect.Field;
/**
 * Custom field renderer, registered as {@code totalPageFieldRender}, that
 * extracts the total page count from the pager of a listing page.
 *
 * <p>The count is read from the link that sits between the "下一页" (next page)
 * and "尾页" (last page) labels: the text between the last {@code =} and the
 * last {@code "} of that fragment is parsed as an integer.
 *
 * <p>The field is left untouched (no value is put into the bean map) when:
 * <ul>
 *   <li>the response body is blank;</li>
 *   <li>the "尾页" label is rendered as a disabled span;</li>
 *   <li>the page has no "下一页 … 尾页" pager at all;</li>
 *   <li>the pager fragment does not have the expected {@code ...=N"} shape.</li>
 * </ul>
 * The last two cases previously ended in a {@link NumberFormatException}.
 *
 * @author jay
 * @date 2021/1/28 21:47
 */
@FieldRenderName("totalPageFieldRender")
public class TotalPageFieldRender implements CustomFieldRender {

    /**
     * Computes the total page count and stores it under the field's name.
     *
     * @param request  the request that produced the response (unused)
     * @param response the downloaded page; its content is scanned for the pager
     * @param beanMap  bean map that receives the value, keyed by {@code field.getName()}
     * @param bean     the bean being rendered (unused)
     * @param field    the field this renderer is attached to
     */
    @Override
    public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean, Field field) {
        String content = response.getContent();
        if (StrUtil.isBlank(content)) {
            return;
        }
        if (StrUtil.contains(content, "<span class=\"disabled_page\">尾页</span>")) {
            // "Last page" link is disabled: keep the field at its default.
            return;
        }

        // subBetween yields null when either marker is missing, e.g. a page without a pager.
        String pagerFragment = StrUtil.subBetween(content, "下一页", "尾页");
        if (pagerFragment == null) {
            return;
        }

        int quoteIdx = pagerFragment.lastIndexOf('"');
        int equalsIdx = pagerFragment.lastIndexOf('=');
        if (equalsIdx < 0 || quoteIdx <= equalsIdx + 1) {
            // No '=' followed by a non-empty value and a closing quote: unexpected markup.
            return;
        }

        try {
            int totalPage = Integer.parseInt(pagerFragment.substring(equalsIdx + 1, quoteIdx).trim());
            beanMap.put(field.getName(), totalPage);
        } catch (NumberFormatException ignored) {
            // Non-numeric value in the link: treat as "total unknown" rather than failing the whole render.
        }
    }
}