学习文档地址:
http://webmagic.io/docs/zh/
一个简单的爬虫实例代码
- 导入依赖
<!-- WebMagic crawler framework: core engine + extension utilities -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
-
AnswerController类
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST entry point that triggers the crawler.
 *
 * <p>Hitting {@code /startSpider} launches the spider (asynchronously, via
 * {@link AnswerSpider#start()}) and immediately replies with "ok".
 */
@RestController
public class AnswerController {

    @Autowired
    private AnswerSpider answerSpider;

    /**
     * Starts the crawl.
     *
     * @return the literal string "ok" once the spider has been kicked off
     */
    @RequestMapping("/startSpider")
    public String startSpider() {
        answerSpider.start();
        return "ok";
    }
}
-
自定义PageProcessor
import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import java.util.List; @Component public class AnswerPageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { Html html = page.getHtml(); String title = html.xpath("//title/text()").toString(); //判断页面是否是首页 if (title.contains("首页")) { //将首页所有的问题专区链接放入队列 List<Selectable> nodes = html.xpath("//div[@class='media']").nodes(); for (Selectable node : nodes) { String href = node.xpath("//a[1]/@href").toString(); page.addTargetRequest("http://28.0.000.00:0000" + href); } //获取下一页列表,将链接放入队列 List<Selectable> nextNodes = html.xpath("//ul[@class='pagination']/li").nodes(); Selectable nextSel = nextNodes.get(nextNodes.size() - 2); String next = nextSel.xpath("//a[1]/@href").toString(); page.addTargetRequest("http://28.0.000.00:0000" + next); page.getResultItems().setSkip(true); } else { //执行问题页面分析逻辑 String question = html.xpath("//div[@class='col-lg-9']/div[1]/h4/span/text()").toString(); String answer = html.xpath("//div[@id='question-view']/textarea/text()").toString(); ChatInfo chatInfo = new ChatInfo(); chatInfo.setKey(question); List<Selectable> mediaBodys = html.xpath("//div[@class='media-body']").nodes(); if (mediaBodys.size() != 0) { for (Selectable mediaBody : mediaBodys) { String body = mediaBody.xpath("//h4/div[1]/text()").toString(); answer = answer + body; } } chatInfo.setWord(answer); //将数据存入resultItems page.putField("chatInfo",chatInfo); } } @Override public Site getSite() { return site; } }
- 自定义Pipeline
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; @Component //自定义Pipeline,将数据持久化数据库 public class AnswerPipeline implements Pipeline { @Autowired private ChatMapper chatMapper; @Override public void process(ResultItems resultItems, Task task) { //使用存入时的key取出 ChatInfo chatInfo = resultItems.get("chatInfo"); chatMapper.insert(chatInfo.getKey(),chatInfo.getWord()); } }
-
Spider是爬虫启动的入口
@Component public class AnswerSpider { @Autowired private AnswerPageProcessor answerPageProcessor; @Autowired private AnswerPipeline answerPipeline; public void start(){ Spider.create(answerPageProcessor) .addUrl("http://28.0.000.00:0000/index") //添加自定义pipeline,将数据保存数据库 //不添加时,使用默认pipeline,会将结果打印在控制台 .addPipeline(answerPipeline) .thread(1) .start(); } }
Spider 的详细配置请参阅上面的学习文档。