【SpringBoot学习】22、Webmagic 爬虫爬取网络资源
1、采用 webmagic
2、集成 webmagic
在 pom.xml 中添加以下两个依赖即可完成集成,就是这么简单
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
3、爬取 CSDN 案例
写一个普通 main 方法,实现 PageProcessor 接口即可
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * CSDN article crawler.
 *
 * <p>Implements {@link PageProcessor} to extract each article's category,
 * title, description, publish time, read count and link from a CSDN blog
 * home page, printing the results to stdout via {@code System.out}.
 *
 * <p>NOTE(review): class name is misspelled ("Processer" vs "Processor");
 * kept as-is since renaming would break existing references/file name.
 *
 * @author Tellsea
 * @date 2021/10/31
 */
public class CsdnPageProcesser implements PageProcessor {

    /** Crawler configuration: 3 retries, 1s delay between requests, gzip enabled. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);

    /**
     * Extracts article metadata from the downloaded page and prints it.
     *
     * @param page the downloaded page supplied by the WebMagic engine
     */
    @Override
    public void process(Page page) {
        List<String> typeList = page.getHtml().xpath("//div[@class='article-item-box']/h4/a/span/text()").all();
        List<String> titleList = page.getHtml().xpath("//div[@class='article-item-box']/h4/a/text()").all();
        List<String> descList = page.getHtml().xpath("//div[@class='article-item-box']/p[@class='content']/text()").all();
        List<String> timeList = page.getHtml().xpath("//div[@class='article-item-box']/div[@class='info-box']/p/span[@class='date']/text()").all();
        List<String> readList = page.getHtml().xpath("//div[@class='article-item-box']/div[@class='info-box']/p/span[@class='read-num']/text()").all();
        List<String> linkList = page.getHtml().css("div.article-item-box h4 a", "href").all();
        // The six selectors are evaluated independently; if any article is
        // missing a field (e.g. no read-count span), the lists have different
        // lengths and indexing by typeList.size() alone would throw
        // IndexOutOfBoundsException. Clamp to the shortest list instead.
        int count = Math.min(typeList.size(),
                Math.min(titleList.size(),
                Math.min(descList.size(),
                Math.min(timeList.size(),
                Math.min(readList.size(), linkList.size())))));
        System.out.println("----------------------------------------------- 爬虫开始");
        for (int i = 0; i < count; i++) {
            System.out.println("------------------------ 第 " + (i + 1) + " 篇文章");
            System.out.println("类型:" + typeList.get(i).trim());
            System.out.println("标题:" + titleList.get(i).trim());
            System.out.println("描述:" + descList.get(i).trim());
            System.out.println("时间:" + timeList.get(i).trim());
            System.out.println("阅读:" + readList.get(i).trim());
            System.out.println("链接:" + linkList.get(i).trim());
        }
        System.out.println("----------------------------------------------- 爬虫结束");
    }

    /** @return the crawler configuration used by the WebMagic engine */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Entry point: crawls the author's CSDN blog home page and also dumps
     * raw results through {@link ConsolePipeline}.
     */
    public static void main(String[] args) {
        Spider.create(new CsdnPageProcesser())
                .addUrl("https://blog.csdn.net/qq_38762237")
                .addPipeline(new ConsolePipeline()).run();
    }
}
爬取效果图