WebMagic入门实战:爬取CSDN,20行代码实现文章标题抓取
spider
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that crawls one CSDN blog and extracts article titles.
 *
 * <p>Every processed page is scanned for article ids in the blog's article list,
 * and each id is queued as a follow-up request. Pages whose URL is an article
 * detail URL additionally get their title stored in the {@code "title"} field.
 */
public class CsdnSpider implements PageProcessor {

    /** Prefix of an article detail URL; the article id is appended to it. */
    private static final String ARTICLE_URL_PREFIX = "https://blog.csdn.net/qq_36783371/article/details/";

    /** Matches article detail URLs. Compiled once instead of on every processed page. */
    private static final java.util.regex.Pattern ARTICLE_URL_PATTERN =
            java.util.regex.Pattern.compile("https://blog\\.csdn\\.net/qq_36783371/article/details/\\d+");

    /** Crawl settings: 5 retries, 5000 ms timeout, 200 ms sleep time, and a Firefox User-Agent header. */
    private final Site site = Site.me()
            .setRetryTimes(5)
            .setTimeOut(5000)
            .setSleepTime(200)
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");

    /**
     * Queues every article found on the page and, on article pages, extracts the title.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        List<String> articleIds = page.getHtml().xpath("//div[@class='article-list']/div/@data-articleid").all();
        // Iterating an empty list is a no-op, so no separate emptiness check is needed.
        for (String articleId : articleIds) {
            page.addTargetRequest(ARTICLE_URL_PREFIX + articleId);
        }

        if (ARTICLE_URL_PATTERN.matcher(page.getRequest().getUrl()).matches()) {
            // NOTE(review): the extracted value is presumably null when the XPath matches nothing - confirm.
            page.putField("title", page.getHtml().xpath("//h6[@class='title-article']/text()").toString());
            // TODO: also extract the article body as a "text" field once its XPath is known.
        } else {
            // List pages carry no fields; mark them skipped so pipelines are not invoked with empty results.
            page.setSkip(true);
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration created with this instance
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from the blog home page and the second article-list page,
     * using 5 threads and printing results through {@code CsdnPipline}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new CsdnSpider())
                .addUrl("https://blog.csdn.net/qq_36783371", "https://blog.csdn.net/qq_36783371/article/list/2?")
                .addPipeline(new CsdnPipline())
                .thread(5)
                .runAsync();
    }
}
pipeline
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that prints the {@code "title"} field of each crawl result to standard output.
 */
public class CsdnPipline implements Pipeline {

    /**
     * Prints the result's title, if it has one.
     *
     * <p>Results without a {@code "title"} field are ignored. This replaces the
     * former catch-all that silently swallowed the resulting
     * {@code NullPointerException} along with any other failure.
     *
     * @param resultItems the fields extracted from one page
     * @param task the crawl task the result belongs to (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems == null) {
            return;
        }
        Object title = resultItems.get("title");
        if (title == null) {
            // No title was stored for this page; nothing to print.
            return;
        }
        System.out.println(title.toString());
    }
}
改造下变成刷访问量
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Variant of the CSDN blog crawler that only fetches pages and stores no fields.
 *
 * <p>Each processed page is scanned for article ids, and every id is queued as a
 * follow-up request to the corresponding article detail URL.
 *
 * <p>NOTE(review): repeatedly re-fetching articles to raise their view counts may
 * violate the target site's terms of service - confirm this use is permitted.
 */
public class CsdnSpider implements PageProcessor {
    /** Crawl settings: 5 retries, 5000 ms timeout, 200 ms sleep time, and a Firefox User-Agent header. */
    Site site = Site.me()
            .setRetryTimes(5)
            .setTimeOut(5000)
            .setSleepTime(200)
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");

    /**
     * Queues a request for every article id found on the page.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        List<String> articleIds = page.getHtml()
                .xpath("//div[@class='article-list']/div/@data-articleid")
                .all();
        for (int idx = 0; idx < articleIds.size(); idx++) {
            String articleUrl = "https://blog.csdn.net/qq_36783371/article/details/" + articleIds.get(idx);
            page.addTargetRequest(articleUrl);
        }

        String currentUrl = page.getRequest().getUrl();
        boolean onArticlePage = currentUrl.matches("https://blog\\.csdn\\.net/qq_36783371/article/details/\\d+");
        if (onArticlePage) {
            // Intentionally empty: this variant stores neither "title" nor "text" for article pages.
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration held by this instance
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Launches 100 crawls, sleeping 5000 ms before each launch.
     *
     * <p>Every crawl starts from the blog home page and the second article-list
     * page with 5 threads. Each one is started with {@code runAsync()}, so the loop
     * presumably does not wait for a crawl to finish before the next sleep.
     *
     * @param args unused
     * @throws Exception if the sleep between launches is interrupted
     */
    public static void main(String[] args) throws Exception {
        int launched = 0;
        while (launched < 100) {
            Thread.sleep(5000);
            Spider crawler = Spider.create(new CsdnSpider())
                    .addUrl("https://blog.csdn.net/qq_36783371", "https://blog.csdn.net/qq_36783371/article/list/2?")
                    .thread(5);
            crawler.runAsync();
            launched++;
        }
    }
}