使用WebMagic编写一个网络爬虫
引入依赖
<!-- WebMagic core artifact (us.codecraft:webmagic-core), pinned to 0.7.4. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<!-- WebMagic extension artifact (us.codecraft:webmagic-extension), pinned to the same 0.7.4 version as webmagic-core. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
编写测试类
/**
 * WebMagic page processor that extracts three fields ("title", "time" and "name")
 * from a downloaded page and skips the page when the "name" node is missing.
 *
 * <p>{@link #main(String[])} crawls the numbered pages one at a time, creating a
 * separate spider and a separate {@code TestPipeline} for every page number.
 */
public class NewPage implements PageProcessor {

    /** First page number that is crawled (inclusive). */
    private static final int FIRST_PAGE = 1000;

    /** Page number at which crawling stops (exclusive). */
    private static final int END_PAGE = 4895;

    /** Part of the page URL that precedes the page number. */
    private static final String URL_PREFIX = "https://acm.taifua.com/bzoj/p/";

    /** Part of the page URL that follows the page number. */
    private static final String URL_SUFFIX = ".html";

    /** Site configuration; only the User-Agent header is customised. */
    private Site site = Site.me().setUserAgent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");

    /**
     * Extracts the fields of interest from the page and publishes them as result fields.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        String heading = page.getHtml().xpath("//h1[@class='content-heading']/text()").get();
        String time = page.getHtml().xpath("//div[@class='container']/p/text()").get();
        String card = page.getHtml().xpath("//div[@class='card-inner']").get();

        // Fields are published in the order title, time, name.
        page.putField("title", heading);
        page.putField("time", time);
        page.putField("name", card);

        // A page without the card node has nothing worth persisting.
        if (card == null) {
            page.setSkip(true);
        }
    }

    /**
     * Returns the site configuration used for this crawl.
     *
     * @return the site settings holding the custom User-Agent
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls every page number from 1000 up to and including 4894, one spider per page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int pageNumber = FIRST_PAGE;
        while (pageNumber < END_PAGE) {
            String url = URL_PREFIX + pageNumber + URL_SUFFIX;
            // The pipeline receives the page number so it can use it in the output file name.
            Spider.create(new NewPage())
                    .addUrl(url)
                    .addPipeline(new TestPipeline(pageNumber))
                    .run();
            pageNumber++;
        }
    }
}
编写持久化类
/**
 * WebMagic pipeline that saves the extracted fields of a page as a JSON file.
 *
 * <p>The file is written to {@code D:\text\} and is named
 * {@code <index><title>.json}, where {@code index} is the number given to this
 * pipeline and {@code title} is the "title" result field with characters that
 * are not allowed in file names replaced by {@code _}.
 */
public class TestPipeline implements Pipeline {

    /** Directory (with trailing separator) that receives the JSON files. */
    private static final String OUTPUT_DIR = "D:\\text\\";

    /** Regex matching characters that Windows rejects in file names, plus control characters. */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|\\p{Cntrl}]";

    /** Number used as the leading part of the output file name. */
    private int index;

    /**
     * Creates a pipeline whose output file names start with the given index.
     *
     * @param index number to prefix the output file name with
     */
    public TestPipeline(int index) {
        // Assign directly: calling the overridable setter from a constructor is unsafe for subclasses.
        this.index = index;
    }

    /**
     * Returns the number used as the file-name prefix.
     *
     * @return the current index
     */
    public int getIndex() {
        return index;
    }

    /**
     * Changes the number used as the file-name prefix.
     *
     * @param index the new index
     * @return this pipeline, to allow call chaining
     */
    public TestPipeline setIndex(int index) {
        this.index = index;
        return this;
    }

    /**
     * Serialises all result fields to JSON and writes them to the output file as UTF-8.
     *
     * <p>Persistence is best-effort: a failure is reported on standard error and
     * does not propagate to the caller.
     *
     * @param resultItems the extracted fields of the page
     * @param task the task that produced the result (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String title = resultItems.get("title");
        String path = OUTPUT_DIR + this.getIndex() + toFileNamePart(title) + ".json";

        // try-with-resources closes the writer even when serialisation or writing fails;
        // the explicit charset keeps the JSON independent of the platform default encoding.
        try (PrintWriter writer = new PrintWriter(path, "UTF-8")) {
            writer.write(JSON.toJSONString(resultItems.getAll()));
            // PrintWriter never throws on write errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                System.err.println("Failed to write crawl result to " + path);
            }
        } catch (IOException e) {
            System.err.println("Could not save crawl result to " + path + ": " + e);
        }
    }

    /**
     * Turns a scraped title into text that is safe to embed in a file name.
     *
     * @param title the scraped title, possibly {@code null}
     * @return the title with illegal file-name characters replaced by {@code _},
     *         or an empty string when the title is {@code null}
     */
    private static String toFileNamePart(String title) {
        if (title == null) {
            return "";
        }
        return title.replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
    }
}
结果