webmagic抓取分页数据:
pom文件:
<!-- webmagic core package -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- webmagic extension package -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
public interface PagedModel {
public String getPageKey();
public Collection<String> getOtherPages();
public String getPage();
public PagedModel combine(PagedModel pagedModel);
}
抓取测试类
/**
 * Page model for a paginated news.163.com article.
 *
 * <p>Each instance holds the fields extracted from one page of an article.
 * Pages of the same article share a {@link #getPageKey() page key} and can be
 * merged with {@link #combine(PagedModel)}.
 *
 * <p>NOTE(review): the {@code @TargetUrl} value appears to use webmagic's
 * simplified URL pattern syntax rather than a plain Java regex — confirm
 * before editing it.
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel, AfterExtractor {

    /**
     * Article id taken from the URL, without the {@code _N} page suffix.
     *
     * <p>The capture group stops at the first underscore: {@code \w} also
     * matches {@code _}, so capturing {@code \w+} would include the page
     * suffix and give every page of one article a different key.
     */
    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    /** Page number taken from the {@code _N} URL suffix; absent on URLs without a suffix. */
    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    /** Page numbers collected from the pagination links; filled in by {@link #afterProcess(Page)}. */
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    /**
     * Returns the article id shared by all pages of this article.
     *
     * @return the page key extracted from the URL
     */
    @Override
    public String getPageKey() {
        return pageKey;
    }

    /**
     * Returns the page numbers found in the pagination links.
     *
     * @return the collected page numbers, or {@code null} if
     *         {@link #afterProcess(Page)} has not run on this instance
     */
    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    /**
     * Returns this page's number.
     *
     * @return the number from the URL suffix, or {@code "1"} when the URL has no suffix
     */
    @Override
    public String getPage() {
        // A URL without the "_N" suffix leaves the field unset; report it as page 1.
        if (page == null) {
            return "1";
        }
        return page;
    }

    /**
     * Merges this page with another page of the same article.
     *
     * <p>The result keeps this page's key and title and holds this page's
     * content followed by the other page's content.
     *
     * @param pagedModel the page to append; must be a {@code News163}
     * @return a new {@code News163} holding the merged content
     * @throws ClassCastException if {@code pagedModel} is not a {@code News163}
     */
    @Override
    public PagedModel combine(PagedModel pagedModel) {
        News163 other = (News163) pagedModel;
        News163 merged = new News163();
        // Carry the identifying fields over so the merged article is not anonymous.
        merged.pageKey = this.pageKey;
        merged.title = this.title;
        merged.content = this.content + other.content;
        return merged;
    }

    @Override
    public String toString() {
        return "News163{" +
                "content='" + content + '\'' +
                ", title='" + title + '\'' +
                ", otherPage=" + otherPage +
                '}';
    }

    /**
     * Crawls a sample multi-page article and prints the result to the console.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // PagedPipeline is not declared in this file; it must be provided elsewhere in the project.
        OOSpider.create(Site.me(), News163.class)
                .addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
                .clearPipeline()
                .addPipeline(new PagedPipeline())
                .addPipeline(new ConsolePipeline())
                .run();
    }

    /**
     * Collects the page numbers of the other pages from the pagination links.
     *
     * @param page the page that was just extracted
     */
    @Override
    public void afterProcess(Page page) {
        Selectable pageLinks = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
        // The single capture group keeps only the numeric page suffix of each link.
        otherPage = pageLinks.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
    }
}