基于Webmagic的Java爬虫(一)
- 配置Webmagic(基于Maven)
在创建的Maven项目的 pom.xml 文件中添加如下依赖:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
- 跑例子
package ang.one;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class GithubRepoPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
private static int count =0;
public Site getSite() {
return site;
}
public void process(Page page) {
if(!page.getUrl().regex("http://www.cnblogs.com/[a-z 0-9 -]+/p/[0-9]{7}.html").match()){
page.addTargetRequests(
page.getHtml().xpath("//*[@id=\"post_list\"]/div/div[@class='post_item_body']/h3/a/@href").all());
}
System.out.println("抓取的内容:"+
page.getHtml().xpath("//*[@id=\"Header1_HeaderTitle\"]/text()").get()
);
count ++;
}
public static void main(String[] args) {
long startTime, endTime;
System.out.println("开始爬取...");
startTime = System.currentTimeMillis();
Spider.create(new GithubRepoPageProcessor()).addUrl("https://www.cnblogs.com/").thread(5).run();
endTime = System.currentTimeMillis();
System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录");
}
}
- 基本流程
①实现 PageProcessor 接口
②设置抓取网站的相关配置: Site
③编写抽取逻辑:Process{
page.getUrl();
page.addTargetRequests();
page.putField();
}
④启动爬虫
Spider.create(new GithubRepoPageProcessor()).addUrl(“xxx”).thread(int).run();