- 使用框架:webmagic
- 项目核心:建立DoubanProcessor类,实现webmagic的PageProcessor接口,重写process()方法
- 爬取网址:https://movie.douban.com/subject/26985127/comments?start=0&limit=20&sort=new_score&status=P
代码详情:
package MyWebMagic;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class DoubanProcessor implements PageProcessor {
// Crawl configuration for the target site (the kind of settings that cover encoding, crawl interval, retry count, etc.).
// Here only the retry times (3) and the sleep time (100) are set.
// NOTE(review): sleep time is presumably in milliseconds — confirm against WebMagic's Site documentation.
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
// Class-wide counter, initialised to 0; being static it is shared by every DoubanProcessor instance.
// NOTE(review): presumably tallies processed pages or comments — its use is outside this view, confirm.
private static int count =0;
/**
 * Returns the crawl configuration held by this processor.
 *
 * @return the {@link Site} instance stored in the {@code site} field
 */
@Override
public Site getSite() {
    return this.site;
}
@Override