前言
- 自己主要做iOS,但也想利用业余时间学一点后台开发。之前用过MFC、C#,写过一些嵌入式辅助小工具,现在入坑转做iOS。所以想向全栈方向发展,就在业余时间看看Spring Boot相关的内容。先上一个在Spring Boot配置下的简单爬虫,主要框架是WebMagic。本爬虫系列预计会列出3个以上不同站点的处理方法。
注意
-
常规网页的展示部分一般分为列表页和详情页。niconico站的大部分页面都是由后端直接输出完整的HTML,所以主要的解析方法是用XPath定位标签进行处理。
-
爬虫存在较多不确定性,所以需要做容错处理,特别是用XPath解析HTML时,不然会导致整个爬虫退出。
-
本人的springboot 是采用多线程定时器,分别定时跑爬虫数据和下载爬虫资源,以后会在这里慢慢列出解决方法。
-
niconico为日本网站需要翻墙,笔者在树莓派上,用上级路由作翻墙处理。
-
SpiderNicoService负责数据的入库处理,SslDownloader负责WebMagic的HTTPS请求处理。
pom.xml配置
<!-- WebMagic core crawler library used by the processor and pipeline shown in this post. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<!-- Alternative version kept for reference; 0.5.2 below is the one actually in effect. -->
<!--<version>0.7.2</version>-->
<version>0.5.2</version>
<exclusions>
<!-- Drop the transitive slf4j-log4j12 binding. NOTE(review): presumably to avoid a clash
     with the SLF4J binding already supplied by Spring Boot; confirm against the build. -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
process
/**
 * Created by ray on 2017/7/16.
 *
 * <p>WebMagic page processor for niconico tag-listing pages. Each list page is split into its
 * {@code <li class="item">} entries, every entry is converted into a {@link SpiderNico}, and
 * the resulting list is published to the pipeline under the result fields {@code "type"}
 * (value {@code 0}) and {@code "data"}.
 *
 * <p>Parsing is deliberately fault tolerant: a failure while parsing one entry is logged and
 * only that entry is skipped; a failure anywhere else in a page is logged and the page yields
 * no result. Neither case propagates out of {@link #process(Page)}.
 */
@Service
public class NicoNicoProcessor implements PageProcessor {

    /** Value of the {@code "type"} result field that marks a batch of list-page entries. */
    private static final int TYPE_LIST = 0;

    /** Creation date used when a list entry carries no timestamp. */
    private static final String DEFAULT_CREATE_TIME = "0000-00-00";

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /** Tag-listing URL prefix; the page number is appended to it. */
    private final String bashUrl = "http://www.nicovideo.jp/tag/%E8%B8%8A%E3%81%A3%E3%81%A6%E3%81%BF%E3%81%9F?page=";

    /** Detail (watch) page URL prefix, e.g. http://www.nicovideo.jp/watch/sm23385186 */
    private final String detailUrl = "http://www.nicovideo.jp/watch/";

    /** Upper bound (exclusive) of the list-page numbers seeded by {@link #run()}. */
    @Value("${spider.niconico.maxSize}")
    int maxSize;

    @Autowired
    NicoNicoPipeLine pipeLine;

    /**
     * Builds the crawl settings: desktop Chrome user agent, 10 s pause between requests,
     * 20 s timeout, 3 retries and 3 cycle retries.
     *
     * @return the site configuration used for every request
     */
    @Override
    public Site getSite() {
        // A local HTTP proxy can be plugged in here with setHttpProxy(new HttpHost(host, port)).
        return Site.me()
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(10 * 1000)
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
    }

    /**
     * Handles one downloaded page. List pages are parsed into {@link SpiderNico} entries and
     * published as result fields; detail pages are only logged.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        try {
            if (pageUrl.contains(bashUrl)) {
                List<SpiderNico> resList = parseListPage(page, pageUrl);
                // Only publish when something was parsed, so the pipeline sees empty
                // result items for pages that produced nothing.
                if (!resList.isEmpty()) {
                    page.putField("type", TYPE_LIST);
                    page.putField("data", resList);
                }
            } else if (pageUrl.contains(detailUrl)) {
                // Detail pages are not parsed yet.
                logger.info(pageUrl);
            }
        } catch (Exception e) {
            // Boundary catch: a broken page must not terminate the crawler.
            logger.error("url:" + pageUrl, e);
        }
    }

    /**
     * Seeds the list pages {@code 1 .. maxSize - 1}, then runs the spider on the calling thread
     * until it finishes.
     */
    public void run() {
        // Reuse this Spring-managed bean rather than constructing an un-injected copy.
        Spider spider = Spider.create(this)
                .setDownloader(new SslDownloader())
                .addPipeline(pipeLine);
        // NOTE(review): the upper bound is exclusive, so page number maxSize itself is never
        // requested; confirm that this is the intended meaning of spider.niconico.maxSize.
        for (int i = 1; i < maxSize; i++) {
            spider.addUrl(bashUrl + i);
        }
        spider.run();
    }

    /**
     * Parses every entry of a list page. Entries that fail to parse are logged and skipped.
     * Detail-page URLs are not scheduled as follow-up requests here.
     */
    private List<SpiderNico> parseListPage(Page page, String pageUrl) {
        List<SpiderNico> resList = new ArrayList<>();
        List<String> htmlList = page.getHtml().xpath("//div/ul[@class='list']/li[@class='item']").all();
        for (String tmp : htmlList) {
            if (tmp.length() < 5 || !tmp.contains("data-id")) {
                continue;
            }
            try {
                resList.add(parseItem(new Html(tmp)));
            } catch (Exception e) {
                // Keep the stack trace: without it a selector change is impossible to diagnose.
                logger.error("nico xpath:" + pageUrl, e);
            }
        }
        return resList;
    }

    /**
     * Converts one list entry into a {@link SpiderNico}. Missing counters fall back to 0 and a
     * missing timestamp falls back to {@link #DEFAULT_CREATE_TIME}.
     *
     * @throws IllegalStateException if the entry has no {@code data-id}
     * @throws NumberFormatException if a counter is present but not an integer
     */
    private SpiderNico parseItem(Html html) {
        String dataId = html.xpath("//li[@class='item']/@data-id").toString();
        if (dataId == null || dataId.isEmpty()) {
            throw new IllegalStateException("list item has no data-id");
        }
        String title = html.xpath("//p[@class='itemTitle']/a/text()").toString();
        // Cover image.
        String icon = html.xpath("//img[@class='jsLazyImage thumb']/@data-original").toString();
        // View count and comment count.
        Integer view = parseCount(html.xpath("//ul[@class='list']/li[@class='count view']/span/text()").toString());
        Integer comment = parseCount(html.xpath("//ul[@class='list']/li[@class='count comment']/span/text()").toString());
        // Tooltip text of the entry, stored as the description with commas removed.
        String wrapTitle = html.xpath("//div[@class='wrap']/p/@title").toString();
        if (wrapTitle != null) {
            wrapTitle = wrapTitle.replace(",", "");
        }
        // The page shows a two-digit year with '/' separators; expand it to yyyy-MM-dd... form.
        String rawTime = html.xpath("//p[@class='itemTime']/span/span/text()").toString();
        String createTime = rawTime == null ? DEFAULT_CREATE_TIME : "20" + rawTime.replace("/", "-");

        SpiderNico nico = new SpiderNico();
        nico.setAid(dataId);
        nico.setTitle(title);
        nico.setDescription(wrapTitle);
        nico.setCreate(createTime);
        nico.setComment(comment);
        nico.setPic(icon);
        nico.setView(view);
        return nico;
    }

    /** Parses a counter such as {@code "1,234"}; a missing or blank value counts as 0. */
    private static Integer parseCount(String raw) {
        if (raw == null) {
            return 0;
        }
        String digits = raw.replace(",", "").trim();
        return digits.isEmpty() ? Integer.valueOf(0) : Integer.valueOf(digits);
    }
}
pipeline
/**
 * Created by ray on 2017/7/16.
 *
 * <p>WebMagic pipeline for the niconico crawler. It reads the {@code "type"} and {@code "data"}
 * result fields and hands every {@link SpiderNico} of a list-page batch to
 * {@link SpiderNicoService#updateBySpider}.
 */
@Service
public class NicoNicoPipeLine implements Pipeline {

    /** {@code "type"} value for a batch of list-page entries. */
    private static final int TYPE_LIST = 0;

    /** {@code "type"} value reserved for detail-page results; nothing is done with it yet. */
    private static final int TYPE_DETAIL = 1;

    @Autowired
    SpiderNicoService service;

    /**
     * Stores the entries carried by one page's result items. Result items that are empty,
     * have no {@code "type"}, or have no {@code "data"} are ignored instead of failing with a
     * {@link NullPointerException}.
     *
     * @param resultItems fields published for one page
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.getAll().isEmpty()) {
            return;
        }
        // Keep the boxed type: unboxing straight into an int would throw when the key is absent.
        Integer type = resultItems.get("type");
        if (type == null) {
            return;
        }
        if (type == TYPE_LIST) {
            List<SpiderNico> list = resultItems.get("data");
            if (list == null) {
                return;
            }
            for (SpiderNico obj : list) {
                service.updateBySpider(obj);
            }
        }
        // TYPE_DETAIL: intentionally no handling yet.
    }
}
[原文](http://raychow.linkfun.top/2017/12/01/archives/9_javaSpring/spriderNicoNico/index/)