package com;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
import java.util.Scanner;
public class MyBlogMagic implements PageProcessor {
//文章的总页数
static String state = "0";
部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
private Site site = Site.me().setRetryTimes(3).setSleepTime(1);
;
//这里通过page.addTargetRequests()方法来增加要抓取的URL
// 这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。
public void process(Page page) {
//用于获取所有满足这个正则表达式的链接
List links = page.getHtml().links().regex("http://bolg\\.\\.net/article/list/\\d+").all();
//将这些链接加入到待抓取的队列中去
page.addTargetRequests(links);
//相同元素的结果加到相应的集合中去。
List titlelist = page.getHtml().xpath("//span[@class='link_title']/a/text()").all();
List readlist = page.getHtml().xpath("//span[@class='link_view']/text()").all();
List pinlunlist = page.getHtml().xpath("//span[@class='link_comments']/text()").all();
if (!state.equals("0")) {
for (int i = 0; i < titlelist.size(); i++) {
if (i == 0) {
System.out.println("-------------------------------------------------");
System.out.println("------------------------------------------------");
}
System.out.println("题目:" + titlelist.get(i));
System.out.println("阅读人数:" + readlist.get(i).replace("(", "").replace(")", ""));
System.out.println("评论次数:" + pinlunlist.get(i).replace("(", "").replace(")", ""));
if (i != titlelist.size() - 1) {
System.out.println("********************************************\n");
}
}
}
String pagelist = page.getHtml().xpath("//div[@class='pagelist']/span/text()").get();
if (pagelist != null) {
state = pagelist.substring(pagelist.indexOf("共") + 1, pagelist.indexOf("页"));
} else {
state = "true";
}
}
public Site getSite() {
return site;
}
public static void main(String[] args) {
Scanner scanner1 = new Scanner(System.in);
System.out.println("++++++++++++++++++++++++++++++++");
System.out.println("+ +");
System.out.println("+请输入您的博客名字,按回车确认:+");
System.out.println("+ +");
System.out.println("++++++++++++++++++++++++++++++++");
String name = scanner1.next();
Spider
.create(new MyBlogMagic())
//从这个地址开始抓
.addUrl("http://blog..net/" + name + "/article/list")
.run();
if (state.equals("true")) {
Spider
.create(new MyBlogMagic())
//从这个地址开始抓
.addUrl("http://blog..net/" + name)
.run();
} else while (!state.equals("0")) {
try {
Scanner scanner = new Scanner(System.in);
System.out.println("********************************************************");
System.out.println("**总页数为 " + state + " 页 **");
System.out.println("**请输入您要看的页码,按回车确认: **");
System.out.println("**输入大于" + state + "的数字可查看全部文章列表 **");
System.out.println("********************************************************");
String str = scanner.next();
int a = Integer.valueOf(str);
Spider
.create(new MyBlogMagic())
//从这个地址开始抓
.addUrl("http://blog..net/" + name + "/article/list/" + str)
.run();
} catch (Exception e) {
new Thread(new Runnable() {
@Override
public void run() {
String str="不要输入非数字啦!!人家都要奔溃了(>﹏
try {
for(int i=0;i
Thread.sleep(100);
System.out.print(str.charAt(i));
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}).start();
}
}
}
}