万事有因有果,有利有弊,犹如阴阳、男女、天地、南北,对立统一。
《六祖坛经》云:
言语道断,心行处灭。无念念即正,有念念成邪;有无俱不计,长御白牛车。
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
/**
 * Crawls a half-open range of listing pages, {@code [from, to)}, and prints the text of
 * every entry found on them to standard output.
 *
 * <p>A range spanning fewer than {@link #SPLIT_THRESHOLD} pages is fetched sequentially on
 * the calling thread through {@code CinemaMain.getElement} (jsoup). A larger range is cut
 * in half and each half is submitted to the shared executor as a new {@code CinemaCrawer};
 * the parent task then blocks until both halves finish.
 *
 * <p>NOTE(review): because a parent task blocks on its children inside the same executor,
 * the executor presumably needs to be able to grow (e.g. a cached pool) — confirm before
 * passing in a bounded pool.
 *
 * @author xuezhiyan
 * @date 2018/9/12
 */
public class CinemaCrawer implements Callable<Integer> {

    /** Ranges with fewer pages than this are crawled directly instead of being split. */
    private static final int SPLIT_THRESHOLD = 10;

    private final String url;
    private final Integer from;
    private final Integer to;
    private final ExecutorService pool;

    /**
     * @param url  base URL of the listing; page 1 is served from it directly
     * @param from first page number to crawl (inclusive)
     * @param to   page number at which to stop (exclusive)
     * @param pool executor that receives the sub-range tasks when the range is split
     */
    public CinemaCrawer(String url, Integer from, Integer to, ExecutorService pool) {
        this.url = url;
        this.from = from;
        this.to = to;
        this.pool = pool;
    }

    /**
     * Crawls the configured range, splitting it across the executor when it is large.
     *
     * @return always {@code null}; the result exists only to satisfy {@link Callable}
     * @throws Exception if fetching a page fails or a sub-task fails or is interrupted
     */
    @Override
    public Integer call() throws Exception {
        int start = from;
        int end = to;
        if (end - start < SPLIT_THRESHOLD) {
            for (int page = start; page < end; page++) {
                crawlPage(page);
            }
        } else {
            // Overflow-safe midpoint; the halves are [start, mid) and [mid, end).
            int mid = start + (end - start) / 2;
            Future<Integer> lower = pool.submit(new CinemaCrawer(url, start, mid, pool));
            Future<Integer> upper = pool.submit(new CinemaCrawer(url, mid, end, pool));
            lower.get();
            upper.get();
        }
        return null;
    }

    /** Fetches one listing page and prints the text of each child of every entry in it. */
    private void crawlPage(int page) throws java.io.IOException {
        // Page 1 (or lower) is the base URL itself; later pages are "index<N>.html" under it.
        String pageUrl = page > 1 ? url + "index" + page + ".html" : url;
        System.out.println("第"+page+"页的内容如下:");
        Document document = CinemaMain.getElement(pageUrl, CinemaMain.tryCount);
        // getElement may return null, and a page may lack the "contents" container;
        // skip such a page rather than letting an NPE abort the whole crawl.
        Element contents = document == null ? null : document.getElementById("contents");
        if (contents == null) {
            System.out.println("第" + page + "页未找到内容,已跳过");
            return;
        }
        for (Element entry : contents.children()) {
            System.out.println();
            for (Element field : entry.children()) {
                System.out.println(field.text());
            }
        }
    }
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.concurrent.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry point of the crawler: reads the total page count from the listing's first page,
 * crawls every page through {@code CinemaCrawer} on a cached thread pool, and prints the
 * elapsed time in milliseconds.
 *
 * @author xuezhiyan
 * @date 2018/9/12
 */
public class CinemaMain {
    private static final String url="http://aqdybq.com/lusi/";
    public static final Integer tryCount=5;

    /** Extracts the number between "共" and "页" from the pager text; compiled once. */
    private static final Pattern PAGE_COUNT_PATTERN = Pattern.compile("(\\D*)共(\\d+)(.*)页(\\D*)");

    /**
     * Runs the crawl over pages {@code 1..count} and reports the total elapsed time.
     *
     * @param args unused
     * @throws IOException if the first page cannot be fetched while reading the page count
     */
    public static void main(String[] args) throws IOException {
        long startTime = System.currentTimeMillis();
        Integer count = getUrl(url, tryCount);
        System.out.println("需要下载的数据总共为"+count+"页");
        ExecutorService pool = Executors.newCachedThreadPool();
        try {
            // CinemaCrawer iterates "i < to", i.e. its upper bound is exclusive, so
            // count + 1 is required for the last page to be crawled.
            Future<Integer> result = pool.submit(new CinemaCrawer(url, 1, count + 1, pool));
            result.get();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        } finally {
            pool.shutdown();
        }
        System.out.println("总共耗时:"+(System.currentTimeMillis()-startTime));
    }

    /**
     * Reads the total number of pages from the pager element of the given page.
     *
     * <p>If the pager element is absent, the page is fetched again with {@code tryCount - 1};
     * once the counter is exhausted {@link #getElement} returns {@code null} and the result
     * is 0.
     *
     * @param url      page to inspect
     * @param tryCount remaining attempts at finding the pager element
     * @return the page count parsed from the pager text, or 0 if it could not be determined
     * @throws IOException if fetching the page fails
     */
    public static Integer getUrl(String url, Integer tryCount) throws IOException {
        Document document = getElement(url, tryCount);
        if (document == null) {
            return 0;
        }
        // NOTE(review): three space-separated names are passed as a single class name;
        // confirm this matches the pager element as intended.
        Elements pagers = document.body().getElementsByClass("pages short-page fn-right");
        if (pagers.isEmpty()) {
            return getUrl(url, tryCount - 1);
        }
        String key = pagers.get(0).text();
        System.out.println(key);
        Matcher matcher = PAGE_COUNT_PATTERN.matcher(key);
        if (matcher.find()) {
            return Integer.valueOf(matcher.group(2));
        }
        return 0;
    }

    /**
     * Fetches and parses a page with jsoup.
     *
     * <p>{@code tryCount} only acts as a guard here: a non-positive value prints a
     * network-check message and yields {@code null}. No retry is performed inside this
     * method; an {@link IOException} from the single GET propagates to the caller.
     *
     * @param url      address to fetch
     * @param tryCount remaining attempts; the request is made only when this is positive
     * @return the parsed document, or {@code null} when {@code tryCount <= 0}
     * @throws IOException if the HTTP request fails
     */
    public static Document getElement(String url, Integer tryCount) throws IOException {
        if (tryCount <= 0) {
            System.out.println("请检查网络链接是否正常!");
            return null;
        }
        return Jsoup.connect(url)
                .ignoreContentType(true)
                .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                .timeout(5000)
                .get();
    }
}