兴趣与行动 简单的爬虫

 万事有因有果,有利有弊,犹如阴阳,男女,天地,南北,对立统一。
 六祖坛经云:
  言语道断,心行处灭。无念念即正,有念念成邪,有无俱不计,常御白牛车。

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;

/**
 * Divide-and-conquer crawler task that prints the text of the listing pages in the
 * half-open page range {@code [from, to)}.
 *
 * <p>Ranges shorter than {@link #SPLIT_THRESHOLD} pages are fetched sequentially with jsoup
 * (through {@code CinemaMain.getElement}); larger ranges are split in half and each half is
 * submitted to the shared executor as a new {@code CinemaCrawer}.
 *
 * <p>A parent task blocks until both of its children finish, so the executor must be able to
 * run child tasks while parents are waiting (for example an unbounded cached pool).
 *
 * @author xuezhiyan
 * @date 2018/9/12
 */
public class CinemaCrawer implements Callable<Integer> {
    /** Ranges with fewer pages than this are crawled directly instead of being split. */
    private static final int SPLIT_THRESHOLD = 10;

    private final String url;
    private final int from;
    private final int to;
    private final ExecutorService pool;

    /**
     * @param url  base URL of the listing; page 1 is the base URL itself, page {@code i > 1}
     *             is {@code url + "index" + i + ".html"}
     * @param from first page to crawl (inclusive)
     * @param to   page at which to stop (exclusive)
     * @param pool executor used to run the sub-range tasks
     */
    public CinemaCrawer(String url, Integer from, Integer to,ExecutorService pool) {
        this.url = url;
        this.from = from;
        this.to = to;
        this.pool = pool;
    }

    /**
     * Crawls the configured range, splitting it into two sub-tasks when it is too large.
     *
     * @return always {@code null}; the task is run only for its printing side effect
     * @throws Exception if fetching a page fails or a sub-task fails or is interrupted
     */
    @Override
    public Integer call() throws Exception {
        if (to - from < SPLIT_THRESHOLD) {
            for (int page = from; page < to; page++) {
                crawlPage(page);
            }
            return null;
        }

        // Split [from, to) into [from, mid) and [mid, to) and crawl both halves concurrently.
        int mid = from + (to - from) / 2;
        Future<Integer> lowerHalf = pool.submit(new CinemaCrawer(url, from, mid, pool));
        Future<Integer> upperHalf = pool.submit(new CinemaCrawer(url, mid, to, pool));
        lowerHalf.get();
        upperHalf.get();
        return null;
    }

    /** Fetches one listing page and prints the text of every child of each entry in "contents". */
    private void crawlPage(int page) throws java.io.IOException {
        String pageUrl = page > 1 ? url + "index" + page + ".html" : url;
        System.out.println("第"+page+"页的内容如下:");

        Document document = CinemaMain.getElement(pageUrl, CinemaMain.tryCount);
        if (document == null) {
            // getElement returns null when it is given no attempts; nothing to print.
            System.err.println("Skipping page " + page + ": no document for " + pageUrl);
            return;
        }
        Element contents = document.getElementById("contents");
        if (contents == null) {
            // Page layout differs from what we expect; skip it instead of failing with an NPE.
            System.err.println("Skipping page " + page + ": element #contents not found in " + pageUrl);
            return;
        }

        for (Element entry : contents.children()) {
            System.out.println();
            for (Element field : entry.children()) {
                System.out.println(field.text());
            }
        }
    }
}

 

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.concurrent.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point of the crawler: reads the total page count from the first listing page, then
 * crawls every page through {@link CinemaCrawer} and prints the elapsed time.
 *
 * @author xuezhiyan
 * @date 2018/9/12
 */
public class CinemaMain {
    private static final String url="http://aqdybq.com/lusi/";
    public static final Integer tryCount=5;

    /** Extracts the number between "共" and "页" in the pager text; compiled once and reused. */
    private static final Pattern PAGE_COUNT_PATTERN = Pattern.compile("(\\D*)共(\\d+)(.*)页(\\D*)");

    /**
     * Determines the page count, crawls pages {@code 1..count} and reports the total time in
     * milliseconds.
     *
     * @param args unused
     * @throws IOException if the first listing page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        long startTime = System.currentTimeMillis();
        Integer count = getUrl(url, tryCount);
        System.out.println("需要下载的数据总共为"+count+"页");

        ExecutorService pool = Executors.newCachedThreadPool();
        try {
            // CinemaCrawer crawls the half-open range [from, to), so the upper bound must be
            // count + 1 for the last page to be included.
            pool.submit(new CinemaCrawer(url, 1, count + 1, pool)).get();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        } finally {
            // Always release the pool threads, even when crawling failed.
            pool.shutdown();
        }
        System.out.println("总共耗时:"+(System.currentTimeMillis()-startTime));
    }

    /**
     * Reads the total number of listing pages from the pager element of {@code url}.
     *
     * <p>If the pager element is missing from the fetched document, the page is fetched again
     * with one attempt fewer, until no attempts remain.
     *
     * @param url      address of the first listing page
     * @param tryCount how many times the page may be re-fetched while looking for the pager
     * @return the page count, or {@code 0} if it could not be determined
     * @throws IOException if the page cannot be fetched
     */
    public static Integer getUrl(String url, Integer tryCount) throws IOException {
        int remaining = tryCount;
        while (true) {
            // With remaining <= 0 getElement prints a hint and returns null, which ends the loop.
            Document document = getElement(url, remaining);
            if (document == null) {
                return 0;
            }
            // NOTE(review): the whole space-separated string is passed as a single class name;
            // confirm this matches the pager on the target site.
            Elements pagers = document.body().getElementsByClass("pages short-page fn-right");
            if (!pagers.isEmpty()) {
                String key = pagers.get(0).text();
                System.out.println(key);
                Matcher matcher = PAGE_COUNT_PATTERN.matcher(key);
                return matcher.find() ? Integer.valueOf(matcher.group(2)) : 0;
            }
            remaining--;
        }
    }

    /**
     * Downloads and parses {@code url}, retrying on I/O failure.
     *
     * @param url      address to fetch
     * @param tryCount maximum number of fetch attempts
     * @return the parsed document, or {@code null} (after printing a hint) when
     *         {@code tryCount <= 0}
     * @throws IOException the failure of the last attempt, if every attempt failed
     */
    public static Document getElement(String url, Integer tryCount) throws IOException {
        if (tryCount <= 0) {
            System.out.println("请检查网络链接是否正常!");
            return null;
        }
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= tryCount; attempt++) {
            try {
                return Jsoup.connect(url)
                        .ignoreContentType(true)
                        .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                        .timeout(5000)
                        .get();
            } catch (IOException e) {
                // Remember the failure and try again while attempts remain.
                lastFailure = e;
            }
        }
        throw lastFailure;
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

大巨魔战将

如果对您有帮助,请打赏1分钱

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值