Java利用正则实现网络爬虫

工具类(待优化)

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Small regex-based page scraper: downloads the text behind a URL and extracts
 * every match of a regular expression from it.
 *
 * <p>The class is a singleton; obtain the shared instance via {@link #getIntance()}.
 */
public class CreeperUtil {

    /** Eagerly created shared instance; class initialization makes this thread-safe. */
    private static final CreeperUtil INSTANCE = new CreeperUtil();

    private CreeperUtil() {
        // Singleton: instances are handed out by getIntance() only.
    }

    /**
     * Returns the shared {@code CreeperUtil} instance.
     *
     * @return the singleton instance, never {@code null}
     */
    public static CreeperUtil getIntance() {
        return INSTANCE;
    }

    /**
     * Downloads {@code url} and applies {@code regex} to it on a worker thread,
     * blocking until the result is available.
     *
     * @param url   location of the resource to download
     * @param regex regular expression applied to the downloaded text
     * @param group capture-group selectors; see {@link #regular(String, String, String...)}
     * @return the formatted matches, or {@code null} if the task failed or the
     *         calling thread was interrupted while waiting
     */
    public String threadTool(String url, String regex, String... group) {
        // Only one task is ever submitted, so a single worker is enough.
        ExecutorService pool = Executors.newSingleThreadExecutor();
        try {
            Future<String> pending = pool.submit(new InitCallable(url, regex, group));
            return pending.get();
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can still observe the interrupt.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        } finally {
            // Without this the non-daemon worker thread keeps the JVM alive forever.
            pool.shutdown();
        }
        return null;
    }

    /**
     * Task that downloads a resource and extracts the regex matches from it.
     * Both steps run inside {@link #call()}, i.e. on the executing thread.
     */
    class InitCallable implements Callable<String> {

        private String url;

        private String regex;

        private String[] group;

        InitCallable() {
        }

        /**
         * @param url   location of the resource to download
         * @param regex regular expression applied to the downloaded text
         * @param group capture-group selectors forwarded to {@code regular}
         */
        public InitCallable(String url, String regex, String... group) {
            this.url = url;
            this.regex = regex;
            this.group = group;
        }

        /**
         * Fetches the resource and returns its formatted regex matches.
         *
         * @return the result of {@code regular(mesh(url), regex, group)}
         */
        @Override
        public String call() throws Exception {
            return regular(mesh(url), regex, group);
        }

    }

    /**
     * Reads the resource behind {@code url} as text.
     *
     * <p>Lines are concatenated <em>without</em> their line terminators, so the
     * result is a single line. Bytes are decoded with the platform default
     * charset. Failures are reported on {@code System.err} and whatever was read
     * up to that point (possibly the empty string) is returned.
     *
     * @param url location of the resource, in a form accepted by {@link URL}
     * @return the text read from the resource; never {@code null}
     */
    public String mesh(String url) {
        StringBuilder content = new StringBuilder();
        InputStream stream = null;
        BufferedReader reader = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            stream = connection.getInputStream();
            reader = new BufferedReader(new InputStreamReader(stream));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } catch (Exception e) {
            // Best effort by design: report the problem and return what we have.
            e.printStackTrace();
        } finally {
            try {
                // Closing the outermost wrapper also closes what it wraps. Either
                // may still be null when opening the connection failed.
                if (reader != null) {
                    reader.close();
                } else if (stream != null) {
                    stream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return content.toString();
    }

    /**
     * Finds every match of {@code regex} in {@code data} and formats the matches,
     * one per line (each line ends with {@code "\n"}).
     *
     * <p>With no {@code group} arguments each line holds the whole match.
     * Otherwise each line holds capture groups {@code 1..group.length}, every one
     * followed by a tab. Only the <em>number</em> of {@code group} arguments is
     * used; their values are ignored.
     *
     * @param data  text to search
     * @param regex regular expression to apply
     * @param group one entry per leading capture group to output
     * @return the formatted matches; the empty string when nothing matches
     * @throws IndexOutOfBoundsException if {@code group.length} exceeds the number
     *         of capture groups in {@code regex} and at least one match is found
     */
    public String regular(String data, String regex, String... group) {
        Matcher matcher = Pattern.compile(regex).matcher(data);
        StringBuilder out = new StringBuilder();
        int wanted = group.length;
        while (matcher.find()) {
            if (wanted == 0) {
                out.append(matcher.group());
            } else {
                for (int i = 1; i <= wanted; i++) {
                    out.append(matcher.group(i)).append("\t");
                }
            }
            out.append("\n");
        }
        return out.toString();
    }
}

客户端调用

/**
 * Demo client: scrapes a locally saved HTML page with {@code CreeperUtil} and
 * prints the extracted capture groups together with the elapsed time.
 */
class Client {

    /**
     * Runs the extraction once against a hard-coded local file and pattern.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String pageLocation = "file:///C:/Users/Administrator/Desktop/HTML%E4%BB%A3%E7%A0%81/1.html";
        // Each matched <li> contributes three (data-dataid, link text) pairs.
        final String itemPattern = "<li.*?J_Cat a-all\">.*?<a.*?data-dataid=\"(.*?)\".*?>(.*?)</a>.*?<a.*?data-dataid=\"(.*?)\".*?>(.*?)</a>.*?<a.*?data-dataid=\"(.*?)\".*?>(.*?)</a>.*?<\\/li>";
        final String[] captureGroups = {"1", "2", "3", "4", "5", "6"};

        final long startedAt = System.currentTimeMillis();
        final String extracted = CreeperUtil.getIntance().threadTool(pageLocation, itemPattern, captureGroups);
        final long finishedAt = System.currentTimeMillis();

        System.out.println(extracted + "\n花费时间:" + (finishedAt - startedAt));
    }
}
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

terrybg

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值