基于正则表达式的Java爬虫项目

需求:抓取新闻网的前一百条新闻题目以及对应的网页链接

编者这里以齐鲁工业大学校园新闻网为例,利用 Java 网络编程、多线程和正则表达式来实现对新闻内容的抓取。(注:受校园网限制,未连接齐鲁工业大学校园网时,可能暂时无法抓取全部内容,抓取结果存在缺失属于正常情况。)

源代码示例:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;



/**
 * Minimal regex-based web crawler for the campus news list pages.
 *
 * <p>Crawling steps:
 * <ol>
 *   <li>send a request to the page;</li>
 *   <li>read the page HTML as plain text;</li>
 *   <li>extract the wanted content with regular expressions (literals, character classes and
 *       the quantifiers {@code ?}, {@code +}, {@code *}).</li>
 * </ol>
 *
 * <p>All state is held in immutable constants and method locals, so pages can be crawled from
 * several threads at once.
 */
public class NetUrl {

    /** Root against which list-page paths and article hrefs are resolved. */
    private static final String SITE_ROOT = "https://www.qlu.edu.cn/";

    /** Inclusive range of list pages ({@code gdyw/list<N>.htm}) to crawl. */
    private static final int FIRST_PAGE = 1;
    private static final int LAST_PAGE = 100;

    /** Number of worker threads used by {@link #main(String[])}. */
    private static final int POOL_SIZE = 5;

    /** Pause after each list page so the crawler does not hammer the server. */
    private static final long PAGE_DELAY_MILLIS = 1000L;

    /** Connect/read timeout so a stalled server cannot block a worker forever. */
    private static final int TIMEOUT_MILLIS = 15000;

    /** Upper bound on how long {@link #main(String[])} waits for all pages to finish. */
    private static final long MAX_CRAWL_HOURS = 1L;

    /** One news entry of a list page; group 1 is the markup inside the {@code <li>}. */
    private static final Pattern LIST_ITEM =
            Pattern.compile("<li class=\"news n\\d+ clearfix\">(.*?)</li>");

    /** Link inside a news entry; group 1 is the href, group 2 the title attribute. */
    private static final Pattern ANCHOR = Pattern.compile("<a href='(.*?)'.*title='(.*?)'");

    /** Body paragraph of an article page; group 1 is the text inside the {@code <span>}. */
    private static final Pattern PARAGRAPH =
            Pattern.compile("<span style=\"text-indent:0.99cm;font-size:18px\">(.*?)</span>");

    /** {@code charset=...} parameter of a Content-Type header; group 1 is the charset name. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Crawls every list page from {@code FIRST_PAGE} to {@code LAST_PAGE} on the calling thread.
     *
     * <p>A page that fails to download is reported and skipped; the remaining pages are still
     * crawled. The method returns early if the calling thread is interrupted.
     */
    public static void testUrl() {
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            crawlPageQuietly(page);
        }
    }

    /**
     * Extracts the news links from the HTML of a list page.
     *
     * @param listHtml list-page HTML with line breaks removed (as produced by the downloader)
     * @return one {@code {href, title}} pair per news entry that contains a link, in page
     *         order; entries without a link are skipped
     */
    static List<String[]> parseNewsList(String listHtml) {
        List<String[]> items = new ArrayList<String[]>();
        Matcher item = LIST_ITEM.matcher(listHtml);
        while (item.find()) {
            Matcher anchor = ANCHOR.matcher(item.group(1));
            // Only read the groups after a successful find(); otherwise group() throws.
            if (anchor.find()) {
                items.add(new String[] {anchor.group(1), anchor.group(2)});
            }
        }
        return items;
    }

    /**
     * Extracts the body paragraphs from the HTML of an article page.
     *
     * @param articleHtml article HTML with line breaks removed
     * @return the inner text of every matching {@code <span>}, in page order
     */
    static List<String> parseParagraphs(String articleHtml) {
        List<String> paragraphs = new ArrayList<String>();
        Matcher span = PARAGRAPH.matcher(articleHtml);
        while (span.find()) {
            paragraphs.add(span.group(1));
        }
        return paragraphs;
    }

    /** Crawls one list page, reports any failure instead of propagating it, then pauses. */
    private static void crawlPageQuietly(int page) {
        try {
            crawlPage(page);
        } catch (IOException | RuntimeException e) {
            // Report here: an exception escaping a pool task would be silently swallowed.
            System.err.println("failed to crawl list page " + page + ": " + e);
        }
        pause();
    }

    /** Downloads one list page and prints the link, title and body text of each news entry. */
    private static void crawlPage(int page) throws IOException {
        URL siteRoot = new URL(SITE_ROOT);
        URL listUrl = new URL(siteRoot, "gdyw/list" + page + ".htm");
        System.out.println(listUrl);

        String newline = System.lineSeparator();
        for (String[] item : parseNewsList(fetch(listUrl))) {
            // Resolving against the site root handles root-relative and absolute hrefs
            // without producing a doubled slash.
            URL articleUrl = new URL(siteRoot, item[0]);

            // Build the whole entry first so output from parallel workers does not interleave.
            StringBuilder report = new StringBuilder();
            report.append("href = ").append(item[0]).append(newline);
            report.append("title = ").append(item[1]).append(newline);
            report.append("url2 = ").append(articleUrl).append(newline);
            try {
                for (String paragraph : parseParagraphs(fetch(articleUrl))) {
                    report.append("span = ").append(paragraph).append(newline);
                }
            } catch (IOException e) {
                // One unreachable article must not abort the rest of the page.
                System.err.println("failed to fetch " + articleUrl + ": " + e);
            }
            System.out.print(report);
        }
        System.out.println(Thread.currentThread().getName() + "_____________——抓取");
    }

    /**
     * Downloads a page and returns its text with the lines joined without separators, so that
     * {@code .} in the patterns can match across what were line breaks.
     */
    private static String fetch(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder html = new StringBuilder();
        try (InputStream in = connection.getInputStream();
             BufferedReader reader =
                     new BufferedReader(new InputStreamReader(in, charsetOf(connection)))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                html.append(line);
            }
        }
        return html.toString();
    }

    /** Returns the charset declared in the response's Content-Type header, or UTF-8 if none. */
    private static Charset charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType != null) {
            Matcher declared = CHARSET_PARAM.matcher(contentType);
            if (declared.find()) {
                try {
                    return Charset.forName(declared.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall back to UTF-8 below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /** Sleeps for the per-page delay, restoring the interrupt flag if interrupted. */
    private static void pause() {
        try {
            Thread.sleep(PAGE_DELAY_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Crawls all list pages in parallel: one task per page on a fixed pool of
     * {@code POOL_SIZE} threads, then waits for the tasks to finish.
     *
     * @param args unused
     * @throws Exception if the wait for the workers is interrupted
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the fixed-size thread pool.
        ExecutorService es = Executors.newFixedThreadPool(POOL_SIZE);
        try {
            // 2. Submit one task per list page so the pages are split across the workers.
            for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
                final int current = page;
                es.submit(new Runnable() {
                    @Override
                    public void run() {
                        crawlPageQuietly(current);
                    }
                });
            }
        } finally {
            // 3. Stop accepting new tasks; the queued ones still run to completion.
            es.shutdown();
        }
        // 4. Wait for the crawl to finish; cancel whatever is left if it takes too long.
        if (!es.awaitTermination(MAX_CRAWL_HOURS, TimeUnit.HOURS)) {
            es.shutdownNow();
        }
    }
}

 网页源码示例如上:

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

逃逸线LOF

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值