简单的 Java 爬虫示例

在这里插入图片描述

package game.Worm;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Console entry point of the crawler: asks for a start URL and a search term, then hands both to
 * a {@code WormWebsite} task running on its own thread.
 *
 * @Author: Child
 * @Version: 1.0
 * @Date: 2022/12/31/9:28
 * @Description: Created with IntelliJ IDEA
 */
public class InternetWorm {
    /**
     * Reads the start URL and the search term from standard input and starts the crawl.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared by the original signature; kept for compatibility
     */
    public static void main(String[] args) throws IOException {
        Scanner console = new Scanner(System.in);

        // Example start page: https://iask.sina.com.cn/b/iR6wYv2zsA8X.html
        String startAddress = prompt(console, "请输入初始爬取网址入口:");
        String searchTerm = prompt(console, "请输入想要爬取的内容(0代表全部爬取):");

        Runnable crawler = new WormWebsite(startAddress, searchTerm);
        Thread worker = new Thread(crawler);
        worker.start();
    }

    /** Prints {@code message} without a trailing newline and returns the next whitespace-delimited token. */
    private static String prompt(Scanner console, String message) {
        System.out.print(message);
        return console.next();
    }
}

/**
 * Runnable task that crawls one page: it reads the page line by line, extracts every
 * {@code https://} link and, for each link, starts one new {@code WormWebsite} thread (to follow
 * the link) and one new {@code WormComment} thread (to search the linked page for the term).
 *
 * <p>NOTE(review): there is no visited-URL set and no depth limit in this class, so pages that
 * link to each other will be crawled again and again; consider adding both.
 */
class WormWebsite implements Runnable {
    /** Matches an https URL up to, but not including, the next single or double quote. */
    private static final Pattern LINK_PATTERN = Pattern.compile("https://[^\"']+");

    /** Pause in milliseconds after each thread start, throttling how fast new threads are created. */
    private static final long SPAWN_DELAY_MILLIS = 1000L;

    String content;
    URL url;

    /**
     * @param address URL of the page to crawl; if it cannot be parsed, the task becomes a no-op
     * @param content search term that is passed on unchanged to every spawned task
     */
    public WormWebsite(String address, String content) {
        try {
            this.url = new URL(address);
            this.content = content;
        } catch (MalformedURLException ignored) {
            // Deliberately ignored: url stays null and run() returns immediately.
        }
    }

    /**
     * Downloads the page and spawns follow-up tasks for every link on it. Read failures are
     * reported on standard error and end this task only; an interrupt ends the task and leaves
     * the thread's interrupt flag set.
     *
     * @throws RuntimeException if the connection object cannot be created
     */
    @Override
    public void run() {
        if (url == null) {
            return;
        }

        final URLConnection connection;
        try {
            connection = url.openConnection();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        // try-with-resources closes the reader (and the underlying stream) on every exit path.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = LINK_PATTERN.matcher(line);
                while (matcher.find()) {
                    String link = matcher.group();
                    new Thread(new WormWebsite(link, content)).start();
                    Thread.sleep(SPAWN_DELAY_MILLIS);
                    new Thread(new WormComment(link, content)).start();
                    Thread.sleep(SPAWN_DELAY_MILLIS);
                }
            }
        } catch (InterruptedException e) {
            // Restore the flag so whoever owns this thread can still observe the interrupt.
            Thread.currentThread().interrupt();
        } catch (IOException e) {
            // Best effort: one unreachable page must not stop the rest of the crawl.
            System.err.println("WormWebsite: failed to read " + url + ": " + e);
        }
    }
}

/**
 * Runnable task that searches one page: it reads the page line by line and prints to standard
 * output every fragment that matches a pattern derived from the search term.
 *
 * <p>For the search term {@code "0"} the fixed pattern {@code " .* "} is used. For any other term
 * the pattern is "a run of characters that do not occur in the term, then the term itself, then
 * the rest of the line".
 */
class WormComment implements Runnable {
    /** Pattern source used when the search term is "0". */
    private static final String MATCH_ALL_REGEX = " .* ";

    /** Pause in milliseconds before each printed match. */
    private static final long PRINT_DELAY_MILLIS = 50L;

    String content;
    URL url;

    /**
     * @param address URL of the page to search; if it cannot be parsed, the task becomes a no-op
     * @param content search term, or {@code "0"} to use the fixed pattern
     */
    public WormComment(String address, String content) {
        try {
            this.url = new URL(address);
            this.content = content;
        } catch (MalformedURLException ignored) {
            // Deliberately ignored: url stays null and run() returns immediately.
        }
    }

    /**
     * Downloads the page and prints every match. Read failures are reported on standard error;
     * an interrupt ends the task and leaves the thread's interrupt flag set.
     *
     * @throws RuntimeException if the connection object cannot be created
     */
    @Override
    public void run() {
        // A missing or empty term cannot form a valid pattern, so there is nothing to search for.
        if (url == null || content == null || content.isEmpty()) {
            return;
        }

        final URLConnection connection;
        try {
            connection = url.openConnection();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        final Pattern pattern = buildPattern(content);

        // try-with-resources closes the reader (and the underlying stream) on every exit path.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = pattern.matcher(line);
                while (matcher.find()) {
                    Thread.sleep(PRINT_DELAY_MILLIS);
                    System.out.println(matcher.group());
                }
            }
        } catch (InterruptedException e) {
            // Restore the flag so whoever owns this thread can still observe the interrupt.
            Thread.currentThread().interrupt();
        } catch (IOException e) {
            // Best effort: one unreachable page must not stop the rest of the crawl.
            System.err.println("WormComment: failed to read " + url + ": " + e);
        }
    }

    /** Builds the search pattern for a non-empty term; every char is written as a 4-digit \\u escape. */
    private static Pattern buildPattern(String content) {
        if ("0".equals(content)) {
            return Pattern.compile(MATCH_ALL_REGEX);
        }
        StringBuilder escaped = new StringBuilder(content.length() * 6);
        for (int i = 0; i < content.length(); i++) {
            // Zero-padding is essential: the regex syntax requires exactly four hex digits after
            // \\u, so an unpadded "\\u61" (for 'a') is rejected by Pattern.compile.
            escaped.append(String.format("\\u%04x", (int) content.charAt(i)));
        }
        return Pattern.compile("[^" + escaped + "]*" + escaped + ".*");
    }
}
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值