通过URL多线程异步爬取页面、图片转Base64格式

        最近在学习中,需要通过 URL 爬取页面的 HTML 数据,并将其中的图片转成 Base64 格式。图片太大时,转成 Base64 存入数据库会对数据库造成一定的压力,因此我在保证图片不失真的前提下,先对图片进行了压缩。中间用到的其他工具类,我也都在下面列举出来了。

一、获取url

二、图片压缩并转 Base64:如果想用我的这个工具类,可以访问:

地址:图片压缩工具类(适合所有格式的图片)_单人可khalil的博客-CSDN博客

三、将 URL 转成 MultipartFile:可以访问:

地址:将url转MultipartFile工具类_单人可khalil的博客-CSDN博客

四、核心代码:可以作为参考,并根据自身的业务逻辑自行修改调整。


    // Regular expressions (adjust them to the markup of the page actually being crawled).

    /**
     * Matches a {@code data-src} attribute: the attribute name, {@code =} with optional
     * surrounding whitespace, an optional opening double quote, then a lazily captured
     * value (group 1) up to the first {@code "}, {@code >} or run of whitespace.
     * Note that the full match ({@code group()}) includes that terminating character(s).
     */
    private static final Pattern IMAGE_SRC_PATTERN = Pattern.compile("data-src\\s*=\\s*\"?(.*?)(\"|>|\\s+)");

    /**
     * Matches a double-quoted {@code style="..."} attribute that is preceded, within
     * 1 to 5000 characters, by the text {@code <img} (bounded look-behind). The look-behind
     * only checks distance from an {@code <img}; it does not verify that the style attribute
     * sits inside that same tag. {@code .} is used without DOTALL, so it does not cross
     * line terminators.
     */
    private static final Pattern IMAGE_OPACITY_WIDTH_PATTERN = Pattern.compile("(?<=<img.{1,5000})style\\s*=\\s*\".*?\"");




    /**
     * Crawls the page at the given URL and returns its HTML with lazily loaded images
     * inlined as Base64.
     *
     * <p>Steps:
     * <ol>
     *   <li>Download the HTML via {@link #buildUrl(String)}.</li>
     *   <li>Strip everything matching the {@code NEWS_MACRO_INSIGHT_A_CLASS} and
     *       {@code NEWS_MACRO_INSIGHT_DIV_CLASS} regexes.</li>
     *   <li>Collect every {@code data-src} fragment and every {@code style} attribute
     *       from the <em>unstripped</em> HTML.</li>
     *   <li>Convert each distinct image fragment to Base64 concurrently on
     *       {@code threadPoolTaskExecutor} and substitute it into the stripped HTML.</li>
     *   <li>Append the opacity/width declarations via {@code buildStyle}.</li>
     * </ol>
     *
     * @param newsContent URL of the page to crawl
     * @return the rewritten HTML, or {@code null} if the URL is empty, the page could not
     *         be downloaded, or any step failed (the failure is logged)
     */
    private String buildNewsContent(String newsContent) {
        if (StringUtil.isEmpty(newsContent)) {
            logger.error("error message: 获取待爬取<url><{}>为空", newsContent);
            return null;
        }
        try {
            // The download is synchronous; wrapping it in an already-completed future added nothing.
            long urlStart = System.currentTimeMillis();
            StringBuilder html = buildUrl(newsContent);
            long urlEnd = System.currentTimeMillis();
            logger.info("获取html信息、时间:<{}>", urlEnd - urlStart);
            if (html == null) {
                // buildUrl signals failure with null; report it explicitly instead of via an NPE.
                logger.error("error message: 爬取<华鑫视点>详细信息失败、原因: 获取html为空、url:<{}>", newsContent);
                return null;
            }

            long htmlStart = System.currentTimeMillis();
            String rawHtml = html.toString();
            String getHtml = rawHtml
                    .replaceAll(ConstantUtil.NEWS_MACRO_INSIGHT_A_CLASS, ConstantUtil.SPECIAL_CHARACTER_EMPTY)
                    .replaceAll(ConstantUtil.NEWS_MACRO_INSIGHT_DIV_CLASS, ConstantUtil.SPECIAL_CHARACTER_EMPTY);

            // Collect image fragments and style attributes from the raw (unstripped) HTML.
            List<String> urlList = new ArrayList<>();
            Matcher matcher = IMAGE_SRC_PATTERN.matcher(rawHtml);
            while (matcher.find()) {
                urlList.add(matcher.group());
            }
            List<String> widthList = new ArrayList<>();
            Matcher matcherWidth = IMAGE_OPACITY_WIDTH_PATTERN.matcher(rawHtml);
            while (matcherWidth.find()) {
                widthList.add(matcherWidth.group(0));
            }
            long htmlEnd = System.currentTimeMillis();
            logger.info("获取图片信息、时间:<{}>", htmlEnd - htmlStart);

            long start = System.currentTimeMillis();

            // De-duplicate so every image is downloaded and converted only once.
            List<String> distinctFragments = urlList.stream().distinct().collect(Collectors.toList());
            // Filled by the worker tasks: matched fragment -> Base64 data URI.
            Map<String, String> map = new ConcurrentHashMap<>(distinctFragments.size());
            List<CompletableFuture<String>> base64FutureList = distinctFragments.stream()
                    .filter(base -> base.contains(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_HTTP))
                    .map(base -> CompletableFuture.supplyAsync(() -> buildBase64(base, map), threadPoolTaskExecutor))
                    .collect(Collectors.toList());

            // get() blocks until each task is done, so no separate allOf() barrier is needed.
            for (CompletableFuture<String> future : base64FutureList) {
                String fragment = future.get();
                if (StringUtil.isEmpty(fragment)) {
                    continue;
                }
                String base64 = map.get(fragment);
                if (StringUtil.isNotEmpty(base64)) {
                    // Swap the original data-src fragment for the inlined image.
                    getHtml = getHtml.replace(fragment, ConstantUtil.NEWS_MACRO_INSIGHT_SRC +
                            base64 +
                            ConstantUtil.NEWS_MACRO_INSIGHT_CHARACTER_EMPTY);
                }
            }
            // Add "opacity: 1;width: 100%;" so the front end displays and sizes the images.
            getHtml = buildStyle(widthList, getHtml);

            long end = System.currentTimeMillis();
            logger.info("转base64格式、时间:<{}>", end - start);
            return getHtml;
        } catch (InterruptedException e) {
            // Only a real interruption should restore the interrupt flag.
            Thread.currentThread().interrupt();
            logger.error("error message: 爬取<华鑫视点>详细信息失败、原因:", e);
            return null;
        } catch (Exception e) {
            // Do NOT interrupt here: that would poison the calling thread's later blocking calls.
            logger.error("error message: 爬取<华鑫视点>详细信息失败、原因:", e);
            return null;
        }
    }

    /**
     * Appends the {@code NEWS_MACRO_INSIGHT_NEWS_OPACITY_WIDTH} declarations
     * (opacity: 1; width: 100%) to image styles in the HTML.
     *
     * <p>Only style attributes containing {@code NEWS_MACRO_INSIGHT_NEWS_VERTICAL_ALIGN} are
     * considered. Each one is reduced to its bare declaration text by removing
     * {@code NEWS_MACRO_INSIGHT_NEWS_STYLE} and {@code NEWS_MACRO_INSIGHT_CHARACTER_EMPTY}
     * (presumably the {@code style="} prefix and the closing quote; confirm against
     * {@code ConstantUtil}). Declarations that already contain the opacity/width text are
     * skipped.
     *
     * <p>The declarations are de-duplicated before being applied. {@link String#replace}
     * rewrites every occurrence in the document, so applying the same declaration once per
     * image that shares it would append the suffix repeatedly.
     *
     * @param widthList raw {@code style="..."} attributes matched in the page
     * @param getHtml   HTML to rewrite
     * @return the HTML with the opacity/width declarations appended to the matching styles
     */
    private String buildStyle(List<String> widthList, String getHtml) {
        List<String> declarations = widthList.stream()
                .filter(style -> style.contains(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_VERTICAL_ALIGN))
                .map(style -> style
                        .replace(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_STYLE, ConstantUtil.SPECIAL_CHARACTER_EMPTY)
                        .replace(ConstantUtil.NEWS_MACRO_INSIGHT_CHARACTER_EMPTY, ConstantUtil.SPECIAL_CHARACTER_EMPTY))
                .filter(style -> !style.contains(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_OPACITY_WIDTH))
                .distinct()
                .collect(Collectors.toList());

        // NOTE(review): if one declaration is a substring of another, the shorter one still
        // rewrites occurrences inside the longer one; de-duplication only covers exact repeats.
        for (String declaration : declarations) {
            getHtml = getHtml.replace(declaration,
                    declaration + ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_OPACITY_WIDTH);
        }
        return getHtml;
    }

    /**
     * Downloads the page at the given URL with an HTTP GET and returns its body.
     *
     * <p>The body is decoded as UTF-8 and read line by line; lines are concatenated
     * <em>without</em> their line terminators, so the result is a single-line document.
     * The stream and the connection are released even when reading fails.
     *
     * <p>NOTE(review): no connect/read timeout is configured, so a stalled server can block
     * the caller indefinitely. Consider setting timeouts.
     *
     * @param newsContent URL of the page to download
     * @return the page content, or {@code null} if the request failed (the cause is logged)
     */
    private StringBuilder buildUrl(String newsContent) {
        HttpURLConnection httpUrlConn = null;
        try {
            // Open the connection and configure the request.
            URL url = new URL(newsContent);
            httpUrlConn = (HttpURLConnection) url.openConnection();
            httpUrlConn.setDoInput(true);
            httpUrlConn.setRequestMethod(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_GET);
            httpUrlConn.setRequestProperty(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_USER_AGENT,
                    ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_MOZILLA);

            StringBuilder html = new StringBuilder();
            // try-with-resources closes the reader (and the wrapped streams) on every path.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(httpUrlConn.getInputStream(), StandardCharsets.UTF_8))) {
                String data;
                while ((data = br.readLine()) != null) {
                    html.append(data);
                }
            }
            return html;
        } catch (Exception e) {
            // Keep the null-on-failure contract, but leave a trace of what went wrong.
            logger.error("error message: 获取html信息失败、url:<{}>、原因:", newsContent, e);
            return null;
        } finally {
            if (httpUrlConn != null) {
                httpUrlConn.disconnect();
            }
        }
    }

    /**
     * Downloads the image referenced by a matched {@code data-src} fragment, compresses it
     * and records it in {@code map} as a Base64 data URI.
     *
     * <p>The image URL is obtained by removing {@code NEWS_MACRO_INSIGHT_DATA_SRC_SYN} and
     * {@code NEWS_MACRO_INSIGHT_CHARACTER_EMPTY} from the fragment. The download is attempted
     * up to {@code NEWS_MACRO_INSIGHT_NEWS_COUNT + 1} times. If every attempt yields an empty
     * file, nothing is stored in {@code map}, so the caller leaves the fragment untouched
     * rather than embedding a broken image.
     *
     * @param base the matched {@code data-src} fragment; also used as the map key
     * @param map  shared result map (fragment -> data URI), written to by concurrent tasks
     * @return {@code base}, unchanged, so the caller can look up the result in {@code map}
     */
    private String buildBase64(String base, Map<String, String> map) {
        // Extract the image URL from the matched fragment.
        String src = base.replace(ConstantUtil.NEWS_MACRO_INSIGHT_DATA_SRC_SYN, ConstantUtil.SPECIAL_CHARACTER_EMPTY)
                .replace(ConstantUtil.NEWS_MACRO_INSIGHT_CHARACTER_EMPTY, ConstantUtil.SPECIAL_CHARACTER_EMPTY);
        // Random file name for the temporary MultipartFile.
        String fileName = MessageUtil.getRandomMixVerifyCode(6) + ConstantUtil.WIND_STOCK_SHARE_PNG;

        MultipartFile multipartFile = null;
        boolean downloaded = false;
        for (int count = 0; count <= ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_COUNT && !downloaded; count++) {
            // Fetch the URL as a MultipartFile; retry while the result is empty.
            multipartFile = FileUtil.urlToMultipartFile(src, fileName);
            downloaded = !FileUtil.isMultipartFileEmpty(multipartFile);
        }
        if (!downloaded) {
            // Never hand an empty/failed download to the compressor.
            logger.error("error message: 图片下载失败、src:<{}>", src);
            return base;
        }

        // Compress the image and store it as a data URI.
        String base64 = ConstantUtil.NEWS_MACRO_INSIGHT_DATA_IMAGE_PNG_BASE64 + ImageUtil.compressImageByMultipartFile(multipartFile);
        map.put(base, base64);
        return base;
    }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值