最近在学习中,需要通过 url 爬取页面的 html 数据,并将其中的图片转成 base64 格式。图片太大时,转成 base64 存入数据库会对数据库造成一定的压力,所以我又在不失真的前提下对图片进行了压缩。中间用到的其他工具类我都列举出来了。
一、获取url
二、图片压缩转 base64:如果想用我的这个工具类,可以访问:
地址:图片压缩工具类(适合所有格式的图片)_单人可khalil的博客-CSDN博客
三、将 url 转成 MultipartFile,可以访问:
地址:将url转MultipartFile工具类_单人可khalil的博客-CSDN博客
四、核心代码,可以参考,并根据自己的业务逻辑自行修改调整。
// Regular expressions (adjust them to the actual structure of the target page).
// Matches a data-src attribute: "data-src", optional whitespace around '=', an optional opening
// quote, then a lazily captured value (group 1) terminated by a quote, '>' or a run of whitespace.
private static final Pattern IMAGE_SRC_PATTERN = Pattern.compile("data-src\\s*=\\s*\"?(.*?)(\"|>|\\s+)");
// Matches a double-quoted style attribute that is preceded by "<img" followed by 1 to 5000
// non-line-terminator characters (bounded look-behind, so "<img" itself is not part of the match).
private static final Pattern IMAGE_OPACITY_WIDTH_PATTERN = Pattern.compile("(?<=<img.{1,5000})style\\s*=\\s*\".*?\"");
/**
 * Crawls the page at the given URL and returns its HTML with the matched images inlined
 * as base64 data.
 *
 * <p>Steps: fetch the HTML via {@link #buildUrl(String)}, strip the fragments matched by
 * {@code NEWS_MACRO_INSIGHT_A_CLASS} / {@code NEWS_MACRO_INSIGHT_DIV_CLASS}, collect every
 * {@code data-src} attribute and {@code <img>} style attribute from the raw HTML, convert the
 * distinct image attributes concurrently on {@code threadPoolTaskExecutor}, substitute the
 * results into the HTML and finally let {@link #buildStyle(List, String)} patch the styles.
 *
 * @param newsContent URL of the page to crawl
 * @return the processed HTML, or {@code null} when the URL is empty, the page could not be
 *         fetched, or any processing step failed
 */
private String buildNewsContent(String newsContent) {
    if (StringUtil.isEmpty(newsContent)) {
        logger.error("error message: 获取待爬取<url><{}>为空", newsContent);
        return null;
    }
    try {
        // buildUrl is a blocking call, so it is invoked directly instead of being wrapped
        // in an already-completed future.
        long urlStart = System.currentTimeMillis();
        StringBuilder html = buildUrl(newsContent);
        long urlEnd = System.currentTimeMillis();
        logger.info("获取html信息、时间:<{}>", urlEnd - urlStart);
        if (html == null) {
            // buildUrl signals failure with null; stop here instead of failing with an NPE below.
            logger.error("error message: 获取html信息失败、url:<{}>", newsContent);
            return null;
        }

        long htmlStart = System.currentTimeMillis();
        String rawHtml = html.toString();
        String getHtml = rawHtml
                .replaceAll(ConstantUtil.NEWS_MACRO_INSIGHT_A_CLASS, ConstantUtil.SPECIAL_CHARACTER_EMPTY)
                .replaceAll(ConstantUtil.NEWS_MACRO_INSIGHT_DIV_CLASS, ConstantUtil.SPECIAL_CHARACTER_EMPTY);

        // Attributes are collected from the raw HTML; the whole match (attribute name included)
        // is kept because it is later used as the replacement target.
        List<String> urlList = new ArrayList<>();
        Matcher srcMatcher = IMAGE_SRC_PATTERN.matcher(rawHtml);
        while (srcMatcher.find()) {
            urlList.add(srcMatcher.group());
        }
        List<String> widthList = new ArrayList<>();
        Matcher styleMatcher = IMAGE_OPACITY_WIDTH_PATTERN.matcher(rawHtml);
        while (styleMatcher.find()) {
            widthList.add(styleMatcher.group(0));
        }
        long htmlEnd = System.currentTimeMillis();
        logger.info("获取图片信息、时间:<{}>", htmlEnd - htmlStart);

        long start = System.currentTimeMillis();
        // De-duplicate so every distinct image attribute is downloaded and converted only once.
        List<String> distinctSources = urlList.stream().distinct().collect(Collectors.toList());
        // Shared between worker threads: matched attribute -> base64 data.
        Map<String, String> map = new ConcurrentHashMap<>(distinctSources.size());
        List<CompletableFuture<String>> base64FutureList = distinctSources.stream()
                .filter(base -> base.contains(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_HTTP))
                .map(base -> CompletableFuture.supplyAsync(() -> buildBase64(base, map), threadPoolTaskExecutor))
                .collect(Collectors.toList());

        // get() blocks until the task is done, so no separate allOf() barrier is needed.
        for (CompletableFuture<String> future : base64FutureList) {
            String matched = future.get();
            if (StringUtil.isNotEmpty(matched)) {
                String base64 = map.get(matched);
                // No map entry means the image was not converted; leave the attribute untouched.
                if (StringUtil.isNotEmpty(base64)) {
                    getHtml = getHtml.replace(matched, ConstantUtil.NEWS_MACRO_INSIGHT_SRC
                            + base64
                            + ConstantUtil.NEWS_MACRO_INSIGHT_CHARACTER_EMPTY);
                }
            }
        }

        // Append "opacity: 1;width: 100%;" so the front end displays the images at full width.
        getHtml = buildStyle(widthList, getHtml);
        long end = System.currentTimeMillis();
        logger.info("转base64格式、时间:<{}>", end - start);
        return getHtml;
    } catch (InterruptedException e) {
        // Restore the interrupt flag only when this thread really was interrupted.
        Thread.currentThread().interrupt();
        logger.error("error message: 爬取<华鑫视点>详细信息失败、原因:", e);
        return null;
    } catch (Exception e) {
        logger.error("error message: 爬取<华鑫视点>详细信息失败、原因:", e);
        return null;
    }
}
/**
 * Appends {@code NEWS_MACRO_INSIGHT_NEWS_OPACITY_WIDTH} (the "opacity: 1;width: 100%;" style)
 * to the image styles found in the HTML.
 *
 * <p>Only style attributes containing {@code NEWS_MACRO_INSIGHT_NEWS_VERTICAL_ALIGN} are
 * considered. Each one is reduced to its bare value by removing
 * {@code NEWS_MACRO_INSIGHT_NEWS_STYLE} and {@code NEWS_MACRO_INSIGHT_CHARACTER_EMPTY};
 * values that already contain the suffix are skipped.
 *
 * @param widthList style attributes extracted from the page, duplicates allowed
 * @param getHtml   HTML to patch
 * @return the HTML with the suffix appended to every qualifying style value
 */
private String buildStyle(List<String> widthList, String getHtml) {
    // String.replace rewrites every occurrence at once, so each distinct value must be handled
    // exactly once; otherwise images sharing the same style would receive the suffix repeatedly.
    List<String> styleValues = widthList.stream()
            .filter(style -> style.contains(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_VERTICAL_ALIGN))
            .map(style -> style
                    .replace(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_STYLE, ConstantUtil.SPECIAL_CHARACTER_EMPTY)
                    .replace(ConstantUtil.NEWS_MACRO_INSIGHT_CHARACTER_EMPTY, ConstantUtil.SPECIAL_CHARACTER_EMPTY))
            .filter(style -> !style.contains(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_OPACITY_WIDTH))
            .distinct()
            .collect(Collectors.toList());
    for (String style : styleValues) {
        getHtml = getHtml.replace(style, style + ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_OPACITY_WIDTH);
    }
    return getHtml;
}
/**
 * Downloads the HTML of the given URL with a GET request and decodes it as UTF-8.
 *
 * <p>The response is read line by line and the lines are concatenated without line
 * separators, so the returned markup is a single line.
 *
 * @param newsContent URL of the page to download
 * @return the page content, or {@code null} if the request or the read failed
 */
private StringBuilder buildUrl(String newsContent) {
    HttpURLConnection httpUrlConn = null;
    try {
        // Open the connection.
        URL url = new URL(newsContent);
        httpUrlConn = (HttpURLConnection) url.openConnection();
        httpUrlConn.setDoInput(true);
        httpUrlConn.setRequestMethod(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_GET);
        httpUrlConn.setRequestProperty(ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_USER_AGENT,
                ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_MOZILLA);
        // try-with-resources closes the reader chain (and the underlying stream) even when
        // reading fails part-way through.
        try (InputStream input = httpUrlConn.getInputStream();
             InputStreamReader read = new InputStreamReader(input, StandardCharsets.UTF_8);
             BufferedReader br = new BufferedReader(read)) {
            StringBuilder html = new StringBuilder();
            String data;
            while ((data = br.readLine()) != null) {
                html.append(data);
            }
            return html;
        }
    } catch (Exception e) {
        // Keep the "null on failure" contract, but record the cause instead of hiding it.
        logger.error("error message: 获取html信息失败、url:<{}>、原因:", newsContent, e);
        return null;
    } finally {
        // Release the connection on both the success and the failure path.
        if (httpUrlConn != null) {
            httpUrlConn.disconnect();
        }
    }
}
/**
 * Downloads the image referenced by a matched {@code data-src} attribute, compresses it and
 * stores the resulting base64 data in {@code map} under the matched attribute text.
 *
 * <p>The download is attempted up to {@code NEWS_MACRO_INSIGHT_NEWS_COUNT + 1} times. If every
 * attempt yields an empty file, nothing is put into the map and the caller leaves the
 * attribute unchanged.
 *
 * @param base matched attribute text, used both to derive the image URL and as the map key
 * @param map  shared result map: matched attribute text -> base64 data
 * @return {@code base}, unchanged, so the caller can look the result up in {@code map}
 */
private String buildBase64(String base, Map<String, String> map) {
    // Reduce the matched attribute to the bare image URL.
    String src = base.replace(ConstantUtil.NEWS_MACRO_INSIGHT_DATA_SRC_SYN, ConstantUtil.SPECIAL_CHARACTER_EMPTY)
            .replace(ConstantUtil.NEWS_MACRO_INSIGHT_CHARACTER_EMPTY, ConstantUtil.SPECIAL_CHARACTER_EMPTY);
    // Random file name for the downloaded image.
    String fileName = MessageUtil.getRandomMixVerifyCode(6) + ConstantUtil.WIND_STOCK_SHARE_PNG;

    // Retry the download until a non-empty file is obtained or the attempts are used up.
    MultipartFile multipartFile = null;
    for (int count = 0; count <= ConstantUtil.NEWS_MACRO_INSIGHT_NEWS_COUNT; count++) {
        multipartFile = FileUtil.urlToMultipartFile(src, fileName);
        if (!FileUtil.isMultipartFileEmpty(multipartFile)) {
            break;
        }
    }
    if (multipartFile == null || FileUtil.isMultipartFileEmpty(multipartFile)) {
        // Every attempt failed: do not hand an empty file to the compressor and do not store
        // a bogus value; a missing map entry makes the caller keep the original attribute.
        logger.warn("error message: 下载图片失败、跳过base64转换、src:<{}>", src);
        return base;
    }

    // Compress the image and prefix the result with the data-URI header.
    String base64 = ConstantUtil.NEWS_MACRO_INSIGHT_DATA_IMAGE_PNG_BASE64
            + ImageUtil.compressImageByMultipartFile(multipartFile);
    map.put(base, base64);
    return base;
}