工作中有时候需要爬取其他平台的文章,或者自己由于兴趣爬取文章,文章后续的处理或展示,一般都是自定义的标签格式,可能与标准的html格式不一致,这里就需要标签替换。
有不少文章都是使用正则表达式来替换,正则表达式替换可能由于匹配问题导致多换或少换。最靠谱的方式当然还是根据index查找某个标签的beginIndex和endIndex,然后进行内容的替换,这样肯定是最准确的,不过效率稍微低一些!
代码中主要展示了如何将 <img src="%s" width="%s" height="%s" alt="%s"> 替换为
<img data-src='%s' data-mimetype="%s" data-width='%s' data-height='%s' alt='%s' /> 的样式,并在 img 前添加 div 及占位符(mimetype 由图片地址推断得到)。
/**
 * Holder for the dimensions of a media item (video or image).
 * Accessors are not written out here; they are presumably generated by
 * Lombok's {@code @Data} — confirm Lombok is on the build path.
 */
@Data
public class ArticleMedia {
/**
* Video/image height
*/
private int height;
/**
* Video/image width
*/
private int width;
}
/**
 * Opens {@code path} as a URL, decodes it as an image and reports its dimensions.
 *
 * <p>This is a best-effort lookup: any failure (bad URL, I/O error, content that is
 * not a decodable image) is logged and an {@code ArticleMedia} with its default
 * width/height is returned instead of propagating the error.
 *
 * @param path URL of the image to inspect
 * @return a new {@code ArticleMedia}; width and height are only set when the image was decoded
 */
private static ArticleMedia getImageInfo(String path) {
    ArticleMedia articleMedia = new ArticleMedia();
    try {
        // The connection is only read from, so doOutput is intentionally left at its default.
        URLConnection connection = new URL(path).openConnection();
        // NOTE(review): no connect/read timeout is configured, so a stalled server can
        // block the caller indefinitely — consider setConnectTimeout/setReadTimeout.
        // ImageIO.read does not close the stream it is given, so close it here.
        try (java.io.InputStream in = connection.getInputStream()) {
            BufferedImage image = ImageIO.read(in);
            if (image == null) {
                // ImageIO.read returns null when no registered reader understands the data.
                log.warn("path {} file may not image", path);
            } else {
                articleMedia.setHeight(image.getHeight());
                articleMedia.setWidth(image.getWidth());
            }
        }
    } catch (IOException ex) {
        log.warn("path {} file not exist", path, ex);
    } catch (RuntimeException ex) {
        // Deliberate best-effort: a decoder failure must not abort the caller.
        log.warn("path {} file may not image", path, ex);
    } finally {
        log.info("path {} article info {}", path, articleMedia);
    }
    return articleMedia;
}
/**
 * Rewrites every {@code <img ...>} tag in {@code content} into the custom lazy-load
 * markup produced by {@code imgReplace}, and renames {@code <div class="pgc-img">}
 * wrappers to {@code <div class="my-custom-image">}.
 *
 * <p>Tags are located by index scanning rather than by regular expression. A tag is
 * left untouched when it has no {@code src} attribute or when
 * {@code ImageUtils.getMimeType} yields a blank value for its URL. When width or
 * height is missing or not a plain integer, the dimensions are looked up with
 * {@code getImageInfo}.
 *
 * <p>NOTE(review): a tag is considered to end at the first {@code '>'}, so a literal
 * {@code '>'} inside an attribute value truncates it. If the content contains at least
 * one {@code pgc-img} wrapper, every image is assumed to be wrapped already.
 *
 * @param content HTML fragment to convert; may be {@code null}
 * @return the converted HTML, or {@code null} when {@code content} is {@code null} or empty
 */
private static String formatContent(String content) {
    if (content == null || content.length() == 0) {
        return null;
    }
    final String sourceWrapper = "<div class=\"pgc-img\">";
    boolean hasImageDiv = content.contains(sourceWrapper);
    if (hasImageDiv) {
        // Literal replacement; no regular expression is needed here.
        content = content.replace(sourceWrapper, "<div class=\"my-custom-image\">");
    }

    StringBuilder html = new StringBuilder(content);
    int searchFrom = 0;
    while (true) {
        int imgBeginIndex = html.indexOf("<img", searchFrom);
        if (imgBeginIndex < 0) {
            break;
        }
        int imgEndIndex = html.indexOf(">", imgBeginIndex);
        if (imgEndIndex < 0) {
            break;
        }
        // Resume right after this tag by default, so an adjacent <img> is never skipped.
        searchFrom = imgEndIndex + 1;

        // All attribute lookups are confined to this tag's own text.
        String tag = html.substring(imgBeginIndex, imgEndIndex + 1);
        String picUrl = extractAttribute(tag, "src");
        if (picUrl == null || picUrl.isEmpty()) {
            log.warn("img tag without src skipped: {}", tag);
            continue;
        }
        String mimeType = ImageUtils.getMimeType(picUrl);
        if (StringUtils.isBlank(mimeType)) {
            log.error("unknown mimetype of url:{}", picUrl);
            continue;
        }

        int width = parseDimension(extractAttribute(tag, "width"));
        int height = parseDimension(extractAttribute(tag, "height"));
        String altValue = extractAttribute(tag, "alt");
        String alt = altValue == null ? "" : altValue;

        // The placeholder needs both dimensions; look them up when either is unknown.
        if (width == 0 || height == 0) {
            ArticleMedia media = getImageInfo(picUrl);
            if (media.getWidth() > 0 && media.getHeight() > 0) {
                width = media.getWidth();
                height = media.getHeight();
            }
        }

        String replacement = imgReplace(picUrl, mimeType, width, height, alt, hasImageDiv);
        html.replace(imgBeginIndex, imgEndIndex + 1, replacement);
        // Skip over the inserted markup so its own <img> is not processed again.
        searchFrom = imgBeginIndex + replacement.length();
    }
    return html.toString();
}

/**
 * Returns the value of attribute {@code name} (case-insensitive) from a single
 * {@code <img ...>} tag, or {@code null} when the attribute is absent or its quoted
 * value is unterminated. Double-quoted, single-quoted and unquoted values are accepted.
 */
private static String extractAttribute(String tag, String name) {
    int length = tag.length();
    int pos = "<img".length();
    while (pos < length) {
        // Skip whitespace and the '/' of a self-closing tag between attributes.
        while (pos < length && (Character.isWhitespace(tag.charAt(pos)) || tag.charAt(pos) == '/')) {
            pos++;
        }
        if (pos >= length || tag.charAt(pos) == '>') {
            return null;
        }
        int nameStart = pos;
        while (pos < length) {
            char c = tag.charAt(pos);
            if (Character.isWhitespace(c) || c == '=' || c == '>' || c == '/') {
                break;
            }
            pos++;
        }
        String attrName = tag.substring(nameStart, pos);
        while (pos < length && Character.isWhitespace(tag.charAt(pos))) {
            pos++;
        }
        String value = "";
        if (pos < length && tag.charAt(pos) == '=') {
            pos++;
            while (pos < length && Character.isWhitespace(tag.charAt(pos))) {
                pos++;
            }
            if (pos >= length) {
                return null;
            }
            char quote = tag.charAt(pos);
            if (quote == '"' || quote == '\'') {
                int close = tag.indexOf(quote, pos + 1);
                if (close < 0) {
                    return null;
                }
                value = tag.substring(pos + 1, close);
                pos = close + 1;
            } else {
                int valueStart = pos;
                while (pos < length && !Character.isWhitespace(tag.charAt(pos)) && tag.charAt(pos) != '>') {
                    pos++;
                }
                value = tag.substring(valueStart, pos);
            }
        }
        if (attrName.equalsIgnoreCase(name)) {
            return value;
        }
    }
    return null;
}

/**
 * Parses a width/height attribute value; returns 0 ("unknown") for {@code null},
 * non-integer values such as {@code "100%"}, and negative numbers.
 */
private static int parseDimension(String raw) {
    if (raw == null) {
        return 0;
    }
    try {
        return Math.max(Integer.parseInt(raw.trim()), 0);
    } catch (NumberFormatException ignored) {
        // Not a plain integer; the caller falls back to inspecting the image itself.
        return 0;
    }
}
/**
 * Builds the replacement markup for one image: a placeholder {@code div} whose
 * {@code padding-bottom} is the image's height/width ratio as a percentage, followed
 * by an {@code img} tag carrying the URL, MIME type and dimensions in {@code data-*}
 * attributes.
 *
 * @param imgUrl      image URL, written to {@code data-src}
 * @param mimeType    MIME type, written to {@code data-mimetype}
 * @param width       image width; when it is not positive the ratio is emitted as 0
 * @param height      image height
 * @param alt         alternative text, written to {@code alt}
 * @param hasImageDiv {@code true} when the image already sits inside a
 *                    {@code my-custom-image} wrapper, so no wrapper is added here
 * @return the generated HTML fragment
 */
private static String imgReplace(String imgUrl, String mimeType, int width, int height, String alt, boolean hasImageDiv) {
    StringBuilder sb = new StringBuilder();
    if (!hasImageDiv) {
        sb.append("\n <div class=\"my-custom-image\">\n");
    }
    String templateStr = " <!-- 图片占位元素,padding-bottom 的值由图片本身真实的高度除以宽度转化为百分数 -->\n" +
            " <div class=\"my-custom-imageplaceholder\" style=\"width: %.0f%%; padding-bottom: %.3f%%;\"></div>\n" +
            " <img data-src='%s' data-mimetype=\"%s\" data-width='%s' data-height='%s' alt='%s' />\n";
    // Floating-point division keeps the fractional part of the ratio; an unknown
    // (non-positive) width yields 0 instead of dividing by zero.
    double paddingBottomPercent = width > 0 ? height * 100.0 / width : 0.0;
    // These values land inside single-quoted attributes, so a raw ' would end them early.
    String safeUrl = imgUrl == null ? null : imgUrl.replace("'", "&#39;");
    String safeAlt = alt == null ? null : alt.replace("'", "&#39;");
    // Locale.ROOT guarantees '.' as the decimal separator, which CSS requires.
    sb.append(String.format(java.util.Locale.ROOT, templateStr,
            100.0, paddingBottomPercent, safeUrl, mimeType, width, height, safeAlt));
    if (!hasImageDiv) {
        sb.append(" </div>\n");
    }
    return sb.toString();
}
/**
 * Demo entry point: converts a sample article fragment containing two
 * {@code <img>} tags with {@code formatContent} and prints the result to stdout.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // The sample is split at tag boundaries for readability; the pieces concatenate
    // to one compile-time constant.
    final String sampleHtml =
            "<img src=\"http://p3.pstatp.com/large/pgc-image/f67024509a3e4d639bdd885d4dd9b41a\" width=\"640\" height=\"458\" alt=\"清朝:买官和考科举当官,哪一个难度更大?\" inline=\"0\">"
            + "<p class=\"pgc-img-caption\"></p>"
            + "<p class=\"ql-align-justify\">\u200B</p>"
            + "<p class=\"ql-align-justify\">先来说说科举,科举出现于隋唐,在明、清时形成定制。当时科举的选题不仅有固定的判断标准,而且监考过程极为严格。</p>"
            + "<p class=\"ql-align-justify\">清朝因为科举作弊的案件,杀过许多考官。对于一个读书人来说,当他们走上科举这条路后,就必须先参加地方举办的童生试。</p>"
            + "<p class=\"ql-align-justify\"><br></p>"
            + "<img src=\"http://p1.pstatp.com/large/pgc-image/822159119cf940d3b210002b777c31b8\" width=\"640\" height=\"427\" alt=\"清朝:买官和考科举当官,哪一个难度更大?\" inline=\"0\">";
    String converted = formatContent(sampleHtml);
    System.out.println(converted);
}