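* Extracts the image sources referenced in a piece of HTML text.
* @param htmlStr the HTML text to scan
* @return the list of captured {@code src} attribute values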
*/
public static List<String> getImgStr(String htmlStr) {
    // Returns the src attribute value of every <img> tag found in htmlStr, in document order.
    // Tags without a src attribute are skipped; a null or empty input yields an empty list.
    List<String> pics = new ArrayList<String>();
    if (htmlStr == null || htmlStr.isEmpty()) {
        return pics;
    }
    Matcher tagMatcher = IMG_TAG_PATTERN.matcher(htmlStr);
    while (tagMatcher.find()) {
        // Scan only the tag that was just matched. Re-scanning an accumulated string of all
        // previous tags would add the earlier sources again on every iteration.
        Matcher srcMatcher = IMG_SRC_ATTR_PATTERN.matcher(tagMatcher.group());
        if (!srcMatcher.find()) {
            continue;
        }
        // Exactly one of the three alternatives (double-quoted, single-quoted, unquoted) matched.
        String src = srcMatcher.group(1);
        if (src == null) {
            src = srcMatcher.group(2);
        }
        if (src == null) {
            src = srcMatcher.group(3);
        }
        pics.add(src);
    }
    return pics;
}

/**
 * Matches one complete {@code <img ...>} tag, case-insensitively. A plain {@code [^>]*} character
 * class is used (rather than a grouped alternation) so very long attribute values such as
 * embedded Base64 data do not cause deep regex recursion.
 */
private static final Pattern IMG_TAG_PATTERN =
        Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

/**
 * Matches the {@code src} attribute inside a single tag. Group 1 holds a double-quoted value,
 * group 2 a single-quoted value and group 3 an unquoted value. The look-behind keeps attributes
 * that merely end in "src" (for example {@code data-src}) from being picked up.
 */
private static final Pattern IMG_SRC_ATTR_PATTERN = Pattern.compile(
        "(?<![\\w-])src\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
        Pattern.CASE_INSENSITIVE);
}
/**
 * Matches an {@code <img} tag up to its {@code src} attribute. Group 1 captures everything that
 * follows {@code src=} up to, but not including, the next {@code >}, so it still carries the
 * surrounding quotes and any attributes that come after the value.
 */
public static final Pattern PATTERN = Pattern.compile("<img\\s+(?:[^>]*)src\\s*=\\s*([^>]+)", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

/**
 * Extracts the {@code src} values of the {@code <img>} tags in an HTML text.
 * The original note describes the images as Base64-typed (presumably {@code data:} URIs);
 * nothing in this method is Base64-specific, the value is returned as written in the HTML.
 *
 * @param html the HTML text to scan; {@code null} is treated as empty
 * @return the {@code src} values in document order; empty when none are found
 */
public static List<String> getImgSrc(String html) {
    List<String> list = new ArrayList<String>();
    if (html == null) {
        return list;
    }
    Matcher matcher = PATTERN.matcher(html);
    while (matcher.find()) {
        // Group 1 always participates in a match and holds at least one character.
        String value = parseSrcValue(matcher.group(1));
        if (value != null) {
            list.add(value);
        }
    }
    return list;
}

/**
 * Reduces the raw text captured after {@code src=} to the attribute value itself.
 * Escape sequences inside the value are not interpreted.
 *
 * @param raw non-empty text that follows {@code src=} inside the tag
 * @return the attribute value, or {@code null} when no value is present
 */
private static String parseSrcValue(String raw) {
    char first = raw.charAt(0);
    if (first == '\'' || first == '"') {
        int close = raw.indexOf(first, 1);
        if (close > 0) {
            return raw.substring(1, close);
        }
        // Unterminated quote (malformed HTML): drop the opening quote and fall back to the
        // unquoted rule instead of failing in substring() with a negative end index.
        raw = raw.substring(1);
    }
    // An unquoted value ends at the first whitespace character. trim() guarantees split()
    // returns at least one element, so a whitespace-only capture cannot index an empty array.
    String[] tokens = raw.trim().split("\\s");
    String token = tokens.length == 0 ? "" : tokens[0];
    return token.isEmpty() ? null : token;
}