package test;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helpers for splitting an HTML fragment around its {@code <img>} tags and for
 * extracting the {@code src} values of those tags.
 *
 * <p>The parsing is regex based and intended for simple, well-formed fragments;
 * it is not a general HTML parser.
 */
public class Test {

    /**
     * Matches one complete {@code <img ...>} start tag, ignoring case.
     *
     * <p>Quoted attribute values are consumed as a unit, so a {@code >} inside a
     * quoted value does not end the tag, and the match can never run past the
     * tag's own closing {@code >} into a following tag. The lookahead after
     * {@code <img} rejects longer tag names such as {@code <imgx>}.
     */
    private static final Pattern IMG_TAG = Pattern.compile(
            "<img(?=[\\s/>])(?:\"[^\"]*\"|'[^']*'|[^>\"']++)*+>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches {@code src=<quote>value<quote>} inside a tag, ignoring case.
     *
     * <p>The back-reference forces the closing quote to be the same character as
     * the opening one, so an apostrophe inside a double-quoted value is kept.
     * Group 2 is the value. Note that the match is on the text {@code src=}, so
     * an attribute whose name merely ends in {@code src} is not distinguished.
     */
    private static final Pattern SRC_ATTRIBUTE = Pattern.compile(
            "src\\s*=\\s*([\"'])(.*?)\\1",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Demo entry point: splits a sample fragment and prints the pieces.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String data = "<p>aa</p>"
                + "<br/>"
                + "<img alt=\"\" src=\"/a/201901.jpg\" id=\"pic1\" class=\"pic\"/>"
                + "<br/>"
                + "<p>bb</p>"
                + "<br/>"
                + "<img alt=\"\" src=\"/a/201902.jpg\" id=\"pic1\" class=\"pic\"/>"
                + "<br/>"
                + "<p>cc</p>"
                + "<br/>"
                + "<img alt=\"\" src=\"/a/201903.jpg\" id=\"pic1\" class=\"pic\"/>"
                + "<br/>"
                + "<p>dd</p>"
                + "<br/>"
                + "<img alt=\"\" src=\"/a/201904.jpg\" id=\"pic1\" class=\"pic\"/>"
                + "<br/>";

        // getTextFromHtml(data, true) on this input yields:
        //   [/a/201901.jpg, /a/201902.jpg, /a/201903.jpg, /a/201904.jpg]
        //
        // cutStringByImgTag(data) yields:
        //   [<p>aa</p><br/>,
        //    <img alt="" src="/a/201901.jpg" id="pic1" class="pic"/>,
        //    <br/><p>bb</p><br/>,
        //    <img alt="" src="/a/201902.jpg" id="pic1" class="pic"/>,
        //    <br/><p>cc</p><br/>,
        //    <img alt="" src="/a/201903.jpg" id="pic1" class="pic"/>,
        //    <br/><p>dd</p><br/>,
        //    <img alt="" src="/a/201904.jpg" id="pic1" class="pic"/>, <br/>]
        List<String> data1 = cutStringByImgTag(data);
        System.out.println(data1);
    }

    /**
     * Splits a string into alternating text chunks and {@code <img>} tag chunks.
     *
     * <p>Only {@code <img>} tags that carry a quoted {@code src} value become
     * chunks of their own; an {@code <img>} without one stays inside the
     * surrounding text chunk. Empty text chunks are not emitted, and the
     * concatenation of the returned chunks equals the input.
     *
     * @param targetStr the HTML fragment to split; may be {@code null}
     * @return the chunks in document order; empty for {@code null} or empty input
     */
    public static List<String> cutStringByImgTag(String targetStr) {
        List<String> splitTextList = new ArrayList<String>();
        if (targetStr == null) {
            return splitTextList;
        }
        Matcher tagMatcher = IMG_TAG.matcher(targetStr);
        int lastIndex = 0;
        while (tagMatcher.find()) {
            if (extractSrc(tagMatcher.group()) == null) {
                // No usable src: leave this tag as part of the surrounding text.
                continue;
            }
            if (tagMatcher.start() > lastIndex) {
                splitTextList.add(targetStr.substring(lastIndex, tagMatcher.start()));
            }
            splitTextList.add(tagMatcher.group());
            lastIndex = tagMatcher.end();
        }
        if (lastIndex != targetStr.length()) {
            splitTextList.add(targetStr.substring(lastIndex));
        }
        return splitTextList;
    }

    /**
     * Returns the {@code src} value of the last {@code <img>} tag in the content
     * that has one.
     *
     * <p>The tag forms {@code <img .../>}, {@code <img ...></img>} and
     * {@code <img ...>} are all recognised, in any letter case, with the value in
     * single or double quotes.
     *
     * @param content the HTML to scan; may be {@code null}
     * @return the {@code src} value, or {@code null} when no {@code <img>} tag
     *         with a quoted {@code src} is present
     */
    public static String getImgSrc(String content) {
        if (content == null) {
            return null;
        }
        String lastSrc = null;
        Matcher tagMatcher = IMG_TAG.matcher(content);
        while (tagMatcher.find()) {
            String src = extractSrc(tagMatcher.group());
            if (src != null) {
                // Later tags override earlier ones; tags without src are ignored.
                lastSrc = src;
            }
        }
        return lastSrc;
    }

    /**
     * Collects either the image addresses or the text chunks of an HTML fragment.
     *
     * <p>The fragment is split with {@link #cutStringByImgTag(String)}. Each chunk
     * that is exactly one {@code <img>} tag with a {@code src} contributes that
     * {@code src} to the image list; every other chunk goes to the text list
     * unchanged. Neither list ever contains {@code null}.
     *
     * @param html the HTML fragment; may be {@code null}
     * @param isGetImage {@code true} to return the image addresses,
     *        {@code false} to return the text chunks
     * @return the requested list, in document order
     */
    public static ArrayList<String> getTextFromHtml(String html, boolean isGetImage) {
        ArrayList<String> imageList = new ArrayList<>();
        ArrayList<String> textList = new ArrayList<>();
        for (String chunk : cutStringByImgTag(html)) {
            // Classify with the same patterns the splitter used, so the two
            // methods can never disagree about what an image chunk is.
            String src = IMG_TAG.matcher(chunk).matches() ? extractSrc(chunk) : null;
            if (src != null) {
                imageList.add(src);
            } else {
                textList.add(chunk);
            }
        }
        return isGetImage ? imageList : textList;
    }

    /** Returns the first quoted src value found in the given tag text, or null if there is none. */
    private static String extractSrc(String imgTag) {
        Matcher srcMatcher = SRC_ATTRIBUTE.matcher(imgTag);
        return srcMatcher.find() ? srcMatcher.group(2) : null;
    }
}
// Splitting fixed-format HTML text into an ArrayList.
// (Source page note: latest recommended article published on 2023-03-02 10:13:17.)