/** Opening tag placed in front of a highlighted fragment; styles the text with color #e53639. */
public static final String startTag = "<span style='color:#e53639'>";
/** Closing tag placed after a highlighted fragment; pairs with {@code startTag}. */
public static final String endTag = "</span>";
/**
 * Converts a camelCase identifier to UPPER_SNAKE_CASE.
 *
 * <p>An underscore is placed in front of every ASCII uppercase letter except one at index 0,
 * and the whole result is upper-cased with {@code Locale.ROOT} so the output does not depend
 * on the JVM default locale. For example {@code "userId"} becomes {@code "USER_ID"} and
 * {@code "CamelCase"} becomes {@code "CAMEL_CASE"}.
 *
 * @param camel the identifier to convert; must not be {@code null}
 * @return the upper snake case form of {@code camel}
 * @throws NullPointerException if {@code camel} is {@code null}
 */
public static String camel2Snake(String camel) {
    // Build into a separate buffer: inserting into the buffer being scanned shifts the
    // uppercase letter one slot to the right, where the next iteration would find it again
    // and loop forever.
    StringBuilder snake = new StringBuilder(camel.length() + 8);
    for (int i = 0; i < camel.length(); i++) {
        char c = camel.charAt(i);
        if (i > 0 && c >= 'A' && c <= 'Z') {
            snake.append('_');
        }
        snake.append(c);
    }
    return snake.toString().toUpperCase(java.util.Locale.ROOT);
}
/**
 * Wraps the matched fragments of every result item in highlight tags inside its HTML content.
 *
 * <p>For each element of {@code result} the method reads the HTML stored under
 * {@code "contentHtml"}, unescapes it, extracts its plain text with Jsoup and builds a
 * text-index to HTML-index mapping. Every entry of {@code sim_content.target} supplies a
 * {@code detail} array whose first two integers are used as a start (inclusive) and end
 * (exclusive) offset; presumably these are offsets into the extracted plain text — confirm
 * against the producer of this payload. The mapped HTML positions are marked and the marked
 * HTML replaces {@code "contentHtml"}; the {@code "sim_content"} and {@code "sim_title"} keys
 * are then removed.
 *
 * <p>A clause whose start or end offset has no HTML mapping is reported on standard output and
 * skipped, and a clause whose mapped end lies before its mapped start is skipped, so that no
 * unbalanced highlight tag is written.
 *
 * @param result array of result objects; its elements are modified in place
 * @return the same {@code result} instance
 */
public static JSONArray mappingHtmlText(JSONArray result) {
    for (Object item : result) {
        // Pull the fields needed from the response element.
        JSONObject jsonObject = (JSONObject) item;
        JSONObject simContent = jsonObject.getJSONObject("sim_content");
        JSONArray target = simContent.getJSONArray("target");
        String html = HtmlUtil.unescape(jsonObject.getString("contentHtml"));
        String text = Jsoup.parse(html).text();
        LinkedHashMap<Integer, Integer> map = getMappingMap(html, text);
        // HTML indexes in front of which an opening tag goes.
        LinkedList<Integer> startTagList = new LinkedList<>();
        // HTML indexes after which a closing tag goes.
        LinkedList<Integer> endTagList = new LinkedList<>();
        for (Object entry : target) {
            JSONObject clause = (JSONObject) entry;
            JSONArray detail = clause.getJSONArray("detail");
            Integer start = detail.getInteger(0);
            // detail[1] is exclusive; convert to the index of the last highlighted character.
            Integer end = detail.getInteger(1) - 1;
            Integer htmlStart = map.get(start);
            Integer htmlEnd = map.get(end);
            if (htmlStart == null || htmlEnd == null) {
                System.out.println(String.format("获取索引异常:%d=>%d", start, end));
                // Adding only one side of the pair would leave an unbalanced tag.
                continue;
            }
            if (htmlEnd < htmlStart) {
                // Empty or inverted range: the closing tag would precede the opening tag.
                continue;
            }
            startTagList.add(htmlStart);
            endTagList.add(htmlEnd);
        }
        String markHtml = getMarkHtml(html, startTagList, endTagList);
        jsonObject.put("contentHtml", markHtml);
        jsonObject.remove("sim_content");
        jsonObject.remove("sim_title");
    }
    return result;
}
/**
 * Builds a copy of {@code html} with highlight tags inserted at the given positions.
 *
 * <p>{@code startTag} is written in front of the character at every index listed in
 * {@code startTagList}, and {@code endTag} is written after the character at every index
 * listed in {@code endTagList}. An index that occurs several times produces that many tags,
 * so ranges that share a boundary stay balanced. {@code null} entries and indexes outside
 * {@code html} are ignored.
 *
 * @param html the HTML source to mark
 * @param startTagList indexes into {@code html} that begin a highlighted range
 * @param endTagList indexes into {@code html} of the last character of a highlighted range
 * @return the marked HTML
 */
private static String getMarkHtml(String html, LinkedList<Integer> startTagList, LinkedList<Integer> endTagList) {
    int length = html.length();
    // Per-index counters give O(1) lookups instead of a linked-list scan per character.
    int[] opens = countTagsPerIndex(startTagList, length);
    int[] closes = countTagsPerIndex(endTagList, length);
    StringBuilder markHtml = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        for (int k = 0; k < opens[i]; k++) {
            markHtml.append(startTag);
        }
        markHtml.append(html.charAt(i));
        for (int k = 0; k < closes[i]; k++) {
            markHtml.append(endTag);
        }
    }
    return markHtml.toString();
}

/** Counts how often each in-range, non-null index occurs in {@code indexes}. */
private static int[] countTagsPerIndex(LinkedList<Integer> indexes, int length) {
    int[] counts = new int[length];
    if (indexes != null) {
        for (Integer index : indexes) {
            if (index != null && index >= 0 && index < length) {
                counts[index]++;
            }
        }
    }
    return counts;
}
/**
 * Maps each plain-text character index to the index of the same character in the HTML source.
 *
 * <p>The text is walked left to right; for every text character the HTML is scanned forward
 * from just after the previously matched position for an equal character that
 * {@code isNotInTag} accepts. This requires the text and the HTML to present their characters
 * in the same order. A text character with no match is left out of the map, and the search
 * for the next character resumes from the same HTML position.
 *
 * <p>After the mapping is built, every key that does not continue the unbroken run
 * {@code 0, 1, 2, ...} is printed to standard output as a diagnostic.
 *
 * @param html the HTML source
 * @param text the plain text extracted from {@code html}
 * @return insertion-ordered map from text index to HTML index
 */
private static LinkedHashMap<Integer, Integer> getMappingMap(String html, String text) {
    char[] htmlChars = html.toCharArray();
    char[] textChars = text.toCharArray();
    LinkedHashMap<Integer, Integer> map = new LinkedHashMap<>();
    int next = 0;
    for (int i = 0; i < textChars.length; i++) {
        for (int j = next; j < htmlChars.length; j++) {
            if (textChars[i] == htmlChars[j] && isNotInTag(htmlChars, j)) {
                map.put(i, j);
                // Resume after the consumed character; restarting at j would map a repeated
                // character (e.g. the second 'l' of "ll") to the same HTML index again.
                next = j + 1;
                break;
            }
        }
    }
    // Diagnostic: report text indexes that break the contiguous 0..n run of mapped keys.
    int last = -1;
    for (Integer key : map.keySet()) {
        if (key == last + 1) {
            last = key;
        } else {
            System.out.println(key);
        }
    }
    return map;
}
/**
 * Tells whether the character at index {@code j} lies outside an HTML tag.
 *
 * <p>The nearest angle bracket before {@code j} decides: {@code '>'} means a tag has just
 * closed (outside), {@code '<'} means a tag is open (inside). When no bracket precedes
 * {@code j}, the scan continues forward starting at {@code j} itself: meeting {@code '<'}
 * first means outside, meeting {@code '>'} first means inside. With no bracket at all the
 * character is treated as outside.
 *
 * @param htmlChars the HTML source as characters
 * @param j index of the character to classify
 * @return {@code true} if the character is not inside a tag
 */
private static boolean isNotInTag(char[] htmlChars, int j) {
    for (int k = j - 1; k >= 0; k--) {
        if (htmlChars[k] == '>') {
            return true;
        }
        if (htmlChars[k] == '<') {
            return false;
        }
    }
    // Use a separate cursor so the forward scan starts at j; reusing the backward cursor
    // would start it at -1 and read out of bounds.
    for (int k = j; k < htmlChars.length; k++) {
        if (htmlChars[k] == '<') {
            return true;
        }
        if (htmlChars[k] == '>') {
            return false;
        }
    }
    return true;
}
HTML标签文本查重标红(算法端不支持对HTML标签文本查重)
最新推荐文章于 2024-04-01 22:40:34 发布