Html标签文本查重标红(算法端不支持对html标签文本查重)

    /** Opening tag inserted before a highlighted range: an inline span coloured #e53639. */
    public static final String startTag = "<span style='color:#e53639'>";
    /** Closing tag inserted after a highlighted range; pairs with {@link #startTag}. */
    public static final String endTag = "</span>";

    /**
     * Converts a camelCase identifier to UPPER_SNAKE_CASE, e.g. {@code "simContent"}
     * becomes {@code "SIM_CONTENT"}.
     *
     * <p>An underscore is placed before every ASCII uppercase letter except one at
     * index 0, so {@code "CamelCase"} yields {@code "CAMEL_CASE"} rather than a name
     * with a leading underscore. Consecutive capitals are each separated
     * ({@code "aBC"} becomes {@code "A_B_C"}).
     *
     * @param camel the camelCase text; must not be {@code null}
     * @return the upper-cased, underscore-separated form
     * @throws NullPointerException if {@code camel} is {@code null}
     */
    public static String camel2Snake(String camel) {
        // Build into a fresh buffer instead of inserting into the one being scanned:
        // inserting in place shifts the capital to the next index, where it would be
        // matched again on the following iteration and the loop would never finish.
        StringBuilder snake = new StringBuilder(camel.length() + 8);
        for (int i = 0; i < camel.length(); i++) {
            char c = camel.charAt(i);
            if (i > 0 && c >= 'A' && c <= 'Z') {
                snake.append('_');
            }
            snake.append(c);
        }
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish dotted/dotless i).
        return snake.toString().toUpperCase(java.util.Locale.ROOT);
    }

    /**
     * Transfers plain-text highlight ranges onto the corresponding HTML and wraps
     * them in {@link #startTag} / {@link #endTag}.
     *
     * <p>For every element of {@code result} this reads {@code contentHtml} and
     * {@code sim_content.target}, un-escapes the HTML, extracts its plain text,
     * and maps each clause's {@code detail} offsets from text positions to HTML
     * positions. The marked-up HTML is written back to {@code contentHtml}, and the
     * {@code sim_content} and {@code sim_title} keys are removed. The array is
     * modified in place.
     *
     * <p>A clause whose start or end offset has no HTML counterpart is reported on
     * standard output and skipped, so that a start tag is never emitted without
     * its matching end tag (or vice versa).
     *
     * @param result array of JSON objects, each carrying {@code contentHtml} and
     *               {@code sim_content}
     * @return the same {@code result} instance, after modification
     */
    public static JSONArray mappingHtmlText(JSONArray result) {
        for (Object o : result) {
            // Pull the fields needed from the result entry.
            JSONObject jsonObject = (JSONObject) o;
            JSONObject sim_content = jsonObject.getJSONObject("sim_content");
            JSONArray target = sim_content.getJSONArray("target");
            String html = jsonObject.getString("contentHtml");
            html = HtmlUtil.unescape(html);
            String text = Jsoup.parse(html).text();
            LinkedHashMap<Integer, Integer> map = getMappingMap(html, text);

            // HTML indices before which a start tag is inserted.
            LinkedList<Integer> startTagList = new LinkedList<>();
            // HTML indices after which an end tag is inserted.
            LinkedList<Integer> endTagList = new LinkedList<>();
            for (Object o1 : target) {
                JSONObject clause = (JSONObject) o1;
                JSONArray detail = clause.getJSONArray("detail");
                Integer start = detail.getInteger(0);
                // detail[1] is reduced by one to address the last highlighted character
                // (presumably the upstream offset is exclusive -- confirm with the producer).
                Integer end = detail.getInteger(1) - 1;
                Integer htmlStart = map.get(start);
                Integer htmlEnd = map.get(end);
                if (htmlStart == null || htmlEnd == null) {
                    System.out.println(String.format("获取索引异常:%d=>%d", start, end));
                    // Recording only one side would leave an unbalanced <span> in the output.
                    continue;
                }
                startTagList.add(htmlStart);
                endTagList.add(htmlEnd);
            }
            String markHtml = getMarkHtml(html, startTagList, endTagList);
            jsonObject.put("contentHtml", markHtml);
            jsonObject.remove("sim_content");
            jsonObject.remove("sim_title");
        }
        return result;
    }

    /**
     * Builds the highlighted HTML by inserting {@link #startTag} before every index
     * in {@code startTagList} and {@link #endTag} after every index in
     * {@code endTagList}.
     *
     * <p>Each index produces at most one tag even if it occurs several times in a
     * list; {@code null} entries are ignored.
     *
     * @param html         the HTML to mark up
     * @param startTagList character indices in {@code html} that open a highlight
     * @param endTagList   character indices in {@code html} that close a highlight
     * @return {@code html} with the highlight tags inserted
     */
    private static String getMarkHtml(String html, LinkedList<Integer> startTagList, LinkedList<Integer> endTagList) {
        // Hash lookups replace a linear list scan per character.
        java.util.Set<Integer> starts = new java.util.HashSet<>(startTagList);
        java.util.Set<Integer> ends = new java.util.HashSet<>(endTagList);
        int extra = starts.size() * startTag.length() + ends.size() * endTag.length();
        StringBuilder markHtml = new StringBuilder(html.length() + extra);
        for (int i = 0; i < html.length(); i++) {
            if (starts.contains(i)) {
                markHtml.append(startTag);
            }
            markHtml.append(html.charAt(i));
            if (ends.contains(i)) {
                markHtml.append(endTag);
            }
        }
        return markHtml.toString();
    }

    /**
     * Maps each character index of the plain text to the index of the same
     * character in the HTML source.
     *
     * <p>The text is walked left to right; for each character the HTML is searched
     * forward from just past the previous match for an equal character that
     * {@link #isNotInTag(char[], int)} reports as lying outside a tag. This relies
     * on the text characters appearing in the HTML in the same order. Text
     * characters with no match are simply absent from the map.
     *
     * <p>As a diagnostic, the first text index following each unmapped gap is
     * printed to standard output.
     *
     * @param html the HTML source
     * @param text the plain text extracted from {@code html}
     * @return text index to HTML index, in ascending text-index order
     */
    private static LinkedHashMap<Integer, Integer> getMappingMap(String html, String text) {
        char[] htmlChars = html.toCharArray();
        char[] textChars = text.toCharArray();
        LinkedHashMap<Integer, Integer> map = new LinkedHashMap<>();
        // First HTML index not yet consumed by a match.
        int next = 0;
        for (int i = 0; i < textChars.length; i++) {
            for (int j = next; j < htmlChars.length; j++) {
                if (textChars[i] == htmlChars[j] && isNotInTag(htmlChars, j)) {
                    map.put(i, j);
                    // Resume after the matched character; resuming on it would let a
                    // repeated character (e.g. "aa") map to the same HTML index twice.
                    next = j + 1;
                    break;
                }
            }
        }
        // Report gaps in the mapped text indices.
        int last = -1;
        for (Integer key : map.keySet()) {
            if (key != last + 1) {
                System.out.println(key);
            }
            // Always resynchronise so that one gap is reported once, not for every later key.
            last = key;
        }
        return map;
    }

    /**
     * Tells whether the character at index {@code j} lies outside an HTML tag,
     * judged by the nearest angle bracket.
     *
     * <p>The array is scanned backwards from {@code j - 1}: a {@code '>'} means a
     * tag has just closed (outside), a {@code '<'} means a tag is open (inside).
     * If no bracket precedes the position, the array is scanned forwards from
     * {@code j + 1}: a {@code '<'} means a tag has yet to open (outside), a
     * {@code '>'} means a tag is being closed (inside). With no brackets at all
     * the character is treated as outside.
     *
     * @param htmlChars the HTML source as characters
     * @param j         index of the character to classify
     * @return {@code true} if the character is outside any tag, {@code false} if inside
     */
    private static boolean isNotInTag(char[] htmlChars, int j) {
        // Separate scan indices keep j intact; reusing j left it at -1 after the
        // backward pass and made the forward pass read htmlChars[-1].
        for (int k = j - 1; k >= 0; k--) {
            if (htmlChars[k] == '>') {
                return true;
            }
            if (htmlChars[k] == '<') {
                return false;
            }
        }
        for (int k = j + 1; k < htmlChars.length; k++) {
            if (htmlChars[k] == '<') {
                return true;
            }
            if (htmlChars[k] == '>') {
                return false;
            }
        }
        return true;
    }

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值