AC自动机实现自然语言关键词高亮

AC自动机实现自然语言关键词高亮

Maven 依赖坐标

  		<dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.7.8</version>
        </dependency>

代码

import com.cmbchina.gsm.config.GSMGlobalConfig;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;

/**
 * Keyword highlighting backed by an Aho-Corasick automaton (HanLP's
 * {@code AhoCorasickDoubleArrayTrie}).
 *
 * <p>Typical use: build the automaton once from the keyword set with
 * {@code buildACDT}, then call {@code highLight} for every text to decorate.
 *
 * @version 1.0
 * @author IT803300
 * @date 2022-08-15
 */
public class KeywordMatchUtil {

    /**
     * Builds an Aho-Corasick automaton whose entries map each keyword to itself.
     *
     * <p>{@code null} and empty keywords are skipped: a {@code null} key makes
     * {@code TreeMap.put} throw, and an empty keyword can never cover a character,
     * so neither can contribute to highlighting.
     *
     * @param keywords keywords to recognise
     * @return the automaton built from the usable keywords
     * @throws NullPointerException if {@code keywords} itself is {@code null}
     */
    public static AhoCorasickDoubleArrayTrie<String> buildACDT(Set<String> keywords) {
        // The trie is built from a sorted map; key and value are both the keyword.
        TreeMap<String, String> dictionary = new TreeMap<>();
        for (String keyword : keywords) {
            if (keyword != null && !keyword.isEmpty()) {
                dictionary.put(keyword, keyword);
            }
        }
        // NOTE(review): behaviour of build() on an empty dictionary depends on the
        // HanLP version - confirm before relying on an empty keyword set.
        AhoCorasickDoubleArrayTrie<String> acdt = new AhoCorasickDoubleArrayTrie<>();
        acdt.build(dictionary);
        return acdt;
    }

    /**
     * Wraps every stretch of text covered by at least one keyword hit in
     * {@code GSMGlobalConfig.highlightPrefix} / {@code GSMGlobalConfig.highlightSuffix}.
     *
     * <p>Overlapping or directly adjacent hits are merged into a single highlighted
     * run, so prefix and suffix are always emitted in balanced pairs.
     *
     * @param originText text to scan; {@code null} or empty input is returned unchanged
     * @param acdt       automaton produced by {@link #buildACDT(Set)}
     * @return the text with highlight markers inserted around matched runs
     */
    public static String highLight(String originText, AhoCorasickDoubleArrayTrie<String> acdt) {
        if (originText == null || originText.isEmpty()) {
            return originText;
        }

        final int length = originText.length();
        // highlighted[i] is true when character i lies inside at least one hit.
        final boolean[] highlighted = new boolean[length];
        // Each hit is reported as the half-open range [begin, end); mark it directly.
        // A zero-length hit (begin == end) marks nothing.
        acdt.parseText(originText, (begin, end, value) -> {
            for (int i = begin; i < end; i++) {
                highlighted[i] = true;
            }
        });

        StringBuilder result = new StringBuilder(length + 16);
        boolean insideRun = false;
        for (int i = 0; i < length; i++) {
            if (highlighted[i] != insideRun) {
                // Crossing a run boundary: close the current run or open a new one.
                if (insideRun) {
                    result.append(GSMGlobalConfig.highlightSuffix);
                } else {
                    result.append(GSMGlobalConfig.highlightPrefix);
                }
                insideRun = !insideRun;
            }
            result.append(originText.charAt(i));
        }
        // A run that reaches the end of the text still needs its closing marker.
        if (insideRun) {
            result.append(GSMGlobalConfig.highlightSuffix);
        }
        return result.toString();
    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
以下是AC自动机在Java中的实现代码:

```java
import java.util.LinkedList;
import java.util.Queue;

class ACNode {
    public char data;
    public ACNode[] children = new ACNode[26];
    public boolean isEndingChar = false;
    public int length = -1;
    public ACNode fail;

    public ACNode(char data) {
        this.data = data;
    }
}

class ACAutomaton {
    private ACNode root;

    public ACAutomaton() {
        root = new ACNode('/');
    }

    public void insert(String text) {
        ACNode p = root;
        for (int i = 0; i < text.length(); i++) {
            int index = text.charAt(i) - 'a';
            if (p.children[index] == null) {
                ACNode newNode = new ACNode(text.charAt(i));
                p.children[index] = newNode;
            }
            p = p.children[index];
        }
        p.isEndingChar = true;
        p.length = text.length();
    }

    public void buildFailurePointer() {
        Queue<ACNode> queue = new LinkedList<>();
        root.fail = null;
        queue.add(root);
        while (!queue.isEmpty()) {
            ACNode p = queue.remove();
            for (int i = 0; i < 26; i++) {
                ACNode pc = p.children[i];
                if (pc == null) continue;
                if (p == root) {
                    pc.fail = root;
                } else {
                    ACNode q = p.fail;
                    while (q != null) {
                        ACNode qc = q.children[pc.data - 'a'];
                        if (qc != null) {
                            pc.fail = qc;
                            break;
                        }
                        q = q.fail;
                    }
                    if (q == null) {
                        pc.fail = root;
                    }
                }
                queue.add(pc);
            }
        }
    }

    public int match(String text) {
        ACNode p = root;
        int n = text.length();
        for (int i = 0; i < n; i++) {
            int index = text.charAt(i) - 'a';
            while (p.children[index] == null && p != root) {
                p = p.fail;
            }
            p = p.children[index];
            if (p == null) p = root;
            ACNode tmp = p;
            while (tmp != root) {
                if (tmp.isEndingChar) {
                    return i - tmp.length + 1;
                }
                tmp = tmp.fail;
            }
        }
        return -1;
    }
}
```

使用方法:

```java
ACAutomaton ac = new ACAutomaton();
ac.insert("he");
ac.insert("she");
ac.insert("his");
ac.insert("hers");
ac.buildFailurePointer();
System.out.println(ac.match("ushers"));
```

输出:1

这个例子中,AC自动机在"ushers"中最先匹配到的模式串是"she"(占据下标1~3)。match 返回的是该匹配的起始下标 i - length + 1 = 3 - 3 + 1,所以返回1。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值