C++ 实现支持中文的 AC 自动机

在原博主代码的基础上稍作修改,使其可以处理中文、英文、表情符号和标点符号。

#include <string>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include <queue>
#include <iostream>
#include <map>

using namespace std;

// Number of transition slots per node: one slot for every byte value in the
// range [0x20, 0xFF]. Text is processed byte by byte, so a multi-byte UTF-8
// character is simply a path of several nodes.
#define VISIBLE_NUMBER 224

// One state (trie node) of the automaton.
struct StateNode {
    // True when a loaded pattern ends exactly at this node.
    bool finish_{false};
    // State id; the root is 0 and every other node gets a unique positive id.
    int state_{0};
    // Index (within the loaded pattern list) of the pattern ending here.
    // Only meaningful when finish_ is true.
    int num_{0};
    // Text of the pattern ending here; only meaningful when finish_ is true.
    std::string pattern_{};
    // Goto table: child for each byte slot, nullptr when there is no edge.
    std::vector<StateNode *> transition_table_{std::vector<StateNode *>(VISIBLE_NUMBER)};
};

// Byte-oriented Aho-Corasick automaton.
//
// Typical use: load_pattern() -> dispose() -> match(). The automaton owns all
// of its nodes and releases them in the destructor, so it is not copyable.
class TrieAc {
private:
    // Root of the trie (state 0).
    StateNode *start_node_;
    // Every node ever created, indexed by its state id; owns the nodes.
    std::vector<StateNode *> corresponding_node_;
    // Failure links indexed by state id; empty until dispose() has run.
    std::vector<StateNode *> fail_;

    // Maps a byte to its slot in a transition table, or -1 for control bytes
    // (< 0x20), which have no slot. The cast to unsigned char makes the result
    // independent of whether plain char is signed on the target platform.
    static int slot_of(char byte) {
        const int value = static_cast<unsigned char>(byte);
        return value >= 0x20 ? value - 0x20 : -1;
    }

    // True when the pattern is non-empty and every byte of it has a slot.
    static bool is_storable(const std::string &pattern) {
        if (pattern.empty()) {
            return false;
        }
        for (const char byte : pattern) {
            if (slot_of(byte) < 0) {
                return false;
            }
        }
        return true;
    }

    // Frees every node except the root and returns the root to its pristine
    // state, so that a new pattern set can be loaded.
    void discard_patterns() {
        for (std::size_t i = 1; i < corresponding_node_.size(); ++i) {
            delete corresponding_node_[i];
        }
        corresponding_node_.resize(1);
        fail_.clear();
        start_node_->finish_ = false;
        start_node_->num_ = 0;
        start_node_->pattern_.clear();
        start_node_->transition_table_.assign(VISIBLE_NUMBER, nullptr);
    }

public:
    TrieAc() : start_node_{new StateNode()} {
        // state0 is start_node_
        corresponding_node_.push_back(start_node_);
    }

    ~TrieAc() {
        for (StateNode *node : corresponding_node_) {
            delete node;
        }
    }

    // The automaton owns raw nodes; copying would lead to a double free.
    TrieAc(const TrieAc &) = delete;
    TrieAc &operator=(const TrieAc &) = delete;

    // Reads all patterns and builds the goto table. Replaces any pattern set
    // loaded earlier.
    void load_pattern(const std::vector<std::string> &_Patterns);

    // Builds the failure function for the currently loaded patterns.
    void dispose();

    // Scans _Str and inserts the index of every pattern found into matched.
    void match(const std::string &_Str, std::set<int> &matched);

};

// Holder for an automaton together with its pattern list and a tag map.
// NOTE(review): the constructor and quick_find_tags are only declared here;
// their definitions are not part of this file section.
class TreeModels {
public:
    TrieAc trieac;
    std::map<std::string, int> tagcollect;
    std::vector<std::string> patterns;

    void quick_find_tags(std::string &query);

    TreeModels();

    ~TreeModels() {};
};


// Builds the trie (goto table) from _Patterns.
//
// Any previously loaded pattern set is discarded first, so the indices later
// reported by match() always refer to positions in this _Patterns vector.
// Empty patterns and patterns containing a control byte (< 0x20) keep their
// index but are not inserted, because such bytes have no transition slot;
// they can therefore never be reported. When the same pattern occurs more
// than once, the index of its last occurrence is the one reported.
void TrieAc::load_pattern(const std::vector<std::string> &_Patterns) {
    discard_patterns();

    int pattern_index = 0;
    for (const auto &pattern : _Patterns) {
        const int current_index = pattern_index++;
        if (!is_storable(pattern)) {
            continue;
        }
        // Reserve up front so the push_back below cannot throw after a node
        // has been allocated (which would leak that node).
        corresponding_node_.reserve(corresponding_node_.size() + pattern.size());

        StateNode *node = start_node_;
        for (const char byte : pattern) {
            StateNode *&child = node->transition_table_[slot_of(byte)];
            if (child == nullptr) {
                child = new StateNode();
                child->state_ = static_cast<int>(corresponding_node_.size());
                corresponding_node_.push_back(child);
            }
            node = child;
        }
        node->finish_ = true;
        node->pattern_ = pattern;
        node->num_ = current_index;
    }
}

// Builds the failure function with a breadth-first walk over the trie.
//
// The failure link of a node points to the node for the longest proper suffix
// of its path that is also a path in the trie, or to the root when there is
// no such suffix.
void TrieAc::dispose() {
    // Every link defaults to the root; this is already the final value for
    // the root itself and for all of its direct children.
    fail_.assign(corresponding_node_.size(), start_node_);

    std::queue<StateNode *> pending;
    for (StateNode *child : start_node_->transition_table_) {
        if (child != nullptr) {
            pending.push(child);
        }
    }
    while (!pending.empty()) {
        StateNode *known = pending.front();
        pending.pop();
        for (int slot = 0; slot < VISIBLE_NUMBER; ++slot) {
            StateNode *child = known->transition_table_[slot];
            if (child == nullptr) {
                continue;
            }
            // Follow the parent's failure chain until a state with an edge
            // for this slot is found, or the root is reached.
            StateNode *fallback = fail_[known->state_];
            while (fallback != start_node_ && fallback->transition_table_[slot] == nullptr) {
                fallback = fail_[fallback->state_];
            }
            StateNode *target = fallback->transition_table_[slot];
            fail_[child->state_] = (target != nullptr) ? target : start_node_;
            pending.push(child);
        }
    }
}

// Scans _Str byte by byte and inserts into matched the index of every loaded
// pattern that occurs in it. Existing elements of matched are kept.
//
// If dispose() has not been called since the last load_pattern(), the failure
// function is built here first. A control byte (< 0x20) in _Str cannot be part
// of any stored pattern, so it resets the scan to the root.
void TrieAc::match(const std::string &_Str, std::set<int> &matched) {
    if (fail_.size() != corresponding_node_.size()) {
        dispose();
    }

    StateNode *current = start_node_;
    for (const char byte : _Str) {
        const int slot = slot_of(byte);
        if (slot < 0) {
            current = start_node_;
            continue;
        }
        // Fall back along failure links until the byte can be consumed.
        while (current != start_node_ && current->transition_table_[slot] == nullptr) {
            current = fail_[current->state_];
        }
        if (StateNode *next = current->transition_table_[slot]) {
            current = next;
        }
        // Report the pattern ending at this state and every pattern that is
        // a proper suffix of the current path (reachable via failure links).
        for (const StateNode *hit = current; hit != start_node_; hit = fail_[hit->state_]) {
            if (hit->finish_) {
                matched.insert(hit->num_);
            }
        }
    }
}

// Demo: builds an automaton from four Chinese keywords, scans a sample
// paragraph and prints every keyword found, followed by "end".
int main() {
    TrieAc trieac;

    const std::vector<std::string> patterns{"躺平","毫无波澜","顺从心理","加班"};
    const std::string query = "躺平,网络流行词,指无论对方做出什么反应,你内心都毫无波澜,对此不会有任何反应或者反抗,表示顺从心理。另外在部分语境中表示为:瘫倒在地,不再鸡血沸腾、渴求成功了。躺平看似是妥协、放弃,但其实是“向下突破天花板”,选择最无所作为的方式反叛裹挟。年轻人选择躺平,就是选择走向边缘,超脱于加班、升职、挣钱、买房的主流路径之外,用自己的方式消解外在环境对个体的规训。";

    // Build the goto table, then the failure links, then scan the text.
    trieac.load_pattern(patterns);
    trieac.dispose();

    std::set<int> matched;
    trieac.match(query, matched);

    // Each element of matched is an index into patterns.
    for (std::set<int>::const_iterator it = matched.begin(); it != matched.end(); ++it) {
        std::cout << patterns[*it] << std::endl;
    }
    std::cout << "end" << std::endl;

    return 0;
}
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值