Boost搜索引擎：项目整体代码及布局

最新推荐文章于 2024-09-10 18:38:40 发布

挣扎的泽

最新推荐文章于 2024-09-10 18:38:40 发布

阅读量456

点赞数 8

分类专栏： Boost搜索引擎文章标签：搜索引擎

本文链接：https://blog.csdn.net/2201_75324712/article/details/141068182

版权

Boost搜索引擎专栏收录该内容

9 篇文章 0 订阅

订阅专栏

项目在xshell上面的目录

例如：dict -> /home/dz/...

这是建立的动态链接方便编译器查找你也可将所安装的开发工具放到当前目录下，最终还是要在你代码中包含，所以你只需在你代码中科院指明要包含开发工具的头文件等就可以了，不是非要构建动态链接。

动态链接的构建

ln -s /home/dz/Project/Hpp/cppjieba/dict/ ./dict
       要链接的 头文件 或 目录 所在路径      目标路径 dict就是动态链接库
           最好是绝对路径

动态链接的删除

unlink dict

rm 动态链接名也可以删除动态链接，最好用unlink 动态链接名称

---------------------------------------------------------------------------------------------------------------------------------

以下代码就是Boost搜索引擎整体代码，代码实现细节Boost搜索引擎项目专栏中都有列举，请先参考代码实现细节再来整体观看项目代码，个人感觉会好一些。

该项目中索引构建模块实现，运用很多容器，需要掌握不同容器的特性，以及容器中存储的结构体信息，只有了解到，容器与容器之间的关系，结构体之间的关系，才能彻底看懂这个项目。

项目工具模块util.hpp

#pragma once

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <mutex>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include "log.hpp"
#include "cppjieba/Jieba.hpp"

namespace ns_util{
    class FileUtil{
        public:
            static bool ReadFile(const std::string &file_path, std::string *out)
            {
                std::ifstream in(file_path, std::ios::in);
                if(!in.is_open()){
                    std::cerr << "open file " << file_path << " error" << std::endl;
                    return false;
                }

                std::string line;
                while(std::getline(in, line)){ //如何理解getline读取到文件结束呢？？getline的返回值是一个&，while(bool), 本质是因为重载了强制类型转化
                    *out += line;
                }

                in.close();
                return true;
            }
    };
    class StringUtil{
        public:
            static void Split(const std::string &target, std::vector<std::string> *out, const std::string &sep)
            {
                //boost split
                boost::split(*out, target, boost::is_any_of(sep), boost::token_compress_on);
            }
    };

    const char* const DICT_PATH = "./dict/jieba.dict.utf8";
    const char* const HMM_PATH = "./dict/hmm_model.utf8";
    const char* const USER_DICT_PATH = "./dict/user.dict.utf8";
    const char* const IDF_PATH = "./dict/idf.utf8";
    const char* const STOP_WORD_PATH = "./dict/stop_words.utf8";

    class JiebaUtil{
        private:
            //static cppjieba::Jieba jieba;
            cppjieba::Jieba jieba;
            std::unordered_map<std::string, bool> stop_words;
        private:
            JiebaUtil():jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH)
            {}
            JiebaUtil(const JiebaUtil&) = delete;

            static JiebaUtil *instance;
        public:

            static JiebaUtil* get_instance()
            {
                static std::mutex mtx;
                if(nullptr == instance){
                    mtx.lock();
                    if(nullptr == instance){
                        instance = new JiebaUtil();
                        instance->InitJiebaUtil();
                    }
                    mtx.unlock();
                }

                return instance;
            }

            void InitJiebaUtil()
            {
                std::ifstream in(STOP_WORD_PATH);
                if(!in.is_open()){
                    LOG(FATAL, "load stop words file error");
                    return;
                }

                std::string line;
                while(std::getline(in, line)){
                    stop_words.insert({line, true});
                }

                in.close();
            }

            void CutStringHelper(const std::string &src, std::vector<std::string> *out)
            {
                jieba.CutForSearch(src, *out);
                for(auto iter = out->begin(); iter != out->end(); ){
                    auto it = stop_words.find(*iter);
                    if(it != stop_words.end()){
                        //说明当前的string 是暂停词，需要去掉
                        iter = out->erase(iter);
                    }
                    else{
                        iter++;
                    }
                }
            }

        public:
            static void CutString(const std::string &src, std::vector<std::string> *out)
            {
                ns_util::JiebaUtil::get_instance()->CutStringHelper(src, out);
                //jieba.CutForSearch(src, *out);
            }
    };

    JiebaUtil *JiebaUtil::instance = nullptr;
    //cppjieba::Jieba JiebaUtil::jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH);
}

日志模块log.hpp

#pragma once

#include <iostream>
#include <string>
#include <ctime>

#define NORMAL  1
#define WARNING 2
#define DEBUG   3
#define FATAL   4

#define LOG(LEVEL, MESSAGE) log(#LEVEL, MESSAGE, __FILE__, __LINE__)

void log(std::string level, std::string message, std::string file, int line)
{
    std::cout << "[" << level << "]" << "[" << time(nullptr) << "]" << "[" << message << "]" << "[" << file << " : " << line << "]" << std::endl;
}

数据处理模块parser.cc

#include <iostream>
#include <string>
#include <vector>
#include <boost/filesystem.hpp>
#include "util.hpp"

//是一个目录，下面放的是所有的html网页
const std::string src_path = "data/input";
const std::string output = "data/raw_html/raw.txt";

typedef struct DocInfo{
    std::string title;   //文档的标题
    std::string content; //文档内容
    std::string url;     //该文档在官网中的url
}DocInfo_t;

//const &: 输入
//*: 输出
//&：输入输出
bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list);
bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results);
bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output);

int main()
{
    std::vector<std::string> files_list;
    //第一步: 递归式的把每个html文件名带路径，保存到files_list中，方便后期进行一个一个的文件进行读取
    if(!EnumFile(src_path, &files_list)){
        std::cerr << "enum file name error!" << std::endl;
        return 1;

    }
    //第二步: 按照files_list读取每个文件的内容，并进行解析
    std::vector<DocInfo_t> results;
    if(!ParseHtml(files_list, &results)){
        std::cerr << "parse html error" << std::endl;
        return 2;
    }

    //第三步: 把解析完毕的各个文件内容，写入到output,按照\3作为每个文档的分割符
    if(!SaveHtml(results, output)){
        std::cerr << "sava html error" << std::endl;
        return 3;
    }

    return 0;
}

bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list)
{
    namespace fs = boost::filesystem;
    fs::path root_path(src_path);

    //判断路径是否存在，不存在，就没有必要再往后走了
    if(!fs::exists(root_path)){
        std::cerr << src_path << " not exists" << std::endl;
        return false;
    }

    //定义一个空的迭代器，用来进行判断递归结束
    fs::recursive_directory_iterator end;
    for(fs::recursive_directory_iterator iter(root_path); iter != end; iter++){
        //判断文件是否是普通文件，html都是普通文件
        if(!fs::is_regular_file(*iter)){ 
            continue;
        }
        if(iter->path().extension() != ".html"){ //判断文件路径名的后缀是否符合要求
            continue;
        }
        //std::cout << "debug: " << iter->path().string() << std::endl;
        //当前的路径一定是一个合法的，以.html结束的普通网页文件
        files_list->push_back(iter->path().string()); //将所有带路径的html保存在files_list,方便后续进行文本分析
    }

    return true;
}

static bool ParseTitle(const std::string &file, std::string *title)
{
    std::size_t begin = file.find("<title>");
    if(begin == std::string::npos){
        return false;
    }
    std::size_t end = file.find("</title>");
    if(end == std::string::npos){
        return false;
    }

    begin += std::string("<title>").size();

    if(begin > end){
        return false;
    }
    *title = file.substr(begin, end - begin);
    return true;
}

static bool ParseContent(const std::string &file, std::string *content)
{
    //去标签,基于一个简易的状态机
    enum status{
        LABLE,
        CONTENT
    };

    enum status s = LABLE;
    for( char c : file){
        switch(s){
            case LABLE:
                if(c == '>') s = CONTENT;
                break;
            case CONTENT:
                if(c == '<') s = LABLE;
                else {
                    //我们不想保留原始文件中的\n,因为我们想用\n作为html解析之后文本的分隔符
                    if(c == '\n') c = ' ';
                    content->push_back(c);
                }
                break;
            default:
                break;
        }
    }

    return true;
}

static bool ParseUrl(const std::string &file_path, std::string *url)
{
    std::string url_head = "https://www.boost.org/doc/libs/1_78_0/doc/html";
    std::string url_tail = file_path.substr(src_path.size());

    *url = url_head + url_tail;
    return true;
}

//for debug
static void ShowDoc( const DocInfo_t &doc)
{
    std::cout << "title: " << doc.title << std::endl;
    std::cout << "content: " << doc.content << std::endl;
    std::cout << "url: " << doc.url << std::endl;
}

bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results)
{
    for(const std::string &file : files_list){
        //1. 读取文件，Read();
        std::string result;
        if(!ns_util::FileUtil::ReadFile(file, &result)){
            continue;
        }
        DocInfo_t doc;
        //2. 解析指定的文件，提取title
        if(!ParseTitle(result, &doc.title)){
            continue;
        }
        //3. 解析指定的文件，提取content,就是去标签
        if(!ParseContent(result, &doc.content)){
            continue;
        }
        //4. 解析指定的文件路径，构建url
        if(!ParseUrl(file, &doc.url)){
            continue;
        }

        //done,一定是完成了解析任务，当前文档的相关结果都保存在了doc里面
        results->push_back(std::move(doc)); //bug:todo;细节，本质会发生拷贝，效率可能会比较低

        //for debug
        //ShowDoc(doc);
        //break;
    }
    return true;
}

bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output)
{
#define SEP '\3'
    //按照二进制方式进行写入
    std::ofstream out(output, std::ios::out | std::ios::binary);
    if(!out.is_open()){
        std::cerr << "open " << output << " failed!" << std::endl;
        return false;
    }

    //就可以进行文件内容的写入了
    for(auto &item : results){
        std::string out_string;
        out_string = item.title;
        out_string += SEP;
        out_string += item.content;
        out_string += SEP;
        out_string += item.url;
        out_string += '\n';

        out.write(out_string.c_str(), out_string.size());
    }

    out.close();

    return true;
}

索引构建模块index.hpp

#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <mutex>
#include "util.hpp"
#include "log.hpp"

namespace ns_index{

    struct DocInfo{
        std::string title;   //文档的标题
        std::string content; //文档对应的去标签之后的内容
        std::string url;     //官网文档url
        uint64_t doc_id;          //文档的ID，暂时先不做过多理解
    };

    struct InvertedElem{
        uint64_t doc_id;
        std::string word;
        int weight;
        InvertedElem():weight(0){}
    };

    //倒排拉链
    typedef std::vector<InvertedElem> InvertedList;

    class Index{
        private:
            //正排索引的数据结构用数组，数组的下标天然是文档的ID
            std::vector<DocInfo> forward_index; //正排索引
            //倒排索引一定是一个关键字和一组(个)InvertedElem对应[关键字和倒排拉链的映射关系]
            std::unordered_map<std::string, InvertedList> inverted_index;
        private:
            Index(){} //但是一定要有函数体，不能delete
            Index(const Index&) = delete;
            Index& operator=(const Index&) = delete;

            static Index* instance;
            static std::mutex mtx;
        public:
            ~Index(){}
        public:
            static Index* GetInstance()
            {
                //if(nullptr == instance){
                    //mtx.lock();
                    if(nullptr == instance){
                        instance = new Index();
                    }
                    //mtx.unlock();
                //}
                return instance;
            }
            //根据doc_id找到找到文档内容
            DocInfo *GetForwardIndex(uint64_t doc_id)
            {
                if(doc_id >= forward_index.size()){
                    std::cerr << "doc_id out range, error!" << std::endl;
                    return nullptr;
                }
                return &forward_index[doc_id];
            }

            //根据关键字string，获得倒排拉链
            InvertedList *GetInvertedList(const std::string &word)
            {
                auto iter = inverted_index.find(word);
                if(iter == inverted_index.end()){
                    std::cerr << word << " have no InvertedList" << std::endl;
                    return nullptr;
                }
                return &(iter->second);
            }
            //根据去标签，格式化之后的文档，构建正排和倒排索引
            //data/raw_html/raw.txt
            bool BuildIndex(const std::string &input) //parse处理完毕的数据交给我
            {
                std::ifstream in(input, std::ios::in | std::ios::binary);
                if(!in.is_open()){
                    std::cerr << "sorry, " << input << " open error" << std::endl;
                    return false;
                }

                std::string line;
                int count = 0;
                while(std::getline(in, line)){
                    DocInfo * doc = BuildForwardIndex(line);
                    if(nullptr == doc){
                        std::cerr << "build " << line << " error" << std::endl; //for deubg
                        continue;
                    }

                    BuildInvertedIndex(*doc);
                    count++;
                    //if(count % 50 == 0){
                        //std::cout <<"当前已经建立的索引文档: " << count <<std::endl;
                        LOG(NORMAL, "当前的已经建立的索引文档: " + std::to_string(count));
                    //}
                }
                return true;
            }
        private:
            DocInfo *BuildForwardIndex(const std::string &line)
            {
                //1. 解析line，字符串切分
                //line -> 3 string, title, content, url
                std::vector<std::string> results;
                const std::string sep = "\3";   //行内分隔符
                ns_util::StringUtil::Split(line, &results, sep);
                //ns_util::StringUtil::CutString(line, &results, sep);
                if(results.size() != 3){
                    return nullptr;
                }
                //2. 字符串进行填充到DocIinfo
                DocInfo doc;
                doc.title = results[0]; //title
                doc.content = results[1]; //content
                doc.url = results[2];   ///url
                doc.doc_id = forward_index.size(); //先进行保存id，在插入，对应的id就是当前doc在vector中的下标!
                //3. 插入到正排索引的vector
                forward_index.push_back(std::move(doc)); //doc,html文件内容
                return &forward_index.back();
            }

            bool BuildInvertedIndex(const DocInfo &doc)
            {
                //DocInfo{title, content, url, doc_id}
                //word -> 倒排拉链
                struct word_cnt{
                    int title_cnt;
                    int content_cnt;

                    word_cnt():title_cnt(0), content_cnt(0){}
                };
                std::unordered_map<std::string, word_cnt> word_map; //用来暂存词频的映射表

                //对标题进行分词
                std::vector<std::string> title_words;
                ns_util::JiebaUtil::CutString(doc.title, &title_words);

                //if(doc.doc_id == 1572){
                //    for(auto &s : title_words){
                //        std::cout << "title: " << s << std::endl;
                //    }
                //}

                //对标题进行词频统计
                for(std::string s : title_words){
                    boost::to_lower(s); //需要统一转化成为小写
                    word_map[s].title_cnt++; //如果存在就获取，如果不存在就新建
                }

                //对文档内容进行分词
                std::vector<std::string> content_words;
                ns_util::JiebaUtil::CutString(doc.content, &content_words);
                //if(doc.doc_id == 1572){
                //    for(auto &s : content_words){
                //        std::cout << "content: " << s << std::endl;
                //    }
                //}

                //对内容进行词频统计
                for(std::string s : content_words){
                    boost::to_lower(s);
                    word_map[s].content_cnt++;
                }

#define X 10
#define Y 1
                //Hello,hello,HELLO
                for(auto &word_pair : word_map){
                    InvertedElem item;
                    item.doc_id = doc.doc_id;
                    item.word = word_pair.first;
                    item.weight = X*word_pair.second.title_cnt + Y*word_pair.second.content_cnt; //相关性
                    InvertedList &inverted_list = inverted_index[word_pair.first];
                    inverted_list.push_back(std::move(item));
                }

                return true;
            }
    };
    Index* Index::instance = nullptr;
    std::mutex Index::mtx;
}

关键词搜索模块searcher.hpp

#pragma once

#include "index.hpp"
#include "util.hpp"
#include "log.hpp"
#include <algorithm>
#include <unordered_map>
#include <jsoncpp/json/json.h>

namespace ns_searcher{

    struct InvertedElemPrint{
        uint64_t doc_id;
        int weight;
        std::vector<std::string> words;
        InvertedElemPrint():doc_id(0), weight(0){}
    };

    class Searcher{
        private:
            ns_index::Index *index; //供系统进行查找的索引
        public:
            Searcher(){}
            ~Searcher(){}
        public:
            void InitSearcher(const std::string &input)
            {
                //1. 获取或者创建index对象
                index = ns_index::Index::GetInstance();
                //std::cout << "获取index单例成功..." << std::endl;
                LOG(NORMAL, "获取index单例成功...");
                //2. 根据index对象建立索引
                index->BuildIndex(input);
                //std::cout << "建立正排和倒排索引成功..." << std::endl;
                LOG(NORMAL, "建立正排和倒排索引成功...");
            }
            //query: 搜索关键字
            //json_string: 返回给用户浏览器的搜索结果
            void Search(const std::string &query, std::string *json_string)
            {
                //1.[分词]:对我们的query进行按照searcher的要求进行分词
                std::vector<std::string> words;
                ns_util::JiebaUtil::CutString(query, &words);
                //2.[触发]:就是根据分词的各个"词"，进行index查找,建立index是忽略大小写，所以搜索，关键字也需要
                //ns_index::InvertedList inverted_list_all; //内部InvertedElem
                std::vector<InvertedElemPrint> inverted_list_all;

                std::unordered_map<uint64_t, InvertedElemPrint> tokens_map;

                for(std::string word : words){
                    boost::to_lower(word);

                    ns_index::InvertedList *inverted_list = index->GetInvertedList(word);
                    if(nullptr == inverted_list){
                        continue;
                    }
                    //不完美的地方：暂时可以交给大家 , 你/是/一个/好人 100
                    //inverted_list_all.insert(inverted_list_all.end(), inverted_list->begin(), inverted_list->end());
                    for(const auto &elem : *inverted_list){
                        auto &item = tokens_map[elem.doc_id]; //[]:如果存在直接获取，如果不存在新建
                        //item一定是doc_id相同的print节点
                        item.doc_id = elem.doc_id;
                        item.weight += elem.weight;
                        item.words.push_back(elem.word);
                    }
                }
                for(const auto &item : tokens_map){
                    inverted_list_all.push_back(std::move(item.second));
                }

                //3.[合并排序]：汇总查找结果，按照相关性(weight)降序排序
                //std::sort(inverted_list_all.begin(), inverted_list_all.end(),\
                //      [](const ns_index::InvertedElem &e1, const ns_index::InvertedElem &e2){
                //        return e1.weight > e2.weight;
                //        });
                  std::sort(inverted_list_all.begin(), inverted_list_all.end(),\
                          [](const InvertedElemPrint &e1, const InvertedElemPrint &e2){
                          return e1.weight > e2.weight;
                          });
                //4.[构建]:根据查找出来的结果，构建json串 -- jsoncpp --通过jsoncpp完成序列化&&反序列化
                Json::Value root;
                for(auto &item : inverted_list_all){
                    ns_index::DocInfo * doc = index->GetForwardIndex(item.doc_id);
                    if(nullptr == doc){
                        continue;
                    }
                    Json::Value elem;
                    elem["title"] = doc->title;
                    elem["desc"] = GetDesc(doc->content, item.words[0]); //content是文档的去标签的结果，但是不是我们想要的，我们要的是一部分 TODO
                    elem["url"]  = doc->url;
                    //for deubg, for delete
                    elem["id"] = (int)item.doc_id;
                    elem["weight"] = item.weight; //int->string

                    root.append(elem);
                }

                //Json::StyledWriter writer;
                Json::FastWriter writer;
                *json_string = writer.write(root);
            }

            std::string GetDesc(const std::string &html_content, const std::string &word)
            {
                //找到word在html_content中的首次出现，然后往前找50字节(如果没有，从begin开始)，往后找100字节(如果没有，到end就可以的)
                //截取出这部分内容
                const int prev_step = 50;
                const int next_step = 100;
                //1. 找到首次出现
                auto iter = std::search(html_content.begin(), html_content.end(), word.begin(), word.end(), [](int x, int y){
                        return (std::tolower(x) == std::tolower(y));
                        });
                if(iter == html_content.end()){
                    return "None1";
                }
                int pos = std::distance(html_content.begin(), iter);

                //2. 获取start，end , std::size_t 无符号整数
                int start = 0; 
                int end = html_content.size() - 1;
                //如果之前有50+字符，就更新开始位置
                if(pos > start + prev_step) start = pos - prev_step;
                if(pos < end - next_step) end = pos + next_step;

                //3. 截取子串,return
                if(start >= end) return "None2";
                std::string desc = html_content.substr(start, end - start);
                desc += "...";
                return desc;
            }
    };
}

http网络模块http_server.cc

#include "cpp-httplib"
#include "searcher.hpp"

const std::string input = "data/raw_html/raw.txt";
const std::string root_path = "./wwwroot";

int main()
{
    ns_searcher::Searcher search;
    search.InitSearcher(input);

    httplib::Server svr;
    svr.set_base_dir(root_path.c_str());
    svr.Get("/s", [&search](const httplib::Request &req, httplib::Response &rsp){
            if(!req.has_param("word")){
                rsp.set_content("必须要有搜索关键字!", "text/plain; charset=utf-8");
                return;
            }
            std::string word = req.get_param_value("word");
            //std::cout << "用户在搜索：" << word << std::endl;
            LOG(NORMAL, "用户搜索的: " + word);
            std::string json_string;
            search.Search(word, &json_string);
            rsp.set_content(json_string, "application/json");
            //rsp.set_content("你好,世界!", "text/plain; charset=utf-8");
            });

    LOG(NORMAL, "服务器启动成功...");
    svr.listen("0.0.0.0", 8081);
    return 0;
}

makefile模块编写

PARSER=parser
DUG=debug
HTTP_SERVER=http_server
cc=g++

.PHONY:all
all:$(PARSER) $(DUG) $(HTTP_SERVER)

$(PARSER):parser.cc
	$(cc) -o $@ $^ -lboost_system -lboost_filesystem -std=c++11
$(DUG):debug.cc
	$(cc) -o $@ $^ -ljsoncpp -std=c++11
$(HTTP_SERVER):http_server.cc
	$(cc) -o $@ $^ -ljsoncpp -lpthread -std=c++11
.PHONY:clean
clean:
	rm -f $(PARSER) $(DUG) $(HTTP_SERVER)