3.编写parser文件使用boost枚举文件名-CSDN博客

本文链接：https://blog.csdn.net/CoderZzz6310/article/details/145649941

编写parser.cc

在raw_html文件夹中创建一个raw.txt文件
把所有网页的内容处理完就放到raw.txt中

touch raw.txt

![[Pasted image 20250215101522.png]]

创建parser.cc

vim parser.cc

#include <iostream>
#include <string>
#include <vector>
#include <boost/filesystem.hpp>

// Source directory; all of the HTML pages to be processed live under it.
const std::string src_path = "data/input";
// Output file that the processed content of the pages is written to.
const std::string output = "data/raw_html/raw.txt";

// Parsed representation of a single HTML document.
struct DocInfo
{
    std::string title;    // title of the document
    std::string content;  // content of the document
    std::string url;      // URL of this document on the official site
};

// Alias kept so the rest of the file can refer to the type as DocInfo_t.
using DocInfo_t = DocInfo;

// Parameter-passing convention used by the declarations below:
//   const & : input parameter
//   *       : output parameter
//   &       : input/output parameter

// Collects the paths of the html files found under src_path into files_list.
bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list);
// Reads and parses every file named in files_list, storing one DocInfo_t per file in results.
bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results);
// Writes the parsed documents in results to the file named by output.
bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output);

// Entry point: runs the three pipeline stages in order (enumerate, parse, save).
// Returns 0 on success, or 1 / 2 / 3 to indicate which stage failed.
int main()
{
    std::vector<std::string> files_list;

    // Step 1: recursively collect every html file name (with its path) into
    // files_list so the files can be read one by one afterwards.
    if(!EnumFile(src_path, &files_list)){
        std::cerr << "enum file name error!" << std::endl;
        return 1;
    }

    // Step 2: read the content of each file listed in files_list and parse it.
    std::vector<DocInfo_t> results;
    if(!ParseHtml(files_list, &results)){
        std::cerr << "parse html error" << std::endl;
        return 2;
    }

    // Step 3: write the parsed content of every file to output, using \3 as
    // the separator between documents.
    if(!SaveHtml(results, output)){
        // Message fixed: the original read "sava html error".
        std::cerr << "save html error" << std::endl;
        return 3;
    }
    return 0;
}

// Placeholder for the file-enumeration stage: does no work and reports success.
bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list)
{
    // Neither parameter is touched by this stub.
    (void)src_path;
    (void)files_list;
    return true;
}
// Placeholder for the parsing stage: does no work and reports success.
bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results)
{
    // Neither parameter is touched by this stub.
    (void)files_list;
    (void)results;
    return true;
}
// Placeholder for the save stage: does no work and reports success.
bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output)
{
    // Neither parameter is touched by this stub.
    (void)results;
    (void)output;
    return true;
}

使用boost枚举文件名

在parser.cc中引入boost的filesystem头文件

#include <boost/filesystem.hpp>

![[Pasted image 20250215111124.png]]

安装boost库

sudo yum install -y boost-devel

![[Pasted image 20250215111557.png]]

编辑EnumFile方法

bool EnumFile(const std::string &src_path, std::vector<std::string> *files_list)
{
    namespace fs = boost::filesystem;
    fs::path root_path(src_path);
    //判断路径是否存在，不存在，就没有必要再往后走了
    if(!fs::exists(root_path)){
        std::cerr << src_path << " not exists" << std::endl;
        return false;
    }
    //定义一个空的迭代器，用来进行判断递归结束
    fs::recursive_directory_iterator end;
    for(fs::recursive_directory_iterator iter(root_path); iter != end; iter++){
        //判断文件是否是普通文件，html都是普通文件
        if(!fs::is_regular_file(*iter)){ 
            continue;
        }
        if(iter->path().extension() != ".html"){ //判断文件路径名的后缀是否符合要求
            continue;
        }
        std::cout << "debug: " << iter->path().string() << std::endl;
        //当前的路径一定是一个合法的，以.html结束的普通网页文件
        files_list->push_back(iter->path().string()); //将所有带路径的html保存在files_list,方便后续进行文本分析
    }
    return true;
}

将parser.cc的文件名写入makefile（注意：`>` 会覆盖makefile原有内容，而不是追加；追加应使用 `>>`）

ls parser.cc > makefile

![[Pasted image 20250215115330.png]]

编写makefile

vim makefile

# Compiler used by the rules below.
cc=g++

# Build the parser binary from parser.cc.
# Recipe lines must begin with a TAB character, not spaces.
# boost_filesystem is listed before boost_system because it depends on it.
parser:parser.cc
	$(cc) -std=c++11 -o $@ $^ -lboost_filesystem -lboost_system
.PHONY:clean
# Remove the built binary.
clean:
	rm -f parser

安装g++编译器

yum install "gcc-c++.x86_64" -y

编译

make

![[Pasted image 20250215120647.png]]

运行parser

./parser

![[Pasted image 20250215133717.png]]

统计html文件个数（每个html文件对应一行debug输出，因此行数即文件数）

./parser | wc -l

![[Pasted image 20250215133646.png]]