5.编写写入文件代码|调试|编写建立索引的模块index的代码结构(C++)

本文链接：https://blog.csdn.net/CoderZzz6310/article/details/145656339

编写写入文件代码

将解析内容写入文件中

采用下面的方案：  
version2: 写入文件中，一定要考虑下一次在读取的时候，也要方便操作 
类似：title\3content\3url \n title\3content\3url \n title\3content\3url \n ...  
方便我们 getline(ifstream, line)，直接获取文档的全部内容：title\3content\3url

bool ParseHtml(const std::vector<std::string> &files_list, std::vector<DocInfo_t> *results)
{
	for(const std::string &file : files_list){
		//读取文件，Read();
		std::string result;
		if(!ns_util::FileUtil::ReadFile(file, &result)){
			continue;
		}
		DocInfo_t doc;
		//解析指定的文件，提取title
		if(!ParseTitle(result, &doc.title)){
			continue;
		}
		//解析指定的文件，提取content，就是去标签
		if(!ParseContent(result, &doc.content)){
			continue;
		}
		//解析指定的文件路径，构建url
		if(!ParseUrl(file, &doc.url)){
			continue;
		}

		//done，一定是完成了解析任务，当前文档的所有的相关结果都保存在了doc里面
		results->push_back(std::move(doc)); //bug：todo；细节，本质会发生拷贝，效率可能会比较低

		//for debug
		//ShowDoc(doc);
		//break;
	}
	return true;
}


// Writes all parsed documents to the file `output`, one document per line:
//
//     title \3 content \3 url \n
//
// so that a reader can fetch one whole document with a single std::getline
// and then split it on the \3 byte.
//
// results: documents to serialize.
// output:  path of the file to create/overwrite.
//
// Returns false when the file cannot be opened, or when any write or the
// final flush on close fails; returns true only if everything was written.
bool SaveHtml(const std::vector<DocInfo_t> &results, const std::string &output)
{
// Field separator inside one line. Kept as a macro because it stays visible
// to the rest of the translation unit after this function.
#define SEP '\3'
	// Binary mode: the bytes are written exactly as assembled below.
	std::ofstream out(output, std::ios::out | std::ios::binary);
	if(!out.is_open()){
		std::cerr << "open " << output << " failed" << std::endl;
		return false;
	}

	// One buffer reused for every record, so its capacity carries over.
	std::string out_string;
	for(const auto &item : results){
		out_string = item.title;
		out_string += SEP;
		out_string += item.content;
		out_string += SEP;
		out_string += item.url;
		out_string += '\n';

		out.write(out_string.data(), static_cast<std::streamsize>(out_string.size()));
		if(!out){
			// Disk full, I/O error, ...: stop instead of reporting success
			// for a truncated file.
			std::cerr << "write " << output << " failed" << std::endl;
			return false;
		}
	}

	// close() flushes the remaining buffered data and sets failbit if that
	// fails, so the stream state has to be checked after it as well.
	out.close();
	if(!out){
		std::cerr << "write " << output << " failed" << std::endl;
		return false;
	}

	return true;
}

调试

![[Pasted image 20250215182528.png]]

打开raw.txt文件
![[Pasted image 20250215182840.png]]

文件中显示的 ^C 就是分隔符 \3（值为 3 的控制字符，编辑器把它显示成 ^C）
Site Unreachable
第一个网页
![[Pasted image 20250215183309.png]]

当前项目内容
![[Pasted image 20250215183558.png]]

编写建立索引的模块index

代码结构

建立index.hpp文件
![[Pasted image 20250215183748.png]]

#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>

namespace ns_index{

	// One document as stored in the forward index.
	struct DocInfo{
		std::string title;   // document title
		std::string content; // document body with the tags stripped
		std::string url;     // url of the document on the official site
		// Document id.
		// NOTE(review): the name looks like a typo for "doc_id" (compare
		// InvertedElem::doc_id); kept as-is so existing users keep compiling.
		uint64_t dic_id;
	};

	// One entry of an inverted list: a word occurring in a document, plus a weight.
	struct InvertedElem{
		uint64_t doc_id;  // id of the document
		std::string word; // the keyword
		int weight;       // weight of this word for this document
	};

	// Inverted list ("posting list"): all InvertedElem belonging to one keyword.
	typedef std::vector<InvertedElem> InvertedList;

	// Skeleton of the index: only the data layout and the public interface are
	// fixed here, the member functions are placeholders that do no work yet.
	class Index{
		private:
			// Forward index. A vector is used so that the position of an entry
			// can serve directly as the document id.
			std::vector<DocInfo> forward_index;
			// Inverted index: maps a keyword to its inverted list.
			std::unordered_map<std::string, InvertedList> inverted_index;
		public:
			Index(){}
			~Index(){}
		public:
			// Looks up a document by its id.
			// Placeholder: always returns nullptr.
			DocInfo *GetForwardIdex(uint64_t doc_id)
			{
				(void)doc_id; // not used yet
				return nullptr;
			}
			// Looks up the inverted list of a keyword.
			// Placeholder: always returns nullptr.
			InvertedList *GetInvertedList(const std::string &word)
			{
				(void)word; // not used yet
				return nullptr;
			}
			// Builds the forward and inverted index from the tag-stripped,
			// formatted documents (e.g. data/raw_html/raw.txt); `input` is the
			// file produced by the parser.
			// Placeholder: does nothing and returns true.
			bool BuildIndex(const std::string &input)
			{
				(void)input; // not used yet
				return true;
			}
	};
}

编写代码索引准备工作

#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>

namespace ns_index{
	// One document as stored in the forward index.
	struct DocInfo{
		std::string title;   // document title
		std::string content; // document body with the tags stripped
		std::string url;     // url of the document on the official site
		// Document id; equal to the document's position in the forward index.
		// NOTE(review): the name looks like a typo for "doc_id" (compare
		// InvertedElem::doc_id); kept as-is so existing users keep compiling.
		uint64_t dic_id;
	};

	// One entry of an inverted list: a word occurring in a document, plus a weight.
	struct InvertedElem{
		uint64_t doc_id;  // id of the document
		std::string word; // the keyword
		int weight;       // weight of this word for this document
	};

	// Inverted list ("posting list"): all InvertedElem belonging to one keyword.
	typedef std::vector<InvertedElem> InvertedList;

	// Forward index (id -> document) plus inverted index (keyword -> inverted
	// list), built from the line-oriented file written by the parser.
	class Index{
		private:
			// Forward index. A vector is used so that the position of an entry
			// serves directly as the document id.
			std::vector<DocInfo> forward_index;
			// Inverted index: maps a keyword to its inverted list.
			std::unordered_map<std::string, InvertedList> inverted_index;
		public:
			Index(){}
			~Index(){}

		public:
			// Returns the document with the given id, or nullptr (after logging
			// to stderr) when doc_id is not a valid position in the forward index.
			// The pointer refers to an element of the internal vector, so it is
			// invalidated when BuildIndex adds further documents.
			DocInfo *GetForwardIdex(uint64_t doc_id)
			{
				if(doc_id >= forward_index.size()){
					std::cerr << "doc_id out range, error" << std::endl;
					return nullptr;
				}
				return &forward_index[doc_id];
			}

			// Returns the inverted list of `word`, or nullptr (after logging to
			// stderr) when the word is not present in the inverted index.
			InvertedList *GetInvertedList(const std::string &word)
			{
				auto iter = inverted_index.find(word);
				if(iter == inverted_index.end()){
					std::cerr << word << " have no InvertedList" << std::endl;
					return nullptr;
				}

				return &(iter->second);
			}

			// Builds the index from `input`, the file produced by the parser
			// (e.g. data/raw_html/raw.txt), which holds one document per line
			// in the form title\3content\3url.
			//
			// Returns false if the file cannot be opened. Lines that cannot be
			// turned into a document are logged and skipped; they do not make
			// the call fail.
			bool BuildIndex(const std::string &input)
			{
				std::ifstream in(input, std::ios::in | std::ios::binary);
				if(!in.is_open()){
					std::cerr << "sorry, " << input << " open error" << std::endl;
					return false;
				}

				std::string line;
				while(std::getline(in, line)){
					DocInfo * doc = BuildForwardIndex(line);
					if(nullptr == doc){
						std::cerr << "build " << line << " error" << std::endl; //for debug
						continue;
					}

					// The result is deliberately not checked: the inverted
					// part is still a stub (see BuildInvertedIndex).
					BuildInvertedIndex(*doc);
				}
				return true;
			}
		private:
			// Parses one line of the form title\3content\3url, appends the
			// document to the forward index and returns a pointer to the new
			// entry. Returns nullptr, leaving the index untouched, unless the
			// line contains exactly two \3 separators.
			DocInfo *BuildForwardIndex(const std::string &line)
			{
				const char sep = '\3';

				const std::string::size_type first = line.find(sep);
				if(first == std::string::npos){
					return nullptr;
				}
				const std::string::size_type second = line.find(sep, first + 1);
				if(second == std::string::npos){
					return nullptr;
				}
				// A third separator means more than three fields: malformed.
				if(line.find(sep, second + 1) != std::string::npos){
					return nullptr;
				}

				// Create the entry in place and fill it, so the strings are
				// built directly inside the vector element.
				forward_index.emplace_back();
				DocInfo &doc = forward_index.back();
				doc.title = line.substr(0, first);
				doc.content = line.substr(first + 1, second - first - 1);
				doc.url = line.substr(second + 1);
				// The id is the entry's own position in the vector.
				doc.dic_id = forward_index.size() - 1;
				return &doc;
			}

			// TODO: not implemented yet. Building the inverted index needs a
			// word segmentation step that this header does not have.
			// Returns false to say that nothing was added to inverted_index;
			// the explicit return also avoids falling off the end of a
			// non-void function, which is undefined behaviour.
			bool BuildInvertedIndex(const DocInfo &doc)
			{
				(void)doc; // not used until the TODO above is done
				return false;
			}
	};
}