写倒排索引的总结

最新推荐文章于 2024-06-15 01:16:17 发布

CY_TEC

最新推荐文章于 2024-06-15 01:16:17 发布

阅读量781

点赞数

文章标签：倒排索引文件读写

本文链接：https://blog.csdn.net/CY_TEC/article/details/46843107

版权

这次小作业的任务是对18000篇文章中的单词建立倒排索引，并保存成“单词.idx”文件输出。

要处理的文件每一行的格式是：《文章ID》《句子ID》句子。所有的数据一共包含180个二进制文件，名字分别是docs1、docs2... ...、docs180。其中每一个docs中有100篇文章，每一篇文章又包含若干个句子。

首先要读出所有单词，并记录其《文章ID》《句子ID》。为了方便写入到“单词.idx”，用了以下两种结构体。

// One posting of a word: a single (article, sentence) pair in which the word
// occurs. Records of this type are written to and read from the per-word index
// file as raw bytes by add_id(), so the member order and types define the
// on-disk layout.
// Note: the trailing "id_node" after the closing brace also declares a global
// object with the same name as the type, which is why the rest of the code
// spells the type as "struct id_node".
struct id_node{
int ati_id;// article ID
int sen_id;// sentence ID
long wCount = 0; // number of times the word occurs in this sentence
long id_next = -1;// absolute offset (from the start of the file) of the next id_node of the same word, -1 when there is none; the nodes of one word form a linked list kept in ascending (article ID, sentence ID) order
}id_node;

// Header record of one word. insert_char() writes it as raw bytes both at the
// start of the per-word index file and into the shared index file, so the
// member order and types define the on-disk layout.
// Note: the trailing "head_node" after the closing brace also declares a
// global object with the same name as the type, which is why the rest of the
// code spells the type as "struct head_node".
struct head_node{
char word[1000];// the word itself, NUL-terminated; stored so that entries in the shared index file can be identified
long sen_num = 0;// number of distinct sentences the word appears in; grows by one for every id_node added
long fre_num = 0;// total number of occurrences of the word; grows by one per occurrence
long index_pos = -1;// offset of this header's copy inside the shared index file; assigned when the per-word file is first created (the end of the index file at that moment) and used afterwards to update the copy in place
long first_id_pos = -1;// offset, inside the per-word file, of the first id_node of the ordered list; -1 while the list is empty
}head_node;

在一个“单词.idx”文件中，文件开头的第一个数据块是一个head_node，后面是一系列的id_node。head_node.first_id_pos指向第一个id_node，其余的id_node通过id_next以单链表的形式，按（文章ID，句子ID）从小到大连接。

因为每一行的格式都是固定的，所以一开始我就先写了一个读取每一行数据的函数。代码如下：

//deal with every line, include article id, sentence id and sentence
void read_line(FILE* &fp)
{
//read the first line
char temp = '\0';
int ati_id;
int sen_id;
//get the aticle id and sentence id
ati_id = get_id(fp);//文章ID和句子ID都在开头，也都以'\t'结尾，所以就写了一个简单的get_id函数。这个函数只针对这个数据方式，没有适用性。
sen_id = get_id(fp);
char word[1000];
int i = 0;
fread(&temp, 1, sizeof(char), fp);

while (temp != '\n')//逐个字符读入，判断其是不是分隔符。如果是则对前面的单词进行建表操作。
{//get every word
if (is_split(temp))//这个地方有个个判断是不是分隔字符的函数，判断一下是不是所设定的分割字符。
{
word[i] = '\0';
l_to_u(word);//因为建表的时候不区分大小写，所以，把所有的字符都转化成大写。这个l_to_u函数就是把low character转化成upper character。
if (word[0] != '\0'&&i<255)
insert_char(word, ati_id, sen_id);//插入到文件的函数
i = 0;
}
else
word[i++] = temp;//如果不是分隔字符，继续“吞”char。

fread(&temp, 1, sizeof(char), fp);

if (temp == '\n'&&i>0)//以'\n'结尾。如果到了句子的最后，且最后的字符不是分割字符，且word中还有字符，则把这个剩下的字符也插入。
{
word[i] = '\0';
l_to_u(word);
//insert
if (i<255)
insert_char(word, ati_id, sen_id);
i = 0;
}
}
}

下面是get_id这个函数。

// Read one tab-terminated decimal field (an article id or a sentence id) from
// the current position of fp and return its value.
//
// The terminating '\t' is consumed. The text before it is converted with
// atoi(), so it follows atoi()'s rules (leading blanks skipped, parsing stops
// at the first non-digit, 0 when there are no digits).
//
// Returns -1 when no complete field can be read: end of file or a read error
// before a '\t' is seen, or a field too long to fit the local buffer (the
// overlong field is still consumed up to and including its '\t').
//
// Note on feof(): the end-of-file flag is only raised by a read that fails,
// never by the read that delivers the last byte. That is why the result of
// each read is tested right after the read itself.
int get_id(FILE* &fp)
{
    char digits[255];
    int len = 0;
    bool too_long = false;

    for (;;)
    {
        const int c = fgetc(fp);
        if (c == EOF)
            return -1;
        if (c == '\t')
            break;
        if (len < static_cast<int>(sizeof(digits)) - 1)
            digits[len++] = static_cast<char>(c);
        else
            too_long = true;  // keep consuming so the stream stays in sync
    }

    if (too_long)
        return -1;

    digits[len] = '\0';
    return atoi(digits);
}

// Upper-case a NUL-terminated string in place.
// Only the bytes 'a'..'z' are changed; every other byte is left untouched.
void l_to_u(char word[])
{
    for (char* cursor = word; *cursor != '\0'; ++cursor)
    {
        const bool is_lower = !(*cursor < 'a' || *cursor > 'z');
        if (is_lower)
            *cursor = *cursor - 32;  // lower-case ASCII letters sit 32 above their upper-case forms
    }
}

// Tell whether a byte separates words.
// Returns true for blank, tab and the punctuation characters listed below,
// false for everything else (including '\n' and '\0').
bool is_split(char temp)
{
    switch (temp)
    {
    case ' ':
    case '\t':
    case ',':
    case '.':
    case '<':
    case '>':
    case '{':
    case '}':
    case '+':
    case '-':
    case '*':
    case '/':
    case '`':
    case '"':
    case '=':
    case '\\':
    case ';':
    case ':':
    case '?':
    case '(':
    case ')':
    case '|':
        return true;
    default:
        return false;
    }
}

把单词插入到索引文件是这次作业中比较关键的部分。

int insert_char(char word[], int ati_id, int sen_id)
{
//打开相应目录，创建单词对应的文件
//获得对应文件名
char temp[1000];//temp用来生成索引文件保存目录
strcpy(temp, word);
fill_word_name(temp);//temp is the full name。这个函数用来补齐保存文件的绝对目录及文件名
//对单词文件进行写入和统计
//打开文件，并写入头
FILE* head;//word file
FILE* index;//index file

char index_path[1000];
strcpy(index_path, f_index_path);//f_index_path是父路径
strcat(index_path, state_of_file);//state_of_file是一个char型数组，如果是“all”则对所有的docs文件中的单词建立一个索引。如果是具体的某一个文件名如“docs1”则为这个文件建立一个子文件夹，并对这个文件中的单词建立一个索引，并保存到该子目录下。
strcat(index_path, ".idx");
//open index
if ((index = fopen(index_path, "rb+")) == NULL)//if open failed, build one，新建一个索引文件，并在文件开头放入一个空的head_node块。
{
index = fopen(index_path, "wb");
if (index == NULL)
{
std::cout << "build new index file failed" << std::endl;
return -1;
}

struct head_node h_node;
fseek(index, 0, SEEK_SET);
long pos = ftell(index);
h_node.index_pos = pos;
strcpy(h_node.word, "This the head of index file.");
//rewind(head);
//fwrite(&h_node, 1, sizeof(h_node), head);
//fseek(index, pos, SEEK_SET);
h_node.sen_num = 0;
h_node.fre_num = 0;
h_node.index_pos = 0;
fwrite(&h_node, 1, sizeof(h_node), index);
fclose(index);
if ((index = fopen(index_path, "rb+")) == NULL)
{
std::cout << "open failed of index file" << std::endl;
}
}

//open head
if ((head = fopen(temp, "rb+")) == NULL)//if first time appear
{
head = fopen(temp, "wb");
fclose(head);
if ((head = fopen(temp, "rb+")) == NULL)
{
std::cout << "open failed of head file" << std::endl;
}
else//第一次创建 word.idx，头结点既要写到head开头，又要写到index结尾。
{
struct head_node h_node;
fseek(index, 0, SEEK_END);
long pos = ftell(index);//找到存放头结点的位置
h_node.index_pos = pos;//在“单词.idx”文件head_node节点中中记录该节点在index.idx文件中的位置。
strcpy(h_node.word, word);
rewind(head);
fwrite(&h_node, 1, sizeof(h_node), head);//写入文件。
fseek(index, pos, SEEK_SET);
fwrite(&h_node, 1, sizeof(h_node), index);
}
}
struct head_node h_node;
rewind(head);
fread(&h_node, 1, sizeof(h_node), head);//取出头结点

//修改头结点信息，并将结点插入到 word.idx
h_node.fre_num++;
add_id(head, h_node, ati_id, sen_id);//这个是对“单词.idx”文件的操作，把单词出现的信息按照升序排列。

rewind(head);
fwrite(&h_node, 1, sizeof(h_node), head);//写到 word.idx

fseek(index, h_node.index_pos, SEEK_SET);
fwrite(&h_node, 1, sizeof(h_node), index);//写到index.idx

//将统计的结果写入到index.idx

fclose(head);
fclose(index);
return 1;
}

下面是add_id函数

void add_id(FILE* &fp, struct head_node &h_node, int ati_id, int sen_id)
{
long p = h_node.first_id_pos;
struct id_node cur_id, next_id;
if (p == -1)//处理第一个插入到该单词文件中的结点。
{
//add a new id node to the end or behind the head node
long pos;
//fseek(fp, sizeof(h_node), SEEK_SET);
fseek(fp, 0, SEEK_END);
pos = ftell(fp);//this is the new id node position
cur_id.sen_id = sen_id;
cur_id.ati_id = ati_id;
cur_id.wCount = 1;
cur_id.id_next = -1;

//new sentence
h_node.sen_num = 1;

h_node.first_id_pos = pos;
fseek(fp, 0, SEEK_SET);
fwrite(&h_node, 1, sizeof(h_node), fp);
fseek(fp, pos, SEEK_SET);
//std::cout<<"pos1 is:"<<pos<<std::endl;
fwrite(&cur_id, 1, sizeof(id_node), fp);

return;
}//no problem

//插在第一个结点前
p = h_node.first_id_pos;
fseek(fp, p, SEEK_SET);
fread(&cur_id, 1, sizeof(id_node), fp);
if ((ati_id<cur_id.ati_id) || (ati_id == cur_id.ati_id&&sen_id<cur_id.sen_id))//如果该单词的ID比之前最小的还小，则插入。
{
//add a new id node to the end or behind the head node
long pos;
fseek(fp, 0, SEEK_END);
pos = ftell(fp);//this is the new id node position
std::cout << "pos2 is:" << pos << std::endl;
struct id_node new_id;
new_id.sen_id = sen_id;
new_id.ati_id = ati_id;
new_id.wCount = 1;
new_id.id_next = h_node.first_id_pos;
std::cout << "h_node.first_id_pos1 is:" << h_node.first_id_pos << std::endl;
h_node.first_id_pos = pos;
fseek(fp, pos, SEEK_SET);
fwrite(&new_id, 1, sizeof(id_node), fp);

//new sentence
h_node.sen_num++;
return;
}//no problem. NEVER RUN

long q;
q = h_node.first_id_pos;//current pointer
fseek(fp, h_node.first_id_pos, SEEK_SET);
fread(&cur_id, 1, sizeof(id_node), fp);
p = cur_id.id_next;//next pointer
while (p != -1)
{
fseek(fp, p, SEEK_SET);//定位
fread(&next_id, 1, sizeof(id_node), fp);//下一个id node
if ((ati_id<next_id.ati_id) || ((ati_id == next_id.ati_id) && (sen_id<next_id.sen_id)))
{
//add a new id node to the end or behind the head node
long pos;
fseek(fp, 0, SEEK_END);
pos = ftell(fp);//this is the new id node position
struct id_node new_id;

new_id.sen_id = sen_id;
new_id.ati_id = ati_id;
new_id.wCount = 1;
new_id.id_next = p;
cur_id.id_next = pos;
fseek(fp, pos, SEEK_SET);
fwrite(&new_id, 1, sizeof(id_node), fp);
fseek(fp, q, SEEK_SET);
fwrite(&cur_id, 1, sizeof(id_node), fp);

//new sentence
h_node.sen_num++;

return;
}//并没有执行过

//等于next结点
if ((ati_id == next_id.ati_id) && (sen_id == next_id.sen_id))//如果等于，就把wCount成员变量自加一就好了。
{
next_id.wCount++;
fseek(fp, p, SEEK_SET);//定位
fwrite(&next_id, 1, sizeof(id_node), fp);
return;
}
//move
cur_id.wCount = next_id.wCount;
cur_id.ati_id = next_id.ati_id;
cur_id.sen_id = next_id.sen_id;
cur_id.id_next = next_id.id_next;
q = p;
p = cur_id.id_next;
}

if (p == -1)//在尾部新增。
{
//add a new id node to the end
long pos;
fseek(fp, 0, SEEK_END);
pos = ftell(fp);//this is the new id node position
next_id.sen_id = sen_id;
next_id.ati_id = ati_id;
next_id.wCount = 1;
next_id.id_next = -1;
cur_id.id_next = pos;
fseek(fp, pos, SEEK_SET);
fwrite(&next_id, 1, sizeof(id_node), fp);
fseek(fp, q, SEEK_SET);
fwrite(&cur_id, 1, sizeof(id_node), fp);

//new sentence
//test this
h_node.sen_num++;

}

}

写好读一行的操作，剩下的就好写了。

// Process every line of one docs file by calling read_line() until the stream
// is exhausted.
//
// fp: input stream opened for reading; a null stream is ignored.
//
// feof() only becomes true after a read has already failed, so looping on
// "!feof(fp)" runs the body once more after the last line. Instead, one byte
// is peeked (and pushed back) before each line: read_line() is only called
// when there is at least one more byte to consume.
void read_file(FILE* &fp)
{
    if (fp == nullptr)
        return;

    for (;;)
    {
        const int next = fgetc(fp);
        if (next == EOF)
            break;  // end of file or read error
        ungetc(next, fp);
        read_line(fp);
    }
}

这个是读取一个文件的操作。只需要调用每一行的就可以了，多简单。

因为作业要求用两种方式建立倒排索引，就设置了一个状态标识state_of_file。如果这个char数组中是"all"，那就把所有的单词都插入到“all”对应的索引文件中。如果不是"all"，那它保存的就是当前读取的文件名，这时候用文件名建立一个子目录，单词的建索引操作就只在这个文件名对应的目录下进行。

// Index every docs file named in the list file `list_path`.
//
// The list file holds whitespace-separated file names, of which at most 180
// are processed. For each name the output directory is prepared with
// mk_dir(), the file "<f_path><name>" is opened and handed to read_file().
//
// state_of_file selects the indexing mode: when it holds "all", every file is
// indexed into the same "all" directory; otherwise it is overwritten with the
// current file name, so each docs file gets its own sub-directory and index.
//
// Changes from the previous implementation: the name read from the list is
// length-limited and the read is checked (a short list ends the loop instead
// of reusing stale data), the input path is checked against the buffer size,
// and files that could not be opened are skipped instead of being used.
void read_all_file()
{
    const int kMaxDocs = 180;  // docs1 ... docs180

    FILE* list_fp = nullptr;
    open_file(list_fp, (char*)list_path);  // cast kept as in the original call
    if (list_fp == nullptr)
    {
        std::cout << "open failed of list file" << std::endl;
        return;
    }

    for (int i = 0; i < kMaxDocs; i++)
    {
        char ai[10];
        if (fscanf(list_fp, "%9s", ai) != 1)
            break;  // no more names in the list

        // Per-file mode: the output directory follows the current file name.
        if (strcmp(state_of_file, "all") != 0)
        {
            strcpy(state_of_file, ai);
        }
        mk_dir(state_of_file);

        char w_path[255];
        if (strlen(f_path) + strlen(ai) >= sizeof(w_path))
        {
            std::cout << "path too long for " << ai << std::endl;
            continue;
        }
        strcpy(w_path, f_path);
        strcat(w_path, ai);

        FILE* fp = nullptr;
        open_file(fp, w_path);
        if (fp == nullptr)
        {
            std::cout << "open failed of " << w_path << std::endl;
            continue;
        }

        read_file(fp);
        std::cout << state_of_file << " done!" << std::endl;
        fclose(fp);
    }

    fclose(list_fp);
}