哈希基于词频的文件相似度

最新推荐文章于 2021-08-09 22:53:54 发布

ZzMeei

最新推荐文章于 2021-08-09 22:53:54 发布

阅读量5.6k

点赞数 27

分类专栏：数据结构　文章标签：数据结构

本文链接：https://blog.csdn.net/qq_33897261/article/details/103524505

版权

数据结构专栏收录该内容

14 篇文章 2 订阅

订阅专栏

哈希基于词频的文件相似度

题目

实现一种简单原始的文件相似度计算，即以两文件的公共词汇占总词汇的比例来定义相似度。为简化问题，这里不考虑中文（因为分词太难了），只考虑长度不小于3、且不超过10的英文单词，长度超过10的只考虑前10个字母。

输入

输入首先给出正整数N（<= 100），为文件总数，随后按以下格式给出每个文件的内容：首先给出文件正文，最后在一行中只给出一个字符“#”，表示文件结束。在N个文件内容结束之后，给出查询总数M（<= 10^4），随后M行，每行给出一对文件编号，其间以空格分隔。这里假设文件按给出的顺序从1到N编号。

输出

针对每一条查询，在一行中输出两文件的相似度，即两文件的公共词汇量占两文件总词汇量的百分比，精确到小数点后1位。注意，这里的“单词”指仅由英文字母组成、长度不小于3且不超过10的字符串，长度超过10的只取前10个字母。单词之间以任意非英文字母的字符隔开。另外，仅大小写不同的单词被认为是同一个单词，例如“You”和“you”是同一个单词。

样例

在这里插入图片描述

题解

将每个单词利用哈希函数映射到对应的散列表中，同时将文件编号插入到散列表中的倒排索引表，之后将单词在散列表中的位置存入每个文件的词汇索引表。计算两个文件的相似度时，只需要选择词汇量较小的那个文件，遍历该文件的词汇索引表，找到单词在散列表中的位置并扫描该单词的倒排索引表，如果倒排索引表中的文件编号与另一个文件的编号相同，则说明该单词同时出现在两个文件中。
~~极致码农题~~

代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Maximum number of letters stored per word; GetWord drops any further letters. */
#define MAXS 10
/* Minimum word length; GetWord skips words shorter than this. */
#define MINS 3
/* Number of bits Hash shifts its accumulator left for each character. */
#define MAXB 5
/* Hash table size that main passes to Table_Init. */
#define MAXTable 500009

/* Buffer for one word: up to MAXS characters plus the terminating '\0'. */
typedef char ElementType[MAXS + 1];

/*
 * Node of a per-file word list.
 * The array built by FileIndex_Init holds one head element per file; in a
 * head element `words` counts the words recorded for that file (see FileIndex).
 * In every node linked behind a head, `words` is the index of a word's cell
 * in the hash table.
 */
typedef struct FileEntry{
    int words;
    struct FileEntry *Next;
}WList;

/*
 * Node of a word's inverted-index list: one node per file that contains
 * the word (nodes are created by InsertAndIndex).
 */
typedef struct WordEntry{
    int FileNo;             /* number of a file containing the word */
    struct WordEntry *Next; /* next file in the list, or NULL */
}FList;

/* One cell of the hash table. */
struct HashEntry{
    ElementType Element; /* the word stored in this cell; valid only when FileNo != 0 */
    int FileNo;          /* 0 = empty cell; otherwise the file that last inserted this word */
    FList *InvIndex;     /* head of the word's inverted-index list */
};

/* Hash table of words; Find resolves collisions by probing the next cell. */
typedef struct HashTbl{
    int TableSize;              /* number of cells in TheCells */
    struct HashEntry *TheCells; /* array of TableSize cells */
}HashTable;

/*
 * Allocate a hash table with TableSize cells, every cell marked empty
 * (FileNo == 0, InvIndex == NULL).
 *
 * The Element field of each cell is left uninitialized; the rest of the
 * code only reads it after FileNo has become non-zero.
 *
 * Returns the new table, or NULL if TableSize is not positive or an
 * allocation fails.
 */
HashTable* Table_Init(int TableSize){
    HashTable *H;
    int i;

    /* A table without cells cannot be hashed into (Hash computes h % size). */
    if (TableSize <= 0)
        return NULL;

    H = malloc(sizeof *H);
    if (H == NULL)
        return NULL;

    H->TheCells = malloc(sizeof *H->TheCells * (size_t)TableSize);
    if (H->TheCells == NULL){
        free(H);
        return NULL;
    }

    H->TableSize = TableSize;
    for (i = 0; i < TableSize; i++){
        H->TheCells[i].FileNo = 0;
        H->TheCells[i].InvIndex = NULL;
    }
    return H;
}

/*
 * Allocate the per-file index: an array of Size head elements, one per
 * file, each starting with a word count of 0 and an empty list.
 *
 * Returns the array, or NULL if Size is not positive or the allocation
 * fails.
 */
WList* FileIndex_Init(int Size){
    WList *F;
    int i;

    if (Size <= 0)
        return NULL;

    /* Size the array by the element type actually stored (WList, not FList). */
    F = malloc(sizeof *F * (size_t)Size);
    if (F == NULL)
        return NULL;

    for (i = 0; i < Size; i++){
        F[i].words = 0;
        F[i].Next = NULL;
    }
    return F;
}

/*
 * Read the next word of the current file from standard input.
 *
 * A word is a maximal run of letters. It is stored lower-cased in Word,
 * truncated to its first MAXS letters; runs shorter than MINS letters are
 * skipped. The character that ends the run is consumed as well.
 *
 * Returns 1 when a word was stored in Word (NUL-terminated), or 0 when a
 * '#' is met while skipping non-letters or when input ends.
 *
 * NOTE(review): any '#' seen between words ends the file, not only a '#'
 * that stands alone on its line as the problem statement describes.
 */
int GetWord(ElementType Word){
    int c;   /* int, not char: must hold EOF and be safe to pass to <ctype.h> */
    int p;

    for (;;){
        /* Skip separators until a letter, the end marker, or end of input. */
        do {
            c = getchar();
            if (c == EOF || c == '#')
                return 0;
        } while (!isalpha(c));

        /* Consume the whole run of letters, keeping at most MAXS of them. */
        p = 0;
        while (c != EOF && isalpha(c)){
            if (p < MAXS)
                Word[p++] = (char)tolower(c);
            c = getchar();
        }

        if (p >= MINS){
            Word[p] = '\0';
            return 1;
        }
        /* Too short: loop and look for the next word. */
    }
}

/*
 * Shift-and-add string hash.
 * For each character the accumulator is shifted left by MAXB bits and the
 * character's offset from 'a' is added; the result is reduced modulo p.
 */
int Hash(char *key,int p){
    unsigned int acc = 0;
    char *s;

    for (s = key; *s != '\0'; s++)
        acc = (acc << MAXB) + (*s - 'a');
    return acc % p;
}

/*
 * Locate key in table H using linear probing.
 * Starting at the hashed position, advance one cell at a time (wrapping to
 * cell 0 after the last cell) until reaching either an empty cell
 * (FileNo == 0) or a cell whose word equals key.
 * Returns the index of that cell.
 */
int Find(ElementType key, HashTable *H){
    int slot = Hash(key, H->TableSize);

    for (;;){
        struct HashEntry *cell = &H->TheCells[slot];

        if (cell->FileNo == 0)
            break;                      /* empty: key may be inserted here */
        if (strcmp(cell->Element, key) == 0)
            break;                      /* key already stored here */
        if (++slot == H->TableSize)
            slot = 0;                   /* wrap around */
    }
    return slot;
}

/*
 * Record that word key occurs in file FileNo.
 *
 * The word's cell is located with Find. If the cell's FileNo already equals
 * FileNo, the word is a repeat within this file and nothing changes.
 * Otherwise the word is copied into the cell (if the cell was empty), the
 * cell's FileNo is set to FileNo, and a node for FileNo is pushed onto the
 * front of the cell's inverted-index list.
 *
 * The repeat check only looks at the last file that touched the cell, so
 * all words of one file must be inserted before the next file is started.
 * FileNo must be non-zero because 0 marks an empty cell.
 *
 * Returns the cell index after an insertion, or -1 for a repeated word.
 * Terminates the program if the list node cannot be allocated.
 */
int InsertAndIndex(int FileNo, ElementType key, HashTable *H){
    FList *F;
    int pos = Find(key, H);

    if (H->TheCells[pos].FileNo == FileNo)
        return -1;

    /* Allocate before touching the cell so a failure leaves the table intact. */
    F = malloc(sizeof *F);
    if (F == NULL){
        fprintf(stderr, "InsertAndIndex: out of memory\n");
        exit(EXIT_FAILURE);
    }

    if (H->TheCells[pos].FileNo == 0)       /* first occurrence of this word */
        strcpy(H->TheCells[pos].Element, key);
    H->TheCells[pos].FileNo = FileNo;       /* remember the most recent file */

    F->FileNo = FileNo;
    F->Next = H->TheCells[pos].InvIndex;
    H->TheCells[pos].InvIndex = F;
    return pos;
}

/*
 * Add hash-table position pos to the word list of file FileNo.
 *
 * File is the array of per-file head elements; FileNo is 1-based. A
 * negative pos (the value InsertAndIndex returns for a repeated word) is
 * ignored. Otherwise a new node holding pos is pushed onto the front of
 * the file's list and the word count in the head element is incremented.
 *
 * Terminates the program if the node cannot be allocated.
 */
void FileIndex(WList *File, int FileNo, int pos){
    WList *head;
    WList *W;

    if (pos < 0)
        return;

    W = malloc(sizeof *W);
    if (W == NULL){
        fprintf(stderr, "FileIndex: out of memory\n");
        exit(EXIT_FAILURE);
    }

    head = &File[FileNo-1];
    W->words = pos;
    W->Next = head->Next;
    head->Next = W;
    head->words++;
}

/*
 * Similarity of files F1 and F2 (1-based file numbers) as a percentage:
 * 100 * (words present in both files) / (words present in either file).
 *
 * The word list of the file with fewer words is walked; for each word its
 * inverted-index list in H is scanned for the other file's number.
 *
 * Returns the percentage, or 0.0 when neither file has any recorded word
 * (the ratio would otherwise be 0/0).
 */
double work(WList *File, int F1, int F2, HashTable *H){
    int common = 0;
    int total;
    WList *W;
    FList *F;

    /* Walk the smaller vocabulary; F2 becomes the file to look up. */
    if (File[F1-1].words > File[F2-1].words){
        int tmp = F1;
        F1 = F2;
        F2 = tmp;
    }

    for (W = File[F1-1].Next; W != NULL; W = W->Next){
        for (F = H->TheCells[W->words].InvIndex; F != NULL; F = F->Next){
            if (F->FileNo == F2){
                common++;
                break;
            }
        }
    }

    /* Size of the union = sum of both vocabularies minus the shared words. */
    total = File[F1-1].words + File[F2-1].words - common;
    if (total == 0)
        return 0.0;
    return 100.0 * common / total;
}

/*
 * Read the number of files, index the words of each file, then answer the
 * similarity queries, printing one percentage with one decimal per query.
 *
 * Exits with EXIT_FAILURE if the file count or query count cannot be read,
 * if the file count is not positive, or if the tables cannot be allocated.
 * Query processing stops at the first unreadable pair; a pair that names a
 * file outside 1..n is reported on stderr and skipped.
 */
int main(){
    int n, m, f1, f2;
    ElementType word;
    HashTable *H;
    WList *File;

    if (scanf("%d", &n) != 1 || n < 1)
        return EXIT_FAILURE;

    File = FileIndex_Init(n);
    H = Table_Init(MAXTable);
    if (File == NULL || H == NULL)
        return EXIT_FAILURE;

    /* Files are numbered from 1; each one is indexed completely before the next. */
    for (int i = 1; i <= n; i++)
        while (GetWord(word))
            FileIndex(File, i, InsertAndIndex(i, word, H));

    if (scanf("%d", &m) != 1)
        return EXIT_FAILURE;

    for (int i = 0; i < m; i++){
        if (scanf("%d %d", &f1, &f2) != 2)
            break;
        if (f1 < 1 || f1 > n || f2 < 1 || f2 > n){
            fprintf(stderr, "query %d: file number out of range\n", i + 1);
            continue;
        }
        printf("%.1f%c\n", work(File, f1, f2, H), '%');
    }
    return 0;
}

代码详解：

文件的词汇索引表：

typedef struct FileEntry{ // node of a dynamically allocated linked list
    int words; // in the head element: number of words in the file; in later nodes: the word's position in the hash table
    struct FileEntry *Next;
}WList;

简化版散列表定义以及初始化：

typedef struct WordEntry{ // node of a dynamically allocated linked list
    int FileNo; // file number
    struct WordEntry *Next; // points to the node holding the next file number
}FList;

struct HashEntry{ 
    ElementType Element; // the word
    int FileNo; // 0 = empty cell; otherwise the file that last inserted this word
    FList *InvIndex; // the word's inverted-index list
};

typedef struct HashTbl{
    int TableSize; // size of the hash table
    struct HashEntry *TheCells; // array of hash table cells
}HashTable;

/*
 * Allocate a hash table with TableSize cells, every cell marked empty
 * (FileNo == 0, InvIndex == NULL).
 *
 * The Element field of each cell is left uninitialized; the rest of the
 * code only reads it after FileNo has become non-zero.
 *
 * Returns the new table, or NULL if TableSize is not positive or an
 * allocation fails.
 */
HashTable* Table_Init(int TableSize){
    HashTable *H;
    int i;

    /* A table without cells cannot be hashed into (Hash computes h % size). */
    if (TableSize <= 0)
        return NULL;

    H = malloc(sizeof *H);
    if (H == NULL)
        return NULL;

    H->TheCells = malloc(sizeof *H->TheCells * (size_t)TableSize);
    if (H->TheCells == NULL){
        free(H);
        return NULL;
    }

    H->TableSize = TableSize;
    for (i = 0; i < TableSize; i++){
        H->TheCells[i].FileNo = 0;
        H->TheCells[i].InvIndex = NULL;
    }
    return H;
}

初始化文件索引表：

/*
 * Allocate the per-file index: an array of Size head elements, one per
 * file, each starting with a word count of 0 and an empty list.
 *
 * Returns the array, or NULL if Size is not positive or the allocation
 * fails.
 */
WList* FileIndex_Init(int Size){
    WList *F;
    int i;

    if (Size <= 0)
        return NULL;

    /* Size the array by the element type actually stored (WList, not FList). */
    F = malloc(sizeof *F * (size_t)Size);
    if (F == NULL)
        return NULL;

    for (i = 0; i < Size; i++){
        F[i].words = 0;
        F[i].Next = NULL;
    }
    return F;
}

读取单词：

/*
 * Read the next word of the current file from standard input.
 *
 * A word is a maximal run of letters. It is stored lower-cased in Word,
 * truncated to its first MAXS letters; runs shorter than MINS letters are
 * skipped. The character that ends the run is consumed as well.
 *
 * Returns 1 when a word was stored in Word (NUL-terminated), or 0 when a
 * '#' is met while skipping non-letters or when input ends.
 *
 * NOTE(review): any '#' seen between words ends the file, not only a '#'
 * that stands alone on its line as the problem statement describes.
 */
int GetWord(ElementType Word){
    int c;   /* int, not char: must hold EOF and be safe to pass to <ctype.h> */
    int p;

    for (;;){
        /* Skip the leading non-letters; stop at the end marker or end of input. */
        do {
            c = getchar();
            if (c == EOF || c == '#')
                return 0;
        } while (!isalpha(c));

        /* Read the word; letters beyond MAXS are consumed but not stored. */
        p = 0;
        while (c != EOF && isalpha(c)){
            if (p < MAXS)
                Word[p++] = (char)tolower(c);
            c = getchar();
        }

        if (p >= MINS){
            Word[p] = '\0';
            return 1;
        }
        /* Too short: discard it and read the next word. */
    }
}

字符串移位法散列函数（哈希函数）：

/*
 * Shift-and-add string hash.
 * For each character the accumulator is shifted left by MAXB bits and the
 * character's offset from 'a' is added; the result is reduced modulo p.
 */
int Hash(char *key,int p){
    unsigned int acc = 0;
    char *s;

    for (s = key; *s != '\0'; s++)
        acc = (acc << MAXB) + (*s - 'a');
    return acc % p;
}

在散列表中分配单词及查找单词的位置：

/*
 * Return the position of key in H, or the position where key may be
 * inserted. Collisions are resolved by linear probing: starting at the
 * hashed position, advance one cell at a time (wrapping to cell 0 after
 * the last cell) until an empty cell (FileNo == 0) or a cell holding key
 * is reached.
 */
int Find(ElementType key, HashTable *H){
    int slot = Hash(key, H->TableSize);  /* position given by the hash function */

    for (;;){
        struct HashEntry *cell = &H->TheCells[slot];

        if (cell->FileNo == 0)
            break;                      /* empty: key may be inserted here */
        if (strcmp(cell->Element, key) == 0)
            break;                      /* key already stored here */
        /* cell is occupied by another word: probe the next position */
        if (++slot == H->TableSize)
            slot = 0;
    }
    return slot;
}

将单词插入散列表，同时插入对应的倒排索引表：

/*
 * Record that word key occurs in file FileNo.
 *
 * The word's cell is located with Find. If the cell's FileNo already equals
 * FileNo, the word is a repeat within this file and nothing changes.
 * Otherwise the word is copied into the cell (if the cell was empty), the
 * cell's FileNo is set to FileNo, and a node for FileNo is pushed onto the
 * front of the cell's inverted-index list.
 *
 * The repeat check only looks at the last file that touched the cell, so
 * all words of one file must be inserted before the next file is started.
 * FileNo must be non-zero because 0 marks an empty cell.
 *
 * Returns the cell index after an insertion, or -1 for a repeated word.
 * Terminates the program if the list node cannot be allocated.
 */
int InsertAndIndex(int FileNo, ElementType key, HashTable *H){
    FList *F;
    int pos = Find(key, H);  /* position of key, or where key may be inserted */

    if (H->TheCells[pos].FileNo == FileNo)
        return -1;           /* repeated word in the same file: nothing to insert */

    /* Allocate before touching the cell so a failure leaves the table intact. */
    F = malloc(sizeof *F);
    if (F == NULL){
        fprintf(stderr, "InsertAndIndex: out of memory\n");
        exit(EXIT_FAILURE);
    }

    if (H->TheCells[pos].FileNo == 0)       /* new word */
        strcpy(H->TheCells[pos].Element, key);
    H->TheCells[pos].FileNo = FileNo;       /* update the most recent file */

    /* Push the file number onto the front of the inverted-index list. */
    F->FileNo = FileNo;
    F->Next = H->TheCells[pos].InvIndex;
    H->TheCells[pos].InvIndex = F;
    return pos;              /* inserted: return the word's position */
}

将单词在散列表中的位置存入文件索引表：

/*
 * Add hash-table position pos to the word list of file FileNo.
 *
 * File is the array of per-file head elements; FileNo is 1-based. A
 * negative pos (the value InsertAndIndex returns for a repeated word) is
 * ignored. Otherwise a new node holding pos is pushed onto the front of
 * the file's list and the word count in the head element is incremented.
 *
 * Terminates the program if the node cannot be allocated.
 */
void FileIndex(WList *File, int FileNo, int pos){
    WList *head;
    WList *W;

    if (pos < 0)
        return;              /* repeated words are not indexed again */

    W = malloc(sizeof *W);
    if (W == NULL){
        fprintf(stderr, "FileIndex: out of memory\n");
        exit(EXIT_FAILURE);
    }

    /* Insert into the file's index list. */
    head = &File[FileNo-1];
    W->words = pos;
    W->Next = head->Next;
    head->Next = W;
    head->words++;           /* the head element accumulates the word count */
}

计算两个文件之间的相似度：

/*
 * Similarity of files F1 and F2 (1-based file numbers) as a percentage:
 * 100 * (words present in both files) / (words present in either file).
 *
 * The word list of the file with fewer words is walked; for each word its
 * inverted-index list in H is scanned for the other file's number.
 *
 * Returns the percentage, or 0.0 when neither file has any recorded word
 * (the ratio would otherwise be 0/0).
 */
double work(WList *File, int F1, int F2, HashTable *H){
    int common = 0;          /* number of words shared by both files */
    int total;
    WList *W;
    FList *F;

    /* Use the file with the smaller vocabulary as the one whose list is walked. */
    if (File[F1-1].words > File[F2-1].words){
        int tmp = F1;
        F1 = F2;
        F2 = tmp;
    }

    /* Scan the file's word list. */
    for (W = File[F1-1].Next; W != NULL; W = W->Next){
        /* W->words is the word's position in the hash table; scan its inverted index. */
        for (F = H->TheCells[W->words].InvIndex; F != NULL; F = F->Next){
            if (F->FileNo == F2){   /* the word also occurs in the other file */
                common++;           /* so it is a shared word */
                break;
            }
        }
    }

    /* Total vocabulary of both files = sum of both vocabularies - shared words. */
    total = File[F1-1].words + File[F2-1].words - common;
    if (total == 0)
        return 0.0;
    return 100.0 * common / total;
}

主程序部分：

/*
 * Read the number of files, index the words of each file, then answer the
 * similarity queries, printing one percentage with one decimal per query.
 *
 * Exits with EXIT_FAILURE if the file count or query count cannot be read,
 * if the file count is not positive, or if the tables cannot be allocated.
 * Query processing stops at the first unreadable pair; a pair that names a
 * file outside 1..n is reported on stderr and skipped.
 */
int main(){
    int n, m, f1, f2;
    ElementType word;
    HashTable *H;
    WList *File;

    if (scanf("%d", &n) != 1 || n < 1)
        return EXIT_FAILURE;

    File = FileIndex_Init(n);
    H = Table_Init(MAXTable);   /* create the hash table */
    if (File == NULL || H == NULL)
        return EXIT_FAILURE;

    /* Read and index every file; files are numbered from 1. */
    for (int i = 1; i <= n; i++)
        while (GetWord(word))
            FileIndex(File, i, InsertAndIndex(i, word, H));

    if (scanf("%d", &m) != 1)
        return EXIT_FAILURE;

    /* Process each query. */
    for (int i = 0; i < m; i++){
        if (scanf("%d %d", &f1, &f2) != 2)
            break;
        if (f1 < 1 || f1 > n || f2 < 1 || f2 > n){
            fprintf(stderr, "query %d: file number out of range\n", i + 1);
            continue;
        }
        printf("%.1f%c\n", work(File, f1, f2, H), '%');
    }
    return 0;
}

ZzMeei

关注

27
点赞
踩
130

收藏

觉得还不错? 一键收藏
2
评论
哈希基于词频的文件相似度

哈希基于词频的文件相似度题目实现一种简单原始的文件相似度计算，即以两文件的公共词汇占总词汇的比例来定义相似度。为简化问题，这里不考虑中文（因为分词太难了），只考虑长度不小于3、且不超过10的英文单词，长度超过10的只考虑前10个字母。输入输入首先给出正整数N（<= 100），为文件总数，随后按以下格式给出每个文件的内容：首先给出文件正文，最后在一行中只给出一个字符“#”，表示文件结...
复制链接

扫一扫