7-8 基于词频的文件相似度 (30 分)

Xuic

已于 2022-04-23 18:53:20 修改

阅读量1.2k

点赞数 1

文章标签： c语言

于 2022-04-21 11:02:54 首次发布

本文链接：https://blog.csdn.net/qq_61414177/article/details/124316564

版权

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<ctype.h>

/* Tuning constants for the word reader and the hash table. */
#define MAXS 10//maximum number of letters kept per word (longer words are truncated)
#define MINS 3//minimum number of letters a word needs in order to be counted
#define MAXB 5//number of bits the hash value is shifted left per character
#define MAXTable 500009//number of cells in the hash table

/*
 * Per-file word list node. The array returned by InitializeFileIndex holds
 * one head node per file: in a head node Words is the number of distinct
 * words in the file; in every other node Words is the hash-table position
 * of one word of that file.
 */
typedef struct FileEntry *WList;
struct FileEntry{
    int Words;
    WList Next;
};
/* Inverted-index node: one file number in the list of files containing a word. */
typedef struct WordEntry *FList;
struct WordEntry{
    int FileNo;
    FList Next;
};
/* A word: at most MAXS characters plus the terminating '\0'. */
typedef char ElementType[MAXS+1];
/* One hash-table cell. */
struct HashEntry{
    ElementType Element;
    int FileNo;//0 means the cell is empty; otherwise the number of the last file that inserted this word
    FList InvIndex;//inverted index: list of the files that contain this word
};
/* Open-addressing hash table: TableSize cells stored in TheCells. */
struct HashTbl{
    int TableSize;
    struct HashEntry *TheCells;
};
typedef struct HashTbl *HashTable;

HashTable InitializeTable(int TableSize);//create and initialize the hash table
WList InitializeFileIndex(int Size);//create the per-file word index (one head node per file)
int GetAWord(ElementType Word);//read the next word from standard input
int Hash(char *Key, int P);//shift-based string hash of Key, reduced modulo P
int Find(ElementType Key, HashTable H);//return the position of Key, or the position where Key should be inserted
int InsertAndIndex(int FileNo, ElementType Key, HashTable H);//insert Key into the hash table and record FileNo in its inverted index
void FileIndex(WList File, int FileNo, int Pos);//store the word's hash-table position Pos in the index of file FileNo
double ComputeSim(WList File, int F1, int F2, HashTable H);//compute the similarity of files F1 and F2

/*
 * Program entry point.
 *
 * Input: the number of files N, then N texts each terminated by '#', then
 * the number of queries M followed by M pairs of 1-based file numbers.
 * Output: one line per valid query with the similarity as a percentage,
 * printed with one decimal place.
 *
 * Returns 1 when the header numbers cannot be read or N is not positive;
 * otherwise 0.
 */
int main()
{
    int N, M, F1, F2;
    ElementType Word;
    HashTable H;
    WList File;

    /* Without a valid file count nothing below can be indexed safely. */
    if(scanf("%d",&N)!=1 || N<=0) return 1;
    File = InitializeFileIndex(N);//one head node per file
    H = InitializeTable(MAXTable);//shared word table
    for(int i=0;i<N;i++)
        while(GetAWord(Word))
            FileIndex(File, i+1, InsertAndIndex(i+1, Word, H));

    if(scanf("%d",&M)!=1) return 1;
    for(int i=0;i<M;i++){//answer each query
        if(scanf("%d%d",&F1,&F2)!=2) break;//truncated query list
        /* File numbers index the File array; skip anything outside 1..N. */
        if(F1<1 || F1>N || F2<1 || F2>N) continue;
        printf("%.1lf%c\n",ComputeSim(File, F1, F2, H),'%');
    }
    return 0;
}

HashTable InitializeTable(int TableSize){
    HashTable H = malloc(sizeof(struct HashTbl));
    H->TableSize = TableSize;
    H->TheCells = malloc(sizeof(struct HashEntry)*H->TableSize);
    while(TableSize){
        H->TheCells[--TableSize].FileNo = 0;
        H->TheCells[TableSize].InvIndex = NULL;
    }
    return H;
}

WList InitializeFileIndex(int Size){
    WList F = malloc(sizeof(struct WordEntry)*Size);
    while(Size){
        F[--Size].Words = 0;
        F[Size].Next = NULL;
    }
    return F;
}

/*
 * Reads the next word from standard input into Word.
 *
 * A word is a maximal run of letters. It is lower-cased, only its first MAXS
 * letters are kept, and runs shorter than MINS letters are skipped.
 *
 * Returns 1 when a word was stored in Word (NUL-terminated), or 0 when the
 * end-of-file marker '#' or the end of input is reached first.
 *
 * Differences from the original: end of input now stops the reader instead
 * of looping forever; the character that ends a word is pushed back, so a
 * '#' directly after a word is still seen as the end marker by the next
 * call; short words are skipped with a loop rather than recursion.
 */
int GetAWord(ElementType Word){
    int c;//int, so that EOF is distinguishable and safe to pass to isalpha()
    int p;

    for(;;){
        p = 0;
        /* Skip everything that is neither a letter nor the end marker. */
        do{
            c = getchar();
        }while(c!=EOF && c!='#' && !isalpha(c));
        if(c==EOF || c=='#') return 0;//end of this file (or of all input)

        /* Consume the whole run of letters, keeping at most MAXS of them. */
        while(isalpha(c)){
            if(p<MAXS) Word[p++] = (char)tolower(c);
            c = getchar();
        }
        /* Give the terminator back so a '#' here is not lost. */
        if(c!=EOF) ungetc(c, stdin);

        if(p>=MINS){
            Word[p] = '\0';
            return 1;
        }
        /* Too short: drop it and look for the next word. */
    }
}

/*
 * Shift-based string hash: for every character the running value is shifted
 * left by MAXB bits and the character's offset from 'a' is added; the result
 * is reduced modulo P.
 */
int Hash(char *Key, int P){
    unsigned int Acc = 0;
    char *Cur;

    for(Cur = Key; *Cur != '\0'; Cur++)
        Acc = (Acc << MAXB) + (*Cur - 'a');
    return Acc % P;
}

/*
 * Locates Key in H using linear probing.
 *
 * Starting at Hash(Key), walks forward (wrapping at the end of the table)
 * until it meets either an empty cell (FileNo == 0) or a cell that already
 * holds Key, and returns that cell's position.
 */
int Find(ElementType Key, HashTable H){
    int Slot = Hash(Key, H->TableSize);

    for(;;){
        struct HashEntry *Cell = &H->TheCells[Slot];

        if(Cell->FileNo == 0) break;//empty cell: Key can be inserted here
        if(strcmp(Cell->Element, Key) == 0) break;//Key is already stored here
        /* Occupied by another word: probe the next cell, wrapping around. */
        if(++Slot == H->TableSize) Slot = 0;
    }
    return Slot;
}

/*
 * Records that file FileNo contains the word Key.
 *
 * If the word is new it is stored in the hash table. Unless the word was
 * already recorded for this same file, FileNo is pushed onto the front of
 * the word's inverted index and becomes the cell's "latest file".
 *
 * Returns the word's position in the table, or -1 when the word was already
 * recorded for FileNo (duplicate within the same file). FileNo must be
 * non-zero because 0 marks an empty cell. On allocation failure the program
 * prints a message to stderr and exits.
 */
int InsertAndIndex(int FileNo, ElementType Key, HashTable H){
    FList F;
    int Pos = Find(Key, H);//position of Key, or of the cell where it belongs

    /* The cell remembers the last file that touched it, which is enough to
       detect a repeat while that file is still being read. */
    if(H->TheCells[Pos].FileNo==FileNo) return -1;

    if(!H->TheCells[Pos].FileNo) strcpy(H->TheCells[Pos].Element, Key);//new word
    H->TheCells[Pos].FileNo = FileNo;//remember the latest file

    F = malloc(sizeof *F);
    if(F==NULL){
        fprintf(stderr,"InsertAndIndex: out of memory\n");
        exit(EXIT_FAILURE);
    }
    F->FileNo = FileNo;
    F->Next = H->TheCells[Pos].InvIndex;//push onto the front of the inverted index
    H->TheCells[Pos].InvIndex = F;
    return Pos;
}

/*
 * Adds the word stored at hash-table position Pos to the word list of file
 * FileNo (1-based) and increments that file's word count.
 *
 * A negative Pos (a duplicate reported by InsertAndIndex) is ignored. On
 * allocation failure the program prints a message to stderr and exits.
 */
void FileIndex(WList File, int FileNo, int Pos){
    WList W;
    WList Head;

    if(Pos<0) return;//duplicate word: nothing to record

    W = malloc(sizeof *W);
    if(W==NULL){
        fprintf(stderr,"FileIndex: out of memory\n");
        exit(EXIT_FAILURE);
    }
    Head = &File[FileNo-1];
    W->Words = Pos;//in a list node, Words holds the hash-table position
    W->Next = Head->Next;
    Head->Next = W;
    Head->Words++;//in the head node, Words counts the file's distinct words
}

/*
 * Computes the similarity of files F1 and F2 (1-based) as a percentage:
 *     100 * (words in both files) / (words in at least one of the files)
 *
 * The shorter word list is walked and each word's inverted index is searched
 * for the other file. Returns 0.0 when neither file contains any word; the
 * original divided 0 by 0 in that case and returned NaN.
 */
double ComputeSim(WList File, int F1, int F2, HashTable H){
    int Common = 0;//number of words the two files share
    int Total;
    WList W;
    FList F;

    /* Make F1 the file with fewer words so that the outer loop is shorter. */
    if(File[F1-1].Words > File[F2-1].Words){
        int Tmp = F1; F1 = F2; F2 = Tmp;
    }

    for(W = File[F1-1].Next; W; W = W->Next){
        /* W->Words is the word's table position; scan its inverted index. */
        for(F = H->TheCells[W->Words].InvIndex; F; F = F->Next){
            if(F->FileNo==F2){//the word also occurs in F2
                Common++;
                break;
            }
        }
    }

    /* Size of the union = sum of both word counts - common words. */
    Total = File[F1-1].Words + File[F2-1].Words - Common;
    if(Total==0) return 0.0;//both files are empty: avoid 0/0
    return ((double)(Common*100)/(double)Total);
}

第二种解法（平方探测散列表 + 文件两两公共词计数矩阵）

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define HashLenth 500009 //number of hash-table slots; author's note: measured memory cost is about 3MB, i.e. fewer than 1M entries. The original author used 10007, but the textbook uses 500009
 
/* One distinct word together with the files it has been seen in. */
typedef struct node *Node;
struct node {
    char*Word;
    int Reached[1000];//inverted index: file numbers containing this word, stored 1-based in Reached[1..Times]
    int Times;//number of entries used in Reached; Reached[Times] is the most recent file
};
Node Hash[HashLenth]= {0};//hash table; a NULL slot is empty
int Same[101][101]= {0};//Same[a][b]: number of words files a and b have in common
int Sum[101]= {0};//Sum[f]: number of distinct words in file f
int n;//number of files, read in main
 
/* Initial hash position of a word, computed from its first three letters. */
int HashKey(char*);
/* Reduces a possibly negative number to a slot index in 0..HashLenth-1. */
int Mod(int);
/* Reads the next token from stdin: a word, "#" at the end of a file, or NULL. */
char*scan();
/* Records that the given file contains the given word. */
void Insert(int ,char*);
/* Updates Sum and Same when a known word is seen in the given file. */
void Refresh(Node,int);
 
/*
 * Program entry point.
 *
 * Reads the file count n, indexes each file's words (scan() returns "#" at
 * the end of a file), then answers m queries "a b" with the percentage of
 * shared words, printed with one decimal place.
 *
 * Returns 1 when n or m cannot be read or n does not fit the fixed-size Sum
 * and Same tables; otherwise 0. Queries whose file numbers lie outside 1..n
 * are skipped.
 */
int main() {
    /* Sum is indexed by 1-based file number, so its last index is the limit. */
    const int MaxFile=(int)(sizeof Sum/sizeof Sum[0])-1;
    int j;
    int m;

    if(scanf("%d",&n)!=1 || n<1 || n>MaxFile) return 1;
    for( j=1; j<=n; j++) {
        while(1) {
            char*temp=scan();
            if(temp) {
                if(temp[0]=='#')
                    break;//end of file j

                Insert(j,temp);
            }
        }
    }

    if(scanf("%d",&m)!=1) return 1;
    while(m-->0) {
        int a,b;
        int total;
        if(scanf("%d%d",&a,&b)!=2) break;//truncated query list
        if(a<1 || a>n || b<1 || b>n) continue;//would index outside Sum/Same
        if(a==b) {
            /* A file is always 100% similar to itself. Same[a][a] cannot be
               used here: Insert never updates it and Refresh increments the
               diagonal twice, so it is not a real word count. */
            printf("100.0%%\n");
            continue;
        }
        /* Size of the union = both word counts - common words. */
        total=Sum[a]+Sum[b]-Same[a][b];
        if(total==0) printf("0.0%%\n");//both files are empty: avoid 0/0
        else printf("%.1f%%\n",Same[a][b]*100.0/total);
    }
    return 0;
}
//h是第几个文件 
/*
 * Registers that the already-known word N also occurs in file h.
 *
 * Nothing happens when the most recently recorded file is already h (a
 * repeat within the same file). Otherwise the file's word count grows by
 * one, h is appended to the word's file list, and the common-word counter is
 * incremented in both directions for every file in that list. The list
 * includes h itself, so Same[h][h] is incremented twice.
 */
void Refresh(Node N,int h) {
    int k;

    if(N->Reached[N->Times]==h) return;//already counted for this file

    Sum[h]+=1;//one more distinct word in file h
    N->Times+=1;
    N->Reached[N->Times]=h;//append h to the inverted index

    /* Same is kept symmetric, so both orientations are updated. */
    for(k=N->Times; k>=1; k--) {
        int other=N->Reached[k];
        Same[other][h]+=1;
        Same[h][other]+=1;
    }
}
 
/*
 * Initial hash position of word K, spread over the table to limit clustering.
 *
 * The first three letters are read as a base-32 number and scaled by
 * HashLenth/27482, where 27482 = 26*32*32+26*32+26. K must hold at least
 * three characters; for lower-case letters the result is in 0..HashLenth-1.
 *
 * The scaling is done in long long: the original multiplied in int, which
 * overflows (undefined behaviour) for most words.
 */
int HashKey(char*K) {
    long long prefix=(K[0]-'a')*32*32+(K[1]-'a')*32+(K[2]-'a');
    return (int)(prefix*HashLenth/27482);
}
 
/*
 * Reduces Num to the range 0..HashLenth-1, i.e. the non-negative remainder
 * of Num modulo HashLenth, also for negative Num.
 *
 * Yields the same value as the original add-until-non-negative loop, in
 * constant time.
 */
int Mod(int Num) {
    int r=Num%HashLenth;//C's % keeps the sign of Num, so r may be negative
    return r<0 ? r+HashLenth : r;
}
void Insert(int h ,char*K) {
    int Key=HashKey(K);
    int flag;
    int i;
    for( i=0; i<=HashLenth/2; i++) {
        flag=Mod(Key+i*i);
        if(!Hash[flag])break;//不存在 
        //插入的 
        else if(!strcmp(Hash[flag]->Word,K)) {
            Refresh(Hash[flag],h);
            return ;
        }
        flag=Mod(Key-i*i);
        if(!Hash[flag])break;
        else if(!strcmp(Hash[flag]->Word,K)) {
            Refresh(Hash[flag],h);
            return ;
        }
    }
    if(i>HashLenth/2) {//实践证明平台的词汇量并不多
        exit(1);
    }
    ++Sum[h];//对该文件词汇量+1
    Hash[flag]=(Node)malloc(sizeof(struct node));
    Hash[flag]->Word=(char*)malloc(sizeof(char)*strlen(K));
    strcpy(Hash[flag]->Word,K);
    Hash[flag]->Times=1;//怎么不是1？ 
    Hash[flag]->Reached[1]=h;//第一次遇到在文件h 
}
/*
 * Reads one token from standard input.
 *
 * Returns a pointer to a static buffer (overwritten by the next call) that
 * holds either
 *   - a lower-cased word of 3 to 10 letters (longer runs are truncated to
 *     their first 10 letters), or
 *   - "#" when the end-of-file marker or the end of input is reached,
 * or NULL when the letters read before a separator were too few (0 to 2) to
 * form a word.
 *
 * The end-of-file marker is a '#' immediately followed by a newline, and is
 * recognised only once at least one newline has been read.
 *
 * Fixes over the original:
 *   - end of input is reported as "#" instead of being mistaken for a
 *     separator forever (the character is held in an int so EOF is visible);
 *   - a word directly followed by the end marker is returned first and the
 *     marker is delivered by the next call, instead of the word being lost;
 *   - a '#' that is not followed by a newline is an ordinary separator and
 *     the following character is pushed back rather than glued to the word;
 *   - letters are tested with plain comparisons instead of the non-standard
 *     case-range extension.
 */
char*scan() {
    static char temp[11];
    static int Flag_br=0;//set once the first newline has been read; never cleared
    static int EndPending=0;//an end marker was seen right after a returned word
    int i=0;
    int c;

    if(EndPending) {//deliver the marker deferred by the previous call
        EndPending=0;
        temp[0]='#';
        temp[1]='\0';
        return temp;
    }

    while(1) {
        c=getchar();

        if(c=='#' && Flag_br==1) {
            int next=getchar();
            if(next=='\n') {//'#' followed by a newline ends the file
                if(i>2) {//hand out the pending word first
                    temp[i]='\0';
                    EndPending=1;
                    return temp;
                }
                temp[0]='#';
                temp[1]='\0';
                return temp;
            }
            /* Not a marker: keep the next character for later and treat
               the '#' like any other separator below. */
            if(next!=EOF) ungetc(next,stdin);
        }

        if(c>='a' && c<='z') {
            if(i<10)
                temp[i++]=(char)c;
            continue;
        }
        if(c>='A' && c<='Z') {
            if(i<10)
                temp[i++]=(char)(c-'A'+'a');
            continue;
        }

        /* Any other character, or EOF, ends the current run of letters. */
        if(c=='\n')Flag_br=1;
        temp[i]='\0';
        if(c==EOF) {
            if(i>2) {//last word of the input: return it, then report the end
                EndPending=1;
                return temp;
            }
            temp[0]='#';
            temp[1]='\0';
            return temp;
        }
        if(i>2) return temp;//a complete word was read
        return NULL;//too short to count
    }
}