多线程版本的Item_KNN的C语言实现

这里借鉴了Word2vec里面构建Hash索引的思想。版本1里面IO密集型和CPU密集型工作同时进行,导致效率很低;所以版本2先让所有线程完成CPU密集型的计算,然后再由主线程把结果写到文件中。经过测试,在6核(23虚拟核)的服务器上,开100个线程时执行效率最佳。
版本1:

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Size of an item id buffer, including the terminating NUL. */
#define MaxString 50
/*
 * Size in bytes of the buffer that holds one input line (10 MiB).
 * Parenthesized so the macro keeps its value when it is used inside a
 * larger expression such as `x / MaxUserSize`.
 */
#define MaxUserSize (1024*1024*10)
/* Number of most-similar neighbours kept for every item. */
#define SIMILARITY_ITEM 30
/* Chunk length passed to fgets when the per-thread files are merged. */
#define MaxLen 100
/*
 * Names of the temporary per-thread output files. CalItemSim opens
 * filename[k] for the thread with index k, and FileJoin later reads and
 * removes the same files. The table has room for 20 names.
 */
char filename[20][15]={ "output1.txt","output2.txt","output3.txt","output4.txt",
                        "output5.txt","output6.txt","output7.txt","output8.txt",
                        "output9.txt","output10.txt","output11.txt","output12.txt",
                        "output13.txt","output14.txt","output15.txt","output16.txt",
                        "output17.txt","output18.txt","output19.txt","output20.txt"};

/* Number of slots in the item_hash table; hashes are reduced modulo this. */
const int item_hash_size = 30000000;

/* One item together with the ids of the users recorded for it. */
struct itemInfo
{
    /* Item id as a NUL-terminated string. */
    char itemId[MaxString];
    /* Number of entries currently stored in userList. */
    int totalUser;
    /* Heap-allocated array of user ids; grown with realloc as needed. */
    unsigned long long * userList;
    /* Allocated capacity of userList, counted in elements. */
    int max_user;
};

/* max_item: allocated capacity of the item array; item_size: entries in use. */
unsigned long long max_item =1000,item_size=0;
/* Growable array of all items read so far. */
struct itemInfo * item;
/* Hash table: each slot holds an index into item, or -1 when empty. */
int * item_hash;
/* Scratch table receiving the space-separated fields of the current input line. */
char str1[10000][50];
/* Number of worker threads started by CreatMulThread. */
int num_threads=20;
/* Line buffer used by FileJoin while copying the per-thread files. */
char str2[100];

/*
 * Hash an item id to a slot index of item_hash.
 *
 * The hash is the polynomial sum (base 257) of the id's characters,
 * accumulated in an unsigned long long and reduced modulo item_hash_size.
 *
 * itemId: NUL-terminated item id.
 * Returns a slot index in the range [0, item_hash_size).
 */
int GetWordHash(char *itemId) {
  unsigned long long hash = 0;
  /* Walk to the terminator directly instead of calling strlen in the loop
     condition, which rescans the whole string on every iteration. */
  for (const char *cursor = itemId; *cursor != '\0'; cursor++)
    hash = hash * 257 + *cursor;
  return (int)(hash % item_hash_size);
}

/*
 * Look an item id up in the hash table using linear probing.
 *
 * itemId: NUL-terminated item id to find.
 * Returns the item's index in the item array, or -1 as soon as the probe
 * sequence reaches an empty slot.
 */
int SearchItem(char *itemId) {
  unsigned int slot = GetWordHash(itemId);
  for (;;) {
    int index = item_hash[slot];
    if (index == -1) return -1;
    if (strcmp(itemId, item[index].itemId) == 0) return index;
    /* Occupied by a different id: try the next slot, wrapping at the end. */
    slot = (slot + 1) % item_hash_size;
  }
}


int AddItemIdToItem(char *itemId,unsigned long long userId) {
  unsigned int hash;
  item[item_size].max_user = 100;
  item[item_size].userList = (unsigned long long *)calloc(item[item_size].max_user, sizeof(unsigned long long));
  strcpy(item[item_size].itemId, itemId);
  item[item_size].userList[0]=userId;
  item[item_size].totalUser = 1;
  item_size++;
  // Reallocate memory if needed
  if (item_size + 2 >= max_item) {
    max_item += 1000;
    item = (struct itemInfo *)realloc(item, max_item * sizeof(struct itemInfo));
  }
  hash = GetWordHash(itemId);
  while (item_hash[hash] != -1) hash = (hash + 1) % item_hash_size;
  item_hash[hash] = item_size - 1;
  return item_size - 1;
}


int ReadItemInfo()
{
    int a;
    //打开文件
    FILE * fin = fopen("data_1w.txt","rb");
    if (fin==NULL)
    {
        printf("The input file doesn't exist.\n");
        exit(1);
    }

    item =(struct itemInfo *)malloc(max_item*sizeof(struct itemInfo));
    if (item==NULL)
    {
        printf("item allocate failed.\n");
        exit(1);
    }

    //读取用户的购买记录
    char * str=(char *)malloc(MaxUserSize);
    while(fgets (str ,MaxUserSize,fin) != NULL)
    {
        //将用户购买记录分段
        memset(str1,0,sizeof(str1));
        int cn = 0;
        int b = 0;
        int c = 0;
        while(1){
            str1[cn][b] = str[c];
            b++;
            c++;
            str1[cn][b] = 0;
            if (str[c] == 10) break;
            if (str[c] == ' ') {
             cn++;
             b = 0;
             c++;
            }
        }
        cn++;

        //去除u的userID
        int len = strlen(str1[0]);
        unsigned long long value=0; 
        for (int j=1; j < len; j++)
            value = value*10+(str1[0][j]-'0');

        //将ItemId添加到Item中
        for (int i = 1; i < cn; ++i)
        {
            int index = SearchItem(str1[i]);
            if (index == -1) 
               a = AddItemIdToItem(str1[i],value);
            else {
                item[index].totalUser++;
                if (item[index].totalUser+2>=item[index].max_user)
                {
                    item[index].max_user+=100;
                    item[index].userList = (unsigned long long *)realloc(item[index].userList, item[index].max_user * sizeof(unsigned long long));
                }
                item[index].userList[item[index].totalUser-1]=value;
            }
        }

    }
    return 0;
}
/*
 * Allocate the global tables used by the program.
 *
 * item_hash gets item_hash_size slots, all set to -1 (empty), and item gets
 * room for max_item entries. Prints a message and exits with status 1 when
 * either allocation fails.
 */
void init(){
    /* Hash table: every slot is overwritten with -1 below, so zero-filling
       the memory first would be wasted work. */
    item_hash = malloc((size_t)item_hash_size * sizeof(int));
    if (item_hash == NULL)
    {
        printf("item_hash allocate failed.\n");
        exit(1);
    }
    for (int i = 0; i < item_hash_size; ++i) item_hash[i] =  -1;

    /* Storage for the items themselves. */
    item = malloc(max_item*sizeof(struct itemInfo));
    if (item == NULL)
    {
        printf("item allocate failed.\n");
        exit(1);
    }
}


/*
 * Binary search over array[0 .. n-1], which is expected to be sorted in
 * ascending order.
 *
 * Returns the index of an element equal to value, or -1 when there is none
 * (including when n is 0 or negative).
 */
int binary_search(unsigned long long array[],int n,unsigned long long value)  {
    int lo = 0;
    int hi = n - 1;

    while (lo <= hi) {
        /* Midpoint written as lo + half the gap so lo + hi cannot overflow. */
        int mid = lo + (hi - lo) / 2;
        unsigned long long probe = array[mid];

        if (probe == value)
            return mid;
        if (probe > value)
            hi = mid - 1;
        else
            lo = mid + 1;
    }
    return -1;
}

void * CalItemSim(void * a){
    FILE * fout = fopen(filename[(int)a],"w");
    float bestSim[SIMILARITY_ITEM];
    char  bestUserID[SIMILARITY_ITEM][50];
    float p,similarity;
    int common,pos;
    int left = item_size/ num_threads * (int)a;
    int right  = item_size / num_threads *((int)a+1)-1;
    if ((int)a==num_threads-1)
        right=item_size-1;
    for (int i = left; i <= right; ++i) //遍历此线程需要处理的item
    {
        for (int w = 0; w < SIMILARITY_ITEM; ++w) //初始化
        {
            bestSim[w]=-1;
            bestUserID[w][0]=0;
        }
        for (int j = 0; j < item_size; ++j)
        {
            if (i!=j)
            {
                common=0;
                for (int t = 0; t < item[i].totalUser; ++t)  //寻找购买的相同物品个数
                {   
                    pos=binary_search(item[j].userList,item[j].totalUser,item[i].userList[t]);
                    if (pos!=-1)
                        common++;
                }
                if(common>5){
                    p=sqrt(item[i].totalUser * item[j].totalUser);
                    similarity = common/p;
                    for (int k = 0; k < SIMILARITY_ITEM; ++k)
                    {
                        if (similarity>bestSim[k])
                        {
                            for (int q = SIMILARITY_ITEM -1; q > k; q--)
                            {
                                bestSim[q] = bestSim[q-1];
                                strcpy(bestUserID[q],bestUserID[q-1]);
                            }
                            bestSim[k] =similarity;
                            strcpy(bestUserID[k],item[j].itemId);
                            break;
                        }
                    }
                }

            }   
        }
        for (int c = 0; (c < SIMILARITY_ITEM)&&bestSim[c]!=-1; ++c)
        {
            fprintf(fout,"%s %s %f\n",item[i].itemId,bestUserID[c],bestSim[c]);
        }
    }
    fclose(fout);
    pthread_exit(NULL);
}


/*
 * Start num_threads worker threads running CalItemSim and wait for them.
 *
 * Each worker receives its index encoded in the pointer argument. Only
 * threads that were actually created are joined. If the thread-id array
 * cannot be allocated or a thread cannot be created, the function reports
 * it and exits with status 1 (after joining the threads already running),
 * because the result would otherwise be silently incomplete.
 */
void CreatMulThread(){
    pthread_t *pt = malloc((size_t)num_threads * sizeof(pthread_t));
    if (pt == NULL)
    {
        printf("thread table allocate failed.\n");
        exit(1);
    }

    int started = 0;
    for (long long a = 0; a < num_threads; a++)
    {
        if (pthread_create(&pt[a], NULL, CalItemSim, (void *)(intptr_t)a) != 0)
        {
            printf("failed to create thread %lld\n", a);
            break;
        }
        started++;
    }

    for (int k = 0; k < started; k++) pthread_join(pt[k], NULL);
    free(pt);

    if (started != num_threads)
        exit(1);
}
//将多个输出文件合并成
void FileJoin(){
    FILE * fout = fopen("output.txt","w");
    for (int i = 0; i < 20; ++i)
    {
        FILE * fin = fopen(filename[i],"r");
        while(fgets (str2 ,MaxLen,fin) != NULL){
            fprintf(fout,"%s",str2);
        }
        fclose(fin);

        if( remove(filename[i]) == 0 )
            printf("Removed %s\n", filename[i]);
        else
            perror("remove");
    }
    fclose(fout);
}
/*
 * Program entry point (version 1): set up the tables, load the records,
 * compute the similarities on the worker threads and merge their files.
 */
int main(){
    init();             /* hash table and item array */
    ReadItemInfo();     /* load the purchase records */
    CreatMulThread();   /* each worker writes its own temporary file */
    FileJoin();         /* concatenate the temporary files into output.txt */
    return 0;
}

版本2:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
/* Size of an item id buffer, including the terminating NUL. */
#define MaxString 50
/*
 * Size in bytes of the buffer that holds one input line (10 MiB).
 * Parenthesized so the macro keeps its value when it is used inside a
 * larger expression such as `x / MaxUserSize`.
 */
#define MaxUserSize (1024*1024*10)
/* Number of most-similar neighbours kept for every item. */
#define SIMILARITY_ITEM 30
/* Line-length limit; appears to be unused in this version. */
#define MaxLen 100
/* Number of worker threads and of per-thread result buffers. */
#define Num_Thread 100
/* Number of slots in the item_hash table; hashes are reduced modulo this. */
const int item_hash_size = 30000000;

/* One item together with the ids of the users recorded for it. */
struct itemInfo
{
    /* Item id as a NUL-terminated string. */
    char itemId[MaxString];
    /* Number of entries currently stored in userList. */
    int totalUser;
    /* Heap-allocated array of user ids; grown with realloc as needed. */
    unsigned long long * userList;
    /* Allocated capacity of userList, counted in elements. */
    int max_user;
};
/* One output row: an item, one of its neighbours and their similarity. */
struct recordInfo{
    /* Id of the item the row belongs to. */
    char itemId1[MaxString];
    /* Id of the similar item. */
    char itemId2[MaxString];
    /* Similarity score computed by CalItemSim. */
    float similarity;
};
/* Per-thread result buffers; record[k] is allocated and filled by thread k. */
struct recordInfo * record[Num_Thread];
/* Number of rows stored in each record[k]. */
int len[Num_Thread];
/* Allocated capacity of each record[k], counted in rows. */
int max[Num_Thread];
/* max_item: allocated capacity of the item array; item_size: entries in use. */
unsigned long long max_item =1000,item_size=0;
/* Growable array of all items read so far. */
struct itemInfo * item;
/* Hash table: each slot holds an index into item, or -1 when empty. */
int * item_hash;
/* Scratch table receiving the space-separated fields of the current input line. */
char str1[10000][50];
/* Spare line buffer; appears to be unused in this version. */
char str2[100];

/*
 * Hash an item id to a slot index of item_hash.
 *
 * The hash is the polynomial sum (base 257) of the id's characters,
 * accumulated in an unsigned long long and reduced modulo item_hash_size.
 *
 * itemId: NUL-terminated item id.
 * Returns a slot index in the range [0, item_hash_size).
 */
int GetWordHash(char *itemId) {
  unsigned long long hash = 0;
  const char *cursor;
  /* Walk to the terminator directly instead of calling strlen in the loop
     condition, which rescans the whole string on every iteration. */
  for (cursor = itemId; *cursor != '\0'; cursor++)
    hash = hash * 257 + *cursor;
  return (int)(hash % item_hash_size);
}

/*
 * Look an item id up in the hash table using linear probing.
 *
 * itemId: NUL-terminated item id to find.
 * Returns the item's index in the item array, or -1 as soon as the probe
 * sequence reaches an empty slot.
 */
int SearchItem(char *itemId) {
  unsigned int slot = GetWordHash(itemId);
  int index;
  for (;;) {
    index = item_hash[slot];
    if (index == -1) return -1;
    if (strcmp(itemId, item[index].itemId) == 0) return index;
    /* Occupied by a different id: try the next slot, wrapping at the end. */
    slot = (slot + 1) % item_hash_size;
  }
}


int AddItemIdToItem(char *itemId,unsigned long long userId) {
  unsigned int hash;
  item[item_size].max_user = 100;
  item[item_size].userList = (unsigned long long *)calloc(item[item_size].max_user, sizeof(unsigned long long));
  strcpy(item[item_size].itemId, itemId);
  item[item_size].userList[0]=userId;
  item[item_size].totalUser = 1;
  item_size++;
  // Reallocate memory if needed
  if (item_size + 2 >= max_item) {
    max_item += 1000;
    item = (struct itemInfo *)realloc(item, max_item * sizeof(struct itemInfo));
  }
  hash = GetWordHash(itemId);
  while (item_hash[hash] != -1) hash = (hash + 1) % item_hash_size;
  item_hash[hash] = item_size - 1;
  return item_size - 1;
}


int ReadItemInfo()
{

    int a;
    //打开文件
    FILE * fin = fopen("record_20w.txt","rb");
    if (fin==NULL)
    {
        printf("The input file doesn't exist.\n");
        exit(1);
    }

    item =(struct itemInfo *)malloc(max_item*sizeof(struct itemInfo));
    if (item==NULL)
    {
        printf("item allocate failed.\n");
        exit(1);
    }

    //读取用户的购买记录
    char * str=(char *)malloc(MaxUserSize);
    while(fgets (str ,MaxUserSize,fin) != NULL)
    {
        //将用户购买记录分段
        memset(str1,0,sizeof(str1));
        int cn = 0;
        int b = 0;
        int c = 0;
        while(1){
            str1[cn][b] = str[c];
            b++;
            c++;
            str1[cn][b] = 0;
            if (str[c] == 10) break;
            if (str[c] == ' ') {
             cn++;
             b = 0;
             c++;
            }
        }
        cn++;

        //去除u的userID
        int len = strlen(str1[0]);
        unsigned long long value=0; 
        int j;
        for ( j=1; j < len; j++)
            value = value*10+(str1[0][j]-'0');

        //将ItemId添加到Item中
        int i;
        for ( i = 1; i < cn; ++i)
        {
            int index = SearchItem(str1[i]);
            if (index == -1) 
               a = AddItemIdToItem(str1[i],value);
            else {
                item[index].totalUser++;
                if (item[index].totalUser+2>=item[index].max_user)
                {
                    item[index].max_user+=100;
                    item[index].userList = (unsigned long long *)realloc(item[index].userList, item[index].max_user * sizeof(unsigned long long));
                }
                item[index].userList[item[index].totalUser-1]=value;
            }
        }

    }

    return 0;
}
/*
 * Initialise the global state used by the program.
 *
 * Every per-thread result buffer starts with a capacity of 1000 rows and a
 * length of 0. item_hash gets item_hash_size slots, all set to -1 (empty),
 * and item gets room for max_item entries. Prints a message and exits with
 * status 1 when an allocation fails.
 */
void init(){
    int i;

    /* Per-thread capacity and length counters. */
    for(i=0;i<Num_Thread;i++){
        max[i]=1000;
        len[i]=0;
    }

    /* Hash table: every slot is overwritten with -1 below, so zero-filling
       the memory first would be wasted work. */
    item_hash = malloc((size_t)item_hash_size * sizeof(int));
    if (item_hash == NULL)
    {
        printf("item_hash allocate failed.\n");
        exit(1);
    }
    for ( i = 0; i < item_hash_size; ++i) item_hash[i] =  -1;

    /* Storage for the items themselves. */
    item = malloc(max_item*sizeof(struct itemInfo));
    if (item == NULL)
    {
        printf("item allocate failed.\n");
        exit(1);
    }
}


/*
 * Binary search over array[0 .. n-1], which is expected to be sorted in
 * ascending order.
 *
 * Returns the index of an element equal to value, or -1 when there is none
 * (including when n is 0 or negative).
 */
int binary_search(unsigned long long array[],int n,unsigned long long value)  {
    int lo = 0;
    int hi = n - 1;
    int mid;
    unsigned long long probe;

    while (lo <= hi) {
        /* Midpoint written as lo + half the gap so lo + hi cannot overflow. */
        mid = lo + (hi - lo) / 2;
        probe = array[mid];

        if (probe == value)
            return mid;
        if (probe > value)
            hi = mid - 1;
        else
            lo = mid + 1;
    }
    return -1;
}

void * CalItemSim(void * a){
    int id  = *(int *)a;
    record[id] = (struct recordInfo *)malloc(max[id]*sizeof(struct recordInfo));
    float bestSim[SIMILARITY_ITEM];
    char  bestUserID[SIMILARITY_ITEM][50];
    float p,similarity;
    int common,pos;
    int left = item_size/ Num_Thread * id;
    int right  = item_size / Num_Thread *(id+1)-1;
    if (id==Num_Thread-1)
        right=item_size-1;
    int i,w,j,t,k,q;
    for ( i = left; i <= right; ++i) //遍历此线程需要处理的item
    {
        for ( w = 0; w < SIMILARITY_ITEM; ++w) //初始化
        {
            bestSim[w]=-1;
            bestUserID[w][0]=0;
        }
        for ( j = 0; j < item_size; ++j)
        {
            if (i!=j)
            {
                common=0;
                for ( t = 0; t < item[i].totalUser; ++t)  //寻找购买的相同物品个数
                {   
                    pos=binary_search(item[j].userList,item[j].totalUser,item[i].userList[t]);
                    if (pos!=-1)
                        common++;
                }
                if(common>5){
                    p=sqrt(item[i].totalUser * item[j].totalUser);
                    similarity = common/p;
                    for (k = 0; k < SIMILARITY_ITEM; ++k)
                    {
                        if (similarity>bestSim[k])
                        {
                            for ( q = SIMILARITY_ITEM -1; q > k; q--)
                            {
                                bestSim[q] = bestSim[q-1];
                                strcpy(bestUserID[q],bestUserID[q-1]);
                            }
                            bestSim[k] =similarity;
                            strcpy(bestUserID[k],item[j].itemId);
                            break;
                        }
                    }
                }

            }   
        }
        int c;
        for ( c = 0; (c < SIMILARITY_ITEM)&&bestSim[c]!=-1; ++c){
            if (len[id]+5>=max[id])
            {
                max[id]+=1000;
                record[id] = (struct recordInfo *)realloc(record[id], max[id] * sizeof(struct recordInfo));  
            }
            strcpy(record[id][len[id]].itemId1,item[i].itemId);
            strcpy(record[id][len[id]].itemId2,bestUserID[c]);
            record[id][len[id]].similarity = bestSim[c];
            len[id]++;
        }


    }
    pthread_exit(NULL);
}


/*
 * Start Num_Thread worker threads running CalItemSim and wait for them.
 *
 * Each worker receives a pointer to its own slot of the local id array,
 * which stays alive until every started thread has been joined. Only
 * threads that were actually created are joined. If the thread-id array
 * cannot be allocated or a thread cannot be created, the function reports
 * it and exits with status 1 (after joining the threads already running),
 * because the result would otherwise be silently incomplete.
 */
void CreatMulThread(){
    pthread_t *pt = malloc(Num_Thread* sizeof(pthread_t));
    int id[Num_Thread];
    int a;
    int started = 0;

    if (pt == NULL)
    {
        printf("thread table allocate failed.\n");
        exit(1);
    }

    for(a=0;a<Num_Thread;a++) id[a]=a;

    for ( a = 0; a < Num_Thread; a++)
    {
        if (pthread_create(&pt[a], NULL, CalItemSim, (void *)&id[a]) != 0)
        {
            printf("failed to create thread %d\n", a);
            break;
        }
        started++;
    }

    for ( a = 0; a < started; a++) pthread_join(pt[a], NULL);
    free(pt);

    if (started != Num_Thread)
        exit(1);
}
/*
 * Write the rows collected by all threads to "output_item_knn.txt", one
 * "<item> <neighbour> <similarity>" line per row, in thread order.
 * Exits with status 1 when the file cannot be created.
 */
void Output(){
    FILE * fout = fopen("output_item_knn.txt","w");
    int i,j;

    if (fout == NULL)
    {
        perror("output_item_knn.txt");
        exit(1);
    }

    for (i = 0; i < Num_Thread; ++i)
    {
        for (j = 0; j < len[i]; ++j)
        {
            fprintf(fout, "%s %s %f\n",record[i][j].itemId1,record[i][j].itemId2,record[i][j].similarity);
        }
    }

    /* fclose flushes the buffered rows, so a failure here means lost output. */
    if (fclose(fout) != 0)
        perror("output_item_knn.txt");
}

/*
 * Program entry point (version 2): set up the tables, load the records,
 * compute the similarities on the worker threads, print how many rows each
 * thread produced and their total, then write all rows to the output file.
 */
int main(){
    int total = 0;
    int t;

    init();
    ReadItemInfo();
    printf("sad");
    CreatMulThread();

    /* Per-thread row counts (numbered from 1) and their sum. */
    for (t = 0; t < Num_Thread; ++t)
    {
        printf("len%d:%d\n",t+1,len[t]);
        total += len[t];
    }
    printf("sum:%d\n",total);

    Output();
    return 0;
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值