/***************************************************************************
*
* Copyright (c) 2014 Baidu.com, Inc. All Rights Reserved
* $Id$
*
**************************************************************************/
/**
* @file largeSort.c
* @author liuyi(liuyi04@baidu.com)
* @date 2014/04/08 19:51:59
* @version $Revision$
* @brief
* sort largeScale text data
**/
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
#define MAXLINE 110000
#define MAXLEN 1024
/**********************小文件的内排序**********************/
/**
 * @brief qsort-style comparator for elements that are themselves C strings.
 *
 * Each argument is treated as the address of the first character of a
 * NUL-terminated string (e.g. a row of a char[N][M] array).
 *
 * @param p address of the first string
 * @param q address of the second string
 * @return the strcmp() result for the two strings
 */
int compare_str(const void *p, const void *q)
{
    const char *lhs = p;
    const char *rhs = q;

    return strcmp(lhs, rhs);
}
/**
 * @brief qsort-style comparator for an array of string pointers.
 *
 * Each argument points at an array element, i.e. at a pointer to a
 * NUL-terminated string; the pointed-to strings are compared.
 *
 * @param p address of the first string pointer
 * @param q address of the second string pointer
 * @return the strcmp() result for the two strings
 */
int compare_str2(const void* p, const void* q)
{
    const char *lhs = *(char* const*)p;
    const char *rhs = *(char* const*)q;

    return strcmp(lhs, rhs);
}
/**
 * @brief Sort the lines of one text file and write them to another file.
 *
 * in_file is read with fgets() in chunks of at most MAXLEN-1 characters;
 * the chunks are sorted in strcmp() order and written to out_file.
 * A line longer than MAXLEN-1 characters is read as several chunks that
 * are sorted independently of each other.
 *
 * On failure (a file cannot be opened, the input has more than MAXLINE
 * chunks, or memory runs out) a message is printed and the process exits
 * with EXIT_FAILURE.
 *
 * @param in_file   path of the file to read
 * @param out_file  path of the file that receives the sorted lines
 */
void sort_file(char* in_file,char* out_file)
{
    char **lines;   /* heap array: MAXLINE pointers are too large for the stack */
    char *buf;      /* scratch buffer for one fgets() chunk */
    FILE *fp_in, *fp_out;
    int count = 0, i;

    if (NULL == (fp_in = fopen(in_file,"r")))
    {
        printf("cannot open %s file\n",in_file);
        exit(EXIT_FAILURE);
    }

    lines = malloc(sizeof *lines * MAXLINE);
    buf = malloc(MAXLEN);
    if (NULL == lines || NULL == buf)
    {
        printf("out of memory while sorting %s\n",in_file);
        exit(EXIT_FAILURE);
    }

    while (fgets(buf,MAXLEN,fp_in) != NULL)
    {
        size_t size = strlen(buf) + 1;

        /* Refuse to write past the end of the pointer array. */
        if (count >= MAXLINE)
        {
            printf("%s has more than %d lines\n",in_file,MAXLINE);
            exit(EXIT_FAILURE);
        }
        /* Keep only as many bytes as the line needs, not a full MAXLEN. */
        lines[count] = malloc(size);
        if (NULL == lines[count])
        {
            printf("out of memory while sorting %s\n",in_file);
            exit(EXIT_FAILURE);
        }
        memcpy(lines[count],buf,size);
        ++count;
    }
    free(buf);
    fclose(fp_in);

    qsort(lines,(size_t)count,sizeof *lines,compare_str2);

    if (NULL == (fp_out = fopen(out_file,"w")))
    {
        printf("cannot open %s file\n",out_file);
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < count; ++i)
    {
        fputs(lines[i],fp_out);
        free(lines[i]);
    }
    free(lines);
    fclose(fp_out);
}
/**
 * @brief Sort every partition file "part.<i>" into "in.<i>".
 *
 * Calls sort_file() once for each i in [0, file_num).  File names are
 * built with snprintf() into buffers large enough for any int, so indexes
 * of two or more digits no longer overflow a scratch buffer.
 *
 * @param file_num number of partition files to sort
 */
void sort_all(int file_num)
{
    char in_file[32], out_file[32];
    int i;

    for (i = 0; i < file_num; ++i)
    {
        snprintf(in_file,sizeof in_file,"part.%d",i);
        snprintf(out_file,sizeof out_file,"in.%d",i);
        sort_file(in_file,out_file);
    }
}
/**********************分割文件***********************/
/**
 * @brief Split the input files "text.0" .. "text.<file_num-1>" into part files.
 *
 * The inputs are read in order with fgets() (chunks of at most MAXLEN-1
 * characters) and written consecutively to "part.0", "part.1", ...; a new
 * part file is started after every file_size chunks.  A newline is appended
 * whenever the last chunk of an input file or of a part file lacks one, so
 * that the following line is never glued onto it.
 *
 * If an input or part file cannot be opened, or file_size is not positive,
 * a message is printed and the process exits with EXIT_FAILURE.
 *
 * @param file_num  number of input files
 * @param file_size number of lines per part file (must be > 0)
 * @return zero-based index of the last part file created, i.e. the number
 *         of part files minus one; 0 is also returned when no line was read
 *         (no part file is created in that case)
 */
int partion_file(int file_num, int file_size)
{
    char in_file[32], out_file[32];
    char line[MAXLEN];
    FILE *fp_in;
    FILE *fp_out = NULL;        /* non-NULL only while a part file is open */
    int i;
    int file_line = 0;          /* total number of lines over all inputs */
    int small_file_num = 0;     /* index of the most recent part file */
    int ends_with_newline = 1;  /* does the last written chunk end in '\n'? */

    if (file_size <= 0)
    {
        printf("file_size must be positive\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < file_num; ++i)
    {
        snprintf(in_file,sizeof in_file,"text.%d",i);
        if (NULL == (fp_in = fopen(in_file,"r")))
        {
            printf("cannot open %s file\n",in_file);
            exit(EXIT_FAILURE);
        }
        while (fgets(line,MAXLEN,fp_in) != NULL)
        {
            size_t len;

            /* No part file open: file_line is a multiple of file_size. */
            if (NULL == fp_out)
            {
                small_file_num = file_line/file_size;
                snprintf(out_file,sizeof out_file,"part.%d",small_file_num);
                if (NULL == (fp_out = fopen(out_file,"w")))
                {
                    printf("cannot open %s file\n",out_file);
                    exit(EXIT_FAILURE);
                }
            }

            fputs(line,fp_out);
            len = strlen(line);
            ends_with_newline = (len > 0 && line[len-1] == '\n');
            ++file_line;

            /* The part file is full: terminate its last line and close it. */
            if (0 == file_line%file_size)
            {
                if (!ends_with_newline)
                    fputs("\n",fp_out);
                fclose(fp_out);
                fp_out = NULL;
                ends_with_newline = 1;
            }
        }
        fclose(fp_in);

        /* An input without a trailing newline must not merge with the
         * first line of the next input. */
        if (fp_out != NULL && !ends_with_newline)
        {
            fputs("\n",fp_out);
            ends_with_newline = 1;
        }
    }

    /* Close the last, partially filled part file (if any). */
    if (fp_out != NULL)
        fclose(fp_out);
    return small_file_num;
}
/**************************外部排序(利用败者树)***********************/
/**
 * @brief Replay the path from leaf s up to the root of the loser tree.
 *
 * Starting at internal node (s+k)/2 and halving the node index until it
 * reaches 0, the current candidate b[s] is compared with the key recorded
 * at each node.  When the candidate is greater (strcmp() > 0) it is stored
 * in the node and the key previously stored there continues upwards.
 * The candidate left at the end is written to ls[0].
 *
 * @param ls internal nodes; each entry is an index into b
 * @param b  keys (NUL-terminated strings)
 * @param k  number of leaves
 * @param s  index of the leaf whose key changed
 */
void adjust(int* ls, char**b, int k, int s)
{
    int node;

    for (node = (s + k) / 2; node > 0; node /= 2)
    {
        int rival = ls[node];

        if (strcmp(b[s], b[rival]) > 0)
        {
            /* the candidate loses: it stays here, the rival moves up */
            ls[node] = s;
            s = rival;
        }
    }
    ls[0] = s;
}
/**
 * @brief Build the loser tree over the keys b[0] .. b[k-1].
 *
 * b[k] is set to the empty string, which serves as the smallest possible
 * key, and every entry of ls initially refers to it.  adjust() is then
 * called for each leaf, from k-1 down to 0.
 *
 * @param ls internal nodes of the tree (k entries), filled by this function
 * @param b  keys; must provide k+1 writable strings
 * @param k  number of leaves
 */
void create_loser_tree(int* ls, char** b, int k)
{
    int leaf;

    b[k][0] = '\0';             /* minimum key */

    for (leaf = 0; leaf < k; ++leaf)
        ls[leaf] = k;

    leaf = k;
    while (leaf-- > 0)
        adjust(ls,b,k,leaf);
}
/**
 * @brief Merge k sorted input streams into the file "sorted".
 *
 * The first line of every stream is loaded into b, a loser tree is built
 * over those keys, and the smallest key is repeatedly written out and
 * replaced by the next line of the same stream.  An exhausted stream is
 * represented by the sentinel key MAXKEY (the single byte 0xFF); the merge
 * ends when the smallest key equals that sentinel.
 *
 * NOTE: because strcmp() compares bytes as unsigned char, a data line that
 * starts with byte 0xFF compares greater than the sentinel and would not be
 * written out.
 *
 * If "sorted" cannot be opened a message is printed and the process exits
 * with EXIT_FAILURE.
 *
 * @param ls loser-tree nodes, k entries
 * @param b  key buffers, k+1 entries of at least MAXLEN bytes each
 * @param k  number of input streams
 * @param fp the k input streams, already opened for reading
 */
void k_merge(int* ls, char** b, int k, FILE** fp)
{
    int i, q;
    FILE* fp_out;
    char MAXKEY[2];

    MAXKEY[0] = (char)255;
    MAXKEY[1] = '\0';

    if (NULL == (fp_out = fopen("sorted","w")))
    {
        printf("cannot open sorted file\n");
        exit(EXIT_FAILURE);
    }
    if (k <= 0)                 /* nothing to merge; ls[0] does not exist */
    {
        fclose(fp_out);
        return;
    }

    /* Load the first key of each stream; an empty stream is exhausted
     * from the start and must not leave its buffer uninitialised. */
    for (i = 0; i < k; ++i)
    {
        if (NULL == fgets(b[i],MAXLEN,fp[i]))
            strcpy(b[i],MAXKEY);
    }

    create_loser_tree(ls,b,k);
    while (strcmp(b[ls[0]],MAXKEY) != 0)
    {
        q = ls[0];
        fputs(b[q],fp_out);     /* emit the current minimum */
        if (NULL == fgets(b[q],MAXLEN,fp[q]))
            strcpy(b[q],MAXKEY);    /* stream q is exhausted */
        adjust(ls,b,k,q);
    }
    fclose(fp_out);
}
/**
 * @brief Merge the sorted files "in.0" .. "in.<file_num-1>" with k_merge().
 *
 * All file_num files are opened at once and merged in a single pass, so
 * the number of files is used directly as the merge order k.  Every
 * resource acquired here (key buffers, pointer arrays, streams) is
 * released before returning.
 *
 * If a file cannot be opened or memory runs out, a message is printed and
 * the process exits with EXIT_FAILURE.  Nothing is done when file_num <= 0.
 *
 * @param file_num number of sorted input files
 */
void external_sort(int file_num)
{
    int i, k = file_num;
    int *ls;
    char **b;
    FILE **fp;

    if (k <= 0)
        return;

    ls = malloc(sizeof *ls * (size_t)k);
    b = malloc(sizeof *b * ((size_t)k + 1));    /* b[k] holds the minimum key */
    fp = malloc(sizeof *fp * (size_t)k);
    if (NULL == ls || NULL == b || NULL == fp)
    {
        printf("out of memory in external_sort\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i <= k; ++i)
    {
        b[i] = malloc(MAXLEN);
        if (NULL == b[i])
        {
            printf("out of memory in external_sort\n");
            exit(EXIT_FAILURE);
        }
    }

    /* Open the k sorted runs. */
    for (i = 0; i < k; ++i)
    {
        char in_file[32];

        snprintf(in_file,sizeof in_file,"in.%d",i);
        if (NULL == (fp[i] = fopen(in_file,"r")))
        {
            printf("cannot open %s\n",in_file);
            exit(EXIT_FAILURE);
        }
    }

    k_merge(ls,b,k,fp);

    for (i = 0; i < k; ++i)
        fclose(fp[i]);
    for (i = 0; i <= k; ++i)
        free(b[i]);
    free(fp);
    free(b);
    free(ls);
}
/**
 * @brief Split 12 input files into parts of 100000 lines, sort each part,
 *        merge the sorted parts and print the elapsed processor time.
 *
 * partion_file() reports the index of the last part file, so the number
 * of part files is that value plus one.
 */
int main()
{
    const int input_files = 12;
    const int lines_per_part = 100000;
    clock_t t_begin, t_end;
    int part_count;

    t_begin = clock();

    part_count = partion_file(input_files, lines_per_part) + 1;
    printf("No. of small files:%d\n",part_count);
    sort_all(part_count);
    external_sort(part_count);

    t_end = clock();
    printf("the running time is: %fs\n", (double)(t_end-t_begin)/CLOCKS_PER_SEC);
    return 0;
}
/* vim: set ts=4 sw=4 sts=4 tw=100 noet: */
/***************************************************************************
*
* Copyright (c) 2014 Baidu.com, Inc. All Rights Reserved
* $Id$
*
**************************************************************************/
/**
* @file largeSort.c
* @author liuyi(liuyi04@baidu.com)
* @date 2014/04/08 19:51:59
* @version $Revision$
* @brief
* sort largeScale text data
**/
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
#define MAXLINE 110000
#define MAXLEN 1024
/**********************小文件的内排序**********************/
/**
 * @brief qsort-style comparator for elements that are themselves C strings.
 *
 * Each argument is treated as the address of the first character of a
 * NUL-terminated string (e.g. a row of a char[N][M] array).
 *
 * @param p address of the first string
 * @param q address of the second string
 * @return the strcmp() result for the two strings
 */
int compare_str(const void *p, const void *q)
{
    const char *lhs = p;
    const char *rhs = q;

    return strcmp(lhs, rhs);
}
/**
 * @brief qsort-style comparator for an array of string pointers.
 *
 * Each argument points at an array element, i.e. at a pointer to a
 * NUL-terminated string; the pointed-to strings are compared.
 *
 * @param p address of the first string pointer
 * @param q address of the second string pointer
 * @return the strcmp() result for the two strings
 */
int compare_str2(const void* p, const void* q)
{
    const char *lhs = *(char* const*)p;
    const char *rhs = *(char* const*)q;

    return strcmp(lhs, rhs);
}
/**
 * @brief Sort the lines of one text file and write them to another file.
 *
 * in_file is read with fgets() in chunks of at most MAXLEN-1 characters;
 * the chunks are sorted in strcmp() order and written to out_file.
 * A line longer than MAXLEN-1 characters is read as several chunks that
 * are sorted independently of each other.
 *
 * On failure (a file cannot be opened, the input has more than MAXLINE
 * chunks, or memory runs out) a message is printed and the process exits
 * with EXIT_FAILURE.
 *
 * @param in_file   path of the file to read
 * @param out_file  path of the file that receives the sorted lines
 */
void sort_file(char* in_file,char* out_file)
{
    char **lines;   /* heap array: MAXLINE pointers are too large for the stack */
    char *buf;      /* scratch buffer for one fgets() chunk */
    FILE *fp_in, *fp_out;
    int count = 0, i;

    if (NULL == (fp_in = fopen(in_file,"r")))
    {
        printf("cannot open %s file\n",in_file);
        exit(EXIT_FAILURE);
    }

    lines = malloc(sizeof *lines * MAXLINE);
    buf = malloc(MAXLEN);
    if (NULL == lines || NULL == buf)
    {
        printf("out of memory while sorting %s\n",in_file);
        exit(EXIT_FAILURE);
    }

    while (fgets(buf,MAXLEN,fp_in) != NULL)
    {
        size_t size = strlen(buf) + 1;

        /* Refuse to write past the end of the pointer array. */
        if (count >= MAXLINE)
        {
            printf("%s has more than %d lines\n",in_file,MAXLINE);
            exit(EXIT_FAILURE);
        }
        /* Keep only as many bytes as the line needs, not a full MAXLEN. */
        lines[count] = malloc(size);
        if (NULL == lines[count])
        {
            printf("out of memory while sorting %s\n",in_file);
            exit(EXIT_FAILURE);
        }
        memcpy(lines[count],buf,size);
        ++count;
    }
    free(buf);
    fclose(fp_in);

    qsort(lines,(size_t)count,sizeof *lines,compare_str2);

    if (NULL == (fp_out = fopen(out_file,"w")))
    {
        printf("cannot open %s file\n",out_file);
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < count; ++i)
    {
        fputs(lines[i],fp_out);
        free(lines[i]);
    }
    free(lines);
    fclose(fp_out);
}
/**
 * @brief Sort every partition file "part.<i>" into "in.<i>".
 *
 * Calls sort_file() once for each i in [0, file_num).  File names are
 * built with snprintf() into buffers large enough for any int, so indexes
 * of two or more digits no longer overflow a scratch buffer.
 *
 * @param file_num number of partition files to sort
 */
void sort_all(int file_num)
{
    char in_file[32], out_file[32];
    int i;

    for (i = 0; i < file_num; ++i)
    {
        snprintf(in_file,sizeof in_file,"part.%d",i);
        snprintf(out_file,sizeof out_file,"in.%d",i);
        sort_file(in_file,out_file);
    }
}
/**********************分割文件***********************/
/**
 * @brief Split the input files "text.0" .. "text.<file_num-1>" into part files.
 *
 * The inputs are read in order with fgets() (chunks of at most MAXLEN-1
 * characters) and written consecutively to "part.0", "part.1", ...; a new
 * part file is started after every file_size chunks.  A newline is appended
 * whenever the last chunk of an input file or of a part file lacks one, so
 * that the following line is never glued onto it.
 *
 * If an input or part file cannot be opened, or file_size is not positive,
 * a message is printed and the process exits with EXIT_FAILURE.
 *
 * @param file_num  number of input files
 * @param file_size number of lines per part file (must be > 0)
 * @return zero-based index of the last part file created, i.e. the number
 *         of part files minus one; 0 is also returned when no line was read
 *         (no part file is created in that case)
 */
int partion_file(int file_num, int file_size)
{
    char in_file[32], out_file[32];
    char line[MAXLEN];
    FILE *fp_in;
    FILE *fp_out = NULL;        /* non-NULL only while a part file is open */
    int i;
    int file_line = 0;          /* total number of lines over all inputs */
    int small_file_num = 0;     /* index of the most recent part file */
    int ends_with_newline = 1;  /* does the last written chunk end in '\n'? */

    if (file_size <= 0)
    {
        printf("file_size must be positive\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < file_num; ++i)
    {
        snprintf(in_file,sizeof in_file,"text.%d",i);
        if (NULL == (fp_in = fopen(in_file,"r")))
        {
            printf("cannot open %s file\n",in_file);
            exit(EXIT_FAILURE);
        }
        while (fgets(line,MAXLEN,fp_in) != NULL)
        {
            size_t len;

            /* No part file open: file_line is a multiple of file_size. */
            if (NULL == fp_out)
            {
                small_file_num = file_line/file_size;
                snprintf(out_file,sizeof out_file,"part.%d",small_file_num);
                if (NULL == (fp_out = fopen(out_file,"w")))
                {
                    printf("cannot open %s file\n",out_file);
                    exit(EXIT_FAILURE);
                }
            }

            fputs(line,fp_out);
            len = strlen(line);
            ends_with_newline = (len > 0 && line[len-1] == '\n');
            ++file_line;

            /* The part file is full: terminate its last line and close it. */
            if (0 == file_line%file_size)
            {
                if (!ends_with_newline)
                    fputs("\n",fp_out);
                fclose(fp_out);
                fp_out = NULL;
                ends_with_newline = 1;
            }
        }
        fclose(fp_in);

        /* An input without a trailing newline must not merge with the
         * first line of the next input. */
        if (fp_out != NULL && !ends_with_newline)
        {
            fputs("\n",fp_out);
            ends_with_newline = 1;
        }
    }

    /* Close the last, partially filled part file (if any). */
    if (fp_out != NULL)
        fclose(fp_out);
    return small_file_num;
}
/**************************外部排序(利用败者树)***********************/
/**
 * @brief Replay the path from leaf s up to the root of the loser tree.
 *
 * Starting at internal node (s+k)/2 and halving the node index until it
 * reaches 0, the current candidate b[s] is compared with the key recorded
 * at each node.  When the candidate is greater (strcmp() > 0) it is stored
 * in the node and the key previously stored there continues upwards.
 * The candidate left at the end is written to ls[0].
 *
 * @param ls internal nodes; each entry is an index into b
 * @param b  keys (NUL-terminated strings)
 * @param k  number of leaves
 * @param s  index of the leaf whose key changed
 */
void adjust(int* ls, char**b, int k, int s)
{
    int node;

    for (node = (s + k) / 2; node > 0; node /= 2)
    {
        int rival = ls[node];

        if (strcmp(b[s], b[rival]) > 0)
        {
            /* the candidate loses: it stays here, the rival moves up */
            ls[node] = s;
            s = rival;
        }
    }
    ls[0] = s;
}
/**
 * @brief Build the loser tree over the keys b[0] .. b[k-1].
 *
 * b[k] is set to the empty string, which serves as the smallest possible
 * key, and every entry of ls initially refers to it.  adjust() is then
 * called for each leaf, from k-1 down to 0.
 *
 * @param ls internal nodes of the tree (k entries), filled by this function
 * @param b  keys; must provide k+1 writable strings
 * @param k  number of leaves
 */
void create_loser_tree(int* ls, char** b, int k)
{
    int leaf;

    b[k][0] = '\0';             /* minimum key */

    for (leaf = 0; leaf < k; ++leaf)
        ls[leaf] = k;

    leaf = k;
    while (leaf-- > 0)
        adjust(ls,b,k,leaf);
}
/**
 * @brief Merge k sorted input streams into the file "sorted".
 *
 * The first line of every stream is loaded into b, a loser tree is built
 * over those keys, and the smallest key is repeatedly written out and
 * replaced by the next line of the same stream.  An exhausted stream is
 * represented by the sentinel key MAXKEY (the single byte 0xFF); the merge
 * ends when the smallest key equals that sentinel.
 *
 * NOTE: because strcmp() compares bytes as unsigned char, a data line that
 * starts with byte 0xFF compares greater than the sentinel and would not be
 * written out.
 *
 * If "sorted" cannot be opened a message is printed and the process exits
 * with EXIT_FAILURE.
 *
 * @param ls loser-tree nodes, k entries
 * @param b  key buffers, k+1 entries of at least MAXLEN bytes each
 * @param k  number of input streams
 * @param fp the k input streams, already opened for reading
 */
void k_merge(int* ls, char** b, int k, FILE** fp)
{
    int i, q;
    FILE* fp_out;
    char MAXKEY[2];

    MAXKEY[0] = (char)255;
    MAXKEY[1] = '\0';

    if (NULL == (fp_out = fopen("sorted","w")))
    {
        printf("cannot open sorted file\n");
        exit(EXIT_FAILURE);
    }
    if (k <= 0)                 /* nothing to merge; ls[0] does not exist */
    {
        fclose(fp_out);
        return;
    }

    /* Load the first key of each stream; an empty stream is exhausted
     * from the start and must not leave its buffer uninitialised. */
    for (i = 0; i < k; ++i)
    {
        if (NULL == fgets(b[i],MAXLEN,fp[i]))
            strcpy(b[i],MAXKEY);
    }

    create_loser_tree(ls,b,k);
    while (strcmp(b[ls[0]],MAXKEY) != 0)
    {
        q = ls[0];
        fputs(b[q],fp_out);     /* emit the current minimum */
        if (NULL == fgets(b[q],MAXLEN,fp[q]))
            strcpy(b[q],MAXKEY);    /* stream q is exhausted */
        adjust(ls,b,k,q);
    }
    fclose(fp_out);
}
/**
 * @brief Merge the sorted files "in.0" .. "in.<file_num-1>" with k_merge().
 *
 * All file_num files are opened at once and merged in a single pass, so
 * the number of files is used directly as the merge order k.  Every
 * resource acquired here (key buffers, pointer arrays, streams) is
 * released before returning.
 *
 * If a file cannot be opened or memory runs out, a message is printed and
 * the process exits with EXIT_FAILURE.  Nothing is done when file_num <= 0.
 *
 * @param file_num number of sorted input files
 */
void external_sort(int file_num)
{
    int i, k = file_num;
    int *ls;
    char **b;
    FILE **fp;

    if (k <= 0)
        return;

    ls = malloc(sizeof *ls * (size_t)k);
    b = malloc(sizeof *b * ((size_t)k + 1));    /* b[k] holds the minimum key */
    fp = malloc(sizeof *fp * (size_t)k);
    if (NULL == ls || NULL == b || NULL == fp)
    {
        printf("out of memory in external_sort\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i <= k; ++i)
    {
        b[i] = malloc(MAXLEN);
        if (NULL == b[i])
        {
            printf("out of memory in external_sort\n");
            exit(EXIT_FAILURE);
        }
    }

    /* Open the k sorted runs. */
    for (i = 0; i < k; ++i)
    {
        char in_file[32];

        snprintf(in_file,sizeof in_file,"in.%d",i);
        if (NULL == (fp[i] = fopen(in_file,"r")))
        {
            printf("cannot open %s\n",in_file);
            exit(EXIT_FAILURE);
        }
    }

    k_merge(ls,b,k,fp);

    for (i = 0; i < k; ++i)
        fclose(fp[i]);
    for (i = 0; i <= k; ++i)
        free(b[i]);
    free(fp);
    free(b);
    free(ls);
}
/**
 * @brief Split 12 input files into parts of 100000 lines, sort each part,
 *        merge the sorted parts and print the elapsed processor time.
 *
 * partion_file() reports the index of the last part file, so the number
 * of part files is that value plus one.
 */
int main()
{
    const int input_files = 12;
    const int lines_per_part = 100000;
    clock_t t_begin, t_end;
    int part_count;

    t_begin = clock();

    part_count = partion_file(input_files, lines_per_part) + 1;
    printf("No. of small files:%d\n",part_count);
    sort_all(part_count);
    external_sort(part_count);

    t_end = clock();
    printf("the running time is: %fs\n", (double)(t_end-t_begin)/CLOCKS_PER_SEC);
    return 0;
}
/* vim: set ts=4 sw=4 sts=4 tw=100 noet: */