纯C实现大数据量浮点数排序
同学遇到的一道笔试题
仅仅是练习一下,做法可能不是最优。
纯C实现写得比较难受。
- 分析:数据量比较大, 栈空间有限,无法一次性内排序,需要使用外排序。
- 以下是我的做法:将大数据切分成若干块,每块快排后写到临时文件里,然后进行多路归并(每次从各路的当前值中选出最小者输出)。
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
/* Capacity, in doubles, of the in-memory buffer `cache`. */
#define CACHE_SIZE 1000010
/* Capacity of `filesHandle`: how many bunch files can be held open during the merge. */
#define MAX_HANNDLE 300
/* Buffer that collects the numbers of the bunch currently being read and sorted. */
double cache[CACHE_SIZE];
/* Number of valid entries currently stored in `cache`. */
int cacheUse = 0;
/* Tolerance handed to the isEqual/isLess/isGreater/isLessEqual helpers. */
const double EPS = 1e-8;
/* Number of values gathered in `cache` before a bunch is sorted and written out. */
const int BUNCH_SIZE = 1e5;
/* Read handles of the bunch files during the merge; NULL marks a finished bunch. */
FILE* filesHandle[MAX_HANNDLE];
/* Scratch buffer that getDstPath() fills with the path of a bunch file. */
char dst[100];
/* Number of paths stored in `bunches`. */
int bunchCount = 0;
/* Heap-allocated paths of the bunch files recorded so far. */
/* NOTE(review): holds up to 500 paths while filesHandle has only MAX_HANNDLE (300) slots. */
char* bunches[500];
/* Number of entries of `filesHandle` that are in use. */
int handleCount = 0;
/*
 * Exchange the two doubles that v1 and v2 point to.
 * Passing the same address twice leaves the value unchanged.
 */
void mySwap(double* v1, double* v2)
{
    const double first = *v1;
    const double second = *v2;

    *v1 = second;
    *v2 = first;
}
/*
 * Tolerant equality test.
 * Returns 1 when |a - b| <= eps, otherwise 0.
 */
int isEqual(double a, double b, double eps)
{
    const double gap = fabs(a - b);

    if (gap <= eps) {
        return 1;
    }
    return 0;
}
/*
 * Tolerant "less than" test.
 * Returns 1 when a - b < eps, otherwise 0.
 *
 * NOTE(review): with a positive eps this is also 1 when a == b, and whenever
 * a exceeds b by less than eps. Callers in this file rely on that behaviour.
 */
int isLess(double a, double b, double eps)
{
    const double diff = a - b;

    return (diff < eps) ? 1 : 0;
}
/*
 * Tolerant "greater than" test, expressed through isLess with the operands
 * exchanged: the result is isLess(b, a, eps).
 */
int isGreater(double a, double b, double eps)
{
    const int swappedLess = isLess(b, a, eps);

    return swappedLess;
}
/*
 * Tolerant "less than or equal" test.
 * Returns 1 when isLess(a, b, eps) holds or, failing that, when
 * isEqual(a, b, eps) holds; otherwise 0.
 */
int isLessEqual(double a, double b, double eps)
{
    if (isLess(a, b, eps)) {
        return 1;
    }
    return isEqual(a, b, eps) ? 1 : 0;
}
/*
 * Partition data[left..right] around the element stored at data[left].
 *
 * Two cursors walk towards each other: the right one steps down past elements
 * that isGreater() reports as above the pivot, the left one steps up past
 * elements that isLessEqual() reports as not above it. Whenever both stop
 * before meeting, the two elements are exchanged. Once the cursors meet, the
 * pivot slot is exchanged with the meeting slot.
 *
 * Returns the index at which the cursors met (the pivot's new position).
 */
int partialSort(double* data, int left, int right)
{
    /* The pivot slot is re-read on every comparison, as it is never cached. */
    const double* pivot = data + left;
    int lo = left;
    int hi = right;

    while (lo < hi) {
        while (hi > lo && isGreater(data[hi], *pivot, EPS)) {
            hi--;
        }
        while (lo < hi && isLessEqual(data[lo], *pivot, EPS)) {
            lo++;
        }
        if (lo >= hi) {
            break;
        }
        mySwap(&data[lo], &data[hi]);
    }

    mySwap(&data[left], &data[lo]);
    return lo;
}
/*
 * Sort data[left..right] (inclusive bounds) in ascending order.
 *
 * Each round partitions the current range with partialSort(). The pivot ends
 * up at the returned index, so neither remaining side includes that index.
 * The smaller side is sorted by a recursive call and the larger side is
 * handled by the loop, which keeps the recursion depth logarithmic in the
 * range size even for already-sorted or all-equal input.
 *
 * A range with left >= right is left untouched.
 */
void quickSort(double* data, int left, int right)
{
    while (left < right) {
        const int mid = partialSort(data, left, right);

        if (mid - left < right - mid) {
            quickSort(data, left, mid - 1);
            left = mid + 1;
        } else {
            quickSort(data, mid + 1, right);
            right = mid - 1;
        }
    }
}
/*
 * Print the first n values of data to standard output, one per line,
 * formatted with "%lf". Nothing is printed when n is zero or negative.
 */
void print(double* data, int n)
{
    const double* cursor = data;
    int remaining = n;

    while (remaining > 0) {
        printf("%lf\n", *cursor);
        ++cursor;
        --remaining;
    }
}
/*
 * Write n doubles to the text file at path, replacing any existing content.
 *
 * The values are formatted with "%lf", separated by single spaces, and the
 * line is terminated with '\n'. With n <= 0 only the newline is written.
 *
 * Returns 1 when the file was opened, fully written and closed successfully.
 * Returns 0 when the file cannot be opened or when any write (including the
 * flush performed by fclose) fails; a message is printed in both cases.
 */
int writeTo(const char* path, double* data, int n)
{
    FILE* fp = fopen(path, "w");
    if (fp == NULL) {
        printf("open %s failed", path);
        return 0;
    }

    int ok = 1;
    int i;
    for (i = 0; i < n && ok; ++i) {
        /* A separator goes in front of every value except the first. */
        if (i > 0 && fputc(' ', fp) == EOF) {
            ok = 0;
        } else if (fprintf(fp, "%lf", data[i]) < 0) {
            ok = 0;
        }
    }
    if (ok && fputc('\n', fp) == EOF) {
        ok = 0;
    }

    /* Buffered output may only fail here, so the close result matters. */
    if (fclose(fp) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("write %s failed\n", path);
    }
    return ok;
}
/*
 * Dump the numbers stored in the text file at path to standard output.
 *
 * Every value that parses as "%lf" is printed on its own line with "%f",
 * followed by a "total = N" line with the number of values read.
 *
 * Returns 1 when the whole file was consumed.
 * Returns 0 when the file cannot be opened, or when reading stopped at a
 * token that is not a number (the values before it are still printed).
 */
int readFrom(const char* path)
{
    FILE* fp = fopen(path, "r");
    if (fp == NULL) {
        printf("open %s failed", path);
        return 0;
    }

    double num = 0;
    int total = 0;
    int status;
    /*
     * fscanf returns 0, not EOF, when the next token is not a number and
     * leaves that token unread; looping on "!= EOF" would never terminate.
     */
    while ((status = fscanf(fp, "%lf", &num)) == 1) {
        printf("%f\n", num);
        ++total;
    }
    printf("total = %d\n", total);

    fclose(fp);
    if (status != EOF) {
        printf("%s: stopped at a token that is not a number\n", path);
        return 0;
    }
    return 1;
}
void getDstPath(const char* path, int id)
{
memset(dst, '\0', sizeof(dst));
char suffix[100];
memset(suffix, '\0', sizeof(suffix));
sprintf(suffix, "_%d_sorted.txt", id);
char* pos = strchr(path, '.');
int cpyCnt = pos - path;
strncpy(dst, path, cpyCnt);
strcat(dst, suffix);
}
/*
 * Append a heap-allocated copy of the path `dst` to the global `bunches`
 * list and increment `bunchCount`. The copy is owned by the list and is
 * released by clearBunches().
 *
 * When the list is already full or the copy cannot be allocated, a message
 * is printed and nothing is recorded; that bunch then takes no part in the
 * merge.
 *
 * Note: the parameter shadows the global buffer of the same name.
 */
void recordBunch(const char* dst)
{
    const size_t capacity = sizeof(bunches) / sizeof(bunches[0]);
    if (bunchCount < 0 || (size_t)bunchCount >= capacity) {
        printf("too many bunches, cannot record %s\n", dst);
        return;
    }

    size_t dstLen = strlen(dst);
    char* newBunch = malloc(dstLen + 1);
    if (newBunch == NULL) {
        printf("out of memory, cannot record %s\n", dst);
        return;
    }

    /* Copies the terminating '\0' as well. */
    memcpy(newBunch, dst, dstLen + 1);
    bunches[bunchCount++] = newBunch;
}
/*
 * Turn the numbers currently held in `cache` into one sorted bunch file.
 *
 * The bunch path is derived from `src` and the current `bunchCount`, the
 * first `cacheUse` entries of `cache` are sorted and written to that path,
 * and `cacheUse` is reset to 0 afterwards.
 *
 * The path is recorded for the merge only after the file has been written
 * successfully. On a write failure a message is printed, the incomplete
 * file is removed and the numbers of this bunch are dropped.
 */
void processBunch(const char* src)
{
    getDstPath(src, bunchCount);
    printf("%s: nums total = %d\n", dst, cacheUse);

    quickSort(cache, 0, cacheUse - 1);

    if (writeTo(dst, cache, cacheUse)) {
        recordBunch(dst);
    } else {
        printf("write bunch %s failed, %d numbers dropped\n", dst, cacheUse);
        /* Do not leave a partially written file behind. */
        remove(dst);
    }

    cacheUse = 0;
}
/*
 * Read the numbers stored in the text file at path and split them into
 * sorted bunch files.
 *
 * Values are collected in `cache`; every time BUNCH_SIZE values have been
 * gathered, processBunch() sorts them and writes them out. A final, shorter
 * bunch is flushed at the end of the input.
 *
 * Reading stops at the first token that is not a number; a message is
 * printed and the values read up to that point are still processed.
 * If the file cannot be opened a message is printed and nothing is done.
 */
void splitBunch(const char* path)
{
    puts(path);
    FILE* fp = fopen(path, "r");
    if (fp == NULL) {
        printf("open %s failed", path);
        return;
    }

    double num = 0;
    int status;
    cacheUse = 0;
    /*
     * fscanf returns 0, not EOF, on a non-numeric token and leaves it unread;
     * looping on "!= EOF" would then store the previous value forever.
     */
    while ((status = fscanf(fp, "%lf", &num)) == 1) {
        cache[cacheUse] = num;
        ++cacheUse;
        if (cacheUse >= BUNCH_SIZE) {
            processBunch(path);
        }
    }
    if (status != EOF) {
        printf("%s: stopped at a token that is not a number\n", path);
    }

    /* Flush the last, partially filled bunch. */
    if (cacheUse > 0) {
        processBunch(path);
    }

    fclose(fp);
}
/*
 * Write `total` pseudo-random test values to the text file at path,
 * replacing any existing content.
 *
 * Each value is an integer from [1, maxData] multiplied by a factor from
 * {1.0, 1.1, ..., 1.9}, printed with "%lf" and followed by a space.
 * The generator is re-seeded from the current time on every call.
 *
 * maxData must be positive; otherwise a message is printed and no file is
 * created. Failures to open or to finish writing the file are reported with
 * a message as well.
 */
void generateData(const char* path, int maxData, int total)
{
    /* rand() % maxData below would divide by zero for maxData == 0. */
    if (maxData <= 0) {
        printf("generate %s failed: maxData must be positive\n", path);
        return;
    }

    srand((unsigned)time(0));
    int factorMax = 20;
    int factorMin = 10;
    puts(path);
    FILE* fp = fopen(path, "w");
    if (fp == NULL) {
        printf("open %s failed", path);
        return;
    }

    int i;
    for (i = 0; i < total; ++i) {
        int iNum = rand() % maxData + 1;                              // [1, maxData]
        int iFactor = rand() % (factorMax - factorMin) + factorMin;   // [10, 19]
        double fFactor = iFactor / (double) 10;                       // [1.0, 1.9]
        double fNum = iNum * fFactor;
        fprintf(fp, "%lf ", fNum);
    }
    printf("generate toatal = %d\n", total);

    /* Buffered output may only fail when it is flushed on close. */
    if (fclose(fp) != 0) {
        printf("write %s failed\n", path);
    }
}
/*
 * Stack-depth probe.
 *
 * Every call declares a zero-initialised automatic array of 1024 * 1024
 * bytes, increments cnt, prints it labelled as "MB" and then calls itself
 * again. The function contains no terminating condition.
 *
 * cnt: number of calls made before this one.
 */
void testStack(int cnt)
{
// Not read anywhere in this function; only its size matters.
char data[1024 * 1024] = {0};
++cnt;
printf("%s %d stackSize = %d MB\n", __FUNCTION__, __LINE__, cnt);
// Unconditional self-call.
testStack(cnt);
}
/*
 * Delete every recorded bunch file from disk, free the stored path strings
 * and empty the `bunches` list.
 *
 * `bunchCount` is reset to 0 afterwards so that the list can be filled
 * again without touching the slots that were just set to NULL.
 */
void clearBunches()
{
    int i;
    for (i = 0; i < bunchCount; ++i) {
        printf("all number sorted finished, remove temp file:%s\n", bunches[i]);
        if (remove(bunches[i]) != 0) {
            printf("remove %s failed\n", bunches[i]);
        }
        free(bunches[i]);
        bunches[i] = NULL;
    }
    bunchCount = 0;
}
/*
 * Choose which bunch supplies the next output value.
 *
 * nums[i] is the pending value of filesHandle[i]. Slots whose handle is NULL
 * are skipped. The first usable slot becomes the candidate, and a later slot
 * replaces it whenever isLess(nums[later], nums[candidate], EPS) is true.
 *
 * Returns the chosen index, or -1 when every handle among the first
 * handleCount slots is NULL.
 */
int getMin(double* nums)
{
    int best = -1;

    for (int idx = 0; idx < handleCount; ++idx) {
        if (filesHandle[idx] != NULL) {
            if (best < 0 || isLess(nums[idx], nums[best], EPS)) {
                best = idx;
            }
        }
    }
    return best;
}
void mergeBunches()
{
printf("bunch count = %d\n", bunchCount);
handleCount = 0;
int i;
for(i = 0; i < bunchCount; ++i)
{
const char* path = bunches[i];
printf("open bunch %s\n", path);
FILE* fp = fopen(path, "r");
if(fp == NULL)
{
printf("open %s failed", path);
continue;
}
filesHandle[handleCount++] = fp;
}
printf("handleCount count = %d\n", handleCount);
double nums[handleCount];
for(i = 0; i < handleCount; ++i)
{
fscanf(filesHandle[i], "%lf", &nums[i]);
}
FILE* resHandele = fopen("result.txt", "w");
int isFirstNum = 1;
int total = 0;
while(1)
{
int minx = getMin(nums);
if(minx == - 1)
{
break;
}
++total;
if(isFirstNum)
{
fprintf(resHandele,"%lf", nums[minx]);
isFirstNum = 0;
}
else
{
fprintf(resHandele," %lf", nums[minx]);
}
if(filesHandle[minx] != NULL)
{
fscanf(filesHandle[minx], "%lf", &nums[minx]);
if(feof(filesHandle[minx]))
{
fclose(filesHandle[minx]);
filesHandle[minx] = NULL;
}
}
}
printf("sorted nums total = %d\n", total);
fclose(resHandele);
clearBunches();
}
/*
 * Demo driver: generates n test files named "test_data_<i>.txt", splits each
 * of them into sorted bunch files and merges all bunches into one result.
 */
int main()
{
    puts("Hello world!");
    // testStack(0);   manual stack-depth probe, disabled

    // test data generate
    int n = 1;
    int maxData = 10000;
    int total = 201100;
    char src[100];
    int i;
    for (i = 0; i < n; ++i) {
        /* snprintf bounds the write by src's own size and always terminates it. */
        snprintf(src, sizeof(src), "test_data_%d.txt", i);
        generateData(src, maxData, total);
    }

    // split
    for (i = 0; i < n; ++i) {
        snprintf(src, sizeof(src), "test_data_%d.txt", i);
        splitBunch(src);
    }

    // merge
    mergeBunches();

    puts("Hello world!");
    return 0;
}