// 大数据查询的步骤:
// 读入到内存、排序、写入到文件、创建索引、二分查找
// 下面的代码是听课时写的,有点乱,可以按上面的步骤来实现。
#define _CRT_SECURE_NO_WARNINGS
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<Windows.h>
#define LINE 10000000
// Pipeline: load into memory, sort, write to file, build the index, binary search.
char **g_pp; // array of per-line string pointers; allocated and filled by initmem()
// Source data file; only referenced by the disabled driver code (getLine/initmem).
char *sorpath = "E:\\qqok.txt";
// Sorted data file: written by writetofile() in the driver, read by the lookups.
char *despath = "E:\\qsort.txt";
// Binary index file: written by initindex(), read by qucik() and the lookups.
char *indexpath = "E:\\qindex.txt";
// Line-offset index shared by initindex() and qucik() through the global allindex.
struct index // index data structure
{
int *pindex; // byte offset at which each line starts (running sum of line lengths)
int length; // number of slots in pindex: LINE+1 in initindex(), LINE in qucik()
}allindex;
/*
 * Count the lines in the file at `path`.
 *
 * A line is counted once, when its first chunk is read, so a line longer
 * than the 255-byte read buffer is not counted several times. A final line
 * without a trailing newline still counts; an empty file has 0 lines.
 *
 * Returns the number of lines, or 0 (after printing a message) when the
 * file cannot be opened.
 */
int getLine(char *path)
{
    FILE *pfr = fopen(path, "rb");
    if (pfr == NULL)
    {
        printf("文件打开失败!\n");
        return 0;
    }

    int line = 0;
    int at_line_start = 1; /* true when the next chunk begins a new line */
    char str[256];
    /* Test the fgets result rather than feof(): feof() only turns true after
       a read has already failed, which used to add one phantom line. */
    while (fgets(str, sizeof str, pfr) != NULL)
    {
        if (at_line_start)
        {
            line++;
        }
        size_t len = strlen(str);
        at_line_start = (len > 0 && str[len - 1] == '\n');
    }
    fclose(pfr);
    return line;
}
/*
 * Overwrite every '-' in `str` with '\0', in place.
 * Read as a C string, the result ends at the first '-'. The scan does not
 * stop at the terminators it inserts: it advances past them and keeps going
 * until it reaches a byte that was already '\0'.
 */
void eatq(char *str)
{
    size_t pos = 0;
    while (str[pos] != '\0')
    {
        if (str[pos] == '-')
        {
            str[pos] = '\0';
        }
        pos++;
    }
}
/*
 * Strip line endings: overwrite every '\r' and '\n' in `str` with '\0',
 * in place. Read as a C string, the result ends at the first line-ending
 * character. The scan advances past the terminators it inserts and stops
 * at the first byte that was already '\0'.
 */
void eatN(char *str)
{
    for (char *scan = str; *scan != '\0'; scan++)
    {
        switch (*scan)
        {
        case '\r':
        case '\n':
            *scan = '\0';
            break;
        default:
            break;
        }
    }
}
/*
 * Load up to LINE lines from the file at `path` into the global g_pp array.
 *
 * Each slot receives a heap copy of one fgets() chunk (at most 255 bytes)
 * with its line ending stripped by eatN(). Slots past the end of the file
 * receive an empty string. A slot is left NULL only if its own allocation
 * fails.
 *
 * On failure to open the file or to allocate the pointer array, a message
 * is printed and g_pp is left untouched.
 */
void initmem(char *path)
{
    printf("\n开始读取...\n");
    FILE *pfr = fopen(path, "rb");
    if (pfr == NULL)
    {
        printf("文件打开失败!\n");
        return;
    }

    char **lines = calloc(LINE, sizeof(char *));
    if (lines == NULL)
    {
        printf("内存分配失败!\n");
        fclose(pfr);
        return;
    }
    g_pp = lines;

    for (int i = 0; i < LINE; i++)
    {
        char str[256] = { 0 };
        if (fgets(str, sizeof str, pfr) == NULL)
        {
            /* End of file or read error: store an empty line. */
            str[0] = '\0';
        }
        g_pp[i] = calloc(strlen(str) + 1, sizeof(char));
        if (g_pp[i])
        {
            /* Plain copy. The text must never be used as a printf-style
               format string: a '%' in the data would be interpreted. */
            strcpy(g_pp[i], str);
            eatN(g_pp[i]); /* drop the trailing \r\n; writetofile() adds '\n' back */
        }
    }
    fclose(pfr);
    printf("\n结束读取...\n");
}
/*
 * qsort() comparator for an array of C-string pointers.
 *
 * p1 and p2 each point at an array element (a `char *`). The strings they
 * refer to are compared with strcmp(). Returns a negative, zero, or
 * positive value accordingly.
 *
 * The parameters are `const void *` so the function has exactly the
 * comparator type qsort() requires.
 */
int com(const void *p1, const void *p2)
{
    const char *const *pstr1 = p1;
    const char *const *pstr2 = p2;
    return strcmp(*pstr1, *pstr2);
}
/* Sort the LINE string pointers held in g_pp, using com() as the ordering. */
void sort()
{
    size_t count = LINE;
    qsort(g_pp, count, sizeof g_pp[0], com);
}
/*
 * Write the LINE strings held in g_pp to the file at `path`, one per line.
 *
 * A '\n' is appended to every entry because initmem() stripped the line
 * endings. Entries are written directly, with no fixed-size staging
 * buffer, so long lines cannot overflow anything. A NULL entry is written
 * as an empty line.
 *
 * Prints a message and returns early if there is no data, the file cannot
 * be opened, or a write fails.
 */
void writetofile(char *path)
{
    if (g_pp == NULL)
    {
        /* Nothing loaded: bail out before truncating the output file. */
        printf("没有可写入的数据!\n");
        return;
    }

    FILE *pfw = fopen(path, "wb");
    if (pfw == NULL)
    {
        printf("文件打开失败!\n");
        return;
    }

    int failed = 0;
    for (int i = 0; i < LINE; i++)
    {
        const char *text = (g_pp[i] != NULL) ? g_pp[i] : "";
        if (fputs(text, pfw) == EOF || fputs("\n", pfw) == EOF)
        {
            failed = 1;
            break;
        }
    }
    /* fclose() flushes buffered data, so its result matters for writes too. */
    if (fclose(pfw) != 0)
    {
        failed = 1;
    }
    if (failed)
    {
        printf("文件写入失败!\n");
    }
}
/* Print each of the LINE strings in g_pp to stdout, one per line. */
void show()
{
    int row = 0;
    while (row < LINE)
    {
        printf("%s\n", g_pp[row]);
        row++;
    }
}
/*
 * Build the line-offset index for the file at `path` and save it to the
 * file named by indexpath.
 *
 * The index is an array of LINE+1 ints. Slot i holds the byte offset at
 * which the i-th fgets() chunk (normally one line) starts. The slot after
 * the last line holds the offset of the end of the data, and any remaining
 * slots stay zero. All LINE+1 slots are written, so the index file always
 * has the same size.
 *
 * Offsets are stored as int, so data past INT_MAX bytes cannot be indexed.
 *
 * The in-memory array is only a staging buffer. It is released before
 * returning, and allindex is left empty (pindex NULL, length 0).
 */
void initindex(char *path)
{
    FILE *pfr = NULL;
    FILE *pfw = NULL;

    printf("索引数组开始分配...\n");
    allindex.length = LINE+1;
    allindex.pindex = calloc(allindex.length, sizeof(int));
    if (allindex.pindex == NULL)
    {
        printf("内存分配失败!\n");
        goto cleanup;
    }
    printf("索引数组完成分配。\n");

    printf("开始读取...\n");
    pfr = fopen(path, "rb");
    if (pfr != NULL)
    {
        /* Open the index only once the source is readable, so a missing
           source file does not truncate an existing index. */
        pfw = fopen(indexpath, "wb");
    }
    if (pfr == NULL || pfw == NULL)
    {
        printf("文件打开失败!\n");
        goto cleanup;
    }

    int alllength = 0;
    int i = 0;
    char str[256];
    /* Bounded by the slot count, so a file with more lines than expected
       cannot write past the end of the array. */
    while (i < allindex.length)
    {
        /* Record where this line starts. On the final pass this stores the
           end-of-data offset. */
        allindex.pindex[i] = alllength;
        if (fgets(str, sizeof str, pfr) == NULL)
        {
            break;
        }
        alllength += (int)strlen(str);
        i++;
    }
    printf("结束读取...\n");

    /* Save the index to its file. */
    printf("索引写入...\n");
    size_t written = fwrite(allindex.pindex, sizeof(int), allindex.length, pfw);
    int close_rc = fclose(pfw);
    pfw = NULL;
    if (written != (size_t)allindex.length || close_rc != 0)
    {
        printf("索引写入失败!\n");
    }
    else
    {
        printf("索引写入结束。\n");
    }

cleanup:
    if (pfr != NULL)
    {
        fclose(pfr);
    }
    if (pfw != NULL)
    {
        fclose(pfw);
    }
    /* Release the staging buffer and leave no dangling pointer behind. */
    free(allindex.pindex);
    allindex.pindex = NULL;
    allindex.length = 0;
}
/*
 * Fast load: read an index file that was built earlier straight into
 * memory, instead of rebuilding it.
 *
 * Allocates LINE ints in allindex.pindex and fills them from the file
 * named by the global indexpath. The `path` parameter is not used.
 * NOTE(review): the only call in this file, in disabled driver code,
 * passes indexpath; confirm whether `path` was meant to be honoured.
 *
 * On success allindex.length is LINE. If the index file holds fewer than
 * LINE entries, a message is printed and the missing slots stay zero.
 * If allocation or opening the file fails, a message is printed and
 * allindex is left empty (pindex NULL, length 0), so callers can detect
 * the failure rather than look up an all-zero index.
 */
void qucik(char *path)
{
    (void)path;

    printf("索引数组开始分配...\n");
    allindex.length = LINE;
    allindex.pindex = calloc(allindex.length, sizeof(int));
    if (allindex.pindex == NULL)
    {
        printf("内存分配失败!\n");
        allindex.length = 0;
        return;
    }
    printf("索引数组完成分配。\n");

    printf("开始读取...\n");
    FILE *pfr = fopen(indexpath, "rb");
    if (pfr == NULL)
    {
        printf("文件打开失败!\n");
        free(allindex.pindex);
        allindex.pindex = NULL;
        allindex.length = 0;
        return;
    }
    size_t loaded = fread(allindex.pindex, sizeof(int), allindex.length, pfr);
    fclose(pfr);
    if (loaded != (size_t)allindex.length)
    {
        printf("索引文件不完整!\n");
    }
    printf("结束读取...\n");
}
//把索引读入到内存。
void main内存索引查询()
{
//printf("%d\n", getLine(sorpath));
//initmem(sorpath);
show();
//printf("\n排序后:\n");
//sort();
show();
//writetofile(despath);
//printf("%d\n", getLine(despath));
//创建索引
//initindex(despath);
/*qucik(indexpath);
FILE *pfr = fopen(despath, "rb");
while (1)
{
printf("请输入要查询的行号:");
int num = 0;
scanf("%d", &num);
fseek(pfr, allindex.pindex[num], SEEK_SET);
char str[256] = { 0 };
fgets(str, 256, pfr);
printf("%s", str);
}
fclose(pfr);*/
system("pause");
}
//文件索引,即打开二个文件,先查询索引文件的内容,
//然后根据索引的内容来查询相应的行
void main索引文件查询()
{
FILE *pfindex = fopen(indexpath, "rb");
FILE *pfsortq = fopen(despath, "rb");
while (1)
{
printf("请输入要查询的行数:");
int num = 0;
scanf("%d", &num);
fseek(pfindex, num*sizeof(int), SEEK_SET);
int data;
fread(&data, sizeof(int), 1, pfindex);
//printf("%d", data);
char str[256] = { 0 };
fseek(pfsortq, sizeof(char)*data, SEEK_SET);
fgets(str, 256, pfsortq);
printf("%s", str);
}
fclose(pfindex);
fclose(pfsortq);
system("pause");
}
/*
 * Binary-search the sorted data file for a line whose key equals `str`.
 *
 * The key of a line is the text before its first '-' (see eatq()). The
 * search runs over line numbers 0..LINE-1. For each probe it reads the
 * line's byte offset from the index file, then the line itself from the
 * data file. On a match the full line is printed followed by "find";
 * otherwise "not find" is printed.
 *
 * If either file cannot be opened a message is printed and nothing is
 * searched. A failed index read or seek ends the search as "not find".
 */
void searchstr(char *str)
{
    FILE *pfindex = fopen(indexpath, "rb");
    FILE *pfdes = fopen(despath, "rb");
    if (pfindex == NULL || pfdes == NULL)
    {
        printf("cannot file !\n");
        /* Close whichever of the two did open. */
        if (pfindex != NULL)
        {
            fclose(pfindex);
        }
        if (pfdes != NULL)
        {
            fclose(pfdes);
        }
        return;
    }

    int flag = 0;
    int start = 0;
    int end = LINE - 1;
    while (start <= end)
    {
        int mid = start + (end - start) / 2; /* cannot overflow, unlike (start+end)/2 */
        int data = 0;
        if (fseek(pfindex, (long)mid * (long)sizeof(int), SEEK_SET) != 0
            || fread(&data, sizeof data, 1, pfindex) != 1
            || fseek(pfdes, (long)data, SEEK_SET) != 0)
        {
            break;
        }

        char pstr[256] = { 0 };
        if (fgets(pstr, sizeof pstr, pfdes) == NULL)
        {
            /* Nothing at this offset: compare as an empty line. */
            pstr[0] = '\0';
        }
        eatN(pstr); /* strip the line ending, otherwise strcmp would compare it too */

        /* Extract the key on a copy. Cutting pstr itself at '-' would leave
           nothing but the key to print on a match. */
        char ptemp[256] = { 0 };
        strcpy(ptemp, pstr);
        eatq(ptemp);

        /* strcmp only guarantees the sign of its result, not +1/-1. */
        int res = strcmp(ptemp, str);
        if (res == 0)
        {
            flag = 1;
            printf("%s", pstr);
            break;
        }
        else if (res > 0)
        {
            end = mid - 1;
        }
        else
        {
            start = mid + 1;
        }
    }

    if (flag)
    {
        printf("\nfind\n");
    }
    else
    {
        printf("\nnot find");
    }
    fclose(pfindex);
    fclose(pfdes);
}
/*
 * Binary-search entry point: repeatedly prompts for a QQ number and looks
 * it up with searchstr().
 *
 * Input is limited to 255 characters so it always fits in the buffer. The
 * loop ends when no token can be read (end of input), after which the
 * program pauses and exits with status 0.
 */
int main(void)
{
    char str[256];
    while (1)
    {
        printf("请输入要查询的QQ号:");
        if (scanf("%255s", str) != 1)
        {
            break;
        }
        searchstr(str);
    }
    system("pause");
    return 0;
}