c语言查找字符串的中位数,从海量数据中找中位数（c语言实现）-CSDN博客

从海量数据中找中位数(c语言实现)

题目：5亿个int，从中找出第k大的数

算法：之后补上。。。

实现：

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * State kept for one bucket while the input is being distributed.
 * Values are staged in buf and appended to the bucket's file by
 * write_to_file() whenever the staging area fills up.
 */
typedef struct bucket_t {

int *buf;/* output buffer: staged values waiting to be written to the bucket file */

int count;/* total number of values assigned to this bucket so far (never reset) */

int idx;/* next free slot in buf; main resets it to 0 after a flush */

} bucket_t;

/* Sizing parameters and the shared input buffer; all are assigned in main. */
static unsigned int BUF_PAGES;/* number of pages in the input buffer */

static unsigned int PAGE_SIZE;/* size of one page; main sets it to 4096 */

static unsigned int BUF_SIZE;/* size of the input buffer, BUF_SIZE = BUF_PAGES*PAGE_SIZE */

static unsigned int nbuckets;/* number of buckets the value range is split into */

static unsigned int BUCKET_BUF_SIZE;/* allocation size of each bucket's output buffer */

static int *buffer;/* input buffer */

/* Current time of day expressed in microseconds. */
long get_time_usecs();

/* Append the values staged in bucket->buf to the file "foo_<pos>.dat". */
void write_to_file(bucket_t *bucket, int pos);

/* Partition a[s..t] around the pivot a[t]; returns the pivot's final index. */
int partition(int *a, int s, int t);

/* Select the element of rank i within a[s..t] using partition(). */
int quick_select(int *a, int s, int t, int i);

/* Exchange the two integers pointed to by p and q. */
void swap(int *p, int *q);

int main(int argc, char **argv)

{

char filename[20];

unsigned intbp, length, bucket_size, k;

intfd, i, bytes;

bucket_t*bucket;

long start_usecs = get_time_usecs();

strcpy(filename, argv[1]);

fd = open(filename, O_RDONLY);

if (fd < 0) {

printf("can't open file %s\n", filename);

exit(0);

}

nbuckets = 1024;

k = atoi(argv[2]);

PAGE_SIZE = 4096;/* page = 4KB */

BUF_PAGES = 1024;

BUF_SIZE = PAGE_SIZE*BUF_PAGES;/* 4KB * 1024 = 4M */

BUCKET_BUF_SIZE = PAGE_SIZE*128;/* 4KB * 128 = 512KB */

buffer = (int *)malloc(BUF_SIZE);

//把1-2^32个数分成nbucket个组, nbuckets必须等于2的n次幂

bucket = malloc(sizeof(bucket_t)*nbuckets);

if (bucket == NULL) exit(0);

for (i = 0; i < nbuckets; i++) {

bucket[i].buf = malloc(BUCKET_BUF_SIZE);

if (bucket[i].buf == NULL) {

exit(0);

}

bucket[i].idx = 0;

bucket[i].count = 0;

}

bucket_size = (1<<22);/* 分成1024个桶，每个桶容纳2^22个数 */

// 读入第一批数据到输入缓冲区

bytes = read(fd, buffer, BUF_SIZE);

length = bytes/4;

bp = 0;

int element, pos;

unsigned intbase;

bucket_t*p;

base = 2147483648;

while (1) {

//从输入缓冲区取出一个数，加到对应的桶

element = buffer[bp++];

pos = (((long)element)+base)>>22;

p = &bucket[pos];

p->buf[p->idx++] = element;

p->count++;

//桶内的缓冲区已满，写入文件

if (p->idx*4 == BUCKET_BUF_SIZE) {

write_to_file(p, pos);

p->idx = 0;

}

//输入缓冲区的数已用完

if (bp == length) {

bytes = read(fd, buffer, BUF_SIZE);

if (bytes == 0) {

break;

}

length = bytes/4;

bp = 0;

}

//把每个桶剩下的数写入文件

for (i = 0; i < nbuckets; i++) {

write_to_file(bucket+i, i);

}

free(buffer);

close(fd);

buffer = malloc(bucket_size*4);

if (buffer == NULL) exit(0);

//找出第k大的数位于哪个文件

unsigned sum = 0;

for (i = 0; i < nbuckets && sum < k; i++) {

sum += bucket[i].count;

}

i--;

//把该文件读入内存

sprintf(filename, "foo_%d.dat", i);

printf("第%d大的数位于文件%s的第%d大的数\n", k, filename, k+bucket[i].count-sum);

fd = open(filename, O_RDONLY);

if (fd < 0) {

printf("can't open file %s\n", filename);

free(buffer);

exit(0);

}

bytes = read(fd, buffer, bucket_size*4);

length = bytes/4;

//选择文件内第(k+bucket[i].count-sum)大的数

int answer;

answer = quick_select(buffer, 1, length-1, k+bucket[i].count-sum);

printf("第%d大的数 = %d\n", k, answer);

close(fd);

free(buffer);

//free buckets

for (i = 0; i < nbuckets; i++) {

free(bucket[i].buf);

}

free(bucket);

long end_usecs = get_time_usecs();

double secs = (double)(end_usecs - start_usecs) / (double)1000000;

printf("it took %.02f seconds.\n", secs);

return 0;

}

/*
 * Append the values currently staged in a bucket to its spill file.
 *
 * bucket: bucket whose first bucket->idx ints in bucket->buf are written.
 *         Neither idx nor count is modified; the caller resets idx.
 * pos:    bucket number, used to build the file name "foo_<pos>.dat".
 *
 * The file is created if missing and always opened in append mode.  A partial
 * write is continued until every byte is stored.  On any failure a message is
 * printed and the process exits with EXIT_FAILURE.
 */
void write_to_file(bucket_t *bucket, int pos)
{
    char filename[20];
    int fd;
    size_t want, done;
    ssize_t n;

    snprintf(filename, sizeof filename, "foo_%d.dat", pos);

    fd = open(filename, O_WRONLY | O_CREAT | O_APPEND, 0666);
    if (fd < 0) {
        fprintf(stderr, "can't open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    want = (size_t)bucket->idx * sizeof(int);
    done = 0;
    while (done < want) {
        n = write(fd, (const char *)bucket->buf + done, want - done);
        if (n <= 0) {
            /* Error, or no progress at all: give up instead of spinning. */
            fprintf(stderr, "idx = %d, bytes = %d, write error\n",
                    bucket->idx, (int)done);
            close(fd);
            exit(EXIT_FAILURE);
        }
        done += (size_t)n;
    }

    close(fd);
}

/*
 * Return the current time of day in microseconds
 * (seconds * 1000000 + microseconds as reported by gettimeofday()).
 *
 * Returns -1 if gettimeofday() fails.  The obsolete timezone argument is not
 * used, so NULL is passed for it.
 */
long get_time_usecs()
{
    struct timeval now;   /* not named "time" to avoid shadowing time() */

    if (gettimeofday(&now, NULL) != 0) {
        return -1;
    }
    return (long)now.tv_sec * 1000000L + (long)now.tv_usec;
}

/*
 * Exchange the two integers that p and q point to.
 * Passing the same address for both leaves the value unchanged.
 */
void swap(int *p, int *q)
{
    int tmp = *p;

    *p = *q;
    *q = tmp;
}

/*
 * Partition a[s..t] (inclusive bounds) around the pivot a[t].
 *
 * After the call the range is split into three parts: elements strictly less
 * than the pivot, the pivot itself, and elements greater than or equal to it.
 *
 * Returns the pivot's final index, which is also its position in sorted
 * order within a[s..t].
 */
int partition(int *a, int s, int t)
{
    int i;  /* scans a[s] .. a[t-1] */
    int j;  /* first slot of the "not less than pivot" part */

    j = s;
    for (i = s; i < t; i++) {
        if (a[i] < a[t]) {
            swap(a + i, a + j);
            j++;
        }
    }

    /* Drop the pivot between the two parts. */
    swap(a + j, a + t);

    return j;
}

/*
 * Return the element of rank i within a[s..t] (inclusive bounds), where
 * rank 1 is the smallest element.  The range is reordered in place.
 *
 * a:    array to search
 * s, t: first and last index of the range; requires s <= t
 * i:    1-based rank; values outside [1, t - s + 1] are clamped to the
 *       nearest valid rank instead of walking outside the range
 *
 * Implemented as a loop rather than recursion so that stack usage stays
 * constant even when the partitions are badly unbalanced.
 */
int quick_select(int *a, int s, int t, int i)
{
    int p;  /* pivot index returned by partition() */
    int m;  /* rank of the pivot within the current range */

    if (i < 1) {
        i = 1;
    }
    if (i > t - s + 1) {
        i = t - s + 1;
    }

    while (s < t) {
        p = partition(a, s, t);
        m = p - s + 1;

        if (m == i) {
            return a[p];
        }
        if (m > i) {
            /* Wanted element lies left of the pivot. */
            t = p - 1;
        } else {
            /* Wanted element lies right of the pivot; skip the m ranks on the left. */
            s = p + 1;
            i -= m;
        }
    }

    return a[t];
}

运行和测试：

寻找第1111大的整数

dd if=/dev/urandom of=random.dat bs=1M count=1024

gcc main.c

./a.out random.dat 1111