<<C语言接口与实现>> 第三章原子

最新推荐文章于 2024-07-10 22:49:54 发布

weixin_34198762

最新推荐文章于 2024-07-10 22:49:54 发布

阅读量120

点赞数 1

文章标签： c/c++ python 数据结构与算法

原文链接：https://my.oschina.net/u/578519/blog/217823

版权

2019独角兽企业重金招聘Python工程师标准>>>

十舍七匹狼于~ http://www.oschina.net/

3.1

使用2048作为hash桶的数目: 链表最长的有5个元素，最少的1个元素，
绝大部分是在1-3个元素之间, 耗费的时间是[totaltime = 0.006123s]；
使用素数2039作为hash桶的数目，几乎没有出现在某个链表有长达5个元
素的情况，绝大部分是2-3个元素，可见分布更平均，并且耗时更少了
[totaltime = 0.000001s].不过，我后来再次测试了，发现即使使用
2048作为hash桶的数目，耗时也是[totaltime = 0.000001s]。不过
hash表分布的更平均却是不争的事实。最后，只有x86的机器，所以无法
判断机器的关联性。即使有不同机器我也尚不知道从何处判断关联性的大小。

测算时间使用了gettimeofday函数, 测试hash表的分布情况, 使用自写

的Atom_read函数遍历hash表, 然后写入文件, 没具体统计数字，肉眼观察了下。

主程序3_1.c

#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"

#include "atom.h"
#include "getOneword.h"

/*
 * Exercise 3.1 driver: reads up to 10000 words from test_3_1.txt, interns
 * each one with Atom_new while timing only the Atom_new call, prints the
 * accumulated time, then dumps the hash table through Atom_read.
 */
int main(int argc, char const *argv[])
{
    char word[1024];
    int capacity = sizeof(word);
    int count = 0;
    double timeuse;
    double totaltime = 0;
    struct timeval began, ended;
    FILE *in;

    in = fopen("test_3_1.txt", "rb");
    assert(in);

    /* Stop at the word limit or as soon as no further word can be read. */
    while (count < 10000 && getOneword(in, word, capacity, first, rest) != 0) {
        gettimeofday(&began, NULL);
        Atom_new(word, strlen(word));
        gettimeofday(&ended, NULL);

        /* Elapsed microseconds for this single call, then scaled to seconds. */
        timeuse = (ended.tv_sec - began.tv_sec) * 1000000 +
                (ended.tv_usec - began.tv_usec);
        totaltime += timeuse / 1000000;

        ++count;
    }

    printf("[totaltime = %fs]\n", totaltime);

    fclose(in);

    Atom_read();

    return 0;
}

其中getOneword函数使用的是书中提供的函数，在getOneword.c中

#include <stdio.h>
#include <ctype.h>

/*
 * getOneword - read the next word from fp into buf.
 *
 * Characters are skipped until one satisfies first(); that character and
 * every following character satisfying rest() form the word.  At most
 * size - 1 characters are stored and buf is always NUL-terminated; excess
 * characters of an over-long word are read and discarded.  The character
 * that ended the word (if it is not EOF) is pushed back onto fp.
 *
 * Returns 1 if at least one character was stored, 0 otherwise.
 * If buf is NULL or size is not positive, nothing is read and 0 is
 * returned, so no byte outside the buffer is ever written.
 */
int getOneword(FILE *fp, char *buf, int size, int first(int c), int rest(int c))
{
    int i = 0;
    int c;

    /* A buffer without room for even the terminator cannot be used. */
    if (buf == NULL || size <= 0) {
        return 0;
    }

    c = getc(fp);

    /* Skip ahead to the first character that may start a word. */
    for ( ; c != EOF; c = getc(fp)) {
        if (first(c)) {
            if (i < size - 1) {
                buf[i++] = (char)c;
            }
            c = getc(fp);
            break;
        }
    }

    /* Collect the remainder of the word, silently truncating if needed. */
    for ( ; c != EOF && rest(c); c = getc(fp)) {
        if (i < size - 1) {
            buf[i++] = (char)c;
        }
    }

    /* i never exceeds size - 1 here, so this index is always in bounds. */
    buf[i] = '\0';

    if (c != EOF) {
        ungetc(c, fp);
    }

    return i > 0;
}

/* Word-start predicate: nonzero when c is an alphabetic character. */
int first(int c) {
    int is_letter = isalpha(c);

    return is_letter;
}
/* Word-continuation predicate: 1 for a letter or an underscore, else 0. */
int rest(int c) {
    if (c == '_')
        return 1;
    return isalpha(c) != 0;
}

其中Atom_read函数如下：

int Atom_read()
{
	int i;
	struct atom *p;
	FILE *fp;

	fp = fopen("result.txt", "wb");
	assert(fp);

	for (i = 0; i < NELEMS(buckets); i++)
		if (buckets[i] != NULL) {
			for (p = buckets[i]; p; p = p->link)
				fprintf(fp, "buckets[%d]: [%s]\n", i, p->str);
			fprintf(fp, "\n\n\n");
		}

	fclose(fp);

	return i > 0;
}

3.2
对hash的理论了解很少，只关注了实际使用。如果想寻找多种多样的hash函数，
业界著名的可参考下面的URL:
http://www.partow.net/programming/hashfunctions/index.html
在Libevent中，它的hash函数很简单，只是将一个struct event 的地址单纯的
右移6位得到hash码。可见，还是应该根据自身的需要编写，不过作为我来说，
一般会选择高德纳的。

3.3
使用strncmp有个显著缺陷，那就是遇到'\0'就会终止比较。比如有2个字符序列:
str1="abcd\0g" 和 str2="abcd\0m", 那么使用strncmp函数就会认为str1和
str2相等，这不可接受。书中使用逐个字符比较，当然也可以使用memcmp进行比较。
但这里对于书中的写法，仍然有个现实的考量，就是它会在拷贝完毕字符串之后显
式的缀上'\0', 作者明显是为了取字符串方便, 不过这由应用程序调用时，是会产
生二义性的。

3.4
本题不明白。据说这种 char str[1] 的写法很hack.

3.5
写上hash码对于特定操作是有明显的好处: 当要扩展这个hash表时，不需要
再做一次hash了，直接将每个节点的hash码模上新的hash桶的数目就得到了
在新hash表中所在的链表。这样做不仅节省了时间，还带来了一个额外的好
处，就是在旧表中同一个表项的节点，在新表中仍然在一起，因为它们在旧
表中的hash码是相同的，模上新的hash桶当然也相同了。这样在搜索表项时，
旧有代码照用不误。

3.6
Atom_length函数之所以会慢，根本原因在于，最坏情况下它要遍历所有存在
于hash表中的节点。一个简单的解决方法是，稍微复杂化原子的数据结构，
即添加使用上述的hash码。有了hash码就可以直接定位到特定的某个链表。
那么对atom的改造变为

/*
 * Hash-table node for one atom, extended with a stored hash code as
 * proposed in exercise 3.6.  buckets[] holds the head of each chain.
 */
static struct atom {
    struct atom *link;          /* next node in the same bucket chain */
    unsigned long atom_hash;    /* hash code stored alongside the atom */
    int len;                    /* presumably the length of str -- confirm against Atom_new */
    char *str;                  /* the atom's characters */
} *buckets[2048];               /* 2048 chain heads */

并且也要改造Atom_new函数，它不能再返回p->str, 因为这不够用了，
它要返回p这个atom结构体指针。

3.7
extern void Atom_init(int hint); 实现这个函数的现实意义暂时还没发现。

3.8
代码见下面，其中四个字符串是在做 3.1 题时找到的两对具有相同hash码的字符串:
3_8.c

#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"

#include "atom.h"

/*
 * Exercise 3.8 driver: interns four sample strings, lists the table,
 * frees the first and third atoms, and lists the table again.
 */
int main(int argc, char const *argv[])
{
    char *words[] = { "rebuttal", "handler", "hpl", "Zero" };
    const char *atoms[4];
    int k;

    for (k = 0; k < 4; k++)
        atoms[k] = Atom_new(words[k], strlen(words[k]));

    Atom_list();

    Atom_free(atoms[0]);
    Atom_free(atoms[2]);

    /* Alternative: Atom_reset() would release every atom at once. */

    Atom_list();

    return 0;
}

其中 Atom_list Atom_free Atom_reset 函数见下面：

/*
 * Atom_free - remove the atom whose string pointer is str and free its node.
 *
 * str is compared by address, not by content, so it must be a pointer
 * previously stored in a node's str field.  Every bucket is scanned
 * because only the pointer, not the hash, is available here.  If no node
 * matches, nothing happens.
 *
 * The chain is walked through a pointer to the link that references the
 * current node, so unlinking works the same for the head and for interior
 * nodes and no freed node is ever read again.
 */
void Atom_free(const char *str)
{
	struct atom **pp;
	int i;

	assert(str);

	for (i = 0; i < NELEMS(buckets); i++)
		for (pp = &buckets[i]; *pp; pp = &(*pp)->link)
			if ((*pp)->str == str) {
				struct atom *victim = *pp;

				/* Print before freeing: str may live inside the node. */
				printf("to free: [%s]\n", str);
				*pp = victim->link;
				free(victim);
				/*
				 * Stop here: str may now be dangling and must not be
				 * compared again.  Presumably a given address belongs
				 * to at most one node -- confirm against Atom_new.
				 */
				return;
			}
}

/* Atom_list - print every atom as "bucket-index: [string]", bucket by bucket. */
void Atom_list()
{
	int slot;

	for (slot = 0; slot < NELEMS(buckets); slot++) {
		struct atom *cur = buckets[slot];

		while (cur) {
			printf("%d: [%s]\n", slot, cur->str);
			cur = cur->link;
		}
	}
}

/* Atom_reset - free every node in every bucket, leaving all chains empty. */
void Atom_reset(void)
{
	int slot;

	for (slot = 0; slot < NELEMS(buckets); slot++) {
		/* Pop the head repeatedly until the chain is exhausted. */
		while (buckets[slot] != NULL) {
			struct atom *head = buckets[slot];

			buckets[slot] = head->link;
			free(head);
		}
	}
}

3.9
见下述代码，主要使用了可变参数列表。
3_9.c

#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"

#include "atom.h"

/*
 * Exercise 3.9 driver: loads the same four strings first through the
 * variadic Atom_vload and then through the array-based Atom_aload,
 * listing and resetting the table after each load.
 */
int main(int argc, char const *argv[])
{
    const char *s1 = "rebuttal";
    const char *s2 = "handler";
    const char *s3 = "hpl";
    const char *s4 = "Zero";

    printf("Atom_vload:\n");
    /*
     * The terminator is read back with va_arg(ap, const char *), so it
     * must be passed as a pointer: a bare NULL may expand to an integer 0,
     * which is not guaranteed to match a pointer in a variadic call.
     */
    Atom_vload(s1, s2, s3, s4, (const char *)NULL);

    Atom_list();
    Atom_reset();

    printf("\n\nAtom_aload:\n");

    const char *strs[] = {"rebuttal", "handler", "hpl", "Zero", NULL};
    Atom_aload(strs);

    Atom_list();
    Atom_reset();

    return 0;
}

Atom_vload Atom_aload 函数见下面：

/*
 * Atom_vload - intern each string of a NULL-terminated argument list.
 * The list starts with str; an initial NULL loads nothing.
 */
void Atom_vload(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	while (str != NULL) {
		Atom_new(str, strlen(str));
		str = va_arg(args, const char *);
	}
	va_end(args);
}

/* Atom_aload - intern every string of the NULL-terminated array strs. */
void Atom_aload(const char *strs[])
{
	const char **cursor = strs;

	while (*cursor != NULL) {
		Atom_new(*cursor, strlen(*cursor));
		cursor++;
	}
}

3.10
检查 const char *Atom_add(const char *str, int len) 参数中的str不为 NULL.

最后列出添加了新函数的atom.c文件，其中没有使用后面内存管理章节的FREE和ALLOC函数，使用的是标准库函数，目的是编译方便，现在编译时只需要：