哈希表练习、C 语言词法分析器、word count 工具实现（2023/3/13）

敲出美好未来

已于 2023-03-13 21:16:45 修改

阅读量191

点赞数

分类专栏：C++学习；文章标签：散列表 c语言 word

于 2023-03-13 20:03:14 首次发布

本文链接：https://blog.csdn.net/Oliverzoo/article/details/129503584

版权

C++学习专栏收录该内容

12 篇文章 0 订阅

订阅专栏

#define _CRT_SECURE_NO_WARNINGS
#include<stdlib.h>
#include<stdio.h>
#include<ctype.h>
// Hash table exercise: count how often each letter occurs in a text file.
// N is the number of buckets in the table; bucket indices are taken modulo N.
#define N 26

// Key type stored in the map (a single character).
typedef char K;
// Value type stored in the map (an integer).
typedef int V;

// One (key, value) pair; nodes whose keys fall into the same bucket are
// chained together through `next`.
typedef struct node {
	K key;
	V val;
	struct node* next;
}Node;

// Hash map made of N buckets; each bucket is the head of a singly linked
// list of Node (NULL when the bucket is empty).
typedef struct {
	Node* table[N];
}HashMap;

// Allocates a zero-filled HashMap, so every bucket starts out as NULL.
// Returns whatever calloc returns, i.e. NULL when the allocation fails.
HashMap* hashmap_create()
{
	HashMap* created = calloc(1, sizeof(*created));
	return created;
}

// Frees every node of every bucket and then the map itself.
// Passing NULL is allowed and does nothing (hashmap_create may return NULL).
void hashmap_destroy(HashMap* map)
{
	if (map == NULL)
	{
		return;
	}
	for (int i = 0; i < N; i++)
	{
		Node* curr = map->table[i];
		while (curr)
		{
			// Remember the successor before the node is released.
			Node* next = curr->next;
			free(curr);
			curr = next;
		}
	}
	free(map);
}

// Hash of a single character: the character converted to unsigned int.
unsigned int hash(const char c)
{
	unsigned int code = (unsigned)c;
	return code;
}

// Stores (key, val) in the map.
// If the key is already present its value is overwritten and the previous
// value is returned. Otherwise a new node is inserted at the head of the
// bucket and 0 is returned. Prints "ERROR" and exits with status 1 when the
// node cannot be allocated.
V hashmap_put(HashMap* map, K key, V val)
{
	// Locate the bucket the key hashes to.
	Node** bucket = &map->table[hash(key) % N];

	// Look for an existing entry with this key.
	for (Node* it = *bucket; it != NULL; it = it->next)
	{
		if (it->key != key)
		{
			continue;
		}
		V previous = it->val;
		it->val = val;
		return previous;
	}

	// Not found: create a node and push it on the front of the chain.
	Node* fresh = malloc(sizeof(Node));
	if (fresh == NULL)
	{
		printf("ERROR");
		exit(1);
	}
	fresh->key = key;
	fresh->val = val;
	fresh->next = *bucket;
	*bucket = fresh;

	return 0;
}

// Returns the value stored for `key`, or 0 when the key is not in the map.
V hashmap_get(HashMap* map, K key)
{
	// Walk the chain of the bucket the key hashes to.
	for (Node* it = map->table[hash(key) % N]; it != NULL; it = it->next)
	{
		if (it->key == key)
		{
			return it->val;
		}
	}
	return 0;
}

// Removes the node holding `key` (if any) from the map and frees it.
// Does nothing when the key is not present.
void hashmap_delete(HashMap* map, K key)
{
	// `link` always points at the pointer that refers to the node under
	// inspection: first the bucket head, afterwards a predecessor's `next`.
	Node** link = &map->table[hash(key) % N];
	while (*link != NULL)
	{
		Node* victim = *link;
		if (victim->key == key)
		{
			*link = victim->next;
			free(victim);
			return;
		}
		link = &victim->next;
	}
}

// Counts how often each letter (case-insensitively) occurs in the file named
// by argv[1] and prints one "LETTER: count" line per letter that was seen.
// Exits with status 1 when no file name is given, the file cannot be opened,
// or the map cannot be allocated.
int main(int argc, char* argv[])
{
	if (argc < 2)
	{
		// Without this check argv[1] would be NULL and fopen would be
		// called with an invalid path.
		fprintf(stderr, "Usage: %s file\n", argc > 0 ? argv[0] : "prog");
		exit(1);
	}
	FILE* fp = fopen(argv[1], "r");
	if (!fp)
	{
		fprintf(stderr, "open %s failed\n", argv[1]);
		exit(1);
	}
	// Count the letters.
	HashMap* map = hashmap_create();
	if (!map)
	{
		fprintf(stderr, "out of memory\n");
		fclose(fp);
		exit(1);
	}
	int c;
	while ((c = fgetc(fp)) != EOF)
	{
		if (isalpha(c))
		{
			// Upper- and lower-case letters share one counter.
			c = toupper(c);
			int count = hashmap_get(map, c);
			hashmap_put(map, c, count + 1);
		}
	}
	fclose(fp);
	// Print every stored (letter, count) pair, bucket by bucket.
	for (int i = 0; i < N; i++)
	{
		Node* curr = map->table[i];
		while (curr)
		{
			printf("%c: %d\n", curr->key, curr->val);
			curr = curr->next;
		}
	}
	hashmap_destroy(map);
	return 0;
}

C 语言词法分析器——哈希表的使用

#pragma once
//scanner.h
//定义所有的token名字
// Every kind of token the scanner can produce. The order of the enumerators
// fixes their numeric values.
typedef enum {
    // single-character tokens
    TOKEN_LEFT_PAREN,       // '('
    TOKEN_RIGHT_PAREN,      // ')'
    TOKEN_LEFT_BRACKET,     // '['
    TOKEN_RIGHT_BRACKET,    // ']'
    TOKEN_LEFT_BRACE,       // '{'
    TOKEN_RIGHT_BRACE,      // '}'
    TOKEN_COMMA,            // ','
    TOKEN_DOT,              // '.'
    TOKEN_SEMICOLON,        // ';'
    TOKEN_TILDE,            // '~'
    TOKEN_JINGHAO,          // '#' (do not forget this one)
    // one or two character tokens
    TOKEN_PLUS,             // '+'
    TOKEN_PLUS_PLUS,        // '++'
    TOKEN_PLUS_EQUAL,       // '+='
    TOKEN_MINUS,            // '-'
    TOKEN_MINUS_MINUS,      // '--'
    TOKEN_MINUS_EQUAL,      // '-='
    TOKEN_MINUS_GREATER,    // '->'
    TOKEN_STAR,             // '*'
    TOKEN_STAR_EQUAL,       // '*='
    TOKEN_SLASH,            // '/'
    TOKEN_SLASH_EQUAL,      // '/='
    TOKEN_PERCENT,          // '%'
    TOKEN_PERCENT_EQUAL,    // '%='
    TOKEN_AMPER,            // '&'
    TOKEN_AMPER_EQUAL,      // '&='
    TOKEN_AMPER_AMPER,      // '&&'
    TOKEN_PIPE,             // '|'
    TOKEN_PIPE_EQUAL,       // '|='
    TOKEN_PIPE_PIPE,        // '||'
    TOKEN_HAT,              // '^'
    TOKEN_HAT_EQUAL,        // '^='
    TOKEN_EQUAL,            // '='
    TOKEN_EQUAL_EQUAL,      // '=='
    TOKEN_BANG,             // '!'
    TOKEN_BANG_EQUAL,       // '!='
    TOKEN_LESS,             // '<'
    TOKEN_LESS_EQUAL,       // '<='
    TOKEN_LESS_LESS,        // '<<'
    TOKEN_GREATER,          // '>'
    TOKEN_GREATER_EQUAL,    // '>='
    TOKEN_GREAER_GREATER,   // '>>'
    // literals: identifier, character, string, number
    TOKEN_IDENTIFIER,
    TOKEN_CHARACTER,
    TOKEN_STRING,
    TOKEN_NUMBER,
    // keywords (27 of them)
    TOKEN_SIGNED,
    TOKEN_UNSIGNED,
    TOKEN_CHAR,
    TOKEN_SHORT,
    TOKEN_INT,
    TOKEN_LONG,
    TOKEN_FLOAT,
    TOKEN_DOUBLE,
    TOKEN_STRUCT,
    TOKEN_UNION,
    TOKEN_ENUM,
    TOKEN_VOID,
    TOKEN_IF,
    TOKEN_ELSE,
    TOKEN_SWITCH,
    TOKEN_CASE,
    TOKEN_DEFAULT,
    TOKEN_WHILE,
    TOKEN_DO,
    TOKEN_FOR,
    TOKEN_BREAK,
    TOKEN_CONTINUE,
    TOKEN_RETURN,
    TOKEN_GOTO,
    TOKEN_CONST,
    TOKEN_SIZEOF,
    TOKEN_TYPEDEF,
    // auxiliary tokens
    TOKEN_ERROR,
    TOKEN_EOF

} TokenType;

// A single token produced by the scanner.
typedef struct {
    TokenType type;
    const char* start;	// points at the token's first character inside the source text that was read in
    int length;		    // number of characters in this token
    int line;		    // source line the token was found on, useful for later error reports
} Token;

// Initialises the scanner so that it reads tokens from `source`.
void initScanner(const char* source);

// Scans the input and returns the next Token.
Token scanToken();

//scanner.c
#include <stdbool.h>
#include<stdio.h>
#include <string.h>
#include "scanner.h"
// Scanner state.
//   start   - first character of the token currently being scanned
//   current - next character that will be read
//   line    - line counter, incremented for every '\n' the scanner skips
typedef struct {
    const char* start;
    const char* current;
    int line;
} Scanner;

// Keyword table used to classify identifier tokens.
// TABLE_SIZE is the capacity of the table's entry array.
#define TABLE_SIZE 31

// One keyword: its spelling and the token type it maps to.
typedef struct {
    const char* lexeme;
    TokenType type;
} Entry;

// Fixed-capacity keyword table; `count` is how many leading entries are
// examined during lookup.
typedef struct {
    int count;
    Entry entries[TABLE_SIZE];
} Table;
// The single keyword table of this translation unit (zero-initialised).
static Table keywords;

void initKeywords() {
    keywords.count = 26;
    // 初始化关键字表
    keywords.entries[0] = (Entry){ "signed", TOKEN_SIGNED };
    keywords.entries[1] = (Entry){ "unsigned", TOKEN_UNSIGNED };
    keywords.entries[2] = (Entry){ "char", TOKEN_CHAR };
    keywords.entries[3] = (Entry){ "short", TOKEN_SHORT };
    keywords.entries[4] = (Entry){ "int", TOKEN_INT };
    keywords.entries[5] = (Entry){ "long", TOKEN_LONG };
    keywords.entries[6] = (Entry){ "float", TOKEN_FLOAT };
    keywords.entries[7] = (Entry){ "double", TOKEN_DOUBLE };
    keywords.entries[8] = (Entry){ "struct", TOKEN_STRUCT };
    keywords.entries[9] = (Entry){ "union", TOKEN_UNION };
    keywords.entries[10] = (Entry){ "enum", TOKEN_ENUM };
    keywords.entries[11] = (Entry){ "void", TOKEN_VOID };
    keywords.entries[12] = (Entry){ "if", TOKEN_IF };
    keywords.entries[13] = (Entry){ "else", TOKEN_ELSE };
    keywords.entries[14] = (Entry){ "switch", TOKEN_SWITCH };
    keywords.entries[15] = (Entry){ "case", TOKEN_CASE };
    keywords.entries[16] = (Entry){ "default", TOKEN_DEFAULT };
    keywords.entries[17] = (Entry){ "while", TOKEN_WHILE };
    keywords.entries[18] = (Entry){ "do", TOKEN_DO };
    keywords.entries[19] = (Entry){ "for", TOKEN_FOR };
    keywords.entries[20] = (Entry){ "break", TOKEN_BREAK };
    keywords.entries[21] = (Entry){ "continue", TOKEN_CONTINUE };
    keywords.entries[22] = (Entry){ "return", TOKEN_RETURN };
    keywords.entries[23] = (Entry){ "goto", TOKEN_GOTO };
    keywords.entries[24] = (Entry){ "const", TOKEN_CONST };
    keywords.entries[25] = (Entry){ "sizeof", TOKEN_SIZEOF };
    keywords.entries[26] = (Entry){ "typedef", TOKEN_TYPEDEF };
}
#define HASH_MULTIPLIER 37

// Polynomial hash of the first `length` characters of `key`:
// hash = hash * HASH_MULTIPLIER + key[i], starting from 0.
// The accumulation is done in unsigned arithmetic so that long inputs wrap
// around instead of overflowing a signed int (which is undefined behaviour);
// for inputs that never overflowed the result is unchanged.
static int hashString(const char* key, int length) {
    unsigned int hash = 0;

    for (int i = 0; i < length; i++) {
        hash = hash * (unsigned int)HASH_MULTIPLIER + (unsigned int)key[i];
    }

    return (int)hash;
}
// Global scanner state.
Scanner scanner;

// Prepares the scanner to read tokens from `source`.
// Also fills the keyword table: identifierType() only recognises keywords
// after initKeywords() has run, and nothing else calls it.
void initScanner(const char* source) {
    initKeywords();
    scanner.start = &source[0];
    scanner.current = &source[0];
    scanner.line = 0;
}

/***************************************************************************************
 *                                   辅助方法											*
 ***************************************************************************************/
// True for 'a'-'z', 'A'-'Z' and '_' (the characters that may start an identifier).
static bool isAlpha(char c) {
    if (c == '_') return true;
    if ('a' <= c && c <= 'z') return true;
    return 'A' <= c && c <= 'Z';
}

// True for the decimal digits '0'-'9'.
static bool isDigit(char c) {
    if (c < '0') return false;
    return c <= '9';
}

// True when the scanner is positioned on the terminating '\0' of the source.
static bool isAtEnd() {
    return scanner.current[0] == '\0';
}

// Returns the current character and moves the scanner one character forward.
static char advance() {
    const char consumed = *scanner.current;
    scanner.current++;
    return consumed;
}

// Returns the current character without consuming it.
static char peek() {
    return scanner.current[0];
}

// Returns the character after the current one without consuming anything,
// or '\0' when the scanner is already at the end of the source.
static char peekNext() {
    return isAtEnd() ? '\0' : scanner.current[1];
}

// Consumes the current character and returns true only if the scanner is not
// at the end and that character equals `expected`; otherwise leaves the
// scanner untouched and returns false.
static bool match(char expected) {
    if (isAtEnd() || peek() != expected) {
        return false;
    }
    scanner.current++;
    return true;
}

// Builds a Token of the given type that covers the source text between
// scanner.start and scanner.current, stamped with the current line.
static Token makeToken(TokenType type) {
    Token made = {
        .type = type,
        .start = scanner.start,
        .length = (int)(scanner.current - scanner.start),
        .line = scanner.line,
    };
    return made;
}

// 遇到不能解析的情况时，我们创建一个ERROR Token. 比如：遇到@，$等符号时，比如字符串，字符没有对应的右引号时。
static Token errorToken(const char* message) {
    Token token;
    token.type = TOKEN_ERROR;
    token.start = message;
    token.length = (int)strlen(message);
    token.line = scanner.line;
    return token;
}
// Skips blanks (' ', '\r', '\t', '\n') and '//' comments, which run to the
// end of the line. scanner.line is incremented for every '\n' skipped.
static void skipWhitespace() {
    for (;;) {
        const char ch = peek();

        if (ch == ' ' || ch == '\r' || ch == '\t') {
            advance();
            continue;
        }
        if (ch == '\n') {
            scanner.line++;
            advance();
            continue;
        }
        if (ch == '/' && peekNext() == '/') {
            // Drop the rest of the line; the '\n' itself is handled by the
            // next iteration so the line counter stays correct.
            while (!isAtEnd() && peek() != '\n') {
                advance();
            }
            continue;
        }
        // Anything else (including a lone '/') starts a token.
        return;
    }
}


// There are two main ways to decide what kind of token an identifier is:
//   1. put all keywords into a hash table and look the lexeme up
//   2. put all keywords into a trie and look the lexeme up
// (The original author notes that the trie beats the hash table in both
// space and time.)
// This function compares the current lexeme against every entry of the
// keyword table and returns the keyword's token type on a match, otherwise
// TOKEN_IDENTIFIER.
static TokenType identifierType() 
{
    const int length = (int)(scanner.current - scanner.start);
    const int hash = hashString(scanner.start, length);

    for (int i = 0; i < keywords.count; i++) {
        const char* word = keywords.entries[i].lexeme;

        // Cheap rejections first: empty slot, different length, different
        // hash; only then compare the characters themselves.
        if (word != NULL &&
            strlen(word) == length &&
            hashString(word, length) == hash &&
            memcmp(word, scanner.start, length) == 0) {
            return keywords.entries[i].type;
        }
    }

    return TOKEN_IDENTIFIER;
}

// Scans the rest of an identifier (letters, digits and underscores) and
// returns it as a token. The lexeme may be a keyword or a plain identifier;
// identifierType() decides which.
static Token identifier() {
    for (;;) {
        const char next = peek();
        if (!isAlpha(next) && !isDigit(next)) {
            break;
        }
        advance();
    }
    return makeToken(identifierType());
}
// For simplicity a NUMBER is defined as follows:
//   1. it consists of digits and at most one '.'
//   2. there must be a digit before the '.'
//   3. there must be a digit after the '.'
// Valid NUMBERs: 123, 3.14
// Not valid NUMBERs: 123., .14
// Scans the remainder of a number whose first digit was already consumed.
static Token number() {
    // Integer part. (The stray debug output that used to be printed for
    // every digit has been removed.)
    while (isDigit(peek())) {
        advance();
    }

    // Fractional part: only taken when a digit follows the '.'.
    if (peek() == '.' && isDigit(peekNext())) {
        // Consume the ".".
        advance();

        while (isDigit(peek())) {
            advance();
        }
    }

    return makeToken(TOKEN_NUMBER);
}

// Scans a string literal whose opening '"' was already consumed and returns
// a TOKEN_STRING covering both quotes.
// A backslash escapes the following character, so "\"" does not end the
// literal early. Newlines inside the literal are accepted and counted in
// scanner.line. When the closing quote is missing an error token is
// returned.
static Token string() {
    while (!isAtEnd() && peek() != '"') {
        if (peek() == '\\' && peekNext() != '\0') {
            // Skip the backslash; the escaped character is consumed below.
            advance();
        }
        if (peek() == '\n') scanner.line++;
        advance();
    }

    if (isAtEnd()) {
        return errorToken("Unterminated string.");
    }

    // The closing ".
    advance();
    return makeToken(TOKEN_STRING);
}

// Scans a character literal whose opening '\'' was already consumed and
// returns a TOKEN_CHARACTER covering both quotes.
// A literal may not be empty and may not span lines; a backslash escapes the
// following character (so '\'' and '\\' are accepted). Malformed literals
// yield an error token.
static Token character() {
    if (peek() == '\'') {
        // Consume the closing quote so scanning resumes after the literal.
        advance();
        return errorToken("Empty character literal.");
    }

    // Consume the literal's contents up to the closing quote, stopping at the
    // end of the line or of the input.
    while (!isAtEnd() && peek() != '\'' && peek() != '\n') {
        if (peek() == '\\' && peekNext() != '\0' && peekNext() != '\n') {
            // Skip the backslash; the escaped character is consumed below.
            advance();
        }
        advance();
    }

    if (peek() != '\'') {
        return errorToken("Unterminated character literal.");
    }

    // Consume the closing '.
    advance();
    return makeToken(TOKEN_CHARACTER);
}

/***************************************************************************************
 *                                   	分词											  *
 ***************************************************************************************/
Token scanToken() {
    // 跳过前置空白字符和注释
    skipWhitespace();

    // 记录下一个Token的起始位置
    scanner.start = scanner.current;

    if (isAtEnd()) return makeToken(TOKEN_EOF);

    char c = advance();
    if (isAlpha(c)) return identifier();
    if (isDigit(c)) return number();

    switch (c) {
        // single-character tokens
    case '(': return makeToken(TOKEN_LEFT_PAREN);
    case ')': return makeToken(TOKEN_RIGHT_PAREN);
    case '{': return makeToken(TOKEN_LEFT_BRACE);
    case '}': return makeToken(TOKEN_RIGHT_BRACE);
        //这里丢了
    case '[': return makeToken(TOKEN_LEFT_BRACKET);
    case ']': return makeToken(TOKEN_RIGHT_BRACKET);
    case ';': return makeToken(TOKEN_SEMICOLON);
    case ',': return makeToken(TOKEN_COMMA);
    case '.': return makeToken(TOKEN_DOT);
    case '-': return makeToken(TOKEN_MINUS);

    case '#': return makeToken(TOKEN_JINGHAO);

        // one or two characters tokens
    case '+':
        if (match('+')) return makeToken(TOKEN_PLUS_PLUS);
        else if (match('=')) return makeToken(TOKEN_PLUS_EQUAL);
        else return makeToken(TOKEN_PLUS);
    case '/':
        if (match('/')) {
            // 注释以'//'开头, 一直到行尾
            while (peek() != '\n' && !isAtEnd()) advance();
            return scanToken();
        }
        else if (match('*')) {
            // 处理多行注释
            while (!isAtEnd()) {
                if (peek() == '*' && peekNext() == '/') {
                    advance(); advance();
                    return scanToken();
                }
                else advance();
            }
            return errorToken("Unterminated multi-line comment.");
        }
        else return makeToken(TOKEN_SLASH);
    case '*': return makeToken(TOKEN_STAR);
    case '%': return makeToken(TOKEN_PERCENT);
    case '<':
        if (match('=')) return makeToken(TOKEN_LESS_EQUAL);
        else return makeToken(TOKEN_LESS);
    case '>':
        if (match('=')) return makeToken(TOKEN_GREATER_EQUAL);
        else return makeToken(TOKEN_GREATER);
    case '!':
        if (match('=')) return makeToken(TOKEN_BANG_EQUAL);
        else return makeToken(TOKEN_BANG);
    case '=':
        if (match('=')) return makeToken(TOKEN_EQUAL_EQUAL);
        else return makeToken(TOKEN_EQUAL);

        // various-character tokens
    case '"': return string();
    case '\'': return character();
    }

    return errorToken("Unexpected character.");
}

//main.c
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "scanner.h"

// Scans `source` and prints every token up to and including TOKEN_EOF.
// Each output line shows the token's line number (or "   | " when it is on
// the same line as the previous token), its numeric type and its text.
static void run(const char* source) {
	initScanner(source);

	// -1 matches no real line, so the first token always prints its line.
	int lastLine = -1;
	Token token;
	do {
		token = scanToken();
		if (token.line == lastLine) {
			printf("   | ");
		}
		else {
			printf("%4d ", token.line);
			lastLine = token.line;
		}
		printf("%2d '%.*s'\n", token.type, token.length, token.start);
	} while (token.type != TOKEN_EOF);
}
// Interactive mode ("read evaluate print loop"): reads one line of code at a
// time from stdin, scans it and prints its tokens. Returns when stdin
// reaches end-of-file or a read error occurs.
static void repl() {
	char str[200] = { '\0' };
	// fgets never writes past the buffer (gets did, and was removed in C11).
	// Input lines longer than the buffer are processed in several pieces.
	while (fgets(str, sizeof str, stdin) != NULL) {
		// Drop the trailing newline that fgets keeps, so tokens are numbered
		// exactly as they were when the line was read without it.
		str[strcspn(str, "\n")] = '\0';
		initScanner(str);
		int line = 0;
		for (;;) {
			Token token = scanToken();
			if (token.line != line) {
				printf("%4d ", token.line);
				line = token.line;
			}
			else {
				printf("   | ");
			}
			printf("%2d '%.*s'\n", token.type, token.length, token.start);
			if (token.type == TOKEN_EOF) break;
		}
	}
}
// Reads the whole file at `path` into a freshly allocated, '\0'-terminated
// buffer and returns it; the caller owns the buffer and must free() it.
// On any failure (open, size query, allocation, read) a message is written
// to stderr and the program exits with status 1.
static char* readFile(const char* path) {
	FILE* fp = fopen(path, "rb");
	if (!fp) {
		fprintf(stderr, "open %s failed\n", path);
		exit(1);
	}

	// Determine the file size; ftell reports failure as -1, which must not
	// be used as a length.
	long fileSize = -1;
	if (fseek(fp, 0L, SEEK_END) == 0) {
		fileSize = ftell(fp);
	}
	if (fileSize < 0) {
		fprintf(stderr, "Error: cannot determine size of %s\n", path);
		fclose(fp);
		exit(1);
	}
	// Back to the beginning of the file.
	rewind(fp);

	char* result = malloc((size_t)fileSize + 1);
	if (!result) {
		fprintf(stderr, "Error: malloc failed in readFile\n");
		fclose(fp);
		exit(1);
	}

	size_t bytesRead = fread(result, 1, (size_t)fileSize, fp);
	if (ferror(fp)) {
		fprintf(stderr, "Error: reading %s failed\n", path);
		free(result);
		fclose(fp);
		exit(1);
	}
	fclose(fp);
	// Terminate after the bytes actually read, so a short read never leaves
	// uninitialised data inside the string.
	result[bytesRead] = '\0';

	return result;
}
// Handles a source file: reads the file at `path`, scans all of it and
// prints the tokens.
static void runFile(const char* path) {
	char* todostr = readFile(path);
	run(todostr);
	// run() has finished printing, so no token refers to the buffer any more.
	free(todostr);
}
// Entry point of the scanner.
//   scanner        - no argument: start the interactive mode
//   scanner file   - one argument: scan the whole file
// Any other argument count prints a usage message and exits with status 1.
int main(int argc, const char* argv[]) {
	switch (argc) {
	case 1:
		repl();
		break;
	case 2:
		runFile(argv[1]);
		break;
	default:
		fprintf(stderr, "Usage: scanner [path]\n");
		exit(1);
	}
	return 0;
}

//'wc' (word count)是Unix下的一个工具, 它可以统计一个文本文件中字符的个
//数，单词的个数以及行数 (其中我们也统计不可打印的字符和空白字符，单词是以
//空白字符分割的)。请实现一个 'wc' 程序，当传入的参数个数不对时，请给出提
//示信息。
#define _CRT_SECURE_NO_WARNINGS
#include<stdlib.h>
#include<stdio.h>
#include<ctype.h>

// Word-counting state: whether the character read last was part of a word.
typedef enum {
	// the last character read was not whitespace
	INSIDE,
	// the last character read was whitespace, or nothing has been read yet
	OUTSIDE
}State;

// Minimal 'wc': prints the number of characters, words and lines of the file
// named by argv[1]. Every character read is counted (including whitespace
// and non-printable ones), words are separated by whitespace, and a line is
// counted for every '\n'. Exits with status 1 on a wrong argument count or
// when the file cannot be opened.
int main(int argc, char* argv[])
{
	if (argc != 2)
	{
		fprintf(stderr, "Usage: wc file\n");
		exit(1);
	}
	FILE* fp = fopen(argv[1], "r");
	if (!fp)
	{
		fprintf(stderr, "Open %s failed\n", argv[1]);
		exit(1);
	}
	int c;
	int characters = 0, words = 0, lines = 0;
	State state = OUTSIDE;

	while ((c = fgetc(fp)) != EOF)
	{
		characters++;
		if (c == '\n')
		{
			lines++;
		}
		if (isspace(c))
		{
			state = OUTSIDE;
		}
		else if (state == OUTSIDE)
		{
			// A word is counted when it starts, so the last word is included
			// even if the file does not end with whitespace.
			state = INSIDE;
			words++;
		}
	}
	fclose(fp);
	printf("characters:%d,words:%d,lines:%d\n", characters, words, lines);
	return 0;
}