#define _CRT_SECURE_NO_WARNINGS
#include<stdlib.h>
#include<stdio.h>
#include<ctype.h>
// Hash table exercise: count how many times each letter occurs in a text file.
#define N 26 // number of buckets in HashMap.table
typedef char K; // key type: a single character
typedef int V; // value type: the integer stored for a key
// One (key, value) entry in a bucket's singly linked chain.
typedef struct node {
K key; // key stored in this entry
V val; // value associated with key
struct node* next; // next entry in the same bucket, or NULL at the end of the chain
}Node;
// Chained hash map: a fixed array of N bucket heads.
typedef struct {
Node* table[N]; // table[i] is the first Node of bucket i, or NULL when the bucket is empty
}HashMap;
// Allocates a zero-filled HashMap (every bucket starts out empty).
// Returns the new map, or NULL when the allocation fails.
// The caller releases it with hashmap_destroy().
HashMap* hashmap_create()
{
    HashMap* map = calloc(1, sizeof(HashMap));
    return map;
}
// Frees every node of every bucket and then the map itself.
// map may be NULL (for example when hashmap_create() failed); in that case
// the call does nothing, mirroring free(NULL).
void hashmap_destroy(HashMap* map)
{
    if (map == NULL)
    {
        return;
    }
    for (int i = 0; i < N; i++)
    {
        Node* curr = map->table[i];
        while (curr)
        {
            // Save the successor before the node is released.
            Node* next = curr->next;
            free(curr);
            curr = next;
        }
    }
    free(map);
}
// Hash function for a single character key: the character's value converted
// to unsigned. Callers reduce the result modulo N to pick a bucket.
unsigned int hash(const char c)
{
    unsigned int code = (unsigned)c;
    return code;
}
// Stores (key, val) in the map.
// If key is already present its value is overwritten and the previous value
// is returned. Otherwise a new node is pushed onto the front of the bucket's
// chain and 0 is returned.
// Prints "ERROR" and terminates the process when the node cannot be allocated.
V hashmap_put(HashMap* map, K key, V val)
{
    int bucket = hash(key) % N;

    // Look for an existing entry with this key.
    for (Node* entry = map->table[bucket]; entry != NULL; entry = entry->next)
    {
        if (entry->key != key)
        {
            continue;
        }
        V previous = entry->val;
        entry->val = val;
        return previous;
    }

    // Not found: create a node and insert it at the head of the chain.
    Node* fresh = malloc(sizeof(Node));
    if (fresh == NULL)
    {
        printf("ERROR");
        exit(1);
    }
    fresh->key = key;
    fresh->val = val;
    fresh->next = map->table[bucket];
    map->table[bucket] = fresh;
    return 0;
}
// Looks up key and returns its value.
// Returns 0 when the key is not in the map, so a stored value of 0 cannot be
// told apart from a missing key.
V hashmap_get(HashMap* map, K key)
{
    int bucket = hash(key) % N;
    for (Node* entry = map->table[bucket]; entry != NULL; entry = entry->next)
    {
        if (entry->key == key)
        {
            return entry->val;
        }
    }
    return 0;
}
// Removes the entry for key, if there is one, and frees its node.
// Does nothing when the key is absent.
void hashmap_delete(HashMap* map, K key)
{
    int bucket = hash(key) % N;

    // link always points at the pointer that refers to the current node:
    // first the bucket head, then each node's next field. Unlinking is then
    // the same assignment whether or not the node is first in the chain.
    Node** link = &map->table[bucket];
    while (*link != NULL)
    {
        Node* entry = *link;
        if (entry->key == key)
        {
            *link = entry->next;
            free(entry);
            return;
        }
        link = &entry->next;
    }
}
// Counts how often each letter occurs in the file named by argv[1].
// Letters are folded to upper case before counting. One "X: count" line is
// printed per letter that occurred, in bucket order (not alphabetical order).
// Exits with status 1 when no file name is given, the file cannot be opened,
// or the map cannot be allocated.
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s file\n", argc > 0 ? argv[0] : "program");
        exit(1);
    }
    FILE* fp = fopen(argv[1], "r");
    if (!fp)
    {
        // fprintf, not printf: stderr is the stream, not the format string.
        fprintf(stderr, "open %s failed\n", argv[1]);
        exit(1);
    }
    HashMap* map = hashmap_create();
    if (!map)
    {
        fprintf(stderr, "out of memory\n");
        fclose(fp);
        exit(1);
    }
    // Tally the letters.
    int c;
    while ((c = fgetc(fp)) != EOF)
    {
        if (isalpha(c))
        {
            c = toupper(c);
            int count = hashmap_get(map, c);
            hashmap_put(map, c, count + 1);
        }
    }
    fclose(fp);
    // Print every entry of every bucket.
    for (int i = 0; i < N; i++)
    {
        Node* curr = map->table[i];
        while (curr)
        {
            printf("%c: %d\n", curr->key, curr->val);
            curr = curr->next;
        }
    }
    hashmap_destroy(map);
    return 0;
}
// C lexical analyzer: using a hash table (c语言词法分析器—哈希表的使用)
#pragma once
//scanner.h
// Names of every token the scanner can produce.
// The numeric values are the default consecutive enum values starting at 0;
// they are printed as-is by the driver, so the order must not change.
typedef enum {
    // single-character tokens
    TOKEN_LEFT_PAREN, TOKEN_RIGHT_PAREN,        // '(', ')'
    TOKEN_LEFT_BRACKET, TOKEN_RIGHT_BRACKET,    // '[', ']'
    TOKEN_LEFT_BRACE, TOKEN_RIGHT_BRACE,        // '{', '}'
    TOKEN_COMMA, TOKEN_DOT, TOKEN_SEMICOLON,    // ',', '.', ';'
    TOKEN_TILDE, TOKEN_JINGHAO,                 // '~', '#' (JINGHAO is the '#' sign)
    // one or two character tokens
    TOKEN_PLUS, TOKEN_PLUS_PLUS, TOKEN_PLUS_EQUAL,      // '+', '++', '+='
    // '-', '--', '-=', '->'
    TOKEN_MINUS, TOKEN_MINUS_MINUS, TOKEN_MINUS_EQUAL, TOKEN_MINUS_GREATER,
    TOKEN_STAR, TOKEN_STAR_EQUAL,               // '*', '*='
    TOKEN_SLASH, TOKEN_SLASH_EQUAL,             // '/', '/='
    TOKEN_PERCENT, TOKEN_PERCENT_EQUAL,         // '%', '%='
    TOKEN_AMPER, TOKEN_AMPER_EQUAL, TOKEN_AMPER_AMPER,  // '&', '&=', '&&'
    TOKEN_PIPE, TOKEN_PIPE_EQUAL, TOKEN_PIPE_PIPE,      // '|', '|=', '||'
    TOKEN_HAT, TOKEN_HAT_EQUAL,                 // '^', '^='
    TOKEN_EQUAL, TOKEN_EQUAL_EQUAL,             // '=', '=='
    TOKEN_BANG, TOKEN_BANG_EQUAL,               // '!', '!='
    TOKEN_LESS, TOKEN_LESS_EQUAL, TOKEN_LESS_LESS,      // '<', '<=', '<<'
    TOKEN_GREATER, TOKEN_GREATER_EQUAL, TOKEN_GREAER_GREATER, // '>', '>=', '>>'
    // Correctly spelled alias for the misspelled enumerator above. It has the
    // same value, so the enumerators that follow keep their original numbers.
    TOKEN_GREATER_GREATER = TOKEN_GREAER_GREATER,
    // literals: identifier, character, string, number
    TOKEN_IDENTIFIER, TOKEN_CHARACTER, TOKEN_STRING, TOKEN_NUMBER,
    // the 27 keywords
    TOKEN_SIGNED, TOKEN_UNSIGNED,
    TOKEN_CHAR, TOKEN_SHORT, TOKEN_INT, TOKEN_LONG,
    TOKEN_FLOAT, TOKEN_DOUBLE,
    TOKEN_STRUCT, TOKEN_UNION, TOKEN_ENUM, TOKEN_VOID,
    TOKEN_IF, TOKEN_ELSE, TOKEN_SWITCH, TOKEN_CASE, TOKEN_DEFAULT,
    TOKEN_WHILE, TOKEN_DO, TOKEN_FOR,
    TOKEN_BREAK, TOKEN_CONTINUE, TOKEN_RETURN, TOKEN_GOTO,
    TOKEN_CONST, TOKEN_SIZEOF, TOKEN_TYPEDEF,
    // auxiliary tokens
    TOKEN_ERROR, TOKEN_EOF
} TokenType;
// A scanned token: its type plus the text span it refers to.
typedef struct {
TokenType type;
const char* start; // start points at a character inside source, the source code that was read in.
int length; // length is the number of characters in this Token
int line; // line is the source line this Token is on, which helps with later error reporting
} Token;
// Initializes the Scanner so that scanning begins at the first character of source.
void initScanner(const char* source);
// Returns the next Token each time it is called.
Token scanToken();
//scanner.c
#include <stdbool.h>
#include<stdio.h>
#include <string.h>
#include "scanner.h"
// Cursor state of the scanner over the source text.
typedef struct {
const char* start; // first character of the token currently being scanned
const char* current; // next character to be consumed
int line; // line counter, incremented each time a '\n' is consumed
} Scanner;
// Keyword table used to tell keywords apart from ordinary identifiers.
// NOTE: entries are stored sequentially and identifierType() searches them linearly.
// Capacity of the keyword table (number of Entry slots).
#define TABLE_SIZE 31
// One keyword: its spelling and the token type it maps to.
typedef struct {
const char* lexeme; // keyword text
TokenType type; // token type reported for that text
} Entry;
// Fixed-capacity table of keyword entries.
typedef struct {
int count; // number of leading slots of entries that identifierType() examines
Entry entries[TABLE_SIZE];
} Table;
// The keyword table of this file; filled in by initKeywords().
static Table keywords;
void initKeywords() {
keywords.count = 26;
// 初始化关键字表
keywords.entries[0] = (Entry){ "signed", TOKEN_SIGNED };
keywords.entries[1] = (Entry){ "unsigned", TOKEN_UNSIGNED };
keywords.entries[2] = (Entry){ "char", TOKEN_CHAR };
keywords.entries[3] = (Entry){ "short", TOKEN_SHORT };
keywords.entries[4] = (Entry){ "int", TOKEN_INT };
keywords.entries[5] = (Entry){ "long", TOKEN_LONG };
keywords.entries[6] = (Entry){ "float", TOKEN_FLOAT };
keywords.entries[7] = (Entry){ "double", TOKEN_DOUBLE };
keywords.entries[8] = (Entry){ "struct", TOKEN_STRUCT };
keywords.entries[9] = (Entry){ "union", TOKEN_UNION };
keywords.entries[10] = (Entry){ "enum", TOKEN_ENUM };
keywords.entries[11] = (Entry){ "void", TOKEN_VOID };
keywords.entries[12] = (Entry){ "if", TOKEN_IF };
keywords.entries[13] = (Entry){ "else", TOKEN_ELSE };
keywords.entries[14] = (Entry){ "switch", TOKEN_SWITCH };
keywords.entries[15] = (Entry){ "case", TOKEN_CASE };
keywords.entries[16] = (Entry){ "default", TOKEN_DEFAULT };
keywords.entries[17] = (Entry){ "while", TOKEN_WHILE };
keywords.entries[18] = (Entry){ "do", TOKEN_DO };
keywords.entries[19] = (Entry){ "for", TOKEN_FOR };
keywords.entries[20] = (Entry){ "break", TOKEN_BREAK };
keywords.entries[21] = (Entry){ "continue", TOKEN_CONTINUE };
keywords.entries[22] = (Entry){ "return", TOKEN_RETURN };
keywords.entries[23] = (Entry){ "goto", TOKEN_GOTO };
keywords.entries[24] = (Entry){ "const", TOKEN_CONST };
keywords.entries[25] = (Entry){ "sizeof", TOKEN_SIZEOF };
keywords.entries[26] = (Entry){ "typedef", TOKEN_TYPEDEF };
}
#define HASH_MULTIPLIER 37
// Multiplicative string hash over the first length characters of key:
// hash = hash * HASH_MULTIPLIER + key[i] for each character.
// The accumulation is done in unsigned arithmetic, which wraps on overflow;
// doing it in int would be undefined behavior once an identifier is long
// enough to overflow. Results are unchanged for inputs that never overflowed.
static int hashString(const char* key, int length) {
    unsigned int hash = 0;
    for (int i = 0; i < length; i++) {
        hash = hash * HASH_MULTIPLIER + (unsigned int)key[i];
    }
    return (int)hash;
}
// Global scanner state, read and updated by the scanning functions in this file.
Scanner scanner;
// Prepares the scanner to tokenize source, a NUL-terminated string that must
// stay valid while tokens are being scanned (tokens point into it).
// Also fills the keyword table: nothing else calls initKeywords(), and without
// it the table stays empty and every keyword is reported as an identifier.
// The line counter starts at 0.
void initScanner(const char* source) {
    initKeywords();
    scanner.start = source;
    scanner.current = source;
    scanner.line = 0;
}
/***************************************************************************************
* 辅助方法 *
***************************************************************************************/
// True for characters that may start an identifier: ASCII letters and '_'.
static bool isAlpha(char c) {
    if (c == '_') {
        return true;
    }
    if ('a' <= c && c <= 'z') {
        return true;
    }
    return 'A' <= c && c <= 'Z';
}
// True for the decimal digits '0' through '9'.
static bool isDigit(char c) {
    return !(c < '0' || c > '9');
}
// True when the scanner's cursor sits on the terminating NUL of the source.
static bool isAtEnd() {
    return scanner.current[0] == '\0';
}
// Consumes one character: returns the character under the cursor and moves
// the cursor one position forward.
static char advance() {
    char consumed = *scanner.current;
    scanner.current += 1;
    return consumed;
}
// Returns the character under the cursor without consuming it.
static char peek() {
    return scanner.current[0];
}
// Returns the character one past the cursor without consuming anything,
// or '\0' when the cursor is already at the end of the source.
static char peekNext() {
    return isAtEnd() ? '\0' : scanner.current[1];
}
// Consumes the character under the cursor only if it equals expected.
// Returns true when a character was consumed, false otherwise (including
// when the cursor is at the end of the source).
static bool match(char expected) {
    if (!isAtEnd() && peek() == expected) {
        scanner.current++;
        return true;
    }
    return false;
}
// Builds a Token of the given type covering the text from scanner.start up
// to (not including) scanner.current, stamped with the current line.
static Token makeToken(TokenType type) {
    Token token = {
        .type = type,
        .start = scanner.start,
        .length = (int)(scanner.current - scanner.start),
        .line = scanner.line,
    };
    return token;
}
// 遇到不能解析的情况时,我们创建一个ERROR Token. 比如:遇到@,$等符号时,比如字符串,字符没有对应的右引号时。
static Token errorToken(const char* message) {
Token token;
token.type = TOKEN_ERROR;
token.start = message;
token.length = (int)strlen(message);
token.line = scanner.line;
return token;
}
// Skips whitespace (' ', '\r', '\t', '\n') and line comments.
// A line comment starts with "//" and runs to the end of the line; its
// terminating '\n' is left for the next iteration, which counts it.
// scanner.line is incremented for every '\n' consumed.
static void skipWhitespace() {
    for (;;) {
        char c = peek();
        if (c == ' ' || c == '\r' || c == '\t') {
            advance();
        }
        else if (c == '\n') {
            scanner.line++;
            advance();
        }
        else if (c == '/' && peekNext() == '/') {
            // Skip the rest of the comment line.
            while (peek() != '\n' && !isAtEnd()) {
                advance();
            }
        }
        else {
            // Anything else (including a lone '/') starts a token.
            return;
        }
    }
}
// Classifies the text between scanner.start and scanner.current.
// The keyword table is searched entry by entry; an entry matches when its
// length, its hashString() value and its bytes all equal those of the text.
// Returns the matching keyword's token type, or TOKEN_IDENTIFIER when no
// keyword matches.
static TokenType identifierType()
{
    int length = (int)(scanner.current - scanner.start);
    int hash = hashString(scanner.start, length);
    for (int i = 0; i < keywords.count; i++) {
        const char* word = keywords.entries[i].lexeme;
        if (word == NULL || strlen(word) != (size_t)length) {
            continue;
        }
        if (hashString(word, length) != hash) {
            continue;
        }
        if (memcmp(word, scanner.start, length) == 0) {
            return keywords.entries[i].type;
        }
    }
    return TOKEN_IDENTIFIER;
}
static Token identifier() {
// IDENTIFIER包含: 字母,数字和下划线
while (isAlpha(peek()) || isDigit(peek())) {
advance();
}
// 这样的Token可能是标识符, 也可能是关键字, identifierType()是用来确定Token类型的
return makeToken(identifierType());
}
// Scans the rest of a number whose first digit has already been consumed.
// For simplicity a NUMBER is defined as:
//   1. digits with at most one '.'
//   2. at least one digit before the '.'
//   3. at least one digit after the '.'
// So 123 and 3.14 are NUMBERs, while "123." and ".14" are not: in both the
// '.' is left to be scanned as a separate token.
static Token number() {
    while (isDigit(peek())) {
        advance();
    }
    // Look for a fractional part; the '.' only belongs to the number when a
    // digit follows it.
    if (peek() == '.' && isDigit(peekNext())) {
        // Consume the ".".
        advance();
        while (isDigit(peek())) {
            advance();
        }
    }
    return makeToken(TOKEN_NUMBER);
}
// Scans the rest of a string literal whose opening '"' has already been
// consumed, up to and including the closing '"'.
// A backslash escapes the character after it, so \" does not end the literal.
// Newlines inside the literal are accepted and counted in scanner.line.
// Returns a TOKEN_STRING covering both quotes, or an error token carrying
// "Unterminated string." when the source ends first.
static Token string() {
    while (!isAtEnd() && peek() != '"') {
        char c = advance();
        if (c == '\\' && !isAtEnd()) {
            // Take the escaped character together with its backslash.
            c = advance();
        }
        if (c == '\n') {
            scanner.line++;
        }
    }
    if (isAtEnd()) {
        return errorToken("Unterminated string.");
    }
    // The closing ".
    advance();
    return makeToken(TOKEN_STRING);
}
// Scans the rest of a character literal whose opening ' has already been
// consumed, up to and including the closing '.
// A backslash escapes the character after it, so '\'' and '\n' are accepted.
// Returns a TOKEN_CHARACTER covering both quotes, or an error token when the
// literal is empty (''), reaches a newline, or reaches the end of the source.
static Token character() {
    if (peek() == '\'') {
        // Consume the closing quote so it is not mistaken for a new literal.
        advance();
        return errorToken("Empty character literal.");
    }
    for (;;) {
        char c = peek();
        if (c == '\'') {
            break;
        }
        if (c == '\n') {
            return errorToken("Character literal cannot contain newline.");
        }
        if (c == '\0') {
            return errorToken("Unterminated character literal.");
        }
        advance();
        if (c == '\\' && peek() != '\0' && peek() != '\n') {
            // Take the escaped character together with its backslash.
            advance();
        }
    }
    // Consume the closing '.
    advance();
    return makeToken(TOKEN_CHARACTER);
}
/***************************************************************************************
* 分词 *
***************************************************************************************/
Token scanToken() {
// 跳过前置空白字符和注释
skipWhitespace();
// 记录下一个Token的起始位置
scanner.start = scanner.current;
if (isAtEnd()) return makeToken(TOKEN_EOF);
char c = advance();
if (isAlpha(c)) return identifier();
if (isDigit(c)) return number();
switch (c) {
// single-character tokens
case '(': return makeToken(TOKEN_LEFT_PAREN);
case ')': return makeToken(TOKEN_RIGHT_PAREN);
case '{': return makeToken(TOKEN_LEFT_BRACE);
case '}': return makeToken(TOKEN_RIGHT_BRACE);
//这里丢了
case '[': return makeToken(TOKEN_LEFT_BRACKET);
case ']': return makeToken(TOKEN_RIGHT_BRACKET);
case ';': return makeToken(TOKEN_SEMICOLON);
case ',': return makeToken(TOKEN_COMMA);
case '.': return makeToken(TOKEN_DOT);
case '-': return makeToken(TOKEN_MINUS);
case '#': return makeToken(TOKEN_JINGHAO);
// one or two characters tokens
case '+':
if (match('+')) return makeToken(TOKEN_PLUS_PLUS);
else if (match('=')) return makeToken(TOKEN_PLUS_EQUAL);
else return makeToken(TOKEN_PLUS);
case '/':
if (match('/')) {
// 注释以'//'开头, 一直到行尾
while (peek() != '\n' && !isAtEnd()) advance();
return scanToken();
}
else if (match('*')) {
// 处理多行注释
while (!isAtEnd()) {
if (peek() == '*' && peekNext() == '/') {
advance(); advance();
return scanToken();
}
else advance();
}
return errorToken("Unterminated multi-line comment.");
}
else return makeToken(TOKEN_SLASH);
case '*': return makeToken(TOKEN_STAR);
case '%': return makeToken(TOKEN_PERCENT);
case '<':
if (match('=')) return makeToken(TOKEN_LESS_EQUAL);
else return makeToken(TOKEN_LESS);
case '>':
if (match('=')) return makeToken(TOKEN_GREATER_EQUAL);
else return makeToken(TOKEN_GREATER);
case '!':
if (match('=')) return makeToken(TOKEN_BANG_EQUAL);
else return makeToken(TOKEN_BANG);
case '=':
if (match('=')) return makeToken(TOKEN_EQUAL_EQUAL);
else return makeToken(TOKEN_EQUAL);
// various-character tokens
case '"': return string();
case '\'': return character();
}
return errorToken("Unexpected character.");
}
//main.c
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "scanner.h"
// Tokenizes source and prints one line per token, up to and including
// TOKEN_EOF. The line number is printed only for the first token of each
// source line; later tokens on the same line get a '|' marker instead.
// Each token is shown as its numeric type followed by its text in quotes.
static void run(const char* source) {
    initScanner(source);
    int lastLine = -1;
    Token token;
    do {
        token = scanToken();
        if (token.line == lastLine) {
            printf(" | ");
        }
        else {
            printf("%4d ", token.line);
            lastLine = token.line;
        }
        printf("%2d '%.*s'\n", token.type, token.length, token.start);
    } while (token.type != TOKEN_EOF);
}
// Interactive mode ("read evaluate print loop"): reads one line at a time
// from stdin, tokenizes it and prints the tokens.
// Returns when stdin reaches end-of-file or a read error occurs.
// Input is read with fgets, which never writes past the buffer; a line longer
// than the buffer is processed in several pieces.
static void repl() {
    char str[200] = { '\0' };
    while (fgets(str, sizeof str, stdin) != NULL) {
        // Drop the trailing newline kept by fgets so the scanner sees exactly
        // the text of the line.
        str[strcspn(str, "\n")] = '\0';
        initScanner(str);
        int line = 0;
        for (;;) {
            Token token = scanToken();
            if (token.line != line) {
                printf("%4d ", token.line);
                line = token.line;
            }
            else {
                printf(" | ");
            }
            printf("%2d '%.*s'\n", token.type, token.length, token.start);
            if (token.type == TOKEN_EOF) break;
        }
    }
}
// Reads the whole file at path into a freshly allocated, NUL-terminated
// buffer and returns it. The caller owns the buffer and must free() it.
// The file is opened in binary mode so the size reported by ftell matches the
// number of bytes read.
// On any failure (open, seek, size query, allocation, read) a message is
// written to stderr and the process exits with status 1.
static char* readFile(const char* path) {
    FILE* fp = fopen(path, "rb");
    if (!fp) {
        fprintf(stderr, "open %s failed\n", path);
        exit(1);
    }
    // Determine the file size.
    if (fseek(fp, 0L, SEEK_END) != 0) {
        fprintf(stderr, "Error: cannot seek in %s\n", path);
        fclose(fp);
        exit(1);
    }
    long fileSize = ftell(fp);
    if (fileSize < 0) {
        // ftell reports failure with -1; using it as a size would index
        // before the start of the buffer.
        fprintf(stderr, "Error: cannot determine size of %s\n", path);
        fclose(fp);
        exit(1);
    }
    // Back to the beginning of the file.
    rewind(fp);
    char* result = malloc((size_t)fileSize + 1);
    if (!result) {
        fprintf(stderr, "Error: malloc failed in readFile\n");
        fclose(fp);
        exit(1);
    }
    size_t bytesRead = fread(result, 1, (size_t)fileSize, fp);
    if (bytesRead < (size_t)fileSize && ferror(fp)) {
        fprintf(stderr, "Error: reading %s failed\n", path);
        free(result);
        fclose(fp);
        exit(1);
    }
    fclose(fp);
    // Terminate after the bytes actually read.
    result[bytesRead] = '\0';
    return result;
}
// File mode: reads the file at path, tokenizes its whole contents and prints
// the tokens. The buffer returned by readFile() is released afterwards.
static void runFile(const char* path) {
    char* todostr = readFile(path);
    run(todostr);
    free(todostr);
}
// Entry point of the scanner driver.
//   scanner        - no argument: interactive mode
//   scanner file   - one argument: tokenize that file
// Any other argument count prints a usage message and exits with status 1.
int main(int argc, const char* argv[]) {
    switch (argc) {
    case 1:
        repl();
        break;
    case 2:
        runFile(argv[1]);
        break;
    default:
        fprintf(stderr, "Usage: scanner [path]\n");
        exit(1);
    }
    return 0;
}
//'wc' (word count)是Unix下的一个工具, 它可以统计一个文本文件中字符的个
//数,单词的个数以及行数 (其中我们也统计不可打印的字符和空白字符,单词是以
//空白字符分割的)。请实现一个 'wc' 程序,当传入的参数个数不对时,请给出提
//示信息。
#define _CRT_SECURE_NO_WARNINGS
#include<stdlib.h>
#include<stdio.h>
#include<ctype.h>
// Word-scanning state used by the character loop in main.
typedef enum {
INSIDE, // the previous character was not whitespace
OUTSIDE // the previous character was whitespace, or nothing has been read yet
}State;
// wc: prints the number of characters, words and lines in the file named by
// argv[1]. Every character read counts, including whitespace and
// non-printable ones. A word is a maximal run of non-whitespace characters.
// A line is counted for each '\n'.
// Exits with status 1 on a wrong argument count or when the file cannot be
// opened.
int main(int argc, char* argv[])
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: wc file\n");
        exit(1);
    }
    FILE* fp = fopen(argv[1], "r");
    if (!fp)
    {
        fprintf(stderr, "Open %s failed\n", argv[1]);
        exit(1);
    }
    int c;
    int characters = 0, words = 0, lines = 0;
    State state = OUTSIDE;
    while ((c = fgetc(fp)) != EOF)
    {
        characters++;
        if (c == '\n')
        {
            lines++;
        }
        if (isspace(c))
        {
            // Leaving a word: count it once, at its end.
            if (state == INSIDE)
            {
                words++;
            }
            state = OUTSIDE;
        }
        else
        {
            state = INSIDE;
        }
    }
    // A file that does not end in whitespace still ends its last word.
    if (state == INSIDE)
    {
        words++;
    }
    fclose(fp);
    printf("characters:%d,words:%d,lines:%d\n", characters, words, lines);
    return 0;
}