代码: main.c instructions.c error_report.c analyze.c instructions.h error_report.h analyze.h main.c #include <stdio.h> #include <string.h> #include <stdlib.h> #include "analyze.h" #include "instructions.h" int main(int argc, char **argv) { if (argc != 2) { fprintf(stderr, "options not suitable, please use -h for help/n"); exit(EXIT_FAILURE); } else if (strcmp(argv[1], "-h") == 0) { instructions(); } else { lex(argv[1]); } return 0; } instructions.h /*instructions about this program*/ #ifndef INSTRUCTIONS_H #define INSTRUCTIONS_H void instructions(void); instructions.c /*instructions about this program*/ #include <stdio.h> #include "instructions.h" void instructions(void) { printf("This is the help file. -h for help/n" "if you want to analysis a c file, please enter like" "this: lex_analyze file.c, less or more parameters will receive warning/n" "version 1.0, lexical analyze, created by guozan-bupt/n"); } #endif error_report.h #ifndef ERROR_REPORT_H #define ERROR_REPORT_H void error_report(void); #endif error_report.c #include <stdio.h> #include "error_report.h" #include "analyze.h" void error_report(void) { printf("Error occur in analyze, line %ld/n", linetotal_g); } analyze.h /*lexical analysis*/ #ifndef ANALYZE_H #define ANALYZE_H #include <limits.h> //fill which part of the buffer #define LEFT 1 #define RIGHT 0 #define BUF_SIZE 1024 char buffer_g[BUF_SIZE * 2]; //used to place characters read from source file //char *start_ptr_g; //use to identify the letter, same with forward_ptr char *forward_ptr_g; FILE *srcfile_ptr_g; #define TOKEN_SIZE 64 //char token_g[TOKEN_SIZE]; long long linetotal_g; //total number of lines long long wordtotal_g; //total number of words long long chartotal_g; //total number of characters int reloaded_g; //回退的时候,如果在边界位置,reloaded可以标记此区已被填充,下次next不用再填充 int side_g; //当前forward_ptr所在的位置,左边或者右边 void lex(const char *src_file); void reload(void); void succeed_exit(void); int iskey(char *maykey); void count(char ch); char getnext(void); void fallback(void); void table_insert(const char *mark, const char *property); void stringappend(char * const str, const char ch); #endif analyze.c /*lexical analysis*/ #include <stdio.h> #include <stdlib.h> #include <ctype.h> #include <string.h> #include "analyze.h" #include "error_report.h" void lex(const char *src_file) { char ch; char token[TOKEN_SIZE]; memset(buffer_g, 0, BUF_SIZE); //memset(token, 0, TOKEN_SIZE); forward_ptr_g = buffer_g; reloaded_g = 0; linetotal_g = 1; printf("Analyze started/n"); if ((srcfile_ptr_g = fopen(src_file, "r")) == NULL) { fprintf(stderr, "file %s: line %d/nfile %s ", __FILE__, __LINE__, src_file); perror("open failed"); exit(EXIT_FAILURE); } if (!feof(srcfile_ptr_g)) { buffer_g[fread(buffer_g, sizeof(char), BUF_SIZE-1, srcfile_ptr_g)] = EOF; side_g = LEFT; } while (1) { //在循环中会不断的从文件中读数据,文件中没有数据后会调用succeed_exit退出 memset(token, 0, TOKEN_SIZE); count(ch = getnext()); switch (ch) { default: //space character, like /t /n ' ' if (isspace(ch)) { while (isspace(ch)) { count(ch = getnext()); //printf("space %d/n", ch); } fallback(); } //id or key else if (isalpha(ch) || ch == '_') { while (isalpha(ch) || ch == '_' || isdigit(ch)) { stringappend(token, ch); count(ch = getnext()); } if (ch == '!' || ch == '</p> || ch == '%' || ch == '#' || ch == '@') { error_report(); } if (iskey(token)) { table_insert(token, "-"); } else { table_insert("id", token); } count('/0'); fallback(); } //num else if (isdigit(ch)) { //读入小数点前的数字 while (isdigit(ch)) { stringappend(token, ch); count(ch = getnext()); } //读入小数点后的数字 if (ch == '.') { do { stringappend(token, ch); count(ch = getnext()); } while (isdigit(ch)); //科学计数法 if (ch == 'e' || ch == 'E') { stringappend(token, ch); count(ch = getnext()); if (ch == '+' || ch == '-') { do { stringappend(token, ch); count(ch = getnext()); } while (isdigit(ch)); } else { error_report(); } } //识别科学计数法中的错误 else if (!(isspace(ch) || ch == '|' || ch == '&')) { error_report(); } count('/0'); } if (isalpha(ch)) { error_report(); } table_insert("num", token); fallback(); } else { printf("unincluded character %c/n", ch); } break; case '>': count(ch = getnext()); if (ch == '=') { // >= table_insert("relop", "GE"); } else if (ch == '>') { // >> table_insert(">>", "-"); } else { // > table_insert("relop", "GT"); if (!isspace(ch) && !isalpha(ch)) { error_report(); } fallback(); } count('/0'); break; case '<': count(ch = getnext()); if (ch == '=') { // <= table_insert("relop", "LE"); } else if (ch == '<') { // << table_insert("<<", "-"); } else { // < table_insert("relop", "LT"); if (!isspace(ch) && !isalpha(ch)) { error_report(); } fallback(); } count('/0'); break; case '=': count(ch = getnext()); if (ch == '=') { // == table_insert("relop", "EQ"); } else { // = table_insert("assign-op", "-"); fallback(); } count('/0'); break; case '+': count(ch = getnext()); if (ch == '+') { // ++ table_insert("++", "-"); } else if (ch == '=') { // += table_insert("+=", "-"); } else { // + table_insert("+", "-"); fallback(); } count('/0'); break; case '-': count(ch = getnext()); if (ch == '-') { // -- table_insert("--", "-"); } else if (ch == '=') { // -= table_insert("-=", "-"); } else { // - table_insert("-", "-"); fallback(); } count('/0'); break; case '*': count(ch = getnext()); if (ch == '=') { // *= table_insert("*=", "-"); } else { // * table_insert("*", "-"); fallback(); } count('/0'); break; case '|': count(ch = getnext()); if (ch == '|') { // || table_insert("||", "-"); } else if (ch == '=') { // |= table_insert("|=", "-"); } else { // | table_insert("|", "-"); fallback(); } count('/0'); break; case '&': count(ch = getnext()); if (ch == '&') { // && table_insert("&&", "-"); } else if (ch == '=') { // &= table_insert("&=", "-"); } else { // & table_insert("&", "-"); fallback(); } count('/0'); break; case '^': count(ch = getnext()); if (ch == '=') { // ^= table_insert("^=", "-"); } else { // ^ table_insert("^", "-"); fallback(); } count('/0'); break; case '!': count(ch = getnext()); if (ch == '=') { // != table_insert("!=", "-"); } else { // ! table_insert("!", "-"); fallback(); } count('/0'); break; case ',': case ';': case '(': case ')': case '[': case ']': case '{': case '}': case ':': case '?': case '"': case '/'': case '//': case '.': case '#': stringappend(token, ch); table_insert(token, "-"); //printf("%c------/n", ch); count('/0'); break; //case '#': //break; case '/': stringappend(token, ch); count(ch = getnext()); if (ch == '/') { // 行注释 printf("/n**********************************/n" "/nremark found:/n"); count(ch = getnext()); while (ch != '/n') { putc(ch, stdout); count(ch = getnext()); } printf("/n**********************************/n"); } else if (ch == '*') { // /* */型注释 printf("/n**********************************/n" "/nremark found:/n"); while (1) { count(ch = getnext()); if (ch == '*') { count(ch = getnext()); if (ch == '/') { break; } else { putc('*', stdout); putc(ch, stdout); } } else { putc(ch, stdout); } } printf("/n**********************************/n"); } else { //符号 '/' table_insert(token, "-"); fallback(); } break; } } } //在程序把文件读完的时候调用 //在reload函数中,只有reload会每次读文件 void succeed_exit(void) { printf("Analyze complete./n"); fclose(srcfile_ptr_g); printf("characters: %ld/n", chartotal_g); printf("words: %ld/n", wordtotal_g); printf("lines: %ld/n", linetotal_g - 1); //printf("characters: %ld/nwords: %ld/nlines: %ld/n", //chartotal_g, wordtotal_g, linetotal_g); //搞笑啊,合在一起输出中间的那个数据就会是0,这样还不错 exit(EXIT_SUCCESS); } //每当 *forwardptr==eof 时,就要重新装填缓冲区 //然后将forwardptr定位在逻辑上的下一字符处 //文件读完了就成功退出程序 void reload(void) { printf("reload/n"); if (!feof(srcfile_ptr_g)) { switch (side_g) { case LEFT: //将读到的数据最后面加上eof,如果fread读到了足够的字符,则就是在左或右边界的地方是eof //如果没有读到足够的字符,肯定是到了文件的结束,此时也在最后加上eof,方便程序的退出 buffer_g[BUF_SIZE + fread(buffer_g + BUF_SIZE, sizeof(char), BUF_SIZE-1, srcfile_ptr_g)] = EOF; ++forward_ptr_g; side_g = RIGHT; break; case RIGHT: buffer_g[fread(buffer_g, sizeof(char), BUF_SIZE-1, srcfile_ptr_g)] = EOF; forward_ptr_g = buffer_g; side_g = LEFT; break; default: fprintf(stderr, "side neither left nor right," "something wrong in function reload./n"); break; } } else { printf("file read to end./n"); succeed_exit(); } } //识别一个单词后,传的参数是'/0' void count(char ch) { if (ch == '/0') { ++wordtotal_g; } else { ++chartotal_g; //printf("into n loop, char %d/n", ch); if (ch == '/n') { ++linetotal_g; printf("/n"); //以源码格式输出情况使用,可以换行 //printf("/nline %ld, next line----------------/n", linetotal_g); //统计模式 //printf("into n loop, linetotal %d/n", linetotal_g); } } } //采取的是先读当前数据,然后移动指针的办法 //get的实际上是当前指针指向的位置的数据,get后指针指向下一个逻辑位置 //例外的是碰到eof,要将Eof清除掉,也就是滤掉 //操作后side就是forward_ptr所在的区间 //对前进和回退的操作,在逻辑上忽略eof的存在,也就是指针从来不指向eof位置 char getnext(void) { if (*forward_ptr_g == EOF) { //到了边界,将forwardptr重新定位到下一个逻辑位置 if (reloaded_g == 0) { //上次没有遭遇回退的情况 reload(); } else { //遭遇回退后再次读下个eof数据的情况,由于上次已经将区间填充,所以不必reload switch (side_g) { case LEFT: ++forward_ptr_g; side_g = RIGHT; break; case RIGHT: forward_ptr_g = buffer_g; side_g = LEFT; break; default: fprintf(stderr, "side neither left nor right," "happend in getnext./n"); break; } reloaded_g = 0; } } return *(forward_ptr_g++); } //指针forward逻辑上后退一个字符 //边界情况:同样忽略eof的存在, //例如b****eof*****a'eof',那么读了a,发现不行,这时forward_ptr已经指向了b //回退后forward_ptr重新指向a void fallback(void) { --chartotal_g; if (forward_ptr_g == buffer_g) { forward_ptr_g = buffer_g + BUF_SIZE*2 - 2; //定位到eof前的那个字符 reloaded_g = 1; side_g = RIGHT; } else { --forward_ptr_g; if (*forward_ptr_g == EOF) { --forward_ptr_g; reloaded_g = 1; side_g = LEFT; } } if (*forward_ptr_g == '/n') { --linetotal_g; //printf("linetotal --"); } } void table_insert(const char *mark, const char *property) { //printf("<%s, %s> ", mark, property); //统计模式 printf("%s ", mark); //源码格式输出 } void stringappend(char * const str, const char ch) { int i = 0; for (i; *(str+i) && i < TOKEN_SIZE; ++i) { } if (i == TOKEN_SIZE - 1) { printf("buffer to short./n"); return ; } *(str + i) = ch; *(str + i + 1) = '/0'; } int iskey(char *maykey) { char *keywords[32] = { "auto", "static", "extern", "register", "int","float", "double", "struct", "char", "break", "continue", "long", "if", "switch", "case", "enum", "typedef", "return", "unio", "const", "unsigned", "for", "signed", "void", "default", "goto", "sizeof", "volatile", "do", "while", "else", "short" }; int i = 0; for (i = 0; i < 32; ++i) { if (strcmp(keywords[i], maykey) == 0) { //keywords return 1; } } return 0; }