LCC编译器的源程序分析(4)(5)

最新推荐文章于 2015-05-25 17:05:32 发布

zheng80037

最新推荐文章于 2015-05-25 17:05:32 发布

阅读量925

点赞数

文章标签：编译器 null buffer preprocessor input file

/***********************************
*作者：蔡军生
*出处： http://blog.csdn.net/caimouse/
************************************/
LCC编译器的源程序分析(4)处理文件参数
上面已经介绍选择不同的目标输出的参数处理，那么接着下来，自然的事情就是处理剩下的两个参数的问题，当然LCC是可以处理更多其它参数的，但这里只介绍两个文件参数的处理。命令行如下：
rcc.exe -target=x86/nasm hello.i hello.asm
其中hello.i是输入文件，hello.asm是输出文件。那么LCC是怎么样打开输入文件和输出文件呢？输入文件又有什么技巧呢？要仔细地理解源程序，就知道它的输入处理是非常高效的。
当选择合适的目标输出后，就调用下面的函数来处理：
//
init(argc, argv);
这个函数就是用来处理其它参数的。它的源程序如下：
#001 void init(int argc, char *argv[])
#002 {
#003 {
#004         extern void input_init(int, char *[]);
#005         input_init(argc, argv);
#006 }
#007
#008 {
#009         extern void main_init(int, char *[]);
#010         main_init(argc, argv);
#011 }
#012
#013 {
#014         extern void type_init(int, char *[]);
#015         type_init(argc, argv);
#016 }
#017 }
第1行代码里传入了命令行的参数。
第5行代码调用input_init进行输入初始化（它内部会调用main_init来处理参数）。如果main_init在第5行的调用过程中还没有被执行，那么第10行会调用它进行参数处理；如果已经执行过，它会直接返回。
第15行调用函数type_init进行类型初始化，比如C缺省的数据类型初始化，比如int类型，就初始化为4字节的有符号类型，还有很多其它C默认的类型定义。

先来分析函数input_init的源程序是做什么工作的，下面就是它的程序：
#001 void input_init(int argc, char *argv[])
#002 {
#003 static int inited;
#004
#005 if (inited)
#006         return;
#007
#008 inited = 1;
#009 main_init(argc, argv);
#010
#011 limit = cp = &buffer[MAXLINE+1];
#012 bsize = -1;
#013 lineno = 0;
#014 file = NULL;
#015
#016 fillbuf();
#017 if (cp >= limit)
#018         cp = limit;
#019
#020 nextline();
#021 }

第5行处理是否初始化，因为只允许初始化一次。第8行设置初始化变量为1，让这段代码不要运行两次。
第9行调用主要参数处理函数。后面再接着介绍。
第11行让当前字符指针cp和缓冲区结束指针limit都指向buffer[MAXLINE+1]，也就是输入缓冲区中数据区的起始位置，表示此时缓冲区里还没有数据。
第12行初始化读取文件块大小为-1，也就是读取文件失败的状态。
第13行设置分析的C程序行号为0。
第14行设置当前输入文件名称为空。
第16行是从输入文件里读取数据到输入缓冲区，同时设置当前处理的指针。
第17行判断当前指针cp是否大于等于缓冲区结束指针limit，如果是，就在第18行把cp设置为limit。
第20行读取下一行源程序到缓冲区里。

调用函数main_init主要处理参数，并且打开输入的文件和输出的文件。它的程序如下：
#001 void main_init(int argc, char *argv[])
#002 {
#003 char *infile = NULL, *outfile = NULL;
#004 int i;
#005 static int inited;
#006
#007 if (inited)
#008         return;
#009
#010 inited = 1;
#011 for (i = 1; i < argc; i++)
#012         if (strcmp(argv[i], "-g") == 0 || strcmp(argv[i], "-g2") == 0)
#013               glevel = 2;
#014         else if (strncmp(argv[i], "-g", 2) == 0)
#015         {    /* -gn[,x] */
#016               char *p = strchr(argv[i], ',');
#017               glevel = atoi(argv[i]+2);
#018               if (p)
#019               {
#020                    comment = p + 1;
#021                    if (glevel == 0)
#022                          glevel = 1;
#023                    if (stabIR.stabline == NULL)
#024                    {
#025                          stabIR.stabline = IR->stabline;
#026                          stabIR.stabend = IR->stabend;
#027                          IR->stabline = stabline;
#028                          IR->stabend = stabend;
#029                    }
#030               }
#031         }
#032         else if (strcmp(argv[i], "-x") == 0)
#033               xref++;
#034         else if (strcmp(argv[i], "-A") == 0)
#035         {
#036               ++Aflag;
#037         }
#038         else if (strcmp(argv[i], "-P") == 0)
#039               Pflag++;
#040         else if (strcmp(argv[i], "-w") == 0)
#041               wflag++;
#042         else if (strcmp(argv[i], "-v") == 0)
#043               fprint(stderr, "%s %s/n", argv[0], rcsid);
#044         else if (strncmp(argv[i], "-s", 2) == 0)
#045               density = strtod(&argv[i][2], NULL);
#046         else if (strncmp(argv[i], "-errout=", 8) == 0)
#047         {
#048               FILE *f = fopen(argv[i]+8, "w");
#049               if (f == NULL)
#050               {
#051                    fprint(stderr, "%s: can't write errors to `%s'/n", argv[0], argv[i]+8);
#052                    exit(EXIT_FAILURE);
#053               }
#054
#055               fclose(f);
#056                f = freopen(argv[i]+8, "w", stderr);
#057               assert(f);
#058         }
#059         else if (strncmp(argv[i], "-e", 2) == 0)
#060         {
#061               int x;
#062               if ((x = strtol(&argv[i][2], NULL, 0)) > 0)
#063                    errlimit = x;
#064         }
#065         else if (strncmp(argv[i], "-little_endian=", 15) == 0)
#066               IR->little_endian = argv[i][15] - '0';
#067        else if (strncmp(argv[i], "-mulops_calls=", 18) == 0)
#068               IR->mulops_calls = argv[i][18] - '0';
#069         else if (strncmp(argv[i], "-wants_callb=", 13) == 0)
#070               IR->wants_callb = argv[i][13] - '0';
#071         else if (strncmp(argv[i], "-wants_argb=", 12) == 0)
#072               IR->wants_argb = argv[i][12] - '0';
#073         else if (strncmp(argv[i], "-left_to_right=", 15) == 0)
#074               IR->left_to_right = argv[i][15] - '0';
#075         else if (strncmp(argv[i], "-wants_dag=", 11) == 0)
#076               IR->wants_dag = argv[i][11] - '0';
#077         else if (*argv[i] != '-' || strcmp(argv[i], "-") == 0)
#078         {
#079               if (infile == NULL)
#080                    infile = argv[i];
#081               else if (outfile == NULL)
#082                    outfile = argv[i];
#083         }
#084
#085         if (infile != NULL && strcmp(infile, "-") != 0
#086               && freopen(infile, "r", stdin) == NULL)
#087         {
#088               fprint(stderr, "%s: can't read `%s'/n", argv[0], infile);
#089               exit(EXIT_FAILURE);
#090         }
#091
#092         if (outfile != NULL && strcmp(outfile, "-") != 0
#093               && freopen(outfile, "w", stdout) == NULL)
#094         {
#095               fprint(stderr, "%s: can't write `%s'/n", argv[0], outfile);
#096               exit(EXIT_FAILURE);
#097         }
#098 }

第7行到第10行，同样是让这个函数只运行一次的代码。
第79行到第82行是读取输入文件和输出文件的名称。
第85行到第90行是打开输入的文件，并处理出错的情况。
第92行到第97行是打开输出的文件，并处理出错的情况。
其它代码就是处理其它参数的功能，这里就不详细地介绍了。
OK，到这里就已经把输入的文件和输出的文件打开，准备好处理源程序的基础了。由于在函数input_init里已经调用main_init，后面再调用它时就不会再重复处理了。

下面再来看看函数input_init里调用的两个函数fillbuf和nextline。先来看函数fillbuf：
#001 void fillbuf(void)
#002 {
#003 if (bsize == 0)
#004         return;
#005
#006 if (cp >= limit)
#007         cp = &buffer[MAXLINE+1];
#008 else
#009 {
#010         int n = limit - cp;
#011         unsigned char *s = &buffer[MAXLINE+1] - n;
#012         assert(s >= buffer);
#013         line = (char *)s - ((char *)cp - line);
#014         while (cp < limit)
#015               *s++ = *cp++;
#016
#017         cp = &buffer[MAXLINE+1] - n;
#018 }
#019
#020 if (feof(stdin))
#021         bsize = 0;
#022 else
#023         bsize = fread(&buffer[MAXLINE+1], 1, BUFSIZE, stdin);
#024
#025 if (bsize < 0)
#026 {
#027         error("read error/n");
#028         exit(EXIT_FAILURE);
#029 }
#030 limit = &buffer[MAXLINE+1+bsize];
#031 *limit = '/n';
#032 }

第3行处理读取数据为0的情况，这时就返回去，因为没有数据处理。
第6行处理缓冲区里的数据已经全部分析完的情况，这时只需要把cp重新指向数据区的起始位置。否则，缓冲区尾部还剩下一部分没有分析完的字符，而这时又需要从文件里读取新的数据，那么就需要把这些剩余的字符移到数据区起始位置之前的行缓冲区里，这样它们就可以和新读入的数据拼接在一起进行处理了，第10行到17行就是做这样的事情。
第20行是判断是否读完文件，不是的话，在第23行里就从文件读取最多BUFSIZE个字节的数据到缓冲区里。
第30行调整缓冲区最后的指针，它是指向缓冲区的尾部的。
上面就实现了缓冲文件的输入，并且处理文件的顺序识别，当然也限制了一行代码是512个字节的大小，这也是C标准里定义一行代码最大的大小，所以写C程序时，一行代码是不能超过512个字节的。

已经分析了这么多内容，下一节再分析nextline吧。

LCC编译器的源程序分析(5)行号同步与类型初始化
上面已经介绍打开文件输入，并且分析了读取到缓冲区里的代码，接着下来就是分析行号同步的处理，还有类型初始化。

先来看看生成中间文件hello.i中的源程序，在它的第1行和第2行如下：
#001 #line 1 "hello.c"
#002 #line 1 "include/stdio.h"
#003
#004
#005
#006
#007 typedef unsigned int size_t;

这样的源程序是怎么样被处理的呢？像＃line参数就是用来识别文件的行号同步和文件名称的。现在就来分析函数nextline，它就会处理这样的源程序，让行号同步和源程序的文件名称也同步更新，这样就可以定位源程序出错时所在的位置，比如在编译C程序时就可以看到在某行某列出错，然后双击鼠标，就可以跑到相应的源程序位置进行查看和修改了。如下所示：
Warning 1    warning C4101: 'dst' : unreferenced local variable   g:/cnasm/cncc/src/alpha.c 4798

nextline函数的源程序如下：
#001 void nextline(void)
#002 {
#003 do
#004 {
#005         if (cp >= limit)
#006         {
#007               fillbuf();
#008               if (cp >= limit)
#009                    cp = limit;
#010               if (cp == limit)
#011                    return;
#012         }
#013         else
#014         {
#015               lineno++;
#016               for (line = (char *)cp; *cp==' ' || *cp=='/t'; cp++)
#017                    ;
#018
#019               if (*cp == '#')
#020               {
#021                    resynch();
#022                    nextline();
#023               }
#024         }
#025
#026 } while (*cp == '/n' && cp == limit);
#027 }

第5行到第12行是分析缓冲区内容完成后，重新读取文件数据到缓冲区里。
第15行是增加源程序的行号，它就是用来表示记号所在的行号。
第16行是跳过连续的空格和制表符。
第19行到23行是处理行号同步和文件同步，后面接着分析它。
第26行是处理一行代码完成，再处理下一行代码。

下面接着看函数resynch，它是进行＃开始的参数处理：
#001 static void resynch(void)
#002 {
#003 for (cp++; *cp == ' ' || *cp == '/t'; )
#004         cp++;
#005
#006 if (limit - cp < MAXLINE)
#007         fillbuf();
#008
#009 if (strncmp((char *)cp, "pragma", 6) == 0)
#010 {
#011         cp += 6;
#012         pragma();
#013 }
#014 else if (strncmp((char *)cp, "ident", 5) == 0)
#015 {
#016         cp += 5;
#017         ident();
#018 }
#019 else if (*cp >= '0' && *cp <= '9')
#020 {
#021 line:     for (lineno = 0; *cp >= '0' && *cp <= '9'; )
#022               lineno = 10*lineno + *cp++ - '0';
#023         lineno--;
#024         while (*cp == ' ' || *cp == '/t')
#025               cp++;
#026
#027         if (*cp == '"')
#028         {
#029               file = (char *)++cp;
#030               while (*cp && *cp != '"' && *cp != '/n')
#031                    cp++;
#032               file = stringn(file, (char *)cp - file);
#033               if (*cp == '/n')
#034                    warning("missing /" in preprocessor line/n");
#035               if (firstfile == 0)
#036                    firstfile = file;
#037         }
#038 }
#039 else if (strncmp((char *)cp, "line", 4) == 0)
#040 {
#041         for (cp += 4; *cp == ' ' || *cp == '/t'; )
#042               cp++;
#043         if (*cp >= '0' && *cp <= '9')
#044               goto line;
#045         if (Aflag >= 2)
#046               warning("unrecognized control line/n");
#047 }
#048 else if (Aflag >= 2 && *cp != '/n')
#049         warning("unrecognized control line/n");
#050
#051 while (*cp)
#052         if (*cp++ == '/n')
#053               if (cp == limit + 1)
#054               {
#055                    nextline();
#056                    if (cp == limit)
#057                          break;
#058               }
#059               else
#060                    break;
#061 }
#062
第3行、第4行删除空格和制表符。
第6、7行是把行缓冲区填满。
第9行到第13行是处理参数pragma。
第14到第18行是处理参数ident。
第39行到第47行是处理line参数，然后跳到第21行的标号line里处理行号识别。比如下面的代码：
#line 1 "hello.c"
就是识别出#后，运行上面的函数，然后就识别出来line字符串，接着就跳到标号line处进行处理，把后面的字符串1识别出来，把它转换为10进制值赋值给行号变量lineno。
第27行到第37行是识别后面的文件字符串hello.c，赋值给file。
这样就可以把上面的行号源程序处理完成，得到当前文件名称和当前行号，定位到源程序出错的位置了。

      处理完上面的行号源程序后，就会调用类型初始化，如下：
      type_init(argc, argv);

类型初始化，其实就是设置C编译器内部预先定义的基本类型。下面就来看看具体是怎么样的。
#001 void type_init(int argc, char *argv[])
#002 {
#003 static int inited;
#004 int i;
#005
#006 if (inited)
#007         return;
#008
#009 inited = 1;
#010 if (!IR)
#011         return;
#012
#013 for (i = 1; i < argc; i++)
#014 {
#015         int size, align, outofline;
#016         if (strncmp(argv[i], "-unsigned_char=", 15) == 0)
#017               IR->unsigned_char = argv[i][15] - '0';
#018
#019 #define xx(name) /
#020         else if (sscanf(argv[i], "-" #name "=%d,%d,%d", &size, &align, &outofline) == 3) { /
#021               IR->name.size = size; IR->name.align = align; /
#022               IR->name.outofline = outofline; }
#023 xx(charmetric)
#024 xx(shortmetric)
#025 xx(intmetric)
#026 xx(longmetric)
#027 xx(longlongmetric)
#028 xx(floatmetric)
#029 xx(doublemetric)
#030 xx(longdoublemetric)
#031 xx(ptrmetric)
#032 xx(structmetric)
#033 #undef xx
#034 }
#035
#036 #define xx(v,name,op,metrics) v=xxinit(op,name,IR->metrics)
#037 xx(chartype,        "char",              IR->unsigned_char ? UNSIGNED : INT,charmetric);
#038 xx(doubletype,      "double",            FLOAT,   doublemetric);
#039 xx(floattype,       "float",             FLOAT,   floatmetric);
#040 xx(inttype,         "int",               INT,     intmetric);
#041 xx(longdouble,      "long double",       FLOAT,   longdoublemetric);
#042 xx(longtype,        "long int",          INT,     longmetric);
#043 xx(longlong,        "long long int",     INT,     longlongmetric);
#044 xx(shorttype,       "short",             INT,     shortmetric);
#045 xx(signedchar,      "signed char",       INT,     charmetric);
#046 xx(unsignedchar,    "unsigned char",     UNSIGNED,charmetric);
#047 xx(unsignedlong,    "unsigned long",     UNSIGNED,longmetric);
#048 xx(unsignedshort,   "unsigned short",    UNSIGNED,shortmetric);
#049 xx(unsignedtype,    "unsigned int",      UNSIGNED,intmetric);
#050 xx(unsignedlonglong,"unsigned long long",UNSIGNED,longlongmetric);
#051 #undef xx
#052
#053 {
#054         Symbol p;
#055         p = install(string("void"), &types, GLOBAL, PERM);
#056         voidtype = type(VOID, NULL, 0, 0, p);
#057         p->type = voidtype;
#058 }
#059
#060 pointersym = install(string("T*"), &types, GLOBAL, PERM);
#061 pointersym->addressed = IR->ptrmetric.outofline;
#062 pointersym->u.limits.max.p = (void*)ones(8*IR->ptrmetric.size);
#063 pointersym->u.limits.min.p = 0;
#064 voidptype = ptr(voidtype);
#065 funcptype = ptr(func(voidtype, NULL, 1));
#066 charptype = ptr(chartype);
#067 #define xx(v,t) if (v==NULL && t->size==voidptype->size && t->align==voidptype->align) v=t
#068 xx(unsignedptr,unsignedshort);
#069 xx(unsignedptr,unsignedtype);
#070 xx(unsignedptr,unsignedlong);
#071 xx(unsignedptr,unsignedlonglong);
#072 if (unsignedptr == NULL)
#073         unsignedptr = type(UNSIGNED, NULL, voidptype->size, voidptype->align, voidptype->u.sym);
#074 xx(signedptr,shorttype);
#075 xx(signedptr,inttype);
#076 xx(signedptr,longtype);
#077 xx(signedptr,longlong);
#078 if (signedptr == NULL)
#079         signedptr = type(INT, NULL, voidptype->size, voidptype->align, voidptype->u.sym);
#080 #undef xx
#081 widechar = unsignedshort;
#082
#083 for (i = 0; i < argc; i++)
#084 {
#085 #define xx(name,type) /
#086         if (strcmp(argv[i], "-wchar_t=" #name) == 0) /
#087               widechar = type;
#088         xx(unsigned_char,unsignedchar)
#089         xx(unsigned_int,unsignedtype)
#090         xx(unsigned_short,unsignedshort)
#091 }
#092 #undef xx
#093 }

上面的代码主要把所有缺省的类型创建到一个表数据types里，把char、double、float、int类型初始化到那个表里。每个类型定义如下：
#001 struct type
#002 {
#003 int op;
#004 Type type;
#005 int align;
#006 int size;
#007 union
#008 {
#009         Symbol sym;
#010         struct
#011         {
#012               unsigned oldstyle:1;
#013               Type *proto;
#014         } f;
#015 } u;
#016 Xtype x;
#017 };
主要有类型对齐方式，类型占用大小，还有扩展类型。在C里，类型是很重要的，因为所有变量都是基于数据类型声明的，不同类型的属性不一致，这些都需要进行比较的。定义了这样的类型表，就比较好查找变量的类型，同时也可以节省编译时的存储空间。
到这里，就把类型初始化理解完成，C语言是强类型的语言，所有变量和函数都需要先声明后使用，并且不同的类型不等价的，相互之间需要进行转换。

现在已经把C编译器的初始化工作准备完成了，后面就开始读取源程序里的记号进行处理，也就是进入词法分析阶段，越来越精彩了。

zheng80037

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
LCC编译器的源程序分析(4)(5)

/*********************************** *作者：蔡军生 *出处：http://blog.csdn.net/caimouse/ ************************************/ LCC编译器的源程序分析(4)处理文件参数上面已经介绍选择不同的目标输出的参数处理，那么接着下来，自然的事情就是处理剩下的两个参数的问题，当然LCC是可以处
复制链接

扫一扫