input module of lexical analyzer in lcc

[input sub-module of lexical analyzer]
.  requirements
   > speed: as fast as possible
   > impose no arbitrary limit on line length, i.e., lines can be arbitrarily long
   > extract the tokens defined by the C language

. technical implementation tips
  > read input characters in large chunks into a buffer, to reduce I/O accesses and save time
  > the buffer contains '\n' (newline) characters, so it is convenient to scan tokens line by
     line within the buffer; note that a single source line may span two buffer refills
  > most tokens cannot span line boundaries (identifiers and string literals are the
     exceptions) [1], so the question is how to deal with the tokens that can
  > refill the buffer when the remaining characters of a partially scanned token might span
     the buffer boundary, i.e., refill when fewer characters remain than the maximum token
     length; in lcc this length is 32 (MAXTOKEN) -- see the refill() sketch after this list
  > the input module will be used by the lexical analyzer
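
  A minimal C sketch of the refill step described above (simplified, not lcc's actual
  fillbuf()): it assumes the buffer layout shown in the lcc interface section below, that at
  most MAXLINE un-scanned characters remain when it is called, and that infile is an
  already-opened source file.

      #include <stdio.h>
      #include <string.h>

      #define MAXLINE 512
      #define BUFSIZE 4096

      static unsigned char buffer[MAXLINE+1 + BUFSIZE+1];
      unsigned char *cp    = &buffer[MAXLINE+1];   /* current input character       */
      unsigned char *limit = &buffer[MAXLINE+1];   /* one past the last character   */
      static FILE *infile;                         /* source file, opened elsewhere */

      void refill(void) {
          size_t tail;
          if (cp > limit)                    /* cp may have stepped past the sentinel */
              cp = limit;
          tail = limit - cp;                 /* un-scanned characters to preserve     */
          memmove(&buffer[MAXLINE+1] - tail, cp, tail);   /* move the tail just before the fill area */
          cp = &buffer[MAXLINE+1] - tail;
          limit = &buffer[MAXLINE+1] + fread(&buffer[MAXLINE+1], 1, BUFSIZE, infile);
          *limit = '\n';                     /* sentinel: every scanning loop stops here */
      }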

  // according to the implementation tips, write a gettok() function
  // input: source file
  // output: return the token type
  //         {to simplify the problem, tokens are simple punctuation (PUNC), BLANK,
  //          ID, string literal, EOF, ERR}
  // guarantee: the buffer has been filled with BUFFER_SIZE characters
  // auxiliary variables:
  //         buffer[BUFFER_SIZE] : buffer storing characters read from the file
  //         cp                  : current input char, usually points to the start of a
  //                               token or pseudo-token
  //         rcp                 : helper pointer used while scanning a token or pseudo-token
  //         limit               : sentinel of the buffer, i.e., the buffer end
  //         map[256]            : maps a char to its category: PUNC | LETTER | BLANK |
  //                               DIGIT
  ALGORITHM gettok()
        WHILE  true
        DO
              rcp <- cp

              // skip over blanks
              WHILE map[*rcp] & BLANK
              DO
                   rcp <- rcp + 1
              DONE
              cp <- rcp   // points to a non-BLANK character

              CASE *rcp++
                   '\n':
                       cp <- rcp       // rcp has consumed the '\n'; advance cp past it
                       nextline()      // bump the line number, refilling the buffer if the '\n' was the sentinel
                       IF cp = limit   // nothing left to read: end of file
                           return EOF
                       CONTINUE
                   ',' : ';' : '&' : '|' :
                       cp <- rcp;
                       return PUNC;
                   '/':
                       IF *rcp = '*'   // enter comment pseudo-token
                            prev <- 0  // previous character, used to detect the closing "*/"
                            FOR( rcp++; *rcp != '/' || prev != '*'; ) {
                                  IF map[*rcp] & NEWLINE
                                      // a '\n' at limit is only the buffer sentinel, not a real
                                      // source character, so record it as prev only when rcp < limit
                                      IF rcp < limit
                                          prev <- *rcp
                                      cp <- rcp + 1   // advance cp past the '\n'; nextline() refills
                                                      // the buffer when cp >= limit
                                      nextline()
                                      rcp <- cp

                                      IF cp == limit  // refill read nothing: EOF inside the comment
                                          BREAK
                                  ELSE
                                      prev <- *rcp++
                            }

                            IF cp >= limit   // error: unexpected EOF while looking for "*/"
                                  return ERR
                            cp <- rcp + 1    // cp now points past the closing '/', i.e., to the next token
                            BREAK            // skip over the comment and scan the next token
                       ELSE
                            cp++
                            return PUNC
                   'a'-'z': 'A'-'Z': '_':
                       // note: check whether a refill is needed before consuming the remaining
                       //       characters of the ID; the check itself does not consume *rcp
                       IF limit - rcp < MAXTOKEN
                           cp <- rcp - 1   // refill() preserves the characters from cp to limit
                           refill()
                           rcp <- cp + 1   // the first ID character has already been scanned
                       token <- rcp - 1    // mark the beginning of the ID token
                       WHILE map[*rcp] & (DIGIT|LETTER)
                           rcp <- rcp + 1
                       token <- stringn(token, rcp - token)   // save the ID's characters
                                                              // (lcc interns them with stringn())

                       cp <- rcp            // cp now points to the start of the next token
                       return ID
                   default:
                       // *(rcp-1) is the character just consumed by CASE *rcp++
                       IF map[*(rcp-1)] & NEWLINE   // other newline-class chars, e.g. '\v', '\r', '\f'
                           cp <- rcp
                           nextline()
                           IF cp >= limit
                                return EOF
                           CONTINUE
                       return ERR
        DONE
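
  The pseudocode above relies on the map[256] category table. One way it could be built (the
  bit values and the initmap() helper are illustrative choices, not lcc's actual definitions):

      enum { BLANK = 1, NEWLINE = 2, LETTER = 4, DIGIT = 8, PUNC = 16 };

      unsigned char map[256];                 /* indexed by an unsigned character value */

      void initmap(void) {
          int c;
          map[' '] = map['\t'] = BLANK;       /* skipped before every token             */
          map['\n'] = map['\v'] = map['\r'] = map['\f'] = NEWLINE;   /* '\n' is also the sentinel value */
          for (c = 'a'; c <= 'z'; c++) map[c] = LETTER;
          for (c = 'A'; c <= 'Z'; c++) map[c] = LETTER;
          map['_'] = LETTER;
          for (c = '0'; c <= '9'; c++) map[c] = DIGIT;
          map[','] = map[';'] = map['&'] = map['|'] = map['/'] = PUNC;
      }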

. lcc implementation of input module
  (1) interface of input module
       extern unsigned char *cp;     // current input char, usually points to the start of a (pseudo-)token
       extern unsigned char *limit;  // sentinel of the input buffer; *limit is always '\n'

       #define MAXLINE    512
       #define BUFSIZE    4096
       static unsigned char buffer[MAXLINE+1 + BUFSIZE+1];
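
       The MAXLINE+1 prefix is reserved so that a refill can move the un-scanned tail of the
       previous fill in front of the newly read block, and the final +1 byte holds the '\n'
       sentinel written at *limit.  A possible start-up sketch (the function name and FILE*
       parameter are illustrative; lcc's own initialization code differs):

           void input_init_sketch(FILE *f) {
               infile = f;                        /* file used by refill() (see the sketch above) */
               cp = limit = &buffer[MAXLINE+1];   /* empty buffer: cp == limit                    */
               *limit = '\n';                     /* sentinel, so scanning is always safe         */
               lineno = 0;                        /* lineno is declared in the lexer interface    */
               refill();                          /* read the first chunk                         */
           }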

  (2) get the next line, called when *cp++ == '\n'
       // input: a source file, cp, buffer
       // if the consumed '\n' was a real newline inside the buffer, increase the line number;
       // otherwise it was the sentinel, so the next line spans the buffer boundary and a refill is needed
       ALGORITHM nextline()
              IF cp >= limit
                  // the consumed '\n' was the buffer sentinel; refill() stores new characters
                  // starting at &buffer[MAXLINE+1]; if nothing is read, the file has reached EOF
                  refill(buffer)
                  IF cp = limit  // EOF
                      RETURN
              ELSE  // the next line is already inside buffer[]
                  increase line number
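
       A C sketch of the algorithm above, assuming the cp/limit/refill() pieces sketched
       earlier (lcc's real nextline() additionally skips leading blanks and passes "#" lines
       to resynch()):

           void nextline(void) {
               if (cp >= limit) {       /* the consumed '\n' was the buffer sentinel   */
                   refill();            /* new characters start at &buffer[MAXLINE+1]  */
                   if (cp == limit)     /* nothing was read: end of file               */
                       return;
               } else {
                   lineno++;            /* a real newline: just count the line         */
               }
           }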

[lexical analyzer module]

. token types in C after preprocessing
  1) identifiers including keywords
  2) numbers
  3) character constants
      e.g., 'a', '\t'
  4) string literals
     > "hello world"
     > L"hello world"   // wide string, e.g. for representing Chinese or Japanese text
     > "hello" "world"  // ok, adjacent string literals are concatenated
     > ""               // ok, empty string
     > "hello
       world"           // error: a raw newline may not appear inside a string literal
     > "hello \         // there is a '\n' right after the backslash '\'
           tworld"      // ok, no error is reported: the backslash-newline is removed by line
                        // splicing before the lexer runs, so the literal becomes one logical line

  5) punctuation and compound punctuation
       e.g., "*", "+", "&&", "|"
  6) comments -- PSEUDO token
  7) blanks -- PSEUDO token
  8) line directives or "#pragma ..." -- PSEUDO tokens, but they change the coordinate (file/line)
      info recorded for the symbols that follow (example below)
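
  An example for item 8: the directive itself produces no token, but it changes the coordinates
  recorded for the symbols that follow (resynch() in input.c handles this):

      #line 100 "original.c"    /* or, in preprocessor output, the form: # 100 "original.c" */
      int x;                    /* diagnostics here are reported at original.c:100, not at
                                   the physical position in the preprocessed file           */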

. interface
  extern char *file;        // the file in which the current token appears
  extern char *firstfile;   // the file named by the first line directive (its exact use is unclear to me)
  extern int lineno;        // the line on which the current token appears
  extern char *token;       // the characters of the current token (e.g. an identifier, number, or string literal)
  extern Symbol tsym;       // symbol-table entry for the current token: identifier, string literal,
                            // number, or built-in type

  int gettok(void);
  int getchr(void);
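
  A minimal sketch of a client of this interface: dump the coordinates of every token until end
  of input. EOI is assumed here to stand for lcc's end-of-input token code (the real token codes
  come from lcc's token.h; the value 0 below is only a placeholder):

      #include <stdio.h>

      extern char *file;
      extern int lineno;
      int gettok(void);

      enum { EOI = 0 };                /* placeholder for lcc's end-of-input token code */

      void dump_tokens(void) {
          int t;
          while ((t = gettok()) != EOI)
              printf("%s:%d: token code %d\n", file ? file : "<stdin>", lineno, t);
      }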

. source code analysis:
  lex analyzer
        |--- input.c
                |-- static void resynch(void);   // process line directives & "#pragma"
                |-- void nextline(void);         // get the next line, refilling the buffer if necessary
                |-- void fillbuf(void);          // refill the buffer (called refill() above), merging the
                                                 // new chunk with the un-scanned part of the previous fill
        |--- lex.c
                |-- int gettok(void)             // get the token that cp points to, consuming its characters
                |-- int getchr(void)             // get the next non-blank character without consuming it

   note:
   1)  void nextline();
        this function is called when *cp++ == '\n'
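
   2)  int getchr();
        a sketch of how it could be written on top of the same machinery (map[]/BLANK, cp,
        limit, nextline() as sketched above); it peeks at the next non-blank character without
        consuming it:

            #include <stdio.h>                 /* for EOF */

            extern unsigned char *cp, *limit;  /* from the input module sketches above */
            extern unsigned char map[];
            extern void nextline(void);
            enum { BLANK = 1 };                /* must match the map[] categories used by gettok() */

            int getchr(void) {
                for (;;) {
                    while (map[*cp] & BLANK)   /* skip blanks in place, consuming no token       */
                        cp++;
                    if (*cp != '\n')
                        return *cp;            /* a significant character: report, don't consume */
                    cp++;                      /* step over the newline (or the buffer sentinel) */
                    nextline();                /* count the line, or refill when at the sentinel */
                    if (cp == limit)
                        return EOF;            /* refill read nothing: end of input              */
                }
            }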


[1] refer to p103
     it makes sense that string literals can span line boundaries, for example:
     const char *str = "i love this \
                        world!";
     Q: give an instance of an identifier spanning line boundaries?
     A: an identifier may be longer than 32 characters, so scanning it may trigger a buffer refill
        when fewer than MAXTOKEN characters remain in the buffer.  (see p105 for the explanation)
