GCC-3.4.6源代码学习笔记（80）

最新推荐文章于 2022-02-25 21:57:21 发布

wuhui_gdnt

最新推荐文章于 2022-02-25 21:57:21 发布

阅读量1.1k

点赞数

分类专栏： GCC-3.4.6源代码学习笔记文章标签： character string struct token c integer

本文链接：https://blog.csdn.net/wuhui_gdnt/article/details/5801050

版权

GCC-3.4.6源代码学习笔记专栏收录该内容

207 篇文章 14 订阅

订阅专栏

5.6.1.1.2.3. 字符、字符串常量

当预处理器碰到字符或字符串常量时，它忠实地记录其内容，但不去解释它，因为预处理器没有掌握其格式、编码的信息。这是这里的函数的任务。

c_lex_with_flags (continue)

357 case CPP_ATSIGN:

…

388 case CPP_OTHER:

389 {

390 cppchar_t c = tok->val.str.text[0];

391

392 if (c == '"' || c == '\'')

393 error ("missing terminating %c character", (int) c);

394 else if (ISGRAPH (c))

395 error ("stray '%c' in program", (int) c);

396 else

397 error ("stray '\\%o' in program", (int) c);

398 }

399 goto retry;

400

401 case CPP_CHAR:

402 case CPP_WCHAR:

403 *value = lex_charconst (tok);

404 break;

405

406 case CPP_STRING:

407 case CPP_WSTRING:

408 return lex_string (tok, value, false);

409 break;

410

411 /* These tokens should not be visible outside cpplib. */

412 case CPP_HEADER_NAME:

413 case CPP_COMMENT:

414 case CPP_MACRO_ARG:

415 abort ();

416

417 default:

418 *value = NULL_TREE;

419 break;

420 }

对于字符常量，其树节点仍旧是INTEGER_CST，不过其内容必须是从原字符翻译过来的目标机器的字符。函数lex_charconst协助这样的转换并创建该节点。

727 static tree

728 lex_charconst (const cpp_token *token) in c-lex.c

729 {

730 cppchar_t result;

731 tree type, value;

732 unsigned int chars_seen;

733 int unsignedp;

734

735 result = cpp_interpret_charconst (parse_in, token,

736 &chars_seen, &unsignedp);

737

738 /* Cast to cppchar_signed_t to get correct sign-extension of RESULT

739 before possibly widening to HOST_WIDE_INT for build_int_2. */

740 if (unsignedp || (cppchar_signed_t) result >= 0)

741 value = build_int_2 (result, 0);

742 else

743 value = build_int_2 ((cppchar_signed_t) result, -1);

744

745 if (token->type == CPP_WCHAR)

746 type = wchar_type_node;

747 /* In C, a character constant has type 'int'.

748 In C++ 'char', but multi-char charconsts have type 'int'. */

749 else if (!c_dialect_cxx () || chars_seen > 1)

750 type = integer_type_node;

751 else

752 type = char_type_node;

753

754 TREE_TYPE (value) = type;

755 return value;

756 }

在前面我们看到，GCC可以在使用EBCDIC编码的宿主机器（host）上运行，同时GCC可以为目标机器提供宽字符支持（可选的有UTF-8，UTF-16LE，UTF-16BE，UTF-32LE，UTF-32BE）。

1328 cppchar_t

1329 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, in cppcharset.c

1330 unsigned int *pchars_seen, int *unsignedp)

1331 {

1332 cpp_string str = { 0, 0 };

1333 bool wide = (token->type == CPP_WCHAR);

1334 cppchar_t result;

1335

1336 /* an empty constant will appear as L'' or '' */

1337 if (token->val.str.len == (size_t) (2 + wide))

1338 {

1339 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");

1340 return 0;

1341 }

1342 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))

1343 return 0;

1344

1345 if (wide)

1346 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);

1347 else

1348 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);

1349

1350 if (str.text != token->val.str.text)

1351 free ((void *)str.text);

1352

1353 return result;

1354 }

在上面1334行，cppchar_t 具有32比特大小，它足够保存unicode（UTF-32）的字符。为了把字符从预处理器传到词法分析器，使用了结构体cpp_string。注意到在这个结构体中预处理器返回的字符是一个字节数组。

158 struct cpp_string in cpplib.h

159 {

160 unsigned int len;

161 const unsigned char *text;

162 };

这个字符被词法分析器处理后，它可能占据不止一个字节。需要一个新的结构体用于转换后的字符。

97 struct _cpp_strbuf in cppcharset.c

98 {

99 uchar *text;

100 size_t asize;

101 size_t len;

102 };

GCC定义了UTF编码之间的转换函数，而系统调用iconv提供了UTF和EBCDIC编码之间的转换。在这些例程的协助下，cpp_interpret_string可以把from中的输入转换为具有期望格式的to。

1133 bool

1134 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, in cppcharset.c

1135 size_t count, cpp_string *to, bool wide)

1136 {

1137 struct _cpp_strbuf tbuf;

1138 const uchar *p, *base, *limit;

1139 size_t i;

1140 struct cset_converter cvt

1141 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;

1142

1143 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);

1144 tbuf.text = xmalloc (tbuf.asize);

1145 tbuf.len = 0;

1146

1147 for (i = 0; i < count; i++)

1148 {

1149 p = from[i].text;

1150 if (*p == 'L') p++;

1151 p++; /* Skip leading quote. */

1152 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */

1153

1154 for (;;)

1155 {

1156 base = p;

1157 while (p < limit && *p != '\\')

1158 p++;

1159 if (p > base)

1160 {

1161 /* We have a run of normal characters; these can be fed

1162 directly to convert_cset. */

1163 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))

1164 goto fail;

1165 }

1166 if (p == limit)

1167 break;

1168

1169 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);

1170 }

1171 }

1172 /* NUL-terminate the 'to' buffer and translate it to a cpp_string

1173 structure. */

1174 emit_numeric_escape (pfile, 0, &tbuf, wide);

1175 tbuf.text = xrealloc (tbuf.text, tbuf.len);

1176 to->text = tbuf.text;

1177 to->len = tbuf.len;

1178 return true;

1179

1180 fail:

1181 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");

1182 free (tbuf.text);

1183 return false;

1184 }

不过转义序列是这些转换函数中的例外，因为预处理器把字符串中的转义序列也忠实地记录下来，例如，\a被原样记为“\a”（即反斜杠和字母a两个字符）。需要先按宿主字符集重新解释它们，然后再转换至目标字符集。

1043 static const uchar *

1044 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, in cppcharset.c

1045 struct _cpp_strbuf *tbuf, bool wide)

1046 {

1047 /* Values of \a \b \e \f \n \r \t \v respectively. */

1048 #if HOST_CHARSET == HOST_CHARSET_ASCII

1049 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };

1050 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC

1051 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };

1052 #else

1053 #error "unknown host character set"

1054 #endif

1055

1056 uchar c;

1057 struct cset_converter cvt

1058 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;

1059

1060 c = *from;

1061 switch (c)

1062 {

1063 /* UCNs, hex escapes, and octal escapes are processed separately. */

1064 case 'u': case 'U':

1065 return convert_ucn (pfile, from, limit, tbuf, wide);

1066

1067 case 'x':

1068 return convert_hex (pfile, from, limit, tbuf, wide);

1069 break;

1070

1071 case '0': case '1': case '2': case '3':

1072 case '4': case '5': case '6': case '7':

1073 return convert_oct (pfile, from, limit, tbuf, wide);

1074

1075 /* Various letter escapes. Get the appropriate host-charset

1076 value into C. */

1077 case '\\': case '\'': case '"': case '?': break;

1078

1079 case '(': case '{': case '[': case '%':

1080 /* '\(', etc, can be used at the beginning of a line in a long

1081 string split onto multiple lines with \-newline, to prevent

1082 Emacs or other text editors from getting confused. '\%' can

1083 be used to prevent SCCS from mangling printf format strings. */

1084 if (CPP_PEDANTIC (pfile))

1085 goto unknown;

1086 break;

1087

1088 case 'b': c = charconsts[1]; break;

1089 case 'f': c = charconsts[3]; break;

1090 case 'n': c = charconsts[4]; break;

1091 case 'r': c = charconsts[5]; break;

1092 case 't': c = charconsts[6]; break;

1093 case 'v': c = charconsts[7]; break;

1094

1095 case 'a':

1096 if (CPP_WTRADITIONAL (pfile))

1097 cpp_error (pfile, CPP_DL_WARNING,

1098 "the meaning of '\\a' is different in traditional C");

1099 c = charconsts[0];

1100 break;

1101

1102 case 'e': case 'E':

1103 if (CPP_PEDANTIC (pfile))

1104 cpp_error (pfile, CPP_DL_PEDWARN,

1105 "non-ISO-standard escape sequence, '\\%c'", (int) c);

1106 c = charconsts[2];

1107 break;

1108

1109 default:

1110 unknown:

1111 if (ISGRAPH (c))

1112 cpp_error (pfile, CPP_DL_PEDWARN,

1113 "unknown escape sequence '\\%c'", (int) c);

1114 else

1115 cpp_error (pfile, CPP_DL_PEDWARN,

1116 "unknown escape sequence: '\\%03o'", (int) c);

1117 }

1118

1119 /* Now convert what we have to the execution character set. */

1120 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))

1121 cpp_errno (pfile, CPP_DL_ERROR,

1122 "converting escape sequence to execution character set");

1123

1124 return from + 1;

1125 }

除了类似‘\a’的转义序列外，具有八进制数值NNN（1到3个数字）的序列‘\NNN’由convert_oct处理；具有16进制值的序列‘\xNN’（一个或多个16进制数字）由convert_hex处理；具有16进制值NNNN（4个数字）的序列‘\uNNNN’，及具有16进制值NNNNNNNN（8个数字）的序列‘\UNNNNNNNN’由convert_ucn处理。

对于那些在cpp_reader中的转换句柄所不能处理的字符，emit_numeric_escape被调用将这些字符记录到缓存。

907 static void

908 emit_numeric_escape (cpp_reader *pfile, cppchar_t n, in cppcharset.c

909 struct _cpp_strbuf *tbuf, bool wide)

910 {

911 if (wide)

912 {

913 /* We have to render this into the target byte order, which may not

914 be our byte order. */

915 bool bigend = CPP_OPTION (pfile, bytes_big_endian);

916 size_t width = CPP_OPTION (pfile, wchar_precision);

917 size_t cwidth = CPP_OPTION (pfile, char_precision);

918 size_t cmask = width_to_mask (cwidth);

919 size_t nbwc = width / cwidth;

920 size_t i;

921 size_t off = tbuf->len;

922 cppchar_t c;

923

924 if (tbuf->len + nbwc > tbuf->asize)

925 {

926 tbuf->asize += OUTBUF_BLOCK_SIZE;

927 tbuf->text = xrealloc (tbuf->text, tbuf->asize);

928 }

929

930 for (i = 0; i < nbwc; i++)

931 {

932 c = n & cmask;

933 n >>= cwidth;

934 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;

935 }

936 tbuf->len += nbwc;

937 }

938 else

939 {

940 if (tbuf->len + 1 > tbuf->asize)

941 {

942 tbuf->asize += OUTBUF_BLOCK_SIZE;

943 tbuf->text = xrealloc (tbuf->text, tbuf->asize);

944 }

945 tbuf->text[tbuf->len++] = n;

946 }

947 }

注意到_cpp_strbuf的asize域告知缓存的大小，而len域则告知字符的数目。

接下来在c_lex_with_flags里，对于字符串常量的情形，lex_string的核心也是调用cpp_interpret_string。这里我们跳过它。

5.6.1.1.3. 完成预处理符号提取

如果我们没有在上面因错误退出，那么到达这里，不会再有PCH文件需要读入了。

c_lex_with_flags (continue)

422 if (! no_more_pch)

423 {

424 no_more_pch = true;

425 c_common_no_more_pch ();

426 }

427

428 if (cpp_flags)

429 *cpp_flags = tok->flags;

430 return tok->type;

431 }

这个函数重置了parse_in的句柄valid_pch，这样如果再读入PCH文件时，将导致错误。

425 void

426 c_common_no_more_pch (void) in c-pch.c

427 {

428 if (cpp_get_callbacks (parse_in)->valid_pch)

429 {

430 cpp_get_callbacks (parse_in)->valid_pch = NULL;

431 host_hooks.gt_pch_use_address (NULL, 0, -1, 0);

432 }

433 }

钩子host_hooks提供了特定于宿主机器（host）的方法来获取为PCH文件分配的内存空间。把0作为gt_pch_use_address的第二个参数size传入，将允许钩子释放在加载期间可能分配的静态内存。但在当前版本中，该功能尚未实现。

wuhui_gdnt

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
GCC-3.4.6源代码学习笔记（80）

5.6.1.1.2.3. 字符、字符串常量 当预处理器碰到字符或字符串常量时，它忠实地记录其内容，但不去解释它，因为预处理器没有掌握其格式、编码的信息。这是这里的函数的任务。 c_lex_with_flags (continue) 357 case CPP_ATSIGN: … 388 case CPP_OTHER: 389 { 390 cpp
复制链接

扫一扫