编码格式4

收藏|谁在用? 代码 评论  发芽网首页 | 代码 工具 题酷 游戏 | 注册 登录 首页 贴代码 排行榜 仓库 标签 搜索 用户 反馈    本地收藏夹 百度收藏 QQ收藏 Google书签 Del.icio.us 豆瓣 校内网 Digg 雅虎收藏 365KEY 乐收 Twitter 更多...


比这篇新的文章: Codee#2522
比这篇旧的文章: 一些工具模板函数


unicode各种编码格式之间的转换(utf8,utf16, utf32)
语言: C++, 标签: utf unicode 2009/06/28发布 5个月前更新
作者: doorfly, 点击562次, 评论(0), 收藏者(1), , 打分:


o开关行号, 全选(Ctrl+C复制) | 一键复制:HTML, BBCode, QQ空间 , 源代码 | 查看:裸代码, 全屏 背景 主题: aqua autumn borland bw colorful darkslategray default desert dw_blue dw_cyan dw_green dw_orange dw_purple dw_red dw_yellow emacs fog friendly fruity inkpot manni manxome midnight2 moria murphy native navajo oceanblack oceandeep pastie peaksea perldoc pyte railscasts trac vs wombat zenburn 字体: C++语言: unicode各种编码格式之间的转换(utf8,utf16, utf32)
001 //
002 //convertUTF.h
003 //
004
005
006 /*
007 * Copyright 2001-2004 Unicode, Inc.
008 *
009 * Disclaimer
010 *
011 */
012
013 /* ---------------------------------------------------------------------
014
015     Conversions between UTF32, UTF-16, and UTF-8.  Header file.
016
017     Several functions are included here, forming a complete set of
018     conversions between the three formats.  UTF-7 is not included
019     here, but is handled in a separate source file.
020
021     Each of these routines takes pointers to input buffers and output
022     buffers.  The input buffers are const.
023
024     Each routine converts the text between *sourceStart and sourceEnd,
025     putting the result into the buffer between *targetStart and
026     targetEnd. Note: the end pointers are *after* the last item: e.g.
027     *(sourceEnd - 1) is the last item.
028
029     The return result indicates whether the conversion was successful,
030     and if not, whether the problem was in the source or target buffers.
031     (Only the first encountered problem is indicated.)
032
033     After the conversion, *sourceStart and *targetStart are both
034     updated to point to the end of last text successfully converted in
035     the respective buffers.
036
037     Input parameters:
038         sourceStart - pointer to a pointer to the source buffer.
039                 The contents of this are modified on return so that
040                 it points at the next thing to be converted.
041         targetStart - similarly, pointer to pointer to the target buffer.
042         sourceEnd, targetEnd - respectively pointers to the ends of the
043                 two buffers, for overflow checking only.
044
045     These conversion functions take a ConversionFlags argument. When this
046     flag is set to strict, both irregular sequences and isolated surrogates
047     will cause an error.  When the flag is set to lenient, both irregular
048     sequences and isolated surrogates are converted.
049
050     Whether the flag is strict or lenient, all illegal sequences will cause
051     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
052     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
053     must check for illegal sequences.
054
055     When the flag is set to lenient, characters over 0x10FFFF are converted
056     to the replacement character; otherwise (when the flag is set to strict)
057     they constitute an error.
058
059     Output parameters:
060         The value "sourceIllegal" is returned from some routines if the input
061         sequence is malformed.  When "sourceIllegal" is returned, the source
062         value will point to the illegal value that caused the problem. E.g.,
063         in UTF-8 when a sequence is malformed, it points to the start of the
064         malformed sequence. 
065
066     Author: Mark E. Davis, 1994.
067     Rev History: Rick McGowan, fixes & updates May 2001.
068                  Fixes & updates, Sept 2001.
069
070 ------------------------------------------------------------------------ */
071
072 /* ---------------------------------------------------------------------
073     The following 4 definitions are compiler-specific.
074     The C standard does not guarantee that wchar_t has at least
075     16 bits, so wchar_t is no less portable than unsigned short!
076     All should be unsigned values to avoid sign extension during
077     bit mask & shift operations.
078 ------------------------------------------------------------------------ */
079
080 typedef unsigned long UTF32;  /* one UTF-32 code unit; at least 32 bits. NOTE(review): unsigned long can be wider than 32 bits on some ABIs -- confirm callers do not assume 4-byte elements */
081 typedef unsigned short UTF16;  /* one UTF-16 code unit; at least 16 bits */
082 typedef unsigned char UTF8;   /* one UTF-8 code unit (byte); typically 8 bits */
083 typedef unsigned char Boolean; /* truth value: 0 or 1 */
084
085 /* Some fundamental constants */
086 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD /* written in place of unconvertible input in lenient mode */
087 #define UNI_MAX_BMP (UTF32)0x0000FFFF /* largest value that fits in a single UTF-16 code unit */
088 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF /* largest value reachable with a surrogate pair */
089 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF /* largest 31-bit value */
090 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF /* values above this are rejected or replaced by the converters */
091
092 typedef enum
093 {
094     conversionOK,
095     /* conversion successful: all of the source was consumed */
096     sourceExhausted,
097     /* the source ended in the middle of a character (e.g. a high surrogate with nothing after it) */
098     targetExhausted,
099     /* insufficient room in the target buffer for the next converted character */
100     sourceIllegal           /* source sequence is illegal/malformed */
101 } ConversionResult;
102
103 typedef enum
104 {
105     strictConversion    = 0, /* isolated surrogates and other irregular input make the converter return sourceIllegal */
106     lenientConversion        /* such input is converted (replaced or passed through) instead of being reported as an error */
107 } ConversionFlags;
108
109 /* This is for C++ and does no harm in C */
110 #ifdef __cplusplus
111 extern "C"
112 {
113 #endif
114 /* All converters share one convention: *sourceStart and *targetStart are advanced past what was converted; sourceEnd and targetEnd point one past the last item of each buffer. */
115     ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
116         const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
117         ConversionFlags flags);
118 /* UTF-16 -> UTF-8 */
119     ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
120         const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
121         ConversionFlags flags);
122 /* UTF-8 -> UTF-32 */
123     ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
124         const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
125         ConversionFlags flags);
126 /* UTF-32 -> UTF-8 */
127     ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
128         const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
129         ConversionFlags flags);
130 /* UTF-16 -> UTF-32; surrogate pairs are combined into one code point */
131     ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
132         const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
133         ConversionFlags flags);
134 /* UTF-32 -> UTF-16; values above 0xFFFF are split into a surrogate pair */
135     ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
136         const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
137         ConversionFlags flags);
138 /* NOTE(review): definition not visible here; presumably reports whether [source, sourceEnd) holds one legal UTF-8 sequence -- confirm */
139     Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
140
141 #ifdef __cplusplus
142 }
143 #endif
144
145
146
147
148
149
150 //
151 //convertUTF.c
152 //
153
154 #include "ConvertUTF.h"
155 #ifdef CVTUTF_DEBUG
156 #include <stdio.h>
157 #endif
158
159 static const int halfShift = 10; /* each surrogate of a pair carries 10 payload bits */
160
161 static const UTF32 halfBase = 0x0010000UL; /* first code point encoded with a surrogate pair; subtracted before splitting, added back after joining */
162 static const UTF32 halfMask = 0x3FFUL; /* mask selecting the low 10 bits (the low-surrogate payload) */
163
164 #define UNI_SUR_HIGH_START  (UTF32)0xD800 /* first high (leading) surrogate */
165 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF /* last high (leading) surrogate */
166 #define UNI_SUR_LOW_START   (UTF32)0xDC00 /* first low (trailing) surrogate */
167 #define UNI_SUR_LOW_END     (UTF32)0xDFFF /* last low (trailing) surrogate */
168 #define false      0 /* NOTE(review): may clash with <stdbool.h> or C++ keywords if this file is built in such a context -- confirm */
169 #define true        1 /* see note on `false` on the previous line */
170
171 /* --------------------------------------------------------------------- */
172
173 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
174     const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
175     ConversionFlags flags)
176 {
177     ConversionResult result = conversionOK;
178     const UTF32* source = *sourceStart;
179     UTF16* target = *targetStart;
180     while (source < sourceEnd)
181     {
182         UTF32 ch;
183         if (target >= targetEnd)
184         {
185             result = targetExhausted; break;
186         }
187         ch = *source++;
188         if (ch <= UNI_MAX_BMP)
189         {
190             /* Target is a character <= 0xFFFF */
191             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
192             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
193             {
194                 if (flags == strictConversion)
195                 {
196                     --source; /* return to the illegal value itself */
197                     result = sourceIllegal;
198                     break;
199                 }
200                 else
201                 {
202                     *target++ = UNI_REPLACEMENT_CHAR;
203                 }
204             }
205             else
206             {
207                 *target++ = (UTF16) ch; /* normal case */
208             }
209         }
210         else if (ch > UNI_MAX_LEGAL_UTF32)
211         {
212             if (flags == strictConversion)
213             {
214                 result = sourceIllegal;
215             }
216             else
217             {
218                 *target++ = UNI_REPLACEMENT_CHAR;
219             }
220         }
221         else
222         {
223             /* target is a character in range 0xFFFF - 0x10FFFF. */
224             if (target + 1 >= targetEnd)
225             {
226                 --source; /* Back up source pointer! */
227                 result = targetExhausted; break;
228             }
229             ch -= halfBase;
230             *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
231             *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
232         }
233     }
234     *sourceStart = source;
235     *targetStart = target;
236     return result;
237 }
238
239 /* --------------------------------------------------------------------- */
240
241 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
242     const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
243     ConversionFlags flags)
244 {
245     ConversionResult result = conversionOK;
246     const UTF16* source = *sourceStart;
247     UTF32* target = *targetStart;
248     UTF32 ch, ch2;
249     while (source < sourceEnd)
250     {
251         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
252         ch = *source++;
253         /* If we have a surrogate pair, convert to UTF32 first. */
254         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
255         {
256             /* If the 16 bits following the high surrogate are in the source buffer... */
257             if (source < sourceEnd)
258             {
259                 ch2 = *source;
260                 /* If it's a low surrogate, convert to UTF32. */
261                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
262                 {
263                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
264                         (ch2 - UNI_SUR_LOW_START) +
265                         halfBase;
266                     ++source;
267                 }
268                 else if (flags == strictConversion)
269                 {
270                     /* it's an unpaired high surrogate */
271                     --source; /* return to the illegal value itself */
272                     result = sourceIllegal;
273                     break;
274                 }
275             }
276             else
277             {
278                 /* We don't have the 16 bits following the high surrogate. */
279                 --source; /* return to the high surrogate */
280                 result = sourceExhausted;
281                 break;
282             }
283         }
284         else if (flags == strictConversion)
285         {
286             /* UTF-16 surrogate values are illegal in UTF-32 */
287             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
288             {
289                 --source; /* return to the illegal value itself */
290                 result = sourceIllegal;
291                 break;
292             }
293         }
294         if (target >= targetEnd)
295         {
296             source = oldSource; /* Back up source pointer! */
297             result = targetExhausted; break;
298         }
299         *target++ = ch;
300     }
301     *sourceStart = source;
302     *targetStart = target;
303 #ifdef CVTUTF_DEBUG
304     if (result == sourceIllegal)
305     {
306         fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x/n", ch,
307             ch2);
308         fflush(stderr);
309     }
310 #endif
311     return result;
312 }
313
314 /* --------------------------------------------------------------------- */
315
316 /*
317 * Index into the table below with the first byte of a UTF-8 sequence to
318 * get the number of trailing bytes that are supposed to follow it.
319 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
320 * left as-is for anyone who may want to do such conversion, which was
321 * allowed in earlier algorithms.
322 */
323 static const char trailingBytesForUTF8[256] =
324 {
325     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
326     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
327     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
328     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
329     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
330     0, 0, 0, 0

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值