utf-7转码

最新推荐文章于 2019-02-22 17:43:18 发布
weixin_34194087
最新推荐文章于 2019-02-22 17:43:18 发布
阅读量928
点赞数 1
原文链接：http://www.cnblogs.com/fanchaostudy/p/7144031.html
版权
List:       imap
I tried the code you referenced (the exact program and compilation script
are in attachments), but it failed. The program takes input as modified
UTF-7, uses MailboxToURL routine to change it to UTF-8 and then uses the
URLtoMailbox routine to change it to UTF-7 again:

/* Round-trip driver: prints argv[1], converts it with MailboxToURL and
 * prints the result, then converts that result back with URLtoMailbox and
 * prints it again.
 */
int main(int argc, char* argv[]){
  char out[OUTSIZE];
  char in[OUTSIZE]; 

  /* NOTE(review): argv[1] is copied without checking argc or its length */
  strcpy(in,argv[1]);
  printf("in:   %s\n",in);

  /* modified UTF-7 mailbox name -> hex-encoded UTF-8 URL path */
  MailboxToURL(out,in);
  printf("out:  %s\n",out);

  /* and back again; the result overwrites the input buffer */
  URLtoMailbox(in,out);
  printf("in:   %s\n",in);  
}

As an input I gave it the following UTF-7 code:
a&AQUBBA-e&AFC-f
which is the code produced by Microsoft Outlook and contains a bunch of Polish
letters.

The output of the program is the following:
[tomcat@fatcat]$ ./utf7test 'a&AQUBBA-e&AFC-f'
in:   a&AQUBBA-e&AFC-f
out:  a%C4%85%C4%84e%50f
in:   a&AQUBBA-ePf

So, as you can see, the conversion is not 1:1 ;-)))) Strange enough, if I
use the resulting output (a&AQUBBA-ePf) as an input to another iteration, it
starts behaving correctly ;-)))

Can you help me?
Marek.

P.S. I tried the code on Linux. There are a couple of strange assignments in
the code (like unsigned long variable = char variable), so I mention it just
in case this might be of some importance.

> -----Original Message-----
> From: Chris Newman [mailto:chris+imap@innosoft.com]
> Sent: Tuesday, July 24, 2001 8:43 PM
> To: Marek Kowal; imap@u.washington.edu
> Subject: Re: modified UTF7 to UTF8 conversion
> 
> 
> Try the code in:
>   <http://www.innosoft.com/rfc/rfc2192.html>
> 
> It's missing a security check for invalid UTF-8 characters 
> on input, but 
> is otherwise correct to my knowledge.  If it's broken, please 
> email me the 
> example which breaks it so I can fix the code.
> 
> 		- Chris
> 
> --On Monday, July 23, 2001 19:52 +0200 Marek Kowal 
> <kowalm@onet.pl> wrote:
> 
> > Hi there,
> >
> > I am having a HARD time trying to convert modified UTF7 
> mailbox names to
> > UTF8 (which I then convert to ISO-8859-2 using iconv 
> library, BTW). I
> > tried the UTF7 to URL UTF8 code (which I found in imap 
> discussion list,
> > 
> http://www.washington.edu/imap/listarch/1997/msg00800.html), 
> but it does
> > not seem to work correctly - if I run it one-way and then 
> back on some
> > string, sometimes I get different results - the resulting 
> UTF7 code is
> > not the same.
> >
> > Anyway, can anybody point me to the proper conversion 
> routines which can
> > transform between modified UTF7 and UTF8? It could be 
> separate code, but
> > if anybody did it already as iconv conversion table, that 
> would be great.
> 


["compile" (application/octet-stream)]
["utf7test.c" (application/octet-stream)]

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <iconv.h>
/* size of the input buffer used by the test driver */
#define OUTSIZE 1024


/* hexadecimal lookup table: hex[n] is the digit for nibble value n */
static const char hex[] = "0123456789ABCDEF";

/* URL unsafe printable characters; these are always %XX-encoded */
static const char urlunsafe[] = " \"#%&+:;<=>?@[\\]^`{|}";

/* UTF7 modified base64 alphabet: base64chars[v] is the character for the
 * 6-bit value v (note ',' in place of the usual '/')
 */
static const char base64chars[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* marker stored in decode tables for bytes outside the alphabet; one past
 * the largest valid 6-bit value
 */
#define UNDEFINED 64

/* UTF16 definitions */
#define UTF16MASK       0x03FFUL   /* low 10 payload bits of a surrogate */
#define UTF16SHIFT      10         /* payload bits carried per surrogate */
#define UTF16BASE       0x10000UL  /* first code point needing a pair */

/* inclusive ranges of the high (leading) and low (trailing) surrogates */
#define UTF16HIGHSTART  0xD800UL
#define UTF16HIGHEND    0xDBFFUL
#define UTF16LOSTART    0xDC00UL
#define UTF16LOEND      0xDFFFUL

/* code point substituted for an unpaired UTF-16 surrogate */
#define UCS4REPLACEMENT 0xFFFDUL

/* Write the UTF-8 encoding of the code point ucs4 to dst as a run of
 * %XX hex escapes (no terminator) and return the advanced write pointer.
 * Handles code points that need up to four UTF-8 octets.
 */
static char *HexEncodeUTF8(char *dst, unsigned long ucs4)
{
  unsigned char utf8[4];
  unsigned int len, k;

  if (ucs4 <= 0x7fUL) {
    utf8[0] = (unsigned char) ucs4;
    len = 1;
  } else if (ucs4 <= 0x7ffUL) {
    utf8[0] = (unsigned char) (0xc0 | (ucs4 >> 6));
    utf8[1] = (unsigned char) (0x80 | (ucs4 & 0x3f));
    len = 2;
  } else if (ucs4 <= 0xffffUL) {
    utf8[0] = (unsigned char) (0xe0 | (ucs4 >> 12));
    utf8[1] = (unsigned char) (0x80 | ((ucs4 >> 6) & 0x3f));
    utf8[2] = (unsigned char) (0x80 | (ucs4 & 0x3f));
    len = 3;
  } else {
    utf8[0] = (unsigned char) (0xf0 | (ucs4 >> 18));
    utf8[1] = (unsigned char) (0x80 | ((ucs4 >> 12) & 0x3f));
    utf8[2] = (unsigned char) (0x80 | ((ucs4 >> 6) & 0x3f));
    utf8[3] = (unsigned char) (0x80 | (ucs4 & 0x3f));
    len = 4;
  }

  for (k = 0; k < len; ++k) {
    *dst++ = '%';
    *dst++ = hex[utf8[k] >> 4];
    *dst++ = hex[utf8[k] & 0x0f];
  }
  return dst;
}

/* Convert an IMAP mailbox name (modified UTF-7) to a URL path
 * (hex-encoded UTF-8).
 *
 *  dst  receives the NUL-terminated result.  It needs to have roughly
 *       4 times the storage space of src:
 *         - hex encoding can triple the size of the input
 *         - UTF-7 can be slightly denser than UTF-8
 *           (worst case: 8 octets UTF-7 becomes 9 octets UTF-8)
 *  src  NUL-terminated mailbox name.
 *
 * Printable US-ASCII outside the URL-unsafe set is copied literally;
 * every other literal octet is written as %XX.  "&-" yields an encoded
 * '&'.  Every character decoded from a base64 run "&...-" is written as
 * %XX-escaped UTF-8, including ones that are printable ASCII.
 *
 * An unpaired UTF-16 surrogate inside a base64 run is replaced by U+FFFD
 * instead of being merged with whatever value the previous character left
 * behind.  Leftover bits at the end of a base64 run are discarded.
 */

void MailboxToURL(char *dst, char *src)
{
  unsigned char c, bitcount;
  unsigned int i;
  unsigned long ucs4, utf16, bitbuf;
  unsigned char base64[256];
  int havehigh;   /* non-zero while a high surrogate awaits its partner */

  /* initialize modified base64 decoding table; the terminating NUL of
   * base64chars is deliberately not mapped */
  memset(base64, UNDEFINED, sizeof (base64));
  for (i = 0; i < sizeof (base64chars) - 1; ++i) {
    base64[(unsigned char) base64chars[i]] = (unsigned char) i;
  }

  /* loop until end of string */
  while (*src != '\0') {
    c = (unsigned char) *src++;

    /* deal with literal characters and &- */
    if (c != '&' || *src == '-') {
      if (c < ' ' || c > '~' || strchr(urlunsafe, c) != NULL) {
        /* hex encode if necessary */
        dst[0] = '%';
        dst[1] = hex[c >> 4];
        dst[2] = hex[c & 0x0f];
        dst += 3;
      } else {
        /* encode literally */
        *dst++ = (char) c;
      }
      /* skip over the '-' if this is an &- sequence */
      if (c == '&') ++src;
      continue;
    }

    /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
    bitbuf = 0;
    bitcount = 0;
    ucs4 = 0;
    havehigh = 0;
    while ((c = base64[(unsigned char) *src]) != UNDEFINED) {
      ++src;
      bitbuf = (bitbuf << 6) | c;
      bitcount += 6;

      /* enough bits for a UTF-16 character? */
      if (bitcount < 16) continue;
      bitcount -= 16;
      utf16 = (bitbuf >> bitcount) & 0xffffUL;

      /* convert UTF16 to UCS4 */
      if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
        /* a second high surrogate orphans the pending one */
        if (havehigh) dst = HexEncodeUTF8(dst, UCS4REPLACEMENT);
        ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
        havehigh = 1;
        continue;
      }
      if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
        if (havehigh) {
          ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
        } else {
          /* low surrogate with nothing to pair with */
          ucs4 = UCS4REPLACEMENT;
        }
      } else {
        /* ordinary character; a pending high surrogate was orphaned */
        if (havehigh) dst = HexEncodeUTF8(dst, UCS4REPLACEMENT);
        ucs4 = utf16;
      }
      havehigh = 0;

      /* convert UCS4 to hex-encoded UTF-8 */
      dst = HexEncodeUTF8(dst, ucs4);
    }
    /* base64 run ended in the middle of a surrogate pair */
    if (havehigh) dst = HexEncodeUTF8(dst, UCS4REPLACEMENT);

    /* skip over trailing '-' in modified UTF-7 encoding */
    if (*src == '-') ++src;
  }

  /* terminate destination string */
  *dst = '\0';
}

/* Return the numeric value (0-15) of a hexadecimal digit character.
 * The caller must already have checked the digit with isxdigit().
 */
static unsigned int HexDigitValue(int digit)
{
  if (isdigit(digit)) return (unsigned int) (digit - '0');
  return (unsigned int) (toupper(digit) - 'A' + 10);
}

/* Convert hex coded UTF-8 URL path to modified UTF-7 IMAP mailbox
 *
 *  dst  receives the NUL-terminated result.  It should be about twice
 *       the length of src to deal with non-hex coded URLs; note that an
 *       isolated raw (not %XX-coded) control octet expands to five
 *       characters ("&", three base64 characters, "-").
 *  src  NUL-terminated URL path.
 *
 * "%XX" is decoded only when both X are hexadecimal digits; otherwise
 * the '%' is treated as an ordinary character.  Printable US-ASCII is
 * copied literally ('&' becomes "&-"); everything else is read as UTF-8
 * and written as UTF-16 in modified base64 between '&' and '-'.
 *
 * NOTE: the input is not validated as well-formed UTF-8.  A multi-octet
 * sequence cut short by a printable or control character is dropped.
 */

void URLtoMailbox(char *dst, char *src)
{
  unsigned int utf8pos = 0, utf8total = 0;
  unsigned int c, utf7mode = 0, bitstogo = 0, utf16flag;
  unsigned long ucs4 = 0, bitbuf = 0;

  /* read octets as unsigned so values >= 0x80 are not sign-extended */
  while ((c = (unsigned char) *src) != '\0') {
    ++src;

    /* undo hex-encoding; isxdigit() is false for NUL, so this never
     * reads past the end of the string */
    if (c == '%' && isxdigit((unsigned char) src[0])
        && isxdigit((unsigned char) src[1])) {
      c = (HexDigitValue((unsigned char) src[0]) << 4)
        | HexDigitValue((unsigned char) src[1]);
      src += 2;
    }

    /* normal character? */
    if (c >= ' ' && c <= '~') {
      /* switch out of UTF-7 mode */
      if (utf7mode) {
        if (bitstogo) {
          /* flush leftover bits, zero-padded to a full base64 digit */
          *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
          /* the leftovers are spent; the next base64 run must start
           * on a clean boundary */
          bitstogo = 0;
        }
        *dst++ = '-';
        utf7mode = 0;
      }
      /* abandon any UTF-8 sequence this character interrupted */
      utf8total = 0;
      *dst++ = (char) c;
      /* encode '&' as '&-' */
      if (c == '&') {
        *dst++ = '-';
      }
      continue;
    }

    if (c < 0x80) {
      /* US-ASCII control character: a complete code point by itself */
      ucs4 = c;
    } else if (utf8total) {
      /* save UTF8 bits into UCS4 */
      ucs4 = (ucs4 << 6) | (c & 0x3FUL);
      if (++utf8pos < utf8total) {
        continue;
      }
    } else {
      /* first octet of a multi-octet sequence: its high bits give the
       * sequence length, the remaining bits start the code point */
      utf8pos = 1;
      if (c < 0xE0) {
        utf8total = 2;
        ucs4 = c & 0x1F;
      } else if (c < 0xF0) {
        utf8total = 3;
        ucs4 = c & 0x0F;
      } else {
        /* NOTE: can't convert UTF8 sequences longer than 4 */
        utf8total = 4;
        ucs4 = c & 0x07;
      }
      continue;
    }
    utf8total = 0;

    /* switch to UTF-7 mode, now that a whole character is available */
    if (!utf7mode) {
      *dst++ = '&';
      utf7mode = 1;
    }

    /* loop to split ucs4 into two utf16 chars if necessary */
    do {
      if (ucs4 >= UTF16BASE) {
        /* queue the high surrogate now; the low surrogate is left in
         * ucs4 for the second pass */
        ucs4 -= UTF16BASE;
        bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT)
                                   + UTF16HIGHSTART);
        ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
        utf16flag = 1;
      } else {
        bitbuf = (bitbuf << 16) | ucs4;
        utf16flag = 0;
      }
      bitstogo += 16;
      /* spew out base64 */
      while (bitstogo >= 6) {
        bitstogo -= 6;
        *dst++ = base64chars[(bitbuf >> bitstogo) & 0x3F];
      }
    } while (utf16flag);
  }

  /* if in UTF-7 mode, finish in ASCII */
  if (utf7mode) {
    if (bitstogo) {
      *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
    }
    *dst++ = '-';
  }

  /* tie off string */
  *dst = '\0';
}

/* Round-trip test driver.
 *
 * Takes one modified UTF-7 mailbox name as argv[1], prints it, converts
 * it to a hex-encoded UTF-8 URL path with MailboxToURL and prints that,
 * then converts the URL path back with URLtoMailbox and prints the
 * result.
 *
 * Returns 0 on success, 1 if the argument is missing or is too long to
 * fit the OUTSIZE-byte input buffer.
 */
int main(int argc, char* argv[]){
  char in[OUTSIZE];
  /* MailboxToURL asks for roughly 4x its input and URLtoMailbox for
   * about 2x; both buffers are sized with extra headroom on top */
  char out[5 * OUTSIZE];
  char back[2 * 5 * OUTSIZE];
  size_t len;

  if (argc < 2) {
    fprintf(stderr, "usage: %s <modified-UTF-7 mailbox name>\n",
            argc > 0 ? argv[0] : "utf7test");
    return 1;
  }

  /* refuse input that would not fit instead of overflowing the buffer */
  len = strlen(argv[1]);
  if (len >= sizeof (in)) {
    fprintf(stderr, "mailbox name too long (limit is %d bytes)\n",
            OUTSIZE - 1);
    return 1;
  }
  memcpy(in, argv[1], len + 1);
  printf("in:   %s\n",in);

  MailboxToURL(out,in);
  printf("out:  %s\n",out);

  /* the round-trip result gets its own buffer; it may be longer than
   * the original input */
  URLtoMailbox(back,out);
  printf("in:   %s\n",back);

  return 0;
}
转载于:https://www.cnblogs.com/fanchaostudy/p/7144031.html