List: imap
I tried the code you referenced (the exact program and compilation script
are in attachments), but it failed. The program takes input as modified
UTF-7, uses MailboxToURL routine to change it to UTF-8 and then uses the
URLtoMailbox routine to change it to UTF-7 again:
/* Round-trip driver: takes argv[1] as a modified UTF-7 mailbox name,
 * converts it to a hex-coded UTF-8 URL path and back again, and prints the
 * string at each stage.
 * NOTE(review): argv[1] is used without checking argc, and strcpy() does
 * not bound the copy into in[OUTSIZE].
 */
int main(int argc, char* argv[]){
char out[OUTSIZE];
char in[OUTSIZE];
strcpy(in,argv[1]);
printf("in: %s\n",in);
/* modified UTF-7 mailbox name -> hex-coded UTF-8 URL path */
MailboxToURL(out,in);
printf("out: %s\n",out);
/* and back again; the result overwrites the original input buffer */
URLtoMailbox(in,out);
printf("in: %s\n",in);
}
As an input I gave it the following UTF-7 code:
a&AQUBBA-e&AFC-f
which is the code produced by Microsoft Outlook and contains a bunch of Polish
letters.
The output of the program is the following:
[tomcat@fatcat]$ ./utf7test 'a&AQUBBA-e&AFC-f'
in: a&AQUBBA-e&AFC-f
out: a%C4%85%C4%84e%50f
in: a&AQUBBA-ePf
So, as you can see, the conversion is not 1:1 ;-)))) Strange enough, if I
use the resulting output (a&AQUBBA-ePf) as an input to another iteration, it
starts behaving correctly ;-)))
Can you help me?
Marek.
ps. I tried the code on Linux. There are a couple of strange assignments in
the code (like unsigned long variable = char variable), so I mention it just
in case this might be of some importance.
> -----Original Message-----
> From: Chris Newman [mailto:chris+imap@innosoft.com]
> Sent: Tuesday, July 24, 2001 8:43 PM
> To: Marek Kowal; imap@u.washington.edu
> Subject: Re: modified UTF7 to UTF8 conversion
>
>
> Try the code in:
> <http://www.innosoft.com/rfc/rfc2192.html>
>
> It's missing a security check for invalid UTF-8 characters
> on input, but
> is otherwise correct to my knowledge. If it's broken, please
> email me the
> example which breaks it so I can fix the code.
>
> - Chris
>
> --On Monday, July 23, 2001 19:52 +0200 Marek Kowal
> <kowalm@onet.pl> wrote:
>
> > Hi there,
> >
> > I am having a HARD time trying to convert modified UTF7
> mailbox names to
> > UTF8 (which I then convert to ISO-8859-2 using iconv
> library, BTW). I
> > tried the UTF7 to URL UTF8 code (which I found in imap
> discussion list,
> >
> http://www.washington.edu/imap/listarch/1997/msg00800.html),
> but it does
> > not seem to work correctly - if I run it one-way and then
> back on some
> > string, sometimes I get different results - the resulting
> UTF7 code is
> > not the same.
> >
> > Anyway, can anybody point me to the proper conversion
> routines which can
> > transform between modified UTF7 and UTF8? It could be
> separate code, but
> > if anybody did it already as iconv conversion table, that
> would be great.
>
["compile" (application/octet-stream)]
["utf7test.c" (application/octet-stream)]
#include <stdio.h>
#include <string.h>
#include <iconv.h>
#define OUTSIZE 1024
/* hexadecimal lookup table */
static const char hex[] = "0123456789ABCDEF";
/* URL unsafe printable characters */
static const char urlunsafe[] = " \"#%&+:;<=>?@[\\]^`{|}";
/* UTF7 modified base64 alphabet */
static const char base64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* value stored in the base64 decoding table for octets outside the alphabet */
#define UNDEFINED 64
/* UTF16 definitions */
#define UTF16MASK 0x03FFUL
#define UTF16SHIFT 10
#define UTF16BASE 0x10000UL
#define UTF16HIGHSTART 0xD800UL
#define UTF16HIGHEND 0xDBFFUL
#define UTF16LOSTART 0xDC00UL
#define UTF16LOEND 0xDFFFUL
/* code point emitted in place of an unpaired UTF-16 surrogate */
#define UCS4REPLACEMENT 0xFFFDUL

/* Write one octet at dst as a "%XX" escape; returns the new end of output. */
static char *put_hex_octet(char *dst, unsigned int octet)
{
    dst[0] = '%';
    dst[1] = hex[(octet >> 4) & 0x0f];
    dst[2] = hex[octet & 0x0f];
    return dst + 3;
}

/* Write ucs4 at dst as hex-escaped UTF-8 (1 to 4 octets); returns the new
 * end of output.
 */
static char *put_hex_utf8(char *dst, unsigned long ucs4)
{
    if (ucs4 <= 0x7fUL) {
        dst = put_hex_octet(dst, (unsigned int) ucs4);
    } else if (ucs4 <= 0x7ffUL) {
        dst = put_hex_octet(dst, (unsigned int) (0xc0 | (ucs4 >> 6)));
        dst = put_hex_octet(dst, (unsigned int) (0x80 | (ucs4 & 0x3f)));
    } else if (ucs4 <= 0xffffUL) {
        dst = put_hex_octet(dst, (unsigned int) (0xe0 | (ucs4 >> 12)));
        dst = put_hex_octet(dst, (unsigned int) (0x80 | ((ucs4 >> 6) & 0x3f)));
        dst = put_hex_octet(dst, (unsigned int) (0x80 | (ucs4 & 0x3f)));
    } else {
        dst = put_hex_octet(dst, (unsigned int) (0xf0 | (ucs4 >> 18)));
        dst = put_hex_octet(dst, (unsigned int) (0x80 | ((ucs4 >> 12) & 0x3f)));
        dst = put_hex_octet(dst, (unsigned int) (0x80 | ((ucs4 >> 6) & 0x3f)));
        dst = put_hex_octet(dst, (unsigned int) (0x80 | (ucs4 & 0x3f)));
    }
    return dst;
}

/* Finish an open base64 run: emit the leftover bits (zero padded to six)
 * followed by the closing '-'; returns the new end of output.
 */
static char *close_utf7_run(char *dst, unsigned long bitbuf,
                            unsigned int bitstogo)
{
    if (bitstogo) {
        *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
    }
    *dst++ = '-';
    return dst;
}

/* Convert an IMAP mailbox to a URL path
 * dst needs to have roughly 4 times the storage space of src
 * Hex encoding can triple the size of the input
 * UTF-7 can be slightly denser than UTF-8
 * (worst case: 8 octets UTF-7 becomes 9 octets UTF-8)
 *
 * src is a NUL-terminated modified UTF-7 mailbox name and is not modified.
 * dst receives a NUL-terminated string; no bounds checking is done on it.
 * Every character decoded from a base64 run is hex encoded, even when it
 * is printable ASCII. Unpaired UTF-16 surrogates inside a base64 run are
 * replaced by U+FFFD, and bits left over at the end of a run are ignored.
 */
void MailboxToURL(char *dst, char *src)
{
    unsigned char base64[256];
    unsigned char c;
    size_t i;

    /* initialize modified base64 decoding table; the terminating NUL of
     * base64chars is not part of the alphabet, hence the - 1 */
    memset(base64, UNDEFINED, sizeof (base64));
    for (i = 0; i < sizeof (base64chars) - 1; ++i) {
        base64[(unsigned char) base64chars[i]] = (unsigned char) i;
    }

    /* loop until end of string */
    while (*src != '\0') {
        unsigned long bitbuf, utf16, highsurrogate;
        unsigned int bitcount;

        c = (unsigned char) *src++;

        /* deal with literal characters and &- */
        if (c != '&' || *src == '-') {
            if (c < ' ' || c > '~' || strchr(urlunsafe, c) != NULL) {
                /* hex encode if necessary */
                dst = put_hex_octet(dst, c);
            } else {
                /* encode literally */
                *dst++ = (char) c;
            }
            /* skip over the '-' if this is an &- sequence */
            if (c == '&') {
                ++src;
            }
            continue;
        }

        /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
        bitbuf = 0;
        bitcount = 0;
        highsurrogate = 0;      /* non-zero while a high surrogate is pending */
        while ((c = base64[(unsigned char) *src]) != UNDEFINED) {
            ++src;
            bitbuf = (bitbuf << 6) | c;
            bitcount += 6;
            /* enough bits for a UTF-16 character? */
            if (bitcount < 16) {
                continue;
            }
            bitcount -= 16;
            utf16 = (bitbuf >> bitcount) & 0xffffUL;

            /* convert UTF16 to UCS4 */
            if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
                /* a second high surrogate means the first had no partner */
                if (highsurrogate != 0) {
                    dst = put_hex_utf8(dst, UCS4REPLACEMENT);
                }
                highsurrogate = utf16;
            } else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
                if (highsurrogate != 0) {
                    dst = put_hex_utf8(dst,
                        ((highsurrogate - UTF16HIGHSTART) << UTF16SHIFT)
                        + (utf16 - UTF16LOSTART) + UTF16BASE);
                    highsurrogate = 0;
                } else {
                    /* low surrogate without a preceding high surrogate */
                    dst = put_hex_utf8(dst, UCS4REPLACEMENT);
                }
            } else {
                if (highsurrogate != 0) {
                    dst = put_hex_utf8(dst, UCS4REPLACEMENT);
                    highsurrogate = 0;
                }
                dst = put_hex_utf8(dst, utf16);
            }
        }
        /* run ended while a high surrogate was still waiting for its pair */
        if (highsurrogate != 0) {
            dst = put_hex_utf8(dst, UCS4REPLACEMENT);
        }
        /* skip over trailing '-' in modified UTF-7 encoding */
        if (*src == '-') {
            ++src;
        }
    }
    /* terminate destination string */
    *dst = '\0';
}

/* Convert hex coded UTF-8 URL path to modified UTF-7 IMAP mailbox
 * dst should be about twice the length of src to deal with non-hex
 * coded URLs; raw control characters and raw '&' expand further
 * (the two octets "\001&" become the seven octets "&AAE-&-"), so allow
 * four times the length of src when the input is not known to be hex coded.
 *
 * src is a NUL-terminated URL path and is not modified. "%XX" escapes are
 * undone first (upper or lower case digits; a non-hex digit counts as 0).
 * Printable US-ASCII is copied through with '&' written as "&-"; every
 * other character is written as modified base64 of its UTF-16 form.
 * dst receives a NUL-terminated string; no bounds checking is done on it.
 * The UTF-8 input is not validated. A multi-octet sequence that is cut
 * short by a printable character or by the end of the string is dropped.
 */
void URLtoMailbox(char *dst, char *src)
{
    static const char lowerhex[] = "abcdef";
    unsigned int utf8pos = 0, utf8total = 0, utf7mode = 0, bitstogo = 0;
    unsigned int i, c, utf16flag;
    unsigned long ucs4 = 0, bitbuf = 0;
    unsigned char hextab[256];

    /* initialize hex lookup table (upper and lower case digits) */
    memset(hextab, 0, sizeof (hextab));
    for (i = 0; i < sizeof (hex) - 1; ++i) {
        hextab[(unsigned char) hex[i]] = (unsigned char) i;
    }
    for (i = 0; i < sizeof (lowerhex) - 1; ++i) {
        hextab[(unsigned char) lowerhex[i]] = (unsigned char) (10 + i);
    }

    /* read octets as unsigned so that raw 8-bit input is not sign extended */
    while ((c = (unsigned char) *src) != '\0') {
        ++src;
        /* undo hex-encoding */
        if (c == '%' && src[0] != '\0' && src[1] != '\0') {
            c = ((unsigned int) hextab[(unsigned char) src[0]] << 4)
                | hextab[(unsigned char) src[1]];
            src += 2;
        }
        /* normal character? */
        if (c >= ' ' && c <= '~') {
            /* switch out of UTF-7 mode */
            if (utf7mode) {
                dst = close_utf7_run(dst, bitbuf, bitstogo);
                utf7mode = 0;
                /* the next base64 run must start with an empty bit buffer */
                bitstogo = 0;
            }
            /* abandon any multi-octet sequence that was cut short */
            utf8total = 0;
            *dst++ = (char) c;
            /* encode '&' as '&-' */
            if (c == '&') {
                *dst++ = '-';
            }
            continue;
        }
        if (c < 0x80) {
            /* non-printable US-ASCII is a complete character by itself */
            ucs4 = c;
        } else if (utf8total) {
            /* save UTF8 bits into UCS4 */
            ucs4 = (ucs4 << 6) | (c & 0x3FUL);
            if (++utf8pos < utf8total) {
                continue;
            }
        } else {
            /* first octet of a multi-octet sequence */
            utf8pos = 1;
            if (c < 0xE0) {
                utf8total = 2;
                ucs4 = c & 0x1F;
            } else if (c < 0xF0) {
                utf8total = 3;
                ucs4 = c & 0x0F;
            } else {
                /* NOTE: can't convert UTF8 sequences longer than 4 */
                utf8total = 4;
                /* a 4-octet lead (11110xxx) carries three payload bits */
                ucs4 = c & 0x07;
            }
            continue;
        }
        utf8total = 0;
        /* switch to UTF-7 mode only once a whole character is available,
         * so that truncated input cannot produce an empty "&-" run */
        if (!utf7mode) {
            *dst++ = '&';
            utf7mode = 1;
        }
        /* loop to split ucs4 into two utf16 chars if necessary */
        do {
            if (ucs4 >= UTF16BASE) {
                ucs4 -= UTF16BASE;
                bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT)
                                           + UTF16HIGHSTART);
                ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
                utf16flag = 1;
            } else {
                bitbuf = (bitbuf << 16) | ucs4;
                utf16flag = 0;
            }
            bitstogo += 16;
            /* spew out base64 */
            while (bitstogo >= 6) {
                bitstogo -= 6;
                *dst++ = base64chars[(bitbuf >> bitstogo) & 0x3F];
            }
        } while (utf16flag);
    }
    /* if in UTF-7 mode, finish in ASCII */
    if (utf7mode) {
        dst = close_utf7_run(dst, bitbuf, bitstogo);
    }
    /* tie off string */
    *dst = '\0';
}
/* Round-trip test driver.
 * Takes a modified UTF-7 mailbox name as the only argument, converts it to
 * a hex-coded UTF-8 URL path with MailboxToURL() and back again with
 * URLtoMailbox(), printing the string at each stage.
 * Returns 0 on success, 1 when the argument is missing or too long.
 */
int main(int argc, char *argv[])
{
    char in[OUTSIZE];
    /* MailboxToURL() asks for roughly 4 times the space of its input */
    char out[4 * OUTSIZE];
    /* URLtoMailbox() asks for about twice the space of its input */
    char back[8 * OUTSIZE];

    if (argc < 2) {
        fprintf(stderr, "usage: utf7test <modified-UTF-7 mailbox name>\n");
        return 1;
    }
    if (strlen(argv[1]) >= sizeof (in)) {
        fprintf(stderr, "mailbox name too long (limit is %d octets)\n",
                OUTSIZE - 1);
        return 1;
    }
    strcpy(in, argv[1]);        /* length checked above */
    printf("in: %s\n", in);
    MailboxToURL(out, in);
    printf("out: %s\n", out);
    URLtoMailbox(back, out);
    printf("in: %s\n", back);
    return 0;
}