git.hungrycats.org Git - xscreensaver/blob - utils/utf8wc.c

   1 /* xscreensaver, Copyright (c) 2014 Jamie Zawinski <jwz@jwz.org>
   2  *
   3  * Permission to use, copy, modify, distribute, and sell this software and its
   4  * documentation for any purpose is hereby granted without fee, provided that
   5  * the above copyright notice appear in all copies and that both that
   6  * copyright notice and this permission notice appear in supporting
   7  * documentation.  No representations are made about the suitability of this
   8  * software for any purpose.  It is provided "as is" without express or
   9  * implied warranty.
  10  */
  11
  12 #ifdef HAVE_CONFIG_H
  13 # include "config.h"
  14 #endif
  15
  16 #include <stdlib.h>
  17 #include <stdio.h>
  18 #include <string.h>
  19
  20 #ifdef HAVE_COCOA
  21 # include "jwxyz.h"
  22 # elif defined(HAVE_ANDROID)
  23 # include "jwxyz.h"
  24 #else /* !HAVE_COCOA */
  25 # include <X11/Xlib.h>
  26 #endif
  27
  28 #include "utf8wc.h"
  29
  30
  31 /* "Unicode Replacement Character", displayed in lieu of invalid characters. */
  32 # define INVALID 0xFFFD
  33
  34
  35 /* Mask the number to be within the valid range of unicode characters.
  36  */
  37 static unsigned long
  38 uc_truncate (unsigned long uc)
  39 {
  40   uc &= 0x7FFFFFFFL;                    /* Unicode is 31 bits */
  41   if (uc > 0x10FFFF) uc = INVALID;      /* But UTF-8 is 4 bytes */
  42   if (uc == 0) uc = INVALID;            /* no nulls */
  43
  44   if (uc >= 0xD800 && uc <= 0xDFFF)
  45     /* Reserved for use with UTF-16: not a real character. */
  46     uc = INVALID;
  47
  48   return uc;
  49 }
  50
  51
  52 /* Parse the first UTF8 character at the front of the string.
  53    Return the Unicode character, and the number of bytes read.
  54  */
  55 static long
  56 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
  57 {
  58   const unsigned char *start = in;
  59   const unsigned char *end = in + length;
  60   unsigned long uc = INVALID;
  61   unsigned long min = 0;
  62   unsigned char c;
  63
  64   if (length <= 0) goto DONE;
  65
  66   c = *in++;
  67
  68 # define PREMATURE_EOF { in = end; goto DONE; }
  69
  70   if ((c & 0xC0) == 0x80) {        /* 10xxxxxx - lonely continuation byte */
  71     uc = INVALID;
  72
  73   } else if ((c & 0x80) == 0) {    /* 0xxxxxxx - 7 bits in 1 byte */
  74     uc = (c & 0x7F);               /* 01111111 */
  75
  76   } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
  77     if (in+1 > end) PREMATURE_EOF;
  78     min = 1 << 7;
  79     uc = (((c    & 0x1F) << 6) |   /* 00011111------ */
  80           (in[0] & 0x3F));         /*       00111111 */
  81     in += 1;
  82
  83   } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
  84     if (in+2 > end) PREMATURE_EOF;
  85     min = 1 << 11;
  86     uc = (((c     & 0x0F) << 12) | /* 00001111----+------- */
  87           ((in[0] & 0x3F) <<  6) | /*       00111111------ */
  88           ((in[1] & 0x3F)));       /*             00111111 */
  89     in += 2;
  90
  91   } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
  92     if (in+3 > end) PREMATURE_EOF;
  93     min = 1 << 16;
  94     uc = (((c     & 0x07) << 18) | /* 00000111--+-------+------- */
  95           ((in[0] & 0x3F) << 12) | /*       01111111----+------- */
  96           ((in[1] & 0x3F) <<  6) | /*             00111111------ */
  97           ((in[2] & 0x3F)));       /*                   00111111 */
  98     in += 3;
  99
 100   } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
 101     if (in+4 > end) PREMATURE_EOF;
 102     min = 1 << 21;
 103     uc = (((c     & 0x03) << 24) | /* 00000011--------+-------+------- */
 104           ((in[0] & 0x3F) << 18) | /*       00111111--+-------+------- */
 105           ((in[1] & 0x3F) << 12) | /*             00111111----+------- */
 106           ((in[2] & 0x3F) << 6)  | /*                   00111111------ */
 107           ((in[3] & 0x3F)));       /*                         00111111 */
 108     in += 4;
 109
 110   } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
 111     if (in+5 > end) PREMATURE_EOF;
 112     min = 1 << 26;
 113     uc = (((c     & 0x01) << 30) | /* 00000001------+-------+-------+------- */
 114           ((in[0] & 0x3F) << 24) | /*       00111111+-------+-------+------- */
 115           ((in[1] & 0x3F) << 18) | /*             00111111--+-------+------- */
 116           ((in[2] & 0x3F) << 12) | /*                   00111111----+------- */
 117           ((in[3] & 0x3F) << 6)  | /*                         00111111------ */
 118           ((in[4] & 0x3F)));       /*                               00111111 */
 119     in += 5;
 120   } else {
 121     uc = INVALID;                  /* Unparsable sequence. */
 122   }
 123
 124  DONE:
 125
 126   length = in - start;
 127
 128   /* If any of the continuation bytes didn't begin with the continuation tag,
 129      the sequence is invalid; stop at the bad byte, not consuming later ones.
 130      (It's easier to check this after the fact than up above.) */
 131   {
 132     int i;
 133     for (i = 1; i < length; i++)
 134       if ((start[i] & 0xC0) != 0x80) {
 135         uc = INVALID;
 136         length = i+1;
 137         break;
 138       }
 139   }
 140
 141   if (uc < min)
 142     /* A multi-byte sequence encoded a character that could have been
 143        encoded with a shorter sequence, e.g., hiding ASCII inside a
 144        multi-byte sequence. Something hinky's going on. Reject it. */
 145     uc = INVALID;
 146
 147   uc = uc_truncate (uc);
 148
 149   if (unicode_ret)
 150     *unicode_ret = uc;
 151
 152   return length;
 153 }
 154
 155
 156 /* Converts a Unicode character to a multi-byte UTF8 sequence.
 157    Returns the number of bytes written.
 158  */
 159 static int
 160 utf8_encode (unsigned long uc, char *out, long length)
 161 {
 162   const char *old = out;
 163
 164   uc = uc_truncate (uc);
 165
 166   if (uc < 0x80 && length >= 1)                 /* 7 bits in 1 byte */
 167     {
 168       *out++ = uc;                              /* 0xxxxxxx */
 169     }
 170   else if (uc < 0x800 && length >= 2)           /* 11 bits in 2 bytes */
 171     {
 172       *out++ = (0xC0 | ((uc >> 6)  & 0x1F));    /* 110xxxxx */
 173       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 174     }
 175   else if (uc < 0x10000L && length >= 3)        /* 16 bits in 3 bytes */
 176     {
 177       *out++ = (0xE0 | ((uc >> 12) & 0x0F));    /* 1110xxxx */
 178       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 179       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 180     }
 181   else if (uc < 0x200000L && length >= 4)       /* 21 bits in 4 bytes */
 182     {
 183       *out++ = (0xF0 | ((uc >> 18) & 0x07));    /* 11110xxx */
 184       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 185       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 186       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 187     }
 188   else if (uc < 0x4000000L && length >= 5)      /* 26 bits in 5 bytes */
 189     {
 190       *out++ = (0xF8 | ((uc >> 24) & 0x03));    /* 111110xx */
 191       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 192       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 193       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 194       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 195     }
 196   else if (length >= 6)                         /* 31 bits in 6 bytes */
 197     {
 198       *out++ = (0xFC | ((uc >> 30) & 0x01));    /* 1111110x */
 199       *out++ = (0x80 | ((uc >> 24) & 0x3F));    /* 10xxxxxx */
 200       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 201       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 202       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 203       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 204     }
 205
 206   return (int) (out - old);
 207 }
 208
 209
 210 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
 211    This only handles characters that can be represented in 16 bits, the
 212    Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
 213  */
 214 XChar2b *
 215 utf8_to_XChar2b (const char *string, int *length_ret)
 216 {
 217   long in_len = strlen(string);
 218   const unsigned char *in = (const unsigned char *) string;
 219   const unsigned char *in_end = in + in_len;
 220   XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
 221   XChar2b *out = c2b;
 222   if (! out) return 0;
 223
 224   while (in < in_end)
 225     {
 226       unsigned long uc = 0;
 227       long L = utf8_decode (in, in_end - in, &uc);
 228       in += L;
 229
 230       /* If it can't be represented in a 16-bit XChar2b,
 231          use "Unicode Replacement Character". */
 232       if (uc > 0xFFFF) uc = INVALID;
 233
 234       out->byte1 = (uc >> 8) & 0xFF;
 235       out->byte2 = uc & 0xFF;
 236       out++;
 237     }
 238
 239   out->byte1 = 0;
 240   out->byte2 = 0;
 241
 242   /* shrink */
 243   c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
 244
 245   if (length_ret)
 246     *length_ret = (int) (out - c2b);
 247
 248   return c2b;
 249 }
 250
 251
 252 /* Split a UTF8 string into an array of strings, one per character.
 253    The sub-strings will be null terminated and may be multiple bytes.
 254  */
 255 char **
 256 utf8_split (const char *string, int *length_ret)
 257 {
 258   const unsigned char *in = (const unsigned char *) string;
 259   long len = strlen (string);
 260   const unsigned char *end = in + len;
 261   char **ret = (char **) malloc ((len+1) * sizeof(*ret));
 262   int i = 0;
 263   if (!ret) return 0;
 264
 265   while (in < end)
 266     {
 267       long len2 = utf8_decode (in, len, 0);
 268       char tmp[10];
 269       strncpy (tmp, (char *) in, len2);
 270       tmp[len2] = 0;
 271       ret[i++] = strdup (tmp);
 272       in += len2;
 273     }
 274   ret[i] = 0;
 275
 276   /* shrink */
 277   ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
 278
 279   if (length_ret)
 280     *length_ret = i;
 281
 282   return ret;
 283 }
 284
 285
 286 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
 287  */
 288 char *
 289 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
 290 {
 291   int in_len = 0;
 292   const XChar2b *in_end;
 293   int out_len;
 294   char *utf8, *out;
 295   const char *out_end;
 296
 297   /* Find the null termination on the XChar2b. */
 298   for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
 299     ;
 300
 301   out_len = (in_len + 1) * 3;              /* 16 bit chars = 3 bytes max */
 302   utf8 = out = (char *) malloc (out_len + 1);
 303   if (! out) return 0;
 304   out_end = out + out_len;
 305
 306   while (in < in_end)
 307     {
 308       unsigned long uc = (in->byte1 << 8) | in->byte2;
 309       int wrote = utf8_encode (uc, out, out_end - out);
 310       if (wrote > 3) abort();  /* Can't happen with 16 bit input */
 311       out += wrote;
 312       in++;
 313     }
 314   *out = 0;
 315
 316   /* shrink */
 317   utf8 = (char *) realloc (utf8, (out - utf8 + 1) * sizeof(*utf8));
 318
 319   if (length_ret)
 320     *length_ret = (int) (out - utf8);
 321
 322   return utf8;
 323 }
 324
 325
 326 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
 327  */
 328 char *
 329 utf8_to_latin1 (const char *string, Bool ascii_p)
 330 {
 331   long in_len = strlen(string);
 332   const unsigned char *in = (const unsigned char *) string;
 333   const unsigned char *in_end = in + in_len;
 334   unsigned char *ret = (unsigned char *) malloc (in_len + 1);
 335   unsigned char *out = ret;
 336
 337   if (! ret) return 0;
 338
 339   while (in < in_end)
 340     {
 341       unsigned long uc = 0;
 342       long len2 = utf8_decode (in, in_end - in, &uc);
 343       in += len2;
 344
 345       if (uc == '\240') /* &nbsp; */
 346         uc = ' ';
 347       else if (uc >= 0x2300 && uc <= 0x36F)
 348         uc = 0;         /* Discard "Unicode Combining Diacriticals Block" */
 349       else if (uc > 0xFF)
 350         switch (uc) {
 351
 352         /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
 353
 354         case 0x2000:    /* EN QUAD */
 355         case 0x2001:    /* EM QUAD */
 356         case 0x2002:    /* EN SPACE */
 357         case 0x2003:    /* EM SPACE */
 358         case 0x2004:    /* THREE-PER-EM SPACE */
 359         case 0x2005:    /* FOUR-PER-EM SPACE */
 360         case 0x2006:    /* SIX-PER-EM SPACE */
 361         case 0x2007:    /* FIGURE SPACE */
 362         case 0x2008:    /* PUNCTUATION SPACE */
 363         case 0x2009:    /* THIN SPACE */
 364         case 0x200A:    /* HAIR SPACE */
 365           uc = ' ';
 366           break;
 367
 368         case 0x2010:    /* HYPHEN */
 369         case 0x2011:    /* NON-BREAKING HYPHEN */
 370         case 0x2012:    /* FIGURE DASH */
 371         case 0x2013:    /* EN DASH */
 372         case 0x2014:    /* EM DASH */
 373         case 0x2015:    /* HORIZONTAL BAR */
 374           uc = '-';
 375           break;
 376
 377         case 0x2018:    /* LEFT SINGLE QUOTATION MARK */
 378         case 0x2019:    /* SINGLE LOW-9 QUOTATION MARK */
 379         case 0x201A:    /* SINGLE LOW-9 QUOTATION MARK */
 380         case 0x201B:    /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
 381           uc = '\'';
 382           break;
 383
 384         case 0x201C:    /* LEFT DOUBLE QUOTATION MARK */
 385         case 0x201D:    /* RIGHT DOUBLE QUOTATION MARK */
 386         case 0x201E:    /* DOUBLE LOW-9 QUOTATION MARK */
 387         case 0x201F:    /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
 388           uc = '"';
 389           break;
 390
 391         case 0x2022: uc = '\267'; break; /* BULLET */
 392         case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
 393         case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
 394         case 0x202F: uc = ' ';    break; /* NARROW NO-BREAK SPACE */
 395         case 0x2038: uc = '^';    break; /* CARET */
 396         case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
 397         case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
 398         case 0x2041: uc = '^';    break; /* CARET INSERTION POINT */
 399         case 0x2042: uc = '*';    break; /* ASTERISM */
 400         case 0x2043: uc = '=';    break; /* HYPHEN BULLET */
 401         case 0x2044: uc = '/';    break; /* FRACTION SLASH */
 402         case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
 403         case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
 404         case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
 405         case 0x204E: uc = '*';    break; /* LOW ASTERISK */
 406         case 0x204F: uc = ';';    break; /* REVERSED SEMICOLON */
 407         default:
 408           break;
 409         }
 410
 411       if (uc > 0xFF)
 412         /* "Inverted question mark" looks enough like 0xFFFD,
 413            the "Unicode Replacement Character". */
 414         uc = (ascii_p ? '#' : '\277');
 415
 416       if (ascii_p)      /* Map Latin1 to the closest ASCII versions. */
 417         {
 418           const unsigned char latin1_to_ascii[96] =
 419              " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
 420              "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
 421              "aaaaaaeceeeeiiiionooooo/ouuuuypy";
 422           if (uc >= 0xA0)
 423             uc = latin1_to_ascii[uc - 0xA0];
 424         }
 425
 426       if (uc > 0)
 427         *out++ = (unsigned char) uc;
 428     }
 429   *out = 0;
 430
 431   /* shrink */
 432   ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
 433
 434   return (char *) ret;
 435 }
 436
 437
 438 /*************************************************************************
 439
 440  cd ../hacks ; make test-utf8wc
 441
 442  *************************************************************************/
 443
 444 #ifdef SELFTEST
 445
 446 /* Convert a UTF8 string to Unicode and back again.
 447  */
 448 static char *
 449 split_and_join (const char *string)
 450 {
 451   const unsigned char *in = (const unsigned char *) string;
 452   int len = strlen (string);
 453   const unsigned char *end = in + len;
 454   unsigned long *unicode = (unsigned long *)
 455     malloc((len + 1) * sizeof(*unicode));
 456   int i = 0;
 457   char *ret, *out, *out_end;
 458
 459   while (in < end)
 460     {
 461       long len2 = utf8_decode (in, len, &unicode[i]);
 462       i++;
 463       in += len2;
 464     }
 465   unicode[i] = 0;
 466
 467   i = i*6 + 1;
 468   out = ret = (char *) malloc(i);
 469   out_end = out + i;
 470   i = 0;
 471   while (unicode[i])
 472     {
 473       int len2 = utf8_encode (unicode[i], out, out_end - out);
 474       out += len2;
 475       i++;
 476     }
 477   *out = 0;
 478   free (unicode);
 479
 480   return ret;
 481 }
 482
 483
 484 static void
 485 LOG (FILE *out, const char *prefix, const char *s)
 486 {
 487   fprintf (out, "%6s: \"", prefix);
 488   while (*s)
 489     {
 490       unsigned char c = *s;
 491       if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
 492       else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
 493       else fprintf (out, "%c", c);
 494       s++;
 495     }
 496   fprintf (out, "\"\n");
 497 }
 498
 499
 500 int
 501 main (int argc, char **argv)
 502 {
 503   /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 504    */
 505
 506 #  define URC "\357\277\275"   /* 0xFFFD, "Unicode Replacement Character" */
 507
 508   static const struct { const char *name, *in, *target, *target2; } tests[] = {
 509     /* 1  Some correct UTF-8 text */
 510
 511     /* The Greek word 'kosme': */
 512     { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
 513
 514
 515     /* 2  Boundary condition test cases */
 516
 517     /* 2.1  First possible sequence of a certain length */
 518
 519     { "2.1.1", /*  1 byte  (U-00000000): */  "\000" },
 520     { "2.1.2", /*  2 bytes (U-00000080): */  "\302\200" },
 521     { "2.1.3", /*  3 bytes (U-00000800): */  "\340\240\200" },
 522     { "2.1.4", /*  4 bytes (U-00010000): */  "\360\220\200\200", 0, URC },
 523     { "2.1.5", /*  5 bytes (U-00200000): */  "\370\210\200\200\200", URC },
 524     { "2.1.6", /*  6 bytes (U-04000000): */  "\374\204\200\200\200\200", URC },
 525
 526     /* 2.2  Last possible sequence of a certain length */
 527
 528     { "2.2.1", /*  1 byte  (U-0000007F): */  "\177" },
 529     { "2.2.2", /*  2 bytes (U-000007FF): */  "\337\277" },
 530     { "2.2.3", /*  3 bytes (U-0000FFFF): */  "\357\277\277" },
 531     { "2.2.4", /*  4 bytes (U-001FFFFF): */  "\367\277\277\277", URC },
 532     { "2.2.5", /*  5 bytes (U-03FFFFFF): */  "\373\277\277\277\277", URC },
 533     { "2.2.6", /*  6 bytes (U-7FFFFFFF): */  "\375\277\277\277\277\277", URC },
 534
 535     /* 2.3  Other boundary conditions */
 536
 537     { "2.3.1", /*  U-0000D7FF = ed 9f bf = */    "\355\237\277" },
 538     { "2.3.2", /*  U-0000E000 = ee 80 80 = */    "\356\200\200" },
 539     { "2.3.3", /*  U-0000FFFD = ef bf bd = */    URC },
 540     { "2.3.4", /*  U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
 541     { "2.3.5", /*  U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
 542
 543
 544     /* 3  Malformed sequences */
 545
 546     /* 3.1  Unexpected continuation bytes */
 547
 548     /* Each unexpected continuation byte should be separately signalled as a
 549        malformed sequence of its own. */
 550
 551     { "3.1.1", /*  First continuation byte 0x80: */ "\200", URC },
 552     { "3.1.2", /*  Last  continuation byte 0xbf: */ "\277", URC },
 553     { "3.1.3", /*  2 continuation bytes: */ "\200\277",     URC URC },
 554     { "3.1.4", /*  3 continuation bytes: */ "\200\277\200", URC URC URC },
 555     { "3.1.5", /*  4 continuation bytes: */ "\200\277\200\277",
 556       URC URC URC URC },
 557     { "3.1.6", /*  5 continuation bytes: */ "\200\277\200\277\200",
 558       URC URC URC URC URC },
 559     { "3.1.7", /*  6 continuation bytes: */ "\200\277\200\277\200\277",
 560       URC URC URC URC URC URC },
 561     { "3.1.8", /*  7 continuation bytes: */ "\200\277\200\277\200\277\200",
 562       URC URC URC URC URC URC URC },
 563
 564     { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
 565
 566       "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
 567       "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
 568       "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
 569       "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
 570       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 571       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 572       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 573       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 574
 575     /* 3.2  Lonely start characters */
 576
 577     { "3.2.1", /*  All 32 first bytes of 2-byte sequences (0xc0-0xdf),
 578                    each followed by a space character: */
 579
 580       "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
 581       "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
 582       "\332 \333 \334 \335 \336 \337 ",
 583       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 584       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 585
 586     { "3.2.2", /*  All 16 first bytes of 3-byte sequences (0xe0-0xef),
 587                    each followed by a space character: */
 588       "\340 \341 \342 \343 \344 \345 \346 \347 "
 589       "\350 \351 \352 \353 \354 \355 \356 \357 ",
 590       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 591
 592     { "3.2.3", /*  All 8 first bytes of 4-byte sequences (0xf0-0xf7),
 593                    each followed by a space character: */
 594       URC URC URC URC URC URC URC URC },
 595
 596     { "3.2.4", /*  All 4 first bytes of 5-byte sequences (0xf8-0xfb),
 597                    each followed by a space character: */
 598       "\370 \371 \372 \373 ",
 599       URC URC URC URC },
 600
 601     { "3.2.5", /*  All 2 first bytes of 6-byte sequences (0xfc-0xfd),
 602                    each followed by a space character: */
 603       "\374 \375 ", URC URC },
 604
 605     /* 3.3  Sequences with last continuation byte missing */
 606
 607     /* All bytes of an incomplete sequence should be signalled as a single
 608        malformed sequence, i.e., you should see only a single replacement
 609        character in each of the next 10 tests. (Characters as in section 2) */
 610
 611     { "3.3.1", /*  2-byte sequence with last byte missing (U+0000): */
 612       "\300", URC },
 613     { "3.3.2", /*  3-byte sequence with last byte missing (U+0000): */
 614       "\340\200", URC },
 615     { "3.3.3", /*  4-byte sequence with last byte missing (U+0000): */
 616       "\360\200\200", URC },
 617     { "3.3.4", /*  5-byte sequence with last byte missing (U+0000): */
 618       "\370\200\200\200", URC },
 619     { "3.3.5", /*  6-byte sequence with last byte missing (U+0000): */
 620       "\374\200\200\200\200", URC },
 621     { "3.3.6", /*  2-byte sequence with last byte missing (U-000007FF): */
 622       "\337", URC },
 623     { "3.3.7", /*  3-byte sequence with last byte missing (U-0000FFFF): */
 624       "\357\277", URC },
 625     { "3.3.8", /*  4-byte sequence with last byte missing (U-001FFFFF): */
 626       "\367\277\277", URC },
 627     { "3.3.9", /*  5-byte sequence with last byte missing (U-03FFFFFF): */
 628       "\373\277\277\277", URC },
 629     { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
 630       "\375\277\277\277\277", URC },
 631
 632     /* 3.4  Concatenation of incomplete sequences */
 633
 634     /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
 635        sequences being signalled: */
 636
 637     { "3.4",   "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
 638       "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
 639       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 640
 641     /* 3.5  Impossible bytes */
 642
 643     /* The following two bytes cannot appear in a correct UTF-8 string */
 644
 645     { "3.5.1", /*  fe = */      "\376", URC },
 646     { "3.5.2", /*  ff = */      "\377", URC },
 647     { "3.5.3", /*  fe fe ff ff = */     "\376\376\377\377", URC URC URC URC },
 648
 649
 650     /* 4  Overlong sequences */
 651
 652     /* 4.1  Examples of an overlong ASCII character */
 653
 654     { "4.1.1", /* U+002F = c0 af             = */ "\300\257", URC },
 655     { "4.1.2", /* U+002F = e0 80 af          = */ "\340\200\257", URC },
 656     { "4.1.3", /* U+002F = f0 80 80 af       = */ "\360\200\200\257", URC },
 657     { "4.1.4", /* U+002F = f8 80 80 80 af    = */ "\370\200\200\200\257",
 658       URC },
 659     { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
 660       URC },
 661
 662     /* 4.2  Maximum overlong sequences */
 663
 664     { "4.2.1", /*  U-0000007F = c1 bf             = */ "\301\277", URC },
 665     { "4.2.2", /*  U-000007FF = e0 9f bf          = */ "\340\237\277", URC },
 666     { "4.2.3", /*  U-0000FFFF = f0 8f bf bf       = */ "\360\217\277\277",
 667       URC },
 668     { "4.2.4", /*  U-001FFFFF = f8 87 bf bf bf    = */ "\370\207\277\277\277",
 669       URC },
 670     { "4.2.5", /*  U-03FFFFFF = fc 83 bf bf bf bf = */  URC },
 671
 672     /* 4.3  Overlong representation of the NUL character */
 673
 674     { "4.3.1", /*  U+0000 = c0 80             = */  "\300\200", URC },
 675     { "4.3.2", /*  U+0000 = e0 80 80          = */  "\340\200\200", URC },
 676     { "4.3.3", /*  U+0000 = f0 80 80 80       = */  "\360\200\200\200", URC },
 677     { "4.3.4", /*  U+0000 = f8 80 80 80 80    = */  "\370\200\200\200\200",
 678       URC },
 679     { "4.3.5", /*  U+0000 = fc 80 80 80 80 80 = */  "\374\200\200\200\200\200",
 680       URC },
 681
 682
 683     /* 5  Illegal code positions */
 684
 685     /* 5.1 Single UTF-16 surrogates */
 686
 687     { "5.1.1", /*  U+D800 = ed a0 80 = */       "\355\240\200", URC },
 688     { "5.1.2", /*  U+DB7F = ed ad bf = */       "\355\255\277", URC },
 689     { "5.1.3", /*  U+DB80 = ed ae 80 = */       "\355\256\200", URC },
 690     { "5.1.4", /*  U+DBFF = ed af bf = */       "\355\257\277", URC },
 691     { "5.1.5", /*  U+DC00 = ed b0 80 = */       "\355\260\200", URC },
 692     { "5.1.6", /*  U+DF80 = ed be 80 = */       "\355\276\200", URC },
 693     { "5.1.7", /*  U+DFFF = ed bf bf = */       "\355\277\277", URC },
 694
 695     /* 5.2 Paired UTF-16 surrogates */
 696
 697     { "5.2.1", /*  U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
 698     { "5.2.2", /*  U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
 699     { "5.2.3", /*  U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
 700     { "5.2.4", /*  U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
 701     { "5.2.5", /*  U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
 702     { "5.2.6", /*  U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
 703     { "5.2.7", /*  U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
 704     { "5.2.8", /*  U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
 705
 706     /* 5.3 Other illegal code positions */
 707
 708     { "5.3.1", /*  U+FFFE = ef bf be = */       "\357\277\276" },
 709     { "5.3.2", /*  U+FFFF = ef bf bf = */       "\357\277\277" },
 710
 711
 712     /* 6 Some other junk */
 713
 714     { "6.0", "" },
 715     { "6.1", "\001\002\003\004\005 ABC" },
 716     { "6.2", /* every non-ASCII Latin1 character */
 717       "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
 718       "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
 719       "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
 720       "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
 721       "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
 722       "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
 723       "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
 724       "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
 725       "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
 726       "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
 727       "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
 728       "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
 729
 730     { "6.3", /* Christmas tree */
 731       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 732       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
 733       "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
 734       "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
 735       "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
 736       "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
 737       "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
 738       "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
 739       "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
 740       "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
 741       "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
 742       "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
 743       "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
 744       "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
 745       "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
 746       "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
 747
 748       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 749       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
 750       " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 751       "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
 752       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 753       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 754       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 755       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 756       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 757       URC URC URC URC URC URC URC URC URC URC URC URC },
 758   };
 759
 760   int i;
 761   int ok = 1;
 762   for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
 763     {
 764       const char *name = tests[i].name;
 765       const char *in   = tests[i].in;
 766       const char *target = (tests[i].target ? tests[i].target : in);
 767       const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
 768       char *out = split_and_join (in);
 769       XChar2b *out16 = utf8_to_XChar2b (in, 0);
 770       char *out2 = XChar2b_to_utf8 (out16, 0);
 771       if (strcmp (out, target))
 772         {
 773           LOG (stderr, name, target);
 774           LOG (stderr, "FAIL", out);
 775           fprintf (stderr, "\n");
 776           ok = 0;
 777         }
 778       if (strcmp (out2, target2))
 779         {
 780           LOG (stderr, name, target2);
 781           LOG (stderr, "FAIL2", out2);
 782           fprintf (stderr, "\n");
 783           ok = 0;
 784         }
 785       free (out);
 786       free (out2);
 787       free (out16);
 788     }
 789
 790   {
 791     const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
 792                         "c\303\264t\303\251 de l'alc\303\264ve "
 793                         "ovo\303\257de, o\303\271 les b\303\273ches "
 794                         "se consument dans l'\303\242tre");
 795     const char *latin1 = ("son \356le int\351rieure, \340 "
 796                           "c\364t\351 de l'alc\364ve ovo\357de, "
 797                           "o\371 les b\373ches se consument dans "
 798                           "l'\342tre");
 799     const char *ascii = ("son ile interieure, a cote de l'alcove "
 800                          "ovoide, ou les buches se consument dans "
 801                          "l'atre");
 802     char *latin1b = utf8_to_latin1 (utf8, False);
 803     char *ascii2  = utf8_to_latin1 (utf8, True);
 804     if (strcmp (latin1, latin1b))
 805       {
 806         LOG (stderr, "LATIN1", utf8);
 807         LOG (stderr, "FAIL3", latin1b);
 808         fprintf (stderr, "\n");
 809         ok = 0;
 810       }
 811     if (strcmp (ascii, ascii2))
 812       {
 813         LOG (stderr, "ASCII", utf8);
 814         LOG (stderr, "FAIL4", ascii2);
 815         fprintf (stderr, "\n");
 816         ok = 0;
 817       }
 818     free (latin1b);
 819     free (ascii2);
 820   }
 821
 822
 823   if (ok) fprintf (stderr, "OK\n");
 824   return (ok == 0);
 825 }
 826
 827 #endif /* SELFTEST */