git.hungrycats.org Git - xscreensaver/blob - utils/utf8wc.c

   1 /* xscreensaver, Copyright (c) 2014-2015 Jamie Zawinski <jwz@jwz.org>
   2  *
   3  * Permission to use, copy, modify, distribute, and sell this software and its
   4  * documentation for any purpose is hereby granted without fee, provided that
   5  * the above copyright notice appear in all copies and that both that
   6  * copyright notice and this permission notice appear in supporting
   7  * documentation.  No representations are made about the suitability of this
   8  * software for any purpose.  It is provided "as is" without express or
   9  * implied warranty.
  10  */
  11
  12 #ifdef HAVE_CONFIG_H
  13 # include "config.h"
  14 #endif
  15
  16 #include <stdlib.h>
  17 #include <stdio.h>
  18 #include <string.h>
  19
  20 #ifdef HAVE_COCOA
  21 # include "jwxyz.h"
  22 # elif defined(HAVE_ANDROID)
  23 # include "jwxyz.h"
  24 #else /* !HAVE_COCOA */
  25 # include <X11/Xlib.h>
  26 #endif
  27
  28 #include "utf8wc.h"
  29
  30
  31 /* "Unicode Replacement Character", displayed in lieu of invalid characters. */
  32 # define INVALID 0xFFFD
  33
  34
  35 /* Mask the number to be within the valid range of unicode characters.
  36  */
  37 static unsigned long
  38 uc_truncate (unsigned long uc)
  39 {
  40   uc &= 0x7FFFFFFFL;                    /* Unicode is 31 bits */
  41   if (uc > 0x10FFFF) uc = INVALID;      /* But UTF-8 is 4 bytes */
  42   if (uc == 0) uc = INVALID;            /* no nulls */
  43
  44   if (uc >= 0xD800 && uc <= 0xDFFF)
  45     /* Reserved for use with UTF-16: not a real character. */
  46     uc = INVALID;
  47
  48   return uc;
  49 }
  50
  51
  52 /* Parse the first UTF8 character at the front of the string.
  53    Return the Unicode character, and the number of bytes read.
  54  */
  55 long
  56 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
  57 {
  58   const unsigned char *start = in;
  59   const unsigned char *end = in + length;
  60   unsigned long uc = INVALID;
  61   unsigned long min = 0;
  62   unsigned char c;
  63
  64   if (length <= 0) goto DONE;
  65
  66   c = *in++;
  67
  68 # define PREMATURE_EOF { in = end; goto DONE; }
  69
  70   if ((c & 0xC0) == 0x80) {        /* 10xxxxxx - lonely continuation byte */
  71     uc = INVALID;
  72
  73   } else if ((c & 0x80) == 0) {    /* 0xxxxxxx - 7 bits in 1 byte */
  74     uc = (c & 0x7F);               /* 01111111 */
  75
  76   } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
  77     if (in+1 > end) PREMATURE_EOF;
  78     min = 1 << 7;
  79     uc = (((c    & 0x1F) << 6) |   /* 00011111------ */
  80           (in[0] & 0x3F));         /*       00111111 */
  81     in += 1;
  82
  83   } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
  84     if (in+2 > end) PREMATURE_EOF;
  85     min = 1 << 11;
  86     uc = (((c     & 0x0F) << 12) | /* 00001111----+------- */
  87           ((in[0] & 0x3F) <<  6) | /*       00111111------ */
  88           ((in[1] & 0x3F)));       /*             00111111 */
  89     in += 2;
  90
  91   } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
  92     if (in+3 > end) PREMATURE_EOF;
  93     min = 1 << 16;
  94     uc = (((c     & 0x07) << 18) | /* 00000111--+-------+------- */
  95           ((in[0] & 0x3F) << 12) | /*       01111111----+------- */
  96           ((in[1] & 0x3F) <<  6) | /*             00111111------ */
  97           ((in[2] & 0x3F)));       /*                   00111111 */
  98     in += 3;
  99
 100   } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
 101     if (in+4 > end) PREMATURE_EOF;
 102     min = 1 << 21;
 103     uc = (((c     & 0x03) << 24) | /* 00000011--------+-------+------- */
 104           ((in[0] & 0x3F) << 18) | /*       00111111--+-------+------- */
 105           ((in[1] & 0x3F) << 12) | /*             00111111----+------- */
 106           ((in[2] & 0x3F) << 6)  | /*                   00111111------ */
 107           ((in[3] & 0x3F)));       /*                         00111111 */
 108     in += 4;
 109
 110   } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
 111     if (in+5 > end) PREMATURE_EOF;
 112     min = 1 << 26;
 113     uc = (((c     & 0x01) << 30) | /* 00000001------+-------+-------+------- */
 114           ((in[0] & 0x3F) << 24) | /*       00111111+-------+-------+------- */
 115           ((in[1] & 0x3F) << 18) | /*             00111111--+-------+------- */
 116           ((in[2] & 0x3F) << 12) | /*                   00111111----+------- */
 117           ((in[3] & 0x3F) << 6)  | /*                         00111111------ */
 118           ((in[4] & 0x3F)));       /*                               00111111 */
 119     in += 5;
 120   } else {
 121     uc = INVALID;                  /* Unparsable sequence. */
 122   }
 123
 124  DONE:
 125
 126   length = in - start;
 127
 128   /* If any of the continuation bytes didn't begin with the continuation tag,
 129      the sequence is invalid; stop at the bad byte, not consuming later ones.
 130      (It's easier to check this after the fact than up above.) */
 131   {
 132     int i;
 133     for (i = 1; i < length; i++)
 134       if ((start[i] & 0xC0) != 0x80) {
 135         uc = INVALID;
 136         length = i+1;
 137         break;
 138       }
 139   }
 140
 141   if (uc < min)
 142     /* A multi-byte sequence encoded a character that could have been
 143        encoded with a shorter sequence, e.g., hiding ASCII inside a
 144        multi-byte sequence. Something hinky's going on. Reject it. */
 145     uc = INVALID;
 146
 147   uc = uc_truncate (uc);
 148
 149   if (unicode_ret)
 150     *unicode_ret = uc;
 151
 152   return length;
 153 }
 154
 155
 156 /* Converts a Unicode character to a multi-byte UTF8 sequence.
 157    Returns the number of bytes written.
 158  */
 159 int
 160 utf8_encode (unsigned long uc, char *out, long length)
 161 {
 162   const char *old = out;
 163
 164   uc = uc_truncate (uc);
 165
 166   if (uc < 0x80 && length >= 1)                 /* 7 bits in 1 byte */
 167     {
 168       *out++ = uc;                              /* 0xxxxxxx */
 169     }
 170   else if (uc < 0x800 && length >= 2)           /* 11 bits in 2 bytes */
 171     {
 172       *out++ = (0xC0 | ((uc >> 6)  & 0x1F));    /* 110xxxxx */
 173       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 174     }
 175   else if (uc < 0x10000L && length >= 3)        /* 16 bits in 3 bytes */
 176     {
 177       *out++ = (0xE0 | ((uc >> 12) & 0x0F));    /* 1110xxxx */
 178       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 179       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 180     }
 181   else if (uc < 0x200000L && length >= 4)       /* 21 bits in 4 bytes */
 182     {
 183       *out++ = (0xF0 | ((uc >> 18) & 0x07));    /* 11110xxx */
 184       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 185       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 186       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 187     }
 188   else if (uc < 0x4000000L && length >= 5)      /* 26 bits in 5 bytes */
 189     {
 190       *out++ = (0xF8 | ((uc >> 24) & 0x03));    /* 111110xx */
 191       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 192       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 193       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 194       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 195     }
 196   else if (length >= 6)                         /* 31 bits in 6 bytes */
 197     {
 198       *out++ = (0xFC | ((uc >> 30) & 0x01));    /* 1111110x */
 199       *out++ = (0x80 | ((uc >> 24) & 0x3F));    /* 10xxxxxx */
 200       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 201       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 202       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 203       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 204     }
 205
 206   return (int) (out - old);
 207 }
 208
 209
 210 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
 211    This only handles characters that can be represented in 16 bits, the
 212    Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
 213  */
 214 XChar2b *
 215 utf8_to_XChar2b (const char *string, int *length_ret)
 216 {
 217   long in_len = strlen(string);
 218   const unsigned char *in = (const unsigned char *) string;
 219   const unsigned char *in_end = in + in_len;
 220   XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
 221   XChar2b *out = c2b;
 222   if (! out) return 0;
 223
 224   while (in < in_end)
 225     {
 226       unsigned long uc = 0;
 227       long L = utf8_decode (in, in_end - in, &uc);
 228       in += L;
 229
 230       /* If it can't be represented in a 16-bit XChar2b,
 231          use "Unicode Replacement Character". */
 232       if (uc > 0xFFFF) uc = INVALID;
 233
 234       out->byte1 = (uc >> 8) & 0xFF;
 235       out->byte2 = uc & 0xFF;
 236       out++;
 237     }
 238
 239   out->byte1 = 0;
 240   out->byte2 = 0;
 241
 242   /* shrink */
 243   c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
 244
 245   if (length_ret)
 246     *length_ret = (int) (out - c2b);
 247
 248   return c2b;
 249 }
 250
 251
 252 /* Split a UTF8 string into an array of strings, one per character.
 253    The sub-strings will be null terminated and may be multiple bytes.
 254  */
 255 char **
 256 utf8_split (const char *string, int *length_ret)
 257 {
 258   const unsigned char *in = (const unsigned char *) string;
 259   long len = strlen (string);
 260   const unsigned char *end = in + len;
 261   char **ret = (char **) malloc ((len+1) * sizeof(*ret));
 262   int i = 0;
 263   if (!ret) return 0;
 264
 265   while (in < end)
 266     {
 267       unsigned long uc;
 268       long len2 = utf8_decode (in, len, &uc);
 269       char tmp[10];
 270       strncpy (tmp, (char *) in, len2);
 271       tmp[len2] = 0;
 272       ret[i++] = strdup (tmp);
 273       in += len2;
 274
 275       /* If this is a Combining Diacritical, append it to the previous
 276          character. E.g., "y\314\206\314\206" is one string, not three.
 277        */
 278       if (i > 1 && uc >= 0x300 && uc <= 0x36F)
 279         {
 280           long L1 = strlen(ret[i-2]);
 281           long L2 = strlen(ret[i-1]);
 282           char *s2 = (char *) malloc (L1 + L2 + 1);
 283           strncpy (s2,      ret[i-2], L1);
 284           strncpy (s2 + L1, ret[i-1], L2);
 285           s2[L1 + L2] = 0;
 286           free (ret[i-2]);
 287           ret[i-2] = s2;
 288           i--;
 289         }
 290     }
 291   ret[i] = 0;
 292
 293   /* shrink */
 294   ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
 295
 296   if (length_ret)
 297     *length_ret = i;
 298
 299   return ret;
 300 }
 301
 302
 303 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
 304  */
 305 char *
 306 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
 307 {
 308   int in_len = 0;
 309   const XChar2b *in_end;
 310   int out_len;
 311   char *utf8, *out;
 312   const char *out_end;
 313
 314   /* Find the null termination on the XChar2b. */
 315   for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
 316     ;
 317
 318   out_len = (in_len + 1) * 3;              /* 16 bit chars = 3 bytes max */
 319   utf8 = out = (char *) malloc (out_len + 1);
 320   if (! out) return 0;
 321   out_end = out + out_len;
 322
 323   while (in < in_end)
 324     {
 325       unsigned long uc = (in->byte1 << 8) | in->byte2;
 326       int wrote = utf8_encode (uc, out, out_end - out);
 327       if (wrote > 3) abort();  /* Can't happen with 16 bit input */
 328       out += wrote;
 329       in++;
 330     }
 331   *out = 0;
 332
 333   /* shrink */
 334   out_len = (int) (out - utf8 + 1);
 335   utf8 = (char *) realloc (utf8, out_len);
 336
 337   if (length_ret)
 338     *length_ret = out_len;
 339
 340   return utf8;
 341 }
 342
 343
 344 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
 345  */
 346 char *
 347 utf8_to_latin1 (const char *string, Bool ascii_p)
 348 {
 349   long in_len = strlen(string);
 350   const unsigned char *in = (const unsigned char *) string;
 351   const unsigned char *in_end = in + in_len;
 352   unsigned char *ret = (unsigned char *) malloc (in_len + 1);
 353   unsigned char *out = ret;
 354
 355   if (! ret) return 0;
 356
 357   while (in < in_end)
 358     {
 359       unsigned long uc = 0;
 360       long len2 = utf8_decode (in, in_end - in, &uc);
 361       in += len2;
 362
 363       if (uc == '\240') /* &nbsp; */
 364         uc = ' ';
 365       else if (uc >= 0x300 && uc <= 0x36F)
 366         uc = 0;         /* Discard "Unicode Combining Diacriticals Block" */
 367       else if (uc > 0xFF)
 368         switch (uc) {
 369
 370         /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
 371
 372         case 0x2000:    /* EN QUAD */
 373         case 0x2001:    /* EM QUAD */
 374         case 0x2002:    /* EN SPACE */
 375         case 0x2003:    /* EM SPACE */
 376         case 0x2004:    /* THREE-PER-EM SPACE */
 377         case 0x2005:    /* FOUR-PER-EM SPACE */
 378         case 0x2006:    /* SIX-PER-EM SPACE */
 379         case 0x2007:    /* FIGURE SPACE */
 380         case 0x2008:    /* PUNCTUATION SPACE */
 381         case 0x2009:    /* THIN SPACE */
 382         case 0x200A:    /* HAIR SPACE */
 383           uc = ' ';
 384           break;
 385
 386         case 0x2010:    /* HYPHEN */
 387         case 0x2011:    /* NON-BREAKING HYPHEN */
 388         case 0x2012:    /* FIGURE DASH */
 389         case 0x2013:    /* EN DASH */
 390         case 0x2014:    /* EM DASH */
 391         case 0x2015:    /* HORIZONTAL BAR */
 392           uc = '-';
 393           break;
 394
 395         case 0x2018:    /* LEFT SINGLE QUOTATION MARK */
 396         case 0x2019:    /* SINGLE LOW-9 QUOTATION MARK */
 397         case 0x201A:    /* SINGLE LOW-9 QUOTATION MARK */
 398         case 0x201B:    /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
 399           uc = '\'';
 400           break;
 401
 402         case 0x201C:    /* LEFT DOUBLE QUOTATION MARK */
 403         case 0x201D:    /* RIGHT DOUBLE QUOTATION MARK */
 404         case 0x201E:    /* DOUBLE LOW-9 QUOTATION MARK */
 405         case 0x201F:    /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
 406           uc = '"';
 407           break;
 408
 409         case 0x2022: uc = '\267'; break; /* BULLET */
 410         case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
 411         case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
 412         case 0x202F: uc = ' ';    break; /* NARROW NO-BREAK SPACE */
 413         case 0x2038: uc = '^';    break; /* CARET */
 414         case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
 415         case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
 416         case 0x2041: uc = '^';    break; /* CARET INSERTION POINT */
 417         case 0x2042: uc = '*';    break; /* ASTERISM */
 418         case 0x2043: uc = '=';    break; /* HYPHEN BULLET */
 419         case 0x2044: uc = '/';    break; /* FRACTION SLASH */
 420         case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
 421         case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
 422         case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
 423         case 0x204E: uc = '*';    break; /* LOW ASTERISK */
 424         case 0x204F: uc = ';';    break; /* REVERSED SEMICOLON */
 425         default:
 426           break;
 427         }
 428
 429       if (uc > 0xFF)
 430         /* "Inverted question mark" looks enough like 0xFFFD,
 431            the "Unicode Replacement Character". */
 432         uc = (ascii_p ? '#' : '\277');
 433
 434       if (ascii_p)      /* Map Latin1 to the closest ASCII versions. */
 435         {
 436           const unsigned char latin1_to_ascii[96] =
 437              " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
 438              "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
 439              "aaaaaaeceeeeiiiionooooo/ouuuuypy";
 440           if (uc >= 0xA0)
 441             uc = latin1_to_ascii[uc - 0xA0];
 442         }
 443
 444       if (uc > 0)
 445         *out++ = (unsigned char) uc;
 446     }
 447   *out = 0;
 448
 449   /* shrink */
 450   ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
 451
 452   return (char *) ret;
 453 }
 454
 455
 456 /*************************************************************************
 457
 458  cd ../hacks ; make test-utf8wc
 459
 460  *************************************************************************/
 461
 462 #ifdef SELFTEST
 463
 464 /* Convert a UTF8 string to Unicode and back again.
 465  */
 466 static char *
 467 split_and_join (const char *string)
 468 {
 469   const unsigned char *in = (const unsigned char *) string;
 470   int len = strlen (string);
 471   const unsigned char *end = in + len;
 472   unsigned long *unicode = (unsigned long *)
 473     malloc((len + 1) * sizeof(*unicode));
 474   int i = 0;
 475   char *ret, *out, *out_end;
 476
 477   while (in < end)
 478     {
 479       long len2 = utf8_decode (in, len, &unicode[i]);
 480       i++;
 481       in += len2;
 482     }
 483   unicode[i] = 0;
 484
 485   i = i*6 + 1;
 486   out = ret = (char *) malloc(i);
 487   out_end = out + i;
 488   i = 0;
 489   while (unicode[i])
 490     {
 491       int len2 = utf8_encode (unicode[i], out, out_end - out);
 492       out += len2;
 493       i++;
 494     }
 495   *out = 0;
 496   free (unicode);
 497
 498   return ret;
 499 }
 500
 501
 502 static void
 503 LOG (FILE *out, const char *prefix, const char *s)
 504 {
 505   fprintf (out, "%6s: \"", prefix);
 506   while (*s)
 507     {
 508       unsigned char c = *s;
 509       if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
 510       else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
 511       else fprintf (out, "%c", c);
 512       s++;
 513     }
 514   fprintf (out, "\"\n");
 515 }
 516
 517
 518 int
 519 main (int argc, char **argv)
 520 {
 521   /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 522    */
 523
 524 #  define URC "\357\277\275"   /* 0xFFFD, "Unicode Replacement Character" */
 525
 526   static const struct { const char *name, *in, *target, *target2; } tests[] = {
 527     /* 1  Some correct UTF-8 text */
 528
 529     /* The Greek word 'kosme': */
 530     { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
 531
 532
 533     /* 2  Boundary condition test cases */
 534
 535     /* 2.1  First possible sequence of a certain length */
 536
 537     { "2.1.1", /*  1 byte  (U-00000000): */  "\000" },
 538     { "2.1.2", /*  2 bytes (U-00000080): */  "\302\200" },
 539     { "2.1.3", /*  3 bytes (U-00000800): */  "\340\240\200" },
 540     { "2.1.4", /*  4 bytes (U-00010000): */  "\360\220\200\200", 0, URC },
 541     { "2.1.5", /*  5 bytes (U-00200000): */  "\370\210\200\200\200", URC },
 542     { "2.1.6", /*  6 bytes (U-04000000): */  "\374\204\200\200\200\200", URC },
 543
 544     /* 2.2  Last possible sequence of a certain length */
 545
 546     { "2.2.1", /*  1 byte  (U-0000007F): */  "\177" },
 547     { "2.2.2", /*  2 bytes (U-000007FF): */  "\337\277" },
 548     { "2.2.3", /*  3 bytes (U-0000FFFF): */  "\357\277\277" },
 549     { "2.2.4", /*  4 bytes (U-001FFFFF): */  "\367\277\277\277", URC },
 550     { "2.2.5", /*  5 bytes (U-03FFFFFF): */  "\373\277\277\277\277", URC },
 551     { "2.2.6", /*  6 bytes (U-7FFFFFFF): */  "\375\277\277\277\277\277", URC },
 552
 553     /* 2.3  Other boundary conditions */
 554
 555     { "2.3.1", /*  U-0000D7FF = ed 9f bf = */    "\355\237\277" },
 556     { "2.3.2", /*  U-0000E000 = ee 80 80 = */    "\356\200\200" },
 557     { "2.3.3", /*  U-0000FFFD = ef bf bd = */    URC },
 558     { "2.3.4", /*  U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
 559     { "2.3.5", /*  U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
 560
 561
 562     /* 3  Malformed sequences */
 563
 564     /* 3.1  Unexpected continuation bytes */
 565
 566     /* Each unexpected continuation byte should be separately signalled as a
 567        malformed sequence of its own. */
 568
 569     { "3.1.1", /*  First continuation byte 0x80: */ "\200", URC },
 570     { "3.1.2", /*  Last  continuation byte 0xbf: */ "\277", URC },
 571     { "3.1.3", /*  2 continuation bytes: */ "\200\277",     URC URC },
 572     { "3.1.4", /*  3 continuation bytes: */ "\200\277\200", URC URC URC },
 573     { "3.1.5", /*  4 continuation bytes: */ "\200\277\200\277",
 574       URC URC URC URC },
 575     { "3.1.6", /*  5 continuation bytes: */ "\200\277\200\277\200",
 576       URC URC URC URC URC },
 577     { "3.1.7", /*  6 continuation bytes: */ "\200\277\200\277\200\277",
 578       URC URC URC URC URC URC },
 579     { "3.1.8", /*  7 continuation bytes: */ "\200\277\200\277\200\277\200",
 580       URC URC URC URC URC URC URC },
 581
 582     { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
 583
 584       "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
 585       "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
 586       "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
 587       "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
 588       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 589       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 590       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 591       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 592
 593     /* 3.2  Lonely start characters */
 594
 595     { "3.2.1", /*  All 32 first bytes of 2-byte sequences (0xc0-0xdf),
 596                    each followed by a space character: */
 597
 598       "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
 599       "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
 600       "\332 \333 \334 \335 \336 \337 ",
 601       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 602       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 603
 604     { "3.2.2", /*  All 16 first bytes of 3-byte sequences (0xe0-0xef),
 605                    each followed by a space character: */
 606       "\340 \341 \342 \343 \344 \345 \346 \347 "
 607       "\350 \351 \352 \353 \354 \355 \356 \357 ",
 608       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 609
 610     { "3.2.3", /*  All 8 first bytes of 4-byte sequences (0xf0-0xf7),
 611                    each followed by a space character: */
 612       URC URC URC URC URC URC URC URC },
 613
 614     { "3.2.4", /*  All 4 first bytes of 5-byte sequences (0xf8-0xfb),
 615                    each followed by a space character: */
 616       "\370 \371 \372 \373 ",
 617       URC URC URC URC },
 618
 619     { "3.2.5", /*  All 2 first bytes of 6-byte sequences (0xfc-0xfd),
 620                    each followed by a space character: */
 621       "\374 \375 ", URC URC },
 622
 623     /* 3.3  Sequences with last continuation byte missing */
 624
 625     /* All bytes of an incomplete sequence should be signalled as a single
 626        malformed sequence, i.e., you should see only a single replacement
 627        character in each of the next 10 tests. (Characters as in section 2) */
 628
 629     { "3.3.1", /*  2-byte sequence with last byte missing (U+0000): */
 630       "\300", URC },
 631     { "3.3.2", /*  3-byte sequence with last byte missing (U+0000): */
 632       "\340\200", URC },
 633     { "3.3.3", /*  4-byte sequence with last byte missing (U+0000): */
 634       "\360\200\200", URC },
 635     { "3.3.4", /*  5-byte sequence with last byte missing (U+0000): */
 636       "\370\200\200\200", URC },
 637     { "3.3.5", /*  6-byte sequence with last byte missing (U+0000): */
 638       "\374\200\200\200\200", URC },
 639     { "3.3.6", /*  2-byte sequence with last byte missing (U-000007FF): */
 640       "\337", URC },
 641     { "3.3.7", /*  3-byte sequence with last byte missing (U-0000FFFF): */
 642       "\357\277", URC },
 643     { "3.3.8", /*  4-byte sequence with last byte missing (U-001FFFFF): */
 644       "\367\277\277", URC },
 645     { "3.3.9", /*  5-byte sequence with last byte missing (U-03FFFFFF): */
 646       "\373\277\277\277", URC },
 647     { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
 648       "\375\277\277\277\277", URC },
 649
 650     /* 3.4  Concatenation of incomplete sequences */
 651
 652     /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
 653        sequences being signalled: */
 654
 655     { "3.4",   "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
 656       "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
 657       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 658
 659     /* 3.5  Impossible bytes */
 660
 661     /* The following two bytes cannot appear in a correct UTF-8 string */
 662
 663     { "3.5.1", /*  fe = */      "\376", URC },
 664     { "3.5.2", /*  ff = */      "\377", URC },
 665     { "3.5.3", /*  fe fe ff ff = */     "\376\376\377\377", URC URC URC URC },
 666
 667
 668     /* 4  Overlong sequences */
 669
 670     /* 4.1  Examples of an overlong ASCII character */
 671
 672     { "4.1.1", /* U+002F = c0 af             = */ "\300\257", URC },
 673     { "4.1.2", /* U+002F = e0 80 af          = */ "\340\200\257", URC },
 674     { "4.1.3", /* U+002F = f0 80 80 af       = */ "\360\200\200\257", URC },
 675     { "4.1.4", /* U+002F = f8 80 80 80 af    = */ "\370\200\200\200\257",
 676       URC },
 677     { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
 678       URC },
 679
 680     /* 4.2  Maximum overlong sequences */
 681
 682     { "4.2.1", /*  U-0000007F = c1 bf             = */ "\301\277", URC },
 683     { "4.2.2", /*  U-000007FF = e0 9f bf          = */ "\340\237\277", URC },
 684     { "4.2.3", /*  U-0000FFFF = f0 8f bf bf       = */ "\360\217\277\277",
 685       URC },
 686     { "4.2.4", /*  U-001FFFFF = f8 87 bf bf bf    = */ "\370\207\277\277\277",
 687       URC },
 688     { "4.2.5", /*  U-03FFFFFF = fc 83 bf bf bf bf = */  URC },
 689
 690     /* 4.3  Overlong representation of the NUL character */
 691
 692     { "4.3.1", /*  U+0000 = c0 80             = */  "\300\200", URC },
 693     { "4.3.2", /*  U+0000 = e0 80 80          = */  "\340\200\200", URC },
 694     { "4.3.3", /*  U+0000 = f0 80 80 80       = */  "\360\200\200\200", URC },
 695     { "4.3.4", /*  U+0000 = f8 80 80 80 80    = */  "\370\200\200\200\200",
 696       URC },
 697     { "4.3.5", /*  U+0000 = fc 80 80 80 80 80 = */  "\374\200\200\200\200\200",
 698       URC },
 699
 700
 701     /* 5  Illegal code positions */
 702
 703     /* 5.1 Single UTF-16 surrogates */
 704
 705     { "5.1.1", /*  U+D800 = ed a0 80 = */       "\355\240\200", URC },
 706     { "5.1.2", /*  U+DB7F = ed ad bf = */       "\355\255\277", URC },
 707     { "5.1.3", /*  U+DB80 = ed ae 80 = */       "\355\256\200", URC },
 708     { "5.1.4", /*  U+DBFF = ed af bf = */       "\355\257\277", URC },
 709     { "5.1.5", /*  U+DC00 = ed b0 80 = */       "\355\260\200", URC },
 710     { "5.1.6", /*  U+DF80 = ed be 80 = */       "\355\276\200", URC },
 711     { "5.1.7", /*  U+DFFF = ed bf bf = */       "\355\277\277", URC },
 712
 713     /* 5.2 Paired UTF-16 surrogates */
 714
 715     { "5.2.1", /*  U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
 716     { "5.2.2", /*  U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
 717     { "5.2.3", /*  U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
 718     { "5.2.4", /*  U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
 719     { "5.2.5", /*  U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
 720     { "5.2.6", /*  U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
 721     { "5.2.7", /*  U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
 722     { "5.2.8", /*  U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
 723
 724     /* 5.3 Other illegal code positions */
 725
 726     { "5.3.1", /*  U+FFFE = ef bf be = */       "\357\277\276" },
 727     { "5.3.2", /*  U+FFFF = ef bf bf = */       "\357\277\277" },
 728
 729
 730     /* 6 Some other junk */
 731
 732     { "6.0", "" },
 733     { "6.1", "\001\002\003\004\005 ABC" },
 734     { "6.2", /* every non-ASCII Latin1 character */
 735       "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
 736       "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
 737       "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
 738       "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
 739       "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
 740       "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
 741       "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
 742       "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
 743       "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
 744       "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
 745       "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
 746       "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
 747
 748     { "6.3", /* Christmas tree */
 749       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 750       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
 751       "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
 752       "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
 753       "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
 754       "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
 755       "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
 756       "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
 757       "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
 758       "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
 759       "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
 760       "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
 761       "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
 762       "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
 763       "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
 764       "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
 765
 766       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 767       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
 768       " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 769       "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
 770       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 771       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 772       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 773       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 774       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 775       URC URC URC URC URC URC URC URC URC URC URC URC },
 776   };
 777
 778   int i;
 779   int ok = 1;
 780   for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
 781     {
 782       const char *name = tests[i].name;
 783       const char *in   = tests[i].in;
 784       const char *target = (tests[i].target ? tests[i].target : in);
 785       const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
 786       char *out = split_and_join (in);
 787       XChar2b *out16 = utf8_to_XChar2b (in, 0);
 788       char *out2 = XChar2b_to_utf8 (out16, 0);
 789       if (strcmp (out, target))
 790         {
 791           LOG (stderr, name, target);
 792           LOG (stderr, "FAIL", out);
 793           fprintf (stderr, "\n");
 794           ok = 0;
 795         }
 796       if (strcmp (out2, target2))
 797         {
 798           LOG (stderr, name, target2);
 799           LOG (stderr, "FAIL2", out2);
 800           fprintf (stderr, "\n");
 801           ok = 0;
 802         }
 803       free (out);
 804       free (out2);
 805       free (out16);
 806     }
 807
 808   {
 809     const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
 810                         "c\303\264t\303\251 de l'alc\303\264ve "
 811                         "ovo\303\257de, o\303\271 les b\303\273ches "
 812                         "se consument dans l'\303\242tre");
 813     const char *latin1 = ("son \356le int\351rieure, \340 "
 814                           "c\364t\351 de l'alc\364ve ovo\357de, "
 815                           "o\371 les b\373ches se consument dans "
 816                           "l'\342tre");
 817     const char *ascii = ("son ile interieure, a cote de l'alcove "
 818                          "ovoide, ou les buches se consument dans "
 819                          "l'atre");
 820     char *latin1b = utf8_to_latin1 (utf8, False);
 821     char *ascii2  = utf8_to_latin1 (utf8, True);
 822     if (strcmp (latin1, latin1b))
 823       {
 824         LOG (stderr, "LATIN1", utf8);
 825         LOG (stderr, "FAIL3", latin1b);
 826         fprintf (stderr, "\n");
 827         ok = 0;
 828       }
 829     if (strcmp (ascii, ascii2))
 830       {
 831         LOG (stderr, "ASCII", utf8);
 832         LOG (stderr, "FAIL4", ascii2);
 833         fprintf (stderr, "\n");
 834         ok = 0;
 835       }
 836     free (latin1b);
 837     free (ascii2);
 838   }
 839
 840
 841   if (ok) fprintf (stderr, "OK\n");
 842   return (ok == 0);
 843 }
 844
 845 #endif /* SELFTEST */