git.hungrycats.org Git - xscreensaver/blob - utils/utf8wc.c

   1 /* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org>
   2  *
   3  * Permission to use, copy, modify, distribute, and sell this software and its
   4  * documentation for any purpose is hereby granted without fee, provided that
   5  * the above copyright notice appear in all copies and that both that
   6  * copyright notice and this permission notice appear in supporting
   7  * documentation.  No representations are made about the suitability of this
   8  * software for any purpose.  It is provided "as is" without express or
   9  * implied warranty.
  10  */
  11
  12 #ifdef HAVE_CONFIG_H
  13 # include "config.h"
  14 #endif
  15
  16 #include <stdlib.h>
  17 #include <stdio.h>
  18 #include <string.h>
  19
  20 #ifdef HAVE_JWXYZ
  21 # include "jwxyz.h"
  22 #else /* !HAVE_JWXYZ */
  23 # include <X11/Xlib.h>
  24 #endif
  25
  26 #include "utf8wc.h"
  27
  28
  29 /* "Unicode Replacement Character", displayed in lieu of invalid characters. */
  30 # define INVALID 0xFFFD
  31
  32
  33 /* Mask the number to be within the valid range of unicode characters.
  34  */
  35 static unsigned long
  36 uc_truncate (unsigned long uc)
  37 {
  38   uc &= 0x7FFFFFFFL;                    /* Unicode is 31 bits */
  39   if (uc > 0x10FFFF) uc = INVALID;      /* But UTF-8 is 4 bytes */
  40   if (uc == 0) uc = INVALID;            /* no nulls */
  41
  42   if (uc >= 0xD800 && uc <= 0xDFFF)
  43     /* Reserved for use with UTF-16: not a real character. */
  44     uc = INVALID;
  45
  46   return uc;
  47 }
  48
  49
  50 /* Parse the first UTF8 character at the front of the string.
  51    Return the Unicode character, and the number of bytes read.
  52  */
  53 long
  54 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
  55 {
  56   const unsigned char *start = in;
  57   const unsigned char *end = in + length;
  58   unsigned long uc = INVALID;
  59   unsigned long min = 0;
  60   unsigned char c;
  61
  62   if (length <= 0) goto DONE;
  63
  64   c = *in++;
  65
  66 # define PREMATURE_EOF { in = end; goto DONE; }
  67
  68   if ((c & 0xC0) == 0x80) {        /* 10xxxxxx - lonely continuation byte */
  69     uc = INVALID;
  70
  71   } else if ((c & 0x80) == 0) {    /* 0xxxxxxx - 7 bits in 1 byte */
  72     uc = (c & 0x7F);               /* 01111111 */
  73
  74   } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
  75     if (in+1 > end) PREMATURE_EOF;
  76     min = 1 << 7;
  77     uc = (((c    & 0x1F) << 6) |   /* 00011111------ */
  78           (in[0] & 0x3F));         /*       00111111 */
  79     in += 1;
  80
  81   } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
  82     if (in+2 > end) PREMATURE_EOF;
  83     min = 1 << 11;
  84     uc = (((c     & 0x0F) << 12) | /* 00001111----+------- */
  85           ((in[0] & 0x3F) <<  6) | /*       00111111------ */
  86           ((in[1] & 0x3F)));       /*             00111111 */
  87     in += 2;
  88
  89   } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
  90     if (in+3 > end) PREMATURE_EOF;
  91     min = 1 << 16;
  92     uc = (((c     & 0x07) << 18) | /* 00000111--+-------+------- */
  93           ((in[0] & 0x3F) << 12) | /*       01111111----+------- */
  94           ((in[1] & 0x3F) <<  6) | /*             00111111------ */
  95           ((in[2] & 0x3F)));       /*                   00111111 */
  96     in += 3;
  97
  98   } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
  99     if (in+4 > end) PREMATURE_EOF;
 100     min = 1 << 21;
 101     uc = (((c     & 0x03) << 24) | /* 00000011--------+-------+------- */
 102           ((in[0] & 0x3F) << 18) | /*       00111111--+-------+------- */
 103           ((in[1] & 0x3F) << 12) | /*             00111111----+------- */
 104           ((in[2] & 0x3F) << 6)  | /*                   00111111------ */
 105           ((in[3] & 0x3F)));       /*                         00111111 */
 106     in += 4;
 107
 108   } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
 109     if (in+5 > end) PREMATURE_EOF;
 110     min = 1 << 26;
 111     uc = (((c     & 0x01) << 30) | /* 00000001------+-------+-------+------- */
 112           ((in[0] & 0x3F) << 24) | /*       00111111+-------+-------+------- */
 113           ((in[1] & 0x3F) << 18) | /*             00111111--+-------+------- */
 114           ((in[2] & 0x3F) << 12) | /*                   00111111----+------- */
 115           ((in[3] & 0x3F) << 6)  | /*                         00111111------ */
 116           ((in[4] & 0x3F)));       /*                               00111111 */
 117     in += 5;
 118   } else {
 119     uc = INVALID;                  /* Unparsable sequence. */
 120   }
 121
 122  DONE:
 123
 124   length = in - start;
 125
 126   /* If any of the continuation bytes didn't begin with the continuation tag,
 127      the sequence is invalid; stop at the bad byte, not consuming later ones.
 128      (It's easier to check this after the fact than up above.) */
 129   {
 130     int i;
 131     for (i = 1; i < length; i++)
 132       if ((start[i] & 0xC0) != 0x80) {
 133         uc = INVALID;
 134         length = i+1;
 135         break;
 136       }
 137   }
 138
 139   if (uc < min)
 140     /* A multi-byte sequence encoded a character that could have been
 141        encoded with a shorter sequence, e.g., hiding ASCII inside a
 142        multi-byte sequence. Something hinky's going on. Reject it. */
 143     uc = INVALID;
 144
 145   uc = uc_truncate (uc);
 146
 147   if (unicode_ret)
 148     *unicode_ret = uc;
 149
 150   return length;
 151 }
 152
 153
 154 /* Converts a Unicode character to a multi-byte UTF8 sequence.
 155    Returns the number of bytes written.
 156  */
 157 int
 158 utf8_encode (unsigned long uc, char *out, long length)
 159 {
 160   const char *old = out;
 161
 162   uc = uc_truncate (uc);
 163
 164   if (uc < 0x80 && length >= 1)                 /* 7 bits in 1 byte */
 165     {
 166       *out++ = uc;                              /* 0xxxxxxx */
 167     }
 168   else if (uc < 0x800 && length >= 2)           /* 11 bits in 2 bytes */
 169     {
 170       *out++ = (0xC0 | ((uc >> 6)  & 0x1F));    /* 110xxxxx */
 171       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 172     }
 173   else if (uc < 0x10000L && length >= 3)        /* 16 bits in 3 bytes */
 174     {
 175       *out++ = (0xE0 | ((uc >> 12) & 0x0F));    /* 1110xxxx */
 176       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 177       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 178     }
 179   else if (uc < 0x200000L && length >= 4)       /* 21 bits in 4 bytes */
 180     {
 181       *out++ = (0xF0 | ((uc >> 18) & 0x07));    /* 11110xxx */
 182       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 183       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 184       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 185     }
 186   else if (uc < 0x4000000L && length >= 5)      /* 26 bits in 5 bytes */
 187     {
 188       *out++ = (0xF8 | ((uc >> 24) & 0x03));    /* 111110xx */
 189       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 190       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 191       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 192       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 193     }
 194   else if (length >= 6)                         /* 31 bits in 6 bytes */
 195     {
 196       *out++ = (0xFC | ((uc >> 30) & 0x01));    /* 1111110x */
 197       *out++ = (0x80 | ((uc >> 24) & 0x3F));    /* 10xxxxxx */
 198       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 199       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 200       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 201       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 202     }
 203
 204   return (int) (out - old);
 205 }
 206
 207
 208 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
 209    This only handles characters that can be represented in 16 bits, the
 210    Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
 211  */
 212 XChar2b *
 213 utf8_to_XChar2b (const char *string, int *length_ret)
 214 {
 215   long in_len = strlen(string);
 216   const unsigned char *in = (const unsigned char *) string;
 217   const unsigned char *in_end = in + in_len;
 218   XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
 219   XChar2b *out = c2b;
 220   if (! out) return 0;
 221
 222   while (in < in_end)
 223     {
 224       unsigned long uc = 0;
 225       long L = utf8_decode (in, in_end - in, &uc);
 226       in += L;
 227
 228       /* If it can't be represented in a 16-bit XChar2b,
 229          use "Unicode Replacement Character". */
 230       if (uc > 0xFFFF) uc = INVALID;
 231
 232       out->byte1 = (uc >> 8) & 0xFF;
 233       out->byte2 = uc & 0xFF;
 234       out++;
 235     }
 236
 237   out->byte1 = 0;
 238   out->byte2 = 0;
 239
 240   if (length_ret)
 241     *length_ret = (int) (out - c2b);
 242
 243   /* shrink */
 244   c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
 245
 246   return c2b;
 247 }
 248
 249
 250 /* Split a UTF8 string into an array of strings, one per character.
 251    The sub-strings will be null terminated and may be multiple bytes.
 252  */
 253 char **
 254 utf8_split (const char *string, int *length_ret)
 255 {
 256   const unsigned char *in = (const unsigned char *) string;
 257   long len = strlen (string);
 258   const unsigned char *end = in + len;
 259   char **ret = (char **) malloc ((len+1) * sizeof(*ret));
 260   int i = 0;
 261   if (!ret) return 0;
 262
 263   while (in < end)
 264     {
 265       unsigned long uc;
 266       long len2 = utf8_decode (in, len, &uc);
 267       char tmp[10];
 268       strncpy (tmp, (char *) in, len2);
 269       tmp[len2] = 0;
 270       ret[i++] = strdup (tmp);
 271       in += len2;
 272
 273       /* If this is a Combining Diacritical, append it to the previous
 274          character. E.g., "y\314\206\314\206" is one string, not three.
 275        */
 276       if (i > 1 &&
 277           ((uc >=  0x300 && uc <=  0x36F) || /* Combining Diacritical */
 278            (uc >= 0x1AB0 && uc <= 0x1AFF) || /* Combining Diacritical Ext. */
 279            (uc >= 0x1DC0 && uc <= 0x1DFF) || /* Combining Diacritical Supp. */
 280            (uc >= 0x20D0 && uc <= 0x20FF) || /* Combining Diacritical Sym. */
 281            (uc >= 0xFE20 && uc <= 0xFE2F)))  /* Combining Half Marks */
 282         {
 283           long L1 = strlen(ret[i-2]);
 284           long L2 = strlen(ret[i-1]);
 285           char *s2 = (char *) malloc (L1 + L2 + 1);
 286           strncpy (s2,      ret[i-2], L1);
 287           strncpy (s2 + L1, ret[i-1], L2);
 288           s2[L1 + L2] = 0;
 289           free (ret[i-2]);
 290           ret[i-2] = s2;
 291           i--;
 292         }
 293     }
 294   ret[i] = 0;
 295
 296   if (length_ret)
 297     *length_ret = i;
 298
 299   /* shrink */
 300   ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
 301
 302   return ret;
 303 }
 304
 305
 306 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
 307  */
 308 char *
 309 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
 310 {
 311   int in_len = 0;
 312   const XChar2b *in_end;
 313   int out_len;
 314   char *utf8, *out;
 315   const char *out_end;
 316
 317   /* Find the null termination on the XChar2b. */
 318   for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
 319     ;
 320
 321   out_len = (in_len + 1) * 3;              /* 16 bit chars = 3 bytes max */
 322   utf8 = out = (char *) malloc (out_len + 1);
 323   if (! out) return 0;
 324   out_end = out + out_len;
 325
 326   while (in < in_end)
 327     {
 328       unsigned long uc = (in->byte1 << 8) | in->byte2;
 329       int wrote = utf8_encode (uc, out, out_end - out);
 330       if (wrote > 3) abort();  /* Can't happen with 16 bit input */
 331       out += wrote;
 332       in++;
 333     }
 334   *out = 0;
 335
 336   out_len = (int) (out - utf8 + 1);
 337
 338   if (length_ret)
 339     *length_ret = out_len;
 340
 341   /* shrink */
 342   utf8 = (char *) realloc (utf8, out_len);
 343
 344   return utf8;
 345 }
 346
 347
 348 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
 349  */
 350 char *
 351 utf8_to_latin1 (const char *string, Bool ascii_p)
 352 {
 353   long in_len = strlen(string);
 354   const unsigned char *in = (const unsigned char *) string;
 355   const unsigned char *in_end = in + in_len;
 356   unsigned char *ret = (unsigned char *) malloc (in_len + 1);
 357   unsigned char *out = ret;
 358
 359   if (! ret) return 0;
 360
 361   while (in < in_end)
 362     {
 363       unsigned long uc = 0;
 364       long len2 = utf8_decode (in, in_end - in, &uc);
 365       in += len2;
 366
 367       if (uc == '\240') /* &nbsp; */
 368         uc = ' ';
 369       else if (uc >= 0x300 && uc <= 0x36F)
 370         uc = 0;         /* Discard "Combining Diacritical Marks" */
 371       else if (uc >= 0x1AB0 && uc <= 0x1AFF)
 372         uc = 0;         /* Discard "Combining Diacritical Marks Extended" */
 373       else if (uc >= 0x1DC0 && uc <= 0x1DFF)
 374         uc = 0;         /* Discard "Combining Diacritical Marks Supplement" */
 375       else if (uc >= 0x20D0 && uc <= 0x20FF)
 376         uc = 0;         /* Discard "Combining Diacritical Marks for Symbols" */
 377       else if (uc >= 0xFE20 && uc <= 0xFE2F)
 378         uc = 0;         /* Discard "Combining Half Marks" */
 379
 380       else if (uc > 0xFF)
 381         switch (uc) {
 382
 383         /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
 384
 385         case 0x2000:    /* EN QUAD */
 386         case 0x2001:    /* EM QUAD */
 387         case 0x2002:    /* EN SPACE */
 388         case 0x2003:    /* EM SPACE */
 389         case 0x2004:    /* THREE-PER-EM SPACE */
 390         case 0x2005:    /* FOUR-PER-EM SPACE */
 391         case 0x2006:    /* SIX-PER-EM SPACE */
 392         case 0x2007:    /* FIGURE SPACE */
 393         case 0x2008:    /* PUNCTUATION SPACE */
 394         case 0x2009:    /* THIN SPACE */
 395         case 0x200A:    /* HAIR SPACE */
 396           uc = ' ';
 397           break;
 398
 399         case 0x2010:    /* HYPHEN */
 400         case 0x2011:    /* NON-BREAKING HYPHEN */
 401         case 0x2012:    /* FIGURE DASH */
 402         case 0x2013:    /* EN DASH */
 403         case 0x2014:    /* EM DASH */
 404         case 0x2015:    /* HORIZONTAL BAR */
 405           uc = '-';
 406           break;
 407
 408         case 0x2018:    /* LEFT SINGLE QUOTATION MARK */
 409         case 0x2019:    /* SINGLE LOW-9 QUOTATION MARK */
 410         case 0x201A:    /* SINGLE LOW-9 QUOTATION MARK */
 411         case 0x201B:    /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
 412           uc = '\'';
 413           break;
 414
 415         case 0x201C:    /* LEFT DOUBLE QUOTATION MARK */
 416         case 0x201D:    /* RIGHT DOUBLE QUOTATION MARK */
 417         case 0x201E:    /* DOUBLE LOW-9 QUOTATION MARK */
 418         case 0x201F:    /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
 419           uc = '"';
 420           break;
 421
 422         case 0x2022: uc = '\267'; break; /* BULLET */
 423         case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
 424         case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
 425         case 0x202F: uc = ' ';    break; /* NARROW NO-BREAK SPACE */
 426         case 0x2038: uc = '^';    break; /* CARET */
 427         case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
 428         case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
 429         case 0x2041: uc = '^';    break; /* CARET INSERTION POINT */
 430         case 0x2042: uc = '*';    break; /* ASTERISM */
 431         case 0x2043: uc = '=';    break; /* HYPHEN BULLET */
 432         case 0x2044: uc = '/';    break; /* FRACTION SLASH */
 433         case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
 434         case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
 435         case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
 436         case 0x204E: uc = '*';    break; /* LOW ASTERISK */
 437         case 0x204F: uc = ';';    break; /* REVERSED SEMICOLON */
 438         default:
 439           break;
 440         }
 441
 442       if (uc > 0xFF)
 443         /* "Inverted question mark" looks enough like 0xFFFD,
 444            the "Unicode Replacement Character". */
 445         uc = (ascii_p ? '#' : '\277');
 446
 447       if (ascii_p)      /* Map Latin1 to the closest ASCII versions. */
 448         {
 449           const unsigned char latin1_to_ascii[96] =
 450              " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
 451              "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
 452              "aaaaaaeceeeeiiiionooooo/ouuuuypy";
 453           if (uc >= 0xA0)
 454             uc = latin1_to_ascii[uc - 0xA0];
 455         }
 456
 457       if (uc > 0)
 458         *out++ = (unsigned char) uc;
 459     }
 460   *out = 0;
 461
 462   /* shrink */
 463   ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
 464
 465   return (char *) ret;
 466 }
 467
 468
 469 /*************************************************************************
 470
 471  cd ../hacks ; make test-utf8wc
 472
 473  *************************************************************************/
 474
 475 #ifdef SELFTEST
 476
 477 /* Convert a UTF8 string to Unicode and back again.
 478  */
 479 static char *
 480 split_and_join (const char *string)
 481 {
 482   const unsigned char *in = (const unsigned char *) string;
 483   int len = strlen (string);
 484   const unsigned char *end = in + len;
 485   unsigned long *unicode = (unsigned long *)
 486     malloc((len + 1) * sizeof(*unicode));
 487   int i = 0;
 488   char *ret, *out, *out_end;
 489
 490   while (in < end)
 491     {
 492       long len2 = utf8_decode (in, len, &unicode[i]);
 493       i++;
 494       in += len2;
 495     }
 496   unicode[i] = 0;
 497
 498   i = i*6 + 1;
 499   out = ret = (char *) malloc(i);
 500   out_end = out + i;
 501   i = 0;
 502   while (unicode[i])
 503     {
 504       int len2 = utf8_encode (unicode[i], out, out_end - out);
 505       out += len2;
 506       i++;
 507     }
 508   *out = 0;
 509   free (unicode);
 510
 511   return ret;
 512 }
 513
 514
 515 static void
 516 LOG (FILE *out, const char *prefix, const char *s)
 517 {
 518   fprintf (out, "%6s: \"", prefix);
 519   while (*s)
 520     {
 521       unsigned char c = *s;
 522       if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
 523       else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
 524       else fprintf (out, "%c", c);
 525       s++;
 526     }
 527   fprintf (out, "\"\n");
 528 }
 529
 530
 531 int
 532 main (int argc, char **argv)
 533 {
 534   /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 535    */
 536
 537 #  define URC "\357\277\275"   /* 0xFFFD, "Unicode Replacement Character" */
 538
 539   static const struct { const char *name, *in, *target, *target2; } tests[] = {
 540     /* 1  Some correct UTF-8 text */
 541
 542     /* The Greek word 'kosme': */
 543     { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
 544
 545
 546     /* 2  Boundary condition test cases */
 547
 548     /* 2.1  First possible sequence of a certain length */
 549
 550     { "2.1.1", /*  1 byte  (U-00000000): */  "\000" },
 551     { "2.1.2", /*  2 bytes (U-00000080): */  "\302\200" },
 552     { "2.1.3", /*  3 bytes (U-00000800): */  "\340\240\200" },
 553     { "2.1.4", /*  4 bytes (U-00010000): */  "\360\220\200\200", 0, URC },
 554     { "2.1.5", /*  5 bytes (U-00200000): */  "\370\210\200\200\200", URC },
 555     { "2.1.6", /*  6 bytes (U-04000000): */  "\374\204\200\200\200\200", URC },
 556
 557     /* 2.2  Last possible sequence of a certain length */
 558
 559     { "2.2.1", /*  1 byte  (U-0000007F): */  "\177" },
 560     { "2.2.2", /*  2 bytes (U-000007FF): */  "\337\277" },
 561     { "2.2.3", /*  3 bytes (U-0000FFFF): */  "\357\277\277" },
 562     { "2.2.4", /*  4 bytes (U-001FFFFF): */  "\367\277\277\277", URC },
 563     { "2.2.5", /*  5 bytes (U-03FFFFFF): */  "\373\277\277\277\277", URC },
 564     { "2.2.6", /*  6 bytes (U-7FFFFFFF): */  "\375\277\277\277\277\277", URC },
 565
 566     /* 2.3  Other boundary conditions */
 567
 568     { "2.3.1", /*  U-0000D7FF = ed 9f bf = */    "\355\237\277" },
 569     { "2.3.2", /*  U-0000E000 = ee 80 80 = */    "\356\200\200" },
 570     { "2.3.3", /*  U-0000FFFD = ef bf bd = */    URC },
 571     { "2.3.4", /*  U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
 572     { "2.3.5", /*  U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
 573
 574
 575     /* 3  Malformed sequences */
 576
 577     /* 3.1  Unexpected continuation bytes */
 578
 579     /* Each unexpected continuation byte should be separately signalled as a
 580        malformed sequence of its own. */
 581
 582     { "3.1.1", /*  First continuation byte 0x80: */ "\200", URC },
 583     { "3.1.2", /*  Last  continuation byte 0xbf: */ "\277", URC },
 584     { "3.1.3", /*  2 continuation bytes: */ "\200\277",     URC URC },
 585     { "3.1.4", /*  3 continuation bytes: */ "\200\277\200", URC URC URC },
 586     { "3.1.5", /*  4 continuation bytes: */ "\200\277\200\277",
 587       URC URC URC URC },
 588     { "3.1.6", /*  5 continuation bytes: */ "\200\277\200\277\200",
 589       URC URC URC URC URC },
 590     { "3.1.7", /*  6 continuation bytes: */ "\200\277\200\277\200\277",
 591       URC URC URC URC URC URC },
 592     { "3.1.8", /*  7 continuation bytes: */ "\200\277\200\277\200\277\200",
 593       URC URC URC URC URC URC URC },
 594
 595     { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
 596
 597       "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
 598       "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
 599       "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
 600       "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
 601       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 602       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 603       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 604       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 605
 606     /* 3.2  Lonely start characters */
 607
 608     { "3.2.1", /*  All 32 first bytes of 2-byte sequences (0xc0-0xdf),
 609                    each followed by a space character: */
 610
 611       "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
 612       "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
 613       "\332 \333 \334 \335 \336 \337 ",
 614       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 615       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 616
 617     { "3.2.2", /*  All 16 first bytes of 3-byte sequences (0xe0-0xef),
 618                    each followed by a space character: */
 619       "\340 \341 \342 \343 \344 \345 \346 \347 "
 620       "\350 \351 \352 \353 \354 \355 \356 \357 ",
 621       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 622
 623     { "3.2.3", /*  All 8 first bytes of 4-byte sequences (0xf0-0xf7),
 624                    each followed by a space character: */
 625       URC URC URC URC URC URC URC URC },
 626
 627     { "3.2.4", /*  All 4 first bytes of 5-byte sequences (0xf8-0xfb),
 628                    each followed by a space character: */
 629       "\370 \371 \372 \373 ",
 630       URC URC URC URC },
 631
 632     { "3.2.5", /*  All 2 first bytes of 6-byte sequences (0xfc-0xfd),
 633                    each followed by a space character: */
 634       "\374 \375 ", URC URC },
 635
 636     /* 3.3  Sequences with last continuation byte missing */
 637
 638     /* All bytes of an incomplete sequence should be signalled as a single
 639        malformed sequence, i.e., you should see only a single replacement
 640        character in each of the next 10 tests. (Characters as in section 2) */
 641
 642     { "3.3.1", /*  2-byte sequence with last byte missing (U+0000): */
 643       "\300", URC },
 644     { "3.3.2", /*  3-byte sequence with last byte missing (U+0000): */
 645       "\340\200", URC },
 646     { "3.3.3", /*  4-byte sequence with last byte missing (U+0000): */
 647       "\360\200\200", URC },
 648     { "3.3.4", /*  5-byte sequence with last byte missing (U+0000): */
 649       "\370\200\200\200", URC },
 650     { "3.3.5", /*  6-byte sequence with last byte missing (U+0000): */
 651       "\374\200\200\200\200", URC },
 652     { "3.3.6", /*  2-byte sequence with last byte missing (U-000007FF): */
 653       "\337", URC },
 654     { "3.3.7", /*  3-byte sequence with last byte missing (U-0000FFFF): */
 655       "\357\277", URC },
 656     { "3.3.8", /*  4-byte sequence with last byte missing (U-001FFFFF): */
 657       "\367\277\277", URC },
 658     { "3.3.9", /*  5-byte sequence with last byte missing (U-03FFFFFF): */
 659       "\373\277\277\277", URC },
 660     { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
 661       "\375\277\277\277\277", URC },
 662
 663     /* 3.4  Concatenation of incomplete sequences */
 664
 665     /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
 666        sequences being signalled: */
 667
 668     { "3.4",   "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
 669       "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
 670       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 671
 672     /* 3.5  Impossible bytes */
 673
 674     /* The following two bytes cannot appear in a correct UTF-8 string */
 675
 676     { "3.5.1", /*  fe = */      "\376", URC },
 677     { "3.5.2", /*  ff = */      "\377", URC },
 678     { "3.5.3", /*  fe fe ff ff = */     "\376\376\377\377", URC URC URC URC },
 679
 680
 681     /* 4  Overlong sequences */
 682
 683     /* 4.1  Examples of an overlong ASCII character */
 684
 685     { "4.1.1", /* U+002F = c0 af             = */ "\300\257", URC },
 686     { "4.1.2", /* U+002F = e0 80 af          = */ "\340\200\257", URC },
 687     { "4.1.3", /* U+002F = f0 80 80 af       = */ "\360\200\200\257", URC },
 688     { "4.1.4", /* U+002F = f8 80 80 80 af    = */ "\370\200\200\200\257",
 689       URC },
 690     { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
 691       URC },
 692
 693     /* 4.2  Maximum overlong sequences */
 694
 695     { "4.2.1", /*  U-0000007F = c1 bf             = */ "\301\277", URC },
 696     { "4.2.2", /*  U-000007FF = e0 9f bf          = */ "\340\237\277", URC },
 697     { "4.2.3", /*  U-0000FFFF = f0 8f bf bf       = */ "\360\217\277\277",
 698       URC },
 699     { "4.2.4", /*  U-001FFFFF = f8 87 bf bf bf    = */ "\370\207\277\277\277",
 700       URC },
 701     { "4.2.5", /*  U-03FFFFFF = fc 83 bf bf bf bf = */  URC },
 702
 703     /* 4.3  Overlong representation of the NUL character */
 704
 705     { "4.3.1", /*  U+0000 = c0 80             = */  "\300\200", URC },
 706     { "4.3.2", /*  U+0000 = e0 80 80          = */  "\340\200\200", URC },
 707     { "4.3.3", /*  U+0000 = f0 80 80 80       = */  "\360\200\200\200", URC },
 708     { "4.3.4", /*  U+0000 = f8 80 80 80 80    = */  "\370\200\200\200\200",
 709       URC },
 710     { "4.3.5", /*  U+0000 = fc 80 80 80 80 80 = */  "\374\200\200\200\200\200",
 711       URC },
 712
 713
 714     /* 5  Illegal code positions */
 715
 716     /* 5.1 Single UTF-16 surrogates */
 717
 718     { "5.1.1", /*  U+D800 = ed a0 80 = */       "\355\240\200", URC },
 719     { "5.1.2", /*  U+DB7F = ed ad bf = */       "\355\255\277", URC },
 720     { "5.1.3", /*  U+DB80 = ed ae 80 = */       "\355\256\200", URC },
 721     { "5.1.4", /*  U+DBFF = ed af bf = */       "\355\257\277", URC },
 722     { "5.1.5", /*  U+DC00 = ed b0 80 = */       "\355\260\200", URC },
 723     { "5.1.6", /*  U+DF80 = ed be 80 = */       "\355\276\200", URC },
 724     { "5.1.7", /*  U+DFFF = ed bf bf = */       "\355\277\277", URC },
 725
 726     /* 5.2 Paired UTF-16 surrogates */
 727
 728     { "5.2.1", /*  U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
 729     { "5.2.2", /*  U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
 730     { "5.2.3", /*  U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
 731     { "5.2.4", /*  U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
 732     { "5.2.5", /*  U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
 733     { "5.2.6", /*  U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
 734     { "5.2.7", /*  U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
 735     { "5.2.8", /*  U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
 736
 737     /* 5.3 Other illegal code positions */
 738
 739     { "5.3.1", /*  U+FFFE = ef bf be = */       "\357\277\276" },
 740     { "5.3.2", /*  U+FFFF = ef bf bf = */       "\357\277\277" },
 741
 742
 743     /* 6 Some other junk */
 744
 745     { "6.0", "" },
 746     { "6.1", "\001\002\003\004\005 ABC" },
 747     { "6.2", /* every non-ASCII Latin1 character */
 748       "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
 749       "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
 750       "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
 751       "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
 752       "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
 753       "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
 754       "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
 755       "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
 756       "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
 757       "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
 758       "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
 759       "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
 760
 761     { "6.3", /* Christmas tree */
 762       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 763       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
 764       "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
 765       "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
 766       "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
 767       "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
 768       "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
 769       "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
 770       "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
 771       "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
 772       "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
 773       "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
 774       "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
 775       "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
 776       "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
 777       "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
 778
 779       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 780       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
 781       " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 782       "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
 783       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 784       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 785       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 786       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 787       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 788       URC URC URC URC URC URC URC URC URC URC URC URC },
 789   };
 790
 791   int i;
 792   int ok = 1;
 793   for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
 794     {
 795       const char *name = tests[i].name;
 796       const char *in   = tests[i].in;
 797       const char *target = (tests[i].target ? tests[i].target : in);
 798       const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
 799       char *out = split_and_join (in);
 800       XChar2b *out16 = utf8_to_XChar2b (in, 0);
 801       char *out2 = XChar2b_to_utf8 (out16, 0);
 802       if (strcmp (out, target))
 803         {
 804           LOG (stderr, name, target);
 805           LOG (stderr, "FAIL", out);
 806           fprintf (stderr, "\n");
 807           ok = 0;
 808         }
 809       if (strcmp (out2, target2))
 810         {
 811           LOG (stderr, name, target2);
 812           LOG (stderr, "FAIL2", out2);
 813           fprintf (stderr, "\n");
 814           ok = 0;
 815         }
 816       free (out);
 817       free (out2);
 818       free (out16);
 819     }
 820
 821   {
 822     const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
 823                         "c\303\264t\303\251 de l'alc\303\264ve "
 824                         "ovo\303\257de, o\303\271 les b\303\273ches "
 825                         "se consument dans l'\303\242tre");
 826     const char *latin1 = ("son \356le int\351rieure, \340 "
 827                           "c\364t\351 de l'alc\364ve ovo\357de, "
 828                           "o\371 les b\373ches se consument dans "
 829                           "l'\342tre");
 830     const char *ascii = ("son ile interieure, a cote de l'alcove "
 831                          "ovoide, ou les buches se consument dans "
 832                          "l'atre");
 833     char *latin1b = utf8_to_latin1 (utf8, False);
 834     char *ascii2  = utf8_to_latin1 (utf8, True);
 835     if (strcmp (latin1, latin1b))
 836       {
 837         LOG (stderr, "LATIN1", utf8);
 838         LOG (stderr, "FAIL3", latin1b);
 839         fprintf (stderr, "\n");
 840         ok = 0;
 841       }
 842     if (strcmp (ascii, ascii2))
 843       {
 844         LOG (stderr, "ASCII", utf8);
 845         LOG (stderr, "FAIL4", ascii2);
 846         fprintf (stderr, "\n");
 847         ok = 0;
 848       }
 849     free (latin1b);
 850     free (ascii2);
 851   }
 852
 853
 854   if (ok) fprintf (stderr, "OK\n");
 855   return (ok == 0);
 856 }
 857
 858 #endif /* SELFTEST */