git.hungrycats.org Git - xscreensaver/blob - utils/utf8wc.c

   1 /* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org>
   2  *
   3  * Permission to use, copy, modify, distribute, and sell this software and its
   4  * documentation for any purpose is hereby granted without fee, provided that
   5  * the above copyright notice appear in all copies and that both that
   6  * copyright notice and this permission notice appear in supporting
   7  * documentation.  No representations are made about the suitability of this
   8  * software for any purpose.  It is provided "as is" without express or
   9  * implied warranty.
  10  */
  11
  12 #ifdef HAVE_CONFIG_H
  13 # include "config.h"
  14 #endif
  15
  16 #include <stdlib.h>
  17 #include <stdio.h>
  18 #include <string.h>
  19
  20 #ifdef HAVE_JWXYZ
  21 # include "jwxyz.h"
  22 #else /* !HAVE_JWXYZ */
  23 # include <X11/Xlib.h>
  24 #endif
  25
  26 #include "utf8wc.h"
  27
  28
  29 /* "Unicode Replacement Character", displayed in lieu of invalid characters. */
  30 # define INVALID 0xFFFD
  31
  32
  33 /* Mask the number to be within the valid range of unicode characters.
  34  */
  35 static unsigned long
  36 uc_truncate (unsigned long uc)
  37 {
  38   uc &= 0x7FFFFFFFL;                    /* Unicode is 31 bits */
  39   if (uc > 0x10FFFF) uc = INVALID;      /* But UTF-8 is 4 bytes */
  40   if (uc == 0) uc = INVALID;            /* no nulls */
  41
  42   if (uc >= 0xD800 && uc <= 0xDFFF)
  43     /* Reserved for use with UTF-16: not a real character. */
  44     uc = INVALID;
  45
  46   return uc;
  47 }
  48
  49
  50 /* Parse the first UTF8 character at the front of the string.
  51    Return the Unicode character, and the number of bytes read.
  52  */
  53 long
  54 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
  55 {
  56   const unsigned char *start = in;
  57   const unsigned char *end = in + length;
  58   unsigned long uc = INVALID;
  59   unsigned long min = 0;
  60   unsigned char c;
  61
  62   if (length <= 0) goto DONE;
  63
  64   c = *in++;
  65
  66 # define PREMATURE_EOF { in = end; goto DONE; }
  67
  68   if ((c & 0xC0) == 0x80) {        /* 10xxxxxx - lonely continuation byte */
  69     uc = INVALID;
  70
  71   } else if ((c & 0x80) == 0) {    /* 0xxxxxxx - 7 bits in 1 byte */
  72     uc = (c & 0x7F);               /* 01111111 */
  73
  74   } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
  75     if (in+1 > end) PREMATURE_EOF;
  76     min = 1 << 7;
  77     uc = (((c    & 0x1F) << 6) |   /* 00011111------ */
  78           (in[0] & 0x3F));         /*       00111111 */
  79     in += 1;
  80
  81   } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
  82     if (in+2 > end) PREMATURE_EOF;
  83     min = 1 << 11;
  84     uc = (((c     & 0x0F) << 12) | /* 00001111----+------- */
  85           ((in[0] & 0x3F) <<  6) | /*       00111111------ */
  86           ((in[1] & 0x3F)));       /*             00111111 */
  87     in += 2;
  88
  89   } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
  90     if (in+3 > end) PREMATURE_EOF;
  91     min = 1 << 16;
  92     uc = (((c     & 0x07) << 18) | /* 00000111--+-------+------- */
  93           ((in[0] & 0x3F) << 12) | /*       01111111----+------- */
  94           ((in[1] & 0x3F) <<  6) | /*             00111111------ */
  95           ((in[2] & 0x3F)));       /*                   00111111 */
  96     in += 3;
  97
  98   } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
  99     if (in+4 > end) PREMATURE_EOF;
 100     min = 1 << 21;
 101     uc = (((c     & 0x03) << 24) | /* 00000011--------+-------+------- */
 102           ((in[0] & 0x3F) << 18) | /*       00111111--+-------+------- */
 103           ((in[1] & 0x3F) << 12) | /*             00111111----+------- */
 104           ((in[2] & 0x3F) << 6)  | /*                   00111111------ */
 105           ((in[3] & 0x3F)));       /*                         00111111 */
 106     in += 4;
 107
 108   } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
 109     if (in+5 > end) PREMATURE_EOF;
 110     min = 1 << 26;
 111     uc = (((c     & 0x01) << 30) | /* 00000001------+-------+-------+------- */
 112           ((in[0] & 0x3F) << 24) | /*       00111111+-------+-------+------- */
 113           ((in[1] & 0x3F) << 18) | /*             00111111--+-------+------- */
 114           ((in[2] & 0x3F) << 12) | /*                   00111111----+------- */
 115           ((in[3] & 0x3F) << 6)  | /*                         00111111------ */
 116           ((in[4] & 0x3F)));       /*                               00111111 */
 117     in += 5;
 118   } else {
 119     uc = INVALID;                  /* Unparsable sequence. */
 120   }
 121
 122  DONE:
 123
 124   length = in - start;
 125
 126   /* If any of the continuation bytes didn't begin with the continuation tag,
 127      the sequence is invalid; stop at the bad byte, not consuming later ones.
 128      (It's easier to check this after the fact than up above.) */
 129   {
 130     int i;
 131     for (i = 1; i < length; i++)
 132       if ((start[i] & 0xC0) != 0x80) {
 133         uc = INVALID;
 134         length = i+1;
 135         break;
 136       }
 137   }
 138
 139   if (uc < min)
 140     /* A multi-byte sequence encoded a character that could have been
 141        encoded with a shorter sequence, e.g., hiding ASCII inside a
 142        multi-byte sequence. Something hinky's going on. Reject it. */
 143     uc = INVALID;
 144
 145   uc = uc_truncate (uc);
 146
 147   if (unicode_ret)
 148     *unicode_ret = uc;
 149
 150   return length;
 151 }
 152
 153
 154 /* Converts a Unicode character to a multi-byte UTF8 sequence.
 155    Returns the number of bytes written.
 156  */
 157 int
 158 utf8_encode (unsigned long uc, char *out, long length)
 159 {
 160   const char *old = out;
 161
 162   uc = uc_truncate (uc);
 163
 164   if (uc < 0x80 && length >= 1)                 /* 7 bits in 1 byte */
 165     {
 166       *out++ = uc;                              /* 0xxxxxxx */
 167     }
 168   else if (uc < 0x800 && length >= 2)           /* 11 bits in 2 bytes */
 169     {
 170       *out++ = (0xC0 | ((uc >> 6)  & 0x1F));    /* 110xxxxx */
 171       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 172     }
 173   else if (uc < 0x10000L && length >= 3)        /* 16 bits in 3 bytes */
 174     {
 175       *out++ = (0xE0 | ((uc >> 12) & 0x0F));    /* 1110xxxx */
 176       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 177       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 178     }
 179   else if (uc < 0x200000L && length >= 4)       /* 21 bits in 4 bytes */
 180     {
 181       *out++ = (0xF0 | ((uc >> 18) & 0x07));    /* 11110xxx */
 182       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 183       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 184       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 185     }
 186   else if (uc < 0x4000000L && length >= 5)      /* 26 bits in 5 bytes */
 187     {
 188       *out++ = (0xF8 | ((uc >> 24) & 0x03));    /* 111110xx */
 189       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 190       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 191       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 192       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 193     }
 194   else if (length >= 6)                         /* 31 bits in 6 bytes */
 195     {
 196       *out++ = (0xFC | ((uc >> 30) & 0x01));    /* 1111110x */
 197       *out++ = (0x80 | ((uc >> 24) & 0x3F));    /* 10xxxxxx */
 198       *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
 199       *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
 200       *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
 201       *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
 202     }
 203
 204   return (int) (out - old);
 205 }
 206
 207
 208 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
 209    This only handles characters that can be represented in 16 bits, the
 210    Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
 211  */
 212 XChar2b *
 213 utf8_to_XChar2b (const char *string, int *length_ret)
 214 {
 215   long in_len = strlen(string);
 216   const unsigned char *in = (const unsigned char *) string;
 217   const unsigned char *in_end = in + in_len;
 218   XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
 219   XChar2b *out = c2b;
 220   if (! out) return 0;
 221
 222   while (in < in_end)
 223     {
 224       unsigned long uc = 0;
 225       long L = utf8_decode (in, in_end - in, &uc);
 226       in += L;
 227
 228       /* If it can't be represented in a 16-bit XChar2b,
 229          use "Unicode Replacement Character". */
 230       if (uc > 0xFFFF) uc = INVALID;
 231
 232       out->byte1 = (uc >> 8) & 0xFF;
 233       out->byte2 = uc & 0xFF;
 234       out++;
 235     }
 236
 237   out->byte1 = 0;
 238   out->byte2 = 0;
 239
 240   if (length_ret)
 241     *length_ret = (int) (out - c2b);
 242
 243   /* shrink */
 244   c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
 245
 246   return c2b;
 247 }
 248
 249
 250 /* Split a UTF8 string into an array of strings, one per character.
 251    The sub-strings will be null terminated and may be multiple bytes.
 252  */
 253 char **
 254 utf8_split (const char *string, int *length_ret)
 255 {
 256   const unsigned char *in = (const unsigned char *) string;
 257   long len = strlen (string);
 258   const unsigned char *end = in + len;
 259   char **ret = (char **) malloc ((len+1) * sizeof(*ret));
 260   int i = 0;
 261   int zwjp = 0;
 262   if (!ret) return 0;
 263
 264   while (in < end)
 265     {
 266       unsigned long uc;
 267       long len2 = utf8_decode (in, len, &uc);
 268       char tmp[10];
 269       strncpy (tmp, (char *) in, len2);
 270       tmp[len2] = 0;
 271       ret[i++] = strdup (tmp);
 272       in += len2;
 273
 274       /* If this is a Combining Diacritical, append it to the previous
 275          character. E.g., "y\314\206\314\206" is one string, not three.
 276
 277          If this is ZWJ, Zero Width Joiner, then we append both this character
 278          and the following character, e.g. "X ZWJ Y" is one string not three.
 279
 280          #### Hmmm, should this also include every character in the
 281          "Symbol, Modifier" category, or does ZWJ get used for those?
 282          https://www.fileformat.info/info/unicode/category/Sk/list.htm
 283
 284          Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8"
 285          should be a single glyph? Or is that what "Combining Cedilla, 0327"
 286          is for?  I'm confused by the fact that the skin tones (1F3FB-1F3FF)
 287          do not seem to be in a readily-identifiable block the way the various
 288          combining diacriticals are.
 289        */
 290       if (i > 1 &&
 291           ((uc >=   0x300 && uc <=   0x36F) || /* Combining Diacritical */
 292            (uc >=  0x1AB0 && uc <=  0x1AFF) || /* Combining Diacritical Ext. */
 293            (uc >=  0x1DC0 && uc <=  0x1DFF) || /* Combining Diacritical Supp. */
 294            (uc >=  0x20D0 && uc <=  0x20FF) || /* Combining Diacritical Sym. */
 295            (uc >=  0xFE20 && uc <=  0xFE2F) || /* Combining Half Marks */
 296            (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
 297            zwjp || uc == 0x200D))              /* Zero Width Joiner */
 298         {
 299           long L1 = strlen(ret[i-2]);
 300           long L2 = strlen(ret[i-1]);
 301           char *s2 = (char *) malloc (L1 + L2 + 1);
 302           strncpy (s2,      ret[i-2], L1);
 303           strncpy (s2 + L1, ret[i-1], L2);
 304           s2[L1 + L2] = 0;
 305           free (ret[i-2]);
 306           ret[i-2] = s2;
 307           i--;
 308           zwjp = (uc == 0x200D);  /* Swallow the next character as well */
 309         }
 310     }
 311   ret[i] = 0;
 312
 313   if (length_ret)
 314     *length_ret = i;
 315
 316   /* shrink */
 317   ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
 318
 319   return ret;
 320 }
 321
 322
 323 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
 324  */
 325 char *
 326 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
 327 {
 328   int in_len = 0;
 329   const XChar2b *in_end;
 330   int out_len;
 331   char *utf8, *out;
 332   const char *out_end;
 333
 334   /* Find the null termination on the XChar2b. */
 335   for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
 336     ;
 337
 338   out_len = (in_len + 1) * 3;              /* 16 bit chars = 3 bytes max */
 339   utf8 = out = (char *) malloc (out_len + 1);
 340   if (! out) return 0;
 341   out_end = out + out_len;
 342
 343   while (in < in_end)
 344     {
 345       unsigned long uc = (in->byte1 << 8) | in->byte2;
 346       int wrote = utf8_encode (uc, out, out_end - out);
 347       if (wrote > 3) abort();  /* Can't happen with 16 bit input */
 348       out += wrote;
 349       in++;
 350     }
 351   *out = 0;
 352
 353   out_len = (int) (out - utf8 + 1);
 354
 355   if (length_ret)
 356     *length_ret = out_len;
 357
 358   /* shrink */
 359   utf8 = (char *) realloc (utf8, out_len);
 360
 361   return utf8;
 362 }
 363
 364
 365 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
 366  */
 367 char *
 368 utf8_to_latin1 (const char *string, Bool ascii_p)
 369 {
 370   long in_len = strlen(string);
 371   const unsigned char *in = (const unsigned char *) string;
 372   const unsigned char *in_end = in + in_len;
 373   unsigned char *ret = (unsigned char *) malloc (in_len + 1);
 374   unsigned char *out = ret;
 375
 376   if (! ret) return 0;
 377
 378   while (in < in_end)
 379     {
 380       unsigned long uc = 0;
 381       long len2 = utf8_decode (in, in_end - in, &uc);
 382       in += len2;
 383
 384       if (uc == '\240') /* &nbsp; */
 385         uc = ' ';
 386       else if (uc >= 0x300 && uc <= 0x36F)
 387         uc = 0;         /* Discard "Combining Diacritical Marks" */
 388       else if (uc >= 0x1AB0 && uc <= 0x1AFF)
 389         uc = 0;         /* Discard "Combining Diacritical Marks Extended" */
 390       else if (uc >= 0x1DC0 && uc <= 0x1DFF)
 391         uc = 0;         /* Discard "Combining Diacritical Marks Supplement" */
 392       else if (uc >= 0x20D0 && uc <= 0x20FF)
 393         uc = 0;         /* Discard "Combining Diacritical Marks for Symbols" */
 394       else if (uc >= 0xFE20 && uc <= 0xFE2F)
 395         uc = 0;         /* Discard "Combining Half Marks" */
 396
 397       else if (uc > 0xFF)
 398         switch (uc) {
 399
 400         /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
 401
 402         case 0x2000:    /* EN QUAD */
 403         case 0x2001:    /* EM QUAD */
 404         case 0x2002:    /* EN SPACE */
 405         case 0x2003:    /* EM SPACE */
 406         case 0x2004:    /* THREE-PER-EM SPACE */
 407         case 0x2005:    /* FOUR-PER-EM SPACE */
 408         case 0x2006:    /* SIX-PER-EM SPACE */
 409         case 0x2007:    /* FIGURE SPACE */
 410         case 0x2008:    /* PUNCTUATION SPACE */
 411         case 0x2009:    /* THIN SPACE */
 412         case 0x200A:    /* HAIR SPACE */
 413           uc = ' ';
 414           break;
 415
 416         case 0x2010:    /* HYPHEN */
 417         case 0x2011:    /* NON-BREAKING HYPHEN */
 418         case 0x2012:    /* FIGURE DASH */
 419         case 0x2013:    /* EN DASH */
 420         case 0x2014:    /* EM DASH */
 421         case 0x2015:    /* HORIZONTAL BAR */
 422           uc = '-';
 423           break;
 424
 425         case 0x2018:    /* LEFT SINGLE QUOTATION MARK */
 426         case 0x2019:    /* SINGLE LOW-9 QUOTATION MARK */
 427         case 0x201A:    /* SINGLE LOW-9 QUOTATION MARK */
 428         case 0x201B:    /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
 429           uc = '\'';
 430           break;
 431
 432         case 0x201C:    /* LEFT DOUBLE QUOTATION MARK */
 433         case 0x201D:    /* RIGHT DOUBLE QUOTATION MARK */
 434         case 0x201E:    /* DOUBLE LOW-9 QUOTATION MARK */
 435         case 0x201F:    /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
 436           uc = '"';
 437           break;
 438
 439         case 0x2022: uc = '\267'; break; /* BULLET */
 440         case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
 441         case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
 442         case 0x202F: uc = ' ';    break; /* NARROW NO-BREAK SPACE */
 443         case 0x2038: uc = '^';    break; /* CARET */
 444         case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
 445         case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
 446         case 0x2041: uc = '^';    break; /* CARET INSERTION POINT */
 447         case 0x2042: uc = '*';    break; /* ASTERISM */
 448         case 0x2043: uc = '=';    break; /* HYPHEN BULLET */
 449         case 0x2044: uc = '/';    break; /* FRACTION SLASH */
 450         case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
 451         case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
 452         case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
 453         case 0x204E: uc = '*';    break; /* LOW ASTERISK */
 454         case 0x204F: uc = ';';    break; /* REVERSED SEMICOLON */
 455         default:
 456           break;
 457         }
 458
 459       if (uc > 0xFF)
 460         /* "Inverted question mark" looks enough like 0xFFFD,
 461            the "Unicode Replacement Character". */
 462         uc = (ascii_p ? '#' : '\277');
 463
 464       if (ascii_p)      /* Map Latin1 to the closest ASCII versions. */
 465         {
 466           const unsigned char latin1_to_ascii[96] =
 467              " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
 468              "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
 469              "aaaaaaeceeeeiiiionooooo/ouuuuypy";
 470           if (uc >= 0xA0)
 471             uc = latin1_to_ascii[uc - 0xA0];
 472         }
 473
 474       if (uc > 0)
 475         *out++ = (unsigned char) uc;
 476     }
 477   *out = 0;
 478
 479   /* shrink */
 480   ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
 481
 482   return (char *) ret;
 483 }
 484
 485
 486 /*************************************************************************
 487
 488  cd ../hacks ; make test-utf8wc
 489
 490  *************************************************************************/
 491
 492 #ifdef SELFTEST
 493
 494 /* Convert a UTF8 string to Unicode and back again.
 495  */
 496 static char *
 497 split_and_join (const char *string)
 498 {
 499   const unsigned char *in = (const unsigned char *) string;
 500   int len = strlen (string);
 501   const unsigned char *end = in + len;
 502   unsigned long *unicode = (unsigned long *)
 503     malloc((len + 1) * sizeof(*unicode));
 504   int i = 0;
 505   char *ret, *out, *out_end;
 506
 507   while (in < end)
 508     {
 509       long len2 = utf8_decode (in, len, &unicode[i]);
 510       i++;
 511       in += len2;
 512     }
 513   unicode[i] = 0;
 514
 515   i = i*6 + 1;
 516   out = ret = (char *) malloc(i);
 517   out_end = out + i;
 518   i = 0;
 519   while (unicode[i])
 520     {
 521       int len2 = utf8_encode (unicode[i], out, out_end - out);
 522       out += len2;
 523       i++;
 524     }
 525   *out = 0;
 526   free (unicode);
 527
 528   return ret;
 529 }
 530
 531
 532 static void
 533 LOG (FILE *out, const char *prefix, const char *s)
 534 {
 535   fprintf (out, "%6s: \"", prefix);
 536   while (*s)
 537     {
 538       unsigned char c = *s;
 539       if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
 540       else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
 541       else fprintf (out, "%c", c);
 542       s++;
 543     }
 544   fprintf (out, "\"\n");
 545 }
 546
 547
 548 int
 549 main (int argc, char **argv)
 550 {
 551   /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 552    */
 553
 554 #  define URC "\357\277\275"   /* 0xFFFD, "Unicode Replacement Character" */
 555
 556   static const struct { const char *name, *in, *target, *target2; } tests[] = {
 557     /* 1  Some correct UTF-8 text */
 558
 559     /* The Greek word 'kosme': */
 560     { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
 561
 562
 563     /* 2  Boundary condition test cases */
 564
 565     /* 2.1  First possible sequence of a certain length */
 566
 567     { "2.1.1", /*  1 byte  (U-00000000): */  "\000" },
 568     { "2.1.2", /*  2 bytes (U-00000080): */  "\302\200" },
 569     { "2.1.3", /*  3 bytes (U-00000800): */  "\340\240\200" },
 570     { "2.1.4", /*  4 bytes (U-00010000): */  "\360\220\200\200", 0, URC },
 571     { "2.1.5", /*  5 bytes (U-00200000): */  "\370\210\200\200\200", URC },
 572     { "2.1.6", /*  6 bytes (U-04000000): */  "\374\204\200\200\200\200", URC },
 573
 574     /* 2.2  Last possible sequence of a certain length */
 575
 576     { "2.2.1", /*  1 byte  (U-0000007F): */  "\177" },
 577     { "2.2.2", /*  2 bytes (U-000007FF): */  "\337\277" },
 578     { "2.2.3", /*  3 bytes (U-0000FFFF): */  "\357\277\277" },
 579     { "2.2.4", /*  4 bytes (U-001FFFFF): */  "\367\277\277\277", URC },
 580     { "2.2.5", /*  5 bytes (U-03FFFFFF): */  "\373\277\277\277\277", URC },
 581     { "2.2.6", /*  6 bytes (U-7FFFFFFF): */  "\375\277\277\277\277\277", URC },
 582
 583     /* 2.3  Other boundary conditions */
 584
 585     { "2.3.1", /*  U-0000D7FF = ed 9f bf = */    "\355\237\277" },
 586     { "2.3.2", /*  U-0000E000 = ee 80 80 = */    "\356\200\200" },
 587     { "2.3.3", /*  U-0000FFFD = ef bf bd = */    URC },
 588     { "2.3.4", /*  U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
 589     { "2.3.5", /*  U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
 590
 591
 592     /* 3  Malformed sequences */
 593
 594     /* 3.1  Unexpected continuation bytes */
 595
 596     /* Each unexpected continuation byte should be separately signalled as a
 597        malformed sequence of its own. */
 598
 599     { "3.1.1", /*  First continuation byte 0x80: */ "\200", URC },
 600     { "3.1.2", /*  Last  continuation byte 0xbf: */ "\277", URC },
 601     { "3.1.3", /*  2 continuation bytes: */ "\200\277",     URC URC },
 602     { "3.1.4", /*  3 continuation bytes: */ "\200\277\200", URC URC URC },
 603     { "3.1.5", /*  4 continuation bytes: */ "\200\277\200\277",
 604       URC URC URC URC },
 605     { "3.1.6", /*  5 continuation bytes: */ "\200\277\200\277\200",
 606       URC URC URC URC URC },
 607     { "3.1.7", /*  6 continuation bytes: */ "\200\277\200\277\200\277",
 608       URC URC URC URC URC URC },
 609     { "3.1.8", /*  7 continuation bytes: */ "\200\277\200\277\200\277\200",
 610       URC URC URC URC URC URC URC },
 611
 612     { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
 613
 614       "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
 615       "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
 616       "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
 617       "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
 618       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 619       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 620       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 621       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 622
 623     /* 3.2  Lonely start characters */
 624
 625     { "3.2.1", /*  All 32 first bytes of 2-byte sequences (0xc0-0xdf),
 626                    each followed by a space character: */
 627
 628       "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
 629       "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
 630       "\332 \333 \334 \335 \336 \337 ",
 631       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 632       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 633
 634     { "3.2.2", /*  All 16 first bytes of 3-byte sequences (0xe0-0xef),
 635                    each followed by a space character: */
 636       "\340 \341 \342 \343 \344 \345 \346 \347 "
 637       "\350 \351 \352 \353 \354 \355 \356 \357 ",
 638       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 639
 640     { "3.2.3", /*  All 8 first bytes of 4-byte sequences (0xf0-0xf7),
 641                    each followed by a space character: */
 642       URC URC URC URC URC URC URC URC },
 643
 644     { "3.2.4", /*  All 4 first bytes of 5-byte sequences (0xf8-0xfb),
 645                    each followed by a space character: */
 646       "\370 \371 \372 \373 ",
 647       URC URC URC URC },
 648
 649     { "3.2.5", /*  All 2 first bytes of 6-byte sequences (0xfc-0xfd),
 650                    each followed by a space character: */
 651       "\374 \375 ", URC URC },
 652
 653     /* 3.3  Sequences with last continuation byte missing */
 654
 655     /* All bytes of an incomplete sequence should be signalled as a single
 656        malformed sequence, i.e., you should see only a single replacement
 657        character in each of the next 10 tests. (Characters as in section 2) */
 658
 659     { "3.3.1", /*  2-byte sequence with last byte missing (U+0000): */
 660       "\300", URC },
 661     { "3.3.2", /*  3-byte sequence with last byte missing (U+0000): */
 662       "\340\200", URC },
 663     { "3.3.3", /*  4-byte sequence with last byte missing (U+0000): */
 664       "\360\200\200", URC },
 665     { "3.3.4", /*  5-byte sequence with last byte missing (U+0000): */
 666       "\370\200\200\200", URC },
 667     { "3.3.5", /*  6-byte sequence with last byte missing (U+0000): */
 668       "\374\200\200\200\200", URC },
 669     { "3.3.6", /*  2-byte sequence with last byte missing (U-000007FF): */
 670       "\337", URC },
 671     { "3.3.7", /*  3-byte sequence with last byte missing (U-0000FFFF): */
 672       "\357\277", URC },
 673     { "3.3.8", /*  4-byte sequence with last byte missing (U-001FFFFF): */
 674       "\367\277\277", URC },
 675     { "3.3.9", /*  5-byte sequence with last byte missing (U-03FFFFFF): */
 676       "\373\277\277\277", URC },
 677     { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
 678       "\375\277\277\277\277", URC },
 679
 680     /* 3.4  Concatenation of incomplete sequences */
 681
 682     /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
 683        sequences being signalled: */
 684
 685     { "3.4",   "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
 686       "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
 687       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
 688
 689     /* 3.5  Impossible bytes */
 690
 691     /* The following two bytes cannot appear in a correct UTF-8 string */
 692
 693     { "3.5.1", /*  fe = */      "\376", URC },
 694     { "3.5.2", /*  ff = */      "\377", URC },
 695     { "3.5.3", /*  fe fe ff ff = */     "\376\376\377\377", URC URC URC URC },
 696
 697
 698     /* 4  Overlong sequences */
 699
 700     /* 4.1  Examples of an overlong ASCII character */
 701
 702     { "4.1.1", /* U+002F = c0 af             = */ "\300\257", URC },
 703     { "4.1.2", /* U+002F = e0 80 af          = */ "\340\200\257", URC },
 704     { "4.1.3", /* U+002F = f0 80 80 af       = */ "\360\200\200\257", URC },
 705     { "4.1.4", /* U+002F = f8 80 80 80 af    = */ "\370\200\200\200\257",
 706       URC },
 707     { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
 708       URC },
 709
 710     /* 4.2  Maximum overlong sequences */
 711
 712     { "4.2.1", /*  U-0000007F = c1 bf             = */ "\301\277", URC },
 713     { "4.2.2", /*  U-000007FF = e0 9f bf          = */ "\340\237\277", URC },
 714     { "4.2.3", /*  U-0000FFFF = f0 8f bf bf       = */ "\360\217\277\277",
 715       URC },
 716     { "4.2.4", /*  U-001FFFFF = f8 87 bf bf bf    = */ "\370\207\277\277\277",
 717       URC },
 718     { "4.2.5", /*  U-03FFFFFF = fc 83 bf bf bf bf = */  URC },
 719
 720     /* 4.3  Overlong representation of the NUL character */
 721
 722     { "4.3.1", /*  U+0000 = c0 80             = */  "\300\200", URC },
 723     { "4.3.2", /*  U+0000 = e0 80 80          = */  "\340\200\200", URC },
 724     { "4.3.3", /*  U+0000 = f0 80 80 80       = */  "\360\200\200\200", URC },
 725     { "4.3.4", /*  U+0000 = f8 80 80 80 80    = */  "\370\200\200\200\200",
 726       URC },
 727     { "4.3.5", /*  U+0000 = fc 80 80 80 80 80 = */  "\374\200\200\200\200\200",
 728       URC },
 729
 730
 731     /* 5  Illegal code positions */
 732
 733     /* 5.1 Single UTF-16 surrogates */
 734
 735     { "5.1.1", /*  U+D800 = ed a0 80 = */       "\355\240\200", URC },
 736     { "5.1.2", /*  U+DB7F = ed ad bf = */       "\355\255\277", URC },
 737     { "5.1.3", /*  U+DB80 = ed ae 80 = */       "\355\256\200", URC },
 738     { "5.1.4", /*  U+DBFF = ed af bf = */       "\355\257\277", URC },
 739     { "5.1.5", /*  U+DC00 = ed b0 80 = */       "\355\260\200", URC },
 740     { "5.1.6", /*  U+DF80 = ed be 80 = */       "\355\276\200", URC },
 741     { "5.1.7", /*  U+DFFF = ed bf bf = */       "\355\277\277", URC },
 742
 743     /* 5.2 Paired UTF-16 surrogates */
 744
 745     { "5.2.1", /*  U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
 746     { "5.2.2", /*  U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
 747     { "5.2.3", /*  U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
 748     { "5.2.4", /*  U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
 749     { "5.2.5", /*  U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
 750     { "5.2.6", /*  U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
 751     { "5.2.7", /*  U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
 752     { "5.2.8", /*  U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
 753
 754     /* 5.3 Other illegal code positions */
 755
 756     { "5.3.1", /*  U+FFFE = ef bf be = */       "\357\277\276" },
 757     { "5.3.2", /*  U+FFFF = ef bf bf = */       "\357\277\277" },
 758
 759
 760     /* 6 Some other junk */
 761
 762     { "6.0", "" },
 763     { "6.1", "\001\002\003\004\005 ABC" },
 764     { "6.2", /* every non-ASCII Latin1 character */
 765       "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
 766       "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
 767       "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
 768       "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
 769       "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
 770       "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
 771       "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
 772       "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
 773       "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
 774       "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
 775       "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
 776       "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
 777
 778     { "6.3", /* Christmas tree */
 779       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 780       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
 781       "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
 782       "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
 783       "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
 784       "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
 785       "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
 786       "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
 787       "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
 788       "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
 789       "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
 790       "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
 791       "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
 792       "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
 793       "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
 794       "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
 795
 796       "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
 797       "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
 798       " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 799       "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
 800       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 801       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 802       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 803       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 804       URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
 805       URC URC URC URC URC URC URC URC URC URC URC URC },
 806   };
 807
 808   int i;
 809   int ok = 1;
 810   for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
 811     {
 812       const char *name = tests[i].name;
 813       const char *in   = tests[i].in;
 814       const char *target = (tests[i].target ? tests[i].target : in);
 815       const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
 816       char *out = split_and_join (in);
 817       XChar2b *out16 = utf8_to_XChar2b (in, 0);
 818       char *out2 = XChar2b_to_utf8 (out16, 0);
 819       if (strcmp (out, target))
 820         {
 821           LOG (stderr, name, target);
 822           LOG (stderr, "FAIL", out);
 823           fprintf (stderr, "\n");
 824           ok = 0;
 825         }
 826       if (strcmp (out2, target2))
 827         {
 828           LOG (stderr, name, target2);
 829           LOG (stderr, "FAIL2", out2);
 830           fprintf (stderr, "\n");
 831           ok = 0;
 832         }
 833       free (out);
 834       free (out2);
 835       free (out16);
 836     }
 837
 838   /* Check conversion from UTF8 to Latin1 and ASCII. */
 839   {
 840     const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
 841                         "c\303\264t\303\251 de l'alc\303\264ve "
 842                         "ovo\303\257de, o\303\271 les b\303\273ches "
 843                         "se consument dans l'\303\242tre");
 844     const char *latin1 = ("son \356le int\351rieure, \340 "
 845                           "c\364t\351 de l'alc\364ve ovo\357de, "
 846                           "o\371 les b\373ches se consument dans "
 847                           "l'\342tre");
 848     const char *ascii = ("son ile interieure, a cote de l'alcove "
 849                          "ovoide, ou les buches se consument dans "
 850                          "l'atre");
 851     char *latin1b = utf8_to_latin1 (utf8, False);
 852     char *ascii2  = utf8_to_latin1 (utf8, True);
 853     if (strcmp (latin1, latin1b))
 854       {
 855         LOG (stderr, "LATIN1", utf8);
 856         LOG (stderr, "FAIL3", latin1b);
 857         fprintf (stderr, "\n");
 858         ok = 0;
 859       }
 860     if (strcmp (ascii, ascii2))
 861       {
 862         LOG (stderr, "ASCII", utf8);
 863         LOG (stderr, "FAIL4", ascii2);
 864         fprintf (stderr, "\n");
 865         ok = 0;
 866       }
 867     free (latin1b);
 868     free (ascii2);
 869   }
 870
 871   /* Check de-composition of emoji that should all be treated as a unit
 872      for measurement and display purposes. */
 873   {
 874     static const char * const tests[] = {
 875
 876       /* 0: "Man" */
 877       " \360\237\221\250 ",
 878
 879       /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */
 880       " \360\237\247\233\360\237\217\277 ",
 881
 882       /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" =
 883             1F468 1F3FF 200D 1F3EB
 884        */
 885       " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ",
 886
 887       /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */
 888       " \360\237\217\203\342\200\215\342\231\200 ",
 889
 890       /* 4: "Runner, ZWJ, rocket ship" = 1F3C3 200D 1F680 ("Woman" would be 1F469) */
 891       " \360\237\217\203\342\200\215\360\237\232\200 ",
 892
 893       /* 5:
 894          Group of people displayed as a single glyph:
 895            Woman, dark skin tone, ZWJ,   1F469 1F3FF 200D
 896            Man, light skin tone, ZWJ,    1F468 1F3FB 200D
 897            Boy, medium skin tone, ZWJ,   1F466 1F3FD 200D
 898            Girl, dark skin tone.         1F467 1F3FF
 899        */
 900       " \360\237\221\251\360\237\217\277\342\200\215"
 901        "\360\237\221\250\360\237\217\273\342\200\215"
 902        "\360\237\221\246\360\237\217\275\342\200\215"
 903        "\360\237\221\247\360\237\217\277 ",
 904     };
 905     int i;
 906     for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
 907       {
 908         int L = 0;
 909         char **out = utf8_split (tests[i], &L);
 910         char name[100];
 911         int j;
 912         sprintf (name, "SPLIT %d: %d glyphs", i, L-2);
 913         if (L != 3)
 914           {
 915             LOG (stderr, name, tests[i]);
 916             ok = 0;
 917           }
 918         for (j = 0; j < L; j++)
 919           free (out[j]);
 920         free (out);
 921       }
 922   }
 923
 924   if (ok) fprintf (stderr, "OK\n");
 925   return (ok == 0);
 926 }
 927
 928 #endif /* SELFTEST */