1 /* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org>
3 * Permission to use, copy, modify, distribute, and sell this software and its
4 * documentation for any purpose is hereby granted without fee, provided that
5 * the above copyright notice appear in all copies and that both that
6 * copyright notice and this permission notice appear in supporting
7 * documentation. No representations are made about the suitability of this
8 * software for any purpose. It is provided "as is" without express or
22 #else /* !HAVE_JWXYZ */
23 # include <X11/Xlib.h>
29 /* "Unicode Replacement Character" (U+FFFD), displayed in lieu of invalid
   characters.  uc_truncate substitutes it for out-of-range values and
   utf8_decode uses it as the initial result, so malformed input decodes
   to this value. */
30 # define INVALID 0xFFFD
33 /* Mask the number to be within the valid range of Unicode characters. */
/* Sanitizes a single code point.

   uc   The candidate code point.

   Only the low 31 bits are kept.  A value above 0x10FFFF, or the value 0,
   is replaced with INVALID (U+FFFD).  The UTF-16 surrogate range
   0xD800-0xDFFF is tested for as well.

   NOTE(review): the statement guarded by the surrogate test and the return
   statement are not visible in this listing.  Presumably the surrogate is
   replaced with INVALID and the sanitized value is returned (utf8_decode
   and utf8_encode both assign the result back to uc) -- confirm. */
36 uc_truncate (unsigned long uc)
38 uc &= 0x7FFFFFFFL; /* Unicode is 31 bits */
39 if (uc > 0x10FFFF) uc = INVALID; /* But UTF-8 is 4 bytes */
40 if (uc == 0) uc = INVALID; /* no nulls */
42 if (uc >= 0xD800 && uc <= 0xDFFF)
43 /* Reserved for use with UTF-16: not a real character. */
50 /* Parse the first UTF8 character at the front of the string.
51 Return the Unicode character (through unicode_ret), and the number of
   bytes read (as the function result). */
/* Decodes one character from the front of a UTF-8 byte buffer.

   in           Bytes to parse.  Reads are bounded by end = in + length.
   length       Number of bytes available at in.  When it is <= 0 the code
                jumps straight to DONE without examining any byte.
   unicode_ret  Out-parameter for the decoded character.
                NOTE(review): the store itself is not visible in this
                listing; callers in this file pass the address of a local
                and read it afterwards.

   The result starts out as INVALID and is finally passed through
   uc_truncate.  Lead bytes for the 1- to 6-byte forms are recognized; any
   other lead byte is marked as an unparsable sequence.

   NOTE(review): several short statements are missing from this listing
   (loading the lead byte into c, setting min, advancing in, the DONE
   label, the return).  The comments below describe only what is shown. */
54 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
56 const unsigned char *start = in;
57 const unsigned char *end = in + length;
58 unsigned long uc = INVALID;
59 unsigned long min = 0;
62 if (length <= 0) goto DONE;
/* PREMATURE_EOF: the buffer ends before the sequence does.  Moving in to
   end marks everything that remains as consumed before finishing up. */
66 # define PREMATURE_EOF { in = end; goto DONE; }
/* Each multi-byte branch first checks that enough bytes remain before end,
   then assembles the value from the low bits of the lead byte and the low
   six bits (mask 0x3F) of each following byte.  The indexing in[0], in[1],
   ... implies that in already points past the lead byte here (the
   statement that advances it is not visible). */
68 if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */
71 } else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */
72 uc = (c & 0x7F); /* 01111111 */
74 } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
75 if (in+1 > end) PREMATURE_EOF;
77 uc = (((c & 0x1F) << 6) | /* 00011111------ */
78 (in[0] & 0x3F)); /* 00111111 */
81 } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
82 if (in+2 > end) PREMATURE_EOF;
84 uc = (((c & 0x0F) << 12) | /* 00001111----+------- */
85 ((in[0] & 0x3F) << 6) | /* 00111111------ */
86 ((in[1] & 0x3F))); /* 00111111 */
89 } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
90 if (in+3 > end) PREMATURE_EOF;
92 uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */
93 ((in[0] & 0x3F) << 12) | /* 00111111----+------- */
94 ((in[1] & 0x3F) << 6) | /* 00111111------ */
95 ((in[2] & 0x3F))); /* 00111111 */
98 } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
99 if (in+4 > end) PREMATURE_EOF;
101 uc = (((c & 0x03) << 24) | /* 00000011--------+-------+------- */
102 ((in[0] & 0x3F) << 18) | /* 00111111--+-------+------- */
103 ((in[1] & 0x3F) << 12) | /* 00111111----+------- */
104 ((in[2] & 0x3F) << 6) | /* 00111111------ */
105 ((in[3] & 0x3F))); /* 00111111 */
108 } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
109 if (in+5 > end) PREMATURE_EOF;
111 uc = (((c & 0x01) << 30) | /* 00000001------+-------+-------+------- */
112 ((in[0] & 0x3F) << 24) | /* 00111111+-------+-------+------- */
113 ((in[1] & 0x3F) << 18) | /* 00111111--+-------+------- */
114 ((in[2] & 0x3F) << 12) | /* 00111111----+------- */
115 ((in[3] & 0x3F) << 6) | /* 00111111------ */
116 ((in[4] & 0x3F))); /* 00111111 */
119 uc = INVALID; /* Unparsable sequence. */
126 /* If any of the continuation bytes didn't begin with the continuation tag,
127 the sequence is invalid; stop at the bad byte, not consuming later ones.
128 (It's easier to check this after the fact than up above.) */
/* start[0] is the lead byte, so the scan begins at index 1.  At this point
   length presumably holds the number of bytes consumed so far (the
   assignment is not visible in this listing). */
131 for (i = 1; i < length; i++)
132 if ((start[i] & 0xC0) != 0x80) {
140 /* A multi-byte sequence encoded a character that could have been
141 encoded with a shorter sequence, e.g., hiding ASCII inside a
142 multi-byte sequence. Something hinky's going on. Reject it. */
/* Final clamp: out-of-range values, zero and surrogates become INVALID. */
145 uc = uc_truncate (uc);
154 /* Converts a Unicode character to a multi-byte UTF8 sequence.
155 Returns the number of bytes written. */
/* Encodes one code point as UTF-8.

   uc      Code point to encode; it is passed through uc_truncate first.
   out     Destination buffer.
   length  Number of bytes available at out.

   Returns the number of bytes stored, computed as out - old.  The visible
   code stores only the encoded bytes; no terminating NUL is written here.
   Every branch also requires length to be large enough for that form, so
   when the buffer is too small for the needed form and shorter than six
   bytes, no branch is taken and the result is 0.

   NOTE(review): uc_truncate replaces anything above 0x10FFFF with INVALID,
   so after the call on the first line the 5-byte and 6-byte branches look
   unreachable -- confirm before relying on them. */
158 utf8_encode (unsigned long uc, char *out, long length)
160 const char *old = out;
162 uc = uc_truncate (uc);
164 if (uc < 0x80 && length >= 1) /* 7 bits in 1 byte */
166 *out++ = uc; /* 0xxxxxxx */
168 else if (uc < 0x800 && length >= 2) /* 11 bits in 2 bytes */
170 *out++ = (0xC0 | ((uc >> 6) & 0x1F)); /* 110xxxxx */
171 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
173 else if (uc < 0x10000L && length >= 3) /* 16 bits in 3 bytes */
175 *out++ = (0xE0 | ((uc >> 12) & 0x0F)); /* 1110xxxx */
176 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
177 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
179 else if (uc < 0x200000L && length >= 4) /* 21 bits in 4 bytes */
181 *out++ = (0xF0 | ((uc >> 18) & 0x07)); /* 11110xxx */
182 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
183 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
184 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
186 else if (uc < 0x4000000L && length >= 5) /* 26 bits in 5 bytes */
188 *out++ = (0xF8 | ((uc >> 24) & 0x03)); /* 111110xx */
189 *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */
190 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
191 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
192 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
194 else if (length >= 6) /* 31 bits in 6 bytes */
196 *out++ = (0xFC | ((uc >> 30) & 0x01)); /* 1111110x */
197 *out++ = (0x80 | ((uc >> 24) & 0x3F)); /* 10xxxxxx */
198 *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */
199 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
200 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
201 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
/* Bytes written = distance the output pointer has advanced. */
204 return (int) (out - old);
208 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
209 This only handles characters that can be represented in 16 bits, the
210 Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.) */
/* Converts a NUL-terminated UTF-8 string into a heap-allocated XChar2b
   array.

   string      Input text; its length is taken with strlen.
   length_ret  Receives the number of XChar2b entries produced (out - c2b).
               NOTE(review): the selftest passes 0 here, so a null check
               presumably guards the store, but it is not visible in this
               listing.

   The array is allocated with malloc and shrunk with realloc; presumably
   the caller owns it and must free it. */
213 utf8_to_XChar2b (const char *string, int *length_ret)
215 long in_len = strlen(string);
216 const unsigned char *in = (const unsigned char *) string;
217 const unsigned char *in_end = in + in_len;
/* One slot per input byte plus one more.  Presumably an upper bound, since
   each decoded character consumes at least one byte -- confirm. */
218 XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
/* Decode using the bytes that remain (in_end - in), so the decoder cannot
   look past the end of the string. */
224 unsigned long uc = 0;
225 long L = utf8_decode (in, in_end - in, &uc);
228 /* If it can't be represented in a 16-bit XChar2b,
229 use "Unicode Replacement Character". */
230 if (uc > 0xFFFF) uc = INVALID;
/* High byte goes in byte1, low byte in byte2. */
232 out->byte1 = (uc >> 8) & 0xFF;
233 out->byte2 = uc & 0xFF;
241 *length_ret = (int) (out - c2b);
/* Shrink the allocation to the entries used plus one.
   NOTE(review): the realloc result overwrites c2b directly.  If realloc
   returned NULL the original block would be lost; assigning through a
   temporary and keeping the old pointer on failure would avoid that. */
244 c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
250 /* Split a UTF8 string into an array of strings, one per character.
251 The sub-strings will be null terminated and may be multiple bytes. */
/* Splits a NUL-terminated UTF-8 string into one small string per displayed
   character.

   string      Input text; its length is taken with strlen.
   length_ret  Out-parameter for the number of strings (the selftest passes
               the address of an int and loops up to it).  NOTE(review):
               the store itself is not visible in this listing.

   Each piece is created with strdup and collected in a malloc-ed array
   that is finally shrunk with realloc, so presumably the caller frees
   every string and then the array.  Combining marks, emoji skin tone
   modifiers and Zero Width Joiner sequences are merged into the preceding
   entry instead of getting entries of their own. */
254 utf8_split (const char *string, int *length_ret)
256 const unsigned char *in = (const unsigned char *) string;
257 long len = strlen (string);
258 const unsigned char *end = in + len;
/* One pointer per input byte plus one more: an upper bound on the number
   of pieces. */
259 char **ret = (char **) malloc ((len+1) * sizeof(*ret));
/* NOTE(review): this passes len, the length of the whole string, even
   after in has advanced.  utf8_to_XChar2b and utf8_to_latin1 pass the
   bytes remaining (in_end - in).  Passing end - in here would keep
   utf8_decode from reading past the terminating NUL when the string ends
   in a truncated multi-byte sequence. */
267 long len2 = utf8_decode (in, len, &uc);
/* Copy the len2 bytes of this character into a scratch buffer and
   duplicate it.  strncpy with an exact count adds no terminator; the
   declaration and termination of tmp are not visible in this listing. */
269 strncpy (tmp, (char *) in, len2);
271 ret[i++] = strdup (tmp);
274 /* If this is a Combining Diacritical, append it to the previous
275 character. E.g., "y\314\206\314\206" is one string, not three.
277 If this is ZWJ, Zero Width Joiner, then we append both this character
278 and the following character, e.g. "X ZWJ Y" is one string not three.
280 #### Hmmm, should this also include every character in the
281 "Symbol, Modifier" category, or does ZWJ get used for those?
282 https://www.fileformat.info/info/unicode/category/Sk/list.htm
284 Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8"
285 should be a single glyph? Or is that what "Combining Cedilla, 0327"
286 is for? I'm confused by the fact that the skin tones (1F3FB-1F3FF)
287 do not seem to be in a readily-identifiable block the way the various
288 combining diacriticals are.
291 ((uc >= 0x300 && uc <= 0x36F) || /* Combining Diacritical */
292 (uc >= 0x1AB0 && uc <= 0x1AFF) || /* Combining Diacritical Ext. */
293 (uc >= 0x1DC0 && uc <= 0x1DFF) || /* Combining Diacritical Supp. */
294 (uc >= 0x20D0 && uc <= 0x20FF) || /* Combining Diacritical Sym. */
295 (uc >= 0xFE20 && uc <= 0xFE2F) || /* Combining Half Marks */
296 (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
297 zwjp || uc == 0x200D)) /* Zero Width Joiner */
/* Merge the two newest entries: ret[i-2] is the previous piece and
   ret[i-1] is the character just split off.  Both are copied back to back
   into a fresh buffer of L1 + L2 + 1 bytes.  The exact-length strncpy
   calls copy no terminator, so s2 has to be NUL-terminated separately
   (that statement, and the release of the two old strings, are not
   visible in this listing). */
299 long L1 = strlen(ret[i-2]);
300 long L2 = strlen(ret[i-1]);
301 char *s2 = (char *) malloc (L1 + L2 + 1);
302 strncpy (s2, ret[i-2], L1);
303 strncpy (s2 + L1, ret[i-1], L2);
308 zwjp = (uc == 0x200D); /* Swallow the next character as well */
/* Shrink the pointer array to the entries used plus one.
   NOTE(review): the realloc result overwrites ret directly.  If realloc
   returned NULL the array and every string in it would be lost; assigning
   through a temporary would avoid that. */
317 ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
323 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string. */
/* Converts an XChar2b array into a heap-allocated UTF-8 string.

   in          Input array; the end is the first entry whose byte1 and
               byte2 are both zero.
   length_ret  Receives out_len, computed below as bytes used plus one.
               NOTE(review): the selftest passes 0 here, so a null check
               presumably guards the store, but it is not visible in this
               listing.

   The string is allocated with malloc and shrunk with realloc; presumably
   the caller owns it and must free it. */
326 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
329 const XChar2b *in_end;
334 /* Find the null termination on the XChar2b. */
335 for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
/* Worst case is three UTF-8 bytes per 16-bit character; one extra byte is
   allocated beyond out_len. */
338 out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */
339 utf8 = out = (char *) malloc (out_len + 1);
341 out_end = out + out_len;
/* Rebuild the 16-bit value (byte1 is the high byte) and encode it into
   the space that remains before out_end. */
345 unsigned long uc = (in->byte1 << 8) | in->byte2;
346 int wrote = utf8_encode (uc, out, out_end - out);
347 if (wrote > 3) abort(); /* Can't happen with 16 bit input */
/* out_len now becomes the number of bytes used plus one, which presumably
   accounts for the terminating NUL. */
353 out_len = (int) (out - utf8 + 1);
356 *length_ret = out_len;
/* NOTE(review): the realloc result overwrites utf8 directly.  If realloc
   returned NULL the original block would be lost; assigning through a
   temporary would avoid that. */
359 utf8 = (char *) realloc (utf8, out_len);
365 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent. */
/* Converts a NUL-terminated UTF-8 string to a heap-allocated single-byte
   string.

   string   Input text; its length is taken with strlen.
   ascii_p  When true, the result is reduced further to ASCII look-alikes
            using the latin1_to_ascii table below.

   Combining marks are set to 0 (described as discarded), a set of General
   Punctuation characters is mapped to Latin1 stand-ins, and anything that
   still has no equivalent becomes an inverted question mark, or a hash
   sign in ASCII mode.  The buffer is allocated with malloc and shrunk with
   realloc; presumably the caller owns it and must free it.

   NOTE(review): many one-line statements are missing from this listing
   (the loop, several assignments and conditions).  Runs of case labels
   with no visible statement share an assignment that is not shown. */
368 utf8_to_latin1 (const char *string, Bool ascii_p)
370 long in_len = strlen(string);
371 const unsigned char *in = (const unsigned char *) string;
372 const unsigned char *in_end = in + in_len;
/* The visible store below writes one byte per decoded character, so the
   output cannot be longer than the input. */
373 unsigned char *ret = (unsigned char *) malloc (in_len + 1);
374 unsigned char *out = ret;
/* Decode using the bytes that remain (in_end - in). */
380 unsigned long uc = 0;
381 long len2 = utf8_decode (in, in_end - in, &uc);
384 if (uc == '\240') /* U+00A0 NO-BREAK SPACE */
386 else if (uc >= 0x300 && uc <= 0x36F)
387 uc = 0; /* Discard "Combining Diacritical Marks" */
388 else if (uc >= 0x1AB0 && uc <= 0x1AFF)
389 uc = 0; /* Discard "Combining Diacritical Marks Extended" */
390 else if (uc >= 0x1DC0 && uc <= 0x1DFF)
391 uc = 0; /* Discard "Combining Diacritical Marks Supplement" */
392 else if (uc >= 0x20D0 && uc <= 0x20FF)
393 uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */
394 else if (uc >= 0xFE20 && uc <= 0xFE2F)
395 uc = 0; /* Discard "Combining Half Marks" */
400 /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
402 case 0x2000: /* EN QUAD */
403 case 0x2001: /* EM QUAD */
404 case 0x2002: /* EN SPACE */
405 case 0x2003: /* EM SPACE */
406 case 0x2004: /* THREE-PER-EM SPACE */
407 case 0x2005: /* FOUR-PER-EM SPACE */
408 case 0x2006: /* SIX-PER-EM SPACE */
409 case 0x2007: /* FIGURE SPACE */
410 case 0x2008: /* PUNCTUATION SPACE */
411 case 0x2009: /* THIN SPACE */
412 case 0x200A: /* HAIR SPACE */
416 case 0x2010: /* HYPHEN */
417 case 0x2011: /* NON-BREAKING HYPHEN */
418 case 0x2012: /* FIGURE DASH */
419 case 0x2013: /* EN DASH */
420 case 0x2014: /* EM DASH */
421 case 0x2015: /* HORIZONTAL BAR */
425 case 0x2018: /* LEFT SINGLE QUOTATION MARK */
426 case 0x2019: /* RIGHT SINGLE QUOTATION MARK */
427 case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */
428 case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
432 case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
433 case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
434 case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */
435 case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
439 case 0x2022: uc = '\267'; break; /* BULLET */
440 case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
441 case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
442 case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */
443 case 0x2038: uc = '^'; break; /* CARET */
444 case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
445 case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
446 case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */
447 case 0x2042: uc = '*'; break; /* ASTERISM */
448 case 0x2043: uc = '='; break; /* HYPHEN BULLET */
449 case 0x2044: uc = '/'; break; /* FRACTION SLASH */
450 case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
451 case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
452 case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
453 case 0x204E: uc = '*'; break; /* LOW ASTERISK */
454 case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */
/* NOTE(review): the condition guarding the replacement below is not
   visible in this listing; presumably it applies to characters that do
   not fit in one byte -- confirm. */
460 /* "Inverted question mark" looks enough like 0xFFFD,
461 the "Unicode Replacement Character". */
462 uc = (ascii_p ? '#' : '\277');
464 if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
/* The table has exactly 96 entries, one for each of 0xA0 through 0xFF, and
   the three 32-character literals fill it completely with no room for a
   terminating NUL.  It is indexed with uc - 0xA0.
   NOTE(review): that index is only valid for 0xA0 <= uc <= 0xFF; the range
   check is not visible in this listing -- confirm it exists. */
466 const unsigned char latin1_to_ascii[96] =
467 " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
468 "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
469 "aaaaaaeceeeeiiiionooooo/ouuuuypy";
471 uc = latin1_to_ascii[uc - 0xA0];
/* NOTE(review): characters set to 0 above are described as discarded, so
   this store is presumably skipped for them; the guard is not visible. */
475 *out++ = (unsigned char) uc;
/* Shrink the buffer to the bytes used plus one.
   NOTE(review): the realloc result overwrites ret directly.  If realloc
   returned NULL the original block would be lost; assigning through a
   temporary would avoid that. */
480 ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
486 /*************************************************************************
488 cd ../hacks ; make test-utf8wc
490 *************************************************************************/
494 /* Convert a UTF8 string to Unicode and back again. */
/* Selftest helper: decodes a UTF-8 string into an array of code points
   with utf8_decode, then re-encodes that array with utf8_encode into a
   newly allocated string.

   string  NUL-terminated input; its length is taken with strlen.

   NOTE(review): the loops, the sizing of the output buffer and the return
   statement are not visible in this listing. */
497 split_and_join (const char *string)
499 const unsigned char *in = (const unsigned char *) string;
500 int len = strlen (string);
501 const unsigned char *end = in + len;
/* One code point per input byte plus one more: an upper bound. */
502 unsigned long *unicode = (unsigned long *)
503 malloc((len + 1) * sizeof(*unicode));
505 char *ret, *out, *out_end;
/* NOTE(review): this passes len, the length of the whole string, even
   after in has advanced.  utf8_to_XChar2b and utf8_to_latin1 pass the
   bytes remaining (in_end - in).  Passing end - in here would keep
   utf8_decode from reading past the terminating NUL when the string ends
   in a truncated multi-byte sequence. */
509 long len2 = utf8_decode (in, len, &unicode[i]);
/* NOTE(review): i is reused as the byte size of the output buffer here;
   the statement that computes that size is not visible in this listing. */
516 out = ret = (char *) malloc(i);
/* Encode into whatever space remains before out_end. */
521 int len2 = utf8_encode (unicode[i], out, out_end - out);
/* Selftest helper: prints one labelled, quoted and escaped string.

   out     Stream to print to.
   prefix  Label, right-aligned in a six-character field and followed by
           a colon, a space and an opening double quote.
   s       Text to print.  A double quote or backslash is preceded by a
           backslash; a byte below 32 or at or above 127 is printed as a
           backslash and three octal digits; anything else is printed
           unchanged.  A closing double quote and a newline follow.

   NOTE(review): the loop that walks s is not visible in this listing. */
533 LOG (FILE *out, const char *prefix, const char *s)
535 fprintf (out, "%6s: \"", prefix);
538 unsigned char c = *s;
539 if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
540 else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
541 else fprintf (out, "%c", c);
544 fprintf (out, "\"\n");
/* Selftest driver (the section is closed by the SELFTEST endif).

   It performs three groups of checks and reports mismatches on stderr
   through LOG:
     1. Every entry of the tests table is run through split_and_join and
        through utf8_to_XChar2b followed by XChar2b_to_utf8, and each
        result is compared with strcmp against the expected string.
     2. A French sentence is converted with utf8_to_latin1 in Latin1 mode
        and in ASCII mode and compared against hand-written expectations.
     3. A few emoji sequences are split with utf8_split.
   When ok is still set at the end, the text OK is printed.

   NOTE(review): many short lines, including some test inputs and expected
   values, are missing from this listing. */
549 main (int argc, char **argv)
551 /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
554 # define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */
/* Table fields: name is the label used in failure reports, in is the
   input, target is the expected split_and_join output (the input itself
   when omitted), and target2 is the expected output of the 16-bit XChar2b
   round trip (target when omitted). */
556 static const struct { const char *name, *in, *target, *target2; } tests[] = {
557 /* 1 Some correct UTF-8 text */
559 /* The Greek word 'kosme': */
560 { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
563 /* 2 Boundary condition test cases */
565 /* 2.1 First possible sequence of a certain length */
567 { "2.1.1", /* 1 byte (U-00000000): */ "\000" },
568 { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" },
569 { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" },
570 { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC },
571 { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC },
572 { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC },
574 /* 2.2 Last possible sequence of a certain length */
576 { "2.2.1", /* 1 byte (U-0000007F): */ "\177" },
577 { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" },
578 { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" },
579 { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC },
580 { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC },
581 { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC },
583 /* 2.3 Other boundary conditions */
585 { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" },
586 { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" },
587 { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC },
588 { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
589 { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
592 /* 3 Malformed sequences */
594 /* 3.1 Unexpected continuation bytes */
596 /* Each unexpected continuation byte should be separately signalled as a
597 malformed sequence of its own. */
599 { "3.1.1", /* First continuation byte 0x80: */ "\200", URC },
600 { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC },
601 { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC },
602 { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC },
603 { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277",
605 { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200",
606 URC URC URC URC URC },
607 { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277",
608 URC URC URC URC URC URC },
609 { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200",
610 URC URC URC URC URC URC URC },
612 { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
614 "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
615 "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
616 "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
617 "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
618 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
619 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
620 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
621 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
623 /* 3.2 Lonely start characters */
625 { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf),
626 each followed by a space character: */
628 "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
629 "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
630 "\332 \333 \334 \335 \336 \337 ",
631 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
632 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
634 { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef),
635 each followed by a space character: */
636 "\340 \341 \342 \343 \344 \345 \346 \347 "
637 "\350 \351 \352 \353 \354 \355 \356 \357 ",
638 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
640 { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7),
641 each followed by a space character: */
642 URC URC URC URC URC URC URC URC },
644 { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb),
645 each followed by a space character: */
646 "\370 \371 \372 \373 ",
649 { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd),
650 each followed by a space character: */
651 "\374 \375 ", URC URC },
653 /* 3.3 Sequences with last continuation byte missing */
655 /* All bytes of an incomplete sequence should be signalled as a single
656 malformed sequence, i.e., you should see only a single replacement
657 character in each of the next 10 tests. (Characters as in section 2) */
659 { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */
661 { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */
663 { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */
664 "\360\200\200", URC },
665 { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */
666 "\370\200\200\200", URC },
667 { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */
668 "\374\200\200\200\200", URC },
669 { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */
671 { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */
673 { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */
674 "\367\277\277", URC },
675 { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */
676 "\373\277\277\277", URC },
677 { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
678 "\375\277\277\277\277", URC },
680 /* 3.4 Concatenation of incomplete sequences */
682 /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
683 sequences being signalled: */
/* NOTE(review): the expected value for this entry holds 15 replacement
   characters, not 10.  Presumably that is because a lead byte swallows
   the byte that breaks its sequence, which leaves some continuation bytes
   to be reported on their own -- confirm against utf8_decode. */
685 { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
686 "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
687 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
689 /* 3.5 Impossible bytes */
691 /* The following two bytes cannot appear in a correct UTF-8 string */
693 { "3.5.1", /* fe = */ "\376", URC },
694 { "3.5.2", /* ff = */ "\377", URC },
695 { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC },
698 /* 4 Overlong sequences */
700 /* 4.1 Examples of an overlong ASCII character */
702 { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC },
703 { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC },
704 { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC },
705 { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257",
707 { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
710 /* 4.2 Maximum overlong sequences */
712 { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC },
713 { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC },
714 { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277",
716 { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277",
718 { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC },
720 /* 4.3 Overlong representation of the NUL character */
722 { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC },
723 { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC },
724 { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC },
725 { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200",
727 { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200",
731 /* 5 Illegal code positions */
733 /* 5.1 Single UTF-16 surrogates */
735 { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC },
736 { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC },
737 { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC },
738 { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC },
739 { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC },
740 { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC },
741 { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC },
743 /* 5.2 Paired UTF-16 surrogates */
745 { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
746 { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
747 { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
748 { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
749 { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
750 { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
751 { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
752 { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
754 /* 5.3 Other illegal code positions */
756 { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" },
757 { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" },
760 /* 6 Some other junk */
763 { "6.1", "\001\002\003\004\005 ABC" },
764 { "6.2", /* every non-ASCII Latin1 character */
765 "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
766 "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
767 "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
768 "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
769 "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
770 "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
771 "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
772 "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
773 "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
774 "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
775 "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
776 "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
778 { "6.3", /* Christmas tree */
779 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
780 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
781 "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
782 "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
783 "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
784 "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
785 "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
786 "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
787 "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
788 "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
789 "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
790 "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
791 "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
792 "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
793 "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
794 "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
796 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
797 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
798 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
799 "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
800 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
801 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
802 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
803 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
804 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
805 URC URC URC URC URC URC URC URC URC URC URC URC },
/* Run every table entry through both round trips.  A missing target
   falls back to the input and a missing target2 falls back to target. */
810 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
812 const char *name = tests[i].name;
813 const char *in = tests[i].in;
814 const char *target = (tests[i].target ? tests[i].target : in);
815 const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
816 char *out = split_and_join (in);
817 XChar2b *out16 = utf8_to_XChar2b (in, 0);
818 char *out2 = XChar2b_to_utf8 (out16, 0);
819 if (strcmp (out, target))
821 LOG (stderr, name, target);
822 LOG (stderr, "FAIL", out);
823 fprintf (stderr, "\n");
826 if (strcmp (out2, target2))
828 LOG (stderr, name, target2);
829 LOG (stderr, "FAIL2", out2);
830 fprintf (stderr, "\n");
838 /* Check conversion from UTF8 to Latin1 and ASCII. */
840 const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
841 "c\303\264t\303\251 de l'alc\303\264ve "
842 "ovo\303\257de, o\303\271 les b\303\273ches "
843 "se consument dans l'\303\242tre");
844 const char *latin1 = ("son \356le int\351rieure, \340 "
845 "c\364t\351 de l'alc\364ve ovo\357de, "
846 "o\371 les b\373ches se consument dans "
848 const char *ascii = ("son ile interieure, a cote de l'alcove "
849 "ovoide, ou les buches se consument dans "
851 char *latin1b = utf8_to_latin1 (utf8, False);
852 char *ascii2 = utf8_to_latin1 (utf8, True);
853 if (strcmp (latin1, latin1b))
855 LOG (stderr, "LATIN1", utf8);
856 LOG (stderr, "FAIL3", latin1b);
857 fprintf (stderr, "\n");
860 if (strcmp (ascii, ascii2))
862 LOG (stderr, "ASCII", utf8);
863 LOG (stderr, "FAIL4", ascii2);
864 fprintf (stderr, "\n");
871 /* Check de-composition of emoji that should all be treated as a unit
872 for measurement and display purposes. */
874 static const char * const tests[] = {
877 " \360\237\221\250 ",
879 /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */
880 " \360\237\247\233\360\237\217\277 ",
882 /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" =
883 1F468 1F3FF 200D 1F3EB
885 " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ",
887 /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */
888 " \360\237\217\203\342\200\215\342\231\200 ",
890 /* 4: "Woman astronaut" = "Woman, ZWJ, rocket ship" = 1F3C3 200D 1F680.
   NOTE(review): the bytes below do encode 1F3C3, but test 3 names that
   code point Runner and the group test further down lists Woman as
   1F469.  Confirm which base character was intended. */
891 " \360\237\217\203\342\200\215\360\237\232\200 ",
894 Group of people displayed as a single glyph:
895 Woman, dark skin tone, ZWJ, 1F469 1F3FF 200D
896 Man, light skin tone, ZWJ, 1F468 1F3FB 200D
897 Boy, medium skin tone, ZWJ, 1F466 1F3FD 200D
898 Girl, dark skin tone. 1F467 1F3FF
900 " \360\237\221\251\360\237\217\277\342\200\215"
901 "\360\237\221\250\360\237\217\273\342\200\215"
902 "\360\237\221\246\360\237\217\275\342\200\215"
903 "\360\237\221\247\360\237\217\277 ",
/* Split each emoji test string.  Every string starts and ends with a
   plain space, so L-2 is presumably the glyph count without those two
   entries.  NOTE(review): the checks that decide ok are not visible in
   this listing. */
906 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
909 char **out = utf8_split (tests[i], &L);
912 sprintf (name, "SPLIT %d: %d glyphs", i, L-2);
915 LOG (stderr, name, tests[i]);
918 for (j = 0; j < L; j++)
924 if (ok) fprintf (stderr, "OK\n");
928 #endif /* SELFTEST */