1 /* xscreensaver, Copyright (c) 2014 Jamie Zawinski <jwz@jwz.org>
3 * Permission to use, copy, modify, distribute, and sell this software and its
4 * documentation for any purpose is hereby granted without fee, provided that
5 * the above copyright notice appear in all copies and that both that
6 * copyright notice and this permission notice appear in supporting
7 * documentation. No representations are made about the suitability of this
8 * software for any purpose. It is provided "as is" without express or
22 # elif defined(HAVE_ANDROID)
24 #else /* !HAVE_COCOA */
25 # include <X11/Xlib.h>
31 /* "Unicode Replacement Character", displayed in lieu of invalid characters. */
32 # define INVALID 0xFFFD
35 /* Mask the number to be within the valid range of unicode characters.
   The value is first masked to 31 bits.  Anything above 0x10FFFF, and
   zero, is replaced with INVALID (0xFFFD).  The UTF-16 surrogate range
   0xD800-0xDFFF is tested last.
38 uc_truncate (unsigned long uc)
40 uc &= 0x7FFFFFFFL; /* Unicode is 31 bits */
41 if (uc > 0x10FFFF) uc = INVALID; /* But UTF-8 is 4 bytes: U+10FFFF is the ceiling */
42 if (uc == 0) uc = INVALID; /* no nulls: zero would read as a string terminator */
44 if (uc >= 0xD800 && uc <= 0xDFFF)
45 /* Reserved for use with UTF-16 surrogate pairs: not a real character. */
52 /* Parse the first UTF8 character at the front of the string.
53 Return the Unicode character, and the number of bytes read.
   IN points at a buffer of LENGTH bytes; nothing is parsed when
   LENGTH <= 0.  Lead bytes announcing sequences of 1 to 6 bytes are
   recognized.  The result starts out as INVALID (0xFFFD), is set to
   INVALID again for an unparsable lead byte, and is finally passed
   through uc_truncate().
   NOTE(review): utf8_split passes 0 for UNICODE_RET, so the pointer is
   presumably optional -- confirm against the complete definition.
56 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
58 const unsigned char *start = in;
59 const unsigned char *end = in + length;
60 unsigned long uc = INVALID;
61 unsigned long min = 0;
64 if (length <= 0) goto DONE;
68 # define PREMATURE_EOF { in = end; goto DONE; }
70 if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */
73 } else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */
74 uc = (c & 0x7F); /* 01111111 */
76 } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
/* c holds the lead byte and in[0] is read as the first continuation
   byte, so each branch checks that enough bytes lie before END. */
77 if (in+1 > end) PREMATURE_EOF;
79 uc = (((c & 0x1F) << 6) | /* 00011111------ */
80 (in[0] & 0x3F)); /* 00111111 */
83 } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
84 if (in+2 > end) PREMATURE_EOF;
86 uc = (((c & 0x0F) << 12) | /* 00001111----+------- */
87 ((in[0] & 0x3F) << 6) | /* 00111111------ */
88 ((in[1] & 0x3F))); /* 00111111 */
91 } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
92 if (in+3 > end) PREMATURE_EOF;
94 uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */
95 ((in[0] & 0x3F) << 12) | /* 00111111----+------- */
96 ((in[1] & 0x3F) << 6) | /* 00111111------ */
97 ((in[2] & 0x3F))); /* 00111111 */
100 } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
101 if (in+4 > end) PREMATURE_EOF;
103 uc = (((c & 0x03) << 24) | /* 00000011--------+-------+------- */
104 ((in[0] & 0x3F) << 18) | /* 00111111--+-------+------- */
105 ((in[1] & 0x3F) << 12) | /* 00111111----+------- */
106 ((in[2] & 0x3F) << 6) | /* 00111111------ */
107 ((in[3] & 0x3F))); /* 00111111 */
110 } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
111 if (in+5 > end) PREMATURE_EOF;
113 uc = (((c & 0x01) << 30) | /* 00000001------+-------+-------+------- */
114 ((in[0] & 0x3F) << 24) | /* 00111111+-------+-------+------- */
115 ((in[1] & 0x3F) << 18) | /* 00111111--+-------+------- */
116 ((in[2] & 0x3F) << 12) | /* 00111111----+------- */
117 ((in[3] & 0x3F) << 6) | /* 00111111------ */
118 ((in[4] & 0x3F))); /* 00111111 */
121 uc = INVALID; /* Unparsable sequence. */
128 /* If any of the continuation bytes didn't begin with the continuation tag,
129 the sequence is invalid; stop at the bad byte, not consuming later ones.
130 (It's easier to check this after the fact than up above.) */
133 for (i = 1; i < length; i++)
134 if ((start[i] & 0xC0) != 0x80) {
142 /* A multi-byte sequence encoded a character that could have been
143 encoded with a shorter sequence, e.g., hiding ASCII inside a
144 multi-byte sequence. Something hinky's going on. Reject it. */
/* Final clamp: out-of-range values, zero and surrogates become INVALID. */
147 uc = uc_truncate (uc);
156 /* Converts a Unicode character to a multi-byte UTF8 sequence.
157 Returns the number of bytes written.
   UC is first passed through uc_truncate().  A sequence is only
   emitted when LENGTH has room for all of its bytes; when no branch
   applies nothing is written and the return value is 0.  No
   terminating null is written by the visible code.
160 utf8_encode (unsigned long uc, char *out, long length)
162 const char *old = out;
164 uc = uc_truncate (uc);
166 if (uc < 0x80 && length >= 1) /* 7 bits in 1 byte */
168 *out++ = uc; /* 0xxxxxxx */
170 else if (uc < 0x800 && length >= 2) /* 11 bits in 2 bytes */
172 *out++ = (0xC0 | ((uc >> 6) & 0x1F)); /* 110xxxxx */
173 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
175 else if (uc < 0x10000L && length >= 3) /* 16 bits in 3 bytes */
177 *out++ = (0xE0 | ((uc >> 12) & 0x0F)); /* 1110xxxx */
178 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
179 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
181 else if (uc < 0x200000L && length >= 4) /* 21 bits in 4 bytes */
183 *out++ = (0xF0 | ((uc >> 18) & 0x07)); /* 11110xxx */
184 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
185 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
186 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
/* uc_truncate() never lets a value above 0x10FFFF through, so the 5- and
   6-byte branches below are not expected to be reached with room to spare. */
188 else if (uc < 0x4000000L && length >= 5) /* 26 bits in 5 bytes */
190 *out++ = (0xF8 | ((uc >> 24) & 0x03)); /* 111110xx */
191 *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */
192 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
193 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
194 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
196 else if (length >= 6) /* 31 bits in 6 bytes */
198 *out++ = (0xFC | ((uc >> 30) & 0x01)); /* 1111110x */
199 *out++ = (0x80 | ((uc >> 24) & 0x3F)); /* 10xxxxxx */
200 *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */
201 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
202 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
203 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
/* Bytes written = how far OUT advanced from its starting position. */
206 return (int) (out - old);
210 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
211 This only handles characters that can be represented in 16 bits, the
212 Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
   The array is malloc'ed with one slot per input byte plus one, which
   is an upper bound since every character consumes at least one byte,
   and is shrunk with realloc at the end.  The number of characters
   written is stored through LENGTH_RET.
   NOTE(review): main() passes 0 for LENGTH_RET, so a null check
   presumably guards that store -- confirm against the full definition.
215 utf8_to_XChar2b (const char *string, int *length_ret)
217 long in_len = strlen(string);
218 const unsigned char *in = (const unsigned char *) string;
219 const unsigned char *in_end = in + in_len;
220 XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
226 unsigned long uc = 0;
227 long L = utf8_decode (in, in_end - in, &uc);
230 /* If it can't be represented in a 16-bit XChar2b,
231 use "Unicode Replacement Character". */
232 if (uc > 0xFFFF) uc = INVALID;
/* byte1 receives the high-order 8 bits, byte2 the low-order 8 bits. */
234 out->byte1 = (uc >> 8) & 0xFF;
235 out->byte2 = uc & 0xFF;
/* Shrink the allocation to the characters actually written, plus one. */
243 c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
246 *length_ret = (int) (out - c2b);
252 /* Split a UTF8 string into an array of strings, one per character.
253 The sub-strings will be null terminated and may be multiple bytes.
   The pointer array is malloc'ed with one slot per input byte plus one
   and shrunk with realloc at the end; each element comes from strdup.
   Every utf8_decode call is bounded by the bytes remaining before END,
   the same way utf8_to_XChar2b and utf8_to_latin1 bound theirs, so the
   decoder is never told that more bytes are available than really are.
256 utf8_split (const char *string, int *length_ret)
258 const unsigned char *in = (const unsigned char *) string;
259 long len = strlen (string);
260 const unsigned char *end = in + len;
261 char **ret = (char **) malloc ((len+1) * sizeof(*ret));
267 long len2 = utf8_decode (in, end - in, 0);
269 strncpy (tmp, (char *) in, len2);
271 ret[i++] = strdup (tmp);
277 ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
286 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
   The input ends at the first element whose byte1 and byte2 are both
   zero.  The output buffer is sized at 3 bytes per input character
   (16-bit values need at most 3 UTF8 bytes), then shrunk with realloc.
   The byte length of the result is stored through LENGTH_RET.
   NOTE(review): main() passes 0 for LENGTH_RET, so a null check
   presumably guards that store -- confirm against the full definition.
289 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
292 const XChar2b *in_end;
297 /* Find the null termination on the XChar2b. */
298 for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
301 out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */
302 utf8 = out = (char *) malloc (out_len + 1);
304 out_end = out + out_len;
/* Reassemble the 16-bit value: byte1 is the high byte, byte2 the low. */
308 unsigned long uc = (in->byte1 << 8) | in->byte2;
309 int wrote = utf8_encode (uc, out, out_end - out);
310 if (wrote > 3) abort(); /* Can't happen with 16 bit input */
/* Shrink the allocation to the bytes actually written, plus one. */
317 utf8 = (char *) realloc (utf8, (out - utf8 + 1) * sizeof(*utf8));
320 *length_ret = (int) (out - utf8);
326 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
   The result is malloc'ed with one byte per input byte plus one, then
   shrunk with realloc.  Combining diacritical marks (U+0300 through
   U+036F) are set to 0 so that they can be discarded, and General
   Punctuation characters are mapped to Latin1 look-alikes.  Anything
   left without an equivalent becomes '\277' (Latin1) or '#' (ASCII).
   When ASCII_P is true, Latin1 values from 0xA0 up are additionally
   mapped through the latin1_to_ascii table.
329 utf8_to_latin1 (const char *string, Bool ascii_p)
331 long in_len = strlen(string);
332 const unsigned char *in = (const unsigned char *) string;
333 const unsigned char *in_end = in + in_len;
334 unsigned char *ret = (unsigned char *) malloc (in_len + 1);
335 unsigned char *out = ret;
341 unsigned long uc = 0;
342 long len2 = utf8_decode (in, in_end - in, &uc);
345 if (uc == '\240') /* NO-BREAK SPACE (U+00A0) */
347 else if (uc >= 0x300 && uc <= 0x36F)
348 uc = 0; /* Discard "Unicode Combining Diacriticals Block" */
352 /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
354 case 0x2000: /* EN QUAD */
355 case 0x2001: /* EM QUAD */
356 case 0x2002: /* EN SPACE */
357 case 0x2003: /* EM SPACE */
358 case 0x2004: /* THREE-PER-EM SPACE */
359 case 0x2005: /* FOUR-PER-EM SPACE */
360 case 0x2006: /* SIX-PER-EM SPACE */
361 case 0x2007: /* FIGURE SPACE */
362 case 0x2008: /* PUNCTUATION SPACE */
363 case 0x2009: /* THIN SPACE */
364 case 0x200A: /* HAIR SPACE */
368 case 0x2010: /* HYPHEN */
369 case 0x2011: /* NON-BREAKING HYPHEN */
370 case 0x2012: /* FIGURE DASH */
371 case 0x2013: /* EN DASH */
372 case 0x2014: /* EM DASH */
373 case 0x2015: /* HORIZONTAL BAR */
377 case 0x2018: /* LEFT SINGLE QUOTATION MARK */
378 case 0x2019: /* RIGHT SINGLE QUOTATION MARK */
379 case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */
380 case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
384 case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
385 case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
386 case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */
387 case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
391 case 0x2022: uc = '\267'; break; /* BULLET */
392 case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
393 case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
394 case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */
395 case 0x2038: uc = '^'; break; /* CARET */
396 case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
397 case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
398 case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */
399 case 0x2042: uc = '*'; break; /* ASTERISM */
400 case 0x2043: uc = '='; break; /* HYPHEN BULLET */
401 case 0x2044: uc = '/'; break; /* FRACTION SLASH */
402 case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
403 case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
404 case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
405 case 0x204E: uc = '*'; break; /* LOW ASTERISK */
406 case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */
412 /* "Inverted question mark" looks enough like 0xFFFD,
413 the "Unicode Replacement Character". */
414 uc = (ascii_p ? '#' : '\277');
416 if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
418 const unsigned char latin1_to_ascii[96] =
419 " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
420 "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
421 "aaaaaaeceeeeiiiionooooo/ouuuuypy";
423 uc = latin1_to_ascii[uc - 0xA0];
427 *out++ = (unsigned char) uc;
432 ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
438 /*************************************************************************
440 cd ../hacks ; make test-utf8wc
442 *************************************************************************/
446 /* Convert a UTF8 string to Unicode and back again.
   The string is decoded character by character into a malloc'ed array
   of unsigned longs (one slot per input byte plus one), and each value
   is then re-encoded with utf8_encode into a freshly malloc'ed buffer.
   Every utf8_decode call is bounded by the bytes remaining before END,
   so the decoder is never told that more bytes are available than
   really are.
449 split_and_join (const char *string)
451 const unsigned char *in = (const unsigned char *) string;
452 int len = strlen (string);
453 const unsigned char *end = in + len;
454 unsigned long *unicode = (unsigned long *)
455 malloc((len + 1) * sizeof(*unicode));
457 char *ret, *out, *out_end;
461 long len2 = utf8_decode (in, end - in, &unicode[i]);
468 out = ret = (char *) malloc(i);
473 int len2 = utf8_encode (unicode[i], out, out_end - out);
/* Print string S to stream OUT as one double-quoted line, labelled with
   PREFIX right-justified in a 6-column field.  Double quotes and
   backslashes are backslash-escaped; bytes below 32 or at or above 127
   are written as three-digit octal escapes; other bytes are written
   verbatim. */
485 LOG (FILE *out, const char *prefix, const char *s)
487 fprintf (out, "%6s: \"", prefix);
/* Work on an unsigned copy so bytes >= 128 compare as positive values. */
490 unsigned char c = *s;
491 if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
492 else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
493 else fprintf (out, "%c", c);
496 fprintf (out, "\"\n");
/* Self-test driver.  Each table entry has a name, an input string, and
   two optional expected outputs: TARGET defaults to the input itself and
   TARGET2 defaults to TARGET.  The input is run through split_and_join
   and compared with TARGET, then through utf8_to_XChar2b followed by
   XChar2b_to_utf8 and compared with TARGET2.  Mismatches are reported on
   stderr with LOG.  A final block checks utf8_to_latin1 in both Latin1
   and ASCII modes.
   Every entry that expects replacement characters must spell out its
   malformed input explicitly: if the input is left out, the expected
   output lands in the input slot and the test passes without exercising
   the decoder. */
501 main (int argc, char **argv)
503 /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
506 # define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */
508 static const struct { const char *name, *in, *target, *target2; } tests[] = {
509 /* 1 Some correct UTF-8 text */
511 /* The Greek word 'kosme': */
512 { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
515 /* 2 Boundary condition test cases */
517 /* 2.1 First possible sequence of a certain length */
519 { "2.1.1", /* 1 byte (U-00000000): */ "\000" },
520 { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" },
521 { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" },
522 { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC },
523 { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC },
524 { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC },
526 /* 2.2 Last possible sequence of a certain length */
528 { "2.2.1", /* 1 byte (U-0000007F): */ "\177" },
529 { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" },
530 { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" },
531 { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC },
532 { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC },
533 { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC },
535 /* 2.3 Other boundary conditions */
537 { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" },
538 { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" },
539 { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC },
540 { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
541 { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
544 /* 3 Malformed sequences */
546 /* 3.1 Unexpected continuation bytes */
548 /* Each unexpected continuation byte should be separately signalled as a
549 malformed sequence of its own. */
551 { "3.1.1", /* First continuation byte 0x80: */ "\200", URC },
552 { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC },
553 { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC },
554 { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC },
555 { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277",
557 { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200",
558 URC URC URC URC URC },
559 { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277",
560 URC URC URC URC URC URC },
561 { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200",
562 URC URC URC URC URC URC URC },
564 { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
566 "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
567 "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
568 "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
569 "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
570 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
571 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
572 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
573 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
575 /* 3.2 Lonely start characters */
577 { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf),
578 each followed by a space character: */
580 "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
581 "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
582 "\332 \333 \334 \335 \336 \337 ",
583 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
584 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
586 { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef),
587 each followed by a space character: */
588 "\340 \341 \342 \343 \344 \345 \346 \347 "
589 "\350 \351 \352 \353 \354 \355 \356 \357 ",
590 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
592 { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7),
593 each followed by a space character: */
594 "\360 \361 \362 \363 \364 \365 \366 \367 ", URC URC URC URC URC URC URC URC },
596 { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb),
597 each followed by a space character: */
598 "\370 \371 \372 \373 ",
601 { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd),
602 each followed by a space character: */
603 "\374 \375 ", URC URC },
605 /* 3.3 Sequences with last continuation byte missing */
607 /* All bytes of an incomplete sequence should be signalled as a single
608 malformed sequence, i.e., you should see only a single replacement
609 character in each of the next 10 tests. (Characters as in section 2) */
611 { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */
613 { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */
615 { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */
616 "\360\200\200", URC },
617 { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */
618 "\370\200\200\200", URC },
619 { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */
620 "\374\200\200\200\200", URC },
621 { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */
623 { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */
625 { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */
626 "\367\277\277", URC },
627 { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */
628 "\373\277\277\277", URC },
629 { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
630 "\375\277\277\277\277", URC },
632 /* 3.4 Concatenation of incomplete sequences */
634 /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
635 sequences being signalled: */
637 { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
638 "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
639 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
641 /* 3.5 Impossible bytes */
643 /* The following two bytes cannot appear in a correct UTF-8 string */
645 { "3.5.1", /* fe = */ "\376", URC },
646 { "3.5.2", /* ff = */ "\377", URC },
647 { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC },
650 /* 4 Overlong sequences */
652 /* 4.1 Examples of an overlong ASCII character */
654 { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC },
655 { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC },
656 { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC },
657 { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257",
659 { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
662 /* 4.2 Maximum overlong sequences */
664 { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC },
665 { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC },
666 { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277",
668 { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277",
670 { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ "\374\203\277\277\277\277", URC },
672 /* 4.3 Overlong representation of the NUL character */
674 { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC },
675 { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC },
676 { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC },
677 { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200",
679 { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200",
683 /* 5 Illegal code positions */
685 /* 5.1 Single UTF-16 surrogates */
687 { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC },
688 { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC },
689 { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC },
690 { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC },
691 { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC },
692 { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC },
693 { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC },
695 /* 5.2 Paired UTF-16 surrogates */
697 { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ "\355\240\200\355\260\200", URC URC },
698 { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ "\355\240\200\355\277\277", URC URC },
699 { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ "\355\255\277\355\260\200", URC URC },
700 { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ "\355\255\277\355\277\277", URC URC },
701 { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ "\355\256\200\355\260\200", URC URC },
702 { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ "\355\256\200\355\277\277", URC URC },
703 { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ "\355\257\277\355\260\200", URC URC },
704 { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ "\355\257\277\355\277\277", URC URC },
706 /* 5.3 Other illegal code positions */
708 { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" },
709 { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" },
712 /* 6 Some other junk */
715 { "6.1", "\001\002\003\004\005 ABC" },
716 { "6.2", /* every non-ASCII Latin1 character */
717 "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
718 "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
719 "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
720 "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
721 "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
722 "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
723 "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
724 "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
725 "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
726 "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
727 "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
728 "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
730 { "6.3", /* Christmas tree */
731 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
732 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
733 "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
734 "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
735 "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
736 "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
737 "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
738 "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
739 "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
740 "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
741 "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
742 "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
743 "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
744 "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
745 "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
746 "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
748 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
749 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
750 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
751 "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
752 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
753 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
754 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
755 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
756 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
757 URC URC URC URC URC URC URC URC URC URC URC URC },
762 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
764 const char *name = tests[i].name;
765 const char *in = tests[i].in;
766 const char *target = (tests[i].target ? tests[i].target : in);
767 const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
768 char *out = split_and_join (in);
769 XChar2b *out16 = utf8_to_XChar2b (in, 0);
770 char *out2 = XChar2b_to_utf8 (out16, 0);
771 if (strcmp (out, target))
773 LOG (stderr, name, target);
774 LOG (stderr, "FAIL", out);
775 fprintf (stderr, "\n");
778 if (strcmp (out2, target2))
780 LOG (stderr, name, target2);
781 LOG (stderr, "FAIL2", out2);
782 fprintf (stderr, "\n");
791 const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
792 "c\303\264t\303\251 de l'alc\303\264ve "
793 "ovo\303\257de, o\303\271 les b\303\273ches "
794 "se consument dans l'\303\242tre");
795 const char *latin1 = ("son \356le int\351rieure, \340 "
796 "c\364t\351 de l'alc\364ve ovo\357de, "
797 "o\371 les b\373ches se consument dans "
799 const char *ascii = ("son ile interieure, a cote de l'alcove "
800 "ovoide, ou les buches se consument dans "
802 char *latin1b = utf8_to_latin1 (utf8, False);
803 char *ascii2 = utf8_to_latin1 (utf8, True);
804 if (strcmp (latin1, latin1b))
806 LOG (stderr, "LATIN1", utf8);
807 LOG (stderr, "FAIL3", latin1b);
808 fprintf (stderr, "\n");
811 if (strcmp (ascii, ascii2))
813 LOG (stderr, "ASCII", utf8);
814 LOG (stderr, "FAIL4", ascii2);
815 fprintf (stderr, "\n");
823 if (ok) fprintf (stderr, "OK\n");
827 #endif /* SELFTEST */