1 /* xscreensaver, Copyright (c) 2014-2015 Jamie Zawinski <jwz@jwz.org>
3 * Permission to use, copy, modify, distribute, and sell this software and its
4 * documentation for any purpose is hereby granted without fee, provided that
5 * the above copyright notice appear in all copies and that both that
6 * copyright notice and this permission notice appear in supporting
7 * documentation. No representations are made about the suitability of this
8 * software for any purpose. It is provided "as is" without express or
22 # elif defined(HAVE_ANDROID)
24 #else /* !HAVE_COCOA */
25 # include <X11/Xlib.h>
31 /* "Unicode Replacement Character", displayed in lieu of invalid characters. */
32 # define INVALID 0xFFFD
35 /* Mask the number to be within the valid range of Unicode characters. */
/* uc_truncate: force UC into the set of code points this file will emit.
   Per the visible checks: only the low 31 bits are kept; any value above
   0x10FFFF, and the value 0, is replaced with INVALID (0xFFFD, the
   Unicode Replacement Character); the UTF-16 surrogate range
   0xD800-0xDFFF is also tested.
   NOTE(review): this listing omits lines.  The statement controlled by
   the surrogate test and the return are not visible here; presumably the
   surrogate case also yields INVALID and the adjusted value is returned.
   Confirm against the complete file. */
38 uc_truncate (unsigned long uc)
40 uc &= 0x7FFFFFFFL; /* Unicode is 31 bits: drop anything above bit 30 */
41 if (uc > 0x10FFFF) uc = INVALID; /* But UTF-8 is 4 bytes, which tops out at 0x10FFFF */
42 if (uc == 0) uc = INVALID; /* no nulls: never hand back an embedded terminator */
44 if (uc >= 0xD800 && uc <= 0xDFFF)
45 /* Reserved for use with UTF-16: not a real character. */
52 /* Parse the first UTF-8 character at the front of the string.
53 Return the Unicode character, and the number of bytes read. */
/* utf8_decode: decode the UTF-8 sequence at the start of IN.
     in          - bytes to decode, bounded by LENGTH.
     length      - number of bytes available at IN; a value <= 0 jumps
                   straight to DONE with uc still set to INVALID.
     unicode_ret - out-parameter for the decoded character.
                   NOTE(review): the store through it and the return
                   statement are on lines missing from this listing.
   The lead byte selects a 1- to 6-byte form.  Each multi-byte branch
   first checks that enough input remains; PREMATURE_EOF moves IN to END
   and jumps to DONE, leaving uc at its initial INVALID value.
   After the dispatch, the bytes following the lead byte are checked for
   the 10xxxxxx continuation tag, overlong encodings are rejected
   (presumably by comparing against min, whose per-branch assignments are
   not visible here), and the result is passed through uc_truncate().
   NOTE(review): c and the loop index i are declared and assigned on
   lines not shown. */
56 utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
58 const unsigned char *start = in;
59 const unsigned char *end = in + length;
60 unsigned long uc = INVALID;
61 unsigned long min = 0;
64 if (length <= 0) goto DONE;
68 # define PREMATURE_EOF { in = end; goto DONE; }
70 if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */
73 } else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */
74 uc = (c & 0x7F); /* 01111111 */
76 } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
77 if (in+1 > end) PREMATURE_EOF;
79 uc = (((c & 0x1F) << 6) | /* 00011111------ */
80 (in[0] & 0x3F)); /* 00111111 */
83 } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
84 if (in+2 > end) PREMATURE_EOF;
86 uc = (((c & 0x0F) << 12) | /* 00001111----+------- */
87 ((in[0] & 0x3F) << 6) | /* 00111111------ */
88 ((in[1] & 0x3F))); /* 00111111 */
91 } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
92 if (in+3 > end) PREMATURE_EOF;
94 uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */
95 ((in[0] & 0x3F) << 12) | /* 00111111----+------- */
96 ((in[1] & 0x3F) << 6) | /* 00111111------ */
97 ((in[2] & 0x3F))); /* 00111111 */
100 } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
101 if (in+4 > end) PREMATURE_EOF;
103 uc = (((c & 0x03) << 24) | /* 00000011--------+-------+------- */
104 ((in[0] & 0x3F) << 18) | /* 00111111--+-------+------- */
105 ((in[1] & 0x3F) << 12) | /* 00111111----+------- */
106 ((in[2] & 0x3F) << 6) | /* 00111111------ */
107 ((in[3] & 0x3F))); /* 00111111 */
110 } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
111 if (in+5 > end) PREMATURE_EOF;
113 uc = (((c & 0x01) << 30) | /* 00000001------+-------+-------+------- */
114 ((in[0] & 0x3F) << 24) | /* 00111111+-------+-------+------- */
115 ((in[1] & 0x3F) << 18) | /* 00111111--+-------+------- */
116 ((in[2] & 0x3F) << 12) | /* 00111111----+------- */
117 ((in[3] & 0x3F) << 6) | /* 00111111------ */
118 ((in[4] & 0x3F))); /* 00111111 */
121 uc = INVALID; /* Unparsable sequence. */
128 /* If any of the continuation bytes didn't begin with the continuation tag,
129 the sequence is invalid; stop at the bad byte, not consuming later ones.
130 (It's easier to check this after the fact than up above.) */
133 for (i = 1; i < length; i++)
134 if ((start[i] & 0xC0) != 0x80) {
142 /* A multi-byte sequence encoded a character that could have been
143 encoded with a shorter sequence, e.g., hiding ASCII inside a
144 multi-byte sequence. Something hinky's going on. Reject it. */
147 uc = uc_truncate (uc);
156 /* Converts a Unicode character to a multi-byte UTF-8 sequence.
157 Returns the number of bytes written. */
/* utf8_encode: write the UTF-8 encoding of UC at OUT.
     uc     - character to encode; first normalised by uc_truncate().
     out    - destination buffer.
     length - bytes available at OUT.  A branch is taken only when the
              whole sequence for that size class fits.
   Returns the number of bytes written, computed as out - old.  In the
   visible code, when no branch matches (the sequence does not fit) OUT is
   not advanced, so the result is 0.  No terminating NUL is written by any
   visible line.
   NOTE(review): uc_truncate() replaces values above 0x10FFFF with
   INVALID, so the 5- and 6-byte branches look unreachable; confirm.
   NOTE(review): braces and any else-branches are on lines missing from
   this listing. */
160 utf8_encode (unsigned long uc, char *out, long length)
162 const char *old = out;
164 uc = uc_truncate (uc);
166 if (uc < 0x80 && length >= 1) /* 7 bits in 1 byte */
168 *out++ = uc; /* 0xxxxxxx */
170 else if (uc < 0x800 && length >= 2) /* 11 bits in 2 bytes */
172 *out++ = (0xC0 | ((uc >> 6) & 0x1F)); /* 110xxxxx */
173 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
175 else if (uc < 0x10000L && length >= 3) /* 16 bits in 3 bytes */
177 *out++ = (0xE0 | ((uc >> 12) & 0x0F)); /* 1110xxxx */
178 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
179 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
181 else if (uc < 0x200000L && length >= 4) /* 21 bits in 4 bytes */
183 *out++ = (0xF0 | ((uc >> 18) & 0x07)); /* 11110xxx */
184 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
185 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
186 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
188 else if (uc < 0x4000000L && length >= 5) /* 26 bits in 5 bytes */
190 *out++ = (0xF8 | ((uc >> 24) & 0x03)); /* 111110xx */
191 *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */
192 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
193 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
194 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
196 else if (length >= 6) /* 31 bits in 6 bytes */
198 *out++ = (0xFC | ((uc >> 30) & 0x01)); /* 1111110x */
199 *out++ = (0x80 | ((uc >> 24) & 0x3F)); /* 10xxxxxx */
200 *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */
201 *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */
202 *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */
203 *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */
206 return (int) (out - old);
210 /* Converts a null-terminated UTF-8 string to a null-terminated XChar2b array.
211 This only handles characters that can be represented in 16 bits, the
212 Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.) */
/* utf8_to_XChar2b: convert STRING (NUL-terminated UTF-8) to XChar2b.
     string     - input text.  The array is allocated with in_len + 1
                  elements, i.e. one per input byte plus one.
     length_ret - receives the number of elements produced (out - c2b).
                  NOTE(review): main() in this file passes 0 here, so the
                  store is presumably guarded by a null test on a line
                  missing from this listing.
   Each character is decoded with utf8_decode(), given the bytes
   remaining (in_end - in).  Anything above 0xFFFF is replaced with
   INVALID.  The 16-bit value is stored with its high byte in byte1 and
   its low byte in byte2.  The array is finally shrunk with realloc() to
   (out - c2b + 1) elements.
   NOTE(review): the loop header, the declaration of out, the terminator
   store, allocation-failure handling and the return are not visible
   here.  The realloc() result overwrites c2b directly. */
215 utf8_to_XChar2b (const char *string, int *length_ret)
217 long in_len = strlen(string);
218 const unsigned char *in = (const unsigned char *) string;
219 const unsigned char *in_end = in + in_len;
220 XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
226 unsigned long uc = 0;
227 long L = utf8_decode (in, in_end - in, &uc);
230 /* If it can't be represented in a 16-bit XChar2b,
231 use "Unicode Replacement Character". */
232 if (uc > 0xFFFF) uc = INVALID;
234 out->byte1 = (uc >> 8) & 0xFF;
235 out->byte2 = uc & 0xFF;
243 c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
246 *length_ret = (int) (out - c2b);
252 /* Split a UTF-8 string into an array of strings, one per character.
253 The sub-strings will be null terminated and may be multiple bytes. */
/* utf8_split: break STRING into one heap-allocated string per character.
     string     - NUL-terminated UTF-8 input.
     length_ret - NOTE(review): not referenced on any visible line;
                  presumably receives the element count.
   The result array is allocated with len + 1 slots and later shrunk with
   realloc() to i + 1 slots.  The bytes of each decoded character are
   copied into tmp and duplicated with strdup().  A character in the
   range 0x300-0x36F is concatenated onto the preceding element instead
   of standing alone.
   NOTE(review): utf8_decode() is given len, the length of the whole
   string, on every call rather than the bytes remaining (end - in),
   unlike utf8_to_XChar2b() and utf8_to_latin1() in this file.  For a
   truncated sequence near the end that bound reaches past the
   terminator; confirm and fix.
   NOTE(review): strncpy() does not add a terminator.  The terminators
   for tmp and s2, the declarations of uc, tmp and i, the advance of in,
   and the freeing of the merged pieces are on lines not shown here. */
256 utf8_split (const char *string, int *length_ret)
258 const unsigned char *in = (const unsigned char *) string;
259 long len = strlen (string);
260 const unsigned char *end = in + len;
261 char **ret = (char **) malloc ((len+1) * sizeof(*ret));
268 long len2 = utf8_decode (in, len, &uc);
270 strncpy (tmp, (char *) in, len2);
272 ret[i++] = strdup (tmp);
275 /* If this is a Combining Diacritical, append it to the previous
276 character. E.g., "y\314\206\314\206" is one string, not three.
278 if (i > 1 && uc >= 0x300 && uc <= 0x36F)
280 long L1 = strlen(ret[i-2]);
281 long L2 = strlen(ret[i-1]);
282 char *s2 = (char *) malloc (L1 + L2 + 1);
283 strncpy (s2, ret[i-2], L1);
284 strncpy (s2 + L1, ret[i-1], L2);
294 ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
303 /* Converts a null-terminated XChar2b array to a null-terminated UTF-8 string. */
/* XChar2b_to_utf8: convert a zero-terminated XChar2b array to UTF-8.
     in         - array ended by an element whose byte1 and byte2 are
                  both 0.
     length_ret - receives out_len, computed as (out - utf8 + 1), i.e.
                  one more than the distance from the buffer start to out.
                  NOTE(review): main() in this file passes 0 here, so the
                  store is presumably guarded by a null test on a line
                  missing from this listing.
   The output buffer is sized at 3 bytes per element for in_len + 1
   elements, plus one byte, because a 16-bit value needs at most 3 bytes
   of UTF-8.  The function aborts if utf8_encode() ever reports more than
   3 bytes.  The buffer is finally shrunk with realloc() to out_len bytes.
   NOTE(review): the declarations of in_len, out_len, utf8, out and
   out_end, the encode loop header, the pointer advances, the terminator
   store and the return are not visible here. */
306 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
309 const XChar2b *in_end;
314 /* Find the null termination on the XChar2b. */
315 for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
318 out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */
319 utf8 = out = (char *) malloc (out_len + 1);
321 out_end = out + out_len;
325 unsigned long uc = (in->byte1 << 8) | in->byte2;
326 int wrote = utf8_encode (uc, out, out_end - out);
327 if (wrote > 3) abort(); /* Can't happen with 16 bit input */
334 out_len = (int) (out - utf8 + 1);
335 utf8 = (char *) realloc (utf8, out_len);
338 *length_ret = out_len;
344 /* Converts a UTF-8 string to the closest Latin1 or ASCII equivalent. */
/* utf8_to_latin1: produce a single-byte approximation of STRING.
     string  - NUL-terminated UTF-8 input.
     ascii_p - when true, the fallback character is # instead of octal
               277 (inverted question mark), and Latin1 characters are
               further mapped through latin1_to_ascii, a 96-entry table
               indexed by (uc - 0xA0).
   The output buffer is in_len + 1 bytes.  One byte is stored per emitted
   character, and the buffer is shrunk with realloc() at the end.
   Characters in 0x300-0x36F are set to 0 (discarded), and a set of
   General Punctuation code points is mapped to Latin1 look-alikes.
   NOTE(review): this listing omits lines.  The loop header, the
   assignments shared by the grouped case labels, the range tests that
   guard the fallback and the table lookup, the handling of uc == 0, the
   terminator store and the return are not visible here. */
347 utf8_to_latin1 (const char *string, Bool ascii_p)
349 long in_len = strlen(string);
350 const unsigned char *in = (const unsigned char *) string;
351 const unsigned char *in_end = in + in_len;
352 unsigned char *ret = (unsigned char *) malloc (in_len + 1);
353 unsigned char *out = ret;
359 unsigned long uc = 0;
360 long len2 = utf8_decode (in, in_end - in, &uc);
363 if (uc == '\240') /* U+00A0 NO-BREAK SPACE */
365 else if (uc >= 0x300 && uc <= 0x36F)
366 uc = 0; /* Discard "Unicode Combining Diacriticals Block" */
370 /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
372 case 0x2000: /* EN QUAD */
373 case 0x2001: /* EM QUAD */
374 case 0x2002: /* EN SPACE */
375 case 0x2003: /* EM SPACE */
376 case 0x2004: /* THREE-PER-EM SPACE */
377 case 0x2005: /* FOUR-PER-EM SPACE */
378 case 0x2006: /* SIX-PER-EM SPACE */
379 case 0x2007: /* FIGURE SPACE */
380 case 0x2008: /* PUNCTUATION SPACE */
381 case 0x2009: /* THIN SPACE */
382 case 0x200A: /* HAIR SPACE */
386 case 0x2010: /* HYPHEN */
387 case 0x2011: /* NON-BREAKING HYPHEN */
388 case 0x2012: /* FIGURE DASH */
389 case 0x2013: /* EN DASH */
390 case 0x2014: /* EM DASH */
391 case 0x2015: /* HORIZONTAL BAR */
395 case 0x2018: /* LEFT SINGLE QUOTATION MARK */
396 case 0x2019: /* RIGHT SINGLE QUOTATION MARK */
397 case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */
398 case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
402 case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
403 case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
404 case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */
405 case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
409 case 0x2022: uc = '\267'; break; /* BULLET */
410 case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
411 case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
412 case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */
413 case 0x2038: uc = '^'; break; /* CARET */
414 case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
415 case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
416 case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */
417 case 0x2042: uc = '*'; break; /* ASTERISM */
418 case 0x2043: uc = '='; break; /* HYPHEN BULLET */
419 case 0x2044: uc = '/'; break; /* FRACTION SLASH */
420 case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
421 case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
422 case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
423 case 0x204E: uc = '*'; break; /* LOW ASTERISK */
424 case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */
430 /* "Inverted question mark" looks enough like 0xFFFD,
431 the "Unicode Replacement Character". */
432 uc = (ascii_p ? '#' : '\277');
434 if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
436 const unsigned char latin1_to_ascii[96] =
437 " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
438 "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
439 "aaaaaaeceeeeiiiionooooo/ouuuuypy";
441 uc = latin1_to_ascii[uc - 0xA0];
445 *out++ = (unsigned char) uc;
450 ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
456 /*************************************************************************
458 cd ../hacks ; make test-utf8wc
460 *************************************************************************/
464 /* Convert a UTF-8 string to Unicode and back again. */
/* split_and_join: self-test helper that decodes STRING into an array of
   code points with utf8_decode() and re-encodes them with utf8_encode().
   The code-point array is allocated with len + 1 slots, i.e. one per
   input byte plus one.
   NOTE(review): utf8_decode() is passed len, the length of the whole
   string, rather than the bytes remaining (end - in); confirm.
   NOTE(review): the declaration of i, both loop headers, the value of i
   at the malloc(i) call, the assignment of out_end, the release of the
   unicode array and the return are on lines missing from this listing. */
467 split_and_join (const char *string)
469 const unsigned char *in = (const unsigned char *) string;
470 int len = strlen (string);
471 const unsigned char *end = in + len;
472 unsigned long *unicode = (unsigned long *)
473 malloc((len + 1) * sizeof(*unicode));
475 char *ret, *out, *out_end;
479 long len2 = utf8_decode (in, len, &unicode[i]);
486 out = ret = (char *) malloc(i);
491 int len2 = utf8_encode (unicode[i], out, out_end - out);
/* LOG: print S to OUT as a quoted, escaped string labelled with PREFIX.
   PREFIX is printed right-aligned in a 6-column field, followed by a
   colon, a space and an opening double quote.  For each byte examined:
   a double quote or backslash is written with a leading backslash; a
   byte below 32 or at or above 127 is written as a backslash and three
   octal digits; anything else is written unchanged.  The line ends with
   a closing double quote and a newline.
   NOTE(review): the loop that walks S is on lines not shown here. */
503 LOG (FILE *out, const char *prefix, const char *s)
505 fprintf (out, "%6s: \"", prefix);
508 unsigned char c = *s;
509 if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
510 else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
511 else fprintf (out, "%c", c);
514 fprintf (out, "\"\n");
/* main: self-test driver for the conversions in this file.
   Each tests[] row holds name, in, target and target2.  A missing target
   defaults to in, and a missing target2 defaults to target.  For every
   row, split_and_join(in) is compared with target, and the result of
   utf8_to_XChar2b() followed by XChar2b_to_utf8() is compared with
   target2.  A mismatch is reported on stderr through LOG() under the
   labels FAIL and FAIL2.
   After the table, a French sample is converted with utf8_to_latin1() in
   Latin1 mode and in ASCII mode and compared with the expected strings;
   mismatches are reported under FAIL3 and FAIL4.  OK is printed when the
   ok flag is set.
   NOTE(review): as listed, rows 3.2.3, 4.2.5 and 5.2.1 through 5.2.8
   contain only replacement-character strings and no malformed input, so
   they do not exercise the condition named in their comments; confirm
   against the complete file.
   NOTE(review): this listing omits lines.  The declarations of i and ok,
   the updates of ok, parts of several table rows and of the expected
   Latin1 and ASCII strings, any cleanup, and the return are not visible
   here. */
519 main (int argc, char **argv)
521 /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
524 # define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */
526 static const struct { const char *name, *in, *target, *target2; } tests[] = {
527 /* 1 Some correct UTF-8 text */
529 /* The Greek word 'kosme': */
530 { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
533 /* 2 Boundary condition test cases */
535 /* 2.1 First possible sequence of a certain length */
537 { "2.1.1", /* 1 byte (U-00000000): */ "\000" },
538 { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" },
539 { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" },
540 { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC },
541 { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC },
542 { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC },
544 /* 2.2 Last possible sequence of a certain length */
546 { "2.2.1", /* 1 byte (U-0000007F): */ "\177" },
547 { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" },
548 { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" },
549 { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC },
550 { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC },
551 { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC },
553 /* 2.3 Other boundary conditions */
555 { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" },
556 { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" },
557 { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC },
558 { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
559 { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
562 /* 3 Malformed sequences */
564 /* 3.1 Unexpected continuation bytes */
566 /* Each unexpected continuation byte should be separately signalled as a
567 malformed sequence of its own. */
569 { "3.1.1", /* First continuation byte 0x80: */ "\200", URC },
570 { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC },
571 { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC },
572 { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC },
573 { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277",
575 { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200",
576 URC URC URC URC URC },
577 { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277",
578 URC URC URC URC URC URC },
579 { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200",
580 URC URC URC URC URC URC URC },
582 { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
584 "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
585 "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
586 "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
587 "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
588 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
589 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
590 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
591 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
593 /* 3.2 Lonely start characters */
595 { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf),
596 each followed by a space character: */
598 "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
599 "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
600 "\332 \333 \334 \335 \336 \337 ",
601 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
602 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
604 { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef),
605 each followed by a space character: */
606 "\340 \341 \342 \343 \344 \345 \346 \347 "
607 "\350 \351 \352 \353 \354 \355 \356 \357 ",
608 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
610 { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7),
611 each followed by a space character: */
612 URC URC URC URC URC URC URC URC },
614 { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb),
615 each followed by a space character: */
616 "\370 \371 \372 \373 ",
619 { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd),
620 each followed by a space character: */
621 "\374 \375 ", URC URC },
623 /* 3.3 Sequences with last continuation byte missing */
625 /* All bytes of an incomplete sequence should be signalled as a single
626 malformed sequence, i.e., you should see only a single replacement
627 character in each of the next 10 tests. (Characters as in section 2) */
629 { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */
631 { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */
633 { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */
634 "\360\200\200", URC },
635 { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */
636 "\370\200\200\200", URC },
637 { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */
638 "\374\200\200\200\200", URC },
639 { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */
641 { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */
643 { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */
644 "\367\277\277", URC },
645 { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */
646 "\373\277\277\277", URC },
647 { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
648 "\375\277\277\277\277", URC },
650 /* 3.4 Concatenation of incomplete sequences */
652 /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
653 sequences being signalled: */
655 { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
656 "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
657 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
659 /* 3.5 Impossible bytes */
661 /* The following two bytes cannot appear in a correct UTF-8 string */
663 { "3.5.1", /* fe = */ "\376", URC },
664 { "3.5.2", /* ff = */ "\377", URC },
665 { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC },
668 /* 4 Overlong sequences */
670 /* 4.1 Examples of an overlong ASCII character */
672 { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC },
673 { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC },
674 { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC },
675 { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257",
677 { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
680 /* 4.2 Maximum overlong sequences */
682 { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC },
683 { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC },
684 { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277",
686 { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277",
688 { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC },
690 /* 4.3 Overlong representation of the NUL character */
692 { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC },
693 { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC },
694 { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC },
695 { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200",
697 { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200",
701 /* 5 Illegal code positions */
703 /* 5.1 Single UTF-16 surrogates */
705 { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC },
706 { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC },
707 { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC },
708 { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC },
709 { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC },
710 { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC },
711 { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC },
713 /* 5.2 Paired UTF-16 surrogates */
715 { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
716 { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
717 { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
718 { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
719 { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
720 { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
721 { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
722 { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
724 /* 5.3 Other illegal code positions */
726 { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" },
727 { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" },
730 /* 6 Some other junk */
733 { "6.1", "\001\002\003\004\005 ABC" },
734 { "6.2", /* every non-ASCII Latin1 character */
735 "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
736 "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
737 "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
738 "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
739 "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
740 "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
741 "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
742 "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
743 "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
744 "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
745 "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
746 "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
748 { "6.3", /* Christmas tree */
749 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
750 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
751 "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
752 "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
753 "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
754 "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
755 "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
756 "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
757 "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
758 "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
759 "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
760 "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
761 "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
762 "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
763 "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
764 "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
766 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
767 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
768 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
769 "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
770 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
771 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
772 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
773 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
774 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
775 URC URC URC URC URC URC URC URC URC URC URC URC },
780 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
782 const char *name = tests[i].name;
783 const char *in = tests[i].in;
784 const char *target = (tests[i].target ? tests[i].target : in);
785 const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
786 char *out = split_and_join (in);
787 XChar2b *out16 = utf8_to_XChar2b (in, 0);
788 char *out2 = XChar2b_to_utf8 (out16, 0);
789 if (strcmp (out, target))
791 LOG (stderr, name, target);
792 LOG (stderr, "FAIL", out);
793 fprintf (stderr, "\n");
796 if (strcmp (out2, target2))
798 LOG (stderr, name, target2);
799 LOG (stderr, "FAIL2", out2);
800 fprintf (stderr, "\n");
809 const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
810 "c\303\264t\303\251 de l'alc\303\264ve "
811 "ovo\303\257de, o\303\271 les b\303\273ches "
812 "se consument dans l'\303\242tre");
813 const char *latin1 = ("son \356le int\351rieure, \340 "
814 "c\364t\351 de l'alc\364ve ovo\357de, "
815 "o\371 les b\373ches se consument dans "
817 const char *ascii = ("son ile interieure, a cote de l'alcove "
818 "ovoide, ou les buches se consument dans "
820 char *latin1b = utf8_to_latin1 (utf8, False);
821 char *ascii2 = utf8_to_latin1 (utf8, True);
822 if (strcmp (latin1, latin1b))
824 LOG (stderr, "LATIN1", utf8);
825 LOG (stderr, "FAIL3", latin1b);
826 fprintf (stderr, "\n");
829 if (strcmp (ascii, ascii2))
831 LOG (stderr, "ASCII", utf8);
832 LOG (stderr, "FAIL4", ascii2);
833 fprintf (stderr, "\n");
841 if (ok) fprintf (stderr, "OK\n");
845 #endif /* SELFTEST */