1 /* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org>
3 * Permission to use, copy, modify, distribute, and sell this software and its
4 * documentation for any purpose is hereby granted without fee, provided that
5 * the above copyright notice appear in all copies and that both that
6 * copyright notice and this permission notice appear in supporting
7 * documentation. No representations are made about the suitability of this
8 * software for any purpose. It is provided "as is" without express or
22 #else /* !HAVE_JWXYZ */
23 # include <X11/Xlib.h>
/* "Unicode Replacement Character", displayed in lieu of invalid characters. */
# define INVALID 0xFFFD


/* Mask the number to be within the valid range of unicode characters.
   The value is first masked to 31 bits; anything that is then above
   0x10FFFF, equal to zero, or inside the UTF-16 surrogate range
   0xD800-0xDFFF is replaced with INVALID.
 */
static unsigned long
uc_truncate (unsigned long uc)
{
  uc &= 0x7FFFFFFFL;                    /* Unicode is 31 bits */
  if (uc > 0x10FFFF) uc = INVALID;      /* But UTF-8 is 4 bytes */
  if (uc == 0) uc = INVALID;            /* no nulls */

  if (uc >= 0xD800 && uc <= 0xDFFF)
    /* Reserved for use with UTF-16: not a real character. */
    uc = INVALID;

  return uc;
}


/* Parse the first UTF8 character at the front of the string.
   Return the Unicode character, and the number of bytes read.

   in           the bytes to parse; need not be null-terminated.
   length       how many bytes are available at `in'.
   unicode_ret  if non-null, receives the decoded character, or INVALID
                if the bytes did not form an acceptable character.

   The return value is the number of bytes consumed.  It is 0 only when
   length <= 0; otherwise at least one byte is always consumed, so callers
   can loop with "in += utf8_decode (...)".
 */
long
utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
{
  const unsigned char *start = in;
  /* Never form a pointer before `in', even if handed a negative length. */
  const unsigned char *end = in + (length > 0 ? length : 0);
  unsigned long uc = INVALID;
  unsigned long min = 0;        /* smallest value this sequence length
                                   may legitimately encode */
  unsigned char c;

  if (length <= 0) goto DONE;

  c = *in++;

  /* The lead byte promised more bytes than we have: swallow everything
     that is left and report it as one INVALID character. */
# define PREMATURE_EOF { in = end; goto DONE; }

  if ((c & 0xC0) == 0x80) {        /* 10xxxxxx - lonely continuation byte */
    uc = INVALID;

  } else if ((c & 0x80) == 0) {    /* 0xxxxxxx - 7 bits in 1 byte */
    uc = (c & 0x7F);               /* 01111111 */

  } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
    if (in+1 > end) PREMATURE_EOF;
    min = 1 << 7;
    uc = (((c    & 0x1F) << 6) |   /* 00011111------ */
          (in[0] & 0x3F));         /*       00111111 */
    in += 1;

  } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
    if (in+2 > end) PREMATURE_EOF;
    min = 1 << 11;
    uc = (((c     & 0x0F) << 12) | /* 00001111----+------- */
          ((in[0] & 0x3F) <<  6) | /*       00111111------ */
          ((in[1] & 0x3F)));       /*             00111111 */
    in += 2;

  } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
    if (in+3 > end) PREMATURE_EOF;
    min = 1 << 16;
    uc = (((c     & 0x07) << 18) | /* 00000111--+-------+------- */
          ((in[0] & 0x3F) << 12) | /*       00111111----+------- */
          ((in[1] & 0x3F) <<  6) | /*             00111111------ */
          ((in[2] & 0x3F)));       /*                   00111111 */
    in += 3;

  } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
    if (in+4 > end) PREMATURE_EOF;
    min = 1 << 21;
    uc = (((c     & 0x03) << 24) | /* 00000011--------+-------+------- */
          ((in[0] & 0x3F) << 18) | /*       00111111--+-------+------- */
          ((in[1] & 0x3F) << 12) | /*             00111111----+------- */
          ((in[2] & 0x3F) << 6)  | /*                   00111111------ */
          ((in[3] & 0x3F)));       /*                         00111111 */
    in += 4;

  } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
    if (in+5 > end) PREMATURE_EOF;
    min = 1 << 26;
    uc = (((c     & 0x01) << 30) | /* 00000001------+-------+-------+------- */
          ((in[0] & 0x3F) << 24) | /*       00111111+-------+-------+------- */
          ((in[1] & 0x3F) << 18) | /*             00111111--+-------+------- */
          ((in[2] & 0x3F) << 12) | /*                   00111111----+------- */
          ((in[3] & 0x3F) << 6)  | /*                         00111111------ */
          ((in[4] & 0x3F)));       /*                               00111111 */
    in += 5;

  } else {
    uc = INVALID;                  /* Unparsable sequence. */
  }

 DONE:

  length = in - start;

  /* If any of the continuation bytes didn't begin with the continuation tag,
     the sequence is invalid; stop at the bad byte, not consuming later ones.
     (It's easier to check this after the fact than up above.)
     The offending byte itself is consumed along with the lead byte, so
     "lead byte + junk" yields a single INVALID character. */
  {
    long i;
    for (i = 1; i < length; i++)
      if ((start[i] & 0xC0) != 0x80) {
        uc = INVALID;
        length = i+1;
        break;
      }
  }

  if (uc < min)
    /* A multi-byte sequence encoded a character that could have been
       encoded with a shorter sequence, e.g., hiding ASCII inside a
       multi-byte sequence.  Something hinky's going on.  Reject it. */
    uc = INVALID;

  uc = uc_truncate (uc);

  if (unicode_ret)
    *unicode_ret = uc;

  return length;
}
/* Converts a Unicode character to a multi-byte UTF8 sequence.
   Returns the number of bytes written.

   uc      the character to encode; it is passed through uc_truncate()
           first, so out-of-range values are written as the replacement
           character instead.
   out     where to write the bytes.  No null terminator is appended.
   length  how many bytes are available at `out'.

   If the encoded form does not fit in `length' bytes, nothing is written
   and 0 is returned.
 */
int
utf8_encode (unsigned long uc, char *out, long length)
{
  const char *old = out;

  uc = uc_truncate (uc);

  if (uc < 0x80 && length >= 1)                 /* 7 bits in 1 byte */
    {
      *out++ = (char) uc;                       /* 0xxxxxxx */
    }
  else if (uc < 0x800 && length >= 2)           /* 11 bits in 2 bytes */
    {
      *out++ = (0xC0 | ((uc >> 6)  & 0x1F));    /* 110xxxxx */
      *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
    }
  else if (uc < 0x10000L && length >= 3)        /* 16 bits in 3 bytes */
    {
      *out++ = (0xE0 | ((uc >> 12) & 0x0F));    /* 1110xxxx */
      *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
    }
  else if (uc < 0x200000L && length >= 4)       /* 21 bits in 4 bytes */
    {
      *out++ = (0xF0 | ((uc >> 18) & 0x07));    /* 11110xxx */
      *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
    }
  else if (uc < 0x4000000L && length >= 5)      /* 26 bits in 5 bytes */
    {
      *out++ = (0xF8 | ((uc >> 24) & 0x03));    /* 111110xx */
      *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
    }
  else if (length >= 6)                         /* 31 bits in 6 bytes */
    {
      *out++ = (0xFC | ((uc >> 30) & 0x01));    /* 1111110x */
      *out++ = (0x80 | ((uc >> 24) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 | ((uc >> 18) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 | ((uc >> 12) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 | ((uc >>  6) & 0x3F));    /* 10xxxxxx */
      *out++ = (0x80 |  (uc        & 0x3F));    /* 10xxxxxx */
    }

  return (int) (out - old);
}
208 /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
209 This only handles characters that can be represented in 16 bits, the
210 Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
213 utf8_to_XChar2b (const char *string, int *length_ret)
215 long in_len = strlen(string);
216 const unsigned char *in = (const unsigned char *) string;
217 const unsigned char *in_end = in + in_len;
218 XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
224 unsigned long uc = 0;
225 long L = utf8_decode (in, in_end - in, &uc);
228 /* If it can't be represented in a 16-bit XChar2b,
229 use "Unicode Replacement Character". */
230 if (uc > 0xFFFF) uc = INVALID;
232 out->byte1 = (uc >> 8) & 0xFF;
233 out->byte2 = uc & 0xFF;
241 *length_ret = (int) (out - c2b);
244 c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
/* Split a UTF8 string into an array of strings, one per character.
   The sub-strings will be null terminated and may be multiple bytes.
   Each sub-string holds the raw input bytes of that character (they are
   copied, not re-encoded), and a combining mark is glued onto the
   character that precedes it.

   string      the null-terminated UTF8 input.
   length_ret  if non-null, receives the number of sub-strings.

   Returns a malloc'ed, null-terminated array of malloc'ed strings,
   or 0 if an allocation fails.
 */
char **
utf8_split (const char *string, int *length_ret)
{
  const unsigned char *in = (const unsigned char *) string;
  long len = strlen (string);
  const unsigned char *end = in + len;
  char **ret = (char **) malloc ((len+1) * sizeof(*ret));
  char **shrunk;
  int i = 0;
  if (!ret) return 0;

  while (in < end)
    {
      unsigned long uc;
      /* Pass only the bytes that remain, so that a truncated sequence at
         the end of the string can't make the decoder read past the
         terminator. */
      long len2 = utf8_decode (in, end - in, &uc);
      char *s = (char *) malloc (len2 + 1);
      if (!s) goto FAIL;
      memcpy (s, in, len2);
      s[len2] = 0;
      ret[i++] = s;
      in += len2;

      /* If this is a Combining Diacritical, append it to the previous
         character.  E.g., "y\314\206\314\206" is one string, not three.
       */
      if (i > 1 &&
          ((uc >=  0x300 && uc <=  0x36F) || /* Combining Diacritical */
           (uc >= 0x1AB0 && uc <= 0x1AFF) || /* Combining Diacritical Ext. */
           (uc >= 0x1DC0 && uc <= 0x1DFF) || /* Combining Diacritical Supp. */
           (uc >= 0x20D0 && uc <= 0x20FF) || /* Combining Diacritical Sym. */
           (uc >= 0xFE20 && uc <= 0xFE2F)))  /* Combining Half Marks */
        {
          long L1 = strlen(ret[i-2]);
          long L2 = strlen(ret[i-1]);
          char *s2 = (char *) malloc (L1 + L2 + 1);
          if (!s2) goto FAIL;
          memcpy (s2,      ret[i-2], L1);
          memcpy (s2 + L1, ret[i-1], L2);
          s2[L1 + L2] = 0;
          free (ret[i-2]);
          free (ret[i-1]);
          ret[i-2] = s2;
          i--;
        }
    }
  ret[i] = 0;

  if (length_ret)
    *length_ret = i;

  /* shrink.  On failure the original block is still valid; keep it. */
  shrunk = (char **) realloc (ret, (i+1) * sizeof(*ret));
  if (shrunk)
    ret = shrunk;

  return ret;

 FAIL:
  /* Out of memory part way through: release what was built so far. */
  while (i > 0)
    free (ret[--i]);
  free (ret);
  return 0;
}
306 /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
309 XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
312 const XChar2b *in_end;
317 /* Find the null termination on the XChar2b. */
318 for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
321 out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */
322 utf8 = out = (char *) malloc (out_len + 1);
324 out_end = out + out_len;
328 unsigned long uc = (in->byte1 << 8) | in->byte2;
329 int wrote = utf8_encode (uc, out, out_end - out);
330 if (wrote > 3) abort(); /* Can't happen with 16 bit input */
336 out_len = (int) (out - utf8 + 1);
339 *length_ret = out_len;
342 utf8 = (char *) realloc (utf8, out_len);
348 /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
351 utf8_to_latin1 (const char *string, Bool ascii_p)
353 long in_len = strlen(string);
354 const unsigned char *in = (const unsigned char *) string;
355 const unsigned char *in_end = in + in_len;
356 unsigned char *ret = (unsigned char *) malloc (in_len + 1);
357 unsigned char *out = ret;
363 unsigned long uc = 0;
364 long len2 = utf8_decode (in, in_end - in, &uc);
367 if (uc == '\240') /* */
369 else if (uc >= 0x300 && uc <= 0x36F)
370 uc = 0; /* Discard "Combining Diacritical Marks" */
371 else if (uc >= 0x1AB0 && uc <= 0x1AFF)
372 uc = 0; /* Discard "Combining Diacritical Marks Extended" */
373 else if (uc >= 0x1DC0 && uc <= 0x1DFF)
374 uc = 0; /* Discard "Combining Diacritical Marks Supplement" */
375 else if (uc >= 0x20D0 && uc <= 0x20FF)
376 uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */
377 else if (uc >= 0xFE20 && uc <= 0xFE2F)
378 uc = 0; /* Discard "Combining Half Marks" */
383 /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
385 case 0x2000: /* EN QUAD */
386 case 0x2001: /* EM QUAD */
387 case 0x2002: /* EN SPACE */
388 case 0x2003: /* EM SPACE */
389 case 0x2004: /* THREE-PER-EM SPACE */
390 case 0x2005: /* FOUR-PER-EM SPACE */
391 case 0x2006: /* SIX-PER-EM SPACE */
392 case 0x2007: /* FIGURE SPACE */
393 case 0x2008: /* PUNCTUATION SPACE */
394 case 0x2009: /* THIN SPACE */
395 case 0x200A: /* HAIR SPACE */
399 case 0x2010: /* HYPHEN */
400 case 0x2011: /* NON-BREAKING HYPHEN */
401 case 0x2012: /* FIGURE DASH */
402 case 0x2013: /* EN DASH */
403 case 0x2014: /* EM DASH */
404 case 0x2015: /* HORIZONTAL BAR */
408 case 0x2018: /* LEFT SINGLE QUOTATION MARK */
409 case 0x2019: /* SINGLE LOW-9 QUOTATION MARK */
410 case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */
411 case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
415 case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
416 case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
417 case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */
418 case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
422 case 0x2022: uc = '\267'; break; /* BULLET */
423 case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
424 case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
425 case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */
426 case 0x2038: uc = '^'; break; /* CARET */
427 case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
428 case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
429 case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */
430 case 0x2042: uc = '*'; break; /* ASTERISM */
431 case 0x2043: uc = '='; break; /* HYPHEN BULLET */
432 case 0x2044: uc = '/'; break; /* FRACTION SLASH */
433 case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
434 case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
435 case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
436 case 0x204E: uc = '*'; break; /* LOW ASTERISK */
437 case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */
443 /* "Inverted question mark" looks enough like 0xFFFD,
444 the "Unicode Replacement Character". */
445 uc = (ascii_p ? '#' : '\277');
447 if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
449 const unsigned char latin1_to_ascii[96] =
450 " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
451 "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
452 "aaaaaaeceeeeiiiionooooo/ouuuuypy";
454 uc = latin1_to_ascii[uc - 0xA0];
458 *out++ = (unsigned char) uc;
463 ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
469 /*************************************************************************
471 cd ../hacks ; make test-utf8wc
473 *************************************************************************/
/* Convert a UTF8 string to Unicode and back again.
   Each character is decoded with utf8_decode() and then re-encoded with
   utf8_encode(), so malformed input comes back with replacement
   characters in it.  Returns a malloc'ed null-terminated string.
 */
static char *
split_and_join (const char *string)
{
  const unsigned char *in = (const unsigned char *) string;
  int len = strlen (string);
  const unsigned char *end = in + len;
  /* At most one character per input byte, plus a zero terminator. */
  unsigned long *unicode = (unsigned long *)
    malloc((len + 1) * sizeof(*unicode));
  int i = 0;
  char *ret, *out, *out_end;

  if (!unicode) abort();

  while (in < end)
    {
      /* Pass only the bytes that remain, not the length of the whole
         string, so the decoder can't look beyond the terminator. */
      long len2 = utf8_decode (in, end - in, &unicode[i]);
      i++;
      in += len2;
    }
  unicode[i] = 0;

  i = i*6 + 1;          /* up to 6 bytes per character, plus the null */
  out = ret = (char *) malloc(i);
  if (!ret) abort();
  out_end = out + i;
  i = 0;
  while (unicode[i])
    {
      int len2 = utf8_encode (unicode[i], out, out_end - out);
      out += len2;
      i++;
    }
  *out = 0;
  free (unicode);

  return ret;
}
/* Print a string to `out' as a double-quoted, C-style literal, preceded by
   `prefix' right-aligned in a 6 character column.  Double quotes and
   backslashes are backslash-escaped; bytes below 32 or above 126 are
   written as 3-digit octal escapes.
 */
static void
LOG (FILE *out, const char *prefix, const char *s)
{
  fprintf (out, "%6s: \"", prefix);
  while (*s)
    {
      /* Unsigned, so that high-bit bytes compare and print as 128-255. */
      unsigned char c = *s;
      if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
      else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
      else fprintf (out, "%c", c);
      s++;
    }
  fprintf (out, "\"\n");
}
532 main (int argc, char **argv)
534 /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
537 # define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */
539 static const struct { const char *name, *in, *target, *target2; } tests[] = {
540 /* 1 Some correct UTF-8 text */
542 /* The Greek word 'kosme': */
543 { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
546 /* 2 Boundary condition test cases */
548 /* 2.1 First possible sequence of a certain length */
550 { "2.1.1", /* 1 byte (U-00000000): */ "\000" },
551 { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" },
552 { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" },
553 { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC },
554 { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC },
555 { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC },
557 /* 2.2 Last possible sequence of a certain length */
559 { "2.2.1", /* 1 byte (U-0000007F): */ "\177" },
560 { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" },
561 { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" },
562 { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC },
563 { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC },
564 { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC },
566 /* 2.3 Other boundary conditions */
568 { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" },
569 { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" },
570 { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC },
571 { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
572 { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
575 /* 3 Malformed sequences */
577 /* 3.1 Unexpected continuation bytes */
579 /* Each unexpected continuation byte should be separately signalled as a
580 malformed sequence of its own. */
582 { "3.1.1", /* First continuation byte 0x80: */ "\200", URC },
583 { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC },
584 { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC },
585 { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC },
586 { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277",
588 { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200",
589 URC URC URC URC URC },
590 { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277",
591 URC URC URC URC URC URC },
592 { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200",
593 URC URC URC URC URC URC URC },
595 { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
597 "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
598 "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
599 "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
600 "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
601 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
602 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
603 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
604 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
606 /* 3.2 Lonely start characters */
608 { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf),
609 each followed by a space character: */
611 "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
612 "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
613 "\332 \333 \334 \335 \336 \337 ",
614 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
615 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
617 { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef),
618 each followed by a space character: */
619 "\340 \341 \342 \343 \344 \345 \346 \347 "
620 "\350 \351 \352 \353 \354 \355 \356 \357 ",
621 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
623 { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7),
624 each followed by a space character: */
625 URC URC URC URC URC URC URC URC },
627 { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb),
628 each followed by a space character: */
629 "\370 \371 \372 \373 ",
632 { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd),
633 each followed by a space character: */
634 "\374 \375 ", URC URC },
636 /* 3.3 Sequences with last continuation byte missing */
638 /* All bytes of an incomplete sequence should be signalled as a single
639 malformed sequence, i.e., you should see only a single replacement
640 character in each of the next 10 tests. (Characters as in section 2) */
642 { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */
644 { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */
646 { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */
647 "\360\200\200", URC },
648 { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */
649 "\370\200\200\200", URC },
650 { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */
651 "\374\200\200\200\200", URC },
652 { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */
654 { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */
656 { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */
657 "\367\277\277", URC },
658 { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */
659 "\373\277\277\277", URC },
660 { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
661 "\375\277\277\277\277", URC },
663 /* 3.4 Concatenation of incomplete sequences */
665 /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
666 sequences being signalled: */
668 { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
669 "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
670 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
672 /* 3.5 Impossible bytes */
674 /* The following two bytes cannot appear in a correct UTF-8 string */
676 { "3.5.1", /* fe = */ "\376", URC },
677 { "3.5.2", /* ff = */ "\377", URC },
678 { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC },
681 /* 4 Overlong sequences */
683 /* 4.1 Examples of an overlong ASCII character */
685 { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC },
686 { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC },
687 { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC },
688 { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257",
690 { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
693 /* 4.2 Maximum overlong sequences */
695 { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC },
696 { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC },
697 { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277",
699 { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277",
701 { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC },
703 /* 4.3 Overlong representation of the NUL character */
705 { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC },
706 { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC },
707 { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC },
708 { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200",
710 { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200",
714 /* 5 Illegal code positions */
716 /* 5.1 Single UTF-16 surrogates */
718 { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC },
719 { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC },
720 { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC },
721 { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC },
722 { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC },
723 { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC },
724 { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC },
726 /* 5.2 Paired UTF-16 surrogates */
728 { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
729 { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
730 { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
731 { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
732 { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
733 { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
734 { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
735 { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
737 /* 5.3 Other illegal code positions */
739 { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" },
740 { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" },
743 /* 6 Some other junk */
746 { "6.1", "\001\002\003\004\005 ABC" },
747 { "6.2", /* every non-ASCII Latin1 character */
748 "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
749 "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
750 "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
751 "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
752 "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
753 "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
754 "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
755 "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
756 "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
757 "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
758 "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
759 "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
761 { "6.3", /* Christmas tree */
762 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
763 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
764 "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
765 "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
766 "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
767 "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
768 "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
769 "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
770 "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
771 "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
772 "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
773 "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
774 "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
775 "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
776 "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
777 "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
779 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
780 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
781 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
782 "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
783 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
784 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
785 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
786 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
787 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
788 URC URC URC URC URC URC URC URC URC URC URC URC },
793 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
795 const char *name = tests[i].name;
796 const char *in = tests[i].in;
797 const char *target = (tests[i].target ? tests[i].target : in);
798 const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
799 char *out = split_and_join (in);
800 XChar2b *out16 = utf8_to_XChar2b (in, 0);
801 char *out2 = XChar2b_to_utf8 (out16, 0);
802 if (strcmp (out, target))
804 LOG (stderr, name, target);
805 LOG (stderr, "FAIL", out);
806 fprintf (stderr, "\n");
809 if (strcmp (out2, target2))
811 LOG (stderr, name, target2);
812 LOG (stderr, "FAIL2", out2);
813 fprintf (stderr, "\n");
822 const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
823 "c\303\264t\303\251 de l'alc\303\264ve "
824 "ovo\303\257de, o\303\271 les b\303\273ches "
825 "se consument dans l'\303\242tre");
826 const char *latin1 = ("son \356le int\351rieure, \340 "
827 "c\364t\351 de l'alc\364ve ovo\357de, "
828 "o\371 les b\373ches se consument dans "
830 const char *ascii = ("son ile interieure, a cote de l'alcove "
831 "ovoide, ou les buches se consument dans "
833 char *latin1b = utf8_to_latin1 (utf8, False);
834 char *ascii2 = utf8_to_latin1 (utf8, True);
835 if (strcmp (latin1, latin1b))
837 LOG (stderr, "LATIN1", utf8);
838 LOG (stderr, "FAIL3", latin1b);
839 fprintf (stderr, "\n");
842 if (strcmp (ascii, ascii2))
844 LOG (stderr, "ASCII", utf8);
845 LOG (stderr, "FAIL4", ascii2);
846 fprintf (stderr, "\n");
854 if (ok) fprintf (stderr, "OK\n");
858 #endif /* SELFTEST */