X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?p=xscreensaver;a=blobdiff_plain;f=utils%2Futf8wc.c;fp=utils%2Futf8wc.c;h=e2db24170aadecef6caa7992e3fe402a5f90c08d;hp=0000000000000000000000000000000000000000;hb=d5186197bc394e10a4402f7f6d23fbb14103bc50;hpb=6afd6db0ae9396cd7ff897ade597cd5483f49b0e diff --git a/utils/utf8wc.c b/utils/utf8wc.c new file mode 100644 index 00000000..e2db2417 --- /dev/null +++ b/utils/utf8wc.c @@ -0,0 +1,827 @@ +/* xscreensaver, Copyright (c) 2014 Jamie Zawinski + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation. No representations are made about the suitability of this + * software for any purpose. It is provided "as is" without express or + * implied warranty. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include + +#ifdef HAVE_COCOA +# include "jwxyz.h" +# elif defined(HAVE_ANDROID) +# include "jwxyz.h" +#else /* !HAVE_COCOA */ +# include +#endif + +#include "utf8wc.h" + + +/* "Unicode Replacement Character", displayed in lieu of invalid characters. */ +# define INVALID 0xFFFD + + +/* Mask the number to be within the valid range of unicode characters. + */ +static unsigned long +uc_truncate (unsigned long uc) +{ + uc &= 0x7FFFFFFFL; /* Unicode is 31 bits */ + if (uc > 0x10FFFF) uc = INVALID; /* But UTF-8 is 4 bytes */ + if (uc == 0) uc = INVALID; /* no nulls */ + + if (uc >= 0xD800 && uc <= 0xDFFF) + /* Reserved for use with UTF-16: not a real character. */ + uc = INVALID; + + return uc; +} + + +/* Parse the first UTF8 character at the front of the string. + Return the Unicode character, and the number of bytes read. + */ +static long +utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret) +{ + const unsigned char *start = in; + const unsigned char *end = in + length; + unsigned long uc = INVALID; + unsigned long min = 0; + unsigned char c; + + if (length <= 0) goto DONE; + + c = *in++; + +# define PREMATURE_EOF { in = end; goto DONE; } + + if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */ + uc = INVALID; + + } else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */ + uc = (c & 0x7F); /* 01111111 */ + + } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */ + if (in+1 > end) PREMATURE_EOF; + min = 1 << 7; + uc = (((c & 0x1F) << 6) | /* 00011111------ */ + (in[0] & 0x3F)); /* 00111111 */ + in += 1; + + } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */ + if (in+2 > end) PREMATURE_EOF; + min = 1 << 11; + uc = (((c & 0x0F) << 12) | /* 00001111----+------- */ + ((in[0] & 0x3F) << 6) | /* 00111111------ */ + ((in[1] & 0x3F))); /* 00111111 */ + in += 2; + + } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */ + if (in+3 > end) PREMATURE_EOF; + min = 1 << 16; + uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */ + ((in[0] & 0x3F) << 12) | /* 01111111----+------- */ + ((in[1] & 0x3F) << 6) | /* 00111111------ */ + ((in[2] & 0x3F))); /* 00111111 */ + in += 3; + + } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */ + if (in+4 > end) PREMATURE_EOF; + min = 1 << 21; + uc = (((c & 0x03) << 24) | /* 00000011--------+-------+------- */ + ((in[0] & 0x3F) << 18) | /* 00111111--+-------+------- */ + ((in[1] & 0x3F) << 12) | /* 00111111----+------- */ + ((in[2] & 0x3F) << 6) | /* 00111111------ */ + ((in[3] & 0x3F))); /* 00111111 */ + in += 4; + + } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */ + if (in+5 > end) PREMATURE_EOF; + min = 1 << 26; + uc = (((c & 0x01) << 30) | /* 00000001------+-------+-------+------- */ + ((in[0] & 0x3F) << 24) | /* 00111111+-------+-------+------- */ + ((in[1] & 0x3F) << 18) | /* 00111111--+-------+------- */ + ((in[2] & 0x3F) << 12) | /* 00111111----+------- */ + ((in[3] & 0x3F) << 6) | /* 00111111------ */ + ((in[4] & 0x3F))); /* 00111111 */ + in += 5; + } else { + uc = INVALID; /* Unparsable sequence. */ + } + + DONE: + + length = in - start; + + /* If any of the continuation bytes didn't begin with the continuation tag, + the sequence is invalid; stop at the bad byte, not consuming later ones. + (It's easier to check this after the fact than up above.) */ + { + int i; + for (i = 1; i < length; i++) + if ((start[i] & 0xC0) != 0x80) { + uc = INVALID; + length = i+1; + break; + } + } + + if (uc < min) + /* A multi-byte sequence encoded a character that could have been + encoded with a shorter sequence, e.g., hiding ASCII inside a + multi-byte sequence. Something hinky's going on. Reject it. */ + uc = INVALID; + + uc = uc_truncate (uc); + + if (unicode_ret) + *unicode_ret = uc; + + return length; +} + + +/* Converts a Unicode character to a multi-byte UTF8 sequence. + Returns the number of bytes written. + */ +static int +utf8_encode (unsigned long uc, char *out, long length) +{ + const char *old = out; + + uc = uc_truncate (uc); + + if (uc < 0x80 && length >= 1) /* 7 bits in 1 byte */ + { + *out++ = uc; /* 0xxxxxxx */ + } + else if (uc < 0x800 && length >= 2) /* 11 bits in 2 bytes */ + { + *out++ = (0xC0 | ((uc >> 6) & 0x1F)); /* 110xxxxx */ + *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */ + } + else if (uc < 0x10000L && length >= 3) /* 16 bits in 3 bytes */ + { + *out++ = (0xE0 | ((uc >> 12) & 0x0F)); /* 1110xxxx */ + *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */ + } + else if (uc < 0x200000L && length >= 4) /* 21 bits in 4 bytes */ + { + *out++ = (0xF0 | ((uc >> 18) & 0x07)); /* 11110xxx */ + *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */ + } + else if (uc < 0x4000000L && length >= 5) /* 26 bits in 5 bytes */ + { + *out++ = (0xF8 | ((uc >> 24) & 0x03)); /* 111110xx */ + *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */ + } + else if (length >= 6) /* 31 bits in 6 bytes */ + { + *out++ = (0xFC | ((uc >> 30) & 0x01)); /* 1111110x */ + *out++ = (0x80 | ((uc >> 24) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | ((uc >> 18) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | ((uc >> 12) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | ((uc >> 6) & 0x3F)); /* 10xxxxxx */ + *out++ = (0x80 | (uc & 0x3F)); /* 10xxxxxx */ + } + + return (int) (out - old); +} + + +/* Converts a null-terminated UTF8 string to a null-terminated XChar2b array. + This only handles characters that can be represented in 16 bits, the + Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.) + */ +XChar2b * +utf8_to_XChar2b (const char *string, int *length_ret) +{ + long in_len = strlen(string); + const unsigned char *in = (const unsigned char *) string; + const unsigned char *in_end = in + in_len; + XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b)); + XChar2b *out = c2b; + if (! out) return 0; + + while (in < in_end) + { + unsigned long uc = 0; + long L = utf8_decode (in, in_end - in, &uc); + in += L; + + /* If it can't be represented in a 16-bit XChar2b, + use "Unicode Replacement Character". */ + if (uc > 0xFFFF) uc = INVALID; + + out->byte1 = (uc >> 8) & 0xFF; + out->byte2 = uc & 0xFF; + out++; + } + + out->byte1 = 0; + out->byte2 = 0; + + /* shrink */ + c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b)); + + if (length_ret) + *length_ret = (int) (out - c2b); + + return c2b; +} + + +/* Split a UTF8 string into an array of strings, one per character. + The sub-strings will be null terminated and may be multiple bytes. + */ +char ** +utf8_split (const char *string, int *length_ret) +{ + const unsigned char *in = (const unsigned char *) string; + long len = strlen (string); + const unsigned char *end = in + len; + char **ret = (char **) malloc ((len+1) * sizeof(*ret)); + int i = 0; + if (!ret) return 0; + + while (in < end) + { + long len2 = utf8_decode (in, len, 0); + char tmp[10]; + strncpy (tmp, (char *) in, len2); + tmp[len2] = 0; + ret[i++] = strdup (tmp); + in += len2; + } + ret[i] = 0; + + /* shrink */ + ret = (char **) realloc (ret, (i+1) * sizeof(*ret)); + + if (length_ret) + *length_ret = i; + + return ret; +} + + +/* Converts a null-terminated XChar2b array to a null-terminated UTF8 string. + */ +char * +XChar2b_to_utf8 (const XChar2b *in, int *length_ret) +{ + int in_len = 0; + const XChar2b *in_end; + int out_len; + char *utf8, *out; + const char *out_end; + + /* Find the null termination on the XChar2b. */ + for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++) + ; + + out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */ + utf8 = out = (char *) malloc (out_len + 1); + if (! out) return 0; + out_end = out + out_len; + + while (in < in_end) + { + unsigned long uc = (in->byte1 << 8) | in->byte2; + int wrote = utf8_encode (uc, out, out_end - out); + if (wrote > 3) abort(); /* Can't happen with 16 bit input */ + out += wrote; + in++; + } + *out = 0; + + /* shrink */ + utf8 = (char *) realloc (utf8, (out - utf8 + 1) * sizeof(*utf8)); + + if (length_ret) + *length_ret = (int) (out - utf8); + + return utf8; +} + + +/* Converts a UTF8 string to the closest Latin1 or ASCII equivalent. + */ +char * +utf8_to_latin1 (const char *string, Bool ascii_p) +{ + long in_len = strlen(string); + const unsigned char *in = (const unsigned char *) string; + const unsigned char *in_end = in + in_len; + unsigned char *ret = (unsigned char *) malloc (in_len + 1); + unsigned char *out = ret; + + if (! ret) return 0; + + while (in < in_end) + { + unsigned long uc = 0; + long len2 = utf8_decode (in, in_end - in, &uc); + in += len2; + + if (uc > 0xFF) + switch (uc) { + + /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */ + + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + uc = ' '; + break; + + case 0x2010: /* HYPHEN */ + case 0x2011: /* NON-BREAKING HYPHEN */ + case 0x2012: /* FIGURE DASH */ + case 0x2013: /* EN DASH */ + case 0x2014: /* EM DASH */ + case 0x2015: /* HORIZONTAL BAR */ + uc = '-'; + break; + + case 0x2018: /* LEFT SINGLE QUOTATION MARK */ + case 0x2019: /* SINGLE LOW-9 QUOTATION MARK */ + case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */ + case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */ + uc = '\''; + break; + + case 0x201C: /* LEFT DOUBLE QUOTATION MARK */ + case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */ + case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */ + case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */ + uc = '"'; + break; + + case 0x2022: uc = '\267'; break; /* BULLET */ + case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */ + case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */ + case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */ + case 0x2038: uc = '^'; break; /* CARET */ + case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */ + case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/ + case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */ + case 0x2042: uc = '*'; break; /* ASTERISM */ + case 0x2043: uc = '='; break; /* HYPHEN BULLET */ + case 0x2044: uc = '/'; break; /* FRACTION SLASH */ + case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */ + case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */ + case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */ + case 0x204E: uc = '*'; break; /* LOW ASTERISK */ + case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */ + default: + break; + } + else if (uc >= 0x2300 && uc <= 0x36F) + uc = 0; /* Discard "Unicode Combining Diacriticals Block" */ + else if (uc == '\240') + uc = ' '; /*   */ + + if (uc > 0xFF) + /* "Inverted question mark" looks enough like 0xFFFD, + the "Unicode Replacement Character". */ + uc = (ascii_p ? '#' : 0xBF); + + if (ascii_p) /* Map Latin1 to the closest ASCII versions. */ + { + const unsigned char latin1_to_ascii[96] = + " !C##Y|S_C#<=-R_##23'uP.,1o>###?" + "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS" + "aaaaaaeceeeeiiiionooooo/ouuuuypy"; + if (uc >= 0xA0) + uc = latin1_to_ascii[uc - 0xA0]; + } + + if (uc > 0) + *out++ = (unsigned char) uc; + } + *out = 0; + + /* shrink */ + ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret)); + + return (char *) ret; +} + + +/************************************************************************* + + cd ../hacks ; make test-utf8wc + + *************************************************************************/ + +#ifdef SELFTEST + +/* Convert a UTF8 string to Unicode and back again. + */ +static char * +split_and_join (const char *string) +{ + const unsigned char *in = (const unsigned char *) string; + int len = strlen (string); + const unsigned char *end = in + len; + unsigned long *unicode = (unsigned long *) + malloc((len + 1) * sizeof(*unicode)); + int i = 0; + char *ret, *out, *out_end; + + while (in < end) + { + long len2 = utf8_decode (in, len, &unicode[i]); + i++; + in += len2; + } + unicode[i] = 0; + + i = i*6 + 1; + out = ret = (char *) malloc(i); + out_end = out + i; + i = 0; + while (unicode[i]) + { + int len2 = utf8_encode (unicode[i], out, out_end - out); + out += len2; + i++; + } + *out = 0; + free (unicode); + + return ret; +} + + +static void +LOG (FILE *out, const char *prefix, const char *s) +{ + fprintf (out, "%6s: \"", prefix); + while (*s) + { + unsigned char c = *s; + if (c == '"' || c == '\\') fprintf(out, "\\%c", c); + else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c); + else fprintf (out, "%c", c); + s++; + } + fprintf (out, "\"\n"); +} + + +int +main (int argc, char **argv) +{ + /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt + */ + +# define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */ + + static const struct { const char *name, *in, *target, *target2; } tests[] = { + /* 1 Some correct UTF-8 text */ + + /* The Greek word 'kosme': */ + { "1", "\316\272\341\275\271\317\203\316\274\316\265" }, + + + /* 2 Boundary condition test cases */ + + /* 2.1 First possible sequence of a certain length */ + + { "2.1.1", /* 1 byte (U-00000000): */ "\000" }, + { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" }, + { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" }, + { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC }, + { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC }, + { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC }, + + /* 2.2 Last possible sequence of a certain length */ + + { "2.2.1", /* 1 byte (U-0000007F): */ "\177" }, + { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" }, + { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" }, + { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC }, + { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC }, + { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC }, + + /* 2.3 Other boundary conditions */ + + { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" }, + { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" }, + { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC }, + { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC }, + { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC }, + + + /* 3 Malformed sequences */ + + /* 3.1 Unexpected continuation bytes */ + + /* Each unexpected continuation byte should be separately signalled as a + malformed sequence of its own. */ + + { "3.1.1", /* First continuation byte 0x80: */ "\200", URC }, + { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC }, + { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC }, + { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC }, + { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277", + URC URC URC URC }, + { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200", + URC URC URC URC URC }, + { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277", + URC URC URC URC URC URC }, + { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200", + URC URC URC URC URC URC URC }, + + { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/ + + "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217" + "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237" + "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257" + "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277", + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, + + /* 3.2 Lonely start characters */ + + { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf), + each followed by a space character: */ + + "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 " + "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 " + "\332 \333 \334 \335 \336 \337 ", + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, + + { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef), + each followed by a space character: */ + "\340 \341 \342 \343 \344 \345 \346 \347 " + "\350 \351 \352 \353 \354 \355 \356 \357 ", + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, + + { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7), + each followed by a space character: */ + URC URC URC URC URC URC URC URC }, + + { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb), + each followed by a space character: */ + "\370 \371 \372 \373 ", + URC URC URC URC }, + + { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd), + each followed by a space character: */ + "\374 \375 ", URC URC }, + + /* 3.3 Sequences with last continuation byte missing */ + + /* All bytes of an incomplete sequence should be signalled as a single + malformed sequence, i.e., you should see only a single replacement + character in each of the next 10 tests. (Characters as in section 2) */ + + { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */ + "\300", URC }, + { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */ + "\340\200", URC }, + { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */ + "\360\200\200", URC }, + { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */ + "\370\200\200\200", URC }, + { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */ + "\374\200\200\200\200", URC }, + { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */ + "\337", URC }, + { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */ + "\357\277", URC }, + { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */ + "\367\277\277", URC }, + { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */ + "\373\277\277\277", URC }, + { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */ + "\375\277\277\277\277", URC }, + + /* 3.4 Concatenation of incomplete sequences */ + + /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed + sequences being signalled: */ + + { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200" + "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277", + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, + + /* 3.5 Impossible bytes */ + + /* The following two bytes cannot appear in a correct UTF-8 string */ + + { "3.5.1", /* fe = */ "\376", URC }, + { "3.5.2", /* ff = */ "\377", URC }, + { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC }, + + + /* 4 Overlong sequences */ + + /* 4.1 Examples of an overlong ASCII character */ + + { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC }, + { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC }, + { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC }, + { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257", + URC }, + { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257", + URC }, + + /* 4.2 Maximum overlong sequences */ + + { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC }, + { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC }, + { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277", + URC }, + { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277", + URC }, + { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC }, + + /* 4.3 Overlong representation of the NUL character */ + + { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC }, + { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC }, + { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC }, + { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200", + URC }, + { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200", + URC }, + + + /* 5 Illegal code positions */ + + /* 5.1 Single UTF-16 surrogates */ + + { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC }, + { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC }, + { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC }, + { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC }, + { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC }, + { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC }, + { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC }, + + /* 5.2 Paired UTF-16 surrogates */ + + { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC }, + { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC }, + { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC }, + { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC }, + { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC }, + { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC }, + { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC }, + { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC }, + + /* 5.3 Other illegal code positions */ + + { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" }, + { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" }, + + + /* 6 Some other junk */ + + { "6.0", "" }, + { "6.1", "\001\002\003\004\005 ABC" }, + { "6.2", /* every non-ASCII Latin1 character */ + "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250" + "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260" + "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270" + "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200" + "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210" + "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220" + "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230" + "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240" + "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250" + "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260" + "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270" + "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" }, + + { "6.3", /* Christmas tree */ + "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020" + "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040" + "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060" + "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100" + "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120" + "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140" + "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160" + "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200" + "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220" + "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240" + "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260" + "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300" + "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320" + "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340" + "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360" + "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377", + + "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020" + "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037" + " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177" + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC + URC URC URC URC URC URC URC URC URC URC URC URC }, + }; + + int i; + int ok = 1; + for (i = 0; i < sizeof(tests)/sizeof(*tests); i++) + { + const char *name = tests[i].name; + const char *in = tests[i].in; + const char *target = (tests[i].target ? tests[i].target : in); + const char *target2 = (tests[i].target2 ? tests[i].target2 : target); + char *out = split_and_join (in); + XChar2b *out16 = utf8_to_XChar2b (in, 0); + char *out2 = XChar2b_to_utf8 (out16, 0); + if (strcmp (out, target)) + { + LOG (stderr, name, target); + LOG (stderr, "FAIL", out); + fprintf (stderr, "\n"); + ok = 0; + } + if (strcmp (out2, target2)) + { + LOG (stderr, name, target2); + LOG (stderr, "FAIL2", out2); + fprintf (stderr, "\n"); + ok = 0; + } + free (out); + free (out2); + free (out16); + } + + { + const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 " + "c\303\264t\303\251 de l'alc\303\264ve " + "ovo\303\257de, o\303\271 les b\303\273ches " + "se consument dans l'\303\242tre"); + const char *latin1 = ("son \356le int\351rieure, \340 " + "c\364t\351 de l'alc\364ve ovo\357de, " + "o\371 les b\373ches se consument dans " + "l'\342tre"); + const char *ascii = ("son ile interieure, a cote de l'alcove " + "ovoide, ou les buches se consument dans " + "l'atre"); + char *latin1b = utf8_to_latin1 (utf8, False); + char *ascii2 = utf8_to_latin1 (utf8, True); + if (strcmp (latin1, latin1b)) + { + LOG (stderr, "LATIN1", utf8); + LOG (stderr, "FAIL3", latin1b); + fprintf (stderr, "\n"); + ok = 0; + } + if (strcmp (ascii, ascii2)) + { + LOG (stderr, "ASCII", utf8); + LOG (stderr, "FAIL4", ascii2); + fprintf (stderr, "\n"); + ok = 0; + } + free (latin1b); + free (ascii2); + } + + + if (ok) fprintf (stderr, "OK\n"); + return (ok == 0); +} + +#endif /* SELFTEST */