X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=utils%2Futf8wc.c;h=c948f220a30ed4c9d21f317c4cfa61cc2cac406b;hb=c85f503f5793839a6be4c818332aca4a96927bb2;hp=e2db24170aadecef6caa7992e3fe402a5f90c08d;hpb=d5186197bc394e10a4402f7f6d23fbb14103bc50;p=xscreensaver diff --git a/utils/utf8wc.c b/utils/utf8wc.c index e2db2417..c948f220 100644 --- a/utils/utf8wc.c +++ b/utils/utf8wc.c @@ -1,4 +1,4 @@ -/* xscreensaver, Copyright (c) 2014 Jamie Zawinski +/* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that @@ -17,11 +17,9 @@ #include #include -#ifdef HAVE_COCOA +#ifdef HAVE_JWXYZ # include "jwxyz.h" -# elif defined(HAVE_ANDROID) -# include "jwxyz.h" -#else /* !HAVE_COCOA */ +#else /* !HAVE_JWXYZ */ # include #endif @@ -52,7 +50,7 @@ uc_truncate (unsigned long uc) /* Parse the first UTF8 character at the front of the string. Return the Unicode character, and the number of bytes read. */ -static long +long utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret) { const unsigned char *start = in; @@ -156,7 +154,7 @@ utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret) /* Converts a Unicode character to a multi-byte UTF8 sequence. Returns the number of bytes written. */ -static int +int utf8_encode (unsigned long uc, char *out, long length) { const char *old = out; @@ -239,12 +237,12 @@ utf8_to_XChar2b (const char *string, int *length_ret) out->byte1 = 0; out->byte2 = 0; - /* shrink */ - c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b)); - if (length_ret) *length_ret = (int) (out - c2b); + /* shrink */ + c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b)); + return c2b; } @@ -260,25 +258,64 @@ utf8_split (const char *string, int *length_ret) const unsigned char *end = in + len; char **ret = (char **) malloc ((len+1) * sizeof(*ret)); int i = 0; + int zwjp = 0; if (!ret) return 0; while (in < end) { - long len2 = utf8_decode (in, len, 0); + unsigned long uc; + long len2 = utf8_decode (in, len, &uc); char tmp[10]; strncpy (tmp, (char *) in, len2); tmp[len2] = 0; ret[i++] = strdup (tmp); in += len2; + + /* If this is a Combining Diacritical, append it to the previous + character. E.g., "y\314\206\314\206" is one string, not three. + + If this is ZWJ, Zero Width Joiner, then we append both this character + and the following character, e.g. "X ZWJ Y" is one string not three. + + #### Hmmm, should this also include every character in the + "Symbol, Modifier" category, or does ZWJ get used for those? + https://www.fileformat.info/info/unicode/category/Sk/list.htm + + Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8" + should be a single glyph? Or is that what "Combining Cedilla, 0327" + is for? I'm confused by the fact that the skin tones (1F3FB-1F3FF) + do not seem to be in a readily-identifiable block the way the various + combining diacriticals are. + */ + if (i > 1 && + ((uc >= 0x300 && uc <= 0x36F) || /* Combining Diacritical */ + (uc >= 0x1AB0 && uc <= 0x1AFF) || /* Combining Diacritical Ext. */ + (uc >= 0x1DC0 && uc <= 0x1DFF) || /* Combining Diacritical Supp. */ + (uc >= 0x20D0 && uc <= 0x20FF) || /* Combining Diacritical Sym. */ + (uc >= 0xFE20 && uc <= 0xFE2F) || /* Combining Half Marks */ + (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */ + zwjp || uc == 0x200D)) /* Zero Width Joiner */ + { + long L1 = strlen(ret[i-2]); + long L2 = strlen(ret[i-1]); + char *s2 = (char *) malloc (L1 + L2 + 1); + strncpy (s2, ret[i-2], L1); + strncpy (s2 + L1, ret[i-1], L2); + s2[L1 + L2] = 0; + free (ret[i-2]); + ret[i-2] = s2; + i--; + zwjp = (uc == 0x200D); /* Swallow the next character as well */ + } } ret[i] = 0; - /* shrink */ - ret = (char **) realloc (ret, (i+1) * sizeof(*ret)); - if (length_ret) *length_ret = i; + /* shrink */ + ret = (char **) realloc (ret, (i+1) * sizeof(*ret)); + return ret; } @@ -313,11 +350,13 @@ XChar2b_to_utf8 (const XChar2b *in, int *length_ret) } *out = 0; - /* shrink */ - utf8 = (char *) realloc (utf8, (out - utf8 + 1) * sizeof(*utf8)); + out_len = (int) (out - utf8 + 1); if (length_ret) - *length_ret = (int) (out - utf8); + *length_ret = out_len; + + /* shrink */ + utf8 = (char *) realloc (utf8, out_len); return utf8; } @@ -342,7 +381,20 @@ utf8_to_latin1 (const char *string, Bool ascii_p) long len2 = utf8_decode (in, in_end - in, &uc); in += len2; - if (uc > 0xFF) + if (uc == '\240') /*   */ + uc = ' '; + else if (uc >= 0x300 && uc <= 0x36F) + uc = 0; /* Discard "Combining Diacritical Marks" */ + else if (uc >= 0x1AB0 && uc <= 0x1AFF) + uc = 0; /* Discard "Combining Diacritical Marks Extended" */ + else if (uc >= 0x1DC0 && uc <= 0x1DFF) + uc = 0; /* Discard "Combining Diacritical Marks Supplement" */ + else if (uc >= 0x20D0 && uc <= 0x20FF) + uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */ + else if (uc >= 0xFE20 && uc <= 0xFE2F) + uc = 0; /* Discard "Combining Half Marks" */ + + else if (uc > 0xFF) switch (uc) { /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */ @@ -403,15 +455,11 @@ utf8_to_latin1 (const char *string, Bool ascii_p) default: break; } - else if (uc >= 0x2300 && uc <= 0x36F) - uc = 0; /* Discard "Unicode Combining Diacriticals Block" */ - else if (uc == '\240') - uc = ' '; /*   */ if (uc > 0xFF) /* "Inverted question mark" looks enough like 0xFFFD, the "Unicode Replacement Character". */ - uc = (ascii_p ? '#' : 0xBF); + uc = (ascii_p ? '#' : '\277'); if (ascii_p) /* Map Latin1 to the closest ASCII versions. */ { @@ -787,6 +835,7 @@ main (int argc, char **argv) free (out16); } + /* Check conversion from UTF8 to Latin1 and ASCII. */ { const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 " "c\303\264t\303\251 de l'alc\303\264ve " @@ -819,6 +868,58 @@ main (int argc, char **argv) free (ascii2); } + /* Check de-composition of emoji that should all be treated as a unit + for measurement and display purposes. */ + { + static const char * const tests[] = { + + /* 0: "Man" */ + " \360\237\221\250 ", + + /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */ + " \360\237\247\233\360\237\217\277 ", + + /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" = + 1F468 1F3FF 200D 1F3EB + */ + " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ", + + /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */ + " \360\237\217\203\342\200\215\342\231\200 ", + + /* 4: "Woman astronaut" = "Woman, ZWJ, rocket ship" = 1F3C3 200D 1F680 */ + " \360\237\217\203\342\200\215\360\237\232\200 ", + + /* 5: + Group of people displayed as a single glyph: + Woman, dark skin tone, ZWJ, 1F469 1F3FF 200D + Man, light skin tone, ZWJ, 1F468 1F3FB 200D + Boy, medium skin tone, ZWJ, 1F466 1F3FD 200D + Girl, dark skin tone. 1F467 1F3FF + */ + " \360\237\221\251\360\237\217\277\342\200\215" + "\360\237\221\250\360\237\217\273\342\200\215" + "\360\237\221\246\360\237\217\275\342\200\215" + "\360\237\221\247\360\237\217\277 ", + }; + int i; + for (i = 0; i < sizeof(tests)/sizeof(*tests); i++) + { + int L = 0; + char **out = utf8_split (tests[i], &L); + char name[100]; + int j; + sprintf (name, "SPLIT %d: %d glyphs", i, L-2); + if (L != 3) + { + LOG (stderr, name, tests[i]); + ok = 0; + } + for (j = 0; j < L; j++) + free (out[j]); + free (out); + } + } if (ok) fprintf (stderr, "OK\n"); return (ok == 0);