-/* xscreensaver, Copyright (c) 2014-2015 Jamie Zawinski <jwz@jwz.org>
+/* xscreensaver, Copyright © 2014-2024 Jamie Zawinski <jwz@jwz.org>
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
#include <stdio.h>
#include <string.h>
-#ifdef HAVE_COCOA
+#ifdef HAVE_JWXYZ
# include "jwxyz.h"
-# elif defined(HAVE_ANDROID)
-# include "jwxyz.h"
-#else /* !HAVE_COCOA */
+#else /* !HAVE_JWXYZ */
# include <X11/Xlib.h>
#endif
if (in+3 > end) PREMATURE_EOF;
min = 1 << 16;
uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */
- ((in[0] & 0x3F) << 12) | /* 01111111----+------- */
+ ((in[0] & 0x3F) << 12) | /* 00111111----+------- */
((in[1] & 0x3F) << 6) | /* 00111111------ */
((in[2] & 0x3F))); /* 00111111 */
in += 3;
out->byte1 = 0;
out->byte2 = 0;
- /* shrink */
- c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
-
if (length_ret)
*length_ret = (int) (out - c2b);
+ /* shrink */
+ c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
+
return c2b;
}
const unsigned char *end = in + len;
char **ret = (char **) malloc ((len+1) * sizeof(*ret));
int i = 0;
+ int zwjp = 0;
if (!ret) return 0;
while (in < end)
unsigned long uc;
long len2 = utf8_decode (in, len, &uc);
char tmp[10];
- strncpy (tmp, (char *) in, len2);
+ memcpy (tmp, (char *) in, len2);
tmp[len2] = 0;
ret[i++] = strdup (tmp);
in += len2;
/* If this is a Combining Diacritical, append it to the previous
character. E.g., "y\314\206\314\206" is one string, not three.
+
+ If this is ZWJ, Zero Width Joiner, then we append both this character
+ and the following character, e.g. "X ZWJ Y" is one string not three.
+
+ #### Hmmm, should this also include every character in the
+ "Symbol, Modifier" category, or does ZWJ get used for those?
+ https://www.fileformat.info/info/unicode/category/Sk/list.htm
+
+ Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8"
+ should be a single glyph? Or is that what "Combining Cedilla, 0327"
+ is for? I'm confused by the fact that the skin tones (1F3FB-1F3FF)
+ do not seem to be in a readily-identifiable block the way the various
+ combining diacriticals are.
*/
- if (i > 1 && uc >= 0x300 && uc <= 0x36F)
+ if (i > 1 &&
+ ((uc >= 0x300 && uc <= 0x36F) || /* Combining Diacritical */
+ (uc >= 0x1AB0 && uc <= 0x1AFF) || /* Combining Diacritical Ext. */
+ (uc >= 0x1DC0 && uc <= 0x1DFF) || /* Combining Diacritical Supp. */
+ (uc >= 0x20D0 && uc <= 0x20FF) || /* Combining Diacritical Sym. */
+ (uc >= 0xFE20 && uc <= 0xFE2F) || /* Combining Half Marks */
+ (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
+ zwjp || uc == 0x200D)) /* Zero Width Joiner */
{
long L1 = strlen(ret[i-2]);
long L2 = strlen(ret[i-1]);
char *s2 = (char *) malloc (L1 + L2 + 1);
- strncpy (s2, ret[i-2], L1);
- strncpy (s2 + L1, ret[i-1], L2);
+ memcpy (s2, ret[i-2], L1);
+ memcpy (s2 + L1, ret[i-1], L2);
s2[L1 + L2] = 0;
free (ret[i-2]);
ret[i-2] = s2;
i--;
+ zwjp = (uc == 0x200D); /* Swallow the next character as well */
}
}
ret[i] = 0;
- /* shrink */
- ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
-
if (length_ret)
*length_ret = i;
+ /* shrink */
+ ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
+
return ret;
}
}
*out = 0;
- /* shrink */
out_len = (int) (out - utf8 + 1);
- utf8 = (char *) realloc (utf8, out_len);
if (length_ret)
*length_ret = out_len;
+ /* shrink */
+ utf8 = (char *) realloc (utf8, out_len);
+
return utf8;
}
long in_len = strlen(string);
const unsigned char *in = (const unsigned char *) string;
const unsigned char *in_end = in + in_len;
- unsigned char *ret = (unsigned char *) malloc (in_len + 1);
+ unsigned char *ret = (unsigned char *) malloc ((in_len * 4) + 1);
unsigned char *out = ret;
if (! ret) return 0;
if (uc == '\240') /* non-breaking space (U+00A0) */
uc = ' ';
else if (uc >= 0x300 && uc <= 0x36F)
- uc = 0; /* Discard "Unicode Combining Diacriticals Block" */
+ uc = 0; /* Discard "Combining Diacritical Marks" */
+ else if (uc >= 0x1AB0 && uc <= 0x1AFF)
+ uc = 0; /* Discard "Combining Diacritical Marks Extended" */
+ else if (uc >= 0x1DC0 && uc <= 0x1DFF)
+ uc = 0; /* Discard "Combining Diacritical Marks Supplement" */
+ else if (uc >= 0x20D0 && uc <= 0x20FF)
+ uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */
+ else if (uc >= 0xFE20 && uc <= 0xFE2F)
+ uc = 0; /* Discard "Combining Half Marks" */
+
else if (uc > 0xFF)
switch (uc) {
if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
{
- const unsigned char latin1_to_ascii[96] =
- " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
- "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
- "aaaaaaeceeeeiiiionooooo/ouuuuypy";
+ const char * const latin1_to_ascii[96] = {
+ " ", "!", "C", "#", "#", "Y", "|", "SS",
+ "_", "(c)", "#", "<", "=", "-", "(r)", "_",
+ "#", "+-", "2", "3", "'", "u", "PP", ".",
+ ",", "1", "o", ">", "1/4", "1/2", "3/4", "?",
+ "A", "A", "A", "A", "A", "A", "AE", "C",
+ "E", "E", "E", "E", "I", "I", "I", "I",
+ "D", "N", "O", "O", "O", "O", "O", "x",
+ "0", "U", "U", "U", "U", "Y", "p", "S",
+ "a", "a", "a", "a", "a", "a", "ae", "c",
+ "e", "e", "e", "e", "i", "i", "i", "i",
+ "o", "n", "o", "o", "o", "o", "o", "/",
+ "o", "u", "u", "u", "u", "y", "p", "y" };
if (uc >= 0xA0)
- uc = latin1_to_ascii[uc - 0xA0];
+ {
+ const char *c2 = latin1_to_ascii[uc - 0xA0];
+ while (*c2) { *out++ = *c2++; }
+ uc = 0;
+ }
}
if (uc > 0)
free (out16);
}
+ /* Check conversion from UTF8 to Latin1 and ASCII. */
{
const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
"c\303\264t\303\251 de l'alc\303\264ve "
"ovo\303\257de, o\303\271 les b\303\273ches "
- "se consument dans l'\303\242tre");
+ "se consument dans l'\303\242tre "
+ "\302\251\302\256\302\261\302\274\302\275\302\276"
+ "\303\206\303\246");
const char *latin1 = ("son \356le int\351rieure, \340 "
"c\364t\351 de l'alc\364ve ovo\357de, "
"o\371 les b\373ches se consument dans "
- "l'\342tre");
+ "l'\342tre "
+ "\251\256\261\274\275\276\306\346");
const char *ascii = ("son ile interieure, a cote de l'alcove "
"ovoide, ou les buches se consument dans "
- "l'atre");
+ "l'atre "
+ "(c)(r)+-1/41/23/4AEae");
char *latin1b = utf8_to_latin1 (utf8, False);
char *ascii2 = utf8_to_latin1 (utf8, True);
if (strcmp (latin1, latin1b))
free (ascii2);
}
+ /* Check de-composition of emoji that should all be treated as a unit
+ for measurement and display purposes. */
+ {
+ static const char * const tests[] = {
+
+ /* 0: "Man" */
+ " \360\237\221\250 ",
+
+ /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */
+ " \360\237\247\233\360\237\217\277 ",
+
+ /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" =
+ 1F468 1F3FF 200D 1F3EB
+ */
+ " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ",
+
+ /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */
+ " \360\237\217\203\342\200\215\342\231\200 ",
+
+ /* 4: "Runner, ZWJ, rocket ship" = 1F3C3 200D 1F680.  (A true "Woman
+ astronaut" would begin with Woman, 1F469, rather than Runner.) */
+ " \360\237\217\203\342\200\215\360\237\232\200 ",
+
+ /* 5:
+ Group of people displayed as a single glyph:
+ Woman, dark skin tone, ZWJ, 1F469 1F3FF 200D
+ Man, light skin tone, ZWJ, 1F468 1F3FB 200D
+ Boy, medium skin tone, ZWJ, 1F466 1F3FD 200D
+ Girl, dark skin tone. 1F467 1F3FF
+ */
+ " \360\237\221\251\360\237\217\277\342\200\215"
+ "\360\237\221\250\360\237\217\273\342\200\215"
+ "\360\237\221\246\360\237\217\275\342\200\215"
+ "\360\237\221\247\360\237\217\277 ",
+ };
+ int i;
+ for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
+ {
+ int L = 0;
+ char **out = utf8_split (tests[i], &L);
+ char name[100];
+ int j;
+ sprintf (name, "SPLIT %d: %d glyphs", i, L-2);
+ if (L != 3)
+ {
+ LOG (stderr, name, tests[i]);
+ ok = 0;
+ }
+ for (j = 0; j < L; j++)
+ free (out[j]);
+ free (out);
+ }
+ }
if (ok) fprintf (stderr, "OK\n");
return (ok == 0);