From https://www.jwz.org/xscreensaver/xscreensaver-6.09.tar.gz

[xscreensaver] / utils / utf8wc.c
diff --git a/utils/utf8wc.c b/utils/utf8wc.c

index 2fb57e789f34f337fd518ef62d5c89c7aa8aa80e..ee96f93e05235c017c8faee3b82dadaa74c7f26c 100644 (file)
--- a/utils/utf8wc.c
+++ b/utils/utf8wc.c
@@ -1,4 +1,4 @@
-/* xscreensaver, Copyright (c) 2014-2015 Jamie Zawinski <jwz@jwz.org>
+/* xscreensaver, Copyright © 2014-2024 Jamie Zawinski <jwz@jwz.org>
   *
   * Permission to use, copy, modify, distribute, and sell this software and its
   * documentation for any purpose is hereby granted without fee, provided that
@@ -17,11 +17,9 @@
  #include <stdio.h>
  #include <string.h>
  
-#ifdef HAVE_COCOA
+#ifdef HAVE_JWXYZ
  # include "jwxyz.h"
-# elif defined(HAVE_ANDROID)
-# include "jwxyz.h"
-#else /* !HAVE_COCOA */
+#else /* !HAVE_JWXYZ */
  # include <X11/Xlib.h>
  #endif
  
@@ -92,7 +90,7 @@ utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
      if (in+3 > end) PREMATURE_EOF;
      min = 1 << 16;
      uc = (((c     & 0x07) << 18) | /* 00000111--+-------+------- */
-          ((in[0] & 0x3F) << 12) | /*       01111111----+------- */
+          ((in[0] & 0x3F) << 12) | /*       00111111----+------- */
            ((in[1] & 0x3F) <<  6) | /*             00111111------ */
            ((in[2] & 0x3F)));       /*                   00111111 */
      in += 3;
@@ -239,12 +237,12 @@ utf8_to_XChar2b (const char *string, int *length_ret)
    out->byte1 = 0;
    out->byte2 = 0;
  
-  /* shrink */
-  c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
-
    if (length_ret)
      *length_ret = (int) (out - c2b);
  
+  /* shrink */
+  c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
+
    return c2b;
  }
  
@@ -260,6 +258,7 @@ utf8_split (const char *string, int *length_ret)
    const unsigned char *end = in + len;
    char **ret = (char **) malloc ((len+1) * sizeof(*ret));
    int i = 0;
+  int zwjp = 0;
    if (!ret) return 0;
  
    while (in < end)
@@ -267,35 +266,56 @@ utf8_split (const char *string, int *length_ret)
        unsigned long uc;
        long len2 = utf8_decode (in, len, &uc);
        char tmp[10];
-      strncpy (tmp, (char *) in, len2);
+      memcpy (tmp, (char *) in, len2);
        tmp[len2] = 0;
        ret[i++] = strdup (tmp);
        in += len2;
  
        /* If this is a Combining Diacritical, append it to the previous
           character. E.g., "y\314\206\314\206" is one string, not three.
+
+         If this is ZWJ, Zero Width Joiner, then we append both this character
+         and the following character, e.g. "X ZWJ Y" is one string not three.
+
+         #### Hmmm, should this also include every character in the
+         "Symbol, Modifier" category, or does ZWJ get used for those?
+         https://www.fileformat.info/info/unicode/category/Sk/list.htm
+
+         Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8"
+         should be a single glyph? Or is that what "Combining Cedilla, 0327"
+         is for?  I'm confused by the fact that the skin tones (1F3FB-1F3FF)
+         do not seem to be in a readily-identifiable block the way the various
+         combining diacriticals are.
         */
-      if (i > 1 && uc >= 0x300 && uc <= 0x36F)
+      if (i > 1 && 
+          ((uc >=   0x300 && uc <=   0x36F) || /* Combining Diacritical */
+           (uc >=  0x1AB0 && uc <=  0x1AFF) || /* Combining Diacritical Ext. */
+           (uc >=  0x1DC0 && uc <=  0x1DFF) || /* Combining Diacritical Supp. */
+           (uc >=  0x20D0 && uc <=  0x20FF) || /* Combining Diacritical Sym. */
+           (uc >=  0xFE20 && uc <=  0xFE2F) || /* Combining Half Marks */
+           (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
+           zwjp || uc == 0x200D))              /* Zero Width Joiner */
          {
            long L1 = strlen(ret[i-2]);
            long L2 = strlen(ret[i-1]);
            char *s2 = (char *) malloc (L1 + L2 + 1);
-          strncpy (s2,      ret[i-2], L1);
-          strncpy (s2 + L1, ret[i-1], L2);
+          memcpy (s2,      ret[i-2], L1);
+          memcpy (s2 + L1, ret[i-1], L2);
            s2[L1 + L2] = 0;
            free (ret[i-2]);
            ret[i-2] = s2;
            i--;
+          zwjp = (uc == 0x200D);  /* Swallow the next character as well */
          }
      }
    ret[i] = 0;
  
-  /* shrink */
-  ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
-
    if (length_ret)
      *length_ret = i;
  
+  /* shrink */
+  ret = (char **) realloc (ret, (i+1) * sizeof(*ret));
+
    return ret;
  }
  
@@ -330,13 +350,14 @@ XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
      }
    *out = 0;
  
-  /* shrink */
    out_len = (int) (out - utf8 + 1);
-  utf8 = (char *) realloc (utf8, out_len);
  
    if (length_ret)
      *length_ret = out_len;
  
+  /* shrink */
+  utf8 = (char *) realloc (utf8, out_len);
+
    return utf8;
  }
  
@@ -349,7 +370,7 @@ utf8_to_latin1 (const char *string, Bool ascii_p)
    long in_len = strlen(string);
    const unsigned char *in = (const unsigned char *) string;
    const unsigned char *in_end = in + in_len;
-  unsigned char *ret = (unsigned char *) malloc (in_len + 1);
+  unsigned char *ret = (unsigned char *) malloc ((in_len * 4) + 1);
    unsigned char *out = ret;
  
    if (! ret) return 0;
@@ -363,7 +384,16 @@ utf8_to_latin1 (const char *string, Bool ascii_p)
        if (uc == '\240')        /* &nbsp; */
          uc = ' ';
        else if (uc >= 0x300 && uc <= 0x36F)
-        uc = 0;                /* Discard "Unicode Combining Diacriticals Block" */
+        uc = 0;                /* Discard "Combining Diacritical Marks" */
+      else if (uc >= 0x1AB0 && uc <= 0x1AFF)
+        uc = 0;                /* Discard "Combining Diacritical Marks Extended" */
+      else if (uc >= 0x1DC0 && uc <= 0x1DFF)
+        uc = 0;                /* Discard "Combining Diacritical Marks Supplement" */
+      else if (uc >= 0x20D0 && uc <= 0x20FF)
+        uc = 0;                /* Discard "Combining Diacritical Marks for Symbols" */
+      else if (uc >= 0xFE20 && uc <= 0xFE2F)
+        uc = 0;                /* Discard "Combining Half Marks" */
+
        else if (uc > 0xFF)
          switch (uc) {
  
@@ -433,12 +463,25 @@ utf8_to_latin1 (const char *string, Bool ascii_p)
  
        if (ascii_p)     /* Map Latin1 to the closest ASCII versions. */
          {
-          const unsigned char latin1_to_ascii[96] =
-             " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
-             "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
-             "aaaaaaeceeeeiiiionooooo/ouuuuypy";
+          const char * const latin1_to_ascii[96] = {
+            " ",  "!",   "C",  "#",  "#",   "Y",   "|",   "SS",
+            "_",  "(c)", "#",  "<",  "=",   "-",   "(r)", "_",
+            "#",  "+-",  "2",  "3",  "'",   "u",   "PP",  ".",
+            ",",  "1",   "o",  ">",  "1/4", "1/2", "3/4", "?",
+            "A",  "A",   "A",  "A",  "A",   "A",   "AE",  "C",
+            "E",  "E",   "E",  "E",  "I",   "I",   "I",   "I",
+            "D",  "N",   "O",  "O",  "O",   "O",   "O",   "x",
+            "0",  "U",   "U",  "U",  "U",   "Y",   "p",   "S",
+            "a",  "a",   "a",  "a",  "a",   "a",   "ae",  "c",
+            "e",  "e",   "e",  "e",  "i",   "i",   "i",   "i",
+            "o",  "n",   "o",  "o",  "o",   "o",   "o",   "/",
+            "o",  "u",   "u",  "u",  "u",   "y",   "p",   "y" };
            if (uc >= 0xA0)
-            uc = latin1_to_ascii[uc - 0xA0];
+            {
+              const char *c2 = latin1_to_ascii[uc - 0xA0];
+              while (*c2) { *out++ = *c2++; }
+              uc = 0;
+            }
          }
  
        if (uc > 0)
@@ -805,18 +848,23 @@ main (int argc, char **argv)
        free (out16);
      }
  
+  /* Check conversion from UTF8 to Latin1 and ASCII. */
    {
      const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
                          "c\303\264t\303\251 de l'alc\303\264ve "
                          "ovo\303\257de, o\303\271 les b\303\273ches "
-                        "se consument dans l'\303\242tre");
+                        "se consument dans l'\303\242tre "
+                        "\302\251\302\256\302\261\302\274\302\275\302\276"
+                        "\303\206\303\246");
      const char *latin1 = ("son \356le int\351rieure, \340 "
                            "c\364t\351 de l'alc\364ve ovo\357de, "
                            "o\371 les b\373ches se consument dans "
-                          "l'\342tre");
+                          "l'\342tre "
+                          "\251\256\261\274\275\276\306\346");
      const char *ascii = ("son ile interieure, a cote de l'alcove "
                           "ovoide, ou les buches se consument dans "
-                         "l'atre");
+                         "l'atre "
+                         "(c)(r)+-1/41/23/4AEae");
      char *latin1b = utf8_to_latin1 (utf8, False);
      char *ascii2  = utf8_to_latin1 (utf8, True);
      if (strcmp (latin1, latin1b))
@@ -837,6 +885,58 @@ main (int argc, char **argv)
      free (ascii2);
    }
  
+  /* Check de-composition of emoji that should all be treated as a unit
+     for measurement and display purposes. */
+  {
+    static const char * const tests[] = { 
+
+      /* 0: "Man" */
+      " \360\237\221\250 ",
+
+      /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */
+      " \360\237\247\233\360\237\217\277 ",
+
+      /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" =
+            1F468 1F3FF 200D 1F3EB
+       */
+      " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ",
+
+      /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */
+      " \360\237\217\203\342\200\215\342\231\200 ",
+
+      /* 4: Like "Woman astronaut", but "Runner, ZWJ, rocket" = 1F3C3 200D 1F680 */
+      " \360\237\217\203\342\200\215\360\237\232\200 ",
+
+      /* 5:
+         Group of people displayed as a single glyph:
+           Woman, dark skin tone, ZWJ,   1F469 1F3FF 200D
+           Man, light skin tone, ZWJ,    1F468 1F3FB 200D
+           Boy, medium skin tone, ZWJ,   1F466 1F3FD 200D
+           Girl, dark skin tone.         1F467 1F3FF
+       */
+      " \360\237\221\251\360\237\217\277\342\200\215"
+       "\360\237\221\250\360\237\217\273\342\200\215"
+       "\360\237\221\246\360\237\217\275\342\200\215"
+       "\360\237\221\247\360\237\217\277 ",
+    };
+    int i;
+    for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
+      {
+        int L = 0;
+        char **out = utf8_split (tests[i], &L);
+        char name[100];
+        int j;
+        sprintf (name, "SPLIT %d: %d glyphs", i, L-2);
+        if (L != 3)
+          {
+            LOG (stderr, name, tests[i]);
+            ok = 0;
+          }
+        for (j = 0; j < L; j++)
+          free (out[j]);
+        free (out);
+      }
+  }
  
    if (ok) fprintf (stderr, "OK\n");
    return (ok == 0);