git.hungrycats.org Git - xscreensaver/blob - utils/thread_util.h

   1 /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */
   2 /* vi: set ts=4 tw=128: */
   3
   4 /*
   5 thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com>
   6
   7 Permission to use, copy, modify, distribute, and sell this software and its
   8 documentation for any purpose is hereby granted without fee, provided that
   9 the above copyright notice appear in all copies and that both that
  10 copyright notice and this permission notice appear in supporting
  11 documentation.  No representations are made about the suitability of this
  12 software for any purpose.  It is provided "as is" without express or
  13 implied warranty.
  14 */
  15
  16 #ifndef THREAD_UTIL_H
  17 #define THREAD_UTIL_H
  18
  19 /* thread_util.h because C11 took threads.h. */
  20
  21 /* And POSIX threads because there aren't too many systems that support C11
  22    threads that don't already support POSIX threads.
  23    ...Not that it would be too hard to convert from the one to the other.
  24    Or to have both.
  25  */
  26
  27 /* Beware!
  28    Multithreading is a great way to add insidious and catastrophic bugs to
  29    a program. Make sure you understand the risks.
  30
  31    You may wish to become familiar with race conditions, deadlocks, mutexes,
  32    condition variables, and, in lock-free code, memory ordering, cache
  33    hierarchies, etc., before working with threads.
  34
  35    On the other hand, if a screenhack locks up or crashes, it's not the
  36    end of the world: XScreenSaver won't unlock the screen if that happens.
  37 */
  38
  39 /*
  40    The basic stragegy for applying threads to a CPU-hungry screenhack:
  41
  42    1. Find the CPU-hungry part of the hack.
  43
  44    2. Change that part so the workload can be divided into N equal-sized
  45       loads, where N is the number of CPU cores in the machine.
  46       (For example: with two cores, one core could render even scan lines,
  47       and the other odd scan lines.)
  48
  49    2a. Keeping in mind that two threads should not write to the same memory
  50        at the same time. Specifically, they should not be writing to the
  51        same cache line at the same time -- so align memory allocation and
  52        memory accesses to the system cache line size as necessary.
  53
  54    3. On screenhack_init, create a threadpool object. This creates N worker
  55       threads, and each thread creates and owns a user-defined struct.
  56       After creation, the threads are idle.
  57
  58    4. On screenhack_frame, call threadpool_run(). Each thread simultaneously
  59       wakes up, calls a function that does one of the equal-sized loads,
  60       then goes back to sleep. The main thread then calls threadpool_wait(),
  61       which returns once all the worker threads have finished.
  62
  63       Using this to implement SMP won't necessarily increase performance by
  64       a factor of N (again, N is CPU cores.). Both X11 and Cocoa on OS X can
  65       impose a not-insignificant amount of overhead even when simply blitting
  66       full-screen XImages @ 30 FPS.
  67
  68       On systems with simultaneous multithreading (a.k.a. Hyper-threading),
  69       performance gains may be slim to non-existant.
  70  */
  71
  72 #include "aligned_malloc.h"
  73
  74 #if HAVE_CONFIG_H
  75 /* For HAVE_PTHREAD. */
  76 #       include "config.h"
  77 #endif
  78
  79 #include <stddef.h>
  80
  81 #if HAVE_UNISTD_H
  82 /* For _POSIX_THREADS. */
  83 #       include <unistd.h>
  84 #endif
  85
  86 #ifdef HAVE_COCOA
  87 #       include "jwxyz.h"
  88 #else
  89 #       include <X11/Xlib.h>
  90 #endif
  91
  92 unsigned hardware_concurrency(Display *dpy);
  93 /* This is supposed to return the number of available CPU cores. This number
  94    isn't necessarily constant: a system administrator can hotplug or
  95    enable/disable CPUs on certain systems, or the system can deactivate a
  96    malfunctioning core -- but these are rare.
  97
  98    If threads are unavailable, this function will return 1.
  99
 100    This function isn't fast; the result should be cached.
 101 */
 102
 103 unsigned thread_memory_alignment(Display *dpy);
 104
 105 /* Returns the proper alignment for memory allocated by a thread that is
 106    shared with other threads.
 107
 108    A typical CPU accesses the system RAM through a cache, and this cache is
 109    divided up into cache lines - aligned chunks of memory typically 32 or 64
 110    bytes in size. Cache faults cause cache lines to be populated from
 111    memory. And, in a multiprocessing environment, two CPU cores can access the
 112    same cache line. The consequences of this depend on the CPU model:
 113
 114    - x86 implements the MESI protocol [1] to maintain cache coherency between
 115      CPU cores, with a serious performance penalty on both Intel [1] and AMD
 116      [2].  Intel uses the term "false sharing" to describe two CPU cores
 117      accessing different memory in the same cache line.
 118
 119    - ARM allows CPU caches to become inconsistent in this case [3]. Memory
 120      fences are needed to prevent horrible non-deterministic bugs from
 121      occurring.  Other CPU architectures have similar behavior to one of the
 122      above, depending on whether they are "strongly-orderered" (like x86), or
 123      "weakly-ordered" (like ARM).
 124
 125    Aligning multithreaded memory accesses according to the cache line size
 126    neatly sidesteps both issues.
 127
 128    One complication is that CPU caches are divided up into separate levels,
 129    and occasionally different levels can have different cache line sizes, so
 130    to be safe this function returns the largest cache line size among all
 131    levels.
 132
 133    If multithreading is not in effect, this returns sizeof(void *), because
 134    posix_memalign(3) will error out if the alignment is set to be smaller than
 135    that.
 136
 137    [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
 138       (Order Number: 248966-026): 2.1.5 Cache Hierarchy
 139    [2] Software Optimization Guide for AMD Family 10h Processors (Publication
 140        #40546): 11.3.4 Data Sharing between Caches
 141    [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/
 142 */
 143
 144 /*
 145    Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3)
 146    otherwise. As of SUSv2 (1997), and *probably* earlier, these are guaranteed
 147    to be thread-safe. C89 does not discuss threads, or thread safety;
 148    non-POSIX systems, watch out!
 149    http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html
 150    http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html
 151 */
 152
 153 /* int thread_malloc(void **ptr, Display *dpy, unsigned size); */
 154 #define thread_malloc(ptr, dpy, size) \
 155   (aligned_malloc((ptr), thread_memory_alignment(dpy), (size)))
 156
 157 /*
 158    This simply does a malloc aligned to thread_memory_alignment(). See
 159    above. On failure, an errno is returned, usually ENOMEM.
 160
 161    It's possible for two malloc()'d blocks to at least partially share the
 162    same cache line. When a different thread is writing to each block, then bad
 163    things can happen (see thread_memory_alignment). Better malloc()
 164    implementations will divide memory into pools belonging to one thread or
 165    another, causing memory blocks belonging to different threads to typically
 166    be located on different memory pages (see getpagesize(2)), mitigating the
 167    problem in question...but there's nothing stopping threads from passing
 168    memory to each other. And it's not practical for the system to align each
 169    block to 64 or 128 byte boundaries -- it's not uncommon to need lots and
 170    lots of 8-32 byte allocations, and the waste could become a bit excessive.
 171
 172    Some rules of thumb to take away from this:
 173
 174    1. Use thread_alloc for memory that might be written to by a thread that
 175    didn't originally allocate the object.
 176
 177    2. Use thread_alloc for memory that will be handed from one thread to
 178    another.
 179
 180    3. Use malloc if a single thread allocates, reads from, writes to, and
 181    frees the block of memory.
 182
 183    Oddly, I (Dave) have not seen this problem described anywhere else.
 184 */
 185
 186 #define thread_free(ptr) aligned_free(ptr)
 187
 188 #if HAVE_PTHREAD
 189 #       if defined _POSIX_THREADS && _POSIX_THREADS >= 0
 190 /*
 191    See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for
 192    Options and Option Groups
 193    http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02
 194 */
 195
 196 #               include <pthread.h>
 197
 198 /* Most PThread synchronization functions only fail when they are misused. */
 199 #               if defined NDEBUG
 200 #                       define PTHREAD_VERIFY(expr) (void)(expr)
 201 #               else
 202 #                       include <assert.h>
 203 #                       define PTHREAD_VERIFY(expr) assert(!(expr))
 204 #               endif
 205
 206 extern const pthread_mutex_t mutex_initializer;
 207 extern const pthread_cond_t cond_initializer;
 208
 209 #       else
 210                 /* Whatever caused HAVE_PTHREAD to be defined (configure script,
 211            usually) made a mistake if this is reached. */
 212                 /* Maybe this should be a warning. */
 213 #               error HAVE_PTHREAD is defined, but _POSIX_THREADS is not.
 214                 /* #undef HAVE_PTHREAD */
 215 #       endif
 216 #endif
 217
 218 struct threadpool
 219 {
 220 /*      This is always the same as the count parameter fed to threadpool_create().
 221         Here's a neat trick: if the threadpool is zeroed out with a memset, and
 222         threadpool_create() is never called to create 0 threads, then
 223         threadpool::count can be used to determine if the threadpool object was
 224         ever initialized. */
 225         unsigned count;
 226
 227         /* Copied from threadpool_class. No need for thread_create here, though. */
 228         size_t thread_size;
 229         void (*thread_run)(void *self);
 230         void (*thread_destroy)(void *self);
 231
 232         void *serial_threads;
 233
 234 #if HAVE_PTHREAD
 235         pthread_mutex_t mutex;
 236         pthread_cond_t cond;
 237
 238         /* Number of threads waiting for the startup signal. */
 239         unsigned parallel_pending;
 240
 241         /* Number of threads still running. During startup, this is the index of the thread currently being initialized. */
 242         unsigned parallel_unfinished;
 243
 244         pthread_t *parallel_threads;
 245 #endif
 246 };
 247
 248 /*
 249    The threadpool_* functions manage a group of threads (naturally).  Each
 250    thread owns an object described by a threadpool_class. When
 251    threadpool_run() is called, the specified func parameter is called on each
 252    thread in parallel. Sometime after calling threadpool_run(), call
 253    threadpool_wait(), which waits for each thread to return from
 254    threadpool_class::run().
 255
 256    Note that thread 0 runs on the thread from which threadpool_run is called
 257    from, so if each thread has an equal workload, then when threadpool_run
 258    returns, the other threads will be finished or almost finished. Adding code
 259    between threadpool_run and threadpool_wait increases the odds that
 260    threadpool_wait won't actually have to wait at all -- which is nice.
 261
 262    If the system does not provide threads, then these functions will fake it:
 263    everything will appear to work normally from the perspective of the caller,
 264    but when threadpool_run() is called, the "threads" are run synchronously;
 265    threadpool_wait() does nothing.
 266 */
 267
 268 struct threadpool_class
 269 {
 270         /* Size of the thread private object. */
 271         size_t size;
 272
 273 /*      Create the thread private object. Called in sequence for each thread
 274         (effectively) from threadpool_create.  self: A pointer to size bytes of
 275         memory, allocated to hold the thread object.  pool: The threadpool object
 276         that owns all the threads. If the threadpool is nested in another struct,
 277         try GET_PARENT_OBJ.  id: The ID for the thread; numbering starts at zero
 278         and goes up by one for each thread.  Return 0 on success. On failure,
 279         return a value from errno.h; this will be returned from
 280         threadpool_create. */
 281         int (*create)(void *self, struct threadpool *pool, unsigned id);
 282
 283 /*      Destroys the thread private object. Called in sequence (though not always
 284         the same sequence as create).  Warning: During shutdown, it is possible
 285         for destroy() to be called while other threads are still in
 286         threadpool_run(). */
 287         void (*destroy)(void *self);
 288 };
 289
 290 /* Returns 0 on success, on failure can return ENOMEM, or any error code from
 291    threadpool_class.create. */
 292 int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count);
 293 void threadpool_destroy(struct threadpool *self);
 294
 295 void threadpool_run(struct threadpool *self, void (*func)(void *));
 296 void threadpool_wait(struct threadpool *self);
 297
 298 #if HAVE_PTHREAD
 299 #       define THREAD_DEFAULTS \
 300         "*useThreads: True",
 301 #       define THREAD_OPTIONS \
 302         {"-threads",    ".useThreads", XrmoptionNoArg, "True"}, \
 303         {"-no-threads", ".useThreads", XrmoptionNoArg, "False"},
 304 #else
 305 #       define THREAD_DEFAULTS
 306 #       define THREAD_OPTIONS
 307 #endif
 308
 309 /*
 310    If a variable 'member' is known to be a member (named 'member_name') of a
 311    struct (named 'struct_name'), then this can find a pointer to the struct
 312    that contains it.
 313 */
 314 #define GET_PARENT_OBJ(struct_name, member_name, member) (struct_name *)((char *)member - offsetof(struct_name, member_name));
 315
 316 #endif