git.hungrycats.org Git - xscreensaver/blob - utils/thread_util.h

   1 /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */
   2 /* vi: set ts=4 tw=78: */
   3
   4 /*
   5 thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com>
   6
   7 Permission to use, copy, modify, distribute, and sell this software and its
   8 documentation for any purpose is hereby granted without fee, provided that
   9 the above copyright notice appear in all copies and that both that
  10 copyright notice and this permission notice appear in supporting
  11 documentation.  No representations are made about the suitability of this
  12 software for any purpose.  It is provided "as is" without express or
  13 implied warranty.
  14 */
  15
  16 #ifndef THREAD_UTIL_H
  17 #define THREAD_UTIL_H
  18
  19 /* thread_util.h because C11 took threads.h. */
  20
  21 /* And POSIX threads because there aren't too many systems that support C11
  22    threads that don't already support POSIX threads.
  23    ...Not that it would be too hard to convert from the one to the other.
  24    Or to have both.
  25  */
  26
  27 /* Beware!
  28    Multithreading is a great way to add insidious and catastrophic bugs to
  29    a program. Make sure you understand the risks.
  30
  31    You may wish to become familiar with race conditions, deadlocks, mutexes,
  32    condition variables, and, in lock-free code, memory ordering, cache
  33    hierarchies, etc., before working with threads.
  34
  35    On the other hand, if a screenhack locks up or crashes, it's not the
  36    end of the world: XScreenSaver won't unlock the screen if that happens.
  37 */
  38
  39 /*
  40    The basic stragegy for applying threads to a CPU-hungry screenhack:
  41
  42    1. Find the CPU-hungry part of the hack.
  43
  44    2. Change that part so the workload can be divided into N equal-sized
  45       loads, where N is the number of CPU cores in the machine.
  46       (For example: with two cores, one core could render even scan lines,
  47       and the other odd scan lines.)
  48
  49    2a. Keeping in mind that two threads should not write to the same memory
  50        at the same time. Specifically, they should not be writing to the
  51        same cache line at the same time -- so align memory allocation and
  52        memory accesses to the system cache line size as necessary.
  53
  54    3. On screenhack_init, create a threadpool object. This creates N worker
  55       threads, and each thread creates and owns a user-defined struct.
  56       After creation, the threads are idle.
  57
  58    4. On screenhack_frame, call threadpool_run(). Each thread simultaneously
  59       wakes up, calls a function that does one of the equal-sized loads,
  60       then goes back to sleep. The main thread then calls threadpool_wait(),
  61       which returns once all the worker threads have finished.
  62
  63       Using this to implement SMP won't necessarily increase performance by
  64       a factor of N (again, N is CPU cores.). Both X11 and Cocoa on OS X can
  65       impose a not-insignificant amount of overhead even when simply blitting
  66       full-screen XImages @ 30 FPS.
  67
  68       On systems with simultaneous multithreading (a.k.a. Hyper-threading),
  69       performance gains may be slim to non-existant.
  70  */
  71
  72 #include "aligned_malloc.h"
  73
  74 #if HAVE_CONFIG_H
  75 /* For HAVE_PTHREAD. */
  76 #       include "config.h"
  77 #endif
  78
  79 #include <stddef.h>
  80
  81 #if HAVE_UNISTD_H
  82 /* For _POSIX_THREADS. */
  83 #       include <unistd.h>
  84 #endif
  85
  86 #ifdef HAVE_COCOA
  87 #       include "jwxyz.h"
  88 #else
  89 #       include <X11/Xlib.h>
  90 #endif
  91
  92 #if HAVE_PTHREAD
  93 int threads_available(Display *dpy);
  94 #else
  95 #       define threads_available(dpy) (-1)
  96 #endif
  97 /* > 0: Threads are available. This is normally _POSIX_VERSION.
  98     -1: Threads are not available.
  99 */
 100
 101 unsigned hardware_concurrency(Display *dpy);
 102 /* This is supposed to return the number of available CPU cores. This number
 103    isn't necessarily constant: a system administrator can hotplug or
 104    enable/disable CPUs on certain systems, or the system can deactivate a
 105    malfunctioning core -- but these are rare.
 106
 107    If threads are unavailable, this function will return 1.
 108
 109    This function isn't fast; the result should be cached.
 110 */
 111
 112 unsigned thread_memory_alignment(Display *dpy);
 113
 114 /* Returns the proper alignment for memory allocated by a thread that is
 115    shared with other threads.
 116
 117    A typical CPU accesses the system RAM through a cache, and this cache is
 118    divided up into cache lines - aligned chunks of memory typically 32 or 64
 119    bytes in size. Cache faults cause cache lines to be populated from
 120    memory. And, in a multiprocessing environment, two CPU cores can access the
 121    same cache line. The consequences of this depend on the CPU model:
 122
 123    - x86 implements the MESI protocol [1] to maintain cache coherency between
 124      CPU cores, with a serious performance penalty on both Intel [1] and AMD
 125      [2].  Intel uses the term "false sharing" to describe two CPU cores
 126      accessing different memory in the same cache line.
 127
 128    - ARM allows CPU caches to become inconsistent in this case [3]. Memory
 129      fences are needed to prevent horrible non-deterministic bugs from
 130      occurring.  Other CPU architectures have similar behavior to one of the
 131      above, depending on whether they are "strongly-orderered" (like x86), or
 132      "weakly-ordered" (like ARM).
 133
 134    Aligning multithreaded memory accesses according to the cache line size
 135    neatly sidesteps both issues.
 136
 137    One complication is that CPU caches are divided up into separate levels,
 138    and occasionally different levels can have different cache line sizes, so
 139    to be safe this function returns the largest cache line size among all
 140    levels.
 141
 142    If multithreading is not in effect, this returns sizeof(void *), because
 143    posix_memalign(3) will error out if the alignment is set to be smaller than
 144    that.
 145
 146    [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
 147       (Order Number: 248966-026): 2.1.5 Cache Hierarchy
 148    [2] Software Optimization Guide for AMD Family 10h Processors (Publication
 149        #40546): 11.3.4 Data Sharing between Caches
 150    [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/
 151 */
 152
 153 /*
 154    Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3)
 155    otherwise. As of SUSv2 (1997), and *probably* earlier, these are guaranteed
 156    to be thread-safe. C89 does not discuss threads, or thread safety;
 157    non-POSIX systems, watch out!
 158    http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html
 159    http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html
 160 */
 161
 162 /* int thread_malloc(void **ptr, Display *dpy, unsigned size); */
 163 #define thread_malloc(ptr, dpy, size) \
 164   (aligned_malloc((ptr), thread_memory_alignment(dpy), (size)))
 165
 166 /*
 167    This simply does a malloc aligned to thread_memory_alignment(). See
 168    above. On failure, an errno is returned, usually ENOMEM.
 169
 170    It's possible for two malloc()'d blocks to at least partially share the
 171    same cache line. When a different thread is writing to each block, then bad
 172    things can happen (see thread_memory_alignment). Better malloc()
 173    implementations will divide memory into pools belonging to one thread or
 174    another, causing memory blocks belonging to different threads to typically
 175    be located on different memory pages (see getpagesize(2)), mitigating the
 176    problem in question...but there's nothing stopping threads from passing
 177    memory to each other. And it's not practical for the system to align each
 178    block to 64 or 128 byte boundaries -- it's not uncommon to need lots and
 179    lots of 8-32 byte allocations, and the waste could become a bit excessive.
 180
 181    Some rules of thumb to take away from this:
 182
 183    1. Use thread_alloc for memory that might be written to by a thread that
 184    didn't originally allocate the object.
 185
 186    2. Use thread_alloc for memory that will be handed from one thread to
 187    another.
 188
 189    3. Use malloc if a single thread allocates, reads from, writes to, and
 190    frees the block of memory.
 191
 192    Oddly, I (Dave) have not seen this problem described anywhere else.
 193 */
 194
 195 #define thread_free(ptr) aligned_free(ptr)
 196
 197 #if HAVE_PTHREAD
 198 #       if defined _POSIX_THREADS && _POSIX_THREADS >= 0
 199 /*
 200    See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for
 201    Options and Option Groups
 202    http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02
 203 */
 204
 205 #               include <pthread.h>
 206
 207 /* Most PThread synchronization functions only fail when they are misused. */
 208 #               if defined NDEBUG
 209 #                       define PTHREAD_VERIFY(expr) (void)(expr)
 210 #               else
 211 #                       include <assert.h>
 212 #                       define PTHREAD_VERIFY(expr) assert(!(expr))
 213 #               endif
 214
 215 extern const pthread_mutex_t mutex_initializer;
 216 extern const pthread_cond_t cond_initializer;
 217
 218 #       else
 219                 /* Whatever caused HAVE_PTHREAD to be defined (configure script,
 220            usually) made a mistake if this is reached. */
 221                 /* Maybe this should be a warning. */
 222 #               error HAVE_PTHREAD is defined, but _POSIX_THREADS is not.
 223                 /* #undef HAVE_PTHREAD */
 224 #       endif
 225 #endif
 226
 227 struct threadpool
 228 {
 229 /*      This is always the same as the count parameter fed to threadpool_create().
 230         Here's a neat trick: if the threadpool is zeroed out with a memset, and
 231         threadpool_create() is never called to create 0 threads, then
 232         threadpool::count can be used to determine if the threadpool object was
 233         ever initialized. */
 234         unsigned count;
 235
 236         /* Copied from threadpool_class. No need for thread_create here, though. */
 237         size_t thread_size;
 238         void (*thread_run)(void *self);
 239         void (*thread_destroy)(void *self);
 240
 241         void *serial_threads;
 242
 243 #if HAVE_PTHREAD
 244         pthread_mutex_t mutex;
 245         pthread_cond_t cond;
 246
 247         /* Number of threads waiting for the startup signal. */
 248         unsigned parallel_pending;
 249
 250         /* Number of threads still running. During startup, this is the index of the thread currently being initialized. */
 251         unsigned parallel_unfinished;
 252
 253         pthread_t *parallel_threads;
 254 #endif
 255 };
 256
 257 /*
 258    The threadpool_* functions manage a group of threads (naturally).  Each
 259    thread owns an object described by a threadpool_class. When
 260    threadpool_run() is called, the specified func parameter is called on each
 261    thread in parallel. Sometime after calling threadpool_run(), call
 262    threadpool_wait(), which waits for each thread to return from
 263    threadpool_class::run().
 264
 265    Note that thread 0 runs on the thread from which threadpool_run is called
 266    from, so if each thread has an equal workload, then when threadpool_run
 267    returns, the other threads will be finished or almost finished. Adding code
 268    between threadpool_run and threadpool_wait increases the odds that
 269    threadpool_wait won't actually have to wait at all -- which is nice.
 270
 271    If the system does not provide threads, then these functions will fake it:
 272    everything will appear to work normally from the perspective of the caller,
 273    but when threadpool_run() is called, the "threads" are run synchronously;
 274    threadpool_wait() does nothing.
 275 */
 276
 277 struct threadpool_class
 278 {
 279         /* Size of the thread private object. */
 280         size_t size;
 281
 282 /*      Create the thread private object. Called in sequence for each thread
 283         (effectively) from threadpool_create.
 284     self: A pointer to size bytes of memory, allocated to hold the thread
 285           object.
 286     pool: The threadpool object that owns all the threads. If the threadpool
 287           is nested in another struct, try GET_PARENT_OBJ.
 288     id:   The ID for the thread; numbering starts at zero and goes up by one
 289           for each thread.
 290     Return 0 on success. On failure, return a value from errno.h; this will
 291     be returned from threadpool_create. */
 292         int (*create)(void *self, struct threadpool *pool, unsigned id);
 293
 294 /*      Destroys the thread private object. Called in sequence (though not always
 295         the same sequence as create).  Warning: During shutdown, it is possible
 296         for destroy() to be called while other threads are still in
 297         threadpool_run(). */
 298         void (*destroy)(void *self);
 299 };
 300
 301 /* Returns 0 on success, on failure can return ENOMEM, or any error code from
 302    threadpool_class.create. */
 303 int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count);
 304 void threadpool_destroy(struct threadpool *self);
 305
 306 void threadpool_run(struct threadpool *self, void (*func)(void *));
 307 void threadpool_wait(struct threadpool *self);
 308
 309 /*
 310    io_thread is meant to wrap blocking I/O operations in a one-shot worker
 311    thread, with cancel semantics.
 312
 313    Unlike threadpool_*, io_thread will not 'fake it'; it is up to the caller
 314    to figure out what to do if the system doesn't have threads. In
 315    particular, the start_routine passed to io_thread_create will never be
 316    called.
 317
 318    Clients of io_thread implement four functions:
 319    - state *process_start(...);
 320      Starts the worker thread.
 321    - bool process_is_done(state *);
 322      Returns true if the I/O operation is complete.
 323    - void process_cancel(state *);
 324      "Cancels" the I/O operation. The thread will continue to run, but it
 325      will detach, and clean itself up upon completion.
 326    - int process_finish(state *, ...)
 327      Waits for the I/O operation to complete, returns results, and cleans up.
 328
 329    Or:        /---\
 330              \/   |    /--> cancel
 331    start -> is_done --+
 332                        \--> finish
 333
 334    These functions follow a basic pattern:
 335    - start:
 336      1. Allocate a thread state object with thread_alloc. This state object
 337         contains an io_thread member.
 338      2. Save parameters from the start parameters to the state object.
 339      3. Start the thread with _io_thread_create. The thread receives the state
 340         object as its parameter.
 341    - On the worker thread:
 342      1. Do the I/O.
 343      2. Call io_thread_return.
 344        2a. If the result != 0, free the state object.
 345    - is_done:
 346      1. Just call _io_thread_is_done.
 347    - cancel:
 348      1. Call io_thread_cancel.
 349        1a. If the result != 0, free the state object.
 350    - finish:
 351      1. Call io_thread_finish.
 352      2. Copy results out of the state object as needed.
 353      3. Free the state object...or return it to the caller.
 354
 355    Incidentally, there may sometimes be asynchronous versions of blocking I/O
 356    functions (struct aiocb and friends, for example); these should be
 357    preferred over io_thread when performance is a concern.
 358  */
 359
 360 enum _io_thread_status
 361 {
 362         _io_thread_working, _io_thread_done, _io_thread_cancelled
 363 };
 364
 365 struct io_thread
 366 {
 367 #if HAVE_PTHREAD
 368         /* Common misconception: "volatile" should be applied to atomic variables,
 369            such as 'status', below. This is false, see
 370            <http://stackoverflow.com/q/2484980>. */
 371         enum _io_thread_status status;
 372         pthread_t thread;
 373 #else
 374         char gcc_emits_a_warning_when_the_struct_has_no_members;
 375 #endif
 376 };
 377
 378 #if HAVE_PTHREAD
 379
 380 void *io_thread_create(struct io_thread *self, void *parent, void *(*start_routine)(void *), Display *dpy, unsigned stacksize);
 381 /*
 382    Create the thread, returns NULL on failure.  Failure is usually due to
 383    ENOMEM, or the system doesn't support threads.
 384    self:          The io_thread object to be initialized.
 385    parent:        The parameter to start_routine.  The io_thread should be
 386                   contained within or be reachable from this.
 387    start_routine: The start routine for the worker thread.
 388    dpy:           The X11 Display, so that '*useThreads' is honored.
 389    stacksize:     The stack size for the thread. Set to 0 for the system
 390                   default.
 391    A note about stacksize: Linux, for example, uses a default of 2 MB of
 392    stack per thread. Now, this memory is usually committed on the first
 393    write, so lots of threads won't waste RAM, but it does mean that on a
 394    32-bit system, there's a limit of just under 1024 threads with the 2 MB
 395    default due to typical address space limitations of 2 GB for userspace
 396    processes.  And 1024 threads might not always be enough...
 397  */
 398
 399 int io_thread_return(struct io_thread *self);
 400 /* Called at the end of start_routine, from above. Returns non-zero if the
 401    thread has been cancelled, and cleanup needs to take place. */
 402
 403 int io_thread_is_done(struct io_thread *self);
 404 /* Call from the main thread. Returns non-zero if the thread finished. */
 405
 406 int io_thread_cancel(struct io_thread *self);
 407 /* Call from the main thread if the results from the worker thread are not
 408    needed. This cleans up the io_thread. Returns non-zero if cleanup needs
 409    to take place. */
 410
 411 void io_thread_finish(struct io_thread *self);
 412 /* Call from the main thread to wait for the worker thread to finish. This
 413    cleans up the io_thread. */
 414
 415 #else
 416
 417 #define IO_THREAD_STACK_MIN 0
 418
 419 #define io_thread_create(self, parent, start_routine, dpy, stacksize) NULL
 420 #define io_thread_return(self) 0
 421 #define io_thread_is_done(self) 1
 422 #define io_thread_cancel(self) 0
 423 #define io_thread_finish(self)
 424
 425 #endif
 426
 427 #if HAVE_PTHREAD
 428 #       define THREAD_DEFAULTS       "*useThreads: True",
 429 #       define THREAD_DEFAULTS_XLOCK "*useThreads: True\n"
 430 #       define THREAD_OPTIONS \
 431         {"-threads",    ".useThreads", XrmoptionNoArg, "True"}, \
 432         {"-no-threads", ".useThreads", XrmoptionNoArg, "False"},
 433 #else
 434 #       define THREAD_DEFAULTS
 435 #       define THREAD_DEFAULTS_XLOCK
 436 #       define THREAD_OPTIONS
 437 #endif
 438
 439 /*
 440    If a variable 'member' is known to be a member (named 'member_name') of a
 441    struct (named 'struct_name'), then this can find a pointer to the struct
 442    that contains it.
 443 */
 444 #define GET_PARENT_OBJ(struct_name, member_name, member) (struct_name *)((char *)member - offsetof(struct_name, member_name));
 445
 446 #endif