1 /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */
2 /* vi: set ts=4 tw=128: */
5 thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com>
7 Permission to use, copy, modify, distribute, and sell this software and its
8 documentation for any purpose is hereby granted without fee, provided that
9 the above copyright notice appear in all copies and that both that
10 copyright notice and this permission notice appear in supporting
11 documentation. No representations are made about the suitability of this
12 software for any purpose. It is provided "as is" without express or
19 /* thread_util.h because C11 took threads.h. */
21 /* And POSIX threads because there aren't too many systems that support C11
22 threads that don't already support POSIX threads.
23 ...Not that it would be too hard to convert from the one to the other.
28 Multithreading is a great way to add insidious and catastrophic bugs to
29 a program. Make sure you understand the risks.
31 You may wish to become familiar with race conditions, deadlocks, mutexes,
32 condition variables, and, in lock-free code, memory ordering, cache
33 hierarchies, etc., before working with threads.
35 On the other hand, if a screenhack locks up or crashes, it's not the
36 end of the world: XScreenSaver won't unlock the screen if that happens.
40 The basic strategy for applying threads to a CPU-hungry screenhack:
42 1. Find the CPU-hungry part of the hack.
44 2. Change that part so the workload can be divided into N equal-sized
45 loads, where N is the number of CPU cores in the machine.
46 (For example: with two cores, one core could render even scan lines,
47 and the other odd scan lines.)
49 2a. Keeping in mind that two threads should not write to the same memory
50 at the same time. Specifically, they should not be writing to the
51 same cache line at the same time -- so align memory allocation and
52 memory accesses to the system cache line size as necessary.
54 3. On screenhack_init, create a threadpool object. This creates N worker
55 threads, and each thread creates and owns a user-defined struct.
56 After creation, the threads are idle.
58 4. On screenhack_frame, call threadpool_run(). Each thread simultaneously
59 wakes up, calls a function that does one of the equal-sized loads,
60 then goes back to sleep. The main thread then calls threadpool_wait(),
61 which returns once all the worker threads have finished.
63 Using this to implement SMP won't necessarily increase performance by
64 a factor of N (again, N is the number of CPU cores). Both X11 and Cocoa on OS X can
65 impose a not-insignificant amount of overhead even when simply blitting
66 full-screen XImages @ 30 FPS.
68 On systems with simultaneous multithreading (a.k.a. Hyper-threading),
69 performance gains may be slim to non-existent.
72 #include "aligned_malloc.h"
75 /* For HAVE_PTHREAD. */
82 /* For _POSIX_THREADS. */
89 # include <X11/Xlib.h>
92 unsigned hardware_concurrency(Display *dpy);
93 /* This is supposed to return the number of available CPU cores. This number
94 isn't necessarily constant: a system administrator can hotplug or
95 enable/disable CPUs on certain systems, or the system can deactivate a
96 malfunctioning core -- but these are rare.
98 If threads are unavailable, this function will return 1.
100 This function isn't fast; the result should be cached.
103 unsigned thread_memory_alignment(Display *dpy);
105 /* Returns the proper alignment for memory allocated by a thread that is
106 shared with other threads.
108 A typical CPU accesses the system RAM through a cache, and this cache is
109 divided up into cache lines - aligned chunks of memory typically 32 or 64
110 bytes in size. Cache faults cause cache lines to be populated from
111 memory. And, in a multiprocessing environment, two CPU cores can access the
112 same cache line. The consequences of this depend on the CPU model:
114 - x86 implements the MESI protocol [1] to maintain cache coherency between
115 CPU cores, with a serious performance penalty on both Intel [1] and AMD
116 [2]. Intel uses the term "false sharing" to describe two CPU cores
117 accessing different memory in the same cache line.
119 - ARM allows CPU caches to become inconsistent in this case [3]. Memory
120 fences are needed to prevent horrible non-deterministic bugs from
121 occurring. Other CPU architectures have similar behavior to one of the
122 above, depending on whether they are "strongly-ordered" (like x86), or
123 "weakly-ordered" (like ARM).
125 Aligning multithreaded memory accesses according to the cache line size
126 neatly sidesteps both issues.
128 One complication is that CPU caches are divided up into separate levels,
129 and occasionally different levels can have different cache line sizes, so
130 to be safe this function returns the largest cache line size among all
133 If multithreading is not in effect, this returns sizeof(void *), because
134 posix_memalign(3) will error out if the alignment is set to be smaller than
137 [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
138 (Order Number: 248966-026): 2.1.5 Cache Hierarchy
139 [2] Software Optimization Guide for AMD Family 10h Processors (Publication
140 #40546): 11.3.4 Data Sharing between Caches
141 [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/
145 Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3)
146 otherwise. As of SUSv2 (1997), and *probably* earlier, these are guaranteed
147 to be thread-safe. C89 does not discuss threads, or thread safety;
148 non-POSIX systems, watch out!
149 http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html
150 http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html
/* thread_malloc(ptr, dpy, size)
   Conceptual prototype:
     int thread_malloc(void **ptr, Display *dpy, unsigned size);
   Expands to a single aligned_malloc() call whose arguments are, in order:
   ptr, the value of thread_memory_alignment(dpy), and size. The macro
   evaluates to whatever aligned_malloc() returns. */
#define thread_malloc(out_ptr, display, nbytes) \
  (aligned_malloc((out_ptr), thread_memory_alignment(display), (nbytes)))
158 This simply does a malloc aligned to thread_memory_alignment(). See
159 above. On failure, an errno is returned, usually ENOMEM.
161 It's possible for two malloc()'d blocks to at least partially share the
162 same cache line. When a different thread is writing to each block, then bad
163 things can happen (see thread_memory_alignment). Better malloc()
164 implementations will divide memory into pools belonging to one thread or
165 another, causing memory blocks belonging to different threads to typically
166 be located on different memory pages (see getpagesize(2)), mitigating the
167 problem in question...but there's nothing stopping threads from passing
168 memory to each other. And it's not practical for the system to align each
169 block to 64 or 128 byte boundaries -- it's not uncommon to need lots and
170 lots of 8-32 byte allocations, and the waste could become a bit excessive.
172 Some rules of thumb to take away from this:
174 1. Use thread_malloc for memory that might be written to by a thread that
175 didn't originally allocate the object.
177 2. Use thread_malloc for memory that will be handed from one thread to
180 3. Use malloc if a single thread allocates, reads from, writes to, and
181 frees the block of memory.
183 Oddly, I (Dave) have not seen this problem described anywhere else.
/* thread_free(ptr): counterpart to thread_malloc. Forwards ptr unchanged to
   aligned_free(). */
#define thread_free(block) aligned_free(block)
189 # if defined _POSIX_THREADS && _POSIX_THREADS >= 0
191 See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for
192 Options and Option Groups
193 http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02
196 # include <pthread.h>
198 /* Most PThread synchronization functions only fail when they are misused. */
200 # define PTHREAD_VERIFY(expr) (void)(expr)
203 # define PTHREAD_VERIFY(expr) assert(!(expr))
206 extern const pthread_mutex_t mutex_initializer;
207 extern const pthread_cond_t cond_initializer;
210 /* Whatever caused HAVE_PTHREAD to be defined (configure script,
211 usually) made a mistake if this is reached. */
212 /* Maybe this should be a warning. */
213 # error HAVE_PTHREAD is defined, but _POSIX_THREADS is not.
214 /* #undef HAVE_PTHREAD */
220 /* This is always the same as the count parameter fed to threadpool_create().
221 Here's a neat trick: if the threadpool is zeroed out with a memset, and
222 threadpool_create() is never called to create 0 threads, then
223 threadpool::count can be used to determine if the threadpool object was
227 /* Copied from threadpool_class. No need for thread_create here, though. */
229 void (*thread_run)(void *self);
230 void (*thread_destroy)(void *self);
232 void *serial_threads;
235 pthread_mutex_t mutex;
238 /* Number of threads waiting for the startup signal. */
239 unsigned parallel_pending;
241 /* Number of threads still running. During startup, this is the index of the thread currently being initialized. */
242 unsigned parallel_unfinished;
244 pthread_t *parallel_threads;
249 The threadpool_* functions manage a group of threads (naturally). Each
250 thread owns an object described by a threadpool_class. When
251 threadpool_run() is called, the specified func parameter is called on each
252 thread in parallel. Sometime after calling threadpool_run(), call
253 threadpool_wait(), which waits for each thread to return from
254 threadpool_class::run().
256 Note that thread 0 runs on the thread from which threadpool_run is called,
257 so if each thread has an equal workload, then when threadpool_run
258 returns, the other threads will be finished or almost finished. Adding code
259 between threadpool_run and threadpool_wait increases the odds that
260 threadpool_wait won't actually have to wait at all -- which is nice.
262 If the system does not provide threads, then these functions will fake it:
263 everything will appear to work normally from the perspective of the caller,
264 but when threadpool_run() is called, the "threads" are run synchronously;
265 threadpool_wait() does nothing.
268 struct threadpool_class
270 /* Size of the thread private object. */
273 /* Create the thread private object. Called in sequence for each thread
274 (effectively) from threadpool_create. self: A pointer to size bytes of
275 memory, allocated to hold the thread object. pool: The threadpool object
276 that owns all the threads. If the threadpool is nested in another struct,
277 try GET_PARENT_OBJ. id: The ID for the thread; numbering starts at zero
278 and goes up by one for each thread. Return 0 on success. On failure,
279 return a value from errno.h; this will be returned from
280 threadpool_create. */
281 int (*create)(void *self, struct threadpool *pool, unsigned id);
283 /* Destroys the thread private object. Called in sequence (though not always
284 the same sequence as create). Warning: During shutdown, it is possible
285 for destroy() to be called while other threads are still in
287 void (*destroy)(void *self);
290 /* Returns 0 on success, on failure can return ENOMEM, or any error code from
291 threadpool_class.create. */
292 int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count);
293 void threadpool_destroy(struct threadpool *self);
295 void threadpool_run(struct threadpool *self, void (*func)(void *));
296 void threadpool_wait(struct threadpool *self);
299 # define THREAD_DEFAULTS \
301 # define THREAD_OPTIONS \
302 {"-threads", ".useThreads", XrmoptionNoArg, "True"}, \
303 {"-no-threads", ".useThreads", XrmoptionNoArg, "False"},
305 # define THREAD_DEFAULTS
306 # define THREAD_OPTIONS
310 If a variable 'member' is known to be a member (named 'member_name') of a
311 struct (named 'struct_name'), then this can find a pointer to the struct
/* GET_PARENT_OBJ(struct_name, member_name, member)
   Given 'member', a pointer to the field named 'member_name' inside an
   object of type 'struct_name', evaluates to a 'struct_name *' pointing at
   the enclosing object. It does this by stepping back
   offsetof(struct_name, member_name) bytes from 'member'.

   The expansion is one fully parenthesized expression with no trailing
   semicolon, and 'member' is parenthesized before the cast, so:
     - it can appear inside larger expressions, e.g. GET_PARENT_OBJ(...)->x;
     - 'member' may itself be an expression such as 'base + 2'.
   Callers terminate the statement with their own ';' as usual. */
#define GET_PARENT_OBJ(struct_name, member_name, member) \
  ((struct_name *)((char *)(member) - offsetof(struct_name, member_name)))