X-Git-Url: http://git.hungrycats.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=hacks%2Finterference.c;h=fe3084e4c2a78ee7ba3e46f2403b190d143bc3ae;hb=aa75c7476aeaa84cf3abc192b376a8b03c325213;hp=3a270abe17d86ec3a040bca5eff6dc9d6c234c61;hpb=4ade52359b6eba3621566dac79793a33aa4c915f;p=xscreensaver diff --git a/hacks/interference.c b/hacks/interference.c index 3a270abe..fe3084e4 100644 --- a/hacks/interference.c +++ b/hacks/interference.c @@ -30,9 +30,15 @@ * Last modified: Wed May 15 00:04:43 2013, * Dave Odell * Tuned performance; double-buffering is now off by default. - * Made animation speed independant of FPS. + * Made animation speed independent of FPS. * Added cleanup code, fixed a few glitches. * Added gratuitous #ifdefs. + * Last modified: Fri Feb 21 02:14:29 2014, + * Added support for SMP rendering. + * Tweaked math a bit re: performance. + * Last modified: Tue Dec 30 16:43:33 2014, + * Killed the black margin on the right and bottom. + * Reduced the default grid size to 2. */ #include @@ -40,8 +46,10 @@ #include "screenhack.h" -#ifdef HAVE_STDINT_H -# include +#include "thread_util.h" + +#ifdef HAVE_INTTYPES_H +# include #else typedef unsigned int uint32_t; @@ -59,7 +67,7 @@ Does double-buffering make sense? (gridsize = 2) USE_XIMAGE is off: Yes (-db: 4.1 FPS, -no-db: 2.9 FPS) XPutImage in strips: No (-db: 35.9 FPS, -no-db: 38.7 FPS) XPutImage, whole image: No (-db: 32.3 FPS, -no-db: 33.7 FPS) -MIT-SHM, whole image: Doesn't work anyway: (37.3 FPS) +MIT-SHM, whole image: Doesn't work anyway: (-no-db: 37.3 FPS) If gridsize = 1, XPutImage is slow when the XImage is one line at a time. XPutImage in strips: -db: 21.2 FPS, -no-db: 19.7 FPS @@ -78,7 +86,7 @@ quite a bit worse when gridsize = 1. * gridsize = 1. (And SHM is turned off.) */ #define USE_BIG_XIMAGE -/* Numbers are wave_table size, measured in unsigned integers. +/* Numbers are wave_table size, measured in # of unsigned integers. * FPS/radius = 50/radius = 800/radius = 1500/Big-O memory usage * * Use at most one of the following: @@ -103,8 +111,8 @@ static const char *interference_defaults [] = { ".background: black", ".foreground: white", "*count: 3", /* number of waves */ - "*gridsize: 4", /* pixel size, smaller values for better resolution */ - "*ncolors: 128", /* number of colours used */ + "*gridsize: 2", /* pixel size, smaller values for better resolution */ + "*ncolors: 192", /* number of colours used */ "*hue: 0", /* hue to use for base color (0-360) */ "*speed: 30", /* speed of wave origins moving around */ "*delay: 30000", /* or something */ @@ -122,9 +130,10 @@ static const char *interference_defaults [] = { #ifdef HAVE_XSHM_EXTENSION "*useSHM: True", /* use shared memory extension */ #endif /* HAVE_XSHM_EXTENSION */ -#ifdef USE_IPHONE +#ifdef HAVE_MOBILE "*ignoreRotation: True", #endif + THREAD_DEFAULTS 0 }; @@ -145,6 +154,7 @@ static XrmOptionDescRec interference_options [] = { { "-shm", ".useSHM", XrmoptionNoArg, "True" }, { "-no-shm", ".useSHM", XrmoptionNoArg, "False" }, #endif /* HAVE_XSHM_EXTENSION */ + THREAD_OPTIONS { 0, 0, 0, 0 } }; @@ -193,17 +203,18 @@ struct inter_context { */ int w; int h; + unsigned w_div_g, h_div_g; Colormap cmap; Screen *screen; + unsigned bits_per_pixel; XColor* pal; #ifndef USE_XIMAGE GC* gcs; #endif int radius; /* Not always the same as the X resource. */ double last_frame; -#ifdef USE_XIMAGE - uint32_t* row; -#endif + + struct threadpool threadpool; /* * lookup tables @@ -216,6 +227,18 @@ struct inter_context { struct inter_source* source; }; +struct inter_thread +{ + const struct inter_context *context; + unsigned thread_id; + +#ifdef USE_XIMAGE + uint32_t* row; +#endif + + unsigned* result_row; +}; + #ifdef HAVE_DOUBLE_BUFFER_EXTENSION # define TARGET(c) ((c)->back_buf ? (c)->back_buf : \ (c)->pix_buf ? (c)->pix_buf : (c)->win) @@ -319,7 +342,6 @@ static double fast_inv_table(unsigned x) #endif -/* Also destroys c->row. */ static void destroy_image(Display* dpy, struct inter_context* c) { #ifdef USE_XIMAGE @@ -330,13 +352,19 @@ static void destroy_image(Display* dpy, struct inter_context* c) } else # endif { - /* Also frees c->ximage->data, which isn't allocated by XCreateImage. */ + /* Don't let XDestroyImage free c->ximage->data. */ + thread_free(c->ximage->data); + c->ximage->data = NULL; XDestroyImage(c->ximage); } } - - free(c->row); #endif + + if(c->threadpool.count) + { + threadpool_destroy(&c->threadpool); + c->threadpool.count = 0; + } } static void inter_free(Display* dpy, struct inter_context* c) @@ -368,12 +396,17 @@ static void inter_free(Display* dpy, struct inter_context* c) free(c->source); } -static void abort_no_mem(void) +static void abort_on_error(int error) { - fprintf(stderr, "interference: %s\n", strerror(ENOMEM)); + fprintf(stderr, "interference: %s\n", strerror(error)); exit(1); } +static void abort_no_mem(void) +{ + abort_on_error(ENOMEM); +} + static void check_no_mem(Display* dpy, struct inter_context* c, void* ptr) { if(!ptr) { @@ -382,19 +415,242 @@ static void check_no_mem(Display* dpy, struct inter_context* c, void* ptr) } } -/* On allocation error, c->row == NULL. */ +static int inter_thread_create( + void* self_raw, + struct threadpool* pool, + unsigned id) +{ + struct inter_thread* self = (struct inter_thread*)self_raw; + const struct inter_context* c = GET_PARENT_OBJ(struct inter_context, threadpool, pool); + + self->context = c; + self->thread_id = id; + + self->result_row = malloc(c->w_div_g * sizeof(unsigned)); + if(!self->result_row) + return ENOMEM; + +#ifdef USE_XIMAGE + self->row = malloc(c->w_div_g * sizeof(uint32_t)); + if(!self->row) { + free(self->result_row); + return ENOMEM; + } +#endif + + return 0; +} + +static void inter_thread_destroy(void* self_raw) +{ + struct inter_thread* self = (struct inter_thread*)self_raw; +#ifdef USE_XIMAGE + free(self->row); +#endif + free(self->result_row); +} + +/* +A higher performance design would have input and output queues, so that when +worker threads finish with one frame, they can pull the next work order from +the queue and get started on it immediately, rather than going straight to +sleep. The current "single-buffered" design still provides reasonable +performance at low frame rates; high frame rates are noticeably less efficient. +*/ + +static void inter_thread_run(void* self_raw) +{ + struct inter_thread* self = (struct inter_thread*)self_raw; + const struct inter_context* c = self->context; + + int i, j, k; + unsigned result; + int dist1; + int g = c->grid_size; + + int dx, dy, g2 = 2 * g * g; + int px, py, px2g; + + int dist0, ddist; + +#ifdef USE_XIMAGE + unsigned img_y = g * self->thread_id; + void *scanline = c->ximage->data + c->ximage->bytes_per_line * g * self->thread_id; +#endif + + for(j = self->thread_id; j < c->h_div_g; j += c->threadpool.count) { + px = g/2; + py = j*g + px; + + memset(self->result_row, 0, c->w_div_g * sizeof(unsigned)); + + for(k = 0; k < c->count; k++) { + + dx = px - c->source[k].x; + dy = py - c->source[k].y; + + dist0 = dx*dx + dy*dy; + ddist = -2 * g * c->source[k].x; + + /* px2g = g*(px*2 + g); */ + px2g = g2; + + for(i = 0; i < c->w_div_g; i++) { + /* + * Discarded possibilities for improving performance here: + * 1. Using octagon-based distance estimation + * (Which causes giant octagons to appear.) + * 2. Square root approximation by reinterpret-casting IEEE floats to + * integers. + * (Which causes angles to appear when two waves interfere.) + */ + +/* int_float u; + u.f = dx*dx + dy*dy; + u.i = (1 << 29) + (u.i >> 1) - (1 << 22); + dist = u.f; */ + +#if defined USE_FAST_SQRT_BIGTABLE2 + dist1 = FAST_TABLE(dist0); +#elif defined USE_FAST_SQRT_HACKISH + dist1 = fast_log2(dist0); +#else + dist1 = sqrt(dist0); +#endif + + if(dist1 < c->radius) + self->result_row[i] += c->wave_height[dist1]; + + dist0 += px2g + ddist; + px2g += g2; + } + } + + for(i = 0; i < c->w_div_g; i++) { + + result = self->result_row[i]; + + /* It's slightly faster to do a subtraction or two before calculating the + * modulus. - D.O. */ + if(result >= c->colors) + { + result -= c->colors; + if(result >= c->colors) + result %= (unsigned)c->colors; + } + +#ifdef USE_XIMAGE + self->row[i] = c->pal[result].pixel; +#else + XFillRectangle(c->dpy, TARGET(c), c->gcs[result], g*i, g*j, g, g); +#endif /* USE_XIMAGE */ + } + +#ifdef USE_XIMAGE + /* Fill in these `gridsize' horizontal bits in the scanline */ + if(c->ximage->bits_per_pixel == 32) + { + uint32_t *ptr = (uint32_t *)scanline; + for(i = 0; i < c->w_div_g; i++) { + for(k = 0; k < g; k++) + ptr[g*i+k] = self->row[i]; + } + } + else if(c->ximage->bits_per_pixel == 24) + { + uint8_t *ptr = (uint8_t *)scanline; + for(i = 0; i < c->w_div_g; i++) { + for(k = 0; k < g; k++) { + uint32_t pixel = self->row[i]; + /* Might not work on big-endian. */ + ptr[0] = pixel; + ptr[1] = (pixel & 0x0000ff00) >> 8; + ptr[2] = (pixel & 0x00ff0000) >> 16; + ptr += 3; + } + } + } + else if(c->ximage->bits_per_pixel == 16) + { + uint16_t *ptr = (uint16_t *)scanline; + for(i = 0; i < c->w_div_g; i++) { + for(k = 0; k < g; k++) + ptr[g*i+k] = self->row[i]; + } + } + else if(c->ximage->bits_per_pixel == 8) + { + uint8_t *ptr = (uint8_t *)scanline; + for(i = 0; i < c->w_div_g; i++) { + for(k = 0; k < g; k++) + ptr[g*i+k] = self->row[i]; + } + } + else + { + for(i = 0; i < c->w_div_g; i++) { + for(k = 0; k < g; k++) + /* XPutPixel is thread safe as long as the XImage didn't have its + * bits_per_pixel changed. */ + XPutPixel(c->ximage, (g*i)+k, img_y, self->row[i]); + } + } + + /* Only the first scanline of the image has been filled in; clone that + scanline to the rest of the `gridsize' lines in the ximage */ + for(k = 0; k < (g-1); k++) + memcpy(c->ximage->data + (c->ximage->bytes_per_line * (img_y + k + 1)), + c->ximage->data + (c->ximage->bytes_per_line * img_y), + c->ximage->bytes_per_line); + +# ifndef USE_BIG_XIMAGE + /* Move the bits for this horizontal stripe to the server. */ +# ifdef HAVE_XSHM_EXTENSION + if (!c->use_shm) +# endif /* HAVE_XSHM_EXTENSION */ + XPutImage(c->dpy, TARGET(c), c->copy_gc, c->ximage, + 0, 0, 0, g*j, c->ximage->width, c->ximage->height); +# endif + +# if defined HAVE_XSHM_EXTENSION && !defined USE_BIG_XIMAGE + if (c->use_shm) +# endif + { +# if defined HAVE_XSHM_EXTENSION || defined USE_BIG_XIMAGE + scanline = (char *)scanline + c->ximage->bytes_per_line * g * c->threadpool.count; + img_y += g * c->threadpool.count; +# endif + } + +#endif /* USE_XIMAGE */ + } +} + +/* On allocation error, c->ximage == NULL. */ static void create_image( Display* dpy, struct inter_context* c, const XWindowAttributes* xgwa) { #ifdef USE_XIMAGE - c->row = malloc((c->w / c->grid_size) * sizeof(uint32_t)); - check_no_mem(dpy, c, c->row); + + /* Set the width so that each thread can work on a different line. */ + unsigned align = thread_memory_alignment(dpy) * 8 - 1; + unsigned wbits, w, h; + + c->w = xgwa->width; + c->h = xgwa->height; + c->w_div_g = (c->w + c->grid_size - 1) / c->grid_size; + c->h_div_g = (c->h + c->grid_size - 1) / c->grid_size; + w = c->w_div_g * c->grid_size; + h = c->h_div_g * c->grid_size; + + /* The width of a scan line, in *bits*. */ + wbits = (w * c->bits_per_pixel + align) & ~align; # ifdef HAVE_XSHM_EXTENSION /* - * interference used to put one row at a time to the X server. This changes + * Interference used to put one row at a time to the X server. This changes * today. * * XShmPutImage is asynchronous; the contents of the XImage must not be @@ -417,7 +673,7 @@ static void create_image( { c->ximage = create_xshm_image(dpy, xgwa->visual, xgwa->depth, ZPixmap, 0, &c->shm_info, - xgwa->width, xgwa->height); + wbits / c->bits_per_pixel, h); if (!c->ximage) c->use_shm = False; /* If create_xshm_image fails, it will not be attempted again. */ @@ -431,35 +687,58 @@ static void create_image( c->ximage = XCreateImage(dpy, xgwa->visual, xgwa->depth, ZPixmap, 0, 0, /* depth, fmt, offset, data */ - xgwa->width, /* width */ + w, /* width */ # ifdef USE_BIG_XIMAGE - xgwa->height, /* height */ + h, /* height */ # else c->grid_size, /* height */ # endif - 8, 0); /* pad, bpl */ + 8, wbits / 8); /* pad, bpl */ if(c->ximage) { - c->ximage->data = (char *) - calloc(c->ximage->height, c->ximage->bytes_per_line); - - if(!c->ximage->data) + if(thread_malloc((void **)&c->ximage->data, dpy, + c->ximage->height * c->ximage->bytes_per_line)) { - free(c->ximage); + XFree(c->ximage); c->ximage = NULL; } } } - if(!c->ximage) + check_no_mem(dpy, c, c->ximage); +#endif /* USE_XIMAGE */ + + { + static const struct threadpool_class cls = { - free(c->row); - c->row = 0; - } + sizeof(struct inter_thread), + inter_thread_create, + inter_thread_destroy + }; + + int error = threadpool_create( + &c->threadpool, + &cls, + dpy, +#if defined USE_XIMAGE && defined USE_BIG_XIMAGE + hardware_concurrency(dpy) +#else + 1 + /* At least three issues with threads without USE_BIG_XIMAGE: + * 1. Most of Xlib isn't thread safe without XInitThreads. + * 2. X(Un)LockDisplay would need to be called for each line, which is terrible. + * 3. There's only one XImage buffer at the moment. + */ +#endif + ); - check_no_mem(dpy, c, c->row); -#endif /* USE_XIMAGE */ + if(error) { + c->threadpool.count = 0; /* See the note in thread_util.h. */ + inter_free(dpy, c); + abort_on_error(error); + } + } } static void create_pix_buf(Display* dpy, Window win, struct inter_context *c, @@ -500,7 +779,7 @@ static void inter_init(Display* dpy, Window win, struct inter_context* c) unsigned long valmask = 0; #endif -# ifdef HAVE_COCOA /* Don't second-guess Quartz's double-buffering */ +# ifdef HAVE_JWXYZ /* Don't second-guess Quartz's double-buffering */ dbuf = False; # endif @@ -509,13 +788,14 @@ static void inter_init(Display* dpy, Window win, struct inter_context* c) c->dpy = dpy; c->win = win; + c->delay = get_integer_resource(dpy, "delay", "Integer"); XGetWindowAttributes(c->dpy, c->win, &xgwa); - c->w = xgwa.width; - c->h = xgwa.height; c->cmap = xgwa.colormap; c->screen = xgwa.screen; + c->bits_per_pixel = get_bits_per_pixel(c->dpy, xgwa.depth); + check_no_mem(dpy, c, (void *)(ptrdiff_t)c->bits_per_pixel); #ifdef HAVE_XSHM_EXTENSION c->use_shm = get_boolean_resource(dpy, "useSHM", "Boolean"); @@ -624,6 +904,7 @@ static void inter_init(Display* dpy, Window win, struct inter_context* c) c->radius = radius; #endif + if (c->radius < 1) c->radius = 1; c->wave_height = calloc(c->radius, sizeof(unsigned)); check_no_mem(dpy, c, c->wave_height); @@ -662,27 +943,14 @@ static void inter_init(Display* dpy, Window win, struct inter_context* c) (c->h/2 + ((int)(cos(c->source[i].y_theta)*((float)c->h/2.0)))) /* - * This is rather suboptimal. Calculating the distance per-pixel is going to + * This is somewhat suboptimal. Calculating the distance per-pixel is going to * be a lot slower than using now-ubiquitous SIMD CPU instructions to do four - * or eight pixels at a time. Plus, this could be almost trivially - * parallelized, what with all the multi-core hardware nowadays. + * or eight pixels at a time. */ static unsigned long do_inter(struct inter_context* c) { - int i, j, k; - unsigned result; - int dist; - int g = c->grid_size; - unsigned w_div_g = c->w/g; - - int dx, dy; - int px, py; - -#ifdef USE_XIMAGE - unsigned img_y = 0; - void *scanline = c->ximage->data; -#endif + int i; double now; float elapsed; @@ -709,134 +977,8 @@ static unsigned long do_inter(struct inter_context* c) c->source[i].y = source_y(c, i); } - for(j = 0; j < c->h/g; j++) { - for(i = 0; i < w_div_g; i++) { - result = 0; - px = i*g + g/2; - py = j*g + g/2; - for(k = 0; k < c->count; k++) { - - dx = px - c->source[k].x; - dy = py - c->source[k].y; - - /* - * Other possibilities for improving performance here: - * 1. Using octagon-based distance estimation - * (Which causes giant octagons to appear.) - * 2. Square root approximation by reinterpret-casting IEEE floats to - * integers. - * (Which causes angles to appear when two waves interfere.) - */ - -/* int_float u; - u.f = dx*dx + dy*dy; - u.i = (1 << 29) + (u.i >> 1) - (1 << 22); - dist = u.f; */ - -#if defined USE_FAST_SQRT_BIGTABLE2 - dist = dx*dx + dy*dy; - dist = FAST_TABLE(dist); -#elif defined USE_FAST_SQRT_HACKISH - dist = fast_log2(dx*dx + dy*dy); -#else - dist = sqrt(dx*dx + dy*dy); -#endif - - result += (dist >= c->radius ? 0 : c->wave_height[dist]); - } - - /* It's slightly faster to do a subtraction or two before calculating the - * modulus. - D.O. */ - if(result >= c->colors) - { - result -= c->colors; - if(result >= c->colors) - result %= (unsigned)c->colors; - } - -#ifdef USE_XIMAGE - c->row[i] = c->pal[result].pixel; -#else - XFillRectangle(c->dpy, TARGET(c), c->gcs[result], g*i, g*j, g, g); -#endif /* USE_XIMAGE */ - } - -#ifdef USE_XIMAGE - /* Fill in these `gridsize' horizontal bits in the scanline */ - if(c->ximage->bits_per_pixel == 32) - { - uint32_t *ptr = (uint32_t *)scanline; - for(i = 0; i < w_div_g; i++) { - for(k = 0; k < g; k++) - ptr[g*i+k] = c->row[i]; - } - } - else if(c->ximage->bits_per_pixel == 24) - { - uint8_t *ptr = (uint8_t *)scanline; - for(i = 0; i < w_div_g; i++) { - for(k = 0; k < g; k++) { - uint32_t pixel = c->row[i]; - /* Might not work on big-endian. */ - ptr[0] = pixel; - ptr[1] = (pixel & 0x0000ff00) >> 8; - ptr[2] = (pixel & 0x00ff0000) >> 16; - ptr += 3; - } - } - } - else if(c->ximage->bits_per_pixel == 16) - { - uint16_t *ptr = (uint16_t *)scanline; - for(i = 0; i < w_div_g; i++) { - for(k = 0; k < g; k++) - ptr[g*i+k] = c->row[i]; - } - } - else if(c->ximage->bits_per_pixel == 8) - { - uint8_t *ptr = (uint8_t *)scanline; - for(i = 0; i < w_div_g; i++) { - for(k = 0; k < g; k++) - ptr[g*i+k] = c->row[i]; - } - } - else - { - for(i = 0; i < w_div_g; i++) { - for(k = 0; k < g; k++) - XPutPixel(c->ximage, (g*i)+k, img_y, c->row[i]); - } - } - - /* Only the first scanline of the image has been filled in; clone that - scanline to the rest of the `gridsize' lines in the ximage */ - for(k = 0; k < (g-1); k++) - memcpy(c->ximage->data + (c->ximage->bytes_per_line * (img_y + k + 1)), - c->ximage->data + (c->ximage->bytes_per_line * img_y), - c->ximage->bytes_per_line); - -# ifndef USE_BIG_XIMAGE - /* Move the bits for this horizontal stripe to the server. */ -# ifdef HAVE_XSHM_EXTENSION - if (!c->use_shm) -# endif /* HAVE_XSHM_EXTENSION */ - XPutImage(c->dpy, TARGET(c), c->copy_gc, c->ximage, - 0, 0, 0, g*j, c->ximage->width, c->ximage->height); -# endif - -# if defined HAVE_XSHM_EXTENSION && !defined USE_BIG_XIMAGE - if (c->use_shm) -# endif - { -# if defined HAVE_XSHM_EXTENSION || defined USE_BIG_XIMAGE - scanline = (char *)scanline + c->ximage->bytes_per_line * g; - img_y += g; -# endif - } - -#endif /* USE_XIMAGE */ - } + threadpool_run(&c->threadpool, inter_thread_run); + threadpool_wait(&c->threadpool); #ifdef HAVE_XSHM_EXTENSION if (c->use_shm) @@ -899,15 +1041,12 @@ interference_reshape (Display *dpy, Window window, void *closure, { struct inter_context *c = (struct inter_context *) closure; XWindowAttributes xgwa; - Bool dbuf = (c->pix_buf + Bool dbuf = (!!c->pix_buf # ifdef HAVE_DOUBLE_BUFFER_EXTENSION || c->back_buf # endif ); - c->w = w; - c->h = h; - #ifdef USE_XIMAGE destroy_image(dpy, c); c->ximage = 0;