+/*
+ * Per-thread constructor for the interference worker threads.
+ *
+ * self_raw: storage for this worker's struct inter_thread.
+ * pool:     the threadpool member embedded in the owning struct
+ *           inter_context; the context is recovered from it with
+ *           GET_PARENT_OBJ.
+ * id:       index of this worker, recorded as thread_id.
+ *
+ * Allocates a result_row accumulator of w_div_g unsigned entries and, when
+ * USE_XIMAGE is defined, a row buffer of w_div_g uint32_t pixels.
+ *
+ * Returns 0 on success, or ENOMEM if an allocation fails. On failure nothing
+ * allocated here remains allocated, and result_row is reset to NULL rather
+ * than left pointing at freed memory.
+ */
+static int inter_thread_create(
+  void* self_raw,
+  struct threadpool* pool,
+  unsigned id)
+{
+  struct inter_thread* self = (struct inter_thread*)self_raw;
+  const struct inter_context* c = GET_PARENT_OBJ(struct inter_context, threadpool, pool);
+
+  self->context = c;
+  self->thread_id = id;
+
+  /* calloc fails cleanly if count * size would overflow, unlike a
+   * hand-multiplied malloc argument. */
+  self->result_row = calloc(c->w_div_g, sizeof(unsigned));
+  if(!self->result_row)
+    return ENOMEM;
+
+#ifdef USE_XIMAGE
+  self->row = calloc(c->w_div_g, sizeof(uint32_t));
+  if(!self->row) {
+    free(self->result_row);
+    self->result_row = NULL; /* do not leave a dangling pointer behind */
+    return ENOMEM;
+  }
+#endif
+
+  return 0;
+}
+
+/*
+ * Per-thread destructor: releases the buffers owned by one worker
+ * (row, when USE_XIMAGE is defined, and result_row). The struct
+ * inter_thread storage itself is not freed here.
+ */
+static void inter_thread_destroy(void* self_raw)
+{
+  struct inter_thread* worker = self_raw;
+
+#ifdef USE_XIMAGE
+  free(worker->row);
+#endif
+  free(worker->result_row);
+}
+
+/*
+A higher performance design would have input and output queues, so that when
+worker threads finish with one frame, they can pull the next work order from
+the queue and get started on it immediately, rather than going straight to
+sleep. The current "single-buffered" design still provides reasonable
+performance at low frame rates; high frame rates are noticeably less efficient.
+
+inter_thread_run is the per-thread work routine. Each call renders the grid
+rows j = thread_id, thread_id + threadpool.count, ... below h_div_g, so the
+rows are interleaved between the workers. For every such row it:
+  1. zeroes result_row and, for each wave source, walks the w_div_g cells
+     from left to right, adding wave_height[dist1] to the cell whenever the
+     integer distance dist1 is below radius;
+  2. reduces each accumulated value modulo colors and either stores the
+     matching palette pixel in row[] (USE_XIMAGE) or draws a g-by-g rectangle
+     directly with XFillRectangle;
+  3. with USE_XIMAGE, expands row[] into one image scanline (each cell
+     repeated g times, written according to bits_per_pixel), copies that
+     scanline to the g-1 lines below it, and, unless USE_BIG_XIMAGE is
+     defined or shared memory is in use, sends the image with XPutImage.
+
+The squared distance dist0 is not recomputed per cell: it is advanced by
+px2g + ddist, with px2g growing by 2*g*g on every step.
+NOTE(review): that increment equals the exact difference between adjacent
+cell centres only when g is even, because px = g/2 truncates for odd g; the
+commented-out g*(px*2 + g) seed would be exact. Confirm this is intentional.
+*/
+
+static void inter_thread_run(void* self_raw)
+{
+ struct inter_thread* self = (struct inter_thread*)self_raw;
+ const struct inter_context* c = self->context;
+
+ int i, j, k;
+ unsigned result;
+ int dist1;
+ int g = c->grid_size;
+
+ int dx, dy, g2 = 2 * g * g;
+ int px, py, px2g;
+
+ int dist0, ddist;
+
+#ifdef USE_XIMAGE
+ unsigned img_y = g * self->thread_id; /* image line where this thread's first stripe starts */
+ void *scanline = c->ximage->data + c->ximage->bytes_per_line * g * self->thread_id;
+#endif
+
+ for(j = self->thread_id; j < c->h_div_g; j += c->threadpool.count) { /* every count-th grid row */
+ px = g/2;
+ py = j*g + px; /* (px, py) is the centre of the first cell of row j */
+
+ memset(self->result_row, 0, c->w_div_g * sizeof(unsigned));
+
+ for(k = 0; k < c->count; k++) {
+
+ dx = px - c->source[k].x;
+ dy = py - c->source[k].y;
+
+ dist0 = dx*dx + dy*dy; /* squared distance from the first cell to source k */
+ ddist = -2 * g * c->source[k].x;
+
+ /* px2g = g*(px*2 + g); */
+ px2g = g2;
+
+ for(i = 0; i < c->w_div_g; i++) {
+ /*
+ * Discarded possibilities for improving performance here:
+ * 1. Using octagon-based distance estimation
+ * (Which causes giant octagons to appear.)
+ * 2. Square root approximation by reinterpret-casting IEEE floats to
+ * integers.
+ * (Which causes angles to appear when two waves interfere.)
+ */
+
+/* int_float u;
+ u.f = dx*dx + dy*dy;
+ u.i = (1 << 29) + (u.i >> 1) - (1 << 22);
+ dist = u.f; */
+
+#if defined USE_FAST_SQRT_BIGTABLE2
+ dist1 = FAST_TABLE(dist0);
+#elif defined USE_FAST_SQRT_HACKISH
+ dist1 = fast_log2(dist0);
+#else
+ dist1 = sqrt(dist0);
+#endif
+
+ if(dist1 < c->radius)
+ self->result_row[i] += c->wave_height[dist1];
+
+ dist0 += px2g + ddist; /* advance to the next cell's squared distance */
+ px2g += g2;
+ }
+ }
+
+ for(i = 0; i < c->w_div_g; i++) {
+
+ result = self->result_row[i];
+
+ /* It's slightly faster to do a subtraction or two before calculating the
+ * modulus. - D.O. */
+ if(result >= c->colors)
+ {
+ result -= c->colors;
+ if(result >= c->colors)
+ result %= (unsigned)c->colors;
+ }
+
+#ifdef USE_XIMAGE
+ self->row[i] = c->pal[result].pixel;
+#else
+ XFillRectangle(c->dpy, TARGET(c), c->gcs[result], g*i, g*j, g, g);
+#endif /* USE_XIMAGE */
+ }
+
+#ifdef USE_XIMAGE
+ /* Fill in these `gridsize' horizontal bits in the scanline */
+ if(c->ximage->bits_per_pixel == 32)
+ {
+ uint32_t *ptr = (uint32_t *)scanline;
+ for(i = 0; i < c->w_div_g; i++) {
+ for(k = 0; k < g; k++)
+ ptr[g*i+k] = self->row[i];
+ }
+ }
+ else if(c->ximage->bits_per_pixel == 24)
+ {
+ uint8_t *ptr = (uint8_t *)scanline;
+ for(i = 0; i < c->w_div_g; i++) {
+ for(k = 0; k < g; k++) {
+ uint32_t pixel = self->row[i];
+ /* Might not work on big-endian. */
+ ptr[0] = pixel;
+ ptr[1] = (pixel & 0x0000ff00) >> 8;
+ ptr[2] = (pixel & 0x00ff0000) >> 16;
+ ptr += 3;
+ }
+ }
+ }
+ else if(c->ximage->bits_per_pixel == 16)
+ {
+ uint16_t *ptr = (uint16_t *)scanline;
+ for(i = 0; i < c->w_div_g; i++) {
+ for(k = 0; k < g; k++)
+ ptr[g*i+k] = self->row[i];
+ }
+ }
+ else if(c->ximage->bits_per_pixel == 8)
+ {
+ uint8_t *ptr = (uint8_t *)scanline;
+ for(i = 0; i < c->w_div_g; i++) {
+ for(k = 0; k < g; k++)
+ ptr[g*i+k] = self->row[i];
+ }
+ }
+ else
+ {
+ for(i = 0; i < c->w_div_g; i++) {
+ for(k = 0; k < g; k++)
+ /* XPutPixel is thread safe as long as the XImage didn't have its
+ * bits_per_pixel changed. */
+ XPutPixel(c->ximage, (g*i)+k, img_y, self->row[i]);
+ }
+ }
+
+ /* Only the first scanline of the image has been filled in; clone that
+ scanline to the rest of the `gridsize' lines in the ximage */
+ for(k = 0; k < (g-1); k++)
+ memcpy(c->ximage->data + (c->ximage->bytes_per_line * (img_y + k + 1)),
+ c->ximage->data + (c->ximage->bytes_per_line * img_y),
+ c->ximage->bytes_per_line);
+
+# ifndef USE_BIG_XIMAGE
+ /* Move the bits for this horizontal stripe to the server. */
+# ifdef HAVE_XSHM_EXTENSION
+ if (!c->use_shm)
+# endif /* HAVE_XSHM_EXTENSION */
+ XPutImage(c->dpy, TARGET(c), c->copy_gc, c->ximage,
+ 0, 0, 0, g*j, c->ximage->width, c->ximage->height);
+# endif
+
+# if defined HAVE_XSHM_EXTENSION && !defined USE_BIG_XIMAGE
+ if (c->use_shm)
+# endif
+ {
+# if defined HAVE_XSHM_EXTENSION || defined USE_BIG_XIMAGE
+ scanline = (char *)scanline + c->ximage->bytes_per_line * g * c->threadpool.count; /* step to this thread's next stripe */
+ img_y += g * c->threadpool.count;
+# endif
+ }
+
+#endif /* USE_XIMAGE */
+ }
+}
+
+/* On allocation error, c->ximage == NULL. */