+/* Zero-extend the 16 bytes of `px` to 16-bit lanes and add them to the
+ * running low-half / high-half accumulators. */
+static inline void glow_accumulate(__m128i px, __m128i *lo, __m128i *hi)
+{
+    const __m128i zero = _mm_setzero_si128();
+
+    *lo = _mm_add_epi16(*lo, _mm_unpacklo_epi8(px, zero));
+    *hi = _mm_add_epi16(*hi, _mm_unpackhi_epi8(px, zero));
+}
+
+/*
+ * SSE2 glow blur over st->palaka1, 16 bytes per iteration.
+ *
+ * For every byte the weighted sum of nine taps is formed: the bytes at
+ * offsets +0, +4 and +8 in palaka1 and at the same offsets width*4 bytes
+ * before and after it (presumably the pixel rows above and below -- confirm
+ * against the buffer layout).  The tap at +4 on the middle row is weighted
+ * by 8, all others by 1.
+ *
+ * sum >> 4 is written back into palaka1 at offset +4 (so the blur feeds on
+ * its own output), and sum >> 3, saturated to 255, is written to palaka2 at
+ * offset +4.
+ *
+ * The loop reads before palaka1 and past width*height*4 bytes, and uses
+ * aligned loads at offset +0 of each row; the buffers are assumed to be
+ * padded and aligned accordingly by whoever allocates them.
+ */
+static void glow_blur(struct state *st)
+{
+    unsigned char *row = st->palaka1;
+    unsigned char *glow = st->palaka2;
+    unsigned char *above = st->palaka1 - (st->width * 4);
+    unsigned char *below = st->palaka1 + (st->width * 4);
+    const __m128i zero = _mm_setzero_si128();
+    unsigned int off, total;
+
+    total = st->width * st->height * 4;
+    for (off = 0; off < total; off += 16)
+    {
+        __m128i px, lo, hi;
+
+        _mm_prefetch((const void *)&row[off+16],_MM_HINT_T0);
+        _mm_prefetch((const void *)&above[off+16],_MM_HINT_T0);
+        _mm_prefetch((const void *)&below[off+16],_MM_HINT_T0);
+
+        /* Middle row, offset +0: seeds the accumulators (weight 1). */
+        px = _mm_load_si128((const __m128i *)&row[off]);
+        lo = _mm_unpacklo_epi8(px, zero);
+        hi = _mm_unpackhi_epi8(px, zero);
+
+        /* Middle row, offset +4: the centre tap, weighted by 8. */
+        px = _mm_loadu_si128((const __m128i *)&row[off + 4]);
+        lo = _mm_add_epi16(lo, _mm_slli_epi16(_mm_unpacklo_epi8(px, zero), 3));
+        hi = _mm_add_epi16(hi, _mm_slli_epi16(_mm_unpackhi_epi8(px, zero), 3));
+
+        /* Remaining seven taps, weight 1 each. */
+        glow_accumulate(_mm_loadu_si128((const __m128i *)&row[off + 8]), &lo, &hi);
+
+        glow_accumulate(_mm_load_si128((const __m128i *)&above[off]), &lo, &hi);
+        glow_accumulate(_mm_loadu_si128((const __m128i *)&above[off + 4]), &lo, &hi);
+        glow_accumulate(_mm_loadu_si128((const __m128i *)&above[off + 8]), &lo, &hi);
+
+        glow_accumulate(_mm_load_si128((const __m128i *)&below[off]), &lo, &hi);
+        glow_accumulate(_mm_loadu_si128((const __m128i *)&below[off + 4]), &lo, &hi);
+        glow_accumulate(_mm_loadu_si128((const __m128i *)&below[off + 8]), &lo, &hi);
+
+        /* sum/16 goes back into the source, sum/8 (saturated by the pack)
+         * goes to the glow buffer. */
+        _mm_storeu_si128((__m128i *)&row[off + 4],
+                         _mm_packus_epi16(_mm_srli_epi16(lo, 4),
+                                          _mm_srli_epi16(hi, 4)));
+        _mm_storeu_si128((__m128i *)&glow[off + 4],
+                         _mm_packus_epi16(_mm_srli_epi16(lo, 3),
+                                          _mm_srli_epi16(hi, 3)));
+    }
+}
+
+/*
+ * Add eight 16-bit light values per half to the 16 bytes at `px` (which must
+ * be 16-byte aligned) and store the result back, clamped to 0..255.
+ *
+ * The bytes are widened against a zero register rather than an unrelated
+ * variable, and the add saturates, so a very large light value yields 255
+ * instead of wrapping negative and packing to 0.
+ */
+static inline void chromo_add_light(unsigned char *px,
+                                    __m128i light_lo, __m128i light_hi)
+{
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i pix = _mm_load_si128((const __m128i *)px);
+    __m128i lo = _mm_unpacklo_epi8(pix, zero);
+    __m128i hi = _mm_unpackhi_epi8(pix, zero);
+
+    lo = _mm_adds_epi16(lo, light_lo);
+    hi = _mm_adds_epi16(hi, light_hi);
+    _mm_store_si128((__m128i *)px, _mm_packus_epi16(lo, hi));
+}
+
+/*
+ * SSE2 flash lighting of st->palaka2.
+ *
+ * The flash colours of the first four entries of st->fireshell_array are
+ * combined using four consecutive st->light_map weights (one per shell).
+ * Each group of four weights lights a block of two pixels on two
+ * consecutive rows; every inner iteration consumes eight weights and
+ * therefore updates four pixels (16 bytes) on each of the two rows.
+ *
+ * Bytes are laid out as b, g, r, <unused>; the fourth byte of each pixel
+ * receives a light value of 0 and is left unchanged.
+ */
+static void chromo_2x2_light(struct state *st)
+{
+    __m128 flash0, flash1, flash2, flash3;
+    __m128 light_a, light_b;
+    __m128i add_lo, add_hi;
+
+    unsigned int x, y, v = 0;
+    unsigned int nl = st->width * 4;
+    unsigned char *mem = st->palaka2;
+    fireshell *fs = st->fireshell_array;
+
+    flash0 = _mm_setr_ps(fs[0].flash_b, fs[0].flash_g, fs[0].flash_r, 0.0f);
+    flash1 = _mm_setr_ps(fs[1].flash_b, fs[1].flash_g, fs[1].flash_r, 0.0f);
+    flash2 = _mm_setr_ps(fs[2].flash_b, fs[2].flash_g, fs[2].flash_r, 0.0f);
+    flash3 = _mm_setr_ps(fs[3].flash_b, fs[3].flash_g, fs[3].flash_r, 0.0f);
+
+    /* The inner loop walks one row; the extra "mem += nl" skips the second
+     * row of the pair, which was already updated through mem[nl]. */
+    for (y = st->height/2; y; y--, mem += nl)
+    {
+        for (x = st->width/4; x; x--, v += 8, mem += 16)
+        {
+            /* Light for the first two pixels of this 16-byte group. */
+            light_a = _mm_mul_ps(flash0, _mm_set1_ps(st->light_map[v+0]));
+            light_a = _mm_add_ps(light_a,
+                        _mm_mul_ps(_mm_set1_ps(st->light_map[v+1]), flash1));
+            light_a = _mm_add_ps(light_a,
+                        _mm_mul_ps(_mm_set1_ps(st->light_map[v+2]), flash2));
+            light_a = _mm_add_ps(light_a,
+                        _mm_mul_ps(_mm_set1_ps(st->light_map[v+3]), flash3));
+
+            /* Light for the last two pixels of this 16-byte group. */
+            light_b = _mm_mul_ps(flash0, _mm_set1_ps(st->light_map[v+4]));
+            light_b = _mm_add_ps(light_b,
+                        _mm_mul_ps(_mm_set1_ps(st->light_map[v+5]), flash1));
+            light_b = _mm_add_ps(light_b,
+                        _mm_mul_ps(_mm_set1_ps(st->light_map[v+6]), flash2));
+            light_b = _mm_add_ps(light_b,
+                        _mm_mul_ps(_mm_set1_ps(st->light_map[v+7]), flash3));
+
+            /* Convert to integers and duplicate each b,g,r,0 quad so one
+             * register covers two adjacent pixels as 16-bit lanes. */
+            add_lo = _mm_cvtps_epi32(light_a);
+            add_hi = _mm_cvtps_epi32(light_b);
+            add_lo = _mm_packs_epi32(add_lo, add_lo);
+            add_hi = _mm_packs_epi32(add_hi, add_hi);
+
+            chromo_add_light(mem, add_lo, add_hi);
+            chromo_add_light(&mem[nl], add_lo, add_hi);
+        }
+    }
+}
+
+#else
+
+/*
+ * Scalar glow blur over st->palaka1.
+ *
+ * The buffer is walked four bytes at a time.  For the first three bytes of
+ * each group, a weighted sum of nine taps is formed from the bytes at
+ * offsets +0, +4 and +8 in palaka1 and at the same offsets width*4 bytes
+ * before and after it; the +4 tap of the middle row is weighted by 8, the
+ * others by 1.  The fourth byte of each group is never touched.
+ *
+ * sum >> 4 is written back into palaka1 at offset +4 (later iterations read
+ * that already-blurred value), and sum >> 3, clamped to 255, is written to
+ * palaka2 at offset +4.
+ */
+static void glow_blur(struct state *st)
+{
+    unsigned char *cur = st->palaka1;
+    unsigned char *out = st->palaka2;
+    unsigned char *above = cur - (st->width * 4);
+    unsigned char *below = cur + (st->width * 4);
+    unsigned int remaining, ch, sum;
+
+    for (remaining = st->width*st->height*4; remaining != 0; remaining -= 4)
+    {
+        for (ch = 0; ch < 3; ch++)
+        {
+            sum = cur[ch]   + cur[ch + 4] * 8 + cur[ch + 8] +
+                  above[ch] + above[ch + 4]   + above[ch + 8] +
+                  below[ch] + below[ch + 4]   + below[ch + 8];
+
+            cur[ch + 4] = sum >> 4;
+            if (sum > 2047)
+                out[ch + 4] = 255;
+            else
+                out[ch + 4] = sum >> 3;
+        }
+
+        cur += 4;
+        above += 4;
+        below += 4;
+        out += 4;
+    }
+}
+
+/* Add `i` to the byte `c` and clamp the result to 255.  The sum is formed
+ * in unsigned int arithmetic before the clamp is applied. */
+static inline unsigned char addbs(unsigned char c, unsigned int i)
+{
+    const unsigned int sum = i + c;
+
+    if (sum > 255)
+    {
+        return 255;
+    }
+    return sum;
+}
+
+/* Brighten two adjacent 4-byte pixels starting at `px` by the given light,
+ * clamping each byte at 255.  Bytes are ordered b, g, r; the fourth byte of
+ * each pixel is left alone. */
+static inline void chromo_light_pair(unsigned char *px, float r, float g, float b)
+{
+    px[0] = addbs(px[0], b);
+    px[1] = addbs(px[1], g);
+    px[2] = addbs(px[2], r);
+    px[4] = addbs(px[4], b);
+    px[5] = addbs(px[5], g);
+    px[6] = addbs(px[6], r);
+}
+
+/*
+ * Scalar flash lighting of st->palaka2.
+ *
+ * The flash colours of the shells in st->fireshell_array are unpacked into
+ * a flat r,g,b,<unused> table; only the first four shells contribute.  For
+ * each block of two pixels on two consecutive rows, four consecutive
+ * st->light_map weights (one per shell) scale those colours, and the
+ * resulting r/g/b light is added to all four pixels of the block.
+ */
+static void chromo_2x2_light(struct state *st)
+{
+    unsigned int shell, col, row, v = 0;
+    unsigned int nl = st->width * 4;
+    unsigned char *mem = st->palaka2;
+    float r, g, b;
+    float rgb[SHELLCOUNT*4];
+    fireshell *fs = st->fireshell_array;
+
+    for (shell = 0; shell < SHELLCOUNT; shell++)
+    {
+        rgb[shell*4    ] = fs[shell].flash_r;
+        rgb[shell*4 + 1] = fs[shell].flash_g;
+        rgb[shell*4 + 2] = fs[shell].flash_b;
+    }
+
+    for (row = st->height/2; row; row--)
+    {
+        for (col = st->width/2; col; col--)
+        {
+            r = rgb[0] * st->light_map[v] + rgb[4] * st->light_map[v+1]
+              + rgb[ 8] * st->light_map[v+2] + rgb[12] * st->light_map[v+3];
+            g = rgb[1] * st->light_map[v] + rgb[5] * st->light_map[v+1]
+              + rgb[ 9] * st->light_map[v+2] + rgb[13] * st->light_map[v+3];
+            b = rgb[2] * st->light_map[v] + rgb[6] * st->light_map[v+1]
+              + rgb[10] * st->light_map[v+2] + rgb[14] * st->light_map[v+3];
+
+            /* Upper row of the block first, then the row nl bytes below. */
+            chromo_light_pair(mem, r, g, b);
+            chromo_light_pair(mem + nl, r, g, b);
+
+            mem += 8;
+            v += 4;
+        }
+        /* Skip the lower row, which was handled together with the upper. */
+        mem += nl;
+    }
+}
+
+#endif