+/* SSE2 optimized versions of glow_blur() and chromo_2x2_light() */
+
+static void glow_blur(struct state *st)
+{
+ unsigned int n, nn;
+ unsigned char *ps = st->palaka1;
+ unsigned char *pd = st->palaka2;
+ unsigned char *pa = st->palaka1 - (st->width * 4);
+ unsigned char *pb = st->palaka1 + (st->width * 4);
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
+ xmm0 = _mm_setzero_si128();
+ nn = st->width * st->height * 4;
+ for (n = 0; n < nn; n+=16)
+ {
+ _mm_prefetch((const void *)&ps[n+16],_MM_HINT_T0);
+ _mm_prefetch((const void *)&pa[n+16],_MM_HINT_T0);
+ _mm_prefetch((const void *)&pb[n+16],_MM_HINT_T0);
+
+ xmm1 = _mm_load_si128((const __m128i*)&ps[n]);
+ xmm2 = xmm1;
+ xmm1 = _mm_unpacklo_epi8(xmm1,xmm0);
+ xmm2 = _mm_unpackhi_epi8(xmm2,xmm0);
+ xmm3 = _mm_loadu_si128((const __m128i*)&ps[n+4]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm3 = _mm_slli_epi16(xmm3,3);
+ xmm4 = _mm_slli_epi16(xmm4,3);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+ xmm3 = _mm_loadu_si128((const __m128i*)&ps[n+8]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+
+ xmm3 = _mm_load_si128((const __m128i*)&pa[n]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+ xmm3 = _mm_loadu_si128((const __m128i*)&pa[n+4]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+ xmm3 = _mm_loadu_si128((const __m128i*)&pa[n+8]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+
+ xmm3 = _mm_load_si128((const __m128i*)&pb[n]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+ xmm3 = _mm_loadu_si128((const __m128i*)&pb[n+4]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+ xmm3 = _mm_loadu_si128((const __m128i*)&pb[n+8]);
+ xmm4 = xmm3;
+ xmm3 = _mm_unpacklo_epi8(xmm3,xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm4,xmm0);
+ xmm1 = _mm_add_epi16(xmm1,xmm3);
+ xmm2 = _mm_add_epi16(xmm2,xmm4);
+
+ xmm3 = xmm1;
+ xmm4 = xmm2;
+ xmm1 = _mm_srli_epi16(xmm1,4);
+ xmm2 = _mm_srli_epi16(xmm2,4);
+ xmm3 = _mm_srli_epi16(xmm3,3);
+ xmm4 = _mm_srli_epi16(xmm4,3);
+ xmm1 = _mm_packus_epi16(xmm1,xmm2);
+ xmm3 = _mm_packus_epi16(xmm3,xmm4);
+
+ _mm_storeu_si128((__m128i*)&ps[n+4], xmm1);
+ _mm_storeu_si128((__m128i*)&pd[n+4], xmm3);
+ }
+}
+
+static void chromo_2x2_light(struct state *st)