diff --git a/src/blitter/32bpp_anim_sse4.cpp b/src/blitter/32bpp_anim_sse4.cpp index a200d8d8c9..5bc63bbb0a 100644 --- a/src/blitter/32bpp_anim_sse4.cpp +++ b/src/blitter/32bpp_anim_sse4.cpp @@ -80,10 +80,11 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL } case RM_WITH_SKIP: { - uint32 mvX2 = *((uint32 *) const_cast(src_mv)); - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) effective_width/2; x != 0; x--) { + uint32 mvX2 = *((uint32 *) const_cast(src_mv)); + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); + /* Remap colours. */ const byte m0 = mvX2; if (m0 >= PALETTE_ANIM_START) { @@ -125,42 +126,32 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL bmno_alpha_blend: ALPHA_BLEND_2(pack_low_cm); bmno_full_opacity: - srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); - - src_mv += 2; - mvX2 = *((uint32 *) const_cast(src_mv)); - src += 2; - anim += 2; - dstABCD = _mm_loadu_si128((__m128i*) (dst+2)); - _mm_storeu_si128((__m128i *) dst, srcABCD); - srcABCD = _mm_loadu_si128((const __m128i*) src); - dst += 2; - continue; - + _mm_storel_epi64((__m128i *) dst, srcABCD); bmno_full_transparency: src_mv += 2; - mvX2 = *((uint32 *) const_cast(src_mv)); - dst += 2; src += 2; anim += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - srcABCD = _mm_loadu_si128((const __m128i*) src); + dst += 2; } if (bt_last == BT_ODD) { if (src->a == 0) { } else if (src->a == 255) { - *anim = (uint16) mvX2; - *dst = ((byte) mvX2 >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette((byte) mvX2), (byte) (mvX2 >> 8)) : *src; + *anim = *(const uint16*) src_mv; + *dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src; } else { *anim = 0; - if ((byte) mvX2 >= PALETTE_ANIM_START) { - ALIGN(16) Colour colour = AdjustBrightness(LookupColourInPalette((byte) mvX2), (byte) (mvX2 >> 8)); + __m128i srcABCD; + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); + if (src_mv->m >= PALETTE_ANIM_START) { + Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v); colour.a = src->a; - srcABCD = _mm_load_si128((__m128i*) &colour); + srcABCD = _mm_cvtsi32_si128(colour.data); + } else { + srcABCD = _mm_cvtsi32_si128(src->data); } ALPHA_BLEND_2(pack_low_cm); - (*dst).data = EXTR32(srcABCD, 0); + dst->data = _mm_cvtsi128_si32(srcABCD); } } break; @@ -181,18 +172,18 @@ bmno_full_transparency: const int width_diff = si->sprite_width - bp->width; effective_width = bp->width - (int) src_rgba_line[0].data; const int delta_diff = (int) src_rgba_line[1].data - width_diff; - const int nd = effective_width - delta_diff; - effective_width = delta_diff > 0 ? nd : effective_width; + const int new_width = effective_width - delta_diff; + effective_width = delta_diff > 0 ? new_width : effective_width; if (effective_width <= 0) break; /* FALLTHROUGH */ } case RM_WITH_SKIP: { - uint32 mvX2 = *((uint32 *) const_cast(src_mv)); - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); - for (uint x = (uint) effective_width / 2; x != 0; x--) { + uint32 mvX2 = *((uint32 *) const_cast(src_mv)); + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); + /* Remap colours. */ const uint m0 = (byte) mvX2; const uint r0 = remap[m0]; @@ -250,53 +241,40 @@ bmno_full_transparency: bmcr_alpha_blend: ALPHA_BLEND_2(pack_low_cm); bmcr_full_opacity: - srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); - - src += 2; - src_mv += 2; - anim += 2; - mvX2 = *((uint32 *) const_cast(src_mv)); - dstABCD = _mm_loadu_si128((__m128i*) (dst+2)); - _mm_storeu_si128((__m128i *) dst, srcABCD); - srcABCD = _mm_loadu_si128((const __m128i*) src); - dst += 2; - continue; - + _mm_storel_epi64((__m128i *) dst, srcABCD); bmcr_full_transparency: src_mv += 2; - mvX2 = *((uint32 *) const_cast(src_mv)); dst += 2; src += 2; anim += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (effective_width & 1) { /* In case the m-channel is zero, do not remap this pixel in any way. */ - if (src->a == 0) { - } else if ((byte) mvX2 != 0) { - const uint r = remap[(byte) mvX2]; - *anim = (src->a == 255) ? (r | ((uint16) mvX2 & 0xFF00)) : 0; + __m128i srcABCD; + if (src->a == 0) break; + if (src_mv->m) { + const uint r = remap[src_mv->m]; + *anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0; if (r != 0) { - Colour remapped_colour = AdjustBrightness(LookupColourInPalette(r), (byte) (mvX2 >> 8)); + Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); if (src->a == 255) { *dst = remapped_colour; } else { remapped_colour.a = src->a; - INSR32(remapped_colour.data, srcABCD, 0); + srcABCD = _mm_cvtsi32_si128(remapped_colour.data); goto bmcr_alpha_blend_single; } } } else { *anim = 0; - if (src->a == 255) { - *dst = *src; - } else { + srcABCD = _mm_cvtsi32_si128(src->data); + if (src->a < 255) { bmcr_alpha_blend_single: + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); ALPHA_BLEND_2(pack_low_cm); - (*dst).data = EXTR32(srcABCD, 0); } + dst->data = _mm_cvtsi128_si32(srcABCD); } } break; @@ -309,29 +287,27 @@ bmcr_alpha_blend_single: case BM_TRANSPARENT: { /* Make the current colour a bit more black, so it looks like this image is transparent. */ - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) bp->width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstCD); - Colour *old_dst = dst; + dstAB = _mm_packus_epi16(dstAB, dstAB); + _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; anim += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, dstAB); - srcABCD = _mm_loadu_si128((const __m128i*) src); if (src[-2].a) anim[-2] = 0; if (src[-1].a) anim[-1] = 0; } if (bp->width & 1) { + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); @@ -340,7 +316,7 @@ bmcr_alpha_blend_single: dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_packus_epi16(dstAB, dstAB); - (*dst).data = EXTR32(dstAB, 0); + dst->data = _mm_cvtsi128_si32(dstAB); if (src[0].a) anim[0] = 0; } break; diff --git a/src/blitter/32bpp_sse2.cpp b/src/blitter/32bpp_sse2.cpp index 137c914172..c5e7e70d0a 100644 --- a/src/blitter/32bpp_sse2.cpp +++ b/src/blitter/32bpp_sse2.cpp @@ -70,18 +70,18 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel case RM_WITH_SKIP: { for (uint x = (uint) effective_width / 2; x > 0; x--) { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); ALPHA_BLEND_2(); - *(uint64*) dst = EXTR64(srcABCD, 0); + _mm_storel_epi64((__m128i*) dst, srcABCD); src += 2; dst += 2; } if (bt_last == BT_ODD) { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); ALPHA_BLEND_2(); - (*dst).data = EXTR32(srcABCD, 0); + dst->data = _mm_cvtsi128_si32(srcABCD); } break; } @@ -99,8 +99,8 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel const int width_diff = si->sprite_width - bp->width; effective_width = bp->width - (int) src_rgba_line[0].data; const int delta_diff = (int) src_rgba_line[1].data - width_diff; - const int nd = effective_width - delta_diff; - effective_width = delta_diff > 0 ? nd : effective_width; + const int new_width = effective_width - delta_diff; + effective_width = delta_diff > 0 ? new_width : effective_width; if (effective_width <= 0) break; /* FALLTHROUGH */ } @@ -108,30 +108,28 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel case RM_WITH_SKIP: { const byte *remap = bp->remap; for (uint x = (uint) effective_width; x != 0; x--) { - /* In case the m-channel is zero, do not remap this pixel in any way */ - if (src_mv->m == 0) { - if (src->a < 255) { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); - ALPHA_BLEND_2(); - (*dst).data = EXTR32(srcABCD, 0); - } else { - *dst = src->data; - } - } else { + /* In case the m-channel is zero, do not remap this pixel in any way. */ + __m128i srcABCD; + if (src_mv->m) { const uint r = remap[src_mv->m]; if (r != 0) { Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); - if (src->a < 255) { - __m128i srcABCD; - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); - remapped_colour.a = src->a; - INSR32(remapped_colour.data, srcABCD, 0); - ALPHA_BLEND_2(); - (*dst).data = EXTR32(srcABCD, 0); - } else + if (src->a == 255) { *dst = remapped_colour; + } else { + remapped_colour.a = src->a; + srcABCD = _mm_cvtsi32_si128(remapped_colour.data); + goto bmcr_alpha_blend_single; + } } + } else { + srcABCD = _mm_cvtsi32_si128(src->data); + if (src->a < 255) { +bmcr_alpha_blend_single: + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); + ALPHA_BLEND_2(); + } + dst->data = _mm_cvtsi128_si32(srcABCD); } src_mv++; dst++; @@ -149,27 +147,25 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel /* Make the current colour a bit more black, so it looks like this image is transparent. * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) */ - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) bp->width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F); alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstCD); - Colour *old_dst = dst; + dstAB = _mm_packus_epi16(dstAB, dstAB); + _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, dstAB); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (bp->width & 1) { + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F); @@ -179,7 +175,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_packus_epi16(dstAB, dstAB); - (*dst).data = EXTR32(dstAB, 0); + dst->data = _mm_cvtsi128_si32(dstAB); } break; } @@ -345,7 +341,7 @@ inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightnes IGNORE_UNINITIALIZED_WARNING_START /* static */ Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness) { - ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; + uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; c16 *= brightness; uint64 c16_ob = c16; // Helps out of order execution. c16 /= DEFAULT_BRIGHTNESS; @@ -357,12 +353,20 @@ IGNORE_UNINITIALIZED_WARNING_START const uint32 alpha32 = colour.data & 0xFF000000; __m128i ret; +#ifdef _SQ64 + ret = _mm_cvtsi64_si128(c16); +#else INSR64(c16, ret, 0); +#endif if (ob != 0) { /* Reduce overbright strength. */ ob /= 2; __m128i ob128; +#ifdef _SQ64 + ob128 = _mm_cvtsi64_si128(ob | ob << 16 | ob << 32); +#else INSR64(ob | ob << 16 | ob << 32, ob128, 0); +#endif __m128i white = OVERBRIGHT_VALUE_MASK; __m128i c128 = ret; ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */ @@ -372,7 +376,7 @@ IGNORE_UNINITIALIZED_WARNING_START } ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ - return alpha32 | EXTR32(ret, 0); + return alpha32 | _mm_cvtsi128_si32(ret); } IGNORE_UNINITIALIZED_WARNING_STOP diff --git a/src/blitter/32bpp_sse4.cpp b/src/blitter/32bpp_sse4.cpp index 357a587d98..c00dab0d73 100644 --- a/src/blitter/32bpp_sse4.cpp +++ b/src/blitter/32bpp_sse4.cpp @@ -76,24 +76,19 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel } case RM_WITH_SKIP: { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) effective_width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); ALPHA_BLEND_2(pack_low_cm); - srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); - Colour *old_dst = dst; + _mm_storel_epi64((__m128i*) dst, srcABCD); src += 2; dst += 2; - /* It is VERY important to read next data before it gets invalidated in cpu cache. - * And PEXTR latency is a real problem here. - */ - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, srcABCD); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (bt_last == BT_ODD) { + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); ALPHA_BLEND_2(pack_low_cm); - *dst = (Colour) EXTR32(srcABCD, 0); + dst->data = _mm_cvtsi128_si32(srcABCD); } break; } @@ -112,18 +107,18 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel const int width_diff = si->sprite_width - bp->width; effective_width = bp->width - (int) src_rgba_line[0].data; const int delta_diff = (int) src_rgba_line[1].data - width_diff; - const int nd = effective_width - delta_diff; - effective_width = delta_diff > 0 ? nd : effective_width; + const int new_width = effective_width - delta_diff; + effective_width = delta_diff > 0 ? new_width : effective_width; if (effective_width <= 0) break; /* FALLTHROUGH */ } case RM_WITH_SKIP: { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); - uint32 mvX2 = *((uint32 *) const_cast(src_mv)); - for (uint x = (uint) effective_width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); + uint32 mvX2 = *((uint32 *) const_cast(src_mv)); + /* Remap colours. */ if (mvX2 & 0x00FF00FF) { /* Written so the compiler uses CMOV. */ @@ -152,38 +147,35 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel /* Blend colours. */ ALPHA_BLEND_2(pack_low_cm); - srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); - Colour *old_dst = dst; + _mm_storel_epi64((__m128i *) dst, srcABCD); dst += 2; src += 2; src_mv += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, srcABCD); - mvX2 = *((uint32 *) const_cast(src_mv)); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (effective_width & 1) { /* In case the m-channel is zero, do not remap this pixel in any way. */ - if ((byte) mvX2 == 0) { - if (src->a < 255) { - ALPHA_BLEND_2(pack_low_cm); - (*dst).data = EXTR32(srcABCD, 0); - } else - *dst = *src; - } else { - const uint r = remap[(byte) mvX2]; + __m128i srcABCD; + if (src_mv->m) { + const uint r = remap[src_mv->m]; if (r != 0) { - Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), (byte) (mvX2 >> 8)); + Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); if (src->a == 255) { *dst = remapped_colour; } else { remapped_colour.a = src->a; - INSR32(remapped_colour.data, srcABCD, 0); - ALPHA_BLEND_2(pack_low_cm); - (*dst).data = EXTR32(srcABCD, 0); + srcABCD = _mm_cvtsi32_si128(remapped_colour.data); + goto bmcr_alpha_blend_single; } } + } else { + srcABCD = _mm_cvtsi32_si128(src->data); + if (src->a < 255) { +bmcr_alpha_blend_single: + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); + ALPHA_BLEND_2(pack_low_cm); + } + dst->data = _mm_cvtsi128_si32(srcABCD); } } break; @@ -199,26 +191,24 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel /* Make the current colour a bit more black, so it looks like this image is transparent. * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) */ - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) bp->width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstCD); - Colour *old_dst = dst; + dstAB = _mm_packus_epi16(dstAB, dstAB); + _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, dstAB); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (bp->width & 1) { + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); @@ -227,7 +217,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_packus_epi16(dstAB, dstAB); - (*dst).data = EXTR32(dstAB, 0); + dst->data = _mm_cvtsi128_si32(dstAB); } break; @@ -290,7 +280,7 @@ inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightnes IGNORE_UNINITIALIZED_WARNING_START /* static */ Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness) { - ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; + uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; c16 *= brightness; uint64 c16_ob = c16; // Helps out of order execution. c16 /= DEFAULT_BRIGHTNESS; @@ -317,7 +307,7 @@ IGNORE_UNINITIALIZED_WARNING_START } ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ - return alpha32 | EXTR32(ret, 0); + return alpha32 | _mm_cvtsi128_si32(ret); } IGNORE_UNINITIALIZED_WARNING_STOP diff --git a/src/blitter/32bpp_ssse3.cpp b/src/blitter/32bpp_ssse3.cpp index 5312621627..ebfc3ce6d0 100644 --- a/src/blitter/32bpp_ssse3.cpp +++ b/src/blitter/32bpp_ssse3.cpp @@ -50,7 +50,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel /* Load these variables into register before loop. */ const __m128i a_cm = ALPHA_CONTROL_MASK; - const __m128i pack_hi_cm = PACK_HIGH_CONTROL_MASK; + const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK; const __m128i briAB_cm = BRIGHTNESS_LOW_CONTROL_MASK; const __m128i div_cleaner = BRIGHTNESS_DIV_CLEANER; const __m128i ob_check = OVERBRIGHT_PRESENCE_MASK; @@ -79,27 +79,19 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel } case RM_WITH_SKIP: { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) effective_width / 2; x > 0; x--) { - ALPHA_BLEND_2(pack_hi_cm); - /* With high repack, srcABCD have its 2 blended pixels like: [S0 S1 S2 S3] -> [-- -- BS0 BS1] - * dstABCD shuffled: [D0 D1 D2 D3] -> [D2 D3 D0 D0] - * PALIGNR takes what's in (): [-- -- (BS0 BS1] [D2 D3) D0 D0] - */ - dstABCD = _mm_shuffle_epi32(dstABCD, 0x0E); - srcABCD = _mm_alignr_epi8(dstABCD, srcABCD, 8); - Colour *old_dst = dst; + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); + ALPHA_BLEND_2(pack_low_cm); + _mm_storel_epi64((__m128i*) dst, srcABCD); src += 2; dst += 2; - /* It is VERY important to read next data before it gets invalidated in cpu cache. */ - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, srcABCD); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (bt_last == BT_ODD) { - ALPHA_BLEND_2(pack_hi_cm); - (*dst).data = EXTR32(srcABCD, 2); + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); + ALPHA_BLEND_2(pack_low_cm); + dst->data = _mm_cvtsi128_si32(srcABCD); } break; } @@ -117,18 +109,18 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel const int width_diff = si->sprite_width - bp->width; effective_width = bp->width - (int) src_rgba_line[0].data; const int delta_diff = (int) src_rgba_line[1].data - width_diff; - const int nd = effective_width - delta_diff; - effective_width = delta_diff > 0 ? nd : effective_width; + const int new_width = effective_width - delta_diff; + effective_width = delta_diff > 0 ? new_width : effective_width; if (effective_width <= 0) break; /* FALLTHROUGH */ } case RM_WITH_SKIP: { - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); - uint32 mvX2 = *((uint32 *) const_cast(src_mv)); - for (uint x = (uint) effective_width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); + uint32 mvX2 = *((uint32 *) const_cast(src_mv)); + /* Remap colours. */ if (mvX2 & 0x00FF00FF) { /* Written so the compiler uses CMOV. */ @@ -139,7 +131,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel Colour c0 = 0; // Use alpha of 0 to keep dst as is. c0 = r0 == 0 ? c0 : c0map; c0 = m0 != 0 ? c0 : src0; - INSR32(c0.data, srcABCD, 0); + srcABCD = _mm_cvtsi32_si128(c0.data); const Colour src1 = src[1]; const uint m1 = (byte) (mvX2 >> 16); @@ -156,40 +148,36 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel } /* Blend colours. */ - ALPHA_BLEND_2(pack_hi_cm); - dstABCD = _mm_shuffle_epi32(dstABCD, 0x0E); - srcABCD = _mm_alignr_epi8(dstABCD, srcABCD, 8); - Colour *old_dst = dst; + ALPHA_BLEND_2(pack_low_cm); + _mm_storel_epi64((__m128i *) dst, srcABCD); dst += 2; src += 2; src_mv += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, srcABCD); - mvX2 = *((uint32 *) const_cast(src_mv)); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (effective_width & 1) { - /* In case the m-channel is zero, do not remap this pixel in any way */ - if (src_mv->m == 0) { - if (src->a < 255) { - ALPHA_BLEND_2(pack_hi_cm); - (*dst).data = EXTR32(srcABCD, 2); - } else { - *dst = src->data; - } - } else { + /* In case the m-channel is zero, do not remap this pixel in any way. */ + __m128i srcABCD; + if (src_mv->m) { const uint r = remap[src_mv->m]; if (r != 0) { Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); - if (src->a < 255) { - remapped_colour.a = src->a; - INSR32(remapped_colour.data, srcABCD, 0); - ALPHA_BLEND_2(pack_hi_cm); - (*dst).data = EXTR32(srcABCD, 2); - } else + if (src->a == 255) { *dst = remapped_colour; + } else { + remapped_colour.a = src->a; + srcABCD = _mm_cvtsi32_si128(remapped_colour.data); + goto bmcr_alpha_blend_single; + } } + } else { + srcABCD = _mm_cvtsi32_si128(src->data); + if (src->a < 255) { +bmcr_alpha_blend_single: + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); + ALPHA_BLEND_2(pack_low_cm); + } + dst->data = _mm_cvtsi128_si32(srcABCD); } } break; @@ -200,30 +188,29 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel src_mv_line += si->sprite_width; break; } + case BM_TRANSPARENT: { /* Make the current colour a bit more black, so it looks like this image is transparent. * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) */ - __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); - __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); for (uint x = (uint) bp->width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); + __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstCD); - Colour *old_dst = dst; + dstAB = _mm_packus_epi16(dstAB, dstAB); + _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; - dstABCD = _mm_loadu_si128((__m128i*) dst); - _mm_storeu_si128((__m128i *) old_dst, dstAB); - srcABCD = _mm_loadu_si128((const __m128i*) src); } if (bp->width & 1) { + __m128i srcABCD = _mm_cvtsi32_si128(src->data); + __m128i dstABCD = _mm_cvtsi32_si128(dst->data); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); @@ -232,7 +219,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_packus_epi16(dstAB, dstAB); - (*dst).data = EXTR32(dstAB, 0); + dst->data = _mm_cvtsi128_si32(dstAB); } break; } diff --git a/src/blitter/32bpp_ssse3.hpp b/src/blitter/32bpp_ssse3.hpp index cede4185d4..97c67657e3 100644 --- a/src/blitter/32bpp_ssse3.hpp +++ b/src/blitter/32bpp_ssse3.hpp @@ -47,8 +47,7 @@ __m128i zero = _mm_setzero_si128(); \ __m128i colAB = _mm_unpacklo_epi8(colourX2, zero); \ \ - __m128i briAB; \ - INSR64(brightnessX2, briAB, 0); \ + __m128i briAB = _mm_cvtsi32_si128(brightnessX2); \ briAB = _mm_shuffle_epi8(briAB, briAB_cm); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \ colAB = _mm_mullo_epi16(colAB, briAB); \ __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \ diff --git a/src/video/win32_v.cpp b/src/video/win32_v.cpp index 02cae68e2f..186ac10258 100644 --- a/src/video/win32_v.cpp +++ b/src/video/win32_v.cpp @@ -1077,7 +1077,7 @@ static bool AllocateDibSection(int w, int h, bool force) bi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER); bi->bmiHeader.biWidth = _wnd.width = w; - bi->bmiHeader.biHeight = -(_wnd.height = h+1); // Allocate extra room to prevent out-of-bounds when SSE reads a 16B block at the end of the buffer. + bi->bmiHeader.biHeight = -(_wnd.height = h); bi->bmiHeader.biPlanes = 1; bi->bmiHeader.biBitCount = BlitterFactory::GetCurrentBlitter()->GetScreenDepth();