(svn r26247) -Fix [FS#5854, FS#5855]: Possible out of bounds reads with the sse blitters (MJP)

This commit is contained in:
rubidium 2014-01-13 17:54:24 +00:00
parent 54a898be33
commit a942619911
6 changed files with 164 additions and 208 deletions

View File

@ -80,10 +80,11 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
} }
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) effective_width/2; x != 0; x--) { for (uint x = (uint) effective_width/2; x != 0; x--) {
uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
/* Remap colours. */ /* Remap colours. */
const byte m0 = mvX2; const byte m0 = mvX2;
if (m0 >= PALETTE_ANIM_START) { if (m0 >= PALETTE_ANIM_START) {
@ -125,42 +126,32 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
bmno_alpha_blend: bmno_alpha_blend:
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
bmno_full_opacity: bmno_full_opacity:
srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); _mm_storel_epi64((__m128i *) dst, srcABCD);
src_mv += 2;
mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
src += 2;
anim += 2;
dstABCD = _mm_loadu_si128((__m128i*) (dst+2));
_mm_storeu_si128((__m128i *) dst, srcABCD);
srcABCD = _mm_loadu_si128((const __m128i*) src);
dst += 2;
continue;
bmno_full_transparency: bmno_full_transparency:
src_mv += 2; src_mv += 2;
mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
dst += 2;
src += 2; src += 2;
anim += 2; anim += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst); dst += 2;
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (bt_last == BT_ODD) { if (bt_last == BT_ODD) {
if (src->a == 0) { if (src->a == 0) {
} else if (src->a == 255) { } else if (src->a == 255) {
*anim = (uint16) mvX2; *anim = *(const uint16*) src_mv;
*dst = ((byte) mvX2 >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette((byte) mvX2), (byte) (mvX2 >> 8)) : *src; *dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
} else { } else {
*anim = 0; *anim = 0;
if ((byte) mvX2 >= PALETTE_ANIM_START) { __m128i srcABCD;
ALIGN(16) Colour colour = AdjustBrightness(LookupColourInPalette((byte) mvX2), (byte) (mvX2 >> 8)); __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
if (src_mv->m >= PALETTE_ANIM_START) {
Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v);
colour.a = src->a; colour.a = src->a;
srcABCD = _mm_load_si128((__m128i*) &colour); srcABCD = _mm_cvtsi32_si128(colour.data);
} else {
srcABCD = _mm_cvtsi32_si128(src->data);
} }
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
(*dst).data = EXTR32(srcABCD, 0); dst->data = _mm_cvtsi128_si32(srcABCD);
} }
} }
break; break;
@ -181,18 +172,18 @@ bmno_full_transparency:
const int width_diff = si->sprite_width - bp->width; const int width_diff = si->sprite_width - bp->width;
effective_width = bp->width - (int) src_rgba_line[0].data; effective_width = bp->width - (int) src_rgba_line[0].data;
const int delta_diff = (int) src_rgba_line[1].data - width_diff; const int delta_diff = (int) src_rgba_line[1].data - width_diff;
const int nd = effective_width - delta_diff; const int new_width = effective_width - delta_diff;
effective_width = delta_diff > 0 ? nd : effective_width; effective_width = delta_diff > 0 ? new_width : effective_width;
if (effective_width <= 0) break; if (effective_width <= 0) break;
/* FALLTHROUGH */ /* FALLTHROUGH */
} }
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) effective_width / 2; x != 0; x--) { for (uint x = (uint) effective_width / 2; x != 0; x--) {
uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
/* Remap colours. */ /* Remap colours. */
const uint m0 = (byte) mvX2; const uint m0 = (byte) mvX2;
const uint r0 = remap[m0]; const uint r0 = remap[m0];
@ -250,53 +241,40 @@ bmno_full_transparency:
bmcr_alpha_blend: bmcr_alpha_blend:
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
bmcr_full_opacity: bmcr_full_opacity:
srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); _mm_storel_epi64((__m128i *) dst, srcABCD);
src += 2;
src_mv += 2;
anim += 2;
mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
dstABCD = _mm_loadu_si128((__m128i*) (dst+2));
_mm_storeu_si128((__m128i *) dst, srcABCD);
srcABCD = _mm_loadu_si128((const __m128i*) src);
dst += 2;
continue;
bmcr_full_transparency: bmcr_full_transparency:
src_mv += 2; src_mv += 2;
mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
dst += 2; dst += 2;
src += 2; src += 2;
anim += 2; anim += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (effective_width & 1) { if (effective_width & 1) {
/* In case the m-channel is zero, do not remap this pixel in any way. */ /* In case the m-channel is zero, do not remap this pixel in any way. */
if (src->a == 0) { __m128i srcABCD;
} else if ((byte) mvX2 != 0) { if (src->a == 0) break;
const uint r = remap[(byte) mvX2]; if (src_mv->m) {
*anim = (src->a == 255) ? (r | ((uint16) mvX2 & 0xFF00)) : 0; const uint r = remap[src_mv->m];
*anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0;
if (r != 0) { if (r != 0) {
Colour remapped_colour = AdjustBrightness(LookupColourInPalette(r), (byte) (mvX2 >> 8)); Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) { if (src->a == 255) {
*dst = remapped_colour; *dst = remapped_colour;
} else { } else {
remapped_colour.a = src->a; remapped_colour.a = src->a;
INSR32(remapped_colour.data, srcABCD, 0); srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
goto bmcr_alpha_blend_single; goto bmcr_alpha_blend_single;
} }
} }
} else { } else {
*anim = 0; *anim = 0;
if (src->a == 255) { srcABCD = _mm_cvtsi32_si128(src->data);
*dst = *src; if (src->a < 255) {
} else {
bmcr_alpha_blend_single: bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
(*dst).data = EXTR32(srcABCD, 0);
} }
dst->data = _mm_cvtsi128_si32(srcABCD);
} }
} }
break; break;
@ -309,29 +287,27 @@ bmcr_alpha_blend_single:
case BM_TRANSPARENT: { case BM_TRANSPARENT: {
/* Make the current colour a bit more black, so it looks like this image is transparent. */ /* Make the current colour a bit more black, so it looks like this image is transparent. */
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) bp->width / 2; x > 0; x--) { for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstCD); dstAB = _mm_packus_epi16(dstAB, dstAB);
Colour *old_dst = dst; _mm_storel_epi64((__m128i *) dst, dstAB);
src += 2; src += 2;
dst += 2; dst += 2;
anim += 2; anim += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, dstAB);
srcABCD = _mm_loadu_si128((const __m128i*) src);
if (src[-2].a) anim[-2] = 0; if (src[-2].a) anim[-2] = 0;
if (src[-1].a) anim[-1] = 0; if (src[-1].a) anim[-1] = 0;
} }
if (bp->width & 1) { if (bp->width & 1) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
@ -340,7 +316,7 @@ bmcr_alpha_blend_single:
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstAB); dstAB = _mm_packus_epi16(dstAB, dstAB);
(*dst).data = EXTR32(dstAB, 0); dst->data = _mm_cvtsi128_si32(dstAB);
if (src[0].a) anim[0] = 0; if (src[0].a) anim[0] = 0;
} }
break; break;

View File

@ -70,18 +70,18 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
for (uint x = (uint) effective_width / 2; x > 0; x--) { for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src); __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
ALPHA_BLEND_2(); ALPHA_BLEND_2();
*(uint64*) dst = EXTR64(srcABCD, 0); _mm_storel_epi64((__m128i*) dst, srcABCD);
src += 2; src += 2;
dst += 2; dst += 2;
} }
if (bt_last == BT_ODD) { if (bt_last == BT_ODD) {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src); __m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst); __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2(); ALPHA_BLEND_2();
(*dst).data = EXTR32(srcABCD, 0); dst->data = _mm_cvtsi128_si32(srcABCD);
} }
break; break;
} }
@ -99,8 +99,8 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
const int width_diff = si->sprite_width - bp->width; const int width_diff = si->sprite_width - bp->width;
effective_width = bp->width - (int) src_rgba_line[0].data; effective_width = bp->width - (int) src_rgba_line[0].data;
const int delta_diff = (int) src_rgba_line[1].data - width_diff; const int delta_diff = (int) src_rgba_line[1].data - width_diff;
const int nd = effective_width - delta_diff; const int new_width = effective_width - delta_diff;
effective_width = delta_diff > 0 ? nd : effective_width; effective_width = delta_diff > 0 ? new_width : effective_width;
if (effective_width <= 0) break; if (effective_width <= 0) break;
/* FALLTHROUGH */ /* FALLTHROUGH */
} }
@ -108,31 +108,29 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
const byte *remap = bp->remap; const byte *remap = bp->remap;
for (uint x = (uint) effective_width; x != 0; x--) { for (uint x = (uint) effective_width; x != 0; x--) {
/* In case the m-channel is zero, do not remap this pixel in any way */ /* In case the m-channel is zero, do not remap this pixel in any way. */
if (src_mv->m == 0) { __m128i srcABCD;
if (src->a < 255) { if (src_mv->m) {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
ALPHA_BLEND_2();
(*dst).data = EXTR32(srcABCD, 0);
} else {
*dst = src->data;
}
} else {
const uint r = remap[src_mv->m]; const uint r = remap[src_mv->m];
if (r != 0) { if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
if (src->a < 255) { if (src->a == 255) {
__m128i srcABCD;
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
remapped_colour.a = src->a;
INSR32(remapped_colour.data, srcABCD, 0);
ALPHA_BLEND_2();
(*dst).data = EXTR32(srcABCD, 0);
} else
*dst = remapped_colour; *dst = remapped_colour;
} else {
remapped_colour.a = src->a;
srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
goto bmcr_alpha_blend_single;
} }
} }
} else {
srcABCD = _mm_cvtsi32_si128(src->data);
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
src_mv++; src_mv++;
dst++; dst++;
src++; src++;
@ -149,27 +147,25 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Make the current colour a bit more black, so it looks like this image is transparent. /* Make the current colour a bit more black, so it looks like this image is transparent.
* rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
*/ */
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) bp->width / 2; x > 0; x--) { for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F); __m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F);
alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F);
alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstCD); dstAB = _mm_packus_epi16(dstAB, dstAB);
Colour *old_dst = dst; _mm_storel_epi64((__m128i *) dst, dstAB);
src += 2; src += 2;
dst += 2; dst += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, dstAB);
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (bp->width & 1) { if (bp->width & 1) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F); __m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F);
@ -179,7 +175,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstAB); dstAB = _mm_packus_epi16(dstAB, dstAB);
(*dst).data = EXTR32(dstAB, 0); dst->data = _mm_cvtsi128_si32(dstAB);
} }
break; break;
} }
@ -345,7 +341,7 @@ inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightnes
IGNORE_UNINITIALIZED_WARNING_START IGNORE_UNINITIALIZED_WARNING_START
/* static */ Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness) /* static */ Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
{ {
ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
c16 *= brightness; c16 *= brightness;
uint64 c16_ob = c16; // Helps out of order execution. uint64 c16_ob = c16; // Helps out of order execution.
c16 /= DEFAULT_BRIGHTNESS; c16 /= DEFAULT_BRIGHTNESS;
@ -357,12 +353,20 @@ IGNORE_UNINITIALIZED_WARNING_START
const uint32 alpha32 = colour.data & 0xFF000000; const uint32 alpha32 = colour.data & 0xFF000000;
__m128i ret; __m128i ret;
#ifdef _SQ64
ret = _mm_cvtsi64_si128(c16);
#else
INSR64(c16, ret, 0); INSR64(c16, ret, 0);
#endif
if (ob != 0) { if (ob != 0) {
/* Reduce overbright strength. */ /* Reduce overbright strength. */
ob /= 2; ob /= 2;
__m128i ob128; __m128i ob128;
#ifdef _SQ64
ob128 = _mm_cvtsi64_si128(ob | ob << 16 | ob << 32);
#else
INSR64(ob | ob << 16 | ob << 32, ob128, 0); INSR64(ob | ob << 16 | ob << 32, ob128, 0);
#endif
__m128i white = OVERBRIGHT_VALUE_MASK; __m128i white = OVERBRIGHT_VALUE_MASK;
__m128i c128 = ret; __m128i c128 = ret;
ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */ ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */
@ -372,7 +376,7 @@ IGNORE_UNINITIALIZED_WARNING_START
} }
ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */
return alpha32 | EXTR32(ret, 0); return alpha32 | _mm_cvtsi128_si32(ret);
} }
IGNORE_UNINITIALIZED_WARNING_STOP IGNORE_UNINITIALIZED_WARNING_STOP

View File

@ -76,24 +76,19 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
} }
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) effective_width / 2; x > 0; x--) { for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); _mm_storel_epi64((__m128i*) dst, srcABCD);
Colour *old_dst = dst;
src += 2; src += 2;
dst += 2; dst += 2;
/* It is VERY important to read next data before it gets invalidated in cpu cache.
* And PEXTR latency is a real problem here.
*/
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, srcABCD);
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (bt_last == BT_ODD) { if (bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
*dst = (Colour) EXTR32(srcABCD, 0); dst->data = _mm_cvtsi128_si32(srcABCD);
} }
break; break;
} }
@ -112,18 +107,18 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
const int width_diff = si->sprite_width - bp->width; const int width_diff = si->sprite_width - bp->width;
effective_width = bp->width - (int) src_rgba_line[0].data; effective_width = bp->width - (int) src_rgba_line[0].data;
const int delta_diff = (int) src_rgba_line[1].data - width_diff; const int delta_diff = (int) src_rgba_line[1].data - width_diff;
const int nd = effective_width - delta_diff; const int new_width = effective_width - delta_diff;
effective_width = delta_diff > 0 ? nd : effective_width; effective_width = delta_diff > 0 ? new_width : effective_width;
if (effective_width <= 0) break; if (effective_width <= 0) break;
/* FALLTHROUGH */ /* FALLTHROUGH */
} }
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src); for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst); __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv)); uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
for (uint x = (uint) effective_width / 2; x > 0; x--) {
/* Remap colours. */ /* Remap colours. */
if (mvX2 & 0x00FF00FF) { if (mvX2 & 0x00FF00FF) {
/* Written so the compiler uses CMOV. */ /* Written so the compiler uses CMOV. */
@ -152,38 +147,35 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Blend colours. */ /* Blend colours. */
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0); _mm_storel_epi64((__m128i *) dst, srcABCD);
Colour *old_dst = dst;
dst += 2; dst += 2;
src += 2; src += 2;
src_mv += 2; src_mv += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, srcABCD);
mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (effective_width & 1) { if (effective_width & 1) {
/* In case the m-channel is zero, do not remap this pixel in any way. */ /* In case the m-channel is zero, do not remap this pixel in any way. */
if ((byte) mvX2 == 0) { __m128i srcABCD;
if (src->a < 255) { if (src_mv->m) {
ALPHA_BLEND_2(pack_low_cm); const uint r = remap[src_mv->m];
(*dst).data = EXTR32(srcABCD, 0);
} else
*dst = *src;
} else {
const uint r = remap[(byte) mvX2];
if (r != 0) { if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), (byte) (mvX2 >> 8)); Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) { if (src->a == 255) {
*dst = remapped_colour; *dst = remapped_colour;
} else { } else {
remapped_colour.a = src->a; remapped_colour.a = src->a;
INSR32(remapped_colour.data, srcABCD, 0); srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
goto bmcr_alpha_blend_single;
}
}
} else {
srcABCD = _mm_cvtsi32_si128(src->data);
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2(pack_low_cm); ALPHA_BLEND_2(pack_low_cm);
(*dst).data = EXTR32(srcABCD, 0);
}
} }
dst->data = _mm_cvtsi128_si32(srcABCD);
} }
} }
break; break;
@ -199,26 +191,24 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Make the current colour a bit more black, so it looks like this image is transparent. /* Make the current colour a bit more black, so it looks like this image is transparent.
* rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
*/ */
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) bp->width / 2; x > 0; x--) { for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstCD); dstAB = _mm_packus_epi16(dstAB, dstAB);
Colour *old_dst = dst; _mm_storel_epi64((__m128i *) dst, dstAB);
src += 2; src += 2;
dst += 2; dst += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, dstAB);
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (bp->width & 1) { if (bp->width & 1) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
@ -227,7 +217,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstAB); dstAB = _mm_packus_epi16(dstAB, dstAB);
(*dst).data = EXTR32(dstAB, 0); dst->data = _mm_cvtsi128_si32(dstAB);
} }
break; break;
@ -290,7 +280,7 @@ inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightnes
IGNORE_UNINITIALIZED_WARNING_START IGNORE_UNINITIALIZED_WARNING_START
/* static */ Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness) /* static */ Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
{ {
ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
c16 *= brightness; c16 *= brightness;
uint64 c16_ob = c16; // Helps out of order execution. uint64 c16_ob = c16; // Helps out of order execution.
c16 /= DEFAULT_BRIGHTNESS; c16 /= DEFAULT_BRIGHTNESS;
@ -317,7 +307,7 @@ IGNORE_UNINITIALIZED_WARNING_START
} }
ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */
return alpha32 | EXTR32(ret, 0); return alpha32 | _mm_cvtsi128_si32(ret);
} }
IGNORE_UNINITIALIZED_WARNING_STOP IGNORE_UNINITIALIZED_WARNING_STOP

View File

@ -50,7 +50,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Load these variables into register before loop. */ /* Load these variables into register before loop. */
const __m128i a_cm = ALPHA_CONTROL_MASK; const __m128i a_cm = ALPHA_CONTROL_MASK;
const __m128i pack_hi_cm = PACK_HIGH_CONTROL_MASK; const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK;
const __m128i briAB_cm = BRIGHTNESS_LOW_CONTROL_MASK; const __m128i briAB_cm = BRIGHTNESS_LOW_CONTROL_MASK;
const __m128i div_cleaner = BRIGHTNESS_DIV_CLEANER; const __m128i div_cleaner = BRIGHTNESS_DIV_CLEANER;
const __m128i ob_check = OVERBRIGHT_PRESENCE_MASK; const __m128i ob_check = OVERBRIGHT_PRESENCE_MASK;
@ -79,27 +79,19 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
} }
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) effective_width / 2; x > 0; x--) { for (uint x = (uint) effective_width / 2; x > 0; x--) {
ALPHA_BLEND_2(pack_hi_cm); __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
/* With high repack, srcABCD have its 2 blended pixels like: [S0 S1 S2 S3] -> [-- -- BS0 BS1] __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
* dstABCD shuffled: [D0 D1 D2 D3] -> [D2 D3 D0 D0] ALPHA_BLEND_2(pack_low_cm);
* PALIGNR takes what's in (): [-- -- (BS0 BS1] [D2 D3) D0 D0] _mm_storel_epi64((__m128i*) dst, srcABCD);
*/
dstABCD = _mm_shuffle_epi32(dstABCD, 0x0E);
srcABCD = _mm_alignr_epi8(dstABCD, srcABCD, 8);
Colour *old_dst = dst;
src += 2; src += 2;
dst += 2; dst += 2;
/* It is VERY important to read next data before it gets invalidated in cpu cache. */
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, srcABCD);
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (bt_last == BT_ODD) { if (bt_last == BT_ODD) {
ALPHA_BLEND_2(pack_hi_cm); __m128i srcABCD = _mm_cvtsi32_si128(src->data);
(*dst).data = EXTR32(srcABCD, 2); __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2(pack_low_cm);
dst->data = _mm_cvtsi128_si32(srcABCD);
} }
break; break;
} }
@ -117,18 +109,18 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
const int width_diff = si->sprite_width - bp->width; const int width_diff = si->sprite_width - bp->width;
effective_width = bp->width - (int) src_rgba_line[0].data; effective_width = bp->width - (int) src_rgba_line[0].data;
const int delta_diff = (int) src_rgba_line[1].data - width_diff; const int delta_diff = (int) src_rgba_line[1].data - width_diff;
const int nd = effective_width - delta_diff; const int new_width = effective_width - delta_diff;
effective_width = delta_diff > 0 ? nd : effective_width; effective_width = delta_diff > 0 ? new_width : effective_width;
if (effective_width <= 0) break; if (effective_width <= 0) break;
/* FALLTHROUGH */ /* FALLTHROUGH */
} }
case RM_WITH_SKIP: { case RM_WITH_SKIP: {
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src); for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst); __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv)); uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
for (uint x = (uint) effective_width / 2; x > 0; x--) {
/* Remap colours. */ /* Remap colours. */
if (mvX2 & 0x00FF00FF) { if (mvX2 & 0x00FF00FF) {
/* Written so the compiler uses CMOV. */ /* Written so the compiler uses CMOV. */
@ -139,7 +131,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
Colour c0 = 0; // Use alpha of 0 to keep dst as is. Colour c0 = 0; // Use alpha of 0 to keep dst as is.
c0 = r0 == 0 ? c0 : c0map; c0 = r0 == 0 ? c0 : c0map;
c0 = m0 != 0 ? c0 : src0; c0 = m0 != 0 ? c0 : src0;
INSR32(c0.data, srcABCD, 0); srcABCD = _mm_cvtsi32_si128(c0.data);
const Colour src1 = src[1]; const Colour src1 = src[1];
const uint m1 = (byte) (mvX2 >> 16); const uint m1 = (byte) (mvX2 >> 16);
@ -156,41 +148,37 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
} }
/* Blend colours. */ /* Blend colours. */
ALPHA_BLEND_2(pack_hi_cm); ALPHA_BLEND_2(pack_low_cm);
dstABCD = _mm_shuffle_epi32(dstABCD, 0x0E); _mm_storel_epi64((__m128i *) dst, srcABCD);
srcABCD = _mm_alignr_epi8(dstABCD, srcABCD, 8);
Colour *old_dst = dst;
dst += 2; dst += 2;
src += 2; src += 2;
src_mv += 2; src_mv += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, srcABCD);
mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (effective_width & 1) { if (effective_width & 1) {
/* In case the m-channel is zero, do not remap this pixel in any way */ /* In case the m-channel is zero, do not remap this pixel in any way. */
if (src_mv->m == 0) { __m128i srcABCD;
if (src->a < 255) { if (src_mv->m) {
ALPHA_BLEND_2(pack_hi_cm);
(*dst).data = EXTR32(srcABCD, 2);
} else {
*dst = src->data;
}
} else {
const uint r = remap[src_mv->m]; const uint r = remap[src_mv->m];
if (r != 0) { if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
if (src->a < 255) { if (src->a == 255) {
remapped_colour.a = src->a;
INSR32(remapped_colour.data, srcABCD, 0);
ALPHA_BLEND_2(pack_hi_cm);
(*dst).data = EXTR32(srcABCD, 2);
} else
*dst = remapped_colour; *dst = remapped_colour;
} else {
remapped_colour.a = src->a;
srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
goto bmcr_alpha_blend_single;
} }
} }
} else {
srcABCD = _mm_cvtsi32_si128(src->data);
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2(pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
} }
break; break;
} }
@ -200,30 +188,29 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
src_mv_line += si->sprite_width; src_mv_line += si->sprite_width;
break; break;
} }
case BM_TRANSPARENT: { case BM_TRANSPARENT: {
/* Make the current colour a bit more black, so it looks like this image is transparent. /* Make the current colour a bit more black, so it looks like this image is transparent.
* rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
*/ */
__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
for (uint x = (uint) bp->width / 2; x > 0; x--) { for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstCD); dstAB = _mm_packus_epi16(dstAB, dstAB);
Colour *old_dst = dst; _mm_storel_epi64((__m128i *) dst, dstAB);
src += 2; src += 2;
dst += 2; dst += 2;
dstABCD = _mm_loadu_si128((__m128i*) dst);
_mm_storeu_si128((__m128i *) old_dst, dstAB);
srcABCD = _mm_loadu_si128((const __m128i*) src);
} }
if (bp->width & 1) { if (bp->width & 1) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
@ -232,7 +219,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
dstAB = _mm_mullo_epi16(dstAB, nom); dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8); dstAB = _mm_srli_epi16(dstAB, 8);
dstAB = _mm_packus_epi16(dstAB, dstAB); dstAB = _mm_packus_epi16(dstAB, dstAB);
(*dst).data = EXTR32(dstAB, 0); dst->data = _mm_cvtsi128_si32(dstAB);
} }
break; break;
} }

View File

@ -47,8 +47,7 @@
__m128i zero = _mm_setzero_si128(); \ __m128i zero = _mm_setzero_si128(); \
__m128i colAB = _mm_unpacklo_epi8(colourX2, zero); \ __m128i colAB = _mm_unpacklo_epi8(colourX2, zero); \
\ \
__m128i briAB; \ __m128i briAB = _mm_cvtsi32_si128(brightnessX2); \
INSR64(brightnessX2, briAB, 0); \
briAB = _mm_shuffle_epi8(briAB, briAB_cm); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \ briAB = _mm_shuffle_epi8(briAB, briAB_cm); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \
colAB = _mm_mullo_epi16(colAB, briAB); \ colAB = _mm_mullo_epi16(colAB, briAB); \
__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \ __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \

View File

@ -1077,7 +1077,7 @@ static bool AllocateDibSection(int w, int h, bool force)
bi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER); bi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
bi->bmiHeader.biWidth = _wnd.width = w; bi->bmiHeader.biWidth = _wnd.width = w;
bi->bmiHeader.biHeight = -(_wnd.height = h+1); // Allocate extra room to prevent out-of-bounds when SSE reads a 16B block at the end of the buffer. bi->bmiHeader.biHeight = -(_wnd.height = h);
bi->bmiHeader.biPlanes = 1; bi->bmiHeader.biPlanes = 1;
bi->bmiHeader.biBitCount = BlitterFactory::GetCurrentBlitter()->GetScreenDepth(); bi->bmiHeader.biBitCount = BlitterFactory::GetCurrentBlitter()->GetScreenDepth();